[llvm] [AMDGPU][True16][CodeGen] build_vector pattern in true16 (PR #118904)
Brox Chen via llvm-commits
llvm-commits at lists.llvm.org
Mon Feb 3 15:23:32 PST 2025
https://github.com/broxigarchen updated https://github.com/llvm/llvm-project/pull/118904
>From 0c85389e652e1a3490811195f462629a251db89d Mon Sep 17 00:00:00 2001
From: guochen2 <guochen2 at amd.com>
Date: Thu, 5 Dec 2024 17:03:38 -0500
Subject: [PATCH 1/2] buildvector pattern in True16
---
.../AMDGPU/AMDGPUInstructionSelector.cpp | 15 +-
llvm/lib/Target/AMDGPU/SIInstructions.td | 20 +
llvm/test/CodeGen/AMDGPU/bf16.ll | 7412 ++++++++---------
llvm/test/CodeGen/AMDGPU/llvm.ldexp.ll | 138 +-
4 files changed, 3790 insertions(+), 3795 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index 3bbbbcf71d8aec..4b265450d38b51 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -782,9 +782,22 @@ bool AMDGPUInstructionSelector::selectG_BUILD_VECTOR(MachineInstr &MI) const {
return true;
// TODO: This should probably be a combine somewhere
- // (build_vector $src0, undef) -> copy $src0
MachineInstr *Src1Def = getDefIgnoringCopies(Src1, *MRI);
if (Src1Def->getOpcode() == AMDGPU::G_IMPLICIT_DEF) {
+ if (Subtarget->useRealTrue16Insts() && IsVector) {
+ // (vecTy (DivergentBinFrag<build_vector> Ty:$src0, (Ty undef))),
+ // -> (vecTy (INSERT_SUBREG (IMPLICIT_DEF), VGPR_16:$src0, lo16))
+ Register Undef = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ BuildMI(*BB, &MI, DL, TII.get(AMDGPU::IMPLICIT_DEF), Undef);
+ BuildMI(*BB, &MI, DL, TII.get(TargetOpcode::INSERT_SUBREG), Dst)
+ .addReg(Undef)
+ .addReg(Src0)
+ .addImm(AMDGPU::lo16);
+ MI.eraseFromParent();
+ return RBI.constrainGenericRegister(Dst, AMDGPU::VGPR_32RegClass, *MRI) &&
+ RBI.constrainGenericRegister(Src0, AMDGPU::VGPR_16RegClass, *MRI);
+ }
+ // (build_vector $src0, undef) -> copy $src0
MI.setDesc(TII.get(AMDGPU::COPY));
MI.removeOperand(2);
const auto &RC =
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index bee4c47a23ba6b..50b9f895b3b476 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -3346,6 +3346,8 @@ def : GCNPat <
(COPY_TO_REGCLASS SReg_32:$src0, SReg_32)
>;
+foreach p = [NotHasTrue16BitInsts, UseFakeTrue16Insts] in
+let True16Predicate = p in {
def : GCNPat <
(vecTy (DivergentBinFrag<build_vector> (Ty VGPR_32:$src0), (Ty undef))),
(COPY_TO_REGCLASS VGPR_32:$src0, VGPR_32)
@@ -3355,6 +3357,7 @@ def : GCNPat <
(vecTy (UniformBinFrag<build_vector> (Ty undef), (Ty SReg_32:$src1))),
(S_LSHL_B32 SReg_32:$src1, (i32 16))
>;
+}
def : GCNPat <
(vecTy (DivergentBinFrag<build_vector> (Ty undef), (Ty VGPR_32:$src1))),
@@ -3364,6 +3367,8 @@ def : GCNPat <
}
let SubtargetPredicate = HasVOP3PInsts in {
+foreach p = [NotHasTrue16BitInsts, UseFakeTrue16Insts] in
+let True16Predicate = p in
def : GCNPat <
(v2i16 (DivergentBinFrag<build_vector> (i16 VGPR_32:$src0), (i16 VGPR_32:$src1))),
(v2i16 (V_LSHL_OR_B32_e64 $src1, (i32 16), (i32 (V_AND_B32_e64 (i32 (V_MOV_B32_e32 (i32 0xffff))), $src0))))
@@ -3393,12 +3398,25 @@ def : GCNPat <
(S_PACK_LL_B32_B16 SReg_32:$src0, SReg_32:$src1)
>;
+foreach p = [NotHasTrue16BitInsts, UseFakeTrue16Insts] in
+let True16Predicate = p in
// Take the lower 16 bits from each VGPR_32 and concat them
def : GCNPat <
(vecTy (DivergentBinFrag<build_vector> (Ty VGPR_32:$a), (Ty VGPR_32:$b))),
(V_PERM_B32_e64 VGPR_32:$b, VGPR_32:$a, (S_MOV_B32 (i32 0x05040100)))
>;
+let True16Predicate = UseRealTrue16Insts in {
+def : GCNPat <
+ (vecTy (DivergentBinFrag<build_vector> (Ty VGPR_16:$a), (Ty VGPR_16:$b))),
+ (REG_SEQUENCE VGPR_32, VGPR_16:$a, lo16, VGPR_16:$b, hi16)
+>;
+// GlobalISel ignores this pattern; the same case is handled in selectG_BUILD_VECTOR
+def : GCNPat <
+ (vecTy (build_vector (Ty VGPR_16:$src0), (Ty undef))),
+ (REG_SEQUENCE VGPR_32, $src0, lo16, (IMPLICIT_DEF), hi16)
+>;
+}
// Take the lower 16 bits from V[0] and the upper 16 bits from V[1]
// Special case, can use V_BFI (0xffff literal likely more reusable than 0x70601000)
@@ -3424,6 +3442,8 @@ def : GCNPat <
// Take the upper 16 bits from V[0] and the lower 16 bits from V[1]
// Special case, can use V_ALIGNBIT (always uses encoded literal)
+foreach p = [NotHasTrue16BitInsts, UseFakeTrue16Insts] in
+let True16Predicate = p in
def : GCNPat <
(vecTy (DivergentBinFrag<build_vector>
(Ty !if(!eq(Ty, i16),
diff --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll
index 0382cc72a36ae2..230a62f447938c 100644
--- a/llvm/test/CodeGen/AMDGPU/bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/bf16.ll
@@ -662,12 +662,14 @@ define <64 x bfloat> @v_load_global_v64bf16(ptr addrspace(1) %ptr) {
; GCN-NEXT: v_add_i32_e32 v10, vcc, 0x70, v0
; GCN-NEXT: v_add_i32_e32 v11, vcc, 0x6c, v0
; GCN-NEXT: v_add_i32_e32 v12, vcc, 0x68, v0
+; GCN-NEXT: v_add_i32_e32 v13, vcc, 0x64, v0
+; GCN-NEXT: v_add_i32_e32 v14, vcc, 0x60, v0
; GCN-NEXT: s_mov_b32 s4, s6
; GCN-NEXT: s_mov_b32 s5, s6
; GCN-NEXT: buffer_load_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:112
-; GCN-NEXT: v_add_i32_e32 v13, vcc, 0x64, v0
-; GCN-NEXT: v_add_i32_e32 v14, vcc, 0x60, v0
; GCN-NEXT: v_add_i32_e32 v15, vcc, 0x5c, v0
+; GCN-NEXT: v_add_i32_e32 v16, vcc, 0x58, v0
+; GCN-NEXT: v_add_i32_e32 v17, vcc, 0x54, v0
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen
; GCN-NEXT: buffer_store_dword v5, v8, s[0:3], 0 offen
@@ -675,9 +677,9 @@ define <64 x bfloat> @v_load_global_v64bf16(ptr addrspace(1) %ptr) {
; GCN-NEXT: buffer_store_dword v3, v10, s[0:3], 0 offen
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:96
-; GCN-NEXT: v_add_i32_e32 v7, vcc, 0x58, v0
-; GCN-NEXT: v_add_i32_e32 v8, vcc, 0x54, v0
-; GCN-NEXT: v_add_i32_e32 v9, vcc, 0x50, v0
+; GCN-NEXT: v_add_i32_e32 v7, vcc, 0x50, v0
+; GCN-NEXT: v_add_i32_e32 v8, vcc, 0x4c, v0
+; GCN-NEXT: v_add_i32_e32 v9, vcc, 0x48, v0
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v6, v11, s[0:3], 0 offen
; GCN-NEXT: buffer_store_dword v5, v12, s[0:3], 0 offen
@@ -685,63 +687,60 @@ define <64 x bfloat> @v_load_global_v64bf16(ptr addrspace(1) %ptr) {
; GCN-NEXT: buffer_store_dword v3, v14, s[0:3], 0 offen
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:80
-; GCN-NEXT: v_add_i32_e32 v10, vcc, 0x4c, v0
-; GCN-NEXT: v_add_i32_e32 v11, vcc, 0x48, v0
-; GCN-NEXT: v_add_i32_e32 v12, vcc, 0x44, v0
+; GCN-NEXT: v_add_i32_e32 v10, vcc, 0x44, v0
+; GCN-NEXT: v_add_i32_e32 v11, vcc, 64, v0
+; GCN-NEXT: v_add_i32_e32 v19, vcc, 60, v0
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v6, v15, s[0:3], 0 offen
-; GCN-NEXT: buffer_store_dword v5, v7, s[0:3], 0 offen
-; GCN-NEXT: buffer_store_dword v4, v8, s[0:3], 0 offen
-; GCN-NEXT: buffer_store_dword v3, v9, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v5, v16, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v4, v17, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v3, v7, s[0:3], 0 offen
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:64
-; GCN-NEXT: v_add_i32_e32 v7, vcc, 64, v0
-; GCN-NEXT: v_add_i32_e32 v19, vcc, 60, v0
; GCN-NEXT: v_add_i32_e32 v20, vcc, 56, v0
+; GCN-NEXT: v_add_i32_e32 v21, vcc, 52, v0
+; GCN-NEXT: v_add_i32_e32 v22, vcc, 48, v0
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: buffer_store_dword v6, v10, s[0:3], 0 offen
-; GCN-NEXT: buffer_store_dword v5, v11, s[0:3], 0 offen
-; GCN-NEXT: buffer_store_dword v4, v12, s[0:3], 0 offen
-; GCN-NEXT: buffer_store_dword v3, v7, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v6, v8, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v5, v9, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v4, v10, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v3, v11, s[0:3], 0 offen
; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: buffer_load_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:32
-; GCN-NEXT: buffer_load_dwordx4 v[7:10], v[1:2], s[4:7], 0 addr64 offset:48
-; GCN-NEXT: v_add_i32_e32 v21, vcc, 52, v0
+; GCN-NEXT: buffer_load_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:48
+; GCN-NEXT: buffer_load_dwordx4 v[7:10], v[1:2], s[4:7], 0 addr64 offset:32
; GCN-NEXT: buffer_load_dwordx4 v[11:14], v[1:2], s[4:7], 0 addr64
; GCN-NEXT: buffer_load_dwordx4 v[15:18], v[1:2], s[4:7], 0 addr64 offset:16
-; GCN-NEXT: s_waitcnt vmcnt(2)
-; GCN-NEXT: buffer_store_dword v10, v19, s[0:3], 0 offen
-; GCN-NEXT: v_add_i32_e32 v1, vcc, 48, v0
-; GCN-NEXT: buffer_store_dword v9, v20, s[0:3], 0 offen
-; GCN-NEXT: v_add_i32_e32 v2, vcc, 44, v0
-; GCN-NEXT: buffer_store_dword v8, v21, s[0:3], 0 offen
+; GCN-NEXT: s_waitcnt vmcnt(3)
+; GCN-NEXT: buffer_store_dword v6, v19, s[0:3], 0 offen
+; GCN-NEXT: v_add_i32_e32 v1, vcc, 44, v0
+; GCN-NEXT: buffer_store_dword v5, v20, s[0:3], 0 offen
+; GCN-NEXT: v_add_i32_e32 v2, vcc, 40, v0
+; GCN-NEXT: buffer_store_dword v4, v21, s[0:3], 0 offen
; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_add_i32_e32 v8, vcc, 40, v0
-; GCN-NEXT: buffer_store_dword v7, v1, s[0:3], 0 offen
-; GCN-NEXT: v_add_i32_e32 v1, vcc, 36, v0
+; GCN-NEXT: v_add_i32_e32 v4, vcc, 36, v0
+; GCN-NEXT: buffer_store_dword v3, v22, s[0:3], 0 offen
; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_add_i32_e32 v7, vcc, 32, v0
-; GCN-NEXT: v_add_i32_e32 v9, vcc, 28, v0
-; GCN-NEXT: v_add_i32_e32 v10, vcc, 24, v0
+; GCN-NEXT: v_add_i32_e32 v3, vcc, 32, v0
+; GCN-NEXT: v_add_i32_e32 v5, vcc, 28, v0
+; GCN-NEXT: v_add_i32_e32 v6, vcc, 24, v0
; GCN-NEXT: v_add_i32_e32 v19, vcc, 20, v0
-; GCN-NEXT: buffer_store_dword v6, v2, s[0:3], 0 offen
-; GCN-NEXT: v_add_i32_e32 v2, vcc, 16, v0
-; GCN-NEXT: buffer_store_dword v5, v8, s[0:3], 0 offen
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_add_i32_e32 v5, vcc, 12, v0
-; GCN-NEXT: buffer_store_dword v4, v1, s[0:3], 0 offen
-; GCN-NEXT: v_add_i32_e32 v1, vcc, 8, v0
-; GCN-NEXT: buffer_store_dword v3, v7, s[0:3], 0 offen
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_add_i32_e32 v3, vcc, 4, v0
+; GCN-NEXT: v_add_i32_e32 v20, vcc, 16, v0
+; GCN-NEXT: s_waitcnt vmcnt(6)
+; GCN-NEXT: buffer_store_dword v10, v1, s[0:3], 0 offen
+; GCN-NEXT: v_add_i32_e32 v1, vcc, 12, v0
+; GCN-NEXT: buffer_store_dword v9, v2, s[0:3], 0 offen
+; GCN-NEXT: v_add_i32_e32 v2, vcc, 8, v0
+; GCN-NEXT: buffer_store_dword v8, v4, s[0:3], 0 offen
+; GCN-NEXT: v_add_i32_e32 v4, vcc, 4, v0
+; GCN-NEXT: buffer_store_dword v7, v3, s[0:3], 0 offen
; GCN-NEXT: s_waitcnt vmcnt(8)
-; GCN-NEXT: buffer_store_dword v18, v9, s[0:3], 0 offen
-; GCN-NEXT: buffer_store_dword v17, v10, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v18, v5, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v17, v6, s[0:3], 0 offen
; GCN-NEXT: buffer_store_dword v16, v19, s[0:3], 0 offen
-; GCN-NEXT: buffer_store_dword v15, v2, s[0:3], 0 offen
-; GCN-NEXT: buffer_store_dword v14, v5, s[0:3], 0 offen
-; GCN-NEXT: buffer_store_dword v13, v1, s[0:3], 0 offen
-; GCN-NEXT: buffer_store_dword v12, v3, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v15, v20, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v14, v1, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v13, v2, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v12, v4, s[0:3], 0 offen
; GCN-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
@@ -759,6 +758,14 @@ define <64 x bfloat> @v_load_global_v64bf16(ptr addrspace(1) %ptr) {
; GFX7-NEXT: v_add_i32_e32 v9, vcc, 0x74, v0
; GFX7-NEXT: v_add_i32_e32 v10, vcc, 0x70, v0
; GFX7-NEXT: v_add_i32_e32 v19, vcc, 52, v0
+; GFX7-NEXT: v_add_i32_e32 v20, vcc, 48, v0
+; GFX7-NEXT: v_add_i32_e32 v21, vcc, 44, v0
+; GFX7-NEXT: v_add_i32_e32 v22, vcc, 40, v0
+; GFX7-NEXT: v_add_i32_e32 v23, vcc, 36, v0
+; GFX7-NEXT: v_add_i32_e32 v24, vcc, 32, v0
+; GFX7-NEXT: v_add_i32_e32 v25, vcc, 28, v0
+; GFX7-NEXT: v_add_i32_e32 v26, vcc, 24, v0
+; GFX7-NEXT: v_add_i32_e32 v27, vcc, 20, v0
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen
; GFX7-NEXT: buffer_store_dword v5, v8, s[0:3], 0 offen
@@ -802,34 +809,26 @@ define <64 x bfloat> @v_load_global_v64bf16(ptr addrspace(1) %ptr) {
; GFX7-NEXT: v_add_i32_e32 v2, vcc, 56, v0
; GFX7-NEXT: s_waitcnt vmcnt(3)
; GFX7-NEXT: buffer_store_dword v6, v1, s[0:3], 0 offen
-; GFX7-NEXT: v_add_i32_e32 v1, vcc, 48, v0
+; GFX7-NEXT: v_add_i32_e32 v1, vcc, 16, v0
; GFX7-NEXT: buffer_store_dword v5, v2, s[0:3], 0 offen
-; GFX7-NEXT: v_add_i32_e32 v2, vcc, 44, v0
+; GFX7-NEXT: v_add_i32_e32 v2, vcc, 12, v0
; GFX7-NEXT: buffer_store_dword v4, v19, s[0:3], 0 offen
-; GFX7-NEXT: v_add_i32_e32 v4, vcc, 40, v0
-; GFX7-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen
-; GFX7-NEXT: v_add_i32_e32 v1, vcc, 36, v0
-; GFX7-NEXT: v_add_i32_e32 v3, vcc, 32, v0
-; GFX7-NEXT: v_add_i32_e32 v5, vcc, 28, v0
-; GFX7-NEXT: v_add_i32_e32 v6, vcc, 24, v0
-; GFX7-NEXT: v_add_i32_e32 v19, vcc, 20, v0
-; GFX7-NEXT: s_waitcnt vmcnt(6)
-; GFX7-NEXT: buffer_store_dword v10, v2, s[0:3], 0 offen
-; GFX7-NEXT: v_add_i32_e32 v2, vcc, 16, v0
-; GFX7-NEXT: buffer_store_dword v9, v4, s[0:3], 0 offen
-; GFX7-NEXT: v_add_i32_e32 v4, vcc, 12, v0
-; GFX7-NEXT: buffer_store_dword v8, v1, s[0:3], 0 offen
-; GFX7-NEXT: v_add_i32_e32 v1, vcc, 8, v0
-; GFX7-NEXT: buffer_store_dword v7, v3, s[0:3], 0 offen
+; GFX7-NEXT: v_add_i32_e32 v4, vcc, 8, v0
+; GFX7-NEXT: buffer_store_dword v3, v20, s[0:3], 0 offen
; GFX7-NEXT: v_add_i32_e32 v3, vcc, 4, v0
+; GFX7-NEXT: s_waitcnt vmcnt(6)
+; GFX7-NEXT: buffer_store_dword v10, v21, s[0:3], 0 offen
+; GFX7-NEXT: buffer_store_dword v9, v22, s[0:3], 0 offen
+; GFX7-NEXT: buffer_store_dword v8, v23, s[0:3], 0 offen
+; GFX7-NEXT: buffer_store_dword v7, v24, s[0:3], 0 offen
; GFX7-NEXT: s_waitcnt vmcnt(9)
-; GFX7-NEXT: buffer_store_dword v14, v5, s[0:3], 0 offen
-; GFX7-NEXT: buffer_store_dword v13, v6, s[0:3], 0 offen
-; GFX7-NEXT: buffer_store_dword v12, v19, s[0:3], 0 offen
-; GFX7-NEXT: buffer_store_dword v11, v2, s[0:3], 0 offen
+; GFX7-NEXT: buffer_store_dword v14, v25, s[0:3], 0 offen
+; GFX7-NEXT: buffer_store_dword v13, v26, s[0:3], 0 offen
+; GFX7-NEXT: buffer_store_dword v12, v27, s[0:3], 0 offen
+; GFX7-NEXT: buffer_store_dword v11, v1, s[0:3], 0 offen
; GFX7-NEXT: s_waitcnt vmcnt(12)
-; GFX7-NEXT: buffer_store_dword v18, v4, s[0:3], 0 offen
-; GFX7-NEXT: buffer_store_dword v17, v1, s[0:3], 0 offen
+; GFX7-NEXT: buffer_store_dword v18, v2, s[0:3], 0 offen
+; GFX7-NEXT: buffer_store_dword v17, v4, s[0:3], 0 offen
; GFX7-NEXT: buffer_store_dword v16, v3, s[0:3], 0 offen
; GFX7-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen
; GFX7-NEXT: s_waitcnt vmcnt(0)
@@ -1336,83 +1335,83 @@ define void @v_store_global_v32bf16(<32 x bfloat> %val, ptr addrspace(1) %ptr) {
; GCN-LABEL: v_store_global_v32bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23
-; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22
-; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21
-; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20
-; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v23
-; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v21
-; GCN-NEXT: v_alignbit_b32 v21, v23, v22, 16
-; GCN-NEXT: v_alignbit_b32 v20, v31, v20, 16
-; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19
-; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18
-; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v19
-; GCN-NEXT: v_alignbit_b32 v19, v19, v18, 16
-; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17
-; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16
-; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v17
-; GCN-NEXT: v_alignbit_b32 v18, v17, v16, 16
; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7
-; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v5
+; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v5
; GCN-NEXT: v_alignbit_b32 v5, v7, v6, 16
-; GCN-NEXT: v_alignbit_b32 v4, v16, v4, 16
-; GCN-NEXT: s_mov_b32 s6, 0
-; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: v_alignbit_b32 v4, v31, v4, 16
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v1
-; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v0
-; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15
-; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14
-; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13
-; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12
-; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11
-; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10
-; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
-; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v8
-; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v29
-; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v28
-; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v27
-; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v26
-; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25
-; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v24
-; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v3
-; GCN-NEXT: v_alignbit_b32 v3, v0, v2, 16
+; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_alignbit_b32 v3, v3, v2, 16
+; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_alignbit_b32 v2, v1, v0, 16
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v15
+; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v14
+; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v13
+; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v12
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GCN-NEXT: v_alignbit_b32 v13, v0, v1, 16
+; GCN-NEXT: v_alignbit_b32 v12, v6, v7, 16
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v11
+; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v10
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_alignbit_b32 v11, v0, v1, 16
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v9
+; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v8
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_alignbit_b32 v10, v0, v1, 16
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v23
+; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v22
+; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v21
+; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v20
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GCN-NEXT: v_alignbit_b32 v9, v0, v1, 16
+; GCN-NEXT: v_alignbit_b32 v8, v6, v7, 16
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v19
+; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v18
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_alignbit_b32 v7, v0, v1, 16
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v17
+; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v16
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v29
+; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v28
+; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v27
+; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v26
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v6
+; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v15
+; GCN-NEXT: v_alignbit_b32 v6, v0, v1, 16
+; GCN-NEXT: v_alignbit_b32 v16, v16, v14, 16
+; GCN-NEXT: v_alignbit_b32 v15, v15, v17, 16
; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v25
+; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v24
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_alignbit_b32 v14, v0, v14, 16
+; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4
-; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32
-; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v30
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v30
; GCN-NEXT: s_mov_b32 s4, s6
; GCN-NEXT: s_mov_b32 s5, s6
-; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v6
-; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v15
-; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v13
-; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v11
-; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v9
-; GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v8
-; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v22
-; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v25
-; GCN-NEXT: v_alignbit_b32 v2, v2, v7, 16
-; GCN-NEXT: v_alignbit_b32 v9, v6, v14, 16
-; GCN-NEXT: v_alignbit_b32 v8, v13, v12, 16
-; GCN-NEXT: v_alignbit_b32 v7, v11, v10, 16
-; GCN-NEXT: v_alignbit_b32 v6, v15, v16, 16
-; GCN-NEXT: v_alignbit_b32 v12, v28, v17, 16
-; GCN-NEXT: v_alignbit_b32 v11, v22, v23, 16
-; GCN-NEXT: v_alignbit_b32 v10, v25, v24, 16
; GCN-NEXT: s_waitcnt vmcnt(1)
-; GCN-NEXT: buffer_store_dwordx4 v[18:21], v[0:1], s[4:7], 0 addr64 offset:32
-; GCN-NEXT: s_waitcnt vmcnt(1)
-; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v26
-; GCN-NEXT: buffer_store_dwordx4 v[6:9], v[0:1], s[4:7], 0 addr64 offset:16
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v13
-; GCN-NEXT: v_alignbit_b32 v13, v6, v27, 16
-; GCN-NEXT: buffer_store_dwordx4 v[10:13], v[0:1], s[4:7], 0 addr64 offset:48
+; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dwordx4 v[6:9], v[0:1], s[4:7], 0 addr64 offset:32
+; GCN-NEXT: buffer_store_dwordx4 v[10:13], v[0:1], s[4:7], 0 addr64 offset:16
+; GCN-NEXT: s_waitcnt expcnt(1)
+; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v17
+; GCN-NEXT: v_alignbit_b32 v17, v6, v18, 16
+; GCN-NEXT: buffer_store_dwordx4 v[14:17], v[0:1], s[4:7], 0 addr64 offset:48
; GCN-NEXT: buffer_store_dwordx4 v[2:5], v[0:1], s[4:7], 0 addr64
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
@@ -1422,78 +1421,78 @@ define void @v_store_global_v32bf16(<32 x bfloat> %val, ptr addrspace(1) %ptr) {
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: v_alignbit_b32 v3, v3, v2, 16
; GFX7-NEXT: v_alignbit_b32 v2, v1, v0, 16
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v14
; GFX7-NEXT: buffer_load_dword v14, off, s[0:3], s32
-; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25
; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v15
-; GFX7-NEXT: v_lshrrev_b32_e32 v25, 16, v25
-; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24
+; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7
; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
-; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7
; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX7-NEXT: v_alignbit_b32 v25, v25, v24, 16
-; GFX7-NEXT: v_lshrrev_b32_e32 v24, 16, v5
+; GFX7-NEXT: v_lshrrev_b32_e32 v31, 16, v5
; GFX7-NEXT: v_alignbit_b32 v5, v7, v6, 16
; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v13
; GFX7-NEXT: v_alignbit_b32 v13, v0, v1, 16
+; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v11
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v10
+; GFX7-NEXT: v_alignbit_b32 v11, v0, v1, 16
; GFX7-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8
; GFX7-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4
-; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v12
; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v12
; GFX7-NEXT: v_alignbit_b32 v12, v6, v7, 16
-; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v11
-; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10
-; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7
-; GFX7-NEXT: v_mul_f32_e32 v29, 1.0, v29
-; GFX7-NEXT: v_alignbit_b32 v11, v7, v10, 16
-; GFX7-NEXT: v_lshrrev_b32_e32 v29, 16, v29
-; GFX7-NEXT: v_mul_f32_e32 v28, 1.0, v28
-; GFX7-NEXT: v_mul_f32_e32 v27, 1.0, v27
-; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v30
-; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9
-; GFX7-NEXT: v_lshrrev_b32_e32 v31, 16, v27
-; GFX7-NEXT: v_alignbit_b32 v27, v29, v28, 16
-; GFX7-NEXT: v_mul_f32_e32 v26, 1.0, v26
-; GFX7-NEXT: s_mov_b32 s6, 0
-; GFX7-NEXT: v_alignbit_b32 v26, v31, v26, 16
-; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
-; GFX7-NEXT: s_mov_b32 s7, 0xf000
-; GFX7-NEXT: s_mov_b32 s4, s6
-; GFX7-NEXT: s_mov_b32 s5, s6
-; GFX7-NEXT: v_alignbit_b32 v4, v24, v4, 16
-; GFX7-NEXT: s_waitcnt vmcnt(2)
-; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v14
-; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7
-; GFX7-NEXT: v_alignbit_b32 v28, v7, v6, 16
-; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v9
+; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v9
+; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6
; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v8
; GFX7-NEXT: v_alignbit_b32 v10, v6, v7, 16
; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v23
; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6
; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v22
; GFX7-NEXT: v_alignbit_b32 v9, v6, v7, 16
+; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v21
+; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v20
+; GFX7-NEXT: v_alignbit_b32 v8, v6, v7, 16
; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v19
-; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v21
; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6
; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v18
-; GFX7-NEXT: v_lshrrev_b32_e32 v8, 16, v8
-; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v20
; GFX7-NEXT: v_alignbit_b32 v7, v6, v7, 16
; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v17
-; GFX7-NEXT: v_alignbit_b32 v8, v8, v14, 16
; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6
-; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v16
-; GFX7-NEXT: v_alignbit_b32 v6, v6, v14, 16
+; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v16
+; GFX7-NEXT: v_alignbit_b32 v6, v6, v15, 16
+; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v30
+; GFX7-NEXT: s_mov_b32 s6, 0
+; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v24
+; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GFX7-NEXT: s_mov_b32 s7, 0xf000
+; GFX7-NEXT: s_mov_b32 s4, s6
+; GFX7-NEXT: s_mov_b32 s5, s6
+; GFX7-NEXT: v_alignbit_b32 v4, v31, v4, 16
+; GFX7-NEXT: s_waitcnt vmcnt(2)
+; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14
+; GFX7-NEXT: v_lshrrev_b32_e32 v14, 16, v14
+; GFX7-NEXT: v_alignbit_b32 v17, v14, v15, 16
+; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v29
+; GFX7-NEXT: v_lshrrev_b32_e32 v14, 16, v14
+; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v28
+; GFX7-NEXT: v_alignbit_b32 v16, v14, v15, 16
+; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v27
+; GFX7-NEXT: v_lshrrev_b32_e32 v14, 16, v14
+; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v26
+; GFX7-NEXT: v_alignbit_b32 v15, v14, v15, 16
+; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v25
+; GFX7-NEXT: v_lshrrev_b32_e32 v14, 16, v14
+; GFX7-NEXT: v_alignbit_b32 v14, v14, v18, 16
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: buffer_store_dwordx4 v[25:28], v[0:1], s[4:7], 0 addr64 offset:48
+; GFX7-NEXT: buffer_store_dwordx4 v[14:17], v[0:1], s[4:7], 0 addr64 offset:48
; GFX7-NEXT: buffer_store_dwordx4 v[6:9], v[0:1], s[4:7], 0 addr64 offset:32
; GFX7-NEXT: buffer_store_dwordx4 v[10:13], v[0:1], s[4:7], 0 addr64 offset:16
; GFX7-NEXT: buffer_store_dwordx4 v[2:5], v[0:1], s[4:7], 0 addr64
@@ -1565,203 +1564,207 @@ define void @v_store_global_v64bf16(<64 x bfloat> %val, ptr addrspace(1) %ptr) {
; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18
; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v19
; GCN-NEXT: v_alignbit_b32 v19, v19, v18, 16
-; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17
-; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16
-; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v17
-; GCN-NEXT: v_alignbit_b32 v18, v17, v16, 16
+; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v17
+; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v16
+; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:136
+; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:132
+; GCN-NEXT: v_lshrrev_b32_e32 v18, 16, v18
+; GCN-NEXT: v_alignbit_b32 v18, v18, v22, 16
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s4, s6
+; GCN-NEXT: s_mov_b32 s5, s6
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dwordx4 v[18:21], v[16:17], s[4:7], 0 addr64 offset:32
; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15
; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14
; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13
; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12
; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v15
-; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v13
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v18, 16, v13
; GCN-NEXT: v_alignbit_b32 v13, v15, v14, 16
-; GCN-NEXT: v_alignbit_b32 v12, v16, v12, 16
-; GCN-NEXT: s_mov_b32 s6, 0
-; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: v_alignbit_b32 v12, v18, v12, 16
; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11
; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10
+; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v11
+; GCN-NEXT: v_alignbit_b32 v11, v11, v10, 16
; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8
-; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
-; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
-; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
-; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
-; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v2
-; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v29
-; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v28
-; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v27
-; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v26
-; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v11
; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9
-; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7
-; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v3
-; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v1
-; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v2
-; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v16
-; GCN-NEXT: v_alignbit_b32 v11, v11, v10, 16
; GCN-NEXT: v_alignbit_b32 v10, v9, v8, 16
-; GCN-NEXT: v_alignbit_b32 v3, v7, v6, 16
-; GCN-NEXT: v_alignbit_b32 v2, v5, v4, 16
-; GCN-NEXT: v_alignbit_b32 v1, v22, v14, 16
-; GCN-NEXT: v_alignbit_b32 v0, v23, v0, 16
-; GCN-NEXT: v_alignbit_b32 v6, v26, v15, 16
-; GCN-NEXT: v_alignbit_b32 v5, v16, v17, 16
-; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:136
-; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:132
-; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:128
-; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:124
-; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:120
-; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:116
-; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:112
-; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:108
-; GCN-NEXT: s_mov_b32 s4, s6
-; GCN-NEXT: s_mov_b32 s5, s6
-; GCN-NEXT: s_waitcnt vmcnt(6)
-; GCN-NEXT: buffer_store_dwordx4 v[18:21], v[8:9], s[4:7], 0 addr64 offset:32
-; GCN-NEXT: buffer_store_dwordx4 v[10:13], v[8:9], s[4:7], 0 addr64 offset:16
+; GCN-NEXT: buffer_store_dwordx4 v[10:13], v[16:17], s[4:7], 0 addr64 offset:16
+; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:128
+; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:124
; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:104
-; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:100
-; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:96
-; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:92
-; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:88
-; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:84
-; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:80
-; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:76
-; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v25
-; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v24
-; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v30
-; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GCN-NEXT: v_alignbit_b32 v4, v4, v23, 16
-; GCN-NEXT: s_waitcnt vmcnt(14)
-; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
-; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14
-; GCN-NEXT: s_waitcnt vmcnt(13)
-; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15
-; GCN-NEXT: s_waitcnt vmcnt(12)
-; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16
-; GCN-NEXT: s_waitcnt vmcnt(11)
-; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17
-; GCN-NEXT: s_waitcnt vmcnt(10)
-; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22
-; GCN-NEXT: s_waitcnt vmcnt(7)
-; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10
-; GCN-NEXT: s_waitcnt vmcnt(6)
-; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v11
-; GCN-NEXT: s_waitcnt vmcnt(5)
-; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v12
-; GCN-NEXT: s_waitcnt vmcnt(4)
-; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v13
+; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:120
+; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:116
; GCN-NEXT: s_waitcnt vmcnt(3)
-; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v18
+; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8
; GCN-NEXT: s_waitcnt vmcnt(2)
-; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v19
+; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
; GCN-NEXT: s_waitcnt vmcnt(1)
-; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v20
+; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v21
-; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7
-; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v15
-; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v11
+; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8
; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v10
-; GCN-NEXT: v_lshrrev_b32_e32 v20, 16, v11
-; GCN-NEXT: v_lshrrev_b32_e32 v21, 16, v12
-; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v13
-; GCN-NEXT: v_alignbit_b32 v13, v7, v14, 16
-; GCN-NEXT: v_alignbit_b32 v12, v15, v16, 16
-; GCN-NEXT: v_alignbit_b32 v11, v17, v22, 16
-; GCN-NEXT: v_alignbit_b32 v10, v10, v23, 16
-; GCN-NEXT: v_alignbit_b32 v17, v20, v25, 16
-; GCN-NEXT: v_alignbit_b32 v16, v21, v18, 16
-; GCN-NEXT: v_alignbit_b32 v15, v26, v19, 16
-; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:72
-; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:68
-; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32
-; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:32
-; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:28
-; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:24
-; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:20
-; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:16
-; GCN-NEXT: s_waitcnt vmcnt(7)
-; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
-; GCN-NEXT: s_waitcnt vmcnt(6)
-; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14
-; GCN-NEXT: s_waitcnt vmcnt(5)
-; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18
-; GCN-NEXT: s_waitcnt vmcnt(4)
-; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19
+; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:112
+; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:108
+; GCN-NEXT: v_alignbit_b32 v11, v8, v9, 16
+; GCN-NEXT: v_alignbit_b32 v10, v10, v12, 16
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v13
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v14
+; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:104
+; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:100
+; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; GCN-NEXT: v_alignbit_b32 v9, v8, v9, 16
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v12
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v13
+; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; GCN-NEXT: v_alignbit_b32 v8, v8, v12, 16
+; GCN-NEXT: buffer_store_dwordx4 v[8:11], v[16:17], s[4:7], 0 addr64 offset:112
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:96
+; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:92
+; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:88
+; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:84
; GCN-NEXT: s_waitcnt vmcnt(3)
-; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20
+; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8
; GCN-NEXT: s_waitcnt vmcnt(2)
-; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21
+; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
; GCN-NEXT: s_waitcnt vmcnt(1)
-; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22
+; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23
-; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7
-; GCN-NEXT: v_lshrrev_b32_e32 v18, 16, v18
-; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v19
-; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v21
-; GCN-NEXT: v_alignbit_b32 v14, v7, v14, 16
-; GCN-NEXT: v_alignbit_b32 v7, v18, v24, 16
-; GCN-NEXT: v_alignbit_b32 v21, v19, v20, 16
-; GCN-NEXT: v_alignbit_b32 v20, v25, v22, 16
-; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:12
-; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:8
-; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:4
-; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:64
-; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:60
-; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:56
-; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:52
-; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:48
+; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v11
+; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v10
+; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:80
+; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:76
+; GCN-NEXT: v_alignbit_b32 v11, v8, v9, 16
+; GCN-NEXT: v_alignbit_b32 v10, v10, v12, 16
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v13
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v14
+; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:72
+; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:68
+; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; GCN-NEXT: v_alignbit_b32 v9, v8, v9, 16
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v12
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v13
+; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; GCN-NEXT: v_alignbit_b32 v8, v8, v12, 16
+; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
+; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
+; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v3
+; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v2
+; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v1
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v30
+; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v29
+; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v28
+; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v27
+; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v26
+; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v25
+; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v24
+; GCN-NEXT: buffer_store_dwordx4 v[8:11], v[16:17], s[4:7], 0 addr64 offset:96
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32
+; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:32
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v7
+; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v5
+; GCN-NEXT: v_alignbit_b32 v3, v1, v6, 16
+; GCN-NEXT: v_alignbit_b32 v2, v2, v4, 16
+; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:28
+; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:24
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v12
+; GCN-NEXT: v_alignbit_b32 v1, v1, v13, 16
+; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:20
+; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:16
+; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v14
+; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v18
+; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v20
+; GCN-NEXT: v_alignbit_b32 v0, v4, v0, 16
+; GCN-NEXT: v_alignbit_b32 v6, v5, v19, 16
+; GCN-NEXT: v_alignbit_b32 v5, v13, v21, 16
+; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:12
+; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:8
+; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v22
+; GCN-NEXT: v_alignbit_b32 v4, v4, v23, 16
+; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:4
+; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:64
+; GCN-NEXT: s_waitcnt vmcnt(9)
+; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8
+; GCN-NEXT: s_waitcnt vmcnt(8)
+; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
; GCN-NEXT: s_waitcnt vmcnt(7)
-; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18
-; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v23
-; GCN-NEXT: v_alignbit_b32 v19, v19, v18, 16
+; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v7
; GCN-NEXT: s_waitcnt vmcnt(6)
-; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v22
+; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v10
; GCN-NEXT: s_waitcnt vmcnt(5)
-; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v24
-; GCN-NEXT: v_lshrrev_b32_e32 v18, 16, v18
-; GCN-NEXT: v_alignbit_b32 v18, v18, v22, 16
+; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v11
+; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GCN-NEXT: v_lshrrev_b32_e32 v21, 16, v7
+; GCN-NEXT: v_alignbit_b32 v7, v8, v15, 16
+; GCN-NEXT: v_alignbit_b32 v11, v9, v20, 16
+; GCN-NEXT: v_alignbit_b32 v10, v21, v10, 16
+; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:60
+; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:56
+; GCN-NEXT: s_waitcnt vmcnt(6)
+; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v12
+; GCN-NEXT: s_waitcnt vmcnt(5)
+; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v13
+; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; GCN-NEXT: v_alignbit_b32 v9, v8, v9, 16
+; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:52
+; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:48
+; GCN-NEXT: s_waitcnt vmcnt(6)
+; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v14
+; GCN-NEXT: s_waitcnt vmcnt(5)
+; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v18
+; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; GCN-NEXT: v_alignbit_b32 v8, v8, v14, 16
+; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:44
+; GCN-NEXT: s_waitcnt vmcnt(5)
+; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v19
; GCN-NEXT: s_waitcnt vmcnt(4)
-; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v25
+; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15
; GCN-NEXT: s_waitcnt vmcnt(3)
-; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v26
+; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v20
; GCN-NEXT: s_waitcnt vmcnt(2)
-; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v27
-; GCN-NEXT: s_waitcnt vmcnt(1)
-; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v28
-; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v22
-; GCN-NEXT: v_lshrrev_b32_e32 v24, 16, v24
-; GCN-NEXT: v_alignbit_b32 v25, v22, v23, 16
-; GCN-NEXT: v_alignbit_b32 v24, v24, v26, 16
-; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:44
-; GCN-NEXT: s_waitcnt vmcnt(1)
-; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v29
-; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:40
-; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:36
+; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12
+; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v14
+; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v19
+; GCN-NEXT: v_alignbit_b32 v15, v14, v15, 16
+; GCN-NEXT: v_alignbit_b32 v14, v19, v12, 16
+; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:40
; GCN-NEXT: s_waitcnt vmcnt(2)
-; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22
-; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v23
-; GCN-NEXT: v_alignbit_b32 v23, v23, v22, 16
+; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13
; GCN-NEXT: s_waitcnt vmcnt(1)
-; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v26
+; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18
+; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:36
+; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v13
+; GCN-NEXT: v_alignbit_b32 v13, v13, v18, 16
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v27
-; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v22
-; GCN-NEXT: v_alignbit_b32 v22, v22, v26, 16
-; GCN-NEXT: buffer_store_dwordx4 v[10:13], v[8:9], s[4:7], 0 addr64 offset:112
-; GCN-NEXT: buffer_store_dwordx4 v[14:17], v[8:9], s[4:7], 0 addr64 offset:96
-; GCN-NEXT: buffer_store_dwordx4 v[22:25], v[8:9], s[4:7], 0 addr64 offset:80
-; GCN-NEXT: buffer_store_dwordx4 v[18:21], v[8:9], s[4:7], 0 addr64 offset:64
-; GCN-NEXT: buffer_store_dwordx4 v[4:7], v[8:9], s[4:7], 0 addr64 offset:48
-; GCN-NEXT: buffer_store_dwordx4 v[0:3], v[8:9], s[4:7], 0 addr64
+; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v19
+; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v12
+; GCN-NEXT: v_alignbit_b32 v12, v12, v18, 16
+; GCN-NEXT: buffer_store_dwordx4 v[12:15], v[16:17], s[4:7], 0 addr64 offset:80
+; GCN-NEXT: buffer_store_dwordx4 v[8:11], v[16:17], s[4:7], 0 addr64 offset:64
+; GCN-NEXT: buffer_store_dwordx4 v[4:7], v[16:17], s[4:7], 0 addr64 offset:48
+; GCN-NEXT: buffer_store_dwordx4 v[0:3], v[16:17], s[4:7], 0 addr64
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
;
@@ -1777,27 +1780,24 @@ define void @v_store_global_v64bf16(<64 x bfloat> %val, ptr addrspace(1) %ptr) {
; GFX7-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:104
; GFX7-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:100
; GFX7-NEXT: s_mov_b32 s6, 0
+; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
+; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
+; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7
; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX7-NEXT: v_alignbit_b32 v3, v3, v2, 16
; GFX7-NEXT: v_alignbit_b32 v2, v1, v0, 16
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v15
-; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7
-; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
-; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v14
-; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v29
-; GFX7-NEXT: v_lshrrev_b32_e32 v14, 16, v14
-; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v28
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GFX7-NEXT: s_waitcnt vmcnt(7)
; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31
@@ -1832,97 +1832,16 @@ define void @v_store_global_v64bf16(<64 x bfloat> %val, ptr addrspace(1) %ptr) {
; GFX7-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:76
; GFX7-NEXT: s_waitcnt vmcnt(6)
; GFX7-NEXT: buffer_store_dwordx4 v[33:36], v[31:32], s[4:7], 0 addr64 offset:112
-; GFX7-NEXT: s_waitcnt vmcnt(6)
-; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v37
-; GFX7-NEXT: s_waitcnt vmcnt(5)
-; GFX7-NEXT: v_mul_f32_e32 v34, 1.0, v38
-; GFX7-NEXT: v_lshrrev_b32_e32 v33, 16, v33
-; GFX7-NEXT: s_waitcnt vmcnt(4)
-; GFX7-NEXT: v_mul_f32_e32 v35, 1.0, v39
-; GFX7-NEXT: v_alignbit_b32 v36, v33, v34, 16
-; GFX7-NEXT: s_waitcnt vmcnt(2)
-; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v49
-; GFX7-NEXT: v_mul_f32_e32 v37, 1.0, v48
-; GFX7-NEXT: v_lshrrev_b32_e32 v35, 16, v35
-; GFX7-NEXT: s_waitcnt vmcnt(1)
-; GFX7-NEXT: v_mul_f32_e32 v34, 1.0, v50
-; GFX7-NEXT: v_lshrrev_b32_e32 v33, 16, v33
-; GFX7-NEXT: v_alignbit_b32 v35, v35, v37, 16
-; GFX7-NEXT: v_alignbit_b32 v34, v33, v34, 16
-; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:72
-; GFX7-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:68
-; GFX7-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:64
-; GFX7-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:60
-; GFX7-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:56
-; GFX7-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:52
-; GFX7-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:48
-; GFX7-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:44
-; GFX7-NEXT: s_waitcnt vmcnt(7)
-; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33
-; GFX7-NEXT: v_lshrrev_b32_e32 v33, 16, v33
-; GFX7-NEXT: s_waitcnt vmcnt(6)
-; GFX7-NEXT: v_mul_f32_e32 v37, 1.0, v37
-; GFX7-NEXT: v_alignbit_b32 v33, v33, v37, 16
-; GFX7-NEXT: buffer_store_dwordx4 v[33:36], v[31:32], s[4:7], 0 addr64 offset:96
-; GFX7-NEXT: s_waitcnt vmcnt(3)
-; GFX7-NEXT: v_mul_f32_e32 v37, 1.0, v49
-; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v38
-; GFX7-NEXT: v_lshrrev_b32_e32 v33, 16, v33
-; GFX7-NEXT: v_mul_f32_e32 v34, 1.0, v39
-; GFX7-NEXT: v_mul_f32_e32 v35, 1.0, v48
-; GFX7-NEXT: v_alignbit_b32 v36, v33, v34, 16
-; GFX7-NEXT: s_waitcnt vmcnt(2)
-; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v50
-; GFX7-NEXT: v_lshrrev_b32_e32 v35, 16, v35
-; GFX7-NEXT: v_lshrrev_b32_e32 v33, 16, v33
-; GFX7-NEXT: s_waitcnt vmcnt(1)
-; GFX7-NEXT: v_mul_f32_e32 v34, 1.0, v51
-; GFX7-NEXT: v_alignbit_b32 v35, v35, v37, 16
-; GFX7-NEXT: v_alignbit_b32 v34, v33, v34, 16
-; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:40
-; GFX7-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:36
-; GFX7-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:32
-; GFX7-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:28
-; GFX7-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:24
-; GFX7-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:20
-; GFX7-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:16
-; GFX7-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:12
-; GFX7-NEXT: s_waitcnt vmcnt(7)
-; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33
-; GFX7-NEXT: v_lshrrev_b32_e32 v33, 16, v33
-; GFX7-NEXT: s_waitcnt vmcnt(6)
-; GFX7-NEXT: v_mul_f32_e32 v37, 1.0, v37
-; GFX7-NEXT: v_alignbit_b32 v33, v33, v37, 16
-; GFX7-NEXT: buffer_store_dwordx4 v[33:36], v[31:32], s[4:7], 0 addr64 offset:80
-; GFX7-NEXT: s_waitcnt vmcnt(3)
-; GFX7-NEXT: v_mul_f32_e32 v37, 1.0, v49
-; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v38
-; GFX7-NEXT: v_lshrrev_b32_e32 v33, 16, v33
-; GFX7-NEXT: v_mul_f32_e32 v34, 1.0, v39
-; GFX7-NEXT: v_mul_f32_e32 v35, 1.0, v48
-; GFX7-NEXT: v_alignbit_b32 v36, v33, v34, 16
-; GFX7-NEXT: s_waitcnt vmcnt(2)
-; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v50
-; GFX7-NEXT: v_lshrrev_b32_e32 v35, 16, v35
-; GFX7-NEXT: v_lshrrev_b32_e32 v33, 16, v33
-; GFX7-NEXT: s_waitcnt vmcnt(1)
-; GFX7-NEXT: v_mul_f32_e32 v34, 1.0, v51
-; GFX7-NEXT: v_alignbit_b32 v35, v35, v37, 16
-; GFX7-NEXT: v_alignbit_b32 v34, v33, v34, 16
-; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
-; GFX7-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:4
-; GFX7-NEXT: buffer_load_dword v38, off, s[0:3], s32
-; GFX7-NEXT: s_waitcnt vmcnt(2)
-; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33
-; GFX7-NEXT: v_lshrrev_b32_e32 v33, 16, v33
-; GFX7-NEXT: s_waitcnt vmcnt(1)
-; GFX7-NEXT: v_mul_f32_e32 v37, 1.0, v37
-; GFX7-NEXT: v_alignbit_b32 v33, v33, v37, 16
-; GFX7-NEXT: buffer_store_dwordx4 v[33:36], v[31:32], s[4:7], 0 addr64 offset:64
; GFX7-NEXT: s_nop 0
; GFX7-NEXT: v_lshrrev_b32_e32 v33, 16, v5
; GFX7-NEXT: v_alignbit_b32 v5, v7, v6, 16
; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v13
+; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v12
+; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX7-NEXT: v_alignbit_b32 v12, v6, v7, 16
+; GFX7-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:72
+; GFX7-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:68
+; GFX7-NEXT: buffer_load_dword v15, off, s[0:3], s32
; GFX7-NEXT: v_alignbit_b32 v13, v0, v1, 16
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v11
; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
@@ -1933,39 +1852,124 @@ define void @v_store_global_v64bf16(<64 x bfloat> %val, ptr addrspace(1) %ptr) {
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v8
; GFX7-NEXT: v_alignbit_b32 v10, v0, v1, 16
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v23
-; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6
-; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v12
; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v22
-; GFX7-NEXT: v_alignbit_b32 v12, v6, v7, 16
-; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v21
; GFX7-NEXT: v_alignbit_b32 v9, v0, v1, 16
+; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v21
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v20
+; GFX7-NEXT: v_alignbit_b32 v8, v0, v1, 16
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v19
-; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6
-; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v20
; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v18
-; GFX7-NEXT: v_alignbit_b32 v8, v6, v7, 16
; GFX7-NEXT: v_alignbit_b32 v7, v0, v1, 16
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v17
+; GFX7-NEXT: s_waitcnt vmcnt(9)
+; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v37
+; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v28
+; GFX7-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:64
; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v16
-; GFX7-NEXT: v_alignbit_b32 v6, v0, v1, 16
-; GFX7-NEXT: s_waitcnt vmcnt(1)
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v38
+; GFX7-NEXT: s_waitcnt vmcnt(9)
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v38
+; GFX7-NEXT: v_alignbit_b32 v4, v33, v4, 16
+; GFX7-NEXT: s_waitcnt vmcnt(8)
+; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v39
+; GFX7-NEXT: v_alignbit_b32 v36, v0, v1, 16
+; GFX7-NEXT: s_waitcnt vmcnt(6)
+; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v49
+; GFX7-NEXT: v_lshrrev_b32_e32 v18, 16, v18
+; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v48
; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: s_waitcnt vmcnt(5)
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v50
+; GFX7-NEXT: v_alignbit_b32 v35, v18, v19, 16
+; GFX7-NEXT: v_alignbit_b32 v34, v0, v1, 16
+; GFX7-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:32
+; GFX7-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:28
+; GFX7-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:24
+; GFX7-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:20
+; GFX7-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:16
+; GFX7-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:12
+; GFX7-NEXT: s_waitcnt vmcnt(8)
+; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14
+; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
+; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX7-NEXT: v_alignbit_b32 v33, v6, v14, 16
+; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v17
+; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v16
+; GFX7-NEXT: v_alignbit_b32 v6, v6, v14, 16
+; GFX7-NEXT: s_waitcnt vmcnt(7)
+; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v15
+; GFX7-NEXT: v_lshrrev_b32_e32 v14, 16, v14
+; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v30
+; GFX7-NEXT: buffer_store_dwordx4 v[33:36], v[31:32], s[4:7], 0 addr64 offset:96
+; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v29
+; GFX7-NEXT: v_alignbit_b32 v17, v14, v15, 16
+; GFX7-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:52
+; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v27
+; GFX7-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:48
+; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v26
+; GFX7-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:44
+; GFX7-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:8
+; GFX7-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:4
+; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:60
+; GFX7-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:56
+; GFX7-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:40
+; GFX7-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:36
+; GFX7-NEXT: v_lshrrev_b32_e32 v14, 16, v14
+; GFX7-NEXT: v_lshrrev_b32_e32 v16, 16, v16
+; GFX7-NEXT: v_alignbit_b32 v15, v14, v15, 16
+; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v25
+; GFX7-NEXT: v_alignbit_b32 v16, v16, v20, 16
+; GFX7-NEXT: v_lshrrev_b32_e32 v14, 16, v14
+; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v24
+; GFX7-NEXT: v_alignbit_b32 v14, v14, v20, 16
+; GFX7-NEXT: s_waitcnt vmcnt(14)
+; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GFX7-NEXT: v_alignbit_b32 v21, v0, v1, 16
+; GFX7-NEXT: s_waitcnt vmcnt(13)
+; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v18
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: s_waitcnt vmcnt(12)
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v19
+; GFX7-NEXT: v_alignbit_b32 v20, v0, v1, 16
+; GFX7-NEXT: s_waitcnt vmcnt(11)
+; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v22
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: s_waitcnt vmcnt(10)
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v23
+; GFX7-NEXT: v_alignbit_b32 v19, v0, v1, 16
+; GFX7-NEXT: s_waitcnt vmcnt(8)
+; GFX7-NEXT: v_mul_f32_e32 v23, 1.0, v35
+; GFX7-NEXT: s_waitcnt vmcnt(5)
+; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v29
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: s_waitcnt vmcnt(4)
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v30
-; GFX7-NEXT: v_alignbit_b32 v17, v0, v1, 16
+; GFX7-NEXT: v_alignbit_b32 v18, v0, v1, 16
+; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v28
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: s_waitcnt vmcnt(3)
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v33
+; GFX7-NEXT: s_waitcnt vmcnt(2)
+; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v34
+; GFX7-NEXT: v_alignbit_b32 v25, v0, v1, 16
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v27
+; GFX7-NEXT: v_lshrrev_b32_e32 v22, 16, v22
; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v26
-; GFX7-NEXT: v_alignbit_b32 v16, v14, v15, 16
-; GFX7-NEXT: v_alignbit_b32 v15, v0, v1, 16
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v25
+; GFX7-NEXT: v_alignbit_b32 v24, v22, v23, 16
+; GFX7-NEXT: v_alignbit_b32 v23, v0, v1, 16
+; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v36
; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v24
-; GFX7-NEXT: v_alignbit_b32 v14, v0, v1, 16
-; GFX7-NEXT: v_alignbit_b32 v4, v33, v4, 16
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v37
+; GFX7-NEXT: v_alignbit_b32 v22, v0, v1, 16
+; GFX7-NEXT: buffer_store_dwordx4 v[22:25], v[31:32], s[4:7], 0 addr64 offset:80
+; GFX7-NEXT: buffer_store_dwordx4 v[18:21], v[31:32], s[4:7], 0 addr64 offset:64
; GFX7-NEXT: buffer_store_dwordx4 v[14:17], v[31:32], s[4:7], 0 addr64 offset:48
; GFX7-NEXT: buffer_store_dwordx4 v[6:9], v[31:32], s[4:7], 0 addr64 offset:32
; GFX7-NEXT: buffer_store_dwordx4 v[10:13], v[31:32], s[4:7], 0 addr64 offset:16
@@ -3798,10 +3802,10 @@ define void @test_call(bfloat %in, ptr addrspace(5) %out) {
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_readlane_b32 s31, v2, 1
; GCN-NEXT: v_readlane_b32 s30, v2, 0
-; GCN-NEXT: s_mov_b32 s32, s33
; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload
; GCN-NEXT: s_mov_b64 exec, s[4:5]
+; GCN-NEXT: s_addk_i32 s32, 0xfc00
; GCN-NEXT: s_mov_b32 s33, s18
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
@@ -3829,10 +3833,10 @@ define void @test_call(bfloat %in, ptr addrspace(5) %out) {
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_readlane_b32 s31, v2, 1
; GFX7-NEXT: v_readlane_b32 s30, v2, 0
-; GFX7-NEXT: s_mov_b32 s32, s33
; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX7-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX7-NEXT: s_mov_b64 exec, s[4:5]
+; GFX7-NEXT: s_addk_i32 s32, 0xfc00
; GFX7-NEXT: s_mov_b32 s33, s18
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: s_setpc_b64 s[30:31]
@@ -3858,10 +3862,10 @@ define void @test_call(bfloat %in, ptr addrspace(5) %out) {
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_readlane_b32 s31, v2, 1
; GFX8-NEXT: v_readlane_b32 s30, v2, 0
-; GFX8-NEXT: s_mov_b32 s32, s33
; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX8-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX8-NEXT: s_mov_b64 exec, s[4:5]
+; GFX8-NEXT: s_addk_i32 s32, 0xfc00
; GFX8-NEXT: s_mov_b32 s33, s18
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_setpc_b64 s[30:31]
@@ -3887,10 +3891,10 @@ define void @test_call(bfloat %in, ptr addrspace(5) %out) {
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_readlane_b32 s31, v2, 1
; GFX9-NEXT: v_readlane_b32 s30, v2, 0
-; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[4:5]
+; GFX9-NEXT: s_addk_i32 s32, 0xfc00
; GFX9-NEXT: s_mov_b32 s33, s18
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -3917,11 +3921,11 @@ define void @test_call(bfloat %in, ptr addrspace(5) %out) {
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_readlane_b32 s31, v2, 1
; GFX10-NEXT: v_readlane_b32 s30, v2, 0
-; GFX10-NEXT: s_mov_b32 s32, s33
; GFX10-NEXT: s_xor_saveexec_b32 s4, -1
; GFX10-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s4
+; GFX10-NEXT: s_addk_i32 s32, 0xfe00
; GFX10-NEXT: s_mov_b32 s33, s18
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -3947,10 +3951,10 @@ define void @test_call(bfloat %in, ptr addrspace(5) %out) {
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: v_readlane_b32 s31, v2, 1
; GFX11-NEXT: v_readlane_b32 s30, v2, 0
-; GFX11-NEXT: s_mov_b32 s32, s33
; GFX11-NEXT: s_xor_saveexec_b32 s0, -1
; GFX11-NEXT: scratch_load_b32 v2, off, s33 ; 4-byte Folded Reload
; GFX11-NEXT: s_mov_b32 exec_lo, s0
+; GFX11-NEXT: s_add_i32 s32, s32, -16
; GFX11-NEXT: s_mov_b32 s33, s2
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -3990,10 +3994,10 @@ define void @test_call_v2bf16(<2 x bfloat> %in, ptr addrspace(5) %out) {
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_readlane_b32 s31, v4, 1
; GCN-NEXT: v_readlane_b32 s30, v4, 0
-; GCN-NEXT: s_mov_b32 s32, s33
; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s33 ; 4-byte Folded Reload
; GCN-NEXT: s_mov_b64 exec, s[4:5]
+; GCN-NEXT: s_addk_i32 s32, 0xfc00
; GCN-NEXT: s_mov_b32 s33, s18
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
@@ -4026,10 +4030,10 @@ define void @test_call_v2bf16(<2 x bfloat> %in, ptr addrspace(5) %out) {
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_readlane_b32 s31, v4, 1
; GFX7-NEXT: v_readlane_b32 s30, v4, 0
-; GFX7-NEXT: s_mov_b32 s32, s33
; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX7-NEXT: buffer_load_dword v4, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX7-NEXT: s_mov_b64 exec, s[4:5]
+; GFX7-NEXT: s_addk_i32 s32, 0xfc00
; GFX7-NEXT: s_mov_b32 s33, s18
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: s_setpc_b64 s[30:31]
@@ -4055,10 +4059,10 @@ define void @test_call_v2bf16(<2 x bfloat> %in, ptr addrspace(5) %out) {
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_readlane_b32 s31, v2, 1
; GFX8-NEXT: v_readlane_b32 s30, v2, 0
-; GFX8-NEXT: s_mov_b32 s32, s33
; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX8-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX8-NEXT: s_mov_b64 exec, s[4:5]
+; GFX8-NEXT: s_addk_i32 s32, 0xfc00
; GFX8-NEXT: s_mov_b32 s33, s18
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_setpc_b64 s[30:31]
@@ -4084,10 +4088,10 @@ define void @test_call_v2bf16(<2 x bfloat> %in, ptr addrspace(5) %out) {
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_readlane_b32 s31, v2, 1
; GFX9-NEXT: v_readlane_b32 s30, v2, 0
-; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[4:5]
+; GFX9-NEXT: s_addk_i32 s32, 0xfc00
; GFX9-NEXT: s_mov_b32 s33, s18
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -4114,11 +4118,11 @@ define void @test_call_v2bf16(<2 x bfloat> %in, ptr addrspace(5) %out) {
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_readlane_b32 s31, v2, 1
; GFX10-NEXT: v_readlane_b32 s30, v2, 0
-; GFX10-NEXT: s_mov_b32 s32, s33
; GFX10-NEXT: s_xor_saveexec_b32 s4, -1
; GFX10-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s4
+; GFX10-NEXT: s_addk_i32 s32, 0xfe00
; GFX10-NEXT: s_mov_b32 s33, s18
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -4144,10 +4148,10 @@ define void @test_call_v2bf16(<2 x bfloat> %in, ptr addrspace(5) %out) {
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: v_readlane_b32 s31, v2, 1
; GFX11-NEXT: v_readlane_b32 s30, v2, 0
-; GFX11-NEXT: s_mov_b32 s32, s33
; GFX11-NEXT: s_xor_saveexec_b32 s0, -1
; GFX11-NEXT: scratch_load_b32 v2, off, s33 ; 4-byte Folded Reload
; GFX11-NEXT: s_mov_b32 exec_lo, s0
+; GFX11-NEXT: s_add_i32 s32, s32, -16
; GFX11-NEXT: s_mov_b32 s33, s2
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -4189,10 +4193,10 @@ define void @test_call_v3bf16(<3 x bfloat> %in, ptr addrspace(5) %out) {
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_readlane_b32 s31, v5, 1
; GCN-NEXT: v_readlane_b32 s30, v5, 0
-; GCN-NEXT: s_mov_b32 s32, s33
; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s33 ; 4-byte Folded Reload
; GCN-NEXT: s_mov_b64 exec, s[4:5]
+; GCN-NEXT: s_addk_i32 s32, 0xfc00
; GCN-NEXT: s_mov_b32 s33, s18
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
@@ -4227,10 +4231,10 @@ define void @test_call_v3bf16(<3 x bfloat> %in, ptr addrspace(5) %out) {
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_readlane_b32 s31, v4, 1
; GFX7-NEXT: v_readlane_b32 s30, v4, 0
-; GFX7-NEXT: s_mov_b32 s32, s33
; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX7-NEXT: buffer_load_dword v4, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX7-NEXT: s_mov_b64 exec, s[4:5]
+; GFX7-NEXT: s_addk_i32 s32, 0xfc00
; GFX7-NEXT: s_mov_b32 s33, s18
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: s_setpc_b64 s[30:31]
@@ -4259,10 +4263,10 @@ define void @test_call_v3bf16(<3 x bfloat> %in, ptr addrspace(5) %out) {
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_readlane_b32 s31, v4, 1
; GFX8-NEXT: v_readlane_b32 s30, v4, 0
-; GFX8-NEXT: s_mov_b32 s32, s33
; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX8-NEXT: buffer_load_dword v4, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX8-NEXT: s_mov_b64 exec, s[4:5]
+; GFX8-NEXT: s_addk_i32 s32, 0xfc00
; GFX8-NEXT: s_mov_b32 s33, s18
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_setpc_b64 s[30:31]
@@ -4290,10 +4294,10 @@ define void @test_call_v3bf16(<3 x bfloat> %in, ptr addrspace(5) %out) {
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_readlane_b32 s31, v3, 1
; GFX9-NEXT: v_readlane_b32 s30, v3, 0
-; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[4:5]
+; GFX9-NEXT: s_addk_i32 s32, 0xfc00
; GFX9-NEXT: s_mov_b32 s33, s18
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -4322,11 +4326,11 @@ define void @test_call_v3bf16(<3 x bfloat> %in, ptr addrspace(5) %out) {
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_readlane_b32 s31, v3, 1
; GFX10-NEXT: v_readlane_b32 s30, v3, 0
-; GFX10-NEXT: s_mov_b32 s32, s33
; GFX10-NEXT: s_xor_saveexec_b32 s4, -1
; GFX10-NEXT: buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s4
+; GFX10-NEXT: s_addk_i32 s32, 0xfe00
; GFX10-NEXT: s_mov_b32 s33, s18
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -4354,10 +4358,10 @@ define void @test_call_v3bf16(<3 x bfloat> %in, ptr addrspace(5) %out) {
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: v_readlane_b32 s31, v3, 1
; GFX11-NEXT: v_readlane_b32 s30, v3, 0
-; GFX11-NEXT: s_mov_b32 s32, s33
; GFX11-NEXT: s_xor_saveexec_b32 s0, -1
; GFX11-NEXT: scratch_load_b32 v3, off, s33 ; 4-byte Folded Reload
; GFX11-NEXT: s_mov_b32 exec_lo, s0
+; GFX11-NEXT: s_add_i32 s32, s32, -16
; GFX11-NEXT: s_mov_b32 s33, s2
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -4407,10 +4411,10 @@ define void @test_call_v4bf16(<4 x bfloat> %in, ptr addrspace(5) %out) {
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_readlane_b32 s31, v8, 1
; GCN-NEXT: v_readlane_b32 s30, v8, 0
-; GCN-NEXT: s_mov_b32 s32, s33
; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s33 ; 4-byte Folded Reload
; GCN-NEXT: s_mov_b64 exec, s[4:5]
+; GCN-NEXT: s_addk_i32 s32, 0xfc00
; GCN-NEXT: s_mov_b32 s33, s18
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
@@ -4453,10 +4457,10 @@ define void @test_call_v4bf16(<4 x bfloat> %in, ptr addrspace(5) %out) {
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_readlane_b32 s31, v6, 1
; GFX7-NEXT: v_readlane_b32 s30, v6, 0
-; GFX7-NEXT: s_mov_b32 s32, s33
; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX7-NEXT: buffer_load_dword v6, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX7-NEXT: s_mov_b64 exec, s[4:5]
+; GFX7-NEXT: s_addk_i32 s32, 0xfc00
; GFX7-NEXT: s_mov_b32 s33, s18
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: s_setpc_b64 s[30:31]
@@ -4485,10 +4489,10 @@ define void @test_call_v4bf16(<4 x bfloat> %in, ptr addrspace(5) %out) {
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_readlane_b32 s31, v4, 1
; GFX8-NEXT: v_readlane_b32 s30, v4, 0
-; GFX8-NEXT: s_mov_b32 s32, s33
; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX8-NEXT: buffer_load_dword v4, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX8-NEXT: s_mov_b64 exec, s[4:5]
+; GFX8-NEXT: s_addk_i32 s32, 0xfc00
; GFX8-NEXT: s_mov_b32 s33, s18
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_setpc_b64 s[30:31]
@@ -4516,10 +4520,10 @@ define void @test_call_v4bf16(<4 x bfloat> %in, ptr addrspace(5) %out) {
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_readlane_b32 s31, v3, 1
; GFX9-NEXT: v_readlane_b32 s30, v3, 0
-; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[4:5]
+; GFX9-NEXT: s_addk_i32 s32, 0xfc00
; GFX9-NEXT: s_mov_b32 s33, s18
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -4548,11 +4552,11 @@ define void @test_call_v4bf16(<4 x bfloat> %in, ptr addrspace(5) %out) {
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_readlane_b32 s31, v3, 1
; GFX10-NEXT: v_readlane_b32 s30, v3, 0
-; GFX10-NEXT: s_mov_b32 s32, s33
; GFX10-NEXT: s_xor_saveexec_b32 s4, -1
; GFX10-NEXT: buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s4
+; GFX10-NEXT: s_addk_i32 s32, 0xfe00
; GFX10-NEXT: s_mov_b32 s33, s18
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -4578,10 +4582,10 @@ define void @test_call_v4bf16(<4 x bfloat> %in, ptr addrspace(5) %out) {
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: v_readlane_b32 s31, v3, 1
; GFX11-NEXT: v_readlane_b32 s30, v3, 0
-; GFX11-NEXT: s_mov_b32 s32, s33
; GFX11-NEXT: s_xor_saveexec_b32 s0, -1
; GFX11-NEXT: scratch_load_b32 v3, off, s33 ; 4-byte Folded Reload
; GFX11-NEXT: s_mov_b32 exec_lo, s0
+; GFX11-NEXT: s_add_i32 s32, s32, -16
; GFX11-NEXT: s_mov_b32 s33, s2
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -4651,10 +4655,10 @@ define void @test_call_v8bf16(<8 x bfloat> %in, ptr addrspace(5) %out) {
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_readlane_b32 s31, v16, 1
; GCN-NEXT: v_readlane_b32 s30, v16, 0
-; GCN-NEXT: s_mov_b32 s32, s33
; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s33 ; 4-byte Folded Reload
; GCN-NEXT: s_mov_b64 exec, s[4:5]
+; GCN-NEXT: s_addk_i32 s32, 0xfc00
; GCN-NEXT: s_mov_b32 s33, s18
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
@@ -4717,10 +4721,10 @@ define void @test_call_v8bf16(<8 x bfloat> %in, ptr addrspace(5) %out) {
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_readlane_b32 s31, v10, 1
; GFX7-NEXT: v_readlane_b32 s30, v10, 0
-; GFX7-NEXT: s_mov_b32 s32, s33
; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX7-NEXT: buffer_load_dword v10, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX7-NEXT: s_mov_b64 exec, s[4:5]
+; GFX7-NEXT: s_addk_i32 s32, 0xfc00
; GFX7-NEXT: s_mov_b32 s33, s18
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: s_setpc_b64 s[30:31]
@@ -4755,10 +4759,10 @@ define void @test_call_v8bf16(<8 x bfloat> %in, ptr addrspace(5) %out) {
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_readlane_b32 s31, v6, 1
; GFX8-NEXT: v_readlane_b32 s30, v6, 0
-; GFX8-NEXT: s_mov_b32 s32, s33
; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX8-NEXT: buffer_load_dword v6, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX8-NEXT: s_mov_b64 exec, s[4:5]
+; GFX8-NEXT: s_addk_i32 s32, 0xfc00
; GFX8-NEXT: s_mov_b32 s33, s18
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_setpc_b64 s[30:31]
@@ -4790,10 +4794,10 @@ define void @test_call_v8bf16(<8 x bfloat> %in, ptr addrspace(5) %out) {
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_readlane_b32 s31, v5, 1
; GFX9-NEXT: v_readlane_b32 s30, v5, 0
-; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[4:5]
+; GFX9-NEXT: s_addk_i32 s32, 0xfc00
; GFX9-NEXT: s_mov_b32 s33, s18
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -4826,11 +4830,11 @@ define void @test_call_v8bf16(<8 x bfloat> %in, ptr addrspace(5) %out) {
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_readlane_b32 s31, v5, 1
; GFX10-NEXT: v_readlane_b32 s30, v5, 0
-; GFX10-NEXT: s_mov_b32 s32, s33
; GFX10-NEXT: s_xor_saveexec_b32 s4, -1
; GFX10-NEXT: buffer_load_dword v5, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s4
+; GFX10-NEXT: s_addk_i32 s32, 0xfe00
; GFX10-NEXT: s_mov_b32 s33, s18
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -4856,10 +4860,10 @@ define void @test_call_v8bf16(<8 x bfloat> %in, ptr addrspace(5) %out) {
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: v_readlane_b32 s31, v5, 1
; GFX11-NEXT: v_readlane_b32 s30, v5, 0
-; GFX11-NEXT: s_mov_b32 s32, s33
; GFX11-NEXT: s_xor_saveexec_b32 s0, -1
; GFX11-NEXT: scratch_load_b32 v5, off, s33 ; 4-byte Folded Reload
; GFX11-NEXT: s_mov_b32 exec_lo, s0
+; GFX11-NEXT: s_add_i32 s32, s32, -16
; GFX11-NEXT: s_mov_b32 s33, s2
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -4876,12 +4880,12 @@ define void @test_call_v16bf16(<16 x bfloat> %in, ptr addrspace(5) %out) {
; GCN-NEXT: s_mov_b32 s18, s33
; GCN-NEXT: s_mov_b32 s33, s32
; GCN-NEXT: s_xor_saveexec_b64 s[16:17], -1
-; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s33 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s33 ; 4-byte Folded Spill
; GCN-NEXT: s_mov_b64 exec, s[16:17]
; GCN-NEXT: s_addk_i32 s32, 0x400
; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_writelane_b32 v20, s30, 0
-; GCN-NEXT: v_writelane_b32 v20, s31, 1
+; GCN-NEXT: v_writelane_b32 v21, s30, 0
+; GCN-NEXT: v_writelane_b32 v21, s31, 1
; GCN-NEXT: s_getpc_b64 s[16:17]
; GCN-NEXT: s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4
; GCN-NEXT: s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12
@@ -4907,36 +4911,36 @@ define void @test_call_v16bf16(<16 x bfloat> %in, ptr addrspace(5) %out) {
; GCN-NEXT: v_add_i32_e32 v17, vcc, 30, v16
; GCN-NEXT: v_add_i32_e32 v18, vcc, 28, v16
; GCN-NEXT: v_add_i32_e32 v19, vcc, 26, v16
+; GCN-NEXT: v_add_i32_e32 v20, vcc, 24, v16
; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v15
; GCN-NEXT: buffer_store_short v15, v17, s[0:3], 0 offen
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
-; GCN-NEXT: v_add_i32_e32 v15, vcc, 24, v16
-; GCN-NEXT: v_add_i32_e32 v17, vcc, 22, v16
+; GCN-NEXT: v_add_i32_e32 v15, vcc, 22, v16
+; GCN-NEXT: v_add_i32_e32 v17, vcc, 20, v16
; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v14
; GCN-NEXT: buffer_store_short v14, v18, s[0:3], 0 offen
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
-; GCN-NEXT: v_add_i32_e32 v14, vcc, 20, v16
-; GCN-NEXT: v_add_i32_e32 v18, vcc, 18, v16
+; GCN-NEXT: v_add_i32_e32 v14, vcc, 18, v16
+; GCN-NEXT: v_add_i32_e32 v18, vcc, 16, v16
; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v13
; GCN-NEXT: buffer_store_short v13, v19, s[0:3], 0 offen
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
-; GCN-NEXT: v_add_i32_e32 v13, vcc, 16, v16
-; GCN-NEXT: v_add_i32_e32 v19, vcc, 14, v16
+; GCN-NEXT: v_add_i32_e32 v13, vcc, 14, v16
+; GCN-NEXT: v_add_i32_e32 v19, vcc, 12, v16
; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v12
-; GCN-NEXT: buffer_store_short v12, v15, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_short v12, v20, s[0:3], 0 offen
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
-; GCN-NEXT: v_add_i32_e32 v12, vcc, 12, v16
-; GCN-NEXT: v_add_i32_e32 v15, vcc, 10, v16
+; GCN-NEXT: v_add_i32_e32 v12, vcc, 10, v16
+; GCN-NEXT: v_add_i32_e32 v20, vcc, 8, v16
; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v11
-; GCN-NEXT: buffer_store_short v11, v17, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_short v11, v15, s[0:3], 0 offen
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
-; GCN-NEXT: v_add_i32_e32 v11, vcc, 8, v16
-; GCN-NEXT: v_add_i32_e32 v17, vcc, 6, v16
+; GCN-NEXT: v_add_i32_e32 v11, vcc, 6, v16
+; GCN-NEXT: v_add_i32_e32 v15, vcc, 4, v16
; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v10
-; GCN-NEXT: buffer_store_short v10, v14, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_short v10, v17, s[0:3], 0 offen
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
-; GCN-NEXT: v_add_i32_e32 v10, vcc, 4, v16
-; GCN-NEXT: v_add_i32_e32 v14, vcc, 2, v16
+; GCN-NEXT: v_add_i32_e32 v10, vcc, 2, v16
; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2
@@ -4947,32 +4951,32 @@ define void @test_call_v16bf16(<16 x bfloat> %in, ptr addrspace(5) %out) {
; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7
; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8
; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9
-; GCN-NEXT: buffer_store_short v9, v18, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_short v9, v14, s[0:3], 0 offen
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: buffer_store_short v8, v13, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_short v8, v18, s[0:3], 0 offen
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: buffer_store_short v7, v19, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_short v7, v13, s[0:3], 0 offen
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: buffer_store_short v6, v12, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_short v6, v19, s[0:3], 0 offen
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: buffer_store_short v5, v15, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_short v5, v12, s[0:3], 0 offen
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: buffer_store_short v4, v11, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_short v4, v20, s[0:3], 0 offen
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: buffer_store_short v3, v17, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_short v3, v11, s[0:3], 0 offen
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: buffer_store_short v2, v10, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_short v2, v15, s[0:3], 0 offen
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: buffer_store_short v1, v14, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_short v1, v10, s[0:3], 0 offen
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_short v0, v16, s[0:3], 0 offen
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_readlane_b32 s31, v20, 1
-; GCN-NEXT: v_readlane_b32 s30, v20, 0
-; GCN-NEXT: s_mov_b32 s32, s33
+; GCN-NEXT: v_readlane_b32 s31, v21, 1
+; GCN-NEXT: v_readlane_b32 s30, v21, 0
; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1
-; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s33 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s33 ; 4-byte Folded Reload
; GCN-NEXT: s_mov_b64 exec, s[4:5]
+; GCN-NEXT: s_addk_i32 s32, 0xfc00
; GCN-NEXT: s_mov_b32 s33, s18
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
@@ -5075,10 +5079,10 @@ define void @test_call_v16bf16(<16 x bfloat> %in, ptr addrspace(5) %out) {
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_readlane_b32 s31, v18, 1
; GFX7-NEXT: v_readlane_b32 s30, v18, 0
-; GFX7-NEXT: s_mov_b32 s32, s33
; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX7-NEXT: buffer_load_dword v18, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX7-NEXT: s_mov_b64 exec, s[4:5]
+; GFX7-NEXT: s_addk_i32 s32, 0xfc00
; GFX7-NEXT: s_mov_b32 s33, s18
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: s_setpc_b64 s[30:31]
@@ -5125,10 +5129,10 @@ define void @test_call_v16bf16(<16 x bfloat> %in, ptr addrspace(5) %out) {
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_readlane_b32 s31, v10, 1
; GFX8-NEXT: v_readlane_b32 s30, v10, 0
-; GFX8-NEXT: s_mov_b32 s32, s33
; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX8-NEXT: buffer_load_dword v10, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX8-NEXT: s_mov_b64 exec, s[4:5]
+; GFX8-NEXT: s_addk_i32 s32, 0xfc00
; GFX8-NEXT: s_mov_b32 s33, s18
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_setpc_b64 s[30:31]
@@ -5168,10 +5172,10 @@ define void @test_call_v16bf16(<16 x bfloat> %in, ptr addrspace(5) %out) {
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_readlane_b32 s31, v9, 1
; GFX9-NEXT: v_readlane_b32 s30, v9, 0
-; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[4:5]
+; GFX9-NEXT: s_addk_i32 s32, 0xfc00
; GFX9-NEXT: s_mov_b32 s33, s18
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -5212,11 +5216,11 @@ define void @test_call_v16bf16(<16 x bfloat> %in, ptr addrspace(5) %out) {
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_readlane_b32 s31, v9, 1
; GFX10-NEXT: v_readlane_b32 s30, v9, 0
-; GFX10-NEXT: s_mov_b32 s32, s33
; GFX10-NEXT: s_xor_saveexec_b32 s4, -1
; GFX10-NEXT: buffer_load_dword v9, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s4
+; GFX10-NEXT: s_addk_i32 s32, 0xfe00
; GFX10-NEXT: s_mov_b32 s33, s18
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -5244,10 +5248,10 @@ define void @test_call_v16bf16(<16 x bfloat> %in, ptr addrspace(5) %out) {
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: v_readlane_b32 s31, v9, 1
; GFX11-NEXT: v_readlane_b32 s30, v9, 0
-; GFX11-NEXT: s_mov_b32 s32, s33
; GFX11-NEXT: s_xor_saveexec_b32 s0, -1
; GFX11-NEXT: scratch_load_b32 v9, off, s33 ; 4-byte Folded Reload
; GFX11-NEXT: s_mov_b32 exec_lo, s0
+; GFX11-NEXT: s_add_i32 s32, s32, -16
; GFX11-NEXT: s_mov_b32 s33, s2
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -5361,10 +5365,10 @@ define { <32 x i32>, bfloat } @test_overflow_stack(bfloat %a, <32 x i32> %b) {
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_add_i32_e32 v27, vcc, 0x50, v0
; GCN-NEXT: v_add_i32_e32 v30, vcc, 0x4c, v0
+; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GCN-NEXT: buffer_store_dword v26, v29, s[0:3], 0 offen
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_add_i32_e32 v26, vcc, 0x48, v0
-; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GCN-NEXT: v_add_i32_e32 v29, vcc, 0x44, v0
; GCN-NEXT: buffer_store_dword v25, v31, s[0:3], 0 offen
; GCN-NEXT: s_waitcnt expcnt(0)
@@ -5583,20 +5587,20 @@ define { <32 x i32>, bfloat } @test_overflow_stack(bfloat %a, <32 x i32> %b) {
; GFX9-NEXT: buffer_store_dword v28, v0, s[0:3], 0 offen offset:104
; GFX9-NEXT: buffer_store_dword v27, v0, s[0:3], 0 offen offset:100
; GFX9-NEXT: buffer_store_dword v26, v0, s[0:3], 0 offen offset:96
-; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:4
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:8
-; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v25, v0, s[0:3], 0 offen offset:92
-; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32
-; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v24, v0, s[0:3], 0 offen offset:88
; GFX9-NEXT: buffer_store_dword v23, v0, s[0:3], 0 offen offset:84
; GFX9-NEXT: buffer_store_dword v22, v0, s[0:3], 0 offen offset:80
; GFX9-NEXT: buffer_store_dword v21, v0, s[0:3], 0 offen offset:76
; GFX9-NEXT: buffer_store_dword v20, v0, s[0:3], 0 offen offset:72
+; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:4
+; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen offset:68
+; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:8
+; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v18, v0, s[0:3], 0 offen offset:64
+; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32
+; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v17, v0, s[0:3], 0 offen offset:60
; GFX9-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen offset:56
; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:52
@@ -5613,11 +5617,11 @@ define { <32 x i32>, bfloat } @test_overflow_stack(bfloat %a, <32 x i32> %b) {
; GFX9-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:8
; GFX9-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4
; GFX9-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
-; GFX9-NEXT: s_waitcnt vmcnt(25)
-; GFX9-NEXT: buffer_store_dword v27, v0, s[0:3], 0 offen offset:124
-; GFX9-NEXT: buffer_store_dword v26, v0, s[0:3], 0 offen offset:120
-; GFX9-NEXT: s_waitcnt vmcnt(25)
-; GFX9-NEXT: buffer_store_dword v25, v0, s[0:3], 0 offen offset:116
+; GFX9-NEXT: s_waitcnt vmcnt(18)
+; GFX9-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen offset:124
+; GFX9-NEXT: buffer_store_dword v20, v0, s[0:3], 0 offen offset:120
+; GFX9-NEXT: s_waitcnt vmcnt(18)
+; GFX9-NEXT: buffer_store_dword v18, v0, s[0:3], 0 offen offset:116
; GFX9-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen offset:128
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -7614,197 +7618,197 @@ define <32 x double> @global_extload_v32bf16_to_v32f64(ptr addrspace(1) %ptr) {
; GCN-NEXT: buffer_load_ushort v16, v[1:2], s[4:7], 0 addr64 offset:26
; GCN-NEXT: buffer_load_ushort v17, v[1:2], s[4:7], 0 addr64 offset:28
; GCN-NEXT: buffer_load_ushort v18, v[1:2], s[4:7], 0 addr64 offset:30
-; GCN-NEXT: buffer_load_ushort v23, v[1:2], s[4:7], 0 addr64 offset:48
-; GCN-NEXT: buffer_load_ushort v24, v[1:2], s[4:7], 0 addr64 offset:50
-; GCN-NEXT: buffer_load_ushort v25, v[1:2], s[4:7], 0 addr64 offset:52
-; GCN-NEXT: buffer_load_ushort v26, v[1:2], s[4:7], 0 addr64 offset:54
-; GCN-NEXT: buffer_load_ushort v27, v[1:2], s[4:7], 0 addr64 offset:56
-; GCN-NEXT: buffer_load_ushort v28, v[1:2], s[4:7], 0 addr64 offset:58
-; GCN-NEXT: buffer_load_ushort v29, v[1:2], s[4:7], 0 addr64 offset:60
-; GCN-NEXT: buffer_load_ushort v30, v[1:2], s[4:7], 0 addr64 offset:62
+; GCN-NEXT: buffer_load_ushort v25, v[1:2], s[4:7], 0 addr64 offset:48
+; GCN-NEXT: buffer_load_ushort v26, v[1:2], s[4:7], 0 addr64 offset:50
+; GCN-NEXT: buffer_load_ushort v27, v[1:2], s[4:7], 0 addr64 offset:52
+; GCN-NEXT: buffer_load_ushort v28, v[1:2], s[4:7], 0 addr64 offset:54
+; GCN-NEXT: buffer_load_ushort v29, v[1:2], s[4:7], 0 addr64 offset:56
+; GCN-NEXT: buffer_load_ushort v30, v[1:2], s[4:7], 0 addr64 offset:58
+; GCN-NEXT: buffer_load_ushort v31, v[1:2], s[4:7], 0 addr64 offset:60
+; GCN-NEXT: buffer_load_ushort v32, v[1:2], s[4:7], 0 addr64 offset:62
; GCN-NEXT: buffer_load_ushort v19, v[1:2], s[4:7], 0 addr64 offset:32
; GCN-NEXT: buffer_load_ushort v20, v[1:2], s[4:7], 0 addr64 offset:34
; GCN-NEXT: buffer_load_ushort v21, v[1:2], s[4:7], 0 addr64 offset:36
; GCN-NEXT: buffer_load_ushort v22, v[1:2], s[4:7], 0 addr64 offset:38
-; GCN-NEXT: buffer_load_ushort v31, v[1:2], s[4:7], 0 addr64 offset:40
-; GCN-NEXT: buffer_load_ushort v32, v[1:2], s[4:7], 0 addr64 offset:42
+; GCN-NEXT: buffer_load_ushort v23, v[1:2], s[4:7], 0 addr64 offset:40
+; GCN-NEXT: buffer_load_ushort v24, v[1:2], s[4:7], 0 addr64 offset:42
; GCN-NEXT: buffer_load_ushort v33, v[1:2], s[4:7], 0 addr64 offset:44
; GCN-NEXT: buffer_load_ushort v34, v[1:2], s[4:7], 0 addr64 offset:46
; GCN-NEXT: s_waitcnt vmcnt(8)
-; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v30
-; GCN-NEXT: v_add_i32_e32 v30, vcc, 0xfc, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v32
+; GCN-NEXT: v_add_i32_e32 v32, vcc, 0xfc, v0
; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1
-; GCN-NEXT: buffer_store_dword v2, v30, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v2, v32, s[0:3], 0 offen
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_add_i32_e32 v2, vcc, 0xf8, v0
; GCN-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v29
-; GCN-NEXT: v_add_i32_e32 v29, vcc, 0xf4, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v31
+; GCN-NEXT: v_add_i32_e32 v31, vcc, 0xf4, v0
; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1
-; GCN-NEXT: buffer_store_dword v2, v29, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v2, v31, s[0:3], 0 offen
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_add_i32_e32 v2, vcc, 0xf0, v0
; GCN-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; GCN-NEXT: v_add_i32_e32 v29, vcc, 0xec, v0
+; GCN-NEXT: v_add_i32_e32 v31, vcc, 0xec, v0
; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v28
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v30
; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1
-; GCN-NEXT: buffer_store_dword v2, v29, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v2, v31, s[0:3], 0 offen
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_add_i32_e32 v2, vcc, 0xe8, v0
; GCN-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; GCN-NEXT: v_add_i32_e32 v28, vcc, 0xe4, v0
+; GCN-NEXT: v_add_i32_e32 v30, vcc, 0xe4, v0
; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v27
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v29
; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1
-; GCN-NEXT: buffer_store_dword v2, v28, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v2, v30, s[0:3], 0 offen
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_add_i32_e32 v2, vcc, 0xe0, v0
; GCN-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; GCN-NEXT: v_add_i32_e32 v27, vcc, 0xdc, v0
-; GCN-NEXT: v_add_i32_e32 v28, vcc, 0xd8, v0
+; GCN-NEXT: v_add_i32_e32 v29, vcc, 0xdc, v0
+; GCN-NEXT: v_add_i32_e32 v30, vcc, 0xd8, v0
; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v26
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v28
; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1
-; GCN-NEXT: buffer_store_dword v2, v27, s[0:3], 0 offen
-; GCN-NEXT: v_add_i32_e32 v26, vcc, 0xd4, v0
-; GCN-NEXT: buffer_store_dword v1, v28, s[0:3], 0 offen
-; GCN-NEXT: v_add_i32_e32 v27, vcc, 0xd0, v0
-; GCN-NEXT: v_add_i32_e32 v28, vcc, 0xcc, v0
+; GCN-NEXT: buffer_store_dword v2, v29, s[0:3], 0 offen
+; GCN-NEXT: v_add_i32_e32 v28, vcc, 0xd4, v0
+; GCN-NEXT: buffer_store_dword v1, v30, s[0:3], 0 offen
+; GCN-NEXT: v_add_i32_e32 v29, vcc, 0xd0, v0
+; GCN-NEXT: v_add_i32_e32 v30, vcc, 0xcc, v0
; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v25
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v27
; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1
-; GCN-NEXT: buffer_store_dword v2, v26, s[0:3], 0 offen
-; GCN-NEXT: v_add_i32_e32 v25, vcc, 0xc8, v0
+; GCN-NEXT: buffer_store_dword v2, v28, s[0:3], 0 offen
+; GCN-NEXT: v_add_i32_e32 v27, vcc, 0xc8, v0
+; GCN-NEXT: buffer_store_dword v1, v29, s[0:3], 0 offen
+; GCN-NEXT: v_add_i32_e32 v28, vcc, 0xc4, v0
+; GCN-NEXT: v_add_i32_e32 v29, vcc, 0xc0, v0
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v26
+; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1
+; GCN-NEXT: buffer_store_dword v2, v30, s[0:3], 0 offen
+; GCN-NEXT: v_add_i32_e32 v26, vcc, 0xbc, v0
; GCN-NEXT: buffer_store_dword v1, v27, s[0:3], 0 offen
-; GCN-NEXT: v_add_i32_e32 v26, vcc, 0xc4, v0
-; GCN-NEXT: v_add_i32_e32 v27, vcc, 0xc0, v0
+; GCN-NEXT: v_add_i32_e32 v27, vcc, 0xb8, v0
+; GCN-NEXT: v_add_i32_e32 v30, vcc, 0xb4, v0
; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v24
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v25
; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1
; GCN-NEXT: buffer_store_dword v2, v28, s[0:3], 0 offen
-; GCN-NEXT: v_add_i32_e32 v24, vcc, 0xbc, v0
-; GCN-NEXT: buffer_store_dword v1, v25, s[0:3], 0 offen
-; GCN-NEXT: v_add_i32_e32 v25, vcc, 0xb8, v0
-; GCN-NEXT: v_add_i32_e32 v28, vcc, 0xb4, v0
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v23
+; GCN-NEXT: v_add_i32_e32 v25, vcc, 0xb0, v0
+; GCN-NEXT: buffer_store_dword v1, v29, s[0:3], 0 offen
+; GCN-NEXT: v_add_i32_e32 v28, vcc, 0xac, v0
+; GCN-NEXT: v_add_i32_e32 v29, vcc, 0xa8, v0
+; GCN-NEXT: s_waitcnt vmcnt(14) expcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v34
; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1
; GCN-NEXT: buffer_store_dword v2, v26, s[0:3], 0 offen
-; GCN-NEXT: v_add_i32_e32 v23, vcc, 0xb0, v0
+; GCN-NEXT: v_add_i32_e32 v26, vcc, 0xa4, v0
; GCN-NEXT: buffer_store_dword v1, v27, s[0:3], 0 offen
-; GCN-NEXT: v_add_i32_e32 v26, vcc, 0xac, v0
-; GCN-NEXT: v_add_i32_e32 v27, vcc, 0xa8, v0
-; GCN-NEXT: s_waitcnt vmcnt(14) expcnt(0)
-; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v34
+; GCN-NEXT: v_add_i32_e32 v27, vcc, 0xa0, v0
+; GCN-NEXT: v_add_i32_e32 v31, vcc, 0x9c, v0
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v33
; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1
-; GCN-NEXT: buffer_store_dword v2, v24, s[0:3], 0 offen
-; GCN-NEXT: v_add_i32_e32 v24, vcc, 0xa4, v0
+; GCN-NEXT: buffer_store_dword v2, v30, s[0:3], 0 offen
+; GCN-NEXT: v_add_i32_e32 v30, vcc, 0x98, v0
; GCN-NEXT: buffer_store_dword v1, v25, s[0:3], 0 offen
-; GCN-NEXT: v_add_i32_e32 v25, vcc, 0xa0, v0
-; GCN-NEXT: v_add_i32_e32 v29, vcc, 0x9c, v0
+; GCN-NEXT: v_add_i32_e32 v25, vcc, 0x94, v0
+; GCN-NEXT: v_add_i32_e32 v32, vcc, 0x90, v0
; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v33
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v24
; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1
; GCN-NEXT: buffer_store_dword v2, v28, s[0:3], 0 offen
-; GCN-NEXT: v_add_i32_e32 v28, vcc, 0x98, v0
-; GCN-NEXT: buffer_store_dword v1, v23, s[0:3], 0 offen
-; GCN-NEXT: v_add_i32_e32 v23, vcc, 0x94, v0
-; GCN-NEXT: v_add_i32_e32 v30, vcc, 0x90, v0
+; GCN-NEXT: v_add_i32_e32 v24, vcc, 0x8c, v0
+; GCN-NEXT: buffer_store_dword v1, v29, s[0:3], 0 offen
+; GCN-NEXT: v_add_i32_e32 v28, vcc, 0x88, v0
+; GCN-NEXT: v_add_i32_e32 v29, vcc, 0x84, v0
; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v32
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v23
; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1
; GCN-NEXT: buffer_store_dword v2, v26, s[0:3], 0 offen
-; GCN-NEXT: v_add_i32_e32 v26, vcc, 0x8c, v0
+; GCN-NEXT: v_add_i32_e32 v23, vcc, 0x80, v0
; GCN-NEXT: buffer_store_dword v1, v27, s[0:3], 0 offen
-; GCN-NEXT: v_add_i32_e32 v27, vcc, 0x88, v0
-; GCN-NEXT: v_add_i32_e32 v32, vcc, 0x84, v0
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v31
-; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1
-; GCN-NEXT: buffer_store_dword v2, v24, s[0:3], 0 offen
-; GCN-NEXT: v_add_i32_e32 v24, vcc, 0x80, v0
-; GCN-NEXT: buffer_store_dword v1, v25, s[0:3], 0 offen
-; GCN-NEXT: v_add_i32_e32 v25, vcc, 0x7c, v0
-; GCN-NEXT: v_add_i32_e32 v31, vcc, 0x78, v0
+; GCN-NEXT: v_add_i32_e32 v26, vcc, 0x7c, v0
+; GCN-NEXT: v_add_i32_e32 v27, vcc, 0x78, v0
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v22
; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1
-; GCN-NEXT: buffer_store_dword v2, v29, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v2, v31, s[0:3], 0 offen
; GCN-NEXT: v_add_i32_e32 v22, vcc, 0x74, v0
-; GCN-NEXT: buffer_store_dword v1, v28, s[0:3], 0 offen
-; GCN-NEXT: v_add_i32_e32 v28, vcc, 0x70, v0
-; GCN-NEXT: v_add_i32_e32 v29, vcc, 0x6c, v0
+; GCN-NEXT: buffer_store_dword v1, v30, s[0:3], 0 offen
+; GCN-NEXT: v_add_i32_e32 v30, vcc, 0x70, v0
+; GCN-NEXT: v_add_i32_e32 v31, vcc, 0x6c, v0
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v21
; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1
-; GCN-NEXT: buffer_store_dword v2, v23, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v2, v25, s[0:3], 0 offen
; GCN-NEXT: v_add_i32_e32 v21, vcc, 0x68, v0
-; GCN-NEXT: buffer_store_dword v1, v30, s[0:3], 0 offen
-; GCN-NEXT: v_add_i32_e32 v23, vcc, 0x64, v0
-; GCN-NEXT: v_add_i32_e32 v30, vcc, 0x60, v0
+; GCN-NEXT: buffer_store_dword v1, v32, s[0:3], 0 offen
+; GCN-NEXT: v_add_i32_e32 v25, vcc, 0x64, v0
+; GCN-NEXT: v_add_i32_e32 v32, vcc, 0x60, v0
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v20
; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1
-; GCN-NEXT: buffer_store_dword v2, v26, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v2, v24, s[0:3], 0 offen
; GCN-NEXT: v_add_i32_e32 v20, vcc, 0x5c, v0
-; GCN-NEXT: buffer_store_dword v1, v27, s[0:3], 0 offen
-; GCN-NEXT: v_add_i32_e32 v26, vcc, 0x58, v0
-; GCN-NEXT: v_add_i32_e32 v27, vcc, 0x54, v0
+; GCN-NEXT: buffer_store_dword v1, v28, s[0:3], 0 offen
+; GCN-NEXT: v_add_i32_e32 v24, vcc, 0x58, v0
+; GCN-NEXT: v_add_i32_e32 v28, vcc, 0x54, v0
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v19
; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1
-; GCN-NEXT: buffer_store_dword v2, v32, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v2, v29, s[0:3], 0 offen
; GCN-NEXT: v_add_i32_e32 v19, vcc, 0x50, v0
-; GCN-NEXT: buffer_store_dword v1, v24, s[0:3], 0 offen
-; GCN-NEXT: v_add_i32_e32 v24, vcc, 0x4c, v0
-; GCN-NEXT: v_add_i32_e32 v32, vcc, 0x48, v0
+; GCN-NEXT: buffer_store_dword v1, v23, s[0:3], 0 offen
+; GCN-NEXT: v_add_i32_e32 v23, vcc, 0x4c, v0
+; GCN-NEXT: v_add_i32_e32 v29, vcc, 0x48, v0
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v18
; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1
-; GCN-NEXT: buffer_store_dword v2, v25, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v2, v26, s[0:3], 0 offen
; GCN-NEXT: v_add_i32_e32 v18, vcc, 0x44, v0
-; GCN-NEXT: buffer_store_dword v1, v31, s[0:3], 0 offen
-; GCN-NEXT: v_add_i32_e32 v25, vcc, 64, v0
-; GCN-NEXT: v_add_i32_e32 v31, vcc, 60, v0
+; GCN-NEXT: buffer_store_dword v1, v27, s[0:3], 0 offen
+; GCN-NEXT: v_add_i32_e32 v26, vcc, 64, v0
+; GCN-NEXT: v_add_i32_e32 v27, vcc, 60, v0
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v17
; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1
; GCN-NEXT: buffer_store_dword v2, v22, s[0:3], 0 offen
; GCN-NEXT: v_add_i32_e32 v17, vcc, 56, v0
-; GCN-NEXT: buffer_store_dword v1, v28, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v1, v30, s[0:3], 0 offen
; GCN-NEXT: v_add_i32_e32 v22, vcc, 52, v0
-; GCN-NEXT: v_add_i32_e32 v28, vcc, 48, v0
+; GCN-NEXT: v_add_i32_e32 v30, vcc, 48, v0
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v16
; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1
-; GCN-NEXT: buffer_store_dword v2, v29, s[0:3], 0 offen
-; GCN-NEXT: v_add_i32_e32 v29, vcc, 44, v0
+; GCN-NEXT: buffer_store_dword v2, v31, s[0:3], 0 offen
+; GCN-NEXT: v_add_i32_e32 v31, vcc, 44, v0
; GCN-NEXT: buffer_store_dword v1, v21, s[0:3], 0 offen
; GCN-NEXT: v_add_i32_e32 v21, vcc, 40, v0
; GCN-NEXT: v_add_i32_e32 v33, vcc, 36, v0
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v15
; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1
-; GCN-NEXT: buffer_store_dword v2, v23, s[0:3], 0 offen
-; GCN-NEXT: v_add_i32_e32 v23, vcc, 32, v0
-; GCN-NEXT: buffer_store_dword v1, v30, s[0:3], 0 offen
-; GCN-NEXT: v_add_i32_e32 v30, vcc, 28, v0
+; GCN-NEXT: buffer_store_dword v2, v25, s[0:3], 0 offen
+; GCN-NEXT: v_add_i32_e32 v25, vcc, 32, v0
+; GCN-NEXT: buffer_store_dword v1, v32, s[0:3], 0 offen
+; GCN-NEXT: v_add_i32_e32 v32, vcc, 28, v0
; GCN-NEXT: v_add_i32_e32 v34, vcc, 24, v0
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v14
; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1
; GCN-NEXT: buffer_store_dword v2, v20, s[0:3], 0 offen
; GCN-NEXT: v_add_i32_e32 v20, vcc, 20, v0
-; GCN-NEXT: buffer_store_dword v1, v26, s[0:3], 0 offen
-; GCN-NEXT: v_add_i32_e32 v26, vcc, 16, v0
+; GCN-NEXT: buffer_store_dword v1, v24, s[0:3], 0 offen
+; GCN-NEXT: v_add_i32_e32 v24, vcc, 16, v0
; GCN-NEXT: v_add_i32_e32 v35, vcc, 12, v0
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v13
; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1
-; GCN-NEXT: buffer_store_dword v2, v27, s[0:3], 0 offen
-; GCN-NEXT: v_add_i32_e32 v27, vcc, 8, v0
+; GCN-NEXT: buffer_store_dword v2, v28, s[0:3], 0 offen
+; GCN-NEXT: v_add_i32_e32 v28, vcc, 8, v0
; GCN-NEXT: buffer_store_dword v1, v19, s[0:3], 0 offen
; GCN-NEXT: v_add_i32_e32 v19, vcc, 4, v0
; GCN-NEXT: s_waitcnt expcnt(0)
@@ -7820,34 +7824,34 @@ define <32 x double> @global_extload_v32bf16_to_v32f64(ptr addrspace(1) %ptr) {
; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v8
; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1
; GCN-NEXT: v_cvt_f64_f32_e32 v[3:4], v11
-; GCN-NEXT: buffer_store_dword v2, v24, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v2, v23, s[0:3], 0 offen
; GCN-NEXT: v_cvt_f64_f32_e32 v[5:6], v10
-; GCN-NEXT: buffer_store_dword v1, v32, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v1, v29, s[0:3], 0 offen
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v9
; GCN-NEXT: v_cvt_f64_f32_e32 v[7:8], v12
-; GCN-NEXT: v_cvt_f64_f32_e32 v[9:10], v36
+; GCN-NEXT: v_cvt_f64_f32_e32 v[9:10], v13
; GCN-NEXT: buffer_store_dword v4, v18, s[0:3], 0 offen
-; GCN-NEXT: v_cvt_f64_f32_e32 v[11:12], v13
-; GCN-NEXT: buffer_store_dword v3, v25, s[0:3], 0 offen
+; GCN-NEXT: v_cvt_f64_f32_e32 v[11:12], v36
+; GCN-NEXT: buffer_store_dword v3, v26, s[0:3], 0 offen
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_cvt_f64_f32_e32 v[3:4], v14
; GCN-NEXT: v_cvt_f64_f32_e32 v[13:14], v15
; GCN-NEXT: v_cvt_f64_f32_e32 v[15:16], v16
-; GCN-NEXT: buffer_store_dword v6, v31, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v6, v27, s[0:3], 0 offen
; GCN-NEXT: buffer_store_dword v5, v17, s[0:3], 0 offen
; GCN-NEXT: buffer_store_dword v2, v22, s[0:3], 0 offen
-; GCN-NEXT: buffer_store_dword v1, v28, s[0:3], 0 offen
-; GCN-NEXT: buffer_store_dword v10, v29, s[0:3], 0 offen
-; GCN-NEXT: buffer_store_dword v9, v21, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v1, v30, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v12, v31, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v11, v21, s[0:3], 0 offen
; GCN-NEXT: buffer_store_dword v16, v33, s[0:3], 0 offen
-; GCN-NEXT: buffer_store_dword v15, v23, s[0:3], 0 offen
-; GCN-NEXT: buffer_store_dword v14, v30, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v15, v25, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v14, v32, s[0:3], 0 offen
; GCN-NEXT: buffer_store_dword v13, v34, s[0:3], 0 offen
; GCN-NEXT: buffer_store_dword v4, v20, s[0:3], 0 offen
-; GCN-NEXT: buffer_store_dword v3, v26, s[0:3], 0 offen
-; GCN-NEXT: buffer_store_dword v12, v35, s[0:3], 0 offen
-; GCN-NEXT: buffer_store_dword v11, v27, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v3, v24, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v10, v35, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v9, v28, s[0:3], 0 offen
; GCN-NEXT: buffer_store_dword v8, v19, s[0:3], 0 offen
; GCN-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
@@ -7860,258 +7864,258 @@ define <32 x double> @global_extload_v32bf16_to_v32f64(ptr addrspace(1) %ptr) {
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
-; GFX7-NEXT: buffer_load_ushort v17, v[1:2], s[4:7], 0 addr64 offset:62
-; GFX7-NEXT: buffer_load_ushort v18, v[1:2], s[4:7], 0 addr64 offset:60
-; GFX7-NEXT: buffer_load_ushort v19, v[1:2], s[4:7], 0 addr64 offset:58
-; GFX7-NEXT: buffer_load_ushort v20, v[1:2], s[4:7], 0 addr64 offset:56
-; GFX7-NEXT: buffer_load_ushort v21, v[1:2], s[4:7], 0 addr64 offset:54
-; GFX7-NEXT: buffer_load_ushort v22, v[1:2], s[4:7], 0 addr64 offset:52
-; GFX7-NEXT: buffer_load_ushort v23, v[1:2], s[4:7], 0 addr64 offset:50
-; GFX7-NEXT: buffer_load_ushort v24, v[1:2], s[4:7], 0 addr64 offset:48
-; GFX7-NEXT: buffer_load_ushort v16, v[1:2], s[4:7], 0 addr64 offset:32
-; GFX7-NEXT: buffer_load_ushort v25, v[1:2], s[4:7], 0 addr64 offset:34
-; GFX7-NEXT: buffer_load_ushort v26, v[1:2], s[4:7], 0 addr64 offset:36
-; GFX7-NEXT: buffer_load_ushort v27, v[1:2], s[4:7], 0 addr64 offset:38
-; GFX7-NEXT: buffer_load_ushort v28, v[1:2], s[4:7], 0 addr64 offset:40
-; GFX7-NEXT: buffer_load_ushort v29, v[1:2], s[4:7], 0 addr64 offset:42
-; GFX7-NEXT: buffer_load_ushort v30, v[1:2], s[4:7], 0 addr64 offset:44
-; GFX7-NEXT: buffer_load_ushort v31, v[1:2], s[4:7], 0 addr64 offset:46
-; GFX7-NEXT: buffer_load_ushort v32, v[1:2], s[4:7], 0 addr64
-; GFX7-NEXT: buffer_load_ushort v15, v[1:2], s[4:7], 0 addr64 offset:2
-; GFX7-NEXT: buffer_load_ushort v13, v[1:2], s[4:7], 0 addr64 offset:4
-; GFX7-NEXT: buffer_load_ushort v11, v[1:2], s[4:7], 0 addr64 offset:6
-; GFX7-NEXT: buffer_load_ushort v9, v[1:2], s[4:7], 0 addr64 offset:8
-; GFX7-NEXT: buffer_load_ushort v8, v[1:2], s[4:7], 0 addr64 offset:10
-; GFX7-NEXT: buffer_load_ushort v6, v[1:2], s[4:7], 0 addr64 offset:12
+; GFX7-NEXT: buffer_load_ushort v20, v[1:2], s[4:7], 0 addr64 offset:62
+; GFX7-NEXT: buffer_load_ushort v22, v[1:2], s[4:7], 0 addr64 offset:60
+; GFX7-NEXT: buffer_load_ushort v23, v[1:2], s[4:7], 0 addr64 offset:58
+; GFX7-NEXT: buffer_load_ushort v24, v[1:2], s[4:7], 0 addr64 offset:56
+; GFX7-NEXT: buffer_load_ushort v25, v[1:2], s[4:7], 0 addr64 offset:54
+; GFX7-NEXT: buffer_load_ushort v26, v[1:2], s[4:7], 0 addr64 offset:52
+; GFX7-NEXT: buffer_load_ushort v27, v[1:2], s[4:7], 0 addr64 offset:50
+; GFX7-NEXT: buffer_load_ushort v28, v[1:2], s[4:7], 0 addr64 offset:48
+; GFX7-NEXT: buffer_load_ushort v15, v[1:2], s[4:7], 0 addr64 offset:32
+; GFX7-NEXT: buffer_load_ushort v18, v[1:2], s[4:7], 0 addr64 offset:34
+; GFX7-NEXT: buffer_load_ushort v29, v[1:2], s[4:7], 0 addr64 offset:36
+; GFX7-NEXT: buffer_load_ushort v30, v[1:2], s[4:7], 0 addr64 offset:38
+; GFX7-NEXT: buffer_load_ushort v31, v[1:2], s[4:7], 0 addr64 offset:40
+; GFX7-NEXT: buffer_load_ushort v32, v[1:2], s[4:7], 0 addr64 offset:42
+; GFX7-NEXT: buffer_load_ushort v33, v[1:2], s[4:7], 0 addr64 offset:44
+; GFX7-NEXT: buffer_load_ushort v34, v[1:2], s[4:7], 0 addr64 offset:46
+; GFX7-NEXT: buffer_load_ushort v19, v[1:2], s[4:7], 0 addr64
+; GFX7-NEXT: buffer_load_ushort v17, v[1:2], s[4:7], 0 addr64 offset:2
+; GFX7-NEXT: buffer_load_ushort v14, v[1:2], s[4:7], 0 addr64 offset:4
+; GFX7-NEXT: buffer_load_ushort v12, v[1:2], s[4:7], 0 addr64 offset:6
+; GFX7-NEXT: buffer_load_ushort v10, v[1:2], s[4:7], 0 addr64 offset:8
+; GFX7-NEXT: buffer_load_ushort v9, v[1:2], s[4:7], 0 addr64 offset:10
+; GFX7-NEXT: buffer_load_ushort v7, v[1:2], s[4:7], 0 addr64 offset:12
; GFX7-NEXT: buffer_load_ushort v4, v[1:2], s[4:7], 0 addr64 offset:14
-; GFX7-NEXT: buffer_load_ushort v3, v[1:2], s[4:7], 0 addr64 offset:16
-; GFX7-NEXT: buffer_load_ushort v5, v[1:2], s[4:7], 0 addr64 offset:18
-; GFX7-NEXT: buffer_load_ushort v7, v[1:2], s[4:7], 0 addr64 offset:20
-; GFX7-NEXT: buffer_load_ushort v10, v[1:2], s[4:7], 0 addr64 offset:22
-; GFX7-NEXT: buffer_load_ushort v12, v[1:2], s[4:7], 0 addr64 offset:24
-; GFX7-NEXT: buffer_load_ushort v14, v[1:2], s[4:7], 0 addr64 offset:26
-; GFX7-NEXT: buffer_load_ushort v33, v[1:2], s[4:7], 0 addr64 offset:28
-; GFX7-NEXT: buffer_load_ushort v34, v[1:2], s[4:7], 0 addr64 offset:30
+; GFX7-NEXT: buffer_load_ushort v5, v[1:2], s[4:7], 0 addr64 offset:16
+; GFX7-NEXT: buffer_load_ushort v3, v[1:2], s[4:7], 0 addr64 offset:18
+; GFX7-NEXT: buffer_load_ushort v6, v[1:2], s[4:7], 0 addr64 offset:20
+; GFX7-NEXT: buffer_load_ushort v8, v[1:2], s[4:7], 0 addr64 offset:22
+; GFX7-NEXT: buffer_load_ushort v11, v[1:2], s[4:7], 0 addr64 offset:24
+; GFX7-NEXT: buffer_load_ushort v13, v[1:2], s[4:7], 0 addr64 offset:26
+; GFX7-NEXT: buffer_load_ushort v16, v[1:2], s[4:7], 0 addr64 offset:28
+; GFX7-NEXT: buffer_load_ushort v1, v[1:2], s[4:7], 0 addr64 offset:30
; GFX7-NEXT: s_waitcnt vmcnt(14)
-; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v17
-; GFX7-NEXT: v_cvt_f64_f32_e32 v[1:2], v1
-; GFX7-NEXT: v_add_i32_e32 v17, vcc, 0xfc, v0
-; GFX7-NEXT: buffer_store_dword v2, v17, s[0:3], 0 offen
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v20
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[20:21], v2
+; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0xfc, v0
+; GFX7-NEXT: buffer_store_dword v21, v2, s[0:3], 0 offen
; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0xf8, v0
-; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v18
-; GFX7-NEXT: v_cvt_f64_f32_e32 v[1:2], v1
-; GFX7-NEXT: v_add_i32_e32 v17, vcc, 0xf4, v0
-; GFX7-NEXT: v_add_i32_e32 v18, vcc, 0xd8, v0
-; GFX7-NEXT: buffer_store_dword v2, v17, s[0:3], 0 offen
+; GFX7-NEXT: buffer_store_dword v20, v2, s[0:3], 0 offen
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v22
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[20:21], v2
+; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0xf4, v0
+; GFX7-NEXT: v_add_i32_e32 v22, vcc, 0xd8, v0
+; GFX7-NEXT: buffer_store_dword v21, v2, s[0:3], 0 offen
; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0xf0, v0
-; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v19
-; GFX7-NEXT: v_cvt_f64_f32_e32 v[1:2], v1
-; GFX7-NEXT: v_add_i32_e32 v17, vcc, 0xec, v0
-; GFX7-NEXT: v_add_i32_e32 v19, vcc, 0xd4, v0
-; GFX7-NEXT: buffer_store_dword v2, v17, s[0:3], 0 offen
+; GFX7-NEXT: buffer_store_dword v20, v2, s[0:3], 0 offen
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v23
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[20:21], v2
+; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0xec, v0
+; GFX7-NEXT: s_waitcnt vmcnt(14)
+; GFX7-NEXT: v_lshlrev_b32_e32 v14, 16, v14
+; GFX7-NEXT: buffer_store_dword v21, v2, s[0:3], 0 offen
; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0xe8, v0
-; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v20
-; GFX7-NEXT: v_cvt_f64_f32_e32 v[1:2], v1
-; GFX7-NEXT: v_add_i32_e32 v17, vcc, 0xe4, v0
-; GFX7-NEXT: v_add_i32_e32 v20, vcc, 0xd0, v0
-; GFX7-NEXT: buffer_store_dword v2, v17, s[0:3], 0 offen
+; GFX7-NEXT: buffer_store_dword v20, v2, s[0:3], 0 offen
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v24
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[20:21], v2
+; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0xe4, v0
+; GFX7-NEXT: v_add_i32_e32 v24, vcc, 0xd0, v0
+; GFX7-NEXT: buffer_store_dword v21, v2, s[0:3], 0 offen
; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0xe0, v0
-; GFX7-NEXT: v_lshlrev_b32_e32 v17, 16, v21
-; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; GFX7-NEXT: v_cvt_f64_f32_e32 v[1:2], v17
-; GFX7-NEXT: v_add_i32_e32 v17, vcc, 0xdc, v0
-; GFX7-NEXT: s_waitcnt vmcnt(14)
-; GFX7-NEXT: v_lshlrev_b32_e32 v13, 16, v13
-; GFX7-NEXT: buffer_store_dword v2, v17, s[0:3], 0 offen
-; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v22
-; GFX7-NEXT: buffer_store_dword v1, v18, s[0:3], 0 offen
-; GFX7-NEXT: v_cvt_f64_f32_e32 v[1:2], v2
-; GFX7-NEXT: v_lshlrev_b32_e32 v17, 16, v23
-; GFX7-NEXT: v_cvt_f64_f32_e32 v[17:18], v17
-; GFX7-NEXT: buffer_store_dword v2, v19, s[0:3], 0 offen
-; GFX7-NEXT: buffer_store_dword v1, v20, s[0:3], 0 offen
-; GFX7-NEXT: v_add_i32_e32 v1, vcc, 0xcc, v0
-; GFX7-NEXT: buffer_store_dword v18, v1, s[0:3], 0 offen
-; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v24
-; GFX7-NEXT: v_cvt_f64_f32_e32 v[1:2], v1
-; GFX7-NEXT: v_add_i32_e32 v18, vcc, 0xc8, v0
-; GFX7-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen
-; GFX7-NEXT: v_add_i32_e32 v17, vcc, 0xc4, v0
-; GFX7-NEXT: buffer_store_dword v2, v17, s[0:3], 0 offen
-; GFX7-NEXT: v_lshlrev_b32_e32 v17, 16, v31
-; GFX7-NEXT: v_cvt_f64_f32_e32 v[17:18], v17
+; GFX7-NEXT: v_lshlrev_b32_e32 v21, 16, v25
+; GFX7-NEXT: buffer_store_dword v20, v2, s[0:3], 0 offen
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[20:21], v21
+; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0xdc, v0
+; GFX7-NEXT: s_waitcnt vmcnt(8)
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: buffer_store_dword v21, v2, s[0:3], 0 offen
+; GFX7-NEXT: v_lshlrev_b32_e32 v21, 16, v26
+; GFX7-NEXT: buffer_store_dword v20, v22, s[0:3], 0 offen
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[20:21], v21
+; GFX7-NEXT: v_lshlrev_b32_e32 v22, 16, v27
+; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0xd4, v0
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[22:23], v22
+; GFX7-NEXT: buffer_store_dword v21, v2, s[0:3], 0 offen
+; GFX7-NEXT: buffer_store_dword v20, v24, s[0:3], 0 offen
+; GFX7-NEXT: v_lshlrev_b32_e32 v20, 16, v28
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[20:21], v20
+; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0xcc, v0
+; GFX7-NEXT: buffer_store_dword v23, v2, s[0:3], 0 offen
+; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0xc8, v0
+; GFX7-NEXT: buffer_store_dword v22, v2, s[0:3], 0 offen
+; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0xc4, v0
+; GFX7-NEXT: buffer_store_dword v21, v2, s[0:3], 0 offen
+; GFX7-NEXT: v_lshlrev_b32_e32 v21, 16, v34
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[21:22], v21
; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0xc0, v0
-; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; GFX7-NEXT: v_add_i32_e32 v1, vcc, 0xbc, v0
-; GFX7-NEXT: buffer_store_dword v18, v1, s[0:3], 0 offen
-; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v30
-; GFX7-NEXT: v_cvt_f64_f32_e32 v[1:2], v1
-; GFX7-NEXT: v_add_i32_e32 v18, vcc, 0xb8, v0
-; GFX7-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen
-; GFX7-NEXT: v_add_i32_e32 v17, vcc, 0xb4, v0
-; GFX7-NEXT: buffer_store_dword v2, v17, s[0:3], 0 offen
-; GFX7-NEXT: v_lshlrev_b32_e32 v17, 16, v29
-; GFX7-NEXT: v_cvt_f64_f32_e32 v[17:18], v17
+; GFX7-NEXT: buffer_store_dword v20, v2, s[0:3], 0 offen
+; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0xbc, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v20, 16, v33
+; GFX7-NEXT: buffer_store_dword v22, v2, s[0:3], 0 offen
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[22:23], v20
+; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0xb8, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v20, 16, v32
+; GFX7-NEXT: buffer_store_dword v21, v2, s[0:3], 0 offen
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[20:21], v20
+; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0xb4, v0
+; GFX7-NEXT: buffer_store_dword v23, v2, s[0:3], 0 offen
; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0xb0, v0
-; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; GFX7-NEXT: v_add_i32_e32 v1, vcc, 0xac, v0
-; GFX7-NEXT: buffer_store_dword v18, v1, s[0:3], 0 offen
-; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v28
-; GFX7-NEXT: v_cvt_f64_f32_e32 v[1:2], v1
-; GFX7-NEXT: v_add_i32_e32 v18, vcc, 0xa8, v0
-; GFX7-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen
-; GFX7-NEXT: v_add_i32_e32 v17, vcc, 0xa4, v0
-; GFX7-NEXT: buffer_store_dword v2, v17, s[0:3], 0 offen
-; GFX7-NEXT: v_lshlrev_b32_e32 v17, 16, v27
-; GFX7-NEXT: v_cvt_f64_f32_e32 v[17:18], v17
+; GFX7-NEXT: buffer_store_dword v22, v2, s[0:3], 0 offen
+; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0xac, v0
+; GFX7-NEXT: buffer_store_dword v21, v2, s[0:3], 0 offen
+; GFX7-NEXT: v_lshlrev_b32_e32 v21, 16, v31
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[21:22], v21
+; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0xa8, v0
+; GFX7-NEXT: buffer_store_dword v20, v2, s[0:3], 0 offen
+; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0xa4, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v20, 16, v30
+; GFX7-NEXT: buffer_store_dword v22, v2, s[0:3], 0 offen
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[22:23], v20
; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0xa0, v0
-; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; GFX7-NEXT: v_add_i32_e32 v1, vcc, 0x9c, v0
-; GFX7-NEXT: buffer_store_dword v18, v1, s[0:3], 0 offen
-; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v26
-; GFX7-NEXT: v_cvt_f64_f32_e32 v[1:2], v1
-; GFX7-NEXT: v_add_i32_e32 v18, vcc, 0x98, v0
-; GFX7-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen
-; GFX7-NEXT: v_add_i32_e32 v17, vcc, 0x94, v0
-; GFX7-NEXT: buffer_store_dword v2, v17, s[0:3], 0 offen
-; GFX7-NEXT: v_lshlrev_b32_e32 v17, 16, v25
-; GFX7-NEXT: v_cvt_f64_f32_e32 v[17:18], v17
+; GFX7-NEXT: v_lshlrev_b32_e32 v20, 16, v29
+; GFX7-NEXT: buffer_store_dword v21, v2, s[0:3], 0 offen
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[20:21], v20
+; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0x9c, v0
+; GFX7-NEXT: buffer_store_dword v23, v2, s[0:3], 0 offen
+; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0x98, v0
+; GFX7-NEXT: buffer_store_dword v22, v2, s[0:3], 0 offen
+; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0x94, v0
+; GFX7-NEXT: buffer_store_dword v21, v2, s[0:3], 0 offen
; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0x90, v0
-; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; GFX7-NEXT: v_add_i32_e32 v1, vcc, 0x8c, v0
-; GFX7-NEXT: buffer_store_dword v18, v1, s[0:3], 0 offen
-; GFX7-NEXT: v_add_i32_e32 v1, vcc, 0x88, v0
-; GFX7-NEXT: buffer_store_dword v17, v1, s[0:3], 0 offen
-; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v16
+; GFX7-NEXT: buffer_store_dword v20, v2, s[0:3], 0 offen
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v18
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[20:21], v2
+; GFX7-NEXT: v_add_i32_e32 v18, vcc, 0x8c, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v19
+; GFX7-NEXT: buffer_store_dword v21, v18, s[0:3], 0 offen
+; GFX7-NEXT: v_add_i32_e32 v18, vcc, 0x88, v0
+; GFX7-NEXT: buffer_store_dword v20, v18, s[0:3], 0 offen
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[18:19], v2
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v15
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[20:21], v2
+; GFX7-NEXT: v_add_i32_e32 v15, vcc, 0x84, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v17
+; GFX7-NEXT: buffer_store_dword v21, v15, s[0:3], 0 offen
+; GFX7-NEXT: v_add_i32_e32 v15, vcc, 0x80, v0
+; GFX7-NEXT: buffer_store_dword v20, v15, s[0:3], 0 offen
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[20:21], v2
; GFX7-NEXT: v_cvt_f64_f32_e32 v[1:2], v1
-; GFX7-NEXT: v_add_i32_e32 v17, vcc, 0x84, v0
-; GFX7-NEXT: v_lshlrev_b32_e32 v16, 16, v32
-; GFX7-NEXT: buffer_store_dword v2, v17, s[0:3], 0 offen
-; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0x80, v0
+; GFX7-NEXT: v_add_i32_e32 v15, vcc, 0x7c, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; GFX7-NEXT: buffer_store_dword v2, v15, s[0:3], 0 offen
+; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0
; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; GFX7-NEXT: v_cvt_f64_f32_e32 v[1:2], v16
-; GFX7-NEXT: s_waitcnt vmcnt(14)
-; GFX7-NEXT: v_lshlrev_b32_e32 v16, 16, v34
-; GFX7-NEXT: v_cvt_f64_f32_e32 v[16:17], v16
-; GFX7-NEXT: v_add_i32_e32 v18, vcc, 0x7c, v0
-; GFX7-NEXT: v_add_i32_e32 v19, vcc, 0x74, v0
-; GFX7-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen
-; GFX7-NEXT: v_add_i32_e32 v17, vcc, 0x78, v0
-; GFX7-NEXT: buffer_store_dword v16, v17, s[0:3], 0 offen
-; GFX7-NEXT: v_lshlrev_b32_e32 v17, 16, v33
-; GFX7-NEXT: v_cvt_f64_f32_e32 v[17:18], v17
-; GFX7-NEXT: v_lshlrev_b32_e32 v11, 16, v11
-; GFX7-NEXT: v_lshlrev_b32_e32 v9, 16, v9
-; GFX7-NEXT: v_lshlrev_b32_e32 v7, 16, v7
-; GFX7-NEXT: buffer_store_dword v18, v19, s[0:3], 0 offen
-; GFX7-NEXT: v_add_i32_e32 v18, vcc, 0x70, v0
-; GFX7-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen
-; GFX7-NEXT: v_cvt_f64_f32_e32 v[17:18], v13
-; GFX7-NEXT: v_lshlrev_b32_e32 v13, 16, v14
-; GFX7-NEXT: v_cvt_f64_f32_e32 v[13:14], v13
-; GFX7-NEXT: v_add_i32_e32 v19, vcc, 0x6c, v0
-; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX7-NEXT: buffer_store_dword v14, v19, s[0:3], 0 offen
-; GFX7-NEXT: v_add_i32_e32 v14, vcc, 0x68, v0
-; GFX7-NEXT: buffer_store_dword v13, v14, s[0:3], 0 offen
-; GFX7-NEXT: v_cvt_f64_f32_e32 v[13:14], v11
-; GFX7-NEXT: v_lshlrev_b32_e32 v11, 16, v12
-; GFX7-NEXT: v_cvt_f64_f32_e32 v[11:12], v11
-; GFX7-NEXT: v_add_i32_e32 v19, vcc, 0x64, v0
-; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v6
-; GFX7-NEXT: buffer_store_dword v12, v19, s[0:3], 0 offen
-; GFX7-NEXT: v_add_i32_e32 v12, vcc, 0x60, v0
-; GFX7-NEXT: buffer_store_dword v11, v12, s[0:3], 0 offen
-; GFX7-NEXT: v_cvt_f64_f32_e32 v[11:12], v9
-; GFX7-NEXT: v_lshlrev_b32_e32 v9, 16, v10
-; GFX7-NEXT: v_cvt_f64_f32_e32 v[9:10], v9
-; GFX7-NEXT: v_add_i32_e32 v19, vcc, 0x5c, v0
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[1:2], v14
+; GFX7-NEXT: v_lshlrev_b32_e32 v14, 16, v16
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[14:15], v14
+; GFX7-NEXT: v_add_i32_e32 v16, vcc, 0x74, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; GFX7-NEXT: buffer_store_dword v15, v16, s[0:3], 0 offen
+; GFX7-NEXT: v_add_i32_e32 v15, vcc, 0x70, v0
+; GFX7-NEXT: buffer_store_dword v14, v15, s[0:3], 0 offen
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[14:15], v12
+; GFX7-NEXT: v_lshlrev_b32_e32 v12, 16, v13
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[12:13], v12
+; GFX7-NEXT: v_add_i32_e32 v16, vcc, 0x6c, v0
; GFX7-NEXT: v_lshlrev_b32_e32 v8, 16, v8
-; GFX7-NEXT: buffer_store_dword v10, v19, s[0:3], 0 offen
-; GFX7-NEXT: v_add_i32_e32 v10, vcc, 0x58, v0
-; GFX7-NEXT: v_cvt_f64_f32_e32 v[19:20], v7
-; GFX7-NEXT: buffer_store_dword v9, v10, s[0:3], 0 offen
-; GFX7-NEXT: v_lshlrev_b32_e32 v10, 16, v4
-; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v5
+; GFX7-NEXT: buffer_store_dword v13, v16, s[0:3], 0 offen
+; GFX7-NEXT: v_add_i32_e32 v13, vcc, 0x68, v0
+; GFX7-NEXT: buffer_store_dword v12, v13, s[0:3], 0 offen
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[12:13], v10
+; GFX7-NEXT: v_lshlrev_b32_e32 v10, 16, v11
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[10:11], v10
+; GFX7-NEXT: v_add_i32_e32 v16, vcc, 0x64, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: buffer_store_dword v11, v16, s[0:3], 0 offen
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[16:17], v8
+; GFX7-NEXT: v_add_i32_e32 v11, vcc, 0x60, v0
+; GFX7-NEXT: v_add_i32_e32 v8, vcc, 0x5c, v0
+; GFX7-NEXT: buffer_store_dword v10, v11, s[0:3], 0 offen
+; GFX7-NEXT: buffer_store_dword v17, v8, s[0:3], 0 offen
+; GFX7-NEXT: v_add_i32_e32 v8, vcc, 0x58, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v11, 16, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6
+; GFX7-NEXT: buffer_store_dword v16, v8, s[0:3], 0 offen
+; GFX7-NEXT: v_lshlrev_b32_e32 v16, 16, v5
; GFX7-NEXT: v_cvt_f64_f32_e32 v[4:5], v4
-; GFX7-NEXT: v_add_i32_e32 v7, vcc, 0x54, v0
-; GFX7-NEXT: buffer_store_dword v20, v7, s[0:3], 0 offen
-; GFX7-NEXT: v_add_i32_e32 v7, vcc, 0x50, v0
-; GFX7-NEXT: buffer_store_dword v19, v7, s[0:3], 0 offen
-; GFX7-NEXT: v_add_i32_e32 v19, vcc, 0x4c, v0
-; GFX7-NEXT: buffer_store_dword v5, v19, s[0:3], 0 offen
-; GFX7-NEXT: v_add_i32_e32 v5, vcc, 0x48, v0
+; GFX7-NEXT: v_add_i32_e32 v6, vcc, 0x54, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX7-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen
+; GFX7-NEXT: v_add_i32_e32 v5, vcc, 0x50, v0
; GFX7-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen
; GFX7-NEXT: v_cvt_f64_f32_e32 v[3:4], v3
-; GFX7-NEXT: v_cvt_f64_f32_e32 v[19:20], v10
-; GFX7-NEXT: v_add_i32_e32 v5, vcc, 0x44, v0
-; GFX7-NEXT: v_cvt_f64_f32_e32 v[6:7], v6
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[5:6], v16
+; GFX7-NEXT: v_add_i32_e32 v16, vcc, 0x4c, v0
+; GFX7-NEXT: buffer_store_dword v4, v16, s[0:3], 0 offen
+; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x48, v0
+; GFX7-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[3:4], v11
+; GFX7-NEXT: v_add_i32_e32 v11, vcc, 0x44, v0
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[7:8], v7
+; GFX7-NEXT: buffer_store_dword v6, v11, s[0:3], 0 offen
+; GFX7-NEXT: v_add_i32_e32 v6, vcc, 64, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX7-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen
+; GFX7-NEXT: v_add_i32_e32 v5, vcc, 60, v0
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[9:10], v9
; GFX7-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen
-; GFX7-NEXT: v_add_i32_e32 v4, vcc, 64, v0
+; GFX7-NEXT: v_add_i32_e32 v4, vcc, 56, v0
; GFX7-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen
-; GFX7-NEXT: v_add_i32_e32 v3, vcc, 60, v0
-; GFX7-NEXT: v_cvt_f64_f32_e32 v[8:9], v8
-; GFX7-NEXT: buffer_store_dword v20, v3, s[0:3], 0 offen
-; GFX7-NEXT: v_add_i32_e32 v3, vcc, 56, v0
-; GFX7-NEXT: buffer_store_dword v19, v3, s[0:3], 0 offen
; GFX7-NEXT: v_add_i32_e32 v3, vcc, 52, v0
-; GFX7-NEXT: buffer_store_dword v7, v3, s[0:3], 0 offen
+; GFX7-NEXT: buffer_store_dword v8, v3, s[0:3], 0 offen
; GFX7-NEXT: v_add_i32_e32 v3, vcc, 48, v0
-; GFX7-NEXT: buffer_store_dword v6, v3, s[0:3], 0 offen
+; GFX7-NEXT: buffer_store_dword v7, v3, s[0:3], 0 offen
; GFX7-NEXT: v_add_i32_e32 v3, vcc, 44, v0
-; GFX7-NEXT: buffer_store_dword v9, v3, s[0:3], 0 offen
+; GFX7-NEXT: buffer_store_dword v10, v3, s[0:3], 0 offen
; GFX7-NEXT: v_add_i32_e32 v3, vcc, 40, v0
-; GFX7-NEXT: buffer_store_dword v8, v3, s[0:3], 0 offen
+; GFX7-NEXT: buffer_store_dword v9, v3, s[0:3], 0 offen
; GFX7-NEXT: v_add_i32_e32 v3, vcc, 36, v0
-; GFX7-NEXT: buffer_store_dword v12, v3, s[0:3], 0 offen
+; GFX7-NEXT: buffer_store_dword v13, v3, s[0:3], 0 offen
; GFX7-NEXT: v_add_i32_e32 v3, vcc, 32, v0
-; GFX7-NEXT: v_lshlrev_b32_e32 v15, 16, v15
-; GFX7-NEXT: buffer_store_dword v11, v3, s[0:3], 0 offen
+; GFX7-NEXT: buffer_store_dword v12, v3, s[0:3], 0 offen
; GFX7-NEXT: v_add_i32_e32 v3, vcc, 28, v0
-; GFX7-NEXT: v_cvt_f64_f32_e32 v[15:16], v15
-; GFX7-NEXT: buffer_store_dword v14, v3, s[0:3], 0 offen
+; GFX7-NEXT: buffer_store_dword v15, v3, s[0:3], 0 offen
; GFX7-NEXT: v_add_i32_e32 v3, vcc, 24, v0
-; GFX7-NEXT: buffer_store_dword v13, v3, s[0:3], 0 offen
+; GFX7-NEXT: buffer_store_dword v14, v3, s[0:3], 0 offen
; GFX7-NEXT: v_add_i32_e32 v3, vcc, 20, v0
-; GFX7-NEXT: buffer_store_dword v18, v3, s[0:3], 0 offen
-; GFX7-NEXT: v_add_i32_e32 v3, vcc, 16, v0
-; GFX7-NEXT: buffer_store_dword v17, v3, s[0:3], 0 offen
-; GFX7-NEXT: v_add_i32_e32 v3, vcc, 12, v0
-; GFX7-NEXT: buffer_store_dword v16, v3, s[0:3], 0 offen
-; GFX7-NEXT: v_add_i32_e32 v3, vcc, 8, v0
-; GFX7-NEXT: buffer_store_dword v15, v3, s[0:3], 0 offen
-; GFX7-NEXT: v_add_i32_e32 v3, vcc, 4, v0
; GFX7-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen
-; GFX7-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
+; GFX7-NEXT: v_add_i32_e32 v2, vcc, 16, v0
+; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX7-NEXT: v_add_i32_e32 v1, vcc, 12, v0
+; GFX7-NEXT: buffer_store_dword v21, v1, s[0:3], 0 offen
+; GFX7-NEXT: v_add_i32_e32 v1, vcc, 8, v0
+; GFX7-NEXT: buffer_store_dword v20, v1, s[0:3], 0 offen
+; GFX7-NEXT: v_add_i32_e32 v1, vcc, 4, v0
+; GFX7-NEXT: buffer_store_dword v19, v1, s[0:3], 0 offen
+; GFX7-NEXT: buffer_store_dword v18, v0, s[0:3], 0 offen
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: global_extload_v32bf16_to_v32f64:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_add_u32_e32 v3, vcc, 2, v1
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, 2, v1
+; GFX8-NEXT: v_addc_u32_e32 v10, vcc, 0, v2, vcc
+; GFX8-NEXT: v_add_u32_e32 v3, vcc, 4, v1
; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
-; GFX8-NEXT: v_add_u32_e32 v5, vcc, 4, v1
+; GFX8-NEXT: v_add_u32_e32 v5, vcc, 6, v1
; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v2, vcc
-; GFX8-NEXT: v_add_u32_e32 v7, vcc, 6, v1
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, 8, v1
; GFX8-NEXT: v_addc_u32_e32 v8, vcc, 0, v2, vcc
-; GFX8-NEXT: v_add_u32_e32 v9, vcc, 8, v1
-; GFX8-NEXT: v_addc_u32_e32 v10, vcc, 0, v2, vcc
; GFX8-NEXT: v_add_u32_e32 v11, vcc, 10, v1
; GFX8-NEXT: v_addc_u32_e32 v12, vcc, 0, v2, vcc
; GFX8-NEXT: v_add_u32_e32 v13, vcc, 12, v1
; GFX8-NEXT: v_addc_u32_e32 v14, vcc, 0, v2, vcc
-; GFX8-NEXT: v_add_u32_e32 v15, vcc, 14, v1
-; GFX8-NEXT: v_addc_u32_e32 v16, vcc, 0, v2, vcc
-; GFX8-NEXT: v_add_u32_e32 v19, vcc, 16, v1
-; GFX8-NEXT: v_addc_u32_e32 v20, vcc, 0, v2, vcc
-; GFX8-NEXT: v_add_u32_e32 v17, vcc, 18, v1
+; GFX8-NEXT: v_add_u32_e32 v17, vcc, 14, v1
; GFX8-NEXT: v_addc_u32_e32 v18, vcc, 0, v2, vcc
-; GFX8-NEXT: v_add_u32_e32 v21, vcc, 20, v1
+; GFX8-NEXT: v_add_u32_e32 v21, vcc, 16, v1
; GFX8-NEXT: v_addc_u32_e32 v22, vcc, 0, v2, vcc
+; GFX8-NEXT: v_add_u32_e32 v15, vcc, 18, v1
+; GFX8-NEXT: v_addc_u32_e32 v16, vcc, 0, v2, vcc
+; GFX8-NEXT: v_add_u32_e32 v19, vcc, 20, v1
+; GFX8-NEXT: v_addc_u32_e32 v20, vcc, 0, v2, vcc
; GFX8-NEXT: v_add_u32_e32 v23, vcc, 22, v1
; GFX8-NEXT: v_addc_u32_e32 v24, vcc, 0, v2, vcc
; GFX8-NEXT: v_add_u32_e32 v25, vcc, 24, v1
@@ -8122,473 +8126,469 @@ define <32 x double> @global_extload_v32bf16_to_v32f64(ptr addrspace(1) %ptr) {
; GFX8-NEXT: v_addc_u32_e32 v30, vcc, 0, v2, vcc
; GFX8-NEXT: v_add_u32_e32 v31, vcc, 30, v1
; GFX8-NEXT: v_addc_u32_e32 v32, vcc, 0, v2, vcc
-; GFX8-NEXT: v_add_u32_e32 v33, vcc, 34, v1
+; GFX8-NEXT: v_add_u32_e32 v33, vcc, 32, v1
; GFX8-NEXT: v_addc_u32_e32 v34, vcc, 0, v2, vcc
-; GFX8-NEXT: v_add_u32_e32 v35, vcc, 36, v1
+; GFX8-NEXT: v_add_u32_e32 v35, vcc, 34, v1
; GFX8-NEXT: v_addc_u32_e32 v36, vcc, 0, v2, vcc
-; GFX8-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
-; GFX8-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
-; GFX8-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; GFX8-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; GFX8-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; GFX8-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
-; GFX8-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; GFX8-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; GFX8-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; GFX8-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
-; GFX8-NEXT: buffer_store_dword v58, off, s[0:3], s32 ; 4-byte Folded Spill
-; GFX8-NEXT: v_add_u32_e32 v37, vcc, 38, v1
-; GFX8-NEXT: flat_load_ushort v44, v[1:2]
+; GFX8-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; GFX8-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; GFX8-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; GFX8-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; GFX8-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; GFX8-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; GFX8-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX8-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX8-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX8-NEXT: buffer_store_dword v57, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX8-NEXT: v_add_u32_e32 v37, vcc, 36, v1
+; GFX8-NEXT: flat_load_ushort v43, v[1:2]
; GFX8-NEXT: v_addc_u32_e32 v38, vcc, 0, v2, vcc
-; GFX8-NEXT: v_add_u32_e32 v48, vcc, 40, v1
+; GFX8-NEXT: v_add_u32_e32 v48, vcc, 38, v1
; GFX8-NEXT: v_addc_u32_e32 v49, vcc, 0, v2, vcc
; GFX8-NEXT: v_add_u32_e32 v50, vcc, 62, v1
; GFX8-NEXT: v_addc_u32_e32 v51, vcc, 0, v2, vcc
-; GFX8-NEXT: flat_load_ushort v45, v[50:51]
+; GFX8-NEXT: flat_load_ushort v44, v[50:51]
; GFX8-NEXT: v_add_u32_e32 v50, vcc, 60, v1
; GFX8-NEXT: v_addc_u32_e32 v51, vcc, 0, v2, vcc
-; GFX8-NEXT: flat_load_ushort v46, v[50:51]
-; GFX8-NEXT: v_add_u32_e32 v50, vcc, 42, v1
+; GFX8-NEXT: flat_load_ushort v45, v[50:51]
+; GFX8-NEXT: v_add_u32_e32 v50, vcc, 40, v1
; GFX8-NEXT: v_addc_u32_e32 v51, vcc, 0, v2, vcc
; GFX8-NEXT: v_add_u32_e32 v52, vcc, 58, v1
; GFX8-NEXT: v_addc_u32_e32 v53, vcc, 0, v2, vcc
-; GFX8-NEXT: flat_load_ushort v47, v[52:53]
-; GFX8-NEXT: v_add_u32_e32 v52, vcc, 44, v1
+; GFX8-NEXT: flat_load_ushort v46, v[52:53]
+; GFX8-NEXT: v_add_u32_e32 v52, vcc, 42, v1
; GFX8-NEXT: v_addc_u32_e32 v53, vcc, 0, v2, vcc
; GFX8-NEXT: v_add_u32_e32 v54, vcc, 56, v1
; GFX8-NEXT: v_addc_u32_e32 v55, vcc, 0, v2, vcc
-; GFX8-NEXT: flat_load_ushort v56, v[54:55]
-; GFX8-NEXT: v_add_u32_e32 v54, vcc, 46, v1
+; GFX8-NEXT: flat_load_ushort v47, v[54:55]
+; GFX8-NEXT: v_add_u32_e32 v54, vcc, 44, v1
; GFX8-NEXT: v_addc_u32_e32 v55, vcc, 0, v2, vcc
; GFX8-NEXT: v_add_u32_e32 v39, vcc, 54, v1
; GFX8-NEXT: v_addc_u32_e32 v40, vcc, 0, v2, vcc
-; GFX8-NEXT: flat_load_ushort v57, v[39:40]
+; GFX8-NEXT: flat_load_ushort v56, v[39:40]
; GFX8-NEXT: v_add_u32_e32 v39, vcc, 52, v1
; GFX8-NEXT: v_addc_u32_e32 v40, vcc, 0, v2, vcc
-; GFX8-NEXT: flat_load_ushort v58, v[39:40]
-; GFX8-NEXT: v_add_u32_e32 v40, vcc, 48, v1
-; GFX8-NEXT: v_addc_u32_e32 v41, vcc, 0, v2, vcc
-; GFX8-NEXT: v_add_u32_e32 v42, vcc, 50, v1
-; GFX8-NEXT: v_addc_u32_e32 v43, vcc, 0, v2, vcc
-; GFX8-NEXT: flat_load_ushort v42, v[42:43]
-; GFX8-NEXT: flat_load_ushort v34, v[33:34]
-; GFX8-NEXT: flat_load_ushort v36, v[35:36]
-; GFX8-NEXT: flat_load_ushort v38, v[37:38]
-; GFX8-NEXT: flat_load_ushort v39, v[48:49]
-; GFX8-NEXT: flat_load_ushort v48, v[50:51]
-; GFX8-NEXT: flat_load_ushort v51, v[52:53]
-; GFX8-NEXT: flat_load_ushort v52, v[54:55]
-; GFX8-NEXT: flat_load_ushort v53, v[40:41]
-; GFX8-NEXT: v_add_u32_e32 v49, vcc, 32, v1
-; GFX8-NEXT: v_addc_u32_e32 v50, vcc, 0, v2, vcc
-; GFX8-NEXT: flat_load_ushort v37, v[3:4]
-; GFX8-NEXT: flat_load_ushort v35, v[5:6]
-; GFX8-NEXT: flat_load_ushort v33, v[7:8]
-; GFX8-NEXT: flat_load_ushort v8, v[9:10]
-; GFX8-NEXT: flat_load_ushort v6, v[11:12]
-; GFX8-NEXT: flat_load_ushort v4, v[13:14]
-; GFX8-NEXT: flat_load_ushort v2, v[15:16]
-; GFX8-NEXT: flat_load_ushort v1, v[19:20]
-; GFX8-NEXT: v_add_u32_e32 v16, vcc, 4, v0
-; GFX8-NEXT: v_add_u32_e32 v19, vcc, 0x7c, v0
-; GFX8-NEXT: s_waitcnt vmcnt(14)
-; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v44
-; GFX8-NEXT: v_cvt_f64_f32_e32 v[14:15], v3
-; GFX8-NEXT: flat_load_ushort v3, v[17:18]
-; GFX8-NEXT: flat_load_ushort v5, v[21:22]
-; GFX8-NEXT: flat_load_ushort v7, v[23:24]
-; GFX8-NEXT: flat_load_ushort v9, v[25:26]
-; GFX8-NEXT: flat_load_ushort v10, v[27:28]
-; GFX8-NEXT: flat_load_ushort v11, v[29:30]
+; GFX8-NEXT: flat_load_ushort v57, v[39:40]
+; GFX8-NEXT: v_add_u32_e32 v39, vcc, 46, v1
+; GFX8-NEXT: v_addc_u32_e32 v40, vcc, 0, v2, vcc
+; GFX8-NEXT: v_add_u32_e32 v41, vcc, 50, v1
+; GFX8-NEXT: v_addc_u32_e32 v42, vcc, 0, v2, vcc
+; GFX8-NEXT: flat_load_ushort v41, v[41:42]
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, 48, v1
+; GFX8-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
+; GFX8-NEXT: flat_load_ushort v42, v[9:10]
+; GFX8-NEXT: flat_load_ushort v9, v[35:36]
+; GFX8-NEXT: flat_load_ushort v10, v[37:38]
+; GFX8-NEXT: flat_load_ushort v35, v[48:49]
+; GFX8-NEXT: flat_load_ushort v36, v[50:51]
+; GFX8-NEXT: flat_load_ushort v37, v[52:53]
+; GFX8-NEXT: flat_load_ushort v48, v[54:55]
+; GFX8-NEXT: flat_load_ushort v39, v[39:40]
+; GFX8-NEXT: flat_load_ushort v49, v[1:2]
+; GFX8-NEXT: flat_load_ushort v50, v[3:4]
+; GFX8-NEXT: flat_load_ushort v51, v[5:6]
+; GFX8-NEXT: flat_load_ushort v52, v[7:8]
+; GFX8-NEXT: flat_load_ushort v53, v[11:12]
+; GFX8-NEXT: flat_load_ushort v38, v[13:14]
+; GFX8-NEXT: flat_load_ushort v14, v[17:18]
+; GFX8-NEXT: flat_load_ushort v11, v[21:22]
+; GFX8-NEXT: v_add_u32_e32 v3, vcc, 4, v0
+; GFX8-NEXT: flat_load_ushort v15, v[15:16]
+; GFX8-NEXT: flat_load_ushort v13, v[19:20]
+; GFX8-NEXT: flat_load_ushort v8, v[23:24]
+; GFX8-NEXT: flat_load_ushort v6, v[25:26]
+; GFX8-NEXT: flat_load_ushort v5, v[27:28]
+; GFX8-NEXT: flat_load_ushort v7, v[29:30]
; GFX8-NEXT: flat_load_ushort v12, v[31:32]
-; GFX8-NEXT: flat_load_ushort v13, v[49:50]
-; GFX8-NEXT: v_add_u32_e32 v18, vcc, 0x84, v0
-; GFX8-NEXT: buffer_store_dword v15, v16, s[0:3], 0 offen
-; GFX8-NEXT: buffer_store_dword v14, v0, s[0:3], 0 offen
-; GFX8-NEXT: v_add_u32_e32 v14, vcc, 0xfc, v0
-; GFX8-NEXT: v_lshlrev_b32_e32 v15, 16, v45
-; GFX8-NEXT: v_cvt_f64_f32_e32 v[15:16], v15
-; GFX8-NEXT: buffer_store_dword v16, v14, s[0:3], 0 offen
-; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v46
-; GFX8-NEXT: v_cvt_f64_f32_e32 v[16:17], v16
-; GFX8-NEXT: v_add_u32_e32 v14, vcc, 0xf8, v0
-; GFX8-NEXT: buffer_store_dword v15, v14, s[0:3], 0 offen
-; GFX8-NEXT: v_add_u32_e32 v14, vcc, 0xf4, v0
-; GFX8-NEXT: buffer_store_dword v17, v14, s[0:3], 0 offen
-; GFX8-NEXT: v_lshlrev_b32_e32 v14, 16, v47
-; GFX8-NEXT: v_cvt_f64_f32_e32 v[14:15], v14
-; GFX8-NEXT: v_add_u32_e32 v17, vcc, 0xf0, v0
-; GFX8-NEXT: buffer_store_dword v16, v17, s[0:3], 0 offen
-; GFX8-NEXT: v_add_u32_e32 v16, vcc, 0xec, v0
-; GFX8-NEXT: buffer_store_dword v15, v16, s[0:3], 0 offen
-; GFX8-NEXT: v_add_u32_e32 v17, vcc, 0xe8, v0
-; GFX8-NEXT: v_lshlrev_b32_e32 v15, 16, v56
-; GFX8-NEXT: v_cvt_f64_f32_e32 v[15:16], v15
-; GFX8-NEXT: buffer_store_dword v14, v17, s[0:3], 0 offen
-; GFX8-NEXT: v_add_u32_e32 v14, vcc, 0xe4, v0
-; GFX8-NEXT: buffer_store_dword v16, v14, s[0:3], 0 offen
-; GFX8-NEXT: v_add_u32_e32 v14, vcc, 0xe0, v0
-; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v57
-; GFX8-NEXT: v_cvt_f64_f32_e32 v[16:17], v16
-; GFX8-NEXT: buffer_store_dword v15, v14, s[0:3], 0 offen
-; GFX8-NEXT: v_add_u32_e32 v14, vcc, 0xdc, v0
-; GFX8-NEXT: buffer_store_dword v17, v14, s[0:3], 0 offen
-; GFX8-NEXT: v_lshlrev_b32_e32 v14, 16, v58
-; GFX8-NEXT: v_cvt_f64_f32_e32 v[14:15], v14
-; GFX8-NEXT: v_add_u32_e32 v17, vcc, 0xd8, v0
-; GFX8-NEXT: buffer_store_dword v16, v17, s[0:3], 0 offen
-; GFX8-NEXT: v_add_u32_e32 v16, vcc, 0xd4, v0
-; GFX8-NEXT: buffer_store_dword v15, v16, s[0:3], 0 offen
-; GFX8-NEXT: v_lshlrev_b32_e32 v15, 16, v42
-; GFX8-NEXT: v_cvt_f64_f32_e32 v[15:16], v15
-; GFX8-NEXT: v_add_u32_e32 v17, vcc, 0xd0, v0
-; GFX8-NEXT: buffer_store_dword v14, v17, s[0:3], 0 offen
-; GFX8-NEXT: v_add_u32_e32 v14, vcc, 0xcc, v0
-; GFX8-NEXT: buffer_store_dword v16, v14, s[0:3], 0 offen
+; GFX8-NEXT: flat_load_ushort v16, v[33:34]
+; GFX8-NEXT: v_add_u32_e32 v18, vcc, 0xc4, v0
+; GFX8-NEXT: v_add_u32_e32 v20, vcc, 0xbc, v0
+; GFX8-NEXT: v_add_u32_e32 v22, vcc, 0xb4, v0
+; GFX8-NEXT: v_add_u32_e32 v24, vcc, 0xac, v0
+; GFX8-NEXT: v_add_u32_e32 v26, vcc, 0xa4, v0
+; GFX8-NEXT: v_add_u32_e32 v27, vcc, 0x9c, v0
; GFX8-NEXT: s_waitcnt vmcnt(14)
-; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v53
-; GFX8-NEXT: v_cvt_f64_f32_e32 v[16:17], v16
-; GFX8-NEXT: v_add_u32_e32 v14, vcc, 0xc8, v0
-; GFX8-NEXT: buffer_store_dword v15, v14, s[0:3], 0 offen
-; GFX8-NEXT: v_add_u32_e32 v14, vcc, 0xc4, v0
-; GFX8-NEXT: buffer_store_dword v17, v14, s[0:3], 0 offen
-; GFX8-NEXT: v_lshlrev_b32_e32 v14, 16, v52
-; GFX8-NEXT: v_cvt_f64_f32_e32 v[14:15], v14
-; GFX8-NEXT: v_add_u32_e32 v17, vcc, 0xc0, v0
-; GFX8-NEXT: buffer_store_dword v16, v17, s[0:3], 0 offen
-; GFX8-NEXT: v_add_u32_e32 v16, vcc, 0xbc, v0
-; GFX8-NEXT: buffer_store_dword v15, v16, s[0:3], 0 offen
-; GFX8-NEXT: v_lshlrev_b32_e32 v15, 16, v51
-; GFX8-NEXT: v_cvt_f64_f32_e32 v[15:16], v15
-; GFX8-NEXT: v_add_u32_e32 v17, vcc, 0xb8, v0
-; GFX8-NEXT: buffer_store_dword v14, v17, s[0:3], 0 offen
-; GFX8-NEXT: v_add_u32_e32 v14, vcc, 0xb4, v0
-; GFX8-NEXT: buffer_store_dword v16, v14, s[0:3], 0 offen
-; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v48
-; GFX8-NEXT: v_cvt_f64_f32_e32 v[16:17], v16
-; GFX8-NEXT: v_add_u32_e32 v14, vcc, 0xb0, v0
-; GFX8-NEXT: buffer_store_dword v15, v14, s[0:3], 0 offen
-; GFX8-NEXT: v_add_u32_e32 v14, vcc, 0xac, v0
-; GFX8-NEXT: buffer_store_dword v17, v14, s[0:3], 0 offen
-; GFX8-NEXT: v_lshlrev_b32_e32 v14, 16, v39
-; GFX8-NEXT: v_cvt_f64_f32_e32 v[14:15], v14
-; GFX8-NEXT: v_add_u32_e32 v17, vcc, 0xa8, v0
-; GFX8-NEXT: buffer_store_dword v16, v17, s[0:3], 0 offen
-; GFX8-NEXT: v_add_u32_e32 v16, vcc, 0xa4, v0
-; GFX8-NEXT: buffer_store_dword v15, v16, s[0:3], 0 offen
-; GFX8-NEXT: v_lshlrev_b32_e32 v15, 16, v38
-; GFX8-NEXT: v_cvt_f64_f32_e32 v[15:16], v15
-; GFX8-NEXT: v_add_u32_e32 v17, vcc, 0xa0, v0
-; GFX8-NEXT: buffer_store_dword v14, v17, s[0:3], 0 offen
-; GFX8-NEXT: v_add_u32_e32 v14, vcc, 0x9c, v0
-; GFX8-NEXT: buffer_store_dword v16, v14, s[0:3], 0 offen
-; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v36
-; GFX8-NEXT: v_cvt_f64_f32_e32 v[16:17], v16
-; GFX8-NEXT: v_add_u32_e32 v14, vcc, 0x98, v0
-; GFX8-NEXT: buffer_store_dword v15, v14, s[0:3], 0 offen
-; GFX8-NEXT: v_add_u32_e32 v14, vcc, 0x94, v0
-; GFX8-NEXT: buffer_store_dword v17, v14, s[0:3], 0 offen
-; GFX8-NEXT: v_add_u32_e32 v14, vcc, 0x90, v0
-; GFX8-NEXT: buffer_store_dword v16, v14, s[0:3], 0 offen
-; GFX8-NEXT: v_lshlrev_b32_e32 v14, 16, v34
-; GFX8-NEXT: v_cvt_f64_f32_e32 v[14:15], v14
-; GFX8-NEXT: v_add_u32_e32 v17, vcc, 0x8c, v0
-; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v37
-; GFX8-NEXT: buffer_store_dword v15, v17, s[0:3], 0 offen
-; GFX8-NEXT: v_add_u32_e32 v15, vcc, 0x88, v0
-; GFX8-NEXT: v_lshlrev_b32_e32 v13, 16, v13
-; GFX8-NEXT: buffer_store_dword v14, v15, s[0:3], 0 offen
-; GFX8-NEXT: v_cvt_f64_f32_e32 v[14:15], v16
-; GFX8-NEXT: v_cvt_f64_f32_e32 v[16:17], v13
-; GFX8-NEXT: v_lshlrev_b32_e32 v13, 16, v35
-; GFX8-NEXT: v_lshlrev_b32_e32 v12, 16, v12
-; GFX8-NEXT: v_lshlrev_b32_e32 v11, 16, v11
-; GFX8-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen
-; GFX8-NEXT: v_add_u32_e32 v17, vcc, 0x80, v0
-; GFX8-NEXT: buffer_store_dword v16, v17, s[0:3], 0 offen
-; GFX8-NEXT: v_cvt_f64_f32_e32 v[16:17], v13
-; GFX8-NEXT: v_cvt_f64_f32_e32 v[12:13], v12
-; GFX8-NEXT: v_lshlrev_b32_e32 v18, 16, v33
-; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v8
-; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v6
-; GFX8-NEXT: buffer_store_dword v13, v19, s[0:3], 0 offen
-; GFX8-NEXT: v_add_u32_e32 v13, vcc, 0x78, v0
-; GFX8-NEXT: buffer_store_dword v12, v13, s[0:3], 0 offen
-; GFX8-NEXT: v_cvt_f64_f32_e32 v[12:13], v18
-; GFX8-NEXT: v_cvt_f64_f32_e32 v[18:19], v11
-; GFX8-NEXT: v_add_u32_e32 v11, vcc, 0x74, v0
-; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v4
-; GFX8-NEXT: buffer_store_dword v19, v11, s[0:3], 0 offen
-; GFX8-NEXT: v_add_u32_e32 v11, vcc, 0x70, v0
-; GFX8-NEXT: buffer_store_dword v18, v11, s[0:3], 0 offen
-; GFX8-NEXT: v_cvt_f64_f32_e32 v[18:19], v8
-; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v10
-; GFX8-NEXT: v_cvt_f64_f32_e32 v[10:11], v8
-; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x6c, v0
-; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX8-NEXT: buffer_store_dword v11, v8, s[0:3], 0 offen
-; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x68, v0
-; GFX8-NEXT: buffer_store_dword v10, v8, s[0:3], 0 offen
-; GFX8-NEXT: v_cvt_f64_f32_e32 v[10:11], v6
-; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v9
-; GFX8-NEXT: v_cvt_f64_f32_e32 v[8:9], v6
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x64, v0
-; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX8-NEXT: buffer_store_dword v9, v6, s[0:3], 0 offen
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x60, v0
-; GFX8-NEXT: buffer_store_dword v8, v6, s[0:3], 0 offen
-; GFX8-NEXT: v_cvt_f64_f32_e32 v[8:9], v4
-; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v7
-; GFX8-NEXT: v_cvt_f64_f32_e32 v[6:7], v4
-; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x5c, v0
-; GFX8-NEXT: buffer_store_dword v7, v4, s[0:3], 0 offen
-; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x58, v0
-; GFX8-NEXT: buffer_store_dword v6, v4, s[0:3], 0 offen
-; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v1
-; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v5
-; GFX8-NEXT: v_cvt_f64_f32_e32 v[6:7], v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v43
; GFX8-NEXT: v_cvt_f64_f32_e32 v[1:2], v1
-; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x54, v0
-; GFX8-NEXT: buffer_store_dword v2, v5, s[0:3], 0 offen
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x50, v0
+; GFX8-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen
+; GFX8-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0xfc, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v44
+; GFX8-NEXT: v_cvt_f64_f32_e32 v[2:3], v2
+; GFX8-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v45
+; GFX8-NEXT: v_cvt_f64_f32_e32 v[3:4], v3
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0xf8, v0
+; GFX8-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0xf4, v0
+; GFX8-NEXT: buffer_store_dword v4, v1, s[0:3], 0 offen
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v46
+; GFX8-NEXT: v_cvt_f64_f32_e32 v[1:2], v1
+; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0xf0, v0
+; GFX8-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen
+; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0xec, v0
+; GFX8-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen
+; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0xe8, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v47
+; GFX8-NEXT: v_cvt_f64_f32_e32 v[2:3], v2
+; GFX8-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0xe4, v0
+; GFX8-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0xe0, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v56
+; GFX8-NEXT: v_cvt_f64_f32_e32 v[3:4], v3
+; GFX8-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0xdc, v0
+; GFX8-NEXT: buffer_store_dword v4, v1, s[0:3], 0 offen
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v57
+; GFX8-NEXT: v_cvt_f64_f32_e32 v[1:2], v1
+; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0xd8, v0
+; GFX8-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen
+; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0xd4, v0
+; GFX8-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0xd0, v0
+; GFX8-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v41
+; GFX8-NEXT: v_cvt_f64_f32_e32 v[1:2], v1
+; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0xcc, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v42
+; GFX8-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0xc8, v0
; GFX8-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; GFX8-NEXT: v_cvt_f64_f32_e32 v[1:2], v3
-; GFX8-NEXT: v_cvt_f64_f32_e32 v[3:4], v4
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v49
+; GFX8-NEXT: v_cvt_f64_f32_e32 v[3:4], v3
+; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v50
+; GFX8-NEXT: s_waitcnt vmcnt(14)
+; GFX8-NEXT: v_lshlrev_b32_e32 v19, 16, v51
+; GFX8-NEXT: v_lshlrev_b32_e32 v21, 16, v52
+; GFX8-NEXT: buffer_store_dword v4, v18, s[0:3], 0 offen
+; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0xc0, v0
+; GFX8-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen
+; GFX8-NEXT: v_cvt_f64_f32_e32 v[3:4], v17
+; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v39
+; GFX8-NEXT: v_cvt_f64_f32_e32 v[17:18], v17
+; GFX8-NEXT: v_lshlrev_b32_e32 v23, 16, v53
+; GFX8-NEXT: v_lshlrev_b32_e32 v25, 16, v38
+; GFX8-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; GFX8-NEXT: buffer_store_dword v18, v20, s[0:3], 0 offen
+; GFX8-NEXT: v_add_u32_e32 v18, vcc, 0xb8, v0
+; GFX8-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen
+; GFX8-NEXT: v_cvt_f64_f32_e32 v[17:18], v19
+; GFX8-NEXT: v_lshlrev_b32_e32 v19, 16, v48
+; GFX8-NEXT: v_cvt_f64_f32_e32 v[19:20], v19
+; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX8-NEXT: v_lshlrev_b32_e32 v14, 16, v14
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX8-NEXT: buffer_store_dword v20, v22, s[0:3], 0 offen
+; GFX8-NEXT: v_add_u32_e32 v20, vcc, 0xb0, v0
+; GFX8-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen
+; GFX8-NEXT: v_cvt_f64_f32_e32 v[19:20], v21
+; GFX8-NEXT: v_lshlrev_b32_e32 v21, 16, v37
+; GFX8-NEXT: v_cvt_f64_f32_e32 v[21:22], v21
+; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GFX8-NEXT: buffer_store_dword v22, v24, s[0:3], 0 offen
+; GFX8-NEXT: v_add_u32_e32 v22, vcc, 0xa8, v0
+; GFX8-NEXT: buffer_store_dword v21, v22, s[0:3], 0 offen
+; GFX8-NEXT: v_cvt_f64_f32_e32 v[21:22], v23
+; GFX8-NEXT: v_lshlrev_b32_e32 v23, 16, v36
+; GFX8-NEXT: v_cvt_f64_f32_e32 v[23:24], v23
+; GFX8-NEXT: buffer_store_dword v24, v26, s[0:3], 0 offen
+; GFX8-NEXT: v_add_u32_e32 v24, vcc, 0xa0, v0
+; GFX8-NEXT: buffer_store_dword v23, v24, s[0:3], 0 offen
+; GFX8-NEXT: v_cvt_f64_f32_e32 v[23:24], v25
+; GFX8-NEXT: v_lshlrev_b32_e32 v25, 16, v35
+; GFX8-NEXT: v_cvt_f64_f32_e32 v[25:26], v25
+; GFX8-NEXT: buffer_store_dword v26, v27, s[0:3], 0 offen
+; GFX8-NEXT: v_cvt_f64_f32_e32 v[27:28], v10
+; GFX8-NEXT: v_add_u32_e32 v26, vcc, 0x98, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v10, 16, v11
+; GFX8-NEXT: v_add_u32_e32 v11, vcc, 0x94, v0
+; GFX8-NEXT: buffer_store_dword v25, v26, s[0:3], 0 offen
+; GFX8-NEXT: buffer_store_dword v28, v11, s[0:3], 0 offen
+; GFX8-NEXT: v_add_u32_e32 v11, vcc, 0x90, v0
+; GFX8-NEXT: buffer_store_dword v27, v11, s[0:3], 0 offen
+; GFX8-NEXT: v_cvt_f64_f32_e32 v[27:28], v9
+; GFX8-NEXT: v_cvt_f64_f32_e32 v[25:26], v14
+; GFX8-NEXT: v_add_u32_e32 v14, vcc, 0x8c, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v15
+; GFX8-NEXT: buffer_store_dword v28, v14, s[0:3], 0 offen
+; GFX8-NEXT: v_add_u32_e32 v14, vcc, 0x88, v0
+; GFX8-NEXT: buffer_store_dword v27, v14, s[0:3], 0 offen
+; GFX8-NEXT: v_cvt_f64_f32_e32 v[14:15], v9
+; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v16
+; GFX8-NEXT: v_cvt_f64_f32_e32 v[27:28], v9
+; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v13
+; GFX8-NEXT: v_add_u32_e32 v13, vcc, 0x84, v0
+; GFX8-NEXT: buffer_store_dword v28, v13, s[0:3], 0 offen
+; GFX8-NEXT: v_add_u32_e32 v13, vcc, 0x80, v0
+; GFX8-NEXT: buffer_store_dword v27, v13, s[0:3], 0 offen
+; GFX8-NEXT: v_cvt_f64_f32_e32 v[27:28], v9
+; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v12
+; GFX8-NEXT: v_cvt_f64_f32_e32 v[12:13], v9
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7c, v0
+; GFX8-NEXT: v_cvt_f64_f32_e32 v[10:11], v10
+; GFX8-NEXT: buffer_store_dword v13, v9, s[0:3], 0 offen
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x78, v0
+; GFX8-NEXT: buffer_store_dword v12, v9, s[0:3], 0 offen
+; GFX8-NEXT: v_lshlrev_b32_e32 v12, 16, v6
+; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v7
+; GFX8-NEXT: v_cvt_f64_f32_e32 v[6:7], v6
+; GFX8-NEXT: v_add_u32_e32 v13, vcc, 0x74, v0
+; GFX8-NEXT: v_cvt_f64_f32_e32 v[8:9], v8
+; GFX8-NEXT: buffer_store_dword v7, v13, s[0:3], 0 offen
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x70, v0
+; GFX8-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen
+; GFX8-NEXT: v_cvt_f64_f32_e32 v[5:6], v5
+; GFX8-NEXT: v_cvt_f64_f32_e32 v[12:13], v12
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x6c, v0
+; GFX8-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x68, v0
+; GFX8-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen
+; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x64, v0
+; GFX8-NEXT: buffer_store_dword v13, v5, s[0:3], 0 offen
+; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x60, v0
+; GFX8-NEXT: buffer_store_dword v12, v5, s[0:3], 0 offen
+; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x5c, v0
+; GFX8-NEXT: buffer_store_dword v9, v5, s[0:3], 0 offen
+; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x58, v0
+; GFX8-NEXT: buffer_store_dword v8, v5, s[0:3], 0 offen
+; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x54, v0
+; GFX8-NEXT: buffer_store_dword v28, v5, s[0:3], 0 offen
+; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x50, v0
+; GFX8-NEXT: buffer_store_dword v27, v5, s[0:3], 0 offen
; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x4c, v0
-; GFX8-NEXT: buffer_store_dword v2, v5, s[0:3], 0 offen
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x48, v0
-; GFX8-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x44, v0
-; GFX8-NEXT: buffer_store_dword v4, v1, s[0:3], 0 offen
-; GFX8-NEXT: v_add_u32_e32 v1, vcc, 64, v0
-; GFX8-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen
-; GFX8-NEXT: v_add_u32_e32 v1, vcc, 60, v0
-; GFX8-NEXT: buffer_store_dword v7, v1, s[0:3], 0 offen
-; GFX8-NEXT: v_add_u32_e32 v1, vcc, 56, v0
-; GFX8-NEXT: buffer_store_dword v6, v1, s[0:3], 0 offen
-; GFX8-NEXT: v_add_u32_e32 v1, vcc, 52, v0
-; GFX8-NEXT: buffer_store_dword v9, v1, s[0:3], 0 offen
-; GFX8-NEXT: v_add_u32_e32 v1, vcc, 48, v0
-; GFX8-NEXT: buffer_store_dword v8, v1, s[0:3], 0 offen
-; GFX8-NEXT: v_add_u32_e32 v1, vcc, 44, v0
-; GFX8-NEXT: buffer_store_dword v11, v1, s[0:3], 0 offen
-; GFX8-NEXT: v_add_u32_e32 v1, vcc, 40, v0
-; GFX8-NEXT: buffer_store_dword v10, v1, s[0:3], 0 offen
-; GFX8-NEXT: v_add_u32_e32 v1, vcc, 36, v0
-; GFX8-NEXT: buffer_store_dword v19, v1, s[0:3], 0 offen
-; GFX8-NEXT: v_add_u32_e32 v1, vcc, 32, v0
-; GFX8-NEXT: buffer_store_dword v18, v1, s[0:3], 0 offen
-; GFX8-NEXT: v_add_u32_e32 v1, vcc, 28, v0
-; GFX8-NEXT: buffer_store_dword v13, v1, s[0:3], 0 offen
-; GFX8-NEXT: v_add_u32_e32 v1, vcc, 24, v0
-; GFX8-NEXT: buffer_store_dword v12, v1, s[0:3], 0 offen
-; GFX8-NEXT: v_add_u32_e32 v1, vcc, 20, v0
-; GFX8-NEXT: buffer_store_dword v17, v1, s[0:3], 0 offen
-; GFX8-NEXT: v_add_u32_e32 v1, vcc, 16, v0
-; GFX8-NEXT: buffer_store_dword v16, v1, s[0:3], 0 offen
-; GFX8-NEXT: v_add_u32_e32 v1, vcc, 12, v0
+; GFX8-NEXT: buffer_store_dword v15, v5, s[0:3], 0 offen
+; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x48, v0
+; GFX8-NEXT: buffer_store_dword v14, v5, s[0:3], 0 offen
+; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x44, v0
+; GFX8-NEXT: buffer_store_dword v11, v5, s[0:3], 0 offen
+; GFX8-NEXT: v_add_u32_e32 v5, vcc, 64, v0
+; GFX8-NEXT: buffer_store_dword v10, v5, s[0:3], 0 offen
+; GFX8-NEXT: v_add_u32_e32 v5, vcc, 60, v0
+; GFX8-NEXT: buffer_store_dword v26, v5, s[0:3], 0 offen
+; GFX8-NEXT: v_add_u32_e32 v5, vcc, 56, v0
+; GFX8-NEXT: buffer_store_dword v25, v5, s[0:3], 0 offen
+; GFX8-NEXT: v_add_u32_e32 v5, vcc, 52, v0
+; GFX8-NEXT: buffer_store_dword v24, v5, s[0:3], 0 offen
+; GFX8-NEXT: v_add_u32_e32 v5, vcc, 48, v0
+; GFX8-NEXT: buffer_store_dword v23, v5, s[0:3], 0 offen
+; GFX8-NEXT: v_add_u32_e32 v5, vcc, 44, v0
+; GFX8-NEXT: buffer_store_dword v22, v5, s[0:3], 0 offen
+; GFX8-NEXT: v_add_u32_e32 v5, vcc, 40, v0
+; GFX8-NEXT: buffer_store_dword v21, v5, s[0:3], 0 offen
+; GFX8-NEXT: v_add_u32_e32 v5, vcc, 36, v0
+; GFX8-NEXT: buffer_store_dword v20, v5, s[0:3], 0 offen
+; GFX8-NEXT: v_add_u32_e32 v5, vcc, 32, v0
+; GFX8-NEXT: buffer_store_dword v19, v5, s[0:3], 0 offen
+; GFX8-NEXT: v_add_u32_e32 v5, vcc, 28, v0
+; GFX8-NEXT: buffer_store_dword v18, v5, s[0:3], 0 offen
+; GFX8-NEXT: v_add_u32_e32 v5, vcc, 24, v0
+; GFX8-NEXT: buffer_store_dword v17, v5, s[0:3], 0 offen
+; GFX8-NEXT: v_add_u32_e32 v5, vcc, 20, v0
+; GFX8-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen
+; GFX8-NEXT: v_add_u32_e32 v4, vcc, 16, v0
+; GFX8-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen
+; GFX8-NEXT: v_add_u32_e32 v3, vcc, 12, v0
; GFX8-NEXT: v_add_u32_e32 v0, vcc, 8, v0
-; GFX8-NEXT: buffer_store_dword v15, v1, s[0:3], 0 offen
-; GFX8-NEXT: buffer_store_dword v14, v0, s[0:3], 0 offen
-; GFX8-NEXT: buffer_load_dword v58, off, s[0:3], s32 ; 4-byte Folded Reload
-; GFX8-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; GFX8-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
-; GFX8-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; GFX8-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
-; GFX8-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; GFX8-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
-; GFX8-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; GFX8-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
-; GFX8-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
-; GFX8-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; GFX8-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen
+; GFX8-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
+; GFX8-NEXT: buffer_load_dword v57, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX8-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; GFX8-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; GFX8-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; GFX8-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; GFX8-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; GFX8-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; GFX8-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; GFX8-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; GFX8-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: global_extload_v32bf16_to_v32f64:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_ushort v8, v[1:2], off offset:62
-; GFX9-NEXT: global_load_ushort v10, v[1:2], off offset:60
-; GFX9-NEXT: global_load_ushort v11, v[1:2], off offset:58
-; GFX9-NEXT: global_load_ushort v12, v[1:2], off offset:56
-; GFX9-NEXT: global_load_ushort v13, v[1:2], off offset:54
-; GFX9-NEXT: global_load_ushort v14, v[1:2], off offset:52
-; GFX9-NEXT: global_load_ushort v15, v[1:2], off offset:50
-; GFX9-NEXT: global_load_ushort v16, v[1:2], off offset:48
-; GFX9-NEXT: global_load_ushort v17, v[1:2], off offset:46
-; GFX9-NEXT: global_load_ushort v18, v[1:2], off offset:44
-; GFX9-NEXT: global_load_ushort v19, v[1:2], off offset:42
-; GFX9-NEXT: global_load_ushort v20, v[1:2], off offset:40
-; GFX9-NEXT: global_load_ushort v21, v[1:2], off offset:38
-; GFX9-NEXT: global_load_ushort v22, v[1:2], off offset:36
-; GFX9-NEXT: global_load_ushort v23, v[1:2], off offset:34
-; GFX9-NEXT: global_load_ushort v24, v[1:2], off offset:32
-; GFX9-NEXT: global_load_ushort v25, v[1:2], off
-; GFX9-NEXT: global_load_ushort v26, v[1:2], off offset:2
-; GFX9-NEXT: global_load_ushort v27, v[1:2], off offset:30
+; GFX9-NEXT: global_load_ushort v21, v[1:2], off offset:62
+; GFX9-NEXT: global_load_ushort v23, v[1:2], off offset:60
+; GFX9-NEXT: global_load_ushort v24, v[1:2], off offset:58
+; GFX9-NEXT: global_load_ushort v25, v[1:2], off offset:56
+; GFX9-NEXT: global_load_ushort v26, v[1:2], off offset:54
+; GFX9-NEXT: global_load_ushort v27, v[1:2], off offset:52
+; GFX9-NEXT: global_load_ushort v28, v[1:2], off offset:50
+; GFX9-NEXT: global_load_ushort v29, v[1:2], off offset:48
+; GFX9-NEXT: global_load_ushort v30, v[1:2], off offset:46
+; GFX9-NEXT: global_load_ushort v31, v[1:2], off offset:44
+; GFX9-NEXT: global_load_ushort v32, v[1:2], off offset:42
+; GFX9-NEXT: global_load_ushort v33, v[1:2], off offset:40
+; GFX9-NEXT: global_load_ushort v34, v[1:2], off offset:38
+; GFX9-NEXT: global_load_ushort v19, v[1:2], off
+; GFX9-NEXT: global_load_ushort v20, v[1:2], off offset:36
+; GFX9-NEXT: global_load_ushort v17, v[1:2], off offset:2
+; GFX9-NEXT: global_load_ushort v18, v[1:2], off offset:4
+; GFX9-NEXT: global_load_ushort v16, v[1:2], off offset:34
+; GFX9-NEXT: global_load_ushort v11, v[1:2], off offset:32
+; GFX9-NEXT: global_load_ushort v13, v[1:2], off offset:6
+; GFX9-NEXT: global_load_ushort v14, v[1:2], off offset:8
+; GFX9-NEXT: global_load_ushort v15, v[1:2], off offset:30
; GFX9-NEXT: global_load_ushort v3, v[1:2], off offset:16
; GFX9-NEXT: global_load_ushort v4, v[1:2], off offset:18
; GFX9-NEXT: global_load_ushort v5, v[1:2], off offset:20
; GFX9-NEXT: global_load_ushort v6, v[1:2], off offset:22
-; GFX9-NEXT: global_load_ushort v28, v[1:2], off offset:24
-; GFX9-NEXT: global_load_ushort v29, v[1:2], off offset:26
-; GFX9-NEXT: global_load_ushort v30, v[1:2], off offset:28
-; GFX9-NEXT: global_load_ushort v31, v[1:2], off offset:4
-; GFX9-NEXT: global_load_ushort v32, v[1:2], off offset:6
-; GFX9-NEXT: global_load_ushort v33, v[1:2], off offset:8
-; GFX9-NEXT: global_load_ushort v34, v[1:2], off offset:10
+; GFX9-NEXT: global_load_ushort v8, v[1:2], off offset:24
+; GFX9-NEXT: global_load_ushort v10, v[1:2], off offset:26
+; GFX9-NEXT: global_load_ushort v12, v[1:2], off offset:28
+; GFX9-NEXT: global_load_ushort v9, v[1:2], off offset:10
; GFX9-NEXT: global_load_ushort v7, v[1:2], off offset:12
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: global_load_ushort v1, v[1:2], off offset:14
; GFX9-NEXT: s_waitcnt vmcnt(31)
-; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v8
-; GFX9-NEXT: v_cvt_f64_f32_e32 v[8:9], v2
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v21
+; GFX9-NEXT: v_cvt_f64_f32_e32 v[21:22], v2
; GFX9-NEXT: s_waitcnt vmcnt(30)
-; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v10
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v23
; GFX9-NEXT: s_waitcnt vmcnt(28)
-; GFX9-NEXT: v_lshlrev_b32_e32 v10, 16, v12
-; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:252
-; GFX9-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:248
-; GFX9-NEXT: v_cvt_f64_f32_e32 v[8:9], v2
-; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v11
+; GFX9-NEXT: v_lshlrev_b32_e32 v23, 16, v25
+; GFX9-NEXT: buffer_store_dword v22, v0, s[0:3], 0 offen offset:252
+; GFX9-NEXT: buffer_store_dword v21, v0, s[0:3], 0 offen offset:248
+; GFX9-NEXT: v_cvt_f64_f32_e32 v[21:22], v2
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v24
; GFX9-NEXT: s_waitcnt vmcnt(29)
-; GFX9-NEXT: v_lshlrev_b32_e32 v11, 16, v13
-; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:244
-; GFX9-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:240
-; GFX9-NEXT: v_cvt_f64_f32_e32 v[8:9], v2
+; GFX9-NEXT: v_lshlrev_b32_e32 v24, 16, v26
+; GFX9-NEXT: buffer_store_dword v22, v0, s[0:3], 0 offen offset:244
+; GFX9-NEXT: buffer_store_dword v21, v0, s[0:3], 0 offen offset:240
+; GFX9-NEXT: v_cvt_f64_f32_e32 v[21:22], v2
; GFX9-NEXT: s_waitcnt vmcnt(30)
-; GFX9-NEXT: v_lshlrev_b32_e32 v12, 16, v14
-; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:236
-; GFX9-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:232
-; GFX9-NEXT: v_cvt_f64_f32_e32 v[8:9], v10
-; GFX9-NEXT: v_cvt_f64_f32_e32 v[10:11], v11
+; GFX9-NEXT: v_lshlrev_b32_e32 v25, 16, v27
+; GFX9-NEXT: buffer_store_dword v22, v0, s[0:3], 0 offen offset:236
+; GFX9-NEXT: buffer_store_dword v21, v0, s[0:3], 0 offen offset:232
+; GFX9-NEXT: v_cvt_f64_f32_e32 v[21:22], v23
+; GFX9-NEXT: v_cvt_f64_f32_e32 v[23:24], v24
; GFX9-NEXT: s_waitcnt vmcnt(31)
-; GFX9-NEXT: v_lshlrev_b32_e32 v13, 16, v15
+; GFX9-NEXT: v_lshlrev_b32_e32 v26, 16, v28
; GFX9-NEXT: s_waitcnt vmcnt(30)
-; GFX9-NEXT: v_lshlrev_b32_e32 v14, 16, v16
-; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:228
-; GFX9-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:224
-; GFX9-NEXT: v_cvt_f64_f32_e32 v[8:9], v12
-; GFX9-NEXT: s_waitcnt vmcnt(31)
-; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v17
-; GFX9-NEXT: v_cvt_f64_f32_e32 v[12:13], v13
-; GFX9-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:220
-; GFX9-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:216
-; GFX9-NEXT: v_cvt_f64_f32_e32 v[10:11], v14
-; GFX9-NEXT: v_cvt_f64_f32_e32 v[14:15], v15
-; GFX9-NEXT: s_waitcnt vmcnt(32)
-; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v18
-; GFX9-NEXT: s_waitcnt vmcnt(30)
-; GFX9-NEXT: v_lshlrev_b32_e32 v18, 16, v20
+; GFX9-NEXT: v_lshlrev_b32_e32 v27, 16, v29
+; GFX9-NEXT: s_waitcnt vmcnt(29)
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v30
+; GFX9-NEXT: buffer_store_dword v22, v0, s[0:3], 0 offen offset:228
+; GFX9-NEXT: buffer_store_dword v21, v0, s[0:3], 0 offen offset:224
+; GFX9-NEXT: v_cvt_f64_f32_e32 v[21:22], v25
+; GFX9-NEXT: v_cvt_f64_f32_e32 v[25:26], v26
+; GFX9-NEXT: buffer_store_dword v24, v0, s[0:3], 0 offen offset:220
+; GFX9-NEXT: buffer_store_dword v23, v0, s[0:3], 0 offen offset:216
+; GFX9-NEXT: v_cvt_f64_f32_e32 v[23:24], v27
+; GFX9-NEXT: v_cvt_f64_f32_e32 v[27:28], v2
; GFX9-NEXT: s_waitcnt vmcnt(28)
-; GFX9-NEXT: v_lshlrev_b32_e32 v20, 16, v22
-; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:212
-; GFX9-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:208
-; GFX9-NEXT: v_cvt_f64_f32_e32 v[8:9], v2
-; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v19
-; GFX9-NEXT: v_lshlrev_b32_e32 v19, 16, v21
-; GFX9-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:204
-; GFX9-NEXT: buffer_store_dword v12, v0, s[0:3], 0 offen offset:200
-; GFX9-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:196
-; GFX9-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:192
-; GFX9-NEXT: v_cvt_f64_f32_e32 v[10:11], v20
-; GFX9-NEXT: s_waitcnt vmcnt(33)
-; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v23
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v19
+; GFX9-NEXT: s_waitcnt vmcnt(27)
+; GFX9-NEXT: v_lshlrev_b32_e32 v19, 16, v20
+; GFX9-NEXT: v_cvt_f64_f32_e32 v[19:20], v19
+; GFX9-NEXT: v_lshlrev_b32_e32 v29, 16, v31
+; GFX9-NEXT: v_lshlrev_b32_e32 v30, 16, v32
+; GFX9-NEXT: v_lshlrev_b32_e32 v31, 16, v33
+; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v34
+; GFX9-NEXT: buffer_store_dword v22, v0, s[0:3], 0 offen offset:212
+; GFX9-NEXT: buffer_store_dword v21, v0, s[0:3], 0 offen offset:208
+; GFX9-NEXT: v_cvt_f64_f32_e32 v[21:22], v29
+; GFX9-NEXT: s_waitcnt vmcnt(26)
+; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v16
+; GFX9-NEXT: v_cvt_f64_f32_e32 v[29:30], v30
+; GFX9-NEXT: buffer_store_dword v26, v0, s[0:3], 0 offen offset:204
+; GFX9-NEXT: buffer_store_dword v25, v0, s[0:3], 0 offen offset:200
+; GFX9-NEXT: v_cvt_f64_f32_e32 v[25:26], v31
+; GFX9-NEXT: v_cvt_f64_f32_e32 v[31:32], v32
+; GFX9-NEXT: buffer_store_dword v24, v0, s[0:3], 0 offen offset:196
+; GFX9-NEXT: buffer_store_dword v23, v0, s[0:3], 0 offen offset:192
+; GFX9-NEXT: buffer_store_dword v28, v0, s[0:3], 0 offen offset:188
+; GFX9-NEXT: buffer_store_dword v27, v0, s[0:3], 0 offen offset:184
+; GFX9-NEXT: buffer_store_dword v22, v0, s[0:3], 0 offen offset:180
+; GFX9-NEXT: buffer_store_dword v21, v0, s[0:3], 0 offen offset:176
+; GFX9-NEXT: buffer_store_dword v30, v0, s[0:3], 0 offen offset:172
+; GFX9-NEXT: buffer_store_dword v29, v0, s[0:3], 0 offen offset:168
+; GFX9-NEXT: buffer_store_dword v26, v0, s[0:3], 0 offen offset:164
+; GFX9-NEXT: buffer_store_dword v25, v0, s[0:3], 0 offen offset:160
+; GFX9-NEXT: buffer_store_dword v32, v0, s[0:3], 0 offen offset:156
+; GFX9-NEXT: buffer_store_dword v31, v0, s[0:3], 0 offen offset:152
+; GFX9-NEXT: v_lshlrev_b32_e32 v21, 16, v17
; GFX9-NEXT: v_cvt_f64_f32_e32 v[16:17], v16
-; GFX9-NEXT: v_cvt_f64_f32_e32 v[12:13], v18
-; GFX9-NEXT: v_cvt_f64_f32_e32 v[18:19], v19
-; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:188
-; GFX9-NEXT: buffer_store_dword v14, v0, s[0:3], 0 offen offset:184
-; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:180
-; GFX9-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:176
-; GFX9-NEXT: buffer_store_dword v17, v0, s[0:3], 0 offen offset:172
-; GFX9-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen offset:168
-; GFX9-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:164
-; GFX9-NEXT: buffer_store_dword v12, v0, s[0:3], 0 offen offset:160
-; GFX9-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen offset:156
-; GFX9-NEXT: buffer_store_dword v18, v0, s[0:3], 0 offen offset:152
-; GFX9-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:148
-; GFX9-NEXT: v_cvt_f64_f32_e32 v[8:9], v2
-; GFX9-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:144
-; GFX9-NEXT: s_waitcnt vmcnt(44)
-; GFX9-NEXT: v_lshlrev_b32_e32 v10, 16, v24
-; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:140
-; GFX9-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:136
-; GFX9-NEXT: v_cvt_f64_f32_e32 v[8:9], v10
-; GFX9-NEXT: s_waitcnt vmcnt(43)
-; GFX9-NEXT: v_lshlrev_b32_e32 v12, 16, v27
-; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:132
-; GFX9-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:128
-; GFX9-NEXT: v_cvt_f64_f32_e32 v[8:9], v12
-; GFX9-NEXT: s_waitcnt vmcnt(38)
-; GFX9-NEXT: v_lshlrev_b32_e32 v14, 16, v30
-; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:124
-; GFX9-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:120
-; GFX9-NEXT: v_cvt_f64_f32_e32 v[8:9], v14
-; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v29
-; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:116
-; GFX9-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:112
-; GFX9-NEXT: v_cvt_f64_f32_e32 v[8:9], v16
-; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v25
-; GFX9-NEXT: v_cvt_f64_f32_e32 v[10:11], v2
-; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v26
-; GFX9-NEXT: v_cvt_f64_f32_e32 v[12:13], v2
-; GFX9-NEXT: s_waitcnt vmcnt(41)
-; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v31
-; GFX9-NEXT: v_lshlrev_b32_e32 v18, 16, v28
-; GFX9-NEXT: v_cvt_f64_f32_e32 v[14:15], v2
+; GFX9-NEXT: s_waitcnt vmcnt(39)
+; GFX9-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; GFX9-NEXT: buffer_store_dword v20, v0, s[0:3], 0 offen offset:148
+; GFX9-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen offset:144
; GFX9-NEXT: s_waitcnt vmcnt(40)
-; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v32
-; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:108
-; GFX9-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:104
-; GFX9-NEXT: v_cvt_f64_f32_e32 v[8:9], v18
+; GFX9-NEXT: v_lshlrev_b32_e32 v20, 16, v13
+; GFX9-NEXT: s_waitcnt vmcnt(39)
+; GFX9-NEXT: v_lshlrev_b32_e32 v23, 16, v14
+; GFX9-NEXT: v_cvt_f64_f32_e32 v[13:14], v11
+; GFX9-NEXT: buffer_store_dword v17, v0, s[0:3], 0 offen offset:140
+; GFX9-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen offset:136
; GFX9-NEXT: v_cvt_f64_f32_e32 v[16:17], v2
-; GFX9-NEXT: s_waitcnt vmcnt(41)
-; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v33
-; GFX9-NEXT: v_cvt_f64_f32_e32 v[18:19], v2
; GFX9-NEXT: s_waitcnt vmcnt(40)
-; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v34
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v15
+; GFX9-NEXT: buffer_store_dword v14, v0, s[0:3], 0 offen offset:132
+; GFX9-NEXT: v_cvt_f64_f32_e32 v[14:15], v2
+; GFX9-NEXT: s_waitcnt vmcnt(34)
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v12
+; GFX9-NEXT: v_cvt_f64_f32_e32 v[11:12], v2
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v10
+; GFX9-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:128
+; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:124
+; GFX9-NEXT: buffer_store_dword v14, v0, s[0:3], 0 offen offset:120
+; GFX9-NEXT: buffer_store_dword v12, v0, s[0:3], 0 offen offset:116
+; GFX9-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:112
+; GFX9-NEXT: v_cvt_f64_f32_e32 v[10:11], v2
+; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GFX9-NEXT: s_waitcnt vmcnt(38)
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v9
+; GFX9-NEXT: v_cvt_f64_f32_e32 v[8:9], v8
+; GFX9-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:108
+; GFX9-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:104
+; GFX9-NEXT: v_cvt_f64_f32_e32 v[10:11], v2
+; GFX9-NEXT: s_waitcnt vmcnt(39)
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v7
; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v6
-; GFX9-NEXT: v_cvt_f64_f32_e32 v[20:21], v2
-; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v5
+; GFX9-NEXT: s_waitcnt vmcnt(38)
+; GFX9-NEXT: v_lshlrev_b32_e32 v12, 16, v1
+; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v5
; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:100
; GFX9-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:96
-; GFX9-NEXT: v_cvt_f64_f32_e32 v[8:9], v6
-; GFX9-NEXT: v_cvt_f64_f32_e32 v[5:6], v2
-; GFX9-NEXT: s_waitcnt vmcnt(41)
-; GFX9-NEXT: v_lshlrev_b32_e32 v22, 16, v7
-; GFX9-NEXT: s_waitcnt vmcnt(40)
-; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v1
-; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v4
-; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:92
-; GFX9-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:88
+; GFX9-NEXT: v_cvt_f64_f32_e32 v[6:7], v6
+; GFX9-NEXT: v_cvt_f64_f32_e32 v[8:9], v2
; GFX9-NEXT: v_cvt_f64_f32_e32 v[1:2], v1
-; GFX9-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:84
-; GFX9-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:80
-; GFX9-NEXT: v_cvt_f64_f32_e32 v[4:5], v7
-; GFX9-NEXT: v_cvt_f64_f32_e32 v[6:7], v3
-; GFX9-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:76
-; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:72
-; GFX9-NEXT: v_cvt_f64_f32_e32 v[1:2], v22
-; GFX9-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:68
-; GFX9-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:64
-; GFX9-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:60
-; GFX9-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:56
-; GFX9-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:52
-; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:48
-; GFX9-NEXT: buffer_store_dword v21, v0, s[0:3], 0 offen offset:44
-; GFX9-NEXT: buffer_store_dword v20, v0, s[0:3], 0 offen offset:40
-; GFX9-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen offset:36
-; GFX9-NEXT: buffer_store_dword v18, v0, s[0:3], 0 offen offset:32
-; GFX9-NEXT: buffer_store_dword v17, v0, s[0:3], 0 offen offset:28
-; GFX9-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen offset:24
-; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:20
-; GFX9-NEXT: buffer_store_dword v14, v0, s[0:3], 0 offen offset:16
-; GFX9-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:12
-; GFX9-NEXT: buffer_store_dword v12, v0, s[0:3], 0 offen offset:8
-; GFX9-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:4
-; GFX9-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen
+; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v3
+; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v4
+; GFX9-NEXT: v_cvt_f64_f32_e32 v[3:4], v3
+; GFX9-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:92
+; GFX9-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:88
+; GFX9-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:84
+; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:80
+; GFX9-NEXT: v_cvt_f64_f32_e32 v[1:2], v5
+; GFX9-NEXT: v_lshlrev_b32_e32 v22, 16, v18
+; GFX9-NEXT: v_cvt_f64_f32_e32 v[18:19], v21
+; GFX9-NEXT: v_cvt_f64_f32_e32 v[13:14], v22
+; GFX9-NEXT: v_cvt_f64_f32_e32 v[20:21], v20
+; GFX9-NEXT: v_cvt_f64_f32_e32 v[22:23], v23
+; GFX9-NEXT: v_cvt_f64_f32_e32 v[5:6], v12
+; GFX9-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:76
+; GFX9-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:72
+; GFX9-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:68
+; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:64
+; GFX9-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:60
+; GFX9-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:56
+; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:52
+; GFX9-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:48
+; GFX9-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:44
+; GFX9-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:40
+; GFX9-NEXT: buffer_store_dword v23, v0, s[0:3], 0 offen offset:36
+; GFX9-NEXT: buffer_store_dword v22, v0, s[0:3], 0 offen offset:32
+; GFX9-NEXT: buffer_store_dword v21, v0, s[0:3], 0 offen offset:28
+; GFX9-NEXT: buffer_store_dword v20, v0, s[0:3], 0 offen offset:24
+; GFX9-NEXT: buffer_store_dword v14, v0, s[0:3], 0 offen offset:20
+; GFX9-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:16
+; GFX9-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen offset:12
+; GFX9-NEXT: buffer_store_dword v18, v0, s[0:3], 0 offen offset:8
+; GFX9-NEXT: buffer_store_dword v17, v0, s[0:3], 0 offen offset:4
+; GFX9-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
@@ -8612,177 +8612,179 @@ define <32 x double> @global_extload_v32bf16_to_v32f64(ptr addrspace(1) %ptr) {
; GFX10-NEXT: global_load_ushort v16, v[1:2], off offset:26
; GFX10-NEXT: global_load_ushort v17, v[1:2], off offset:28
; GFX10-NEXT: global_load_ushort v18, v[1:2], off offset:30
-; GFX10-NEXT: global_load_ushort v19, v[1:2], off offset:62
-; GFX10-NEXT: global_load_ushort v20, v[1:2], off offset:32
-; GFX10-NEXT: global_load_ushort v21, v[1:2], off offset:34
-; GFX10-NEXT: global_load_ushort v22, v[1:2], off offset:36
-; GFX10-NEXT: global_load_ushort v23, v[1:2], off offset:60
-; GFX10-NEXT: global_load_ushort v24, v[1:2], off offset:38
-; GFX10-NEXT: global_load_ushort v25, v[1:2], off offset:40
-; GFX10-NEXT: global_load_ushort v26, v[1:2], off offset:58
-; GFX10-NEXT: global_load_ushort v27, v[1:2], off offset:42
-; GFX10-NEXT: global_load_ushort v28, v[1:2], off offset:44
-; GFX10-NEXT: global_load_ushort v29, v[1:2], off offset:56
-; GFX10-NEXT: global_load_ushort v30, v[1:2], off offset:46
-; GFX10-NEXT: global_load_ushort v31, v[1:2], off offset:48
-; GFX10-NEXT: global_load_ushort v32, v[1:2], off offset:54
-; GFX10-NEXT: global_load_ushort v33, v[1:2], off offset:50
-; GFX10-NEXT: global_load_ushort v34, v[1:2], off offset:52
+; GFX10-NEXT: global_load_ushort v19, v[1:2], off offset:32
+; GFX10-NEXT: global_load_ushort v20, v[1:2], off offset:34
+; GFX10-NEXT: global_load_ushort v21, v[1:2], off offset:36
+; GFX10-NEXT: global_load_ushort v22, v[1:2], off offset:38
+; GFX10-NEXT: global_load_ushort v23, v[1:2], off offset:40
+; GFX10-NEXT: global_load_ushort v24, v[1:2], off offset:42
+; GFX10-NEXT: global_load_ushort v25, v[1:2], off offset:44
+; GFX10-NEXT: global_load_ushort v26, v[1:2], off offset:46
+; GFX10-NEXT: global_load_ushort v27, v[1:2], off offset:48
+; GFX10-NEXT: global_load_ushort v28, v[1:2], off offset:62
+; GFX10-NEXT: global_load_ushort v29, v[1:2], off offset:50
+; GFX10-NEXT: global_load_ushort v30, v[1:2], off offset:52
+; GFX10-NEXT: global_load_ushort v31, v[1:2], off offset:54
+; GFX10-NEXT: global_load_ushort v32, v[1:2], off offset:60
+; GFX10-NEXT: global_load_ushort v33, v[1:2], off offset:56
+; GFX10-NEXT: global_load_ushort v34, v[1:2], off offset:58
; GFX10-NEXT: s_waitcnt vmcnt(31)
-; GFX10-NEXT: v_lshlrev_b32_e32 v35, 16, v3
+; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX10-NEXT: s_waitcnt vmcnt(30)
-; GFX10-NEXT: v_lshlrev_b32_e32 v36, 16, v4
+; GFX10-NEXT: v_lshlrev_b32_e32 v35, 16, v4
; GFX10-NEXT: s_waitcnt vmcnt(29)
-; GFX10-NEXT: v_lshlrev_b32_e32 v37, 16, v5
+; GFX10-NEXT: v_lshlrev_b32_e32 v36, 16, v5
; GFX10-NEXT: s_waitcnt vmcnt(28)
-; GFX10-NEXT: v_lshlrev_b32_e32 v38, 16, v6
+; GFX10-NEXT: v_lshlrev_b32_e32 v37, 16, v6
; GFX10-NEXT: s_waitcnt vmcnt(27)
-; GFX10-NEXT: v_lshlrev_b32_e32 v39, 16, v7
+; GFX10-NEXT: v_lshlrev_b32_e32 v38, 16, v7
; GFX10-NEXT: s_waitcnt vmcnt(26)
-; GFX10-NEXT: v_lshlrev_b32_e32 v48, 16, v8
+; GFX10-NEXT: v_lshlrev_b32_e32 v39, 16, v8
; GFX10-NEXT: s_waitcnt vmcnt(25)
-; GFX10-NEXT: v_lshlrev_b32_e32 v49, 16, v9
+; GFX10-NEXT: v_lshlrev_b32_e32 v48, 16, v9
; GFX10-NEXT: s_waitcnt vmcnt(24)
-; GFX10-NEXT: v_lshlrev_b32_e32 v50, 16, v10
+; GFX10-NEXT: v_lshlrev_b32_e32 v49, 16, v10
; GFX10-NEXT: s_waitcnt vmcnt(23)
-; GFX10-NEXT: v_lshlrev_b32_e32 v51, 16, v11
+; GFX10-NEXT: v_lshlrev_b32_e32 v50, 16, v11
; GFX10-NEXT: s_waitcnt vmcnt(22)
-; GFX10-NEXT: v_lshlrev_b32_e32 v52, 16, v12
+; GFX10-NEXT: v_lshlrev_b32_e32 v51, 16, v12
; GFX10-NEXT: s_waitcnt vmcnt(21)
-; GFX10-NEXT: v_lshlrev_b32_e32 v53, 16, v13
+; GFX10-NEXT: v_lshlrev_b32_e32 v52, 16, v13
; GFX10-NEXT: s_waitcnt vmcnt(20)
-; GFX10-NEXT: v_lshlrev_b32_e32 v54, 16, v14
-; GFX10-NEXT: v_cvt_f64_f32_e32 v[9:10], v35
-; GFX10-NEXT: v_cvt_f64_f32_e32 v[13:14], v36
-; GFX10-NEXT: s_waitcnt vmcnt(17)
-; GFX10-NEXT: v_lshlrev_b32_e32 v65, 16, v17
-; GFX10-NEXT: s_waitcnt vmcnt(16)
-; GFX10-NEXT: v_lshlrev_b32_e32 v66, 16, v18
+; GFX10-NEXT: v_lshlrev_b32_e32 v53, 16, v14
+; GFX10-NEXT: s_waitcnt vmcnt(19)
+; GFX10-NEXT: v_lshlrev_b32_e32 v54, 16, v15
+; GFX10-NEXT: s_waitcnt vmcnt(18)
+; GFX10-NEXT: v_lshlrev_b32_e32 v55, 16, v16
+; GFX10-NEXT: v_cvt_f64_f32_e32 v[11:12], v37
+; GFX10-NEXT: v_cvt_f64_f32_e32 v[15:16], v38
; GFX10-NEXT: s_waitcnt vmcnt(15)
-; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v19
+; GFX10-NEXT: v_lshlrev_b32_e32 v66, 16, v19
; GFX10-NEXT: s_waitcnt vmcnt(14)
; GFX10-NEXT: v_lshlrev_b32_e32 v67, 16, v20
; GFX10-NEXT: s_waitcnt vmcnt(13)
-; GFX10-NEXT: v_lshlrev_b32_e32 v68, 16, v21
+; GFX10-NEXT: v_lshlrev_b32_e32 v64, 16, v21
; GFX10-NEXT: s_waitcnt vmcnt(12)
-; GFX10-NEXT: v_lshlrev_b32_e32 v69, 16, v22
+; GFX10-NEXT: v_lshlrev_b32_e32 v65, 16, v22
; GFX10-NEXT: s_waitcnt vmcnt(11)
-; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v23
-; GFX10-NEXT: v_cvt_f64_f32_e32 v[1:2], v1
+; GFX10-NEXT: v_lshlrev_b32_e32 v70, 16, v23
+; GFX10-NEXT: s_waitcnt vmcnt(10)
+; GFX10-NEXT: v_lshlrev_b32_e32 v71, 16, v24
; GFX10-NEXT: s_waitcnt vmcnt(9)
-; GFX10-NEXT: v_lshlrev_b32_e32 v71, 16, v25
+; GFX10-NEXT: v_lshlrev_b32_e32 v80, 16, v25
; GFX10-NEXT: s_waitcnt vmcnt(8)
-; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v26
+; GFX10-NEXT: v_lshlrev_b32_e32 v81, 16, v26
; GFX10-NEXT: s_waitcnt vmcnt(7)
-; GFX10-NEXT: v_lshlrev_b32_e32 v80, 16, v27
-; GFX10-NEXT: v_cvt_f64_f32_e32 v[3:4], v3
+; GFX10-NEXT: v_lshlrev_b32_e32 v82, 16, v27
+; GFX10-NEXT: s_waitcnt vmcnt(6)
+; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v28
; GFX10-NEXT: s_waitcnt vmcnt(5)
-; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v29
+; GFX10-NEXT: v_lshlrev_b32_e32 v83, 16, v29
; GFX10-NEXT: s_waitcnt vmcnt(4)
-; GFX10-NEXT: v_lshlrev_b32_e32 v27, 16, v30
-; GFX10-NEXT: v_cvt_f64_f32_e32 v[5:6], v5
+; GFX10-NEXT: v_lshlrev_b32_e32 v84, 16, v30
+; GFX10-NEXT: s_waitcnt vmcnt(3)
+; GFX10-NEXT: v_lshlrev_b32_e32 v29, 16, v31
; GFX10-NEXT: s_waitcnt vmcnt(2)
-; GFX10-NEXT: v_lshlrev_b32_e32 v11, 16, v32
-; GFX10-NEXT: s_waitcnt vmcnt(1)
-; GFX10-NEXT: v_lshlrev_b32_e32 v23, 16, v33
-; GFX10-NEXT: v_cvt_f64_f32_e32 v[7:8], v7
+; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v32
+; GFX10-NEXT: v_cvt_f64_f32_e32 v[1:2], v1
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_lshlrev_b32_e32 v19, 16, v34
-; GFX10-NEXT: v_lshlrev_b32_e32 v25, 16, v31
-; GFX10-NEXT: v_cvt_f64_f32_e32 v[11:12], v11
-; GFX10-NEXT: v_lshlrev_b32_e32 v81, 16, v28
-; GFX10-NEXT: v_lshlrev_b32_e32 v70, 16, v24
-; GFX10-NEXT: v_cvt_f64_f32_e32 v[19:20], v19
-; GFX10-NEXT: v_cvt_f64_f32_e32 v[31:32], v71
-; GFX10-NEXT: v_cvt_f64_f32_e32 v[35:36], v68
-; GFX10-NEXT: v_lshlrev_b32_e32 v64, 16, v16
-; GFX10-NEXT: v_cvt_f64_f32_e32 v[33:34], v70
-; GFX10-NEXT: v_lshlrev_b32_e32 v55, 16, v15
+; GFX10-NEXT: v_lshlrev_b32_e32 v13, 16, v34
+; GFX10-NEXT: v_lshlrev_b32_e32 v21, 16, v33
+; GFX10-NEXT: v_cvt_f64_f32_e32 v[29:30], v29
+; GFX10-NEXT: v_cvt_f64_f32_e32 v[5:6], v5
+; GFX10-NEXT: v_cvt_f64_f32_e32 v[37:38], v84
+; GFX10-NEXT: v_cvt_f64_f32_e32 v[13:14], v13
+; GFX10-NEXT: v_cvt_f64_f32_e32 v[21:22], v21
+; GFX10-NEXT: v_cvt_f64_f32_e32 v[25:26], v50
+; GFX10-NEXT: v_cvt_f64_f32_e32 v[27:28], v51
+; GFX10-NEXT: v_cvt_f64_f32_e32 v[50:51], v82
+; GFX10-NEXT: v_cvt_f64_f32_e32 v[31:32], v52
+; GFX10-NEXT: v_cvt_f64_f32_e32 v[33:34], v53
+; GFX10-NEXT: v_cvt_f64_f32_e32 v[52:53], v80
+; GFX10-NEXT: v_cvt_f64_f32_e32 v[7:8], v35
+; GFX10-NEXT: v_cvt_f64_f32_e32 v[9:10], v36
+; GFX10-NEXT: v_cvt_f64_f32_e32 v[19:20], v48
+; GFX10-NEXT: v_cvt_f64_f32_e32 v[23:24], v49
+; GFX10-NEXT: v_cvt_f64_f32_e32 v[35:36], v54
+; GFX10-NEXT: v_cvt_f64_f32_e32 v[48:49], v55
+; GFX10-NEXT: v_cvt_f64_f32_e32 v[54:55], v70
+; GFX10-NEXT: v_lshlrev_b32_e32 v69, 16, v18
; GFX10-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:252
; GFX10-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:248
-; GFX10-NEXT: v_cvt_f64_f32_e32 v[1:2], v23
-; GFX10-NEXT: v_cvt_f64_f32_e32 v[15:16], v37
-; GFX10-NEXT: v_cvt_f64_f32_e32 v[17:18], v38
-; GFX10-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:244
-; GFX10-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:240
-; GFX10-NEXT: v_cvt_f64_f32_e32 v[3:4], v25
-; GFX10-NEXT: v_cvt_f64_f32_e32 v[37:38], v66
-; GFX10-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:236
-; GFX10-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:232
-; GFX10-NEXT: v_cvt_f64_f32_e32 v[5:6], v27
-; GFX10-NEXT: v_cvt_f64_f32_e32 v[23:24], v48
-; GFX10-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:228
-; GFX10-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:224
-; GFX10-NEXT: v_cvt_f64_f32_e32 v[7:8], v81
-; GFX10-NEXT: v_cvt_f64_f32_e32 v[25:26], v49
-; GFX10-NEXT: buffer_store_dword v12, v0, s[0:3], 0 offen offset:220
-; GFX10-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:216
-; GFX10-NEXT: v_cvt_f64_f32_e32 v[11:12], v80
-; GFX10-NEXT: buffer_store_dword v20, v0, s[0:3], 0 offen offset:212
-; GFX10-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen offset:208
-; GFX10-NEXT: v_cvt_f64_f32_e32 v[19:20], v69
-; GFX10-NEXT: v_cvt_f64_f32_e32 v[48:49], v64
-; GFX10-NEXT: v_cvt_f64_f32_e32 v[27:28], v50
-; GFX10-NEXT: v_cvt_f64_f32_e32 v[29:30], v51
-; GFX10-NEXT: v_cvt_f64_f32_e32 v[50:51], v54
+; GFX10-NEXT: v_cvt_f64_f32_e32 v[1:2], v83
+; GFX10-NEXT: v_lshlrev_b32_e32 v68, 16, v17
+; GFX10-NEXT: v_cvt_f64_f32_e32 v[3:4], v3
+; GFX10-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:244
+; GFX10-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:240
+; GFX10-NEXT: v_cvt_f64_f32_e32 v[5:6], v81
+; GFX10-NEXT: buffer_store_dword v14, v0, s[0:3], 0 offen offset:236
+; GFX10-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:232
+; GFX10-NEXT: v_cvt_f64_f32_e32 v[13:14], v71
+; GFX10-NEXT: buffer_store_dword v22, v0, s[0:3], 0 offen offset:228
+; GFX10-NEXT: buffer_store_dword v21, v0, s[0:3], 0 offen offset:224
+; GFX10-NEXT: v_cvt_f64_f32_e32 v[21:22], v65
+; GFX10-NEXT: v_cvt_f64_f32_e32 v[64:65], v64
+; GFX10-NEXT: buffer_store_dword v30, v0, s[0:3], 0 offen offset:220
+; GFX10-NEXT: buffer_store_dword v29, v0, s[0:3], 0 offen offset:216
+; GFX10-NEXT: v_cvt_f64_f32_e32 v[29:30], v67
+; GFX10-NEXT: v_cvt_f64_f32_e32 v[66:67], v66
+; GFX10-NEXT: buffer_store_dword v38, v0, s[0:3], 0 offen offset:212
+; GFX10-NEXT: buffer_store_dword v37, v0, s[0:3], 0 offen offset:208
+; GFX10-NEXT: v_cvt_f64_f32_e32 v[37:38], v69
+; GFX10-NEXT: v_cvt_f64_f32_e32 v[17:18], v39
+; GFX10-NEXT: v_cvt_f64_f32_e32 v[68:69], v68
; GFX10-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:204
; GFX10-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:200
-; GFX10-NEXT: v_cvt_f64_f32_e32 v[1:2], v67
-; GFX10-NEXT: v_cvt_f64_f32_e32 v[21:22], v39
-; GFX10-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:196
-; GFX10-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:192
-; GFX10-NEXT: v_cvt_f64_f32_e32 v[3:4], v65
+; GFX10-NEXT: buffer_store_dword v51, v0, s[0:3], 0 offen offset:196
+; GFX10-NEXT: buffer_store_dword v50, v0, s[0:3], 0 offen offset:192
; GFX10-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:188
; GFX10-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:184
-; GFX10-NEXT: v_cvt_f64_f32_e32 v[5:6], v55
-; GFX10-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:180
-; GFX10-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:176
-; GFX10-NEXT: v_cvt_f64_f32_e32 v[7:8], v53
-; GFX10-NEXT: buffer_store_dword v12, v0, s[0:3], 0 offen offset:172
-; GFX10-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:168
-; GFX10-NEXT: v_cvt_f64_f32_e32 v[11:12], v52
-; GFX10-NEXT: buffer_store_dword v32, v0, s[0:3], 0 offen offset:164
-; GFX10-NEXT: buffer_store_dword v31, v0, s[0:3], 0 offen offset:160
-; GFX10-NEXT: buffer_store_dword v34, v0, s[0:3], 0 offen offset:156
-; GFX10-NEXT: buffer_store_dword v33, v0, s[0:3], 0 offen offset:152
-; GFX10-NEXT: buffer_store_dword v20, v0, s[0:3], 0 offen offset:148
-; GFX10-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen offset:144
-; GFX10-NEXT: buffer_store_dword v36, v0, s[0:3], 0 offen offset:140
-; GFX10-NEXT: buffer_store_dword v35, v0, s[0:3], 0 offen offset:136
-; GFX10-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:132
-; GFX10-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:128
+; GFX10-NEXT: buffer_store_dword v53, v0, s[0:3], 0 offen offset:180
+; GFX10-NEXT: buffer_store_dword v52, v0, s[0:3], 0 offen offset:176
+; GFX10-NEXT: buffer_store_dword v14, v0, s[0:3], 0 offen offset:172
+; GFX10-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:168
+; GFX10-NEXT: buffer_store_dword v55, v0, s[0:3], 0 offen offset:164
+; GFX10-NEXT: buffer_store_dword v54, v0, s[0:3], 0 offen offset:160
+; GFX10-NEXT: buffer_store_dword v22, v0, s[0:3], 0 offen offset:156
+; GFX10-NEXT: buffer_store_dword v21, v0, s[0:3], 0 offen offset:152
+; GFX10-NEXT: buffer_store_dword v65, v0, s[0:3], 0 offen offset:148
+; GFX10-NEXT: buffer_store_dword v64, v0, s[0:3], 0 offen offset:144
+; GFX10-NEXT: buffer_store_dword v30, v0, s[0:3], 0 offen offset:140
+; GFX10-NEXT: buffer_store_dword v29, v0, s[0:3], 0 offen offset:136
+; GFX10-NEXT: buffer_store_dword v67, v0, s[0:3], 0 offen offset:132
+; GFX10-NEXT: buffer_store_dword v66, v0, s[0:3], 0 offen offset:128
; GFX10-NEXT: buffer_store_dword v38, v0, s[0:3], 0 offen offset:124
; GFX10-NEXT: buffer_store_dword v37, v0, s[0:3], 0 offen offset:120
-; GFX10-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:116
-; GFX10-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:112
+; GFX10-NEXT: buffer_store_dword v69, v0, s[0:3], 0 offen offset:116
+; GFX10-NEXT: buffer_store_dword v68, v0, s[0:3], 0 offen offset:112
; GFX10-NEXT: buffer_store_dword v49, v0, s[0:3], 0 offen offset:108
; GFX10-NEXT: buffer_store_dword v48, v0, s[0:3], 0 offen offset:104
-; GFX10-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:100
-; GFX10-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:96
-; GFX10-NEXT: buffer_store_dword v51, v0, s[0:3], 0 offen offset:92
-; GFX10-NEXT: buffer_store_dword v50, v0, s[0:3], 0 offen offset:88
-; GFX10-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:84
-; GFX10-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:80
-; GFX10-NEXT: buffer_store_dword v12, v0, s[0:3], 0 offen offset:76
-; GFX10-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:72
-; GFX10-NEXT: buffer_store_dword v30, v0, s[0:3], 0 offen offset:68
-; GFX10-NEXT: buffer_store_dword v29, v0, s[0:3], 0 offen offset:64
-; GFX10-NEXT: buffer_store_dword v28, v0, s[0:3], 0 offen offset:60
-; GFX10-NEXT: buffer_store_dword v27, v0, s[0:3], 0 offen offset:56
-; GFX10-NEXT: buffer_store_dword v26, v0, s[0:3], 0 offen offset:52
-; GFX10-NEXT: buffer_store_dword v25, v0, s[0:3], 0 offen offset:48
-; GFX10-NEXT: buffer_store_dword v24, v0, s[0:3], 0 offen offset:44
-; GFX10-NEXT: buffer_store_dword v23, v0, s[0:3], 0 offen offset:40
-; GFX10-NEXT: buffer_store_dword v22, v0, s[0:3], 0 offen offset:36
-; GFX10-NEXT: buffer_store_dword v21, v0, s[0:3], 0 offen offset:32
-; GFX10-NEXT: buffer_store_dword v18, v0, s[0:3], 0 offen offset:28
-; GFX10-NEXT: buffer_store_dword v17, v0, s[0:3], 0 offen offset:24
-; GFX10-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen offset:20
-; GFX10-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:16
-; GFX10-NEXT: buffer_store_dword v14, v0, s[0:3], 0 offen offset:12
-; GFX10-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:8
-; GFX10-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:4
-; GFX10-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen
+; GFX10-NEXT: buffer_store_dword v36, v0, s[0:3], 0 offen offset:100
+; GFX10-NEXT: buffer_store_dword v35, v0, s[0:3], 0 offen offset:96
+; GFX10-NEXT: buffer_store_dword v34, v0, s[0:3], 0 offen offset:92
+; GFX10-NEXT: buffer_store_dword v33, v0, s[0:3], 0 offen offset:88
+; GFX10-NEXT: buffer_store_dword v32, v0, s[0:3], 0 offen offset:84
+; GFX10-NEXT: buffer_store_dword v31, v0, s[0:3], 0 offen offset:80
+; GFX10-NEXT: buffer_store_dword v28, v0, s[0:3], 0 offen offset:76
+; GFX10-NEXT: buffer_store_dword v27, v0, s[0:3], 0 offen offset:72
+; GFX10-NEXT: buffer_store_dword v26, v0, s[0:3], 0 offen offset:68
+; GFX10-NEXT: buffer_store_dword v25, v0, s[0:3], 0 offen offset:64
+; GFX10-NEXT: buffer_store_dword v24, v0, s[0:3], 0 offen offset:60
+; GFX10-NEXT: buffer_store_dword v23, v0, s[0:3], 0 offen offset:56
+; GFX10-NEXT: buffer_store_dword v20, v0, s[0:3], 0 offen offset:52
+; GFX10-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen offset:48
+; GFX10-NEXT: buffer_store_dword v18, v0, s[0:3], 0 offen offset:44
+; GFX10-NEXT: buffer_store_dword v17, v0, s[0:3], 0 offen offset:40
+; GFX10-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen offset:36
+; GFX10-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:32
+; GFX10-NEXT: buffer_store_dword v12, v0, s[0:3], 0 offen offset:28
+; GFX10-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:24
+; GFX10-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:20
+; GFX10-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:16
+; GFX10-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:12
+; GFX10-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:8
+; GFX10-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:4
+; GFX10-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: global_extload_v32bf16_to_v32f64:
@@ -9339,7 +9341,7 @@ define <3 x bfloat> @v_fadd_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) {
; GFX11TRUE16-NEXT: v_perm_b32 v0, v0, v2, 0x7060302
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v1, v3, v6, vcc_lo
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11TRUE16-NEXT: v_alignbit_b32 v1, v0, v1, 16
+; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11FAKE16-LABEL: v_fadd_v3bf16:
@@ -10057,47 +10059,55 @@ define <16 x bfloat> @v_fadd_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GCN-NEXT: v_add_f32_e32 v12, v12, v28
; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11
; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v27
-; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10
-; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v26
-; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
-; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25
-; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8
-; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v24
-; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
-; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23
-; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
-; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22
-; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
-; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21
-; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
-; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20
-; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19
-; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18
-; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17
-; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16
-; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15
; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
; GCN-NEXT: v_add_f32_e32 v11, v11, v27
-; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32
+; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10
+; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v26
; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; GCN-NEXT: v_add_f32_e32 v10, v10, v26
+; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
+; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25
; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; GCN-NEXT: v_add_f32_e32 v9, v9, v25
+; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8
+; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v24
; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; GCN-NEXT: v_add_f32_e32 v8, v8, v24
+; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
+; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23
; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GCN-NEXT: v_add_f32_e32 v7, v7, v23
+; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
+; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22
; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GCN-NEXT: v_add_f32_e32 v6, v6, v22
+; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21
; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GCN-NEXT: v_add_f32_e32 v5, v5, v21
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16
+; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17
+; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18
+; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19
+; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20
+; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15
; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GCN-NEXT: v_add_f32_e32 v4, v4, v20
+; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32
+; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
@@ -10106,14 +10116,6 @@ define <16 x bfloat> @v_fadd_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
-; GCN-NEXT: v_add_f32_e32 v10, v10, v26
-; GCN-NEXT: v_add_f32_e32 v9, v9, v25
-; GCN-NEXT: v_add_f32_e32 v8, v8, v24
-; GCN-NEXT: v_add_f32_e32 v7, v7, v23
-; GCN-NEXT: v_add_f32_e32 v6, v6, v22
-; GCN-NEXT: v_add_f32_e32 v5, v5, v21
-; GCN-NEXT: v_add_f32_e32 v4, v4, v20
; GCN-NEXT: v_add_f32_e32 v3, v3, v19
; GCN-NEXT: v_add_f32_e32 v2, v2, v18
; GCN-NEXT: v_add_f32_e32 v1, v1, v17
@@ -10133,7 +10135,7 @@ define <16 x bfloat> @v_fadd_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v27
+; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v20
; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
; GCN-NEXT: v_add_f32_e32 v15, v15, v16
; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
@@ -10143,22 +10145,20 @@ define <16 x bfloat> @v_fadd_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX7-LABEL: v_fadd_v16bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11
-; GFX7-NEXT: v_mul_f32_e32 v27, 1.0, v27
-; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
-; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
-; GFX7-NEXT: v_add_f32_e32 v11, v11, v27
-; GFX7-NEXT: buffer_load_dword v27, off, s[0:3], s32
; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22
; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GFX7-NEXT: v_add_f32_e32 v6, v6, v22
+; GFX7-NEXT: buffer_load_dword v22, off, s[0:3], s32
; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14
; GFX7-NEXT: v_mul_f32_e32 v30, 1.0, v30
; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13
; GFX7-NEXT: v_mul_f32_e32 v29, 1.0, v29
; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12
; GFX7-NEXT: v_mul_f32_e32 v28, 1.0, v28
+; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11
+; GFX7-NEXT: v_mul_f32_e32 v27, 1.0, v27
; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10
; GFX7-NEXT: v_mul_f32_e32 v26, 1.0, v26
; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9
@@ -10169,24 +10169,25 @@ define <16 x bfloat> @v_fadd_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX7-NEXT: v_mul_f32_e32 v23, 1.0, v23
; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15
; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
-; GFX7-NEXT: v_add_f32_e32 v6, v6, v22
; GFX7-NEXT: v_mul_f32_e32 v21, 1.0, v21
-; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
-; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v20
-; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18
-; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17
+; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18
+; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19
+; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v20
; GFX7-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
; GFX7-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
+; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
+; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
; GFX7-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
@@ -10211,6 +10212,7 @@ define <16 x bfloat> @v_fadd_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX7-NEXT: v_add_f32_e32 v14, v14, v30
; GFX7-NEXT: v_add_f32_e32 v13, v13, v29
; GFX7-NEXT: v_add_f32_e32 v12, v12, v28
+; GFX7-NEXT: v_add_f32_e32 v11, v11, v27
; GFX7-NEXT: v_add_f32_e32 v10, v10, v26
; GFX7-NEXT: v_add_f32_e32 v9, v9, v25
; GFX7-NEXT: v_add_f32_e32 v8, v8, v24
@@ -10229,7 +10231,7 @@ define <16 x bfloat> @v_fadd_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v27
+; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22
; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
; GFX7-NEXT: v_add_f32_e32 v15, v15, v22
; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
@@ -11687,10 +11689,10 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX8-NEXT: v_lshrrev_b32_e32 v8, 16, v8
; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v9
; GFX8-NEXT: v_lshrrev_b32_e32 v10, 16, v10
-; GFX8-NEXT: v_lshrrev_b32_e32 v11, 16, v11
; GFX8-NEXT: v_lshrrev_b32_e32 v16, 16, v30
; GFX8-NEXT: v_lshrrev_b32_e32 v13, 16, v13
; GFX8-NEXT: v_lshrrev_b32_e32 v12, 16, v12
+; GFX8-NEXT: v_lshrrev_b32_e32 v11, 16, v11
; GFX8-NEXT: v_alignbit_b32 v0, v0, v17, 16
; GFX8-NEXT: v_alignbit_b32 v1, v1, v18, 16
; GFX8-NEXT: v_alignbit_b32 v2, v2, v19, 16
@@ -11993,278 +11995,278 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: buffer_load_dword v32, off, s[0:3], s32
-; GFX10-NEXT: v_lshlrev_b32_e32 v37, 16, v28
-; GFX10-NEXT: v_lshlrev_b32_e32 v38, 16, v12
-; GFX10-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
-; GFX10-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
; GFX10-NEXT: v_lshlrev_b32_e32 v39, 16, v27
; GFX10-NEXT: v_lshlrev_b32_e32 v48, 16, v11
; GFX10-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
; GFX10-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
; GFX10-NEXT: v_lshlrev_b32_e32 v49, 16, v26
; GFX10-NEXT: v_lshlrev_b32_e32 v50, 16, v10
-; GFX10-NEXT: v_lshlrev_b32_e32 v33, 16, v30
-; GFX10-NEXT: v_lshlrev_b32_e32 v34, 16, v14
-; GFX10-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
-; GFX10-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
-; GFX10-NEXT: v_lshlrev_b32_e32 v35, 16, v29
-; GFX10-NEXT: v_lshlrev_b32_e32 v36, 16, v13
-; GFX10-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
-; GFX10-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
-; GFX10-NEXT: v_add_f32_e32 v12, v12, v28
-; GFX10-NEXT: v_lshlrev_b32_e32 v28, 16, v22
-; GFX10-NEXT: v_add_f32_e32 v39, v48, v39
-; GFX10-NEXT: v_lshlrev_b32_e32 v48, 16, v6
-; GFX10-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
-; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
-; GFX10-NEXT: v_add_f32_e32 v11, v11, v27
-; GFX10-NEXT: v_lshlrev_b32_e32 v27, 16, v21
-; GFX10-NEXT: v_add_f32_e32 v49, v50, v49
-; GFX10-NEXT: v_lshlrev_b32_e32 v50, 16, v5
-; GFX10-NEXT: v_add_f32_e32 v33, v34, v33
-; GFX10-NEXT: v_add_f32_e32 v14, v14, v30
-; GFX10-NEXT: v_lshlrev_b32_e32 v30, 16, v24
-; GFX10-NEXT: v_add_f32_e32 v35, v36, v35
-; GFX10-NEXT: v_lshlrev_b32_e32 v36, 16, v8
-; GFX10-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
-; GFX10-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
-; GFX10-NEXT: v_add_f32_e32 v13, v13, v29
-; GFX10-NEXT: v_lshlrev_b32_e32 v29, 16, v23
-; GFX10-NEXT: v_add_f32_e32 v37, v38, v37
-; GFX10-NEXT: v_lshlrev_b32_e32 v38, 16, v7
-; GFX10-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
-; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
-; GFX10-NEXT: v_add_f32_e32 v6, v6, v22
-; GFX10-NEXT: v_lshlrev_b32_e32 v22, 16, v16
-; GFX10-NEXT: v_add_f32_e32 v27, v50, v27
-; GFX10-NEXT: v_lshlrev_b32_e32 v50, 16, v0
-; GFX10-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
-; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX10-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
; GFX10-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; GFX10-NEXT: v_lshlrev_b32_e32 v37, 16, v28
+; GFX10-NEXT: v_lshlrev_b32_e32 v38, 16, v12
+; GFX10-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
+; GFX10-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
; GFX10-NEXT: v_lshlrev_b32_e32 v51, 16, v25
-; GFX10-NEXT: v_lshlrev_b32_e32 v34, 16, v9
+; GFX10-NEXT: v_lshlrev_b32_e32 v52, 16, v9
; GFX10-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
; GFX10-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
-; GFX10-NEXT: v_add_f32_e32 v8, v8, v24
-; GFX10-NEXT: v_lshlrev_b32_e32 v24, 16, v18
-; GFX10-NEXT: v_add_f32_e32 v29, v38, v29
-; GFX10-NEXT: v_lshlrev_b32_e32 v38, 16, v2
+; GFX10-NEXT: v_lshlrev_b32_e32 v53, 16, v24
+; GFX10-NEXT: v_lshlrev_b32_e32 v54, 16, v8
+; GFX10-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
+; GFX10-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; GFX10-NEXT: v_lshlrev_b32_e32 v55, 16, v23
+; GFX10-NEXT: v_lshlrev_b32_e32 v64, 16, v7
+; GFX10-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
+; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GFX10-NEXT: v_lshlrev_b32_e32 v65, 16, v22
+; GFX10-NEXT: v_lshlrev_b32_e32 v66, 16, v6
+; GFX10-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
+; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GFX10-NEXT: v_lshlrev_b32_e32 v67, 16, v21
+; GFX10-NEXT: v_lshlrev_b32_e32 v68, 16, v5
+; GFX10-NEXT: v_add_f32_e32 v39, v48, v39
+; GFX10-NEXT: v_add_f32_e32 v11, v11, v27
+; GFX10-NEXT: v_add_f32_e32 v49, v50, v49
+; GFX10-NEXT: v_add_f32_e32 v10, v10, v26
+; GFX10-NEXT: v_lshlrev_b32_e32 v35, 16, v29
+; GFX10-NEXT: v_lshlrev_b32_e32 v36, 16, v13
+; GFX10-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
+; GFX10-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
+; GFX10-NEXT: v_add_f32_e32 v37, v38, v37
+; GFX10-NEXT: v_lshlrev_b32_e32 v38, 16, v18
+; GFX10-NEXT: v_add_f32_e32 v12, v12, v28
+; GFX10-NEXT: v_lshlrev_b32_e32 v28, 16, v2
; GFX10-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX10-NEXT: v_add_f32_e32 v7, v7, v23
-; GFX10-NEXT: v_lshlrev_b32_e32 v23, 16, v17
-; GFX10-NEXT: v_add_f32_e32 v28, v48, v28
-; GFX10-NEXT: v_lshlrev_b32_e32 v48, 16, v1
+; GFX10-NEXT: v_lshlrev_b32_e32 v48, 16, v17
+; GFX10-NEXT: v_lshlrev_b32_e32 v27, 16, v1
; GFX10-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX10-NEXT: v_add_f32_e32 v0, v0, v16
-; GFX10-NEXT: v_bfe_u32 v16, v33, 16, 1
-; GFX10-NEXT: v_add_f32_e32 v10, v10, v26
-; GFX10-NEXT: v_lshlrev_b32_e32 v26, 16, v20
-; GFX10-NEXT: v_add_f32_e32 v34, v34, v51
-; GFX10-NEXT: v_lshlrev_b32_e32 v51, 16, v4
-; GFX10-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
-; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX10-NEXT: v_lshlrev_b32_e32 v50, 16, v16
+; GFX10-NEXT: v_lshlrev_b32_e32 v26, 16, v0
+; GFX10-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
+; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX10-NEXT: v_add_f32_e32 v9, v9, v25
-; GFX10-NEXT: v_lshlrev_b32_e32 v25, 16, v19
-; GFX10-NEXT: v_add_f32_e32 v30, v36, v30
-; GFX10-NEXT: v_lshlrev_b32_e32 v36, 16, v3
+; GFX10-NEXT: v_add_f32_e32 v25, v54, v53
+; GFX10-NEXT: v_add_f32_e32 v8, v8, v24
+; GFX10-NEXT: v_add_f32_e32 v24, v64, v55
+; GFX10-NEXT: v_add_f32_e32 v7, v7, v23
+; GFX10-NEXT: v_add_f32_e32 v23, v66, v65
+; GFX10-NEXT: v_add_f32_e32 v6, v6, v22
+; GFX10-NEXT: v_add_f32_e32 v22, v68, v67
+; GFX10-NEXT: v_bfe_u32 v53, v39, 16, 1
+; GFX10-NEXT: v_bfe_u32 v55, v11, 16, 1
+; GFX10-NEXT: v_bfe_u32 v65, v49, 16, 1
+; GFX10-NEXT: v_bfe_u32 v67, v10, 16, 1
+; GFX10-NEXT: v_lshlrev_b32_e32 v33, 16, v30
+; GFX10-NEXT: v_lshlrev_b32_e32 v34, 16, v14
+; GFX10-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
+; GFX10-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
+; GFX10-NEXT: v_add_f32_e32 v35, v36, v35
+; GFX10-NEXT: v_lshlrev_b32_e32 v36, 16, v19
+; GFX10-NEXT: v_add_f32_e32 v13, v13, v29
+; GFX10-NEXT: v_lshlrev_b32_e32 v29, 16, v3
; GFX10-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX10-NEXT: v_add_f32_e32 v2, v2, v18
-; GFX10-NEXT: v_add_f32_e32 v18, v48, v23
+; GFX10-NEXT: v_add_f32_e32 v18, v27, v48
; GFX10-NEXT: v_add_f32_e32 v1, v1, v17
-; GFX10-NEXT: v_add_f32_e32 v17, v50, v22
-; GFX10-NEXT: v_or_b32_e32 v22, 0x400000, v33
-; GFX10-NEXT: v_bfe_u32 v23, v14, 16, 1
-; GFX10-NEXT: v_add3_u32 v16, v16, v33, 0x7fff
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33
+; GFX10-NEXT: v_add_f32_e32 v17, v26, v50
+; GFX10-NEXT: v_add_f32_e32 v0, v0, v16
+; GFX10-NEXT: v_or_b32_e32 v54, 0x400000, v39
+; GFX10-NEXT: v_or_b32_e32 v64, 0x400000, v11
+; GFX10-NEXT: v_or_b32_e32 v66, 0x400000, v49
+; GFX10-NEXT: v_or_b32_e32 v68, 0x400000, v10
+; GFX10-NEXT: v_cmp_u_f32_e64 s9, v39, v39
+; GFX10-NEXT: v_add3_u32 v39, v53, v39, 0x7fff
+; GFX10-NEXT: v_cmp_u_f32_e64 s10, v11, v11
+; GFX10-NEXT: v_add3_u32 v11, v55, v11, 0x7fff
+; GFX10-NEXT: v_cmp_u_f32_e64 s11, v49, v49
+; GFX10-NEXT: v_add3_u32 v49, v65, v49, 0x7fff
+; GFX10-NEXT: v_cmp_u_f32_e64 s12, v10, v10
+; GFX10-NEXT: v_add3_u32 v10, v67, v10, 0x7fff
; GFX10-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; GFX10-NEXT: v_add_f32_e32 v4, v4, v20
-; GFX10-NEXT: v_add_f32_e32 v20, v36, v25
+; GFX10-NEXT: v_add_f32_e32 v33, v34, v33
+; GFX10-NEXT: v_lshlrev_b32_e32 v34, 16, v20
+; GFX10-NEXT: v_add_f32_e32 v14, v14, v30
+; GFX10-NEXT: v_lshlrev_b32_e32 v30, 16, v4
+; GFX10-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
+; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GFX10-NEXT: v_add_f32_e32 v3, v3, v19
-; GFX10-NEXT: v_add_f32_e32 v19, v38, v24
-; GFX10-NEXT: v_or_b32_e32 v24, 0x400000, v14
-; GFX10-NEXT: v_bfe_u32 v25, v35, 16, 1
-; GFX10-NEXT: v_add3_u32 v23, v23, v14, 0x7fff
-; GFX10-NEXT: v_cndmask_b32_e32 v16, v16, v22, vcc_lo
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14
+; GFX10-NEXT: v_add_f32_e32 v19, v28, v38
+; GFX10-NEXT: v_bfe_u32 v38, v37, 16, 1
+; GFX10-NEXT: v_bfe_u32 v50, v12, 16, 1
+; GFX10-NEXT: v_cndmask_b32_e64 v39, v39, v54, s9
+; GFX10-NEXT: v_bfe_u32 v54, v18, 16, 1
+; GFX10-NEXT: v_cndmask_b32_e64 v11, v11, v64, s10
+; GFX10-NEXT: v_bfe_u32 v64, v1, 16, 1
+; GFX10-NEXT: v_cndmask_b32_e64 v49, v49, v66, s11
+; GFX10-NEXT: v_bfe_u32 v66, v17, 16, 1
+; GFX10-NEXT: v_cndmask_b32_e64 v10, v10, v68, s12
+; GFX10-NEXT: v_bfe_u32 v68, v0, 16, 1
+; GFX10-NEXT: v_add_f32_e32 v51, v52, v51
; GFX10-NEXT: v_add_f32_e32 v5, v5, v21
-; GFX10-NEXT: v_add_f32_e32 v21, v51, v26
-; GFX10-NEXT: v_or_b32_e32 v26, 0x400000, v35
-; GFX10-NEXT: v_bfe_u32 v36, v13, 16, 1
-; GFX10-NEXT: v_add3_u32 v25, v25, v35, 0x7fff
-; GFX10-NEXT: v_cndmask_b32_e32 v23, v23, v24, vcc_lo
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35
-; GFX10-NEXT: v_or_b32_e32 v38, 0x400000, v13
-; GFX10-NEXT: v_bfe_u32 v48, v37, 16, 1
-; GFX10-NEXT: v_add3_u32 v36, v36, v13, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v50, 0x400000, v37
-; GFX10-NEXT: v_cndmask_b32_e32 v25, v25, v26, vcc_lo
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
-; GFX10-NEXT: v_bfe_u32 v51, v12, 16, 1
-; GFX10-NEXT: v_add3_u32 v48, v48, v37, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v33, 0x400000, v12
-; GFX10-NEXT: v_bfe_u32 v22, v39, 16, 1
-; GFX10-NEXT: v_cndmask_b32_e32 v36, v36, v38, vcc_lo
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37
-; GFX10-NEXT: v_add3_u32 v51, v51, v12, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v14, 0x400000, v39
-; GFX10-NEXT: v_bfe_u32 v24, v11, 16, 1
-; GFX10-NEXT: v_add3_u32 v22, v22, v39, 0x7fff
-; GFX10-NEXT: v_cndmask_b32_e32 v48, v48, v50, vcc_lo
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
-; GFX10-NEXT: v_or_b32_e32 v35, 0x400000, v11
-; GFX10-NEXT: v_bfe_u32 v26, v49, 16, 1
-; GFX10-NEXT: v_add3_u32 v24, v24, v11, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v13, 0x400000, v49
-; GFX10-NEXT: v_cndmask_b32_e32 v33, v51, v33, vcc_lo
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v39, v39
-; GFX10-NEXT: v_bfe_u32 v38, v10, 16, 1
-; GFX10-NEXT: v_add3_u32 v26, v26, v49, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v37, 0x400000, v10
-; GFX10-NEXT: v_bfe_u32 v50, v34, 16, 1
-; GFX10-NEXT: v_cndmask_b32_e32 v14, v22, v14, vcc_lo
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
-; GFX10-NEXT: v_add3_u32 v38, v38, v10, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v12, 0x400000, v34
-; GFX10-NEXT: v_bfe_u32 v51, v9, 16, 1
-; GFX10-NEXT: v_add3_u32 v50, v50, v34, 0x7fff
-; GFX10-NEXT: v_cndmask_b32_e32 v24, v24, v35, vcc_lo
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v49, v49
-; GFX10-NEXT: v_or_b32_e32 v39, 0x400000, v9
-; GFX10-NEXT: v_bfe_u32 v22, v30, 16, 1
-; GFX10-NEXT: v_add3_u32 v51, v51, v9, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v11, 0x400000, v30
-; GFX10-NEXT: v_cndmask_b32_e32 v13, v26, v13, vcc_lo
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
-; GFX10-NEXT: v_bfe_u32 v35, v8, 16, 1
-; GFX10-NEXT: v_add3_u32 v22, v22, v30, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v49, 0x400000, v8
-; GFX10-NEXT: v_bfe_u32 v26, v29, 16, 1
-; GFX10-NEXT: v_cndmask_b32_e32 v37, v38, v37, vcc_lo
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34
-; GFX10-NEXT: v_add3_u32 v35, v35, v8, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v29
-; GFX10-NEXT: v_bfe_u32 v38, v7, 16, 1
-; GFX10-NEXT: v_add3_u32 v26, v26, v29, 0x7fff
-; GFX10-NEXT: v_cndmask_b32_e32 v12, v50, v12, vcc_lo
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
-; GFX10-NEXT: v_or_b32_e32 v34, 0x400000, v7
-; GFX10-NEXT: v_bfe_u32 v50, v28, 16, 1
-; GFX10-NEXT: v_add3_u32 v38, v38, v7, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v28
-; GFX10-NEXT: v_cndmask_b32_e32 v39, v51, v39, vcc_lo
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30
-; GFX10-NEXT: v_bfe_u32 v51, v6, 16, 1
-; GFX10-NEXT: v_add3_u32 v50, v50, v28, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v30, 0x400000, v6
+; GFX10-NEXT: v_add_f32_e32 v21, v30, v34
+; GFX10-NEXT: v_add_f32_e32 v4, v4, v20
+; GFX10-NEXT: v_add_f32_e32 v20, v29, v36
+; GFX10-NEXT: v_bfe_u32 v16, v33, 16, 1
+; GFX10-NEXT: v_bfe_u32 v27, v14, 16, 1
+; GFX10-NEXT: v_bfe_u32 v29, v35, 16, 1
+; GFX10-NEXT: v_bfe_u32 v34, v13, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v48, 0x400000, v37
+; GFX10-NEXT: v_or_b32_e32 v52, 0x400000, v12
+; GFX10-NEXT: v_cmp_u_f32_e64 s7, v37, v37
+; GFX10-NEXT: v_add3_u32 v37, v38, v37, 0x7fff
+; GFX10-NEXT: v_cmp_u_f32_e64 s8, v12, v12
+; GFX10-NEXT: v_add3_u32 v12, v50, v12, 0x7fff
+; GFX10-NEXT: v_cmp_u_f32_e64 s10, v18, v18
+; GFX10-NEXT: v_add3_u32 v54, v54, v18, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v18, 0x400000, v18
+; GFX10-NEXT: v_cmp_u_f32_e64 s11, v1, v1
+; GFX10-NEXT: v_add3_u32 v64, v64, v1, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v1, 0x400000, v1
+; GFX10-NEXT: v_cmp_u_f32_e64 s12, v17, v17
+; GFX10-NEXT: v_add3_u32 v66, v66, v17, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v17, 0x400000, v17
+; GFX10-NEXT: v_cmp_u_f32_e64 s22, v0, v0
+; GFX10-NEXT: v_add3_u32 v68, v68, v0, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v0, 0x400000, v0
+; GFX10-NEXT: v_or_b32_e32 v26, 0x400000, v33
+; GFX10-NEXT: v_or_b32_e32 v28, 0x400000, v14
+; GFX10-NEXT: v_or_b32_e32 v30, 0x400000, v35
+; GFX10-NEXT: v_or_b32_e32 v36, 0x400000, v13
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33
+; GFX10-NEXT: v_add3_u32 v16, v16, v33, 0x7fff
+; GFX10-NEXT: v_bfe_u32 v33, v51, 16, 1
+; GFX10-NEXT: v_cmp_u_f32_e64 s4, v14, v14
+; GFX10-NEXT: v_add3_u32 v14, v27, v14, 0x7fff
+; GFX10-NEXT: v_cmp_u_f32_e64 s5, v35, v35
+; GFX10-NEXT: v_add3_u32 v29, v29, v35, 0x7fff
+; GFX10-NEXT: v_cmp_u_f32_e64 s6, v13, v13
+; GFX10-NEXT: v_add3_u32 v13, v34, v13, 0x7fff
+; GFX10-NEXT: v_bfe_u32 v65, v24, 16, 1
+; GFX10-NEXT: v_cndmask_b32_e64 v37, v37, v48, s7
+; GFX10-NEXT: v_bfe_u32 v48, v19, 16, 1
+; GFX10-NEXT: v_cndmask_b32_e64 v12, v12, v52, s8
+; GFX10-NEXT: v_bfe_u32 v52, v2, 16, 1
+; GFX10-NEXT: v_cndmask_b32_e64 v18, v54, v18, s10
+; GFX10-NEXT: v_cndmask_b32_e64 v17, v66, v17, s12
+; GFX10-NEXT: v_cndmask_b32_e64 v0, v68, v0, s22
+; GFX10-NEXT: v_cndmask_b32_e64 v1, v64, v1, s11
; GFX10-NEXT: v_lshlrev_b32_e32 v31, 16, v15
-; GFX10-NEXT: v_cndmask_b32_e32 v11, v22, v11, vcc_lo
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
-; GFX10-NEXT: v_bfe_u32 v22, v27, 16, 1
-; GFX10-NEXT: v_add3_u32 v51, v51, v6, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v27
; GFX10-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
-; GFX10-NEXT: v_cndmask_b32_e32 v35, v35, v49, vcc_lo
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29
-; GFX10-NEXT: v_bfe_u32 v49, v5, 16, 1
-; GFX10-NEXT: v_add3_u32 v22, v22, v27, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v29, 0x400000, v5
-; GFX10-NEXT: v_cndmask_b32_e32 v10, v26, v10, vcc_lo
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX10-NEXT: v_or_b32_e32 v27, 0x400000, v51
+; GFX10-NEXT: v_bfe_u32 v35, v9, 16, 1
+; GFX10-NEXT: v_bfe_u32 v38, v25, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v67, 0x400000, v24
+; GFX10-NEXT: v_cmp_u_f32_e64 s13, v51, v51
+; GFX10-NEXT: v_add3_u32 v33, v33, v51, 0x7fff
+; GFX10-NEXT: v_bfe_u32 v51, v7, 16, 1
+; GFX10-NEXT: v_cmp_u_f32_e64 s17, v24, v24
+; GFX10-NEXT: v_add3_u32 v24, v65, v24, 0x7fff
+; GFX10-NEXT: v_bfe_u32 v65, v6, 16, 1
+; GFX10-NEXT: v_cndmask_b32_e32 v16, v16, v26, vcc_lo
; GFX10-NEXT: v_bfe_u32 v26, v21, 16, 1
-; GFX10-NEXT: v_add3_u32 v49, v49, v5, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v21
-; GFX10-NEXT: v_cndmask_b32_e32 v34, v38, v34, vcc_lo
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28
-; GFX10-NEXT: v_bfe_u32 v38, v4, 16, 1
+; GFX10-NEXT: v_cndmask_b32_e64 v14, v14, v28, s4
+; GFX10-NEXT: v_bfe_u32 v28, v4, 16, 1
+; GFX10-NEXT: v_cndmask_b32_e64 v29, v29, v30, s5
+; GFX10-NEXT: v_bfe_u32 v30, v20, 16, 1
+; GFX10-NEXT: v_cndmask_b32_e64 v13, v13, v36, s6
+; GFX10-NEXT: v_bfe_u32 v36, v3, 16, 1
+; GFX10-NEXT: v_cmp_u_f32_e64 s8, v19, v19
+; GFX10-NEXT: v_add3_u32 v48, v48, v19, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v19, 0x400000, v19
+; GFX10-NEXT: v_cmp_u_f32_e64 s9, v2, v2
+; GFX10-NEXT: v_add3_u32 v52, v52, v2, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v2
+; GFX10-NEXT: v_perm_b32 v0, v0, v17, 0x7060302
+; GFX10-NEXT: v_perm_b32 v1, v1, v18, 0x7060302
+; GFX10-NEXT: v_or_b32_e32 v34, 0x400000, v9
+; GFX10-NEXT: v_or_b32_e32 v50, 0x400000, v25
+; GFX10-NEXT: v_bfe_u32 v53, v8, 16, 1
+; GFX10-NEXT: v_cmp_u_f32_e64 s14, v9, v9
+; GFX10-NEXT: v_add3_u32 v9, v35, v9, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v35, 0x400000, v7
+; GFX10-NEXT: v_cmp_u_f32_e64 s15, v25, v25
+; GFX10-NEXT: v_add3_u32 v25, v38, v25, 0x7fff
+; GFX10-NEXT: v_bfe_u32 v38, v23, 16, 1
+; GFX10-NEXT: v_cmp_u_f32_e64 s18, v7, v7
+; GFX10-NEXT: v_add3_u32 v7, v51, v7, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v51, 0x400000, v6
+; GFX10-NEXT: v_cmp_u_f32_e64 s20, v6, v6
+; GFX10-NEXT: v_add3_u32 v6, v65, v6, 0x7fff
+; GFX10-NEXT: v_bfe_u32 v65, v5, 16, 1
+; GFX10-NEXT: v_cmp_u_f32_e64 s4, v21, v21
; GFX10-NEXT: v_add3_u32 v26, v26, v21, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v28, 0x400000, v4
-; GFX10-NEXT: v_cndmask_b32_e32 v9, v50, v9, vcc_lo
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX10-NEXT: v_bfe_u32 v50, v20, 16, 1
-; GFX10-NEXT: v_add3_u32 v38, v38, v4, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v20
-; GFX10-NEXT: v_cndmask_b32_e32 v30, v51, v30, vcc_lo
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v27, v27
-; GFX10-NEXT: v_add3_u32 v50, v50, v20, 0x7fff
-; GFX10-NEXT: v_bfe_u32 v51, v3, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v27, 0x400000, v3
-; GFX10-NEXT: v_cndmask_b32_e32 v8, v22, v8, vcc_lo
+; GFX10-NEXT: v_or_b32_e32 v21, 0x400000, v21
+; GFX10-NEXT: v_cmp_u_f32_e64 s5, v4, v4
+; GFX10-NEXT: v_add3_u32 v28, v28, v4, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v4
+; GFX10-NEXT: v_cmp_u_f32_e64 s6, v20, v20
+; GFX10-NEXT: v_add3_u32 v30, v30, v20, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v20, 0x400000, v20
+; GFX10-NEXT: v_cmp_u_f32_e64 s7, v3, v3
+; GFX10-NEXT: v_add3_u32 v36, v36, v3, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v3, 0x400000, v3
+; GFX10-NEXT: v_cndmask_b32_e64 v19, v48, v19, s8
+; GFX10-NEXT: v_cndmask_b32_e64 v2, v52, v2, s9
+; GFX10-NEXT: v_or_b32_e32 v55, 0x400000, v8
+; GFX10-NEXT: v_cmp_u_f32_e64 s16, v8, v8
+; GFX10-NEXT: v_add3_u32 v8, v53, v8, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v53, 0x400000, v23
+; GFX10-NEXT: v_cmp_u_f32_e64 s19, v23, v23
+; GFX10-NEXT: v_add3_u32 v23, v38, v23, 0x7fff
+; GFX10-NEXT: v_bfe_u32 v38, v22, 16, 1
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX10-NEXT: v_bfe_u32 v22, v19, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v19
-; GFX10-NEXT: v_add3_u32 v51, v51, v3, 0x7fff
-; GFX10-NEXT: v_cndmask_b32_e32 v29, v49, v29, vcc_lo
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21
-; GFX10-NEXT: v_add3_u32 v22, v22, v19, 0x7fff
-; GFX10-NEXT: v_bfe_u32 v49, v2, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v21, 0x400000, v2
-; GFX10-NEXT: v_cndmask_b32_e32 v7, v26, v7, vcc_lo
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX10-NEXT: v_bfe_u32 v26, v18, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v18
-; GFX10-NEXT: v_add3_u32 v49, v49, v2, 0x7fff
-; GFX10-NEXT: v_cndmask_b32_e32 v28, v38, v28, vcc_lo
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20
-; GFX10-NEXT: v_bfe_u32 v38, v1, 16, 1
-; GFX10-NEXT: v_add3_u32 v26, v26, v18, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v20, 0x400000, v1
-; GFX10-NEXT: v_cndmask_b32_e32 v6, v50, v6, vcc_lo
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19
-; GFX10-NEXT: v_bfe_u32 v50, v17, 16, 1
-; GFX10-NEXT: v_add3_u32 v38, v38, v1, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v19, 0x400000, v17
-; GFX10-NEXT: v_cndmask_b32_e32 v5, v22, v5, vcc_lo
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18
-; GFX10-NEXT: v_bfe_u32 v22, v0, 16, 1
-; GFX10-NEXT: v_add3_u32 v50, v50, v17, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v18, 0x400000, v0
-; GFX10-NEXT: v_cndmask_b32_e32 v4, v26, v4, vcc_lo
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX10-NEXT: v_add3_u32 v22, v22, v0, 0x7fff
-; GFX10-NEXT: v_cndmask_b32_e32 v1, v38, v20, vcc_lo
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
-; GFX10-NEXT: v_perm_b32 v1, v1, v4, 0x7060302
-; GFX10-NEXT: v_cndmask_b32_e32 v17, v50, v19, vcc_lo
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX10-NEXT: v_perm_b32 v4, v28, v7, 0x7060302
-; GFX10-NEXT: v_perm_b32 v7, v34, v10, 0x7060302
-; GFX10-NEXT: v_cndmask_b32_e32 v0, v22, v18, vcc_lo
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX10-NEXT: v_perm_b32 v0, v0, v17, 0x7060302
-; GFX10-NEXT: v_cndmask_b32_e32 v2, v49, v21, vcc_lo
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX10-NEXT: v_perm_b32 v2, v2, v5, 0x7060302
-; GFX10-NEXT: v_cndmask_b32_e32 v3, v51, v27, vcc_lo
-; GFX10-NEXT: v_perm_b32 v5, v29, v8, 0x7060302
-; GFX10-NEXT: v_perm_b32 v8, v35, v11, 0x7060302
-; GFX10-NEXT: v_perm_b32 v3, v3, v6, 0x7060302
-; GFX10-NEXT: v_perm_b32 v6, v30, v9, 0x7060302
-; GFX10-NEXT: v_perm_b32 v9, v39, v12, 0x7060302
+; GFX10-NEXT: v_add3_u32 v65, v65, v5, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v5
+; GFX10-NEXT: v_cndmask_b32_e64 v21, v26, v21, s4
+; GFX10-NEXT: v_cndmask_b32_e64 v4, v28, v4, s5
+; GFX10-NEXT: v_cndmask_b32_e64 v20, v30, v20, s6
+; GFX10-NEXT: v_cndmask_b32_e64 v3, v36, v3, s7
+; GFX10-NEXT: v_perm_b32 v2, v2, v19, 0x7060302
+; GFX10-NEXT: v_cmp_u_f32_e64 s21, v22, v22
+; GFX10-NEXT: v_add3_u32 v38, v38, v22, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v22, 0x400000, v22
+; GFX10-NEXT: v_cndmask_b32_e32 v5, v65, v5, vcc_lo
+; GFX10-NEXT: v_perm_b32 v3, v3, v20, 0x7060302
+; GFX10-NEXT: v_perm_b32 v4, v4, v21, 0x7060302
+; GFX10-NEXT: v_cndmask_b32_e64 v27, v33, v27, s13
+; GFX10-NEXT: v_cndmask_b32_e64 v9, v9, v34, s14
+; GFX10-NEXT: v_cndmask_b32_e64 v25, v25, v50, s15
+; GFX10-NEXT: v_cndmask_b32_e64 v8, v8, v55, s16
+; GFX10-NEXT: v_cndmask_b32_e64 v24, v24, v67, s17
+; GFX10-NEXT: v_cndmask_b32_e64 v7, v7, v35, s18
+; GFX10-NEXT: v_cndmask_b32_e64 v23, v23, v53, s19
+; GFX10-NEXT: v_cndmask_b32_e64 v6, v6, v51, s20
+; GFX10-NEXT: v_cndmask_b32_e64 v22, v38, v22, s21
+; GFX10-NEXT: v_perm_b32 v8, v8, v25, 0x7060302
+; GFX10-NEXT: v_perm_b32 v7, v7, v24, 0x7060302
+; GFX10-NEXT: v_perm_b32 v9, v9, v27, 0x7060302
+; GFX10-NEXT: v_perm_b32 v6, v6, v23, 0x7060302
+; GFX10-NEXT: v_perm_b32 v5, v5, v22, 0x7060302
+; GFX10-NEXT: v_perm_b32 v10, v10, v49, 0x7060302
+; GFX10-NEXT: v_perm_b32 v11, v11, v39, 0x7060302
+; GFX10-NEXT: v_perm_b32 v12, v12, v37, 0x7060302
+; GFX10-NEXT: v_perm_b32 v13, v13, v29, 0x7060302
+; GFX10-NEXT: v_perm_b32 v14, v14, v16, 0x7060302
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v32
; GFX10-NEXT: v_and_b32_e32 v18, 0xffff0000, v32
; GFX10-NEXT: v_add_f32_e32 v17, v31, v17
; GFX10-NEXT: v_add_f32_e32 v15, v15, v18
-; GFX10-NEXT: v_bfe_u32 v10, v17, 16, 1
-; GFX10-NEXT: v_bfe_u32 v11, v15, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v12, 0x400000, v17
+; GFX10-NEXT: v_bfe_u32 v18, v17, 16, 1
+; GFX10-NEXT: v_bfe_u32 v19, v15, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v20, 0x400000, v17
+; GFX10-NEXT: v_or_b32_e32 v21, 0x400000, v15
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
-; GFX10-NEXT: v_or_b32_e32 v19, 0x400000, v15
-; GFX10-NEXT: v_add3_u32 v18, v10, v17, 0x7fff
-; GFX10-NEXT: v_add3_u32 v11, v11, v15, 0x7fff
-; GFX10-NEXT: v_perm_b32 v10, v37, v13, 0x7060302
-; GFX10-NEXT: v_perm_b32 v13, v36, v25, 0x7060302
-; GFX10-NEXT: v_cndmask_b32_e32 v17, v18, v12, vcc_lo
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15
-; GFX10-NEXT: v_perm_b32 v12, v33, v48, 0x7060302
-; GFX10-NEXT: v_cndmask_b32_e32 v15, v11, v19, vcc_lo
-; GFX10-NEXT: v_perm_b32 v11, v24, v14, 0x7060302
-; GFX10-NEXT: v_perm_b32 v14, v23, v16, 0x7060302
+; GFX10-NEXT: v_cmp_u_f32_e64 s4, v15, v15
+; GFX10-NEXT: v_add3_u32 v17, v18, v17, 0x7fff
+; GFX10-NEXT: v_add3_u32 v15, v19, v15, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v17, v17, v20, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v15, v15, v21, s4
; GFX10-NEXT: v_perm_b32 v15, v15, v17, 0x7060302
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
@@ -13107,7 +13109,7 @@ define <3 x bfloat> @v_fsub_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) {
; GFX11TRUE16-NEXT: v_perm_b32 v0, v0, v2, 0x7060302
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v1, v3, v6, vcc_lo
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11TRUE16-NEXT: v_alignbit_b32 v1, v0, v1, 16
+; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11FAKE16-LABEL: v_fsub_v3bf16:
@@ -13776,7 +13778,7 @@ define <3 x bfloat> @v_fmul_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) {
; GFX11TRUE16-NEXT: v_perm_b32 v0, v0, v2, 0x7060302
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v1, v3, v6, vcc_lo
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11TRUE16-NEXT: v_alignbit_b32 v1, v0, v1, 16
+; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11FAKE16-LABEL: v_fmul_v3bf16:
@@ -14494,47 +14496,55 @@ define <16 x bfloat> @v_fmul_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GCN-NEXT: v_mul_f32_e32 v12, v12, v28
; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11
; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v27
-; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10
-; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v26
-; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
-; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25
-; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8
-; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v24
-; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
-; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23
-; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
-; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22
-; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
-; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21
-; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
-; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20
-; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19
-; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18
-; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17
-; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16
-; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15
; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
; GCN-NEXT: v_mul_f32_e32 v11, v11, v27
-; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32
+; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10
+; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v26
; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; GCN-NEXT: v_mul_f32_e32 v10, v10, v26
+; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
+; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25
; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; GCN-NEXT: v_mul_f32_e32 v9, v9, v25
+; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8
+; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v24
; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; GCN-NEXT: v_mul_f32_e32 v8, v8, v24
+; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
+; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23
; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GCN-NEXT: v_mul_f32_e32 v7, v7, v23
+; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
+; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22
; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GCN-NEXT: v_mul_f32_e32 v6, v6, v22
+; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21
; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GCN-NEXT: v_mul_f32_e32 v5, v5, v21
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16
+; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17
+; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18
+; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19
+; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20
+; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15
; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GCN-NEXT: v_mul_f32_e32 v4, v4, v20
+; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32
+; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
@@ -14543,14 +14553,6 @@ define <16 x bfloat> @v_fmul_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
-; GCN-NEXT: v_mul_f32_e32 v10, v10, v26
-; GCN-NEXT: v_mul_f32_e32 v9, v9, v25
-; GCN-NEXT: v_mul_f32_e32 v8, v8, v24
-; GCN-NEXT: v_mul_f32_e32 v7, v7, v23
-; GCN-NEXT: v_mul_f32_e32 v6, v6, v22
-; GCN-NEXT: v_mul_f32_e32 v5, v5, v21
-; GCN-NEXT: v_mul_f32_e32 v4, v4, v20
; GCN-NEXT: v_mul_f32_e32 v3, v3, v19
; GCN-NEXT: v_mul_f32_e32 v2, v2, v18
; GCN-NEXT: v_mul_f32_e32 v1, v1, v17
@@ -14570,7 +14572,7 @@ define <16 x bfloat> @v_fmul_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v27
+; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v20
; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
; GCN-NEXT: v_mul_f32_e32 v15, v15, v16
; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
@@ -14580,22 +14582,20 @@ define <16 x bfloat> @v_fmul_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX7-LABEL: v_fmul_v16bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11
-; GFX7-NEXT: v_mul_f32_e32 v27, 1.0, v27
-; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
-; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
-; GFX7-NEXT: v_mul_f32_e32 v11, v11, v27
-; GFX7-NEXT: buffer_load_dword v27, off, s[0:3], s32
; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22
; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GFX7-NEXT: v_mul_f32_e32 v6, v6, v22
+; GFX7-NEXT: buffer_load_dword v22, off, s[0:3], s32
; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14
; GFX7-NEXT: v_mul_f32_e32 v30, 1.0, v30
; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13
; GFX7-NEXT: v_mul_f32_e32 v29, 1.0, v29
; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12
; GFX7-NEXT: v_mul_f32_e32 v28, 1.0, v28
+; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11
+; GFX7-NEXT: v_mul_f32_e32 v27, 1.0, v27
; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10
; GFX7-NEXT: v_mul_f32_e32 v26, 1.0, v26
; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9
@@ -14606,24 +14606,25 @@ define <16 x bfloat> @v_fmul_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX7-NEXT: v_mul_f32_e32 v23, 1.0, v23
; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15
; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
-; GFX7-NEXT: v_mul_f32_e32 v6, v6, v22
; GFX7-NEXT: v_mul_f32_e32 v21, 1.0, v21
-; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
-; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v20
-; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18
-; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17
+; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18
+; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19
+; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v20
; GFX7-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
; GFX7-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
+; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
+; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
; GFX7-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
@@ -14648,6 +14649,7 @@ define <16 x bfloat> @v_fmul_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX7-NEXT: v_mul_f32_e32 v14, v14, v30
; GFX7-NEXT: v_mul_f32_e32 v13, v13, v29
; GFX7-NEXT: v_mul_f32_e32 v12, v12, v28
+; GFX7-NEXT: v_mul_f32_e32 v11, v11, v27
; GFX7-NEXT: v_mul_f32_e32 v10, v10, v26
; GFX7-NEXT: v_mul_f32_e32 v9, v9, v25
; GFX7-NEXT: v_mul_f32_e32 v8, v8, v24
@@ -14666,7 +14668,7 @@ define <16 x bfloat> @v_fmul_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v27
+; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22
; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
; GFX7-NEXT: v_mul_f32_e32 v15, v15, v22
; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
@@ -16124,10 +16126,10 @@ define <32 x bfloat> @v_fmul_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX8-NEXT: v_lshrrev_b32_e32 v8, 16, v8
; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v9
; GFX8-NEXT: v_lshrrev_b32_e32 v10, 16, v10
-; GFX8-NEXT: v_lshrrev_b32_e32 v11, 16, v11
; GFX8-NEXT: v_lshrrev_b32_e32 v16, 16, v30
; GFX8-NEXT: v_lshrrev_b32_e32 v13, 16, v13
; GFX8-NEXT: v_lshrrev_b32_e32 v12, 16, v12
+; GFX8-NEXT: v_lshrrev_b32_e32 v11, 16, v11
; GFX8-NEXT: v_alignbit_b32 v0, v0, v17, 16
; GFX8-NEXT: v_alignbit_b32 v1, v1, v18, 16
; GFX8-NEXT: v_alignbit_b32 v2, v2, v19, 16
@@ -16430,278 +16432,278 @@ define <32 x bfloat> @v_fmul_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: buffer_load_dword v32, off, s[0:3], s32
-; GFX10-NEXT: v_lshlrev_b32_e32 v37, 16, v28
-; GFX10-NEXT: v_lshlrev_b32_e32 v38, 16, v12
-; GFX10-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
-; GFX10-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
; GFX10-NEXT: v_lshlrev_b32_e32 v39, 16, v27
; GFX10-NEXT: v_lshlrev_b32_e32 v48, 16, v11
; GFX10-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
; GFX10-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
; GFX10-NEXT: v_lshlrev_b32_e32 v49, 16, v26
; GFX10-NEXT: v_lshlrev_b32_e32 v50, 16, v10
-; GFX10-NEXT: v_lshlrev_b32_e32 v33, 16, v30
-; GFX10-NEXT: v_lshlrev_b32_e32 v34, 16, v14
-; GFX10-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
-; GFX10-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
-; GFX10-NEXT: v_lshlrev_b32_e32 v35, 16, v29
-; GFX10-NEXT: v_lshlrev_b32_e32 v36, 16, v13
-; GFX10-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
-; GFX10-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
-; GFX10-NEXT: v_mul_f32_e32 v12, v12, v28
-; GFX10-NEXT: v_lshlrev_b32_e32 v28, 16, v22
-; GFX10-NEXT: v_mul_f32_e32 v39, v48, v39
-; GFX10-NEXT: v_lshlrev_b32_e32 v48, 16, v6
-; GFX10-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
-; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
-; GFX10-NEXT: v_mul_f32_e32 v11, v11, v27
-; GFX10-NEXT: v_lshlrev_b32_e32 v27, 16, v21
-; GFX10-NEXT: v_mul_f32_e32 v49, v50, v49
-; GFX10-NEXT: v_lshlrev_b32_e32 v50, 16, v5
-; GFX10-NEXT: v_mul_f32_e32 v33, v34, v33
-; GFX10-NEXT: v_mul_f32_e32 v14, v14, v30
-; GFX10-NEXT: v_lshlrev_b32_e32 v30, 16, v24
-; GFX10-NEXT: v_mul_f32_e32 v35, v36, v35
-; GFX10-NEXT: v_lshlrev_b32_e32 v36, 16, v8
-; GFX10-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
-; GFX10-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
-; GFX10-NEXT: v_mul_f32_e32 v13, v13, v29
-; GFX10-NEXT: v_lshlrev_b32_e32 v29, 16, v23
-; GFX10-NEXT: v_mul_f32_e32 v37, v38, v37
-; GFX10-NEXT: v_lshlrev_b32_e32 v38, 16, v7
-; GFX10-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
-; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
-; GFX10-NEXT: v_mul_f32_e32 v6, v6, v22
-; GFX10-NEXT: v_lshlrev_b32_e32 v22, 16, v16
-; GFX10-NEXT: v_mul_f32_e32 v27, v50, v27
-; GFX10-NEXT: v_lshlrev_b32_e32 v50, 16, v0
-; GFX10-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
-; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX10-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
; GFX10-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; GFX10-NEXT: v_lshlrev_b32_e32 v37, 16, v28
+; GFX10-NEXT: v_lshlrev_b32_e32 v38, 16, v12
+; GFX10-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
+; GFX10-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
; GFX10-NEXT: v_lshlrev_b32_e32 v51, 16, v25
-; GFX10-NEXT: v_lshlrev_b32_e32 v34, 16, v9
+; GFX10-NEXT: v_lshlrev_b32_e32 v52, 16, v9
; GFX10-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
; GFX10-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
-; GFX10-NEXT: v_mul_f32_e32 v8, v8, v24
-; GFX10-NEXT: v_lshlrev_b32_e32 v24, 16, v18
-; GFX10-NEXT: v_mul_f32_e32 v29, v38, v29
-; GFX10-NEXT: v_lshlrev_b32_e32 v38, 16, v2
+; GFX10-NEXT: v_lshlrev_b32_e32 v53, 16, v24
+; GFX10-NEXT: v_lshlrev_b32_e32 v54, 16, v8
+; GFX10-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
+; GFX10-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; GFX10-NEXT: v_lshlrev_b32_e32 v55, 16, v23
+; GFX10-NEXT: v_lshlrev_b32_e32 v64, 16, v7
+; GFX10-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
+; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GFX10-NEXT: v_lshlrev_b32_e32 v65, 16, v22
+; GFX10-NEXT: v_lshlrev_b32_e32 v66, 16, v6
+; GFX10-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
+; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GFX10-NEXT: v_lshlrev_b32_e32 v67, 16, v21
+; GFX10-NEXT: v_lshlrev_b32_e32 v68, 16, v5
+; GFX10-NEXT: v_mul_f32_e32 v39, v48, v39
+; GFX10-NEXT: v_mul_f32_e32 v11, v11, v27
+; GFX10-NEXT: v_mul_f32_e32 v49, v50, v49
+; GFX10-NEXT: v_mul_f32_e32 v10, v10, v26
+; GFX10-NEXT: v_lshlrev_b32_e32 v35, 16, v29
+; GFX10-NEXT: v_lshlrev_b32_e32 v36, 16, v13
+; GFX10-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
+; GFX10-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
+; GFX10-NEXT: v_mul_f32_e32 v37, v38, v37
+; GFX10-NEXT: v_lshlrev_b32_e32 v38, 16, v18
+; GFX10-NEXT: v_mul_f32_e32 v12, v12, v28
+; GFX10-NEXT: v_lshlrev_b32_e32 v28, 16, v2
; GFX10-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX10-NEXT: v_mul_f32_e32 v7, v7, v23
-; GFX10-NEXT: v_lshlrev_b32_e32 v23, 16, v17
-; GFX10-NEXT: v_mul_f32_e32 v28, v48, v28
-; GFX10-NEXT: v_lshlrev_b32_e32 v48, 16, v1
+; GFX10-NEXT: v_lshlrev_b32_e32 v48, 16, v17
+; GFX10-NEXT: v_lshlrev_b32_e32 v27, 16, v1
; GFX10-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX10-NEXT: v_mul_f32_e32 v0, v0, v16
-; GFX10-NEXT: v_bfe_u32 v16, v33, 16, 1
-; GFX10-NEXT: v_mul_f32_e32 v10, v10, v26
-; GFX10-NEXT: v_lshlrev_b32_e32 v26, 16, v20
-; GFX10-NEXT: v_mul_f32_e32 v34, v34, v51
-; GFX10-NEXT: v_lshlrev_b32_e32 v51, 16, v4
-; GFX10-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
-; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX10-NEXT: v_lshlrev_b32_e32 v50, 16, v16
+; GFX10-NEXT: v_lshlrev_b32_e32 v26, 16, v0
+; GFX10-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
+; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX10-NEXT: v_mul_f32_e32 v9, v9, v25
-; GFX10-NEXT: v_lshlrev_b32_e32 v25, 16, v19
-; GFX10-NEXT: v_mul_f32_e32 v30, v36, v30
-; GFX10-NEXT: v_lshlrev_b32_e32 v36, 16, v3
+; GFX10-NEXT: v_mul_f32_e32 v25, v54, v53
+; GFX10-NEXT: v_mul_f32_e32 v8, v8, v24
+; GFX10-NEXT: v_mul_f32_e32 v24, v64, v55
+; GFX10-NEXT: v_mul_f32_e32 v7, v7, v23
+; GFX10-NEXT: v_mul_f32_e32 v23, v66, v65
+; GFX10-NEXT: v_mul_f32_e32 v6, v6, v22
+; GFX10-NEXT: v_mul_f32_e32 v22, v68, v67
+; GFX10-NEXT: v_bfe_u32 v53, v39, 16, 1
+; GFX10-NEXT: v_bfe_u32 v55, v11, 16, 1
+; GFX10-NEXT: v_bfe_u32 v65, v49, 16, 1
+; GFX10-NEXT: v_bfe_u32 v67, v10, 16, 1
+; GFX10-NEXT: v_lshlrev_b32_e32 v33, 16, v30
+; GFX10-NEXT: v_lshlrev_b32_e32 v34, 16, v14
+; GFX10-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
+; GFX10-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
+; GFX10-NEXT: v_mul_f32_e32 v35, v36, v35
+; GFX10-NEXT: v_lshlrev_b32_e32 v36, 16, v19
+; GFX10-NEXT: v_mul_f32_e32 v13, v13, v29
+; GFX10-NEXT: v_lshlrev_b32_e32 v29, 16, v3
; GFX10-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX10-NEXT: v_mul_f32_e32 v2, v2, v18
-; GFX10-NEXT: v_mul_f32_e32 v18, v48, v23
+; GFX10-NEXT: v_mul_f32_e32 v18, v27, v48
; GFX10-NEXT: v_mul_f32_e32 v1, v1, v17
-; GFX10-NEXT: v_mul_f32_e32 v17, v50, v22
-; GFX10-NEXT: v_or_b32_e32 v22, 0x400000, v33
-; GFX10-NEXT: v_bfe_u32 v23, v14, 16, 1
-; GFX10-NEXT: v_add3_u32 v16, v16, v33, 0x7fff
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33
+; GFX10-NEXT: v_mul_f32_e32 v17, v26, v50
+; GFX10-NEXT: v_mul_f32_e32 v0, v0, v16
+; GFX10-NEXT: v_or_b32_e32 v54, 0x400000, v39
+; GFX10-NEXT: v_or_b32_e32 v64, 0x400000, v11
+; GFX10-NEXT: v_or_b32_e32 v66, 0x400000, v49
+; GFX10-NEXT: v_or_b32_e32 v68, 0x400000, v10
+; GFX10-NEXT: v_cmp_u_f32_e64 s9, v39, v39
+; GFX10-NEXT: v_add3_u32 v39, v53, v39, 0x7fff
+; GFX10-NEXT: v_cmp_u_f32_e64 s10, v11, v11
+; GFX10-NEXT: v_add3_u32 v11, v55, v11, 0x7fff
+; GFX10-NEXT: v_cmp_u_f32_e64 s11, v49, v49
+; GFX10-NEXT: v_add3_u32 v49, v65, v49, 0x7fff
+; GFX10-NEXT: v_cmp_u_f32_e64 s12, v10, v10
+; GFX10-NEXT: v_add3_u32 v10, v67, v10, 0x7fff
; GFX10-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; GFX10-NEXT: v_mul_f32_e32 v4, v4, v20
-; GFX10-NEXT: v_mul_f32_e32 v20, v36, v25
+; GFX10-NEXT: v_mul_f32_e32 v33, v34, v33
+; GFX10-NEXT: v_lshlrev_b32_e32 v34, 16, v20
+; GFX10-NEXT: v_mul_f32_e32 v14, v14, v30
+; GFX10-NEXT: v_lshlrev_b32_e32 v30, 16, v4
+; GFX10-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
+; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GFX10-NEXT: v_mul_f32_e32 v3, v3, v19
-; GFX10-NEXT: v_mul_f32_e32 v19, v38, v24
-; GFX10-NEXT: v_or_b32_e32 v24, 0x400000, v14
-; GFX10-NEXT: v_bfe_u32 v25, v35, 16, 1
-; GFX10-NEXT: v_add3_u32 v23, v23, v14, 0x7fff
-; GFX10-NEXT: v_cndmask_b32_e32 v16, v16, v22, vcc_lo
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14
+; GFX10-NEXT: v_mul_f32_e32 v19, v28, v38
+; GFX10-NEXT: v_bfe_u32 v38, v37, 16, 1
+; GFX10-NEXT: v_bfe_u32 v50, v12, 16, 1
+; GFX10-NEXT: v_cndmask_b32_e64 v39, v39, v54, s9
+; GFX10-NEXT: v_bfe_u32 v54, v18, 16, 1
+; GFX10-NEXT: v_cndmask_b32_e64 v11, v11, v64, s10
+; GFX10-NEXT: v_bfe_u32 v64, v1, 16, 1
+; GFX10-NEXT: v_cndmask_b32_e64 v49, v49, v66, s11
+; GFX10-NEXT: v_bfe_u32 v66, v17, 16, 1
+; GFX10-NEXT: v_cndmask_b32_e64 v10, v10, v68, s12
+; GFX10-NEXT: v_bfe_u32 v68, v0, 16, 1
+; GFX10-NEXT: v_mul_f32_e32 v51, v52, v51
; GFX10-NEXT: v_mul_f32_e32 v5, v5, v21
-; GFX10-NEXT: v_mul_f32_e32 v21, v51, v26
-; GFX10-NEXT: v_or_b32_e32 v26, 0x400000, v35
-; GFX10-NEXT: v_bfe_u32 v36, v13, 16, 1
-; GFX10-NEXT: v_add3_u32 v25, v25, v35, 0x7fff
-; GFX10-NEXT: v_cndmask_b32_e32 v23, v23, v24, vcc_lo
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35
-; GFX10-NEXT: v_or_b32_e32 v38, 0x400000, v13
-; GFX10-NEXT: v_bfe_u32 v48, v37, 16, 1
-; GFX10-NEXT: v_add3_u32 v36, v36, v13, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v50, 0x400000, v37
-; GFX10-NEXT: v_cndmask_b32_e32 v25, v25, v26, vcc_lo
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
-; GFX10-NEXT: v_bfe_u32 v51, v12, 16, 1
-; GFX10-NEXT: v_add3_u32 v48, v48, v37, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v33, 0x400000, v12
-; GFX10-NEXT: v_bfe_u32 v22, v39, 16, 1
-; GFX10-NEXT: v_cndmask_b32_e32 v36, v36, v38, vcc_lo
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37
-; GFX10-NEXT: v_add3_u32 v51, v51, v12, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v14, 0x400000, v39
-; GFX10-NEXT: v_bfe_u32 v24, v11, 16, 1
-; GFX10-NEXT: v_add3_u32 v22, v22, v39, 0x7fff
-; GFX10-NEXT: v_cndmask_b32_e32 v48, v48, v50, vcc_lo
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
-; GFX10-NEXT: v_or_b32_e32 v35, 0x400000, v11
-; GFX10-NEXT: v_bfe_u32 v26, v49, 16, 1
-; GFX10-NEXT: v_add3_u32 v24, v24, v11, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v13, 0x400000, v49
-; GFX10-NEXT: v_cndmask_b32_e32 v33, v51, v33, vcc_lo
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v39, v39
-; GFX10-NEXT: v_bfe_u32 v38, v10, 16, 1
-; GFX10-NEXT: v_add3_u32 v26, v26, v49, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v37, 0x400000, v10
-; GFX10-NEXT: v_bfe_u32 v50, v34, 16, 1
-; GFX10-NEXT: v_cndmask_b32_e32 v14, v22, v14, vcc_lo
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
-; GFX10-NEXT: v_add3_u32 v38, v38, v10, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v12, 0x400000, v34
-; GFX10-NEXT: v_bfe_u32 v51, v9, 16, 1
-; GFX10-NEXT: v_add3_u32 v50, v50, v34, 0x7fff
-; GFX10-NEXT: v_cndmask_b32_e32 v24, v24, v35, vcc_lo
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v49, v49
-; GFX10-NEXT: v_or_b32_e32 v39, 0x400000, v9
-; GFX10-NEXT: v_bfe_u32 v22, v30, 16, 1
-; GFX10-NEXT: v_add3_u32 v51, v51, v9, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v11, 0x400000, v30
-; GFX10-NEXT: v_cndmask_b32_e32 v13, v26, v13, vcc_lo
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
-; GFX10-NEXT: v_bfe_u32 v35, v8, 16, 1
-; GFX10-NEXT: v_add3_u32 v22, v22, v30, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v49, 0x400000, v8
-; GFX10-NEXT: v_bfe_u32 v26, v29, 16, 1
-; GFX10-NEXT: v_cndmask_b32_e32 v37, v38, v37, vcc_lo
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34
-; GFX10-NEXT: v_add3_u32 v35, v35, v8, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v29
-; GFX10-NEXT: v_bfe_u32 v38, v7, 16, 1
-; GFX10-NEXT: v_add3_u32 v26, v26, v29, 0x7fff
-; GFX10-NEXT: v_cndmask_b32_e32 v12, v50, v12, vcc_lo
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
-; GFX10-NEXT: v_or_b32_e32 v34, 0x400000, v7
-; GFX10-NEXT: v_bfe_u32 v50, v28, 16, 1
-; GFX10-NEXT: v_add3_u32 v38, v38, v7, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v28
-; GFX10-NEXT: v_cndmask_b32_e32 v39, v51, v39, vcc_lo
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30
-; GFX10-NEXT: v_bfe_u32 v51, v6, 16, 1
-; GFX10-NEXT: v_add3_u32 v50, v50, v28, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v30, 0x400000, v6
+; GFX10-NEXT: v_mul_f32_e32 v21, v30, v34
+; GFX10-NEXT: v_mul_f32_e32 v4, v4, v20
+; GFX10-NEXT: v_mul_f32_e32 v20, v29, v36
+; GFX10-NEXT: v_bfe_u32 v16, v33, 16, 1
+; GFX10-NEXT: v_bfe_u32 v27, v14, 16, 1
+; GFX10-NEXT: v_bfe_u32 v29, v35, 16, 1
+; GFX10-NEXT: v_bfe_u32 v34, v13, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v48, 0x400000, v37
+; GFX10-NEXT: v_or_b32_e32 v52, 0x400000, v12
+; GFX10-NEXT: v_cmp_u_f32_e64 s7, v37, v37
+; GFX10-NEXT: v_add3_u32 v37, v38, v37, 0x7fff
+; GFX10-NEXT: v_cmp_u_f32_e64 s8, v12, v12
+; GFX10-NEXT: v_add3_u32 v12, v50, v12, 0x7fff
+; GFX10-NEXT: v_cmp_u_f32_e64 s10, v18, v18
+; GFX10-NEXT: v_add3_u32 v54, v54, v18, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v18, 0x400000, v18
+; GFX10-NEXT: v_cmp_u_f32_e64 s11, v1, v1
+; GFX10-NEXT: v_add3_u32 v64, v64, v1, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v1, 0x400000, v1
+; GFX10-NEXT: v_cmp_u_f32_e64 s12, v17, v17
+; GFX10-NEXT: v_add3_u32 v66, v66, v17, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v17, 0x400000, v17
+; GFX10-NEXT: v_cmp_u_f32_e64 s22, v0, v0
+; GFX10-NEXT: v_add3_u32 v68, v68, v0, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v0, 0x400000, v0
+; GFX10-NEXT: v_or_b32_e32 v26, 0x400000, v33
+; GFX10-NEXT: v_or_b32_e32 v28, 0x400000, v14
+; GFX10-NEXT: v_or_b32_e32 v30, 0x400000, v35
+; GFX10-NEXT: v_or_b32_e32 v36, 0x400000, v13
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33
+; GFX10-NEXT: v_add3_u32 v16, v16, v33, 0x7fff
+; GFX10-NEXT: v_bfe_u32 v33, v51, 16, 1
+; GFX10-NEXT: v_cmp_u_f32_e64 s4, v14, v14
+; GFX10-NEXT: v_add3_u32 v14, v27, v14, 0x7fff
+; GFX10-NEXT: v_cmp_u_f32_e64 s5, v35, v35
+; GFX10-NEXT: v_add3_u32 v29, v29, v35, 0x7fff
+; GFX10-NEXT: v_cmp_u_f32_e64 s6, v13, v13
+; GFX10-NEXT: v_add3_u32 v13, v34, v13, 0x7fff
+; GFX10-NEXT: v_bfe_u32 v65, v24, 16, 1
+; GFX10-NEXT: v_cndmask_b32_e64 v37, v37, v48, s7
+; GFX10-NEXT: v_bfe_u32 v48, v19, 16, 1
+; GFX10-NEXT: v_cndmask_b32_e64 v12, v12, v52, s8
+; GFX10-NEXT: v_bfe_u32 v52, v2, 16, 1
+; GFX10-NEXT: v_cndmask_b32_e64 v18, v54, v18, s10
+; GFX10-NEXT: v_cndmask_b32_e64 v17, v66, v17, s12
+; GFX10-NEXT: v_cndmask_b32_e64 v0, v68, v0, s22
+; GFX10-NEXT: v_cndmask_b32_e64 v1, v64, v1, s11
; GFX10-NEXT: v_lshlrev_b32_e32 v31, 16, v15
-; GFX10-NEXT: v_cndmask_b32_e32 v11, v22, v11, vcc_lo
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
-; GFX10-NEXT: v_bfe_u32 v22, v27, 16, 1
-; GFX10-NEXT: v_add3_u32 v51, v51, v6, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v27
; GFX10-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
-; GFX10-NEXT: v_cndmask_b32_e32 v35, v35, v49, vcc_lo
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29
-; GFX10-NEXT: v_bfe_u32 v49, v5, 16, 1
-; GFX10-NEXT: v_add3_u32 v22, v22, v27, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v29, 0x400000, v5
-; GFX10-NEXT: v_cndmask_b32_e32 v10, v26, v10, vcc_lo
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX10-NEXT: v_or_b32_e32 v27, 0x400000, v51
+; GFX10-NEXT: v_bfe_u32 v35, v9, 16, 1
+; GFX10-NEXT: v_bfe_u32 v38, v25, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v67, 0x400000, v24
+; GFX10-NEXT: v_cmp_u_f32_e64 s13, v51, v51
+; GFX10-NEXT: v_add3_u32 v33, v33, v51, 0x7fff
+; GFX10-NEXT: v_bfe_u32 v51, v7, 16, 1
+; GFX10-NEXT: v_cmp_u_f32_e64 s17, v24, v24
+; GFX10-NEXT: v_add3_u32 v24, v65, v24, 0x7fff
+; GFX10-NEXT: v_bfe_u32 v65, v6, 16, 1
+; GFX10-NEXT: v_cndmask_b32_e32 v16, v16, v26, vcc_lo
; GFX10-NEXT: v_bfe_u32 v26, v21, 16, 1
-; GFX10-NEXT: v_add3_u32 v49, v49, v5, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v21
-; GFX10-NEXT: v_cndmask_b32_e32 v34, v38, v34, vcc_lo
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28
-; GFX10-NEXT: v_bfe_u32 v38, v4, 16, 1
+; GFX10-NEXT: v_cndmask_b32_e64 v14, v14, v28, s4
+; GFX10-NEXT: v_bfe_u32 v28, v4, 16, 1
+; GFX10-NEXT: v_cndmask_b32_e64 v29, v29, v30, s5
+; GFX10-NEXT: v_bfe_u32 v30, v20, 16, 1
+; GFX10-NEXT: v_cndmask_b32_e64 v13, v13, v36, s6
+; GFX10-NEXT: v_bfe_u32 v36, v3, 16, 1
+; GFX10-NEXT: v_cmp_u_f32_e64 s8, v19, v19
+; GFX10-NEXT: v_add3_u32 v48, v48, v19, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v19, 0x400000, v19
+; GFX10-NEXT: v_cmp_u_f32_e64 s9, v2, v2
+; GFX10-NEXT: v_add3_u32 v52, v52, v2, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v2
+; GFX10-NEXT: v_perm_b32 v0, v0, v17, 0x7060302
+; GFX10-NEXT: v_perm_b32 v1, v1, v18, 0x7060302
+; GFX10-NEXT: v_or_b32_e32 v34, 0x400000, v9
+; GFX10-NEXT: v_or_b32_e32 v50, 0x400000, v25
+; GFX10-NEXT: v_bfe_u32 v53, v8, 16, 1
+; GFX10-NEXT: v_cmp_u_f32_e64 s14, v9, v9
+; GFX10-NEXT: v_add3_u32 v9, v35, v9, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v35, 0x400000, v7
+; GFX10-NEXT: v_cmp_u_f32_e64 s15, v25, v25
+; GFX10-NEXT: v_add3_u32 v25, v38, v25, 0x7fff
+; GFX10-NEXT: v_bfe_u32 v38, v23, 16, 1
+; GFX10-NEXT: v_cmp_u_f32_e64 s18, v7, v7
+; GFX10-NEXT: v_add3_u32 v7, v51, v7, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v51, 0x400000, v6
+; GFX10-NEXT: v_cmp_u_f32_e64 s20, v6, v6
+; GFX10-NEXT: v_add3_u32 v6, v65, v6, 0x7fff
+; GFX10-NEXT: v_bfe_u32 v65, v5, 16, 1
+; GFX10-NEXT: v_cmp_u_f32_e64 s4, v21, v21
; GFX10-NEXT: v_add3_u32 v26, v26, v21, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v28, 0x400000, v4
-; GFX10-NEXT: v_cndmask_b32_e32 v9, v50, v9, vcc_lo
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX10-NEXT: v_bfe_u32 v50, v20, 16, 1
-; GFX10-NEXT: v_add3_u32 v38, v38, v4, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v20
-; GFX10-NEXT: v_cndmask_b32_e32 v30, v51, v30, vcc_lo
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v27, v27
-; GFX10-NEXT: v_add3_u32 v50, v50, v20, 0x7fff
-; GFX10-NEXT: v_bfe_u32 v51, v3, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v27, 0x400000, v3
-; GFX10-NEXT: v_cndmask_b32_e32 v8, v22, v8, vcc_lo
+; GFX10-NEXT: v_or_b32_e32 v21, 0x400000, v21
+; GFX10-NEXT: v_cmp_u_f32_e64 s5, v4, v4
+; GFX10-NEXT: v_add3_u32 v28, v28, v4, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v4
+; GFX10-NEXT: v_cmp_u_f32_e64 s6, v20, v20
+; GFX10-NEXT: v_add3_u32 v30, v30, v20, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v20, 0x400000, v20
+; GFX10-NEXT: v_cmp_u_f32_e64 s7, v3, v3
+; GFX10-NEXT: v_add3_u32 v36, v36, v3, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v3, 0x400000, v3
+; GFX10-NEXT: v_cndmask_b32_e64 v19, v48, v19, s8
+; GFX10-NEXT: v_cndmask_b32_e64 v2, v52, v2, s9
+; GFX10-NEXT: v_or_b32_e32 v55, 0x400000, v8
+; GFX10-NEXT: v_cmp_u_f32_e64 s16, v8, v8
+; GFX10-NEXT: v_add3_u32 v8, v53, v8, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v53, 0x400000, v23
+; GFX10-NEXT: v_cmp_u_f32_e64 s19, v23, v23
+; GFX10-NEXT: v_add3_u32 v23, v38, v23, 0x7fff
+; GFX10-NEXT: v_bfe_u32 v38, v22, 16, 1
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX10-NEXT: v_bfe_u32 v22, v19, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v19
-; GFX10-NEXT: v_add3_u32 v51, v51, v3, 0x7fff
-; GFX10-NEXT: v_cndmask_b32_e32 v29, v49, v29, vcc_lo
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21
-; GFX10-NEXT: v_add3_u32 v22, v22, v19, 0x7fff
-; GFX10-NEXT: v_bfe_u32 v49, v2, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v21, 0x400000, v2
-; GFX10-NEXT: v_cndmask_b32_e32 v7, v26, v7, vcc_lo
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX10-NEXT: v_bfe_u32 v26, v18, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v18
-; GFX10-NEXT: v_add3_u32 v49, v49, v2, 0x7fff
-; GFX10-NEXT: v_cndmask_b32_e32 v28, v38, v28, vcc_lo
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20
-; GFX10-NEXT: v_bfe_u32 v38, v1, 16, 1
-; GFX10-NEXT: v_add3_u32 v26, v26, v18, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v20, 0x400000, v1
-; GFX10-NEXT: v_cndmask_b32_e32 v6, v50, v6, vcc_lo
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19
-; GFX10-NEXT: v_bfe_u32 v50, v17, 16, 1
-; GFX10-NEXT: v_add3_u32 v38, v38, v1, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v19, 0x400000, v17
-; GFX10-NEXT: v_cndmask_b32_e32 v5, v22, v5, vcc_lo
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18
-; GFX10-NEXT: v_bfe_u32 v22, v0, 16, 1
-; GFX10-NEXT: v_add3_u32 v50, v50, v17, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v18, 0x400000, v0
-; GFX10-NEXT: v_cndmask_b32_e32 v4, v26, v4, vcc_lo
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX10-NEXT: v_add3_u32 v22, v22, v0, 0x7fff
-; GFX10-NEXT: v_cndmask_b32_e32 v1, v38, v20, vcc_lo
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
-; GFX10-NEXT: v_perm_b32 v1, v1, v4, 0x7060302
-; GFX10-NEXT: v_cndmask_b32_e32 v17, v50, v19, vcc_lo
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX10-NEXT: v_perm_b32 v4, v28, v7, 0x7060302
-; GFX10-NEXT: v_perm_b32 v7, v34, v10, 0x7060302
-; GFX10-NEXT: v_cndmask_b32_e32 v0, v22, v18, vcc_lo
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX10-NEXT: v_perm_b32 v0, v0, v17, 0x7060302
-; GFX10-NEXT: v_cndmask_b32_e32 v2, v49, v21, vcc_lo
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX10-NEXT: v_perm_b32 v2, v2, v5, 0x7060302
-; GFX10-NEXT: v_cndmask_b32_e32 v3, v51, v27, vcc_lo
-; GFX10-NEXT: v_perm_b32 v5, v29, v8, 0x7060302
-; GFX10-NEXT: v_perm_b32 v8, v35, v11, 0x7060302
-; GFX10-NEXT: v_perm_b32 v3, v3, v6, 0x7060302
-; GFX10-NEXT: v_perm_b32 v6, v30, v9, 0x7060302
-; GFX10-NEXT: v_perm_b32 v9, v39, v12, 0x7060302
+; GFX10-NEXT: v_add3_u32 v65, v65, v5, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v5
+; GFX10-NEXT: v_cndmask_b32_e64 v21, v26, v21, s4
+; GFX10-NEXT: v_cndmask_b32_e64 v4, v28, v4, s5
+; GFX10-NEXT: v_cndmask_b32_e64 v20, v30, v20, s6
+; GFX10-NEXT: v_cndmask_b32_e64 v3, v36, v3, s7
+; GFX10-NEXT: v_perm_b32 v2, v2, v19, 0x7060302
+; GFX10-NEXT: v_cmp_u_f32_e64 s21, v22, v22
+; GFX10-NEXT: v_add3_u32 v38, v38, v22, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v22, 0x400000, v22
+; GFX10-NEXT: v_cndmask_b32_e32 v5, v65, v5, vcc_lo
+; GFX10-NEXT: v_perm_b32 v3, v3, v20, 0x7060302
+; GFX10-NEXT: v_perm_b32 v4, v4, v21, 0x7060302
+; GFX10-NEXT: v_cndmask_b32_e64 v27, v33, v27, s13
+; GFX10-NEXT: v_cndmask_b32_e64 v9, v9, v34, s14
+; GFX10-NEXT: v_cndmask_b32_e64 v25, v25, v50, s15
+; GFX10-NEXT: v_cndmask_b32_e64 v8, v8, v55, s16
+; GFX10-NEXT: v_cndmask_b32_e64 v24, v24, v67, s17
+; GFX10-NEXT: v_cndmask_b32_e64 v7, v7, v35, s18
+; GFX10-NEXT: v_cndmask_b32_e64 v23, v23, v53, s19
+; GFX10-NEXT: v_cndmask_b32_e64 v6, v6, v51, s20
+; GFX10-NEXT: v_cndmask_b32_e64 v22, v38, v22, s21
+; GFX10-NEXT: v_perm_b32 v8, v8, v25, 0x7060302
+; GFX10-NEXT: v_perm_b32 v7, v7, v24, 0x7060302
+; GFX10-NEXT: v_perm_b32 v9, v9, v27, 0x7060302
+; GFX10-NEXT: v_perm_b32 v6, v6, v23, 0x7060302
+; GFX10-NEXT: v_perm_b32 v5, v5, v22, 0x7060302
+; GFX10-NEXT: v_perm_b32 v10, v10, v49, 0x7060302
+; GFX10-NEXT: v_perm_b32 v11, v11, v39, 0x7060302
+; GFX10-NEXT: v_perm_b32 v12, v12, v37, 0x7060302
+; GFX10-NEXT: v_perm_b32 v13, v13, v29, 0x7060302
+; GFX10-NEXT: v_perm_b32 v14, v14, v16, 0x7060302
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v32
; GFX10-NEXT: v_and_b32_e32 v18, 0xffff0000, v32
; GFX10-NEXT: v_mul_f32_e32 v17, v31, v17
; GFX10-NEXT: v_mul_f32_e32 v15, v15, v18
-; GFX10-NEXT: v_bfe_u32 v10, v17, 16, 1
-; GFX10-NEXT: v_bfe_u32 v11, v15, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v12, 0x400000, v17
+; GFX10-NEXT: v_bfe_u32 v18, v17, 16, 1
+; GFX10-NEXT: v_bfe_u32 v19, v15, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v20, 0x400000, v17
+; GFX10-NEXT: v_or_b32_e32 v21, 0x400000, v15
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
-; GFX10-NEXT: v_or_b32_e32 v19, 0x400000, v15
-; GFX10-NEXT: v_add3_u32 v18, v10, v17, 0x7fff
-; GFX10-NEXT: v_add3_u32 v11, v11, v15, 0x7fff
-; GFX10-NEXT: v_perm_b32 v10, v37, v13, 0x7060302
-; GFX10-NEXT: v_perm_b32 v13, v36, v25, 0x7060302
-; GFX10-NEXT: v_cndmask_b32_e32 v17, v18, v12, vcc_lo
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15
-; GFX10-NEXT: v_perm_b32 v12, v33, v48, 0x7060302
-; GFX10-NEXT: v_cndmask_b32_e32 v15, v11, v19, vcc_lo
-; GFX10-NEXT: v_perm_b32 v11, v24, v14, 0x7060302
-; GFX10-NEXT: v_perm_b32 v14, v23, v16, 0x7060302
+; GFX10-NEXT: v_cmp_u_f32_e64 s4, v15, v15
+; GFX10-NEXT: v_add3_u32 v17, v18, v17, 0x7fff
+; GFX10-NEXT: v_add3_u32 v15, v19, v15, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v17, v17, v20, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v15, v15, v21, s4
; GFX10-NEXT: v_perm_b32 v15, v15, v17, 0x7060302
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
@@ -17854,7 +17856,7 @@ define <3 x bfloat> @v_minnum_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) {
; GFX11TRUE16-NEXT: v_perm_b32 v0, v0, v2, 0x7060302
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v1, v3, v6, vcc_lo
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11TRUE16-NEXT: v_alignbit_b32 v1, v0, v1, 16
+; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11FAKE16-LABEL: v_minnum_v3bf16:
@@ -18572,47 +18574,55 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GCN-NEXT: v_min_f32_e32 v12, v12, v28
; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11
; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v27
-; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10
-; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v26
-; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
-; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25
-; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8
-; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v24
-; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
-; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23
-; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
-; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22
-; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
-; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21
-; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
-; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20
-; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19
-; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18
-; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17
-; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16
-; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15
; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
; GCN-NEXT: v_min_f32_e32 v11, v11, v27
-; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32
+; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10
+; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v26
; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; GCN-NEXT: v_min_f32_e32 v10, v10, v26
+; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
+; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25
; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; GCN-NEXT: v_min_f32_e32 v9, v9, v25
+; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8
+; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v24
; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; GCN-NEXT: v_min_f32_e32 v8, v8, v24
+; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
+; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23
; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GCN-NEXT: v_min_f32_e32 v7, v7, v23
+; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
+; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22
; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GCN-NEXT: v_min_f32_e32 v6, v6, v22
+; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21
; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GCN-NEXT: v_min_f32_e32 v5, v5, v21
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16
+; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17
+; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18
+; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19
+; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20
+; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15
; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GCN-NEXT: v_min_f32_e32 v4, v4, v20
+; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32
+; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
@@ -18621,14 +18631,6 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
-; GCN-NEXT: v_min_f32_e32 v10, v10, v26
-; GCN-NEXT: v_min_f32_e32 v9, v9, v25
-; GCN-NEXT: v_min_f32_e32 v8, v8, v24
-; GCN-NEXT: v_min_f32_e32 v7, v7, v23
-; GCN-NEXT: v_min_f32_e32 v6, v6, v22
-; GCN-NEXT: v_min_f32_e32 v5, v5, v21
-; GCN-NEXT: v_min_f32_e32 v4, v4, v20
; GCN-NEXT: v_min_f32_e32 v3, v3, v19
; GCN-NEXT: v_min_f32_e32 v2, v2, v18
; GCN-NEXT: v_min_f32_e32 v1, v1, v17
@@ -18648,7 +18650,7 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v27
+; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v20
; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
; GCN-NEXT: v_min_f32_e32 v15, v15, v16
; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
@@ -18658,22 +18660,20 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX7-LABEL: v_minnum_v16bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11
-; GFX7-NEXT: v_mul_f32_e32 v27, 1.0, v27
-; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
-; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
-; GFX7-NEXT: v_min_f32_e32 v11, v11, v27
-; GFX7-NEXT: buffer_load_dword v27, off, s[0:3], s32
; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22
; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GFX7-NEXT: v_min_f32_e32 v6, v6, v22
+; GFX7-NEXT: buffer_load_dword v22, off, s[0:3], s32
; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14
; GFX7-NEXT: v_mul_f32_e32 v30, 1.0, v30
; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13
; GFX7-NEXT: v_mul_f32_e32 v29, 1.0, v29
; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12
; GFX7-NEXT: v_mul_f32_e32 v28, 1.0, v28
+; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11
+; GFX7-NEXT: v_mul_f32_e32 v27, 1.0, v27
; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10
; GFX7-NEXT: v_mul_f32_e32 v26, 1.0, v26
; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9
@@ -18684,24 +18684,25 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX7-NEXT: v_mul_f32_e32 v23, 1.0, v23
; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15
; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
-; GFX7-NEXT: v_min_f32_e32 v6, v6, v22
; GFX7-NEXT: v_mul_f32_e32 v21, 1.0, v21
-; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
-; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v20
-; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18
-; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17
+; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18
+; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19
+; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v20
; GFX7-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
; GFX7-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
+; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
+; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
; GFX7-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
@@ -18726,6 +18727,7 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX7-NEXT: v_min_f32_e32 v14, v14, v30
; GFX7-NEXT: v_min_f32_e32 v13, v13, v29
; GFX7-NEXT: v_min_f32_e32 v12, v12, v28
+; GFX7-NEXT: v_min_f32_e32 v11, v11, v27
; GFX7-NEXT: v_min_f32_e32 v10, v10, v26
; GFX7-NEXT: v_min_f32_e32 v9, v9, v25
; GFX7-NEXT: v_min_f32_e32 v8, v8, v24
@@ -18744,7 +18746,7 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v27
+; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22
; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
; GFX7-NEXT: v_min_f32_e32 v15, v15, v22
; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
@@ -20202,10 +20204,10 @@ define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX8-NEXT: v_lshrrev_b32_e32 v8, 16, v8
; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v9
; GFX8-NEXT: v_lshrrev_b32_e32 v10, 16, v10
-; GFX8-NEXT: v_lshrrev_b32_e32 v11, 16, v11
; GFX8-NEXT: v_lshrrev_b32_e32 v16, 16, v30
; GFX8-NEXT: v_lshrrev_b32_e32 v13, 16, v13
; GFX8-NEXT: v_lshrrev_b32_e32 v12, 16, v12
+; GFX8-NEXT: v_lshrrev_b32_e32 v11, 16, v11
; GFX8-NEXT: v_alignbit_b32 v0, v0, v17, 16
; GFX8-NEXT: v_alignbit_b32 v1, v1, v18, 16
; GFX8-NEXT: v_alignbit_b32 v2, v2, v19, 16
@@ -20508,278 +20510,278 @@ define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: buffer_load_dword v32, off, s[0:3], s32
-; GFX10-NEXT: v_lshlrev_b32_e32 v37, 16, v28
-; GFX10-NEXT: v_lshlrev_b32_e32 v38, 16, v12
-; GFX10-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
-; GFX10-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
; GFX10-NEXT: v_lshlrev_b32_e32 v39, 16, v27
; GFX10-NEXT: v_lshlrev_b32_e32 v48, 16, v11
; GFX10-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
; GFX10-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
; GFX10-NEXT: v_lshlrev_b32_e32 v49, 16, v26
; GFX10-NEXT: v_lshlrev_b32_e32 v50, 16, v10
-; GFX10-NEXT: v_lshlrev_b32_e32 v33, 16, v30
-; GFX10-NEXT: v_lshlrev_b32_e32 v34, 16, v14
-; GFX10-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
-; GFX10-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
-; GFX10-NEXT: v_lshlrev_b32_e32 v35, 16, v29
-; GFX10-NEXT: v_lshlrev_b32_e32 v36, 16, v13
-; GFX10-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
-; GFX10-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
-; GFX10-NEXT: v_min_f32_e32 v12, v12, v28
-; GFX10-NEXT: v_lshlrev_b32_e32 v28, 16, v22
-; GFX10-NEXT: v_min_f32_e32 v39, v48, v39
-; GFX10-NEXT: v_lshlrev_b32_e32 v48, 16, v6
-; GFX10-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
-; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
-; GFX10-NEXT: v_min_f32_e32 v11, v11, v27
-; GFX10-NEXT: v_lshlrev_b32_e32 v27, 16, v21
-; GFX10-NEXT: v_min_f32_e32 v49, v50, v49
-; GFX10-NEXT: v_lshlrev_b32_e32 v50, 16, v5
-; GFX10-NEXT: v_min_f32_e32 v33, v34, v33
-; GFX10-NEXT: v_min_f32_e32 v14, v14, v30
-; GFX10-NEXT: v_lshlrev_b32_e32 v30, 16, v24
-; GFX10-NEXT: v_min_f32_e32 v35, v36, v35
-; GFX10-NEXT: v_lshlrev_b32_e32 v36, 16, v8
-; GFX10-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
-; GFX10-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
-; GFX10-NEXT: v_min_f32_e32 v13, v13, v29
-; GFX10-NEXT: v_lshlrev_b32_e32 v29, 16, v23
-; GFX10-NEXT: v_min_f32_e32 v37, v38, v37
-; GFX10-NEXT: v_lshlrev_b32_e32 v38, 16, v7
-; GFX10-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
-; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
-; GFX10-NEXT: v_min_f32_e32 v6, v6, v22
-; GFX10-NEXT: v_lshlrev_b32_e32 v22, 16, v16
-; GFX10-NEXT: v_min_f32_e32 v27, v50, v27
-; GFX10-NEXT: v_lshlrev_b32_e32 v50, 16, v0
-; GFX10-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
-; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX10-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
; GFX10-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; GFX10-NEXT: v_lshlrev_b32_e32 v37, 16, v28
+; GFX10-NEXT: v_lshlrev_b32_e32 v38, 16, v12
+; GFX10-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
+; GFX10-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
; GFX10-NEXT: v_lshlrev_b32_e32 v51, 16, v25
-; GFX10-NEXT: v_lshlrev_b32_e32 v34, 16, v9
+; GFX10-NEXT: v_lshlrev_b32_e32 v52, 16, v9
; GFX10-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
; GFX10-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
-; GFX10-NEXT: v_min_f32_e32 v8, v8, v24
-; GFX10-NEXT: v_lshlrev_b32_e32 v24, 16, v18
-; GFX10-NEXT: v_min_f32_e32 v29, v38, v29
-; GFX10-NEXT: v_lshlrev_b32_e32 v38, 16, v2
+; GFX10-NEXT: v_lshlrev_b32_e32 v53, 16, v24
+; GFX10-NEXT: v_lshlrev_b32_e32 v54, 16, v8
+; GFX10-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
+; GFX10-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; GFX10-NEXT: v_lshlrev_b32_e32 v55, 16, v23
+; GFX10-NEXT: v_lshlrev_b32_e32 v64, 16, v7
+; GFX10-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
+; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GFX10-NEXT: v_lshlrev_b32_e32 v65, 16, v22
+; GFX10-NEXT: v_lshlrev_b32_e32 v66, 16, v6
+; GFX10-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
+; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GFX10-NEXT: v_lshlrev_b32_e32 v67, 16, v21
+; GFX10-NEXT: v_lshlrev_b32_e32 v68, 16, v5
+; GFX10-NEXT: v_min_f32_e32 v39, v48, v39
+; GFX10-NEXT: v_min_f32_e32 v11, v11, v27
+; GFX10-NEXT: v_min_f32_e32 v49, v50, v49
+; GFX10-NEXT: v_min_f32_e32 v10, v10, v26
+; GFX10-NEXT: v_lshlrev_b32_e32 v35, 16, v29
+; GFX10-NEXT: v_lshlrev_b32_e32 v36, 16, v13
+; GFX10-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
+; GFX10-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
+; GFX10-NEXT: v_min_f32_e32 v37, v38, v37
+; GFX10-NEXT: v_lshlrev_b32_e32 v38, 16, v18
+; GFX10-NEXT: v_min_f32_e32 v12, v12, v28
+; GFX10-NEXT: v_lshlrev_b32_e32 v28, 16, v2
; GFX10-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX10-NEXT: v_min_f32_e32 v7, v7, v23
-; GFX10-NEXT: v_lshlrev_b32_e32 v23, 16, v17
-; GFX10-NEXT: v_min_f32_e32 v28, v48, v28
-; GFX10-NEXT: v_lshlrev_b32_e32 v48, 16, v1
+; GFX10-NEXT: v_lshlrev_b32_e32 v48, 16, v17
+; GFX10-NEXT: v_lshlrev_b32_e32 v27, 16, v1
; GFX10-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX10-NEXT: v_min_f32_e32 v0, v0, v16
-; GFX10-NEXT: v_bfe_u32 v16, v33, 16, 1
-; GFX10-NEXT: v_min_f32_e32 v10, v10, v26
-; GFX10-NEXT: v_lshlrev_b32_e32 v26, 16, v20
-; GFX10-NEXT: v_min_f32_e32 v34, v34, v51
-; GFX10-NEXT: v_lshlrev_b32_e32 v51, 16, v4
-; GFX10-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
-; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX10-NEXT: v_lshlrev_b32_e32 v50, 16, v16
+; GFX10-NEXT: v_lshlrev_b32_e32 v26, 16, v0
+; GFX10-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
+; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX10-NEXT: v_min_f32_e32 v9, v9, v25
-; GFX10-NEXT: v_lshlrev_b32_e32 v25, 16, v19
-; GFX10-NEXT: v_min_f32_e32 v30, v36, v30
-; GFX10-NEXT: v_lshlrev_b32_e32 v36, 16, v3
+; GFX10-NEXT: v_min_f32_e32 v25, v54, v53
+; GFX10-NEXT: v_min_f32_e32 v8, v8, v24
+; GFX10-NEXT: v_min_f32_e32 v24, v64, v55
+; GFX10-NEXT: v_min_f32_e32 v7, v7, v23
+; GFX10-NEXT: v_min_f32_e32 v23, v66, v65
+; GFX10-NEXT: v_min_f32_e32 v6, v6, v22
+; GFX10-NEXT: v_min_f32_e32 v22, v68, v67
+; GFX10-NEXT: v_bfe_u32 v53, v39, 16, 1
+; GFX10-NEXT: v_bfe_u32 v55, v11, 16, 1
+; GFX10-NEXT: v_bfe_u32 v65, v49, 16, 1
+; GFX10-NEXT: v_bfe_u32 v67, v10, 16, 1
+; GFX10-NEXT: v_lshlrev_b32_e32 v33, 16, v30
+; GFX10-NEXT: v_lshlrev_b32_e32 v34, 16, v14
+; GFX10-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
+; GFX10-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
+; GFX10-NEXT: v_min_f32_e32 v35, v36, v35
+; GFX10-NEXT: v_lshlrev_b32_e32 v36, 16, v19
+; GFX10-NEXT: v_min_f32_e32 v13, v13, v29
+; GFX10-NEXT: v_lshlrev_b32_e32 v29, 16, v3
; GFX10-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX10-NEXT: v_min_f32_e32 v2, v2, v18
-; GFX10-NEXT: v_min_f32_e32 v18, v48, v23
+; GFX10-NEXT: v_min_f32_e32 v18, v27, v48
; GFX10-NEXT: v_min_f32_e32 v1, v1, v17
-; GFX10-NEXT: v_min_f32_e32 v17, v50, v22
-; GFX10-NEXT: v_or_b32_e32 v22, 0x400000, v33
-; GFX10-NEXT: v_bfe_u32 v23, v14, 16, 1
-; GFX10-NEXT: v_add3_u32 v16, v16, v33, 0x7fff
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33
+; GFX10-NEXT: v_min_f32_e32 v17, v26, v50
+; GFX10-NEXT: v_min_f32_e32 v0, v0, v16
+; GFX10-NEXT: v_or_b32_e32 v54, 0x400000, v39
+; GFX10-NEXT: v_or_b32_e32 v64, 0x400000, v11
+; GFX10-NEXT: v_or_b32_e32 v66, 0x400000, v49
+; GFX10-NEXT: v_or_b32_e32 v68, 0x400000, v10
+; GFX10-NEXT: v_cmp_u_f32_e64 s9, v39, v39
+; GFX10-NEXT: v_add3_u32 v39, v53, v39, 0x7fff
+; GFX10-NEXT: v_cmp_u_f32_e64 s10, v11, v11
+; GFX10-NEXT: v_add3_u32 v11, v55, v11, 0x7fff
+; GFX10-NEXT: v_cmp_u_f32_e64 s11, v49, v49
+; GFX10-NEXT: v_add3_u32 v49, v65, v49, 0x7fff
+; GFX10-NEXT: v_cmp_u_f32_e64 s12, v10, v10
+; GFX10-NEXT: v_add3_u32 v10, v67, v10, 0x7fff
; GFX10-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; GFX10-NEXT: v_min_f32_e32 v4, v4, v20
-; GFX10-NEXT: v_min_f32_e32 v20, v36, v25
+; GFX10-NEXT: v_min_f32_e32 v33, v34, v33
+; GFX10-NEXT: v_lshlrev_b32_e32 v34, 16, v20
+; GFX10-NEXT: v_min_f32_e32 v14, v14, v30
+; GFX10-NEXT: v_lshlrev_b32_e32 v30, 16, v4
+; GFX10-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
+; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GFX10-NEXT: v_min_f32_e32 v3, v3, v19
-; GFX10-NEXT: v_min_f32_e32 v19, v38, v24
-; GFX10-NEXT: v_or_b32_e32 v24, 0x400000, v14
-; GFX10-NEXT: v_bfe_u32 v25, v35, 16, 1
-; GFX10-NEXT: v_add3_u32 v23, v23, v14, 0x7fff
-; GFX10-NEXT: v_cndmask_b32_e32 v16, v16, v22, vcc_lo
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14
+; GFX10-NEXT: v_min_f32_e32 v19, v28, v38
+; GFX10-NEXT: v_bfe_u32 v38, v37, 16, 1
+; GFX10-NEXT: v_bfe_u32 v50, v12, 16, 1
+; GFX10-NEXT: v_cndmask_b32_e64 v39, v39, v54, s9
+; GFX10-NEXT: v_bfe_u32 v54, v18, 16, 1
+; GFX10-NEXT: v_cndmask_b32_e64 v11, v11, v64, s10
+; GFX10-NEXT: v_bfe_u32 v64, v1, 16, 1
+; GFX10-NEXT: v_cndmask_b32_e64 v49, v49, v66, s11
+; GFX10-NEXT: v_bfe_u32 v66, v17, 16, 1
+; GFX10-NEXT: v_cndmask_b32_e64 v10, v10, v68, s12
+; GFX10-NEXT: v_bfe_u32 v68, v0, 16, 1
+; GFX10-NEXT: v_min_f32_e32 v51, v52, v51
; GFX10-NEXT: v_min_f32_e32 v5, v5, v21
-; GFX10-NEXT: v_min_f32_e32 v21, v51, v26
-; GFX10-NEXT: v_or_b32_e32 v26, 0x400000, v35
-; GFX10-NEXT: v_bfe_u32 v36, v13, 16, 1
-; GFX10-NEXT: v_add3_u32 v25, v25, v35, 0x7fff
-; GFX10-NEXT: v_cndmask_b32_e32 v23, v23, v24, vcc_lo
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35
-; GFX10-NEXT: v_or_b32_e32 v38, 0x400000, v13
-; GFX10-NEXT: v_bfe_u32 v48, v37, 16, 1
-; GFX10-NEXT: v_add3_u32 v36, v36, v13, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v50, 0x400000, v37
-; GFX10-NEXT: v_cndmask_b32_e32 v25, v25, v26, vcc_lo
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
-; GFX10-NEXT: v_bfe_u32 v51, v12, 16, 1
-; GFX10-NEXT: v_add3_u32 v48, v48, v37, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v33, 0x400000, v12
-; GFX10-NEXT: v_bfe_u32 v22, v39, 16, 1
-; GFX10-NEXT: v_cndmask_b32_e32 v36, v36, v38, vcc_lo
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37
-; GFX10-NEXT: v_add3_u32 v51, v51, v12, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v14, 0x400000, v39
-; GFX10-NEXT: v_bfe_u32 v24, v11, 16, 1
-; GFX10-NEXT: v_add3_u32 v22, v22, v39, 0x7fff
-; GFX10-NEXT: v_cndmask_b32_e32 v48, v48, v50, vcc_lo
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
-; GFX10-NEXT: v_or_b32_e32 v35, 0x400000, v11
-; GFX10-NEXT: v_bfe_u32 v26, v49, 16, 1
-; GFX10-NEXT: v_add3_u32 v24, v24, v11, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v13, 0x400000, v49
-; GFX10-NEXT: v_cndmask_b32_e32 v33, v51, v33, vcc_lo
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v39, v39
-; GFX10-NEXT: v_bfe_u32 v38, v10, 16, 1
-; GFX10-NEXT: v_add3_u32 v26, v26, v49, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v37, 0x400000, v10
-; GFX10-NEXT: v_bfe_u32 v50, v34, 16, 1
-; GFX10-NEXT: v_cndmask_b32_e32 v14, v22, v14, vcc_lo
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
-; GFX10-NEXT: v_add3_u32 v38, v38, v10, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v12, 0x400000, v34
-; GFX10-NEXT: v_bfe_u32 v51, v9, 16, 1
-; GFX10-NEXT: v_add3_u32 v50, v50, v34, 0x7fff
-; GFX10-NEXT: v_cndmask_b32_e32 v24, v24, v35, vcc_lo
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v49, v49
-; GFX10-NEXT: v_or_b32_e32 v39, 0x400000, v9
-; GFX10-NEXT: v_bfe_u32 v22, v30, 16, 1
-; GFX10-NEXT: v_add3_u32 v51, v51, v9, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v11, 0x400000, v30
-; GFX10-NEXT: v_cndmask_b32_e32 v13, v26, v13, vcc_lo
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
-; GFX10-NEXT: v_bfe_u32 v35, v8, 16, 1
-; GFX10-NEXT: v_add3_u32 v22, v22, v30, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v49, 0x400000, v8
-; GFX10-NEXT: v_bfe_u32 v26, v29, 16, 1
-; GFX10-NEXT: v_cndmask_b32_e32 v37, v38, v37, vcc_lo
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34
-; GFX10-NEXT: v_add3_u32 v35, v35, v8, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v29
-; GFX10-NEXT: v_bfe_u32 v38, v7, 16, 1
-; GFX10-NEXT: v_add3_u32 v26, v26, v29, 0x7fff
-; GFX10-NEXT: v_cndmask_b32_e32 v12, v50, v12, vcc_lo
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
-; GFX10-NEXT: v_or_b32_e32 v34, 0x400000, v7
-; GFX10-NEXT: v_bfe_u32 v50, v28, 16, 1
-; GFX10-NEXT: v_add3_u32 v38, v38, v7, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v28
-; GFX10-NEXT: v_cndmask_b32_e32 v39, v51, v39, vcc_lo
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30
-; GFX10-NEXT: v_bfe_u32 v51, v6, 16, 1
-; GFX10-NEXT: v_add3_u32 v50, v50, v28, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v30, 0x400000, v6
+; GFX10-NEXT: v_min_f32_e32 v21, v30, v34
+; GFX10-NEXT: v_min_f32_e32 v4, v4, v20
+; GFX10-NEXT: v_min_f32_e32 v20, v29, v36
+; GFX10-NEXT: v_bfe_u32 v16, v33, 16, 1
+; GFX10-NEXT: v_bfe_u32 v27, v14, 16, 1
+; GFX10-NEXT: v_bfe_u32 v29, v35, 16, 1
+; GFX10-NEXT: v_bfe_u32 v34, v13, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v48, 0x400000, v37
+; GFX10-NEXT: v_or_b32_e32 v52, 0x400000, v12
+; GFX10-NEXT: v_cmp_u_f32_e64 s7, v37, v37
+; GFX10-NEXT: v_add3_u32 v37, v38, v37, 0x7fff
+; GFX10-NEXT: v_cmp_u_f32_e64 s8, v12, v12
+; GFX10-NEXT: v_add3_u32 v12, v50, v12, 0x7fff
+; GFX10-NEXT: v_cmp_u_f32_e64 s10, v18, v18
+; GFX10-NEXT: v_add3_u32 v54, v54, v18, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v18, 0x400000, v18
+; GFX10-NEXT: v_cmp_u_f32_e64 s11, v1, v1
+; GFX10-NEXT: v_add3_u32 v64, v64, v1, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v1, 0x400000, v1
+; GFX10-NEXT: v_cmp_u_f32_e64 s12, v17, v17
+; GFX10-NEXT: v_add3_u32 v66, v66, v17, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v17, 0x400000, v17
+; GFX10-NEXT: v_cmp_u_f32_e64 s22, v0, v0
+; GFX10-NEXT: v_add3_u32 v68, v68, v0, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v0, 0x400000, v0
+; GFX10-NEXT: v_or_b32_e32 v26, 0x400000, v33
+; GFX10-NEXT: v_or_b32_e32 v28, 0x400000, v14
+; GFX10-NEXT: v_or_b32_e32 v30, 0x400000, v35
+; GFX10-NEXT: v_or_b32_e32 v36, 0x400000, v13
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33
+; GFX10-NEXT: v_add3_u32 v16, v16, v33, 0x7fff
+; GFX10-NEXT: v_bfe_u32 v33, v51, 16, 1
+; GFX10-NEXT: v_cmp_u_f32_e64 s4, v14, v14
+; GFX10-NEXT: v_add3_u32 v14, v27, v14, 0x7fff
+; GFX10-NEXT: v_cmp_u_f32_e64 s5, v35, v35
+; GFX10-NEXT: v_add3_u32 v29, v29, v35, 0x7fff
+; GFX10-NEXT: v_cmp_u_f32_e64 s6, v13, v13
+; GFX10-NEXT: v_add3_u32 v13, v34, v13, 0x7fff
+; GFX10-NEXT: v_bfe_u32 v65, v24, 16, 1
+; GFX10-NEXT: v_cndmask_b32_e64 v37, v37, v48, s7
+; GFX10-NEXT: v_bfe_u32 v48, v19, 16, 1
+; GFX10-NEXT: v_cndmask_b32_e64 v12, v12, v52, s8
+; GFX10-NEXT: v_bfe_u32 v52, v2, 16, 1
+; GFX10-NEXT: v_cndmask_b32_e64 v18, v54, v18, s10
+; GFX10-NEXT: v_cndmask_b32_e64 v17, v66, v17, s12
+; GFX10-NEXT: v_cndmask_b32_e64 v0, v68, v0, s22
+; GFX10-NEXT: v_cndmask_b32_e64 v1, v64, v1, s11
; GFX10-NEXT: v_lshlrev_b32_e32 v31, 16, v15
-; GFX10-NEXT: v_cndmask_b32_e32 v11, v22, v11, vcc_lo
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
-; GFX10-NEXT: v_bfe_u32 v22, v27, 16, 1
-; GFX10-NEXT: v_add3_u32 v51, v51, v6, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v27
; GFX10-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
-; GFX10-NEXT: v_cndmask_b32_e32 v35, v35, v49, vcc_lo
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29
-; GFX10-NEXT: v_bfe_u32 v49, v5, 16, 1
-; GFX10-NEXT: v_add3_u32 v22, v22, v27, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v29, 0x400000, v5
-; GFX10-NEXT: v_cndmask_b32_e32 v10, v26, v10, vcc_lo
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX10-NEXT: v_or_b32_e32 v27, 0x400000, v51
+; GFX10-NEXT: v_bfe_u32 v35, v9, 16, 1
+; GFX10-NEXT: v_bfe_u32 v38, v25, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v67, 0x400000, v24
+; GFX10-NEXT: v_cmp_u_f32_e64 s13, v51, v51
+; GFX10-NEXT: v_add3_u32 v33, v33, v51, 0x7fff
+; GFX10-NEXT: v_bfe_u32 v51, v7, 16, 1
+; GFX10-NEXT: v_cmp_u_f32_e64 s17, v24, v24
+; GFX10-NEXT: v_add3_u32 v24, v65, v24, 0x7fff
+; GFX10-NEXT: v_bfe_u32 v65, v6, 16, 1
+; GFX10-NEXT: v_cndmask_b32_e32 v16, v16, v26, vcc_lo
; GFX10-NEXT: v_bfe_u32 v26, v21, 16, 1
-; GFX10-NEXT: v_add3_u32 v49, v49, v5, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v21
-; GFX10-NEXT: v_cndmask_b32_e32 v34, v38, v34, vcc_lo
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28
-; GFX10-NEXT: v_bfe_u32 v38, v4, 16, 1
+; GFX10-NEXT: v_cndmask_b32_e64 v14, v14, v28, s4
+; GFX10-NEXT: v_bfe_u32 v28, v4, 16, 1
+; GFX10-NEXT: v_cndmask_b32_e64 v29, v29, v30, s5
+; GFX10-NEXT: v_bfe_u32 v30, v20, 16, 1
+; GFX10-NEXT: v_cndmask_b32_e64 v13, v13, v36, s6
+; GFX10-NEXT: v_bfe_u32 v36, v3, 16, 1
+; GFX10-NEXT: v_cmp_u_f32_e64 s8, v19, v19
+; GFX10-NEXT: v_add3_u32 v48, v48, v19, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v19, 0x400000, v19
+; GFX10-NEXT: v_cmp_u_f32_e64 s9, v2, v2
+; GFX10-NEXT: v_add3_u32 v52, v52, v2, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v2
+; GFX10-NEXT: v_perm_b32 v0, v0, v17, 0x7060302
+; GFX10-NEXT: v_perm_b32 v1, v1, v18, 0x7060302
+; GFX10-NEXT: v_or_b32_e32 v34, 0x400000, v9
+; GFX10-NEXT: v_or_b32_e32 v50, 0x400000, v25
+; GFX10-NEXT: v_bfe_u32 v53, v8, 16, 1
+; GFX10-NEXT: v_cmp_u_f32_e64 s14, v9, v9
+; GFX10-NEXT: v_add3_u32 v9, v35, v9, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v35, 0x400000, v7
+; GFX10-NEXT: v_cmp_u_f32_e64 s15, v25, v25
+; GFX10-NEXT: v_add3_u32 v25, v38, v25, 0x7fff
+; GFX10-NEXT: v_bfe_u32 v38, v23, 16, 1
+; GFX10-NEXT: v_cmp_u_f32_e64 s18, v7, v7
+; GFX10-NEXT: v_add3_u32 v7, v51, v7, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v51, 0x400000, v6
+; GFX10-NEXT: v_cmp_u_f32_e64 s20, v6, v6
+; GFX10-NEXT: v_add3_u32 v6, v65, v6, 0x7fff
+; GFX10-NEXT: v_bfe_u32 v65, v5, 16, 1
+; GFX10-NEXT: v_cmp_u_f32_e64 s4, v21, v21
; GFX10-NEXT: v_add3_u32 v26, v26, v21, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v28, 0x400000, v4
-; GFX10-NEXT: v_cndmask_b32_e32 v9, v50, v9, vcc_lo
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX10-NEXT: v_bfe_u32 v50, v20, 16, 1
-; GFX10-NEXT: v_add3_u32 v38, v38, v4, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v20
-; GFX10-NEXT: v_cndmask_b32_e32 v30, v51, v30, vcc_lo
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v27, v27
-; GFX10-NEXT: v_add3_u32 v50, v50, v20, 0x7fff
-; GFX10-NEXT: v_bfe_u32 v51, v3, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v27, 0x400000, v3
-; GFX10-NEXT: v_cndmask_b32_e32 v8, v22, v8, vcc_lo
+; GFX10-NEXT: v_or_b32_e32 v21, 0x400000, v21
+; GFX10-NEXT: v_cmp_u_f32_e64 s5, v4, v4
+; GFX10-NEXT: v_add3_u32 v28, v28, v4, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v4
+; GFX10-NEXT: v_cmp_u_f32_e64 s6, v20, v20
+; GFX10-NEXT: v_add3_u32 v30, v30, v20, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v20, 0x400000, v20
+; GFX10-NEXT: v_cmp_u_f32_e64 s7, v3, v3
+; GFX10-NEXT: v_add3_u32 v36, v36, v3, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v3, 0x400000, v3
+; GFX10-NEXT: v_cndmask_b32_e64 v19, v48, v19, s8
+; GFX10-NEXT: v_cndmask_b32_e64 v2, v52, v2, s9
+; GFX10-NEXT: v_or_b32_e32 v55, 0x400000, v8
+; GFX10-NEXT: v_cmp_u_f32_e64 s16, v8, v8
+; GFX10-NEXT: v_add3_u32 v8, v53, v8, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v53, 0x400000, v23
+; GFX10-NEXT: v_cmp_u_f32_e64 s19, v23, v23
+; GFX10-NEXT: v_add3_u32 v23, v38, v23, 0x7fff
+; GFX10-NEXT: v_bfe_u32 v38, v22, 16, 1
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX10-NEXT: v_bfe_u32 v22, v19, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v19
-; GFX10-NEXT: v_add3_u32 v51, v51, v3, 0x7fff
-; GFX10-NEXT: v_cndmask_b32_e32 v29, v49, v29, vcc_lo
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21
-; GFX10-NEXT: v_add3_u32 v22, v22, v19, 0x7fff
-; GFX10-NEXT: v_bfe_u32 v49, v2, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v21, 0x400000, v2
-; GFX10-NEXT: v_cndmask_b32_e32 v7, v26, v7, vcc_lo
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX10-NEXT: v_bfe_u32 v26, v18, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v18
-; GFX10-NEXT: v_add3_u32 v49, v49, v2, 0x7fff
-; GFX10-NEXT: v_cndmask_b32_e32 v28, v38, v28, vcc_lo
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20
-; GFX10-NEXT: v_bfe_u32 v38, v1, 16, 1
-; GFX10-NEXT: v_add3_u32 v26, v26, v18, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v20, 0x400000, v1
-; GFX10-NEXT: v_cndmask_b32_e32 v6, v50, v6, vcc_lo
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19
-; GFX10-NEXT: v_bfe_u32 v50, v17, 16, 1
-; GFX10-NEXT: v_add3_u32 v38, v38, v1, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v19, 0x400000, v17
-; GFX10-NEXT: v_cndmask_b32_e32 v5, v22, v5, vcc_lo
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18
-; GFX10-NEXT: v_bfe_u32 v22, v0, 16, 1
-; GFX10-NEXT: v_add3_u32 v50, v50, v17, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v18, 0x400000, v0
-; GFX10-NEXT: v_cndmask_b32_e32 v4, v26, v4, vcc_lo
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX10-NEXT: v_add3_u32 v22, v22, v0, 0x7fff
-; GFX10-NEXT: v_cndmask_b32_e32 v1, v38, v20, vcc_lo
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
-; GFX10-NEXT: v_perm_b32 v1, v1, v4, 0x7060302
-; GFX10-NEXT: v_cndmask_b32_e32 v17, v50, v19, vcc_lo
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX10-NEXT: v_perm_b32 v4, v28, v7, 0x7060302
-; GFX10-NEXT: v_perm_b32 v7, v34, v10, 0x7060302
-; GFX10-NEXT: v_cndmask_b32_e32 v0, v22, v18, vcc_lo
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX10-NEXT: v_perm_b32 v0, v0, v17, 0x7060302
-; GFX10-NEXT: v_cndmask_b32_e32 v2, v49, v21, vcc_lo
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX10-NEXT: v_perm_b32 v2, v2, v5, 0x7060302
-; GFX10-NEXT: v_cndmask_b32_e32 v3, v51, v27, vcc_lo
-; GFX10-NEXT: v_perm_b32 v5, v29, v8, 0x7060302
-; GFX10-NEXT: v_perm_b32 v8, v35, v11, 0x7060302
-; GFX10-NEXT: v_perm_b32 v3, v3, v6, 0x7060302
-; GFX10-NEXT: v_perm_b32 v6, v30, v9, 0x7060302
-; GFX10-NEXT: v_perm_b32 v9, v39, v12, 0x7060302
+; GFX10-NEXT: v_add3_u32 v65, v65, v5, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v5
+; GFX10-NEXT: v_cndmask_b32_e64 v21, v26, v21, s4
+; GFX10-NEXT: v_cndmask_b32_e64 v4, v28, v4, s5
+; GFX10-NEXT: v_cndmask_b32_e64 v20, v30, v20, s6
+; GFX10-NEXT: v_cndmask_b32_e64 v3, v36, v3, s7
+; GFX10-NEXT: v_perm_b32 v2, v2, v19, 0x7060302
+; GFX10-NEXT: v_cmp_u_f32_e64 s21, v22, v22
+; GFX10-NEXT: v_add3_u32 v38, v38, v22, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v22, 0x400000, v22
+; GFX10-NEXT: v_cndmask_b32_e32 v5, v65, v5, vcc_lo
+; GFX10-NEXT: v_perm_b32 v3, v3, v20, 0x7060302
+; GFX10-NEXT: v_perm_b32 v4, v4, v21, 0x7060302
+; GFX10-NEXT: v_cndmask_b32_e64 v27, v33, v27, s13
+; GFX10-NEXT: v_cndmask_b32_e64 v9, v9, v34, s14
+; GFX10-NEXT: v_cndmask_b32_e64 v25, v25, v50, s15
+; GFX10-NEXT: v_cndmask_b32_e64 v8, v8, v55, s16
+; GFX10-NEXT: v_cndmask_b32_e64 v24, v24, v67, s17
+; GFX10-NEXT: v_cndmask_b32_e64 v7, v7, v35, s18
+; GFX10-NEXT: v_cndmask_b32_e64 v23, v23, v53, s19
+; GFX10-NEXT: v_cndmask_b32_e64 v6, v6, v51, s20
+; GFX10-NEXT: v_cndmask_b32_e64 v22, v38, v22, s21
+; GFX10-NEXT: v_perm_b32 v8, v8, v25, 0x7060302
+; GFX10-NEXT: v_perm_b32 v7, v7, v24, 0x7060302
+; GFX10-NEXT: v_perm_b32 v9, v9, v27, 0x7060302
+; GFX10-NEXT: v_perm_b32 v6, v6, v23, 0x7060302
+; GFX10-NEXT: v_perm_b32 v5, v5, v22, 0x7060302
+; GFX10-NEXT: v_perm_b32 v10, v10, v49, 0x7060302
+; GFX10-NEXT: v_perm_b32 v11, v11, v39, 0x7060302
+; GFX10-NEXT: v_perm_b32 v12, v12, v37, 0x7060302
+; GFX10-NEXT: v_perm_b32 v13, v13, v29, 0x7060302
+; GFX10-NEXT: v_perm_b32 v14, v14, v16, 0x7060302
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v32
; GFX10-NEXT: v_and_b32_e32 v18, 0xffff0000, v32
; GFX10-NEXT: v_min_f32_e32 v17, v31, v17
; GFX10-NEXT: v_min_f32_e32 v15, v15, v18
-; GFX10-NEXT: v_bfe_u32 v10, v17, 16, 1
-; GFX10-NEXT: v_bfe_u32 v11, v15, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v12, 0x400000, v17
+; GFX10-NEXT: v_bfe_u32 v18, v17, 16, 1
+; GFX10-NEXT: v_bfe_u32 v19, v15, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v20, 0x400000, v17
+; GFX10-NEXT: v_or_b32_e32 v21, 0x400000, v15
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
-; GFX10-NEXT: v_or_b32_e32 v19, 0x400000, v15
-; GFX10-NEXT: v_add3_u32 v18, v10, v17, 0x7fff
-; GFX10-NEXT: v_add3_u32 v11, v11, v15, 0x7fff
-; GFX10-NEXT: v_perm_b32 v10, v37, v13, 0x7060302
-; GFX10-NEXT: v_perm_b32 v13, v36, v25, 0x7060302
-; GFX10-NEXT: v_cndmask_b32_e32 v17, v18, v12, vcc_lo
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15
-; GFX10-NEXT: v_perm_b32 v12, v33, v48, 0x7060302
-; GFX10-NEXT: v_cndmask_b32_e32 v15, v11, v19, vcc_lo
-; GFX10-NEXT: v_perm_b32 v11, v24, v14, 0x7060302
-; GFX10-NEXT: v_perm_b32 v14, v23, v16, 0x7060302
+; GFX10-NEXT: v_cmp_u_f32_e64 s4, v15, v15
+; GFX10-NEXT: v_add3_u32 v17, v18, v17, 0x7fff
+; GFX10-NEXT: v_add3_u32 v15, v19, v15, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v17, v17, v20, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v15, v15, v21, s4
; GFX10-NEXT: v_perm_b32 v15, v15, v17, 0x7060302
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
@@ -21473,7 +21475,7 @@ define <3 x bfloat> @v_maxnum_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) {
; GFX11TRUE16-NEXT: v_perm_b32 v0, v0, v2, 0x7060302
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v1, v3, v6, vcc_lo
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11TRUE16-NEXT: v_alignbit_b32 v1, v0, v1, 16
+; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11FAKE16-LABEL: v_maxnum_v3bf16:
@@ -22191,47 +22193,55 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GCN-NEXT: v_max_f32_e32 v12, v12, v28
; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11
; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v27
-; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10
-; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v26
-; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
-; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25
-; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8
-; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v24
-; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
-; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23
-; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
-; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22
-; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
-; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21
-; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
-; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20
-; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19
-; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18
-; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17
-; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16
-; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15
; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
; GCN-NEXT: v_max_f32_e32 v11, v11, v27
-; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32
+; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10
+; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v26
; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; GCN-NEXT: v_max_f32_e32 v10, v10, v26
+; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
+; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25
; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; GCN-NEXT: v_max_f32_e32 v9, v9, v25
+; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8
+; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v24
; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; GCN-NEXT: v_max_f32_e32 v8, v8, v24
+; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
+; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23
; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GCN-NEXT: v_max_f32_e32 v7, v7, v23
+; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
+; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22
; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GCN-NEXT: v_max_f32_e32 v6, v6, v22
+; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21
; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GCN-NEXT: v_max_f32_e32 v5, v5, v21
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16
+; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17
+; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18
+; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19
+; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20
+; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15
; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GCN-NEXT: v_max_f32_e32 v4, v4, v20
+; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32
+; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
@@ -22240,14 +22250,6 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
-; GCN-NEXT: v_max_f32_e32 v10, v10, v26
-; GCN-NEXT: v_max_f32_e32 v9, v9, v25
-; GCN-NEXT: v_max_f32_e32 v8, v8, v24
-; GCN-NEXT: v_max_f32_e32 v7, v7, v23
-; GCN-NEXT: v_max_f32_e32 v6, v6, v22
-; GCN-NEXT: v_max_f32_e32 v5, v5, v21
-; GCN-NEXT: v_max_f32_e32 v4, v4, v20
; GCN-NEXT: v_max_f32_e32 v3, v3, v19
; GCN-NEXT: v_max_f32_e32 v2, v2, v18
; GCN-NEXT: v_max_f32_e32 v1, v1, v17
@@ -22267,7 +22269,7 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v27
+; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v20
; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
; GCN-NEXT: v_max_f32_e32 v15, v15, v16
; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
@@ -22277,22 +22279,20 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX7-LABEL: v_maxnum_v16bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11
-; GFX7-NEXT: v_mul_f32_e32 v27, 1.0, v27
-; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
-; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
-; GFX7-NEXT: v_max_f32_e32 v11, v11, v27
-; GFX7-NEXT: buffer_load_dword v27, off, s[0:3], s32
; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22
; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GFX7-NEXT: v_max_f32_e32 v6, v6, v22
+; GFX7-NEXT: buffer_load_dword v22, off, s[0:3], s32
; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14
; GFX7-NEXT: v_mul_f32_e32 v30, 1.0, v30
; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13
; GFX7-NEXT: v_mul_f32_e32 v29, 1.0, v29
; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12
; GFX7-NEXT: v_mul_f32_e32 v28, 1.0, v28
+; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11
+; GFX7-NEXT: v_mul_f32_e32 v27, 1.0, v27
; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10
; GFX7-NEXT: v_mul_f32_e32 v26, 1.0, v26
; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9
@@ -22303,24 +22303,25 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX7-NEXT: v_mul_f32_e32 v23, 1.0, v23
; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15
; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
-; GFX7-NEXT: v_max_f32_e32 v6, v6, v22
; GFX7-NEXT: v_mul_f32_e32 v21, 1.0, v21
-; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
-; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v20
-; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18
-; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17
+; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18
+; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19
+; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v20
; GFX7-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
; GFX7-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
+; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
+; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
; GFX7-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
@@ -22345,6 +22346,7 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX7-NEXT: v_max_f32_e32 v14, v14, v30
; GFX7-NEXT: v_max_f32_e32 v13, v13, v29
; GFX7-NEXT: v_max_f32_e32 v12, v12, v28
+; GFX7-NEXT: v_max_f32_e32 v11, v11, v27
; GFX7-NEXT: v_max_f32_e32 v10, v10, v26
; GFX7-NEXT: v_max_f32_e32 v9, v9, v25
; GFX7-NEXT: v_max_f32_e32 v8, v8, v24
@@ -22363,7 +22365,7 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v27
+; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22
; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
; GFX7-NEXT: v_max_f32_e32 v15, v15, v22
; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
@@ -23821,10 +23823,10 @@ define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX8-NEXT: v_lshrrev_b32_e32 v8, 16, v8
; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v9
; GFX8-NEXT: v_lshrrev_b32_e32 v10, 16, v10
-; GFX8-NEXT: v_lshrrev_b32_e32 v11, 16, v11
; GFX8-NEXT: v_lshrrev_b32_e32 v16, 16, v30
; GFX8-NEXT: v_lshrrev_b32_e32 v13, 16, v13
; GFX8-NEXT: v_lshrrev_b32_e32 v12, 16, v12
+; GFX8-NEXT: v_lshrrev_b32_e32 v11, 16, v11
; GFX8-NEXT: v_alignbit_b32 v0, v0, v17, 16
; GFX8-NEXT: v_alignbit_b32 v1, v1, v18, 16
; GFX8-NEXT: v_alignbit_b32 v2, v2, v19, 16
@@ -24127,278 +24129,278 @@ define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: buffer_load_dword v32, off, s[0:3], s32
-; GFX10-NEXT: v_lshlrev_b32_e32 v37, 16, v28
-; GFX10-NEXT: v_lshlrev_b32_e32 v38, 16, v12
-; GFX10-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
-; GFX10-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
; GFX10-NEXT: v_lshlrev_b32_e32 v39, 16, v27
; GFX10-NEXT: v_lshlrev_b32_e32 v48, 16, v11
; GFX10-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
; GFX10-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
; GFX10-NEXT: v_lshlrev_b32_e32 v49, 16, v26
; GFX10-NEXT: v_lshlrev_b32_e32 v50, 16, v10
-; GFX10-NEXT: v_lshlrev_b32_e32 v33, 16, v30
-; GFX10-NEXT: v_lshlrev_b32_e32 v34, 16, v14
-; GFX10-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
-; GFX10-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
-; GFX10-NEXT: v_lshlrev_b32_e32 v35, 16, v29
-; GFX10-NEXT: v_lshlrev_b32_e32 v36, 16, v13
-; GFX10-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
-; GFX10-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
-; GFX10-NEXT: v_max_f32_e32 v12, v12, v28
-; GFX10-NEXT: v_lshlrev_b32_e32 v28, 16, v22
-; GFX10-NEXT: v_max_f32_e32 v39, v48, v39
-; GFX10-NEXT: v_lshlrev_b32_e32 v48, 16, v6
-; GFX10-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
-; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
-; GFX10-NEXT: v_max_f32_e32 v11, v11, v27
-; GFX10-NEXT: v_lshlrev_b32_e32 v27, 16, v21
-; GFX10-NEXT: v_max_f32_e32 v49, v50, v49
-; GFX10-NEXT: v_lshlrev_b32_e32 v50, 16, v5
-; GFX10-NEXT: v_max_f32_e32 v33, v34, v33
-; GFX10-NEXT: v_max_f32_e32 v14, v14, v30
-; GFX10-NEXT: v_lshlrev_b32_e32 v30, 16, v24
-; GFX10-NEXT: v_max_f32_e32 v35, v36, v35
-; GFX10-NEXT: v_lshlrev_b32_e32 v36, 16, v8
-; GFX10-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
-; GFX10-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
-; GFX10-NEXT: v_max_f32_e32 v13, v13, v29
-; GFX10-NEXT: v_lshlrev_b32_e32 v29, 16, v23
-; GFX10-NEXT: v_max_f32_e32 v37, v38, v37
-; GFX10-NEXT: v_lshlrev_b32_e32 v38, 16, v7
-; GFX10-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
-; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
-; GFX10-NEXT: v_max_f32_e32 v6, v6, v22
-; GFX10-NEXT: v_lshlrev_b32_e32 v22, 16, v16
-; GFX10-NEXT: v_max_f32_e32 v27, v50, v27
-; GFX10-NEXT: v_lshlrev_b32_e32 v50, 16, v0
-; GFX10-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
-; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX10-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
; GFX10-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; GFX10-NEXT: v_lshlrev_b32_e32 v37, 16, v28
+; GFX10-NEXT: v_lshlrev_b32_e32 v38, 16, v12
+; GFX10-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
+; GFX10-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
; GFX10-NEXT: v_lshlrev_b32_e32 v51, 16, v25
-; GFX10-NEXT: v_lshlrev_b32_e32 v34, 16, v9
+; GFX10-NEXT: v_lshlrev_b32_e32 v52, 16, v9
; GFX10-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
; GFX10-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
-; GFX10-NEXT: v_max_f32_e32 v8, v8, v24
-; GFX10-NEXT: v_lshlrev_b32_e32 v24, 16, v18
-; GFX10-NEXT: v_max_f32_e32 v29, v38, v29
-; GFX10-NEXT: v_lshlrev_b32_e32 v38, 16, v2
+; GFX10-NEXT: v_lshlrev_b32_e32 v53, 16, v24
+; GFX10-NEXT: v_lshlrev_b32_e32 v54, 16, v8
+; GFX10-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
+; GFX10-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; GFX10-NEXT: v_lshlrev_b32_e32 v55, 16, v23
+; GFX10-NEXT: v_lshlrev_b32_e32 v64, 16, v7
+; GFX10-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
+; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GFX10-NEXT: v_lshlrev_b32_e32 v65, 16, v22
+; GFX10-NEXT: v_lshlrev_b32_e32 v66, 16, v6
+; GFX10-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
+; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GFX10-NEXT: v_lshlrev_b32_e32 v67, 16, v21
+; GFX10-NEXT: v_lshlrev_b32_e32 v68, 16, v5
+; GFX10-NEXT: v_max_f32_e32 v39, v48, v39
+; GFX10-NEXT: v_max_f32_e32 v11, v11, v27
+; GFX10-NEXT: v_max_f32_e32 v49, v50, v49
+; GFX10-NEXT: v_max_f32_e32 v10, v10, v26
+; GFX10-NEXT: v_lshlrev_b32_e32 v35, 16, v29
+; GFX10-NEXT: v_lshlrev_b32_e32 v36, 16, v13
+; GFX10-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
+; GFX10-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
+; GFX10-NEXT: v_max_f32_e32 v37, v38, v37
+; GFX10-NEXT: v_lshlrev_b32_e32 v38, 16, v18
+; GFX10-NEXT: v_max_f32_e32 v12, v12, v28
+; GFX10-NEXT: v_lshlrev_b32_e32 v28, 16, v2
; GFX10-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX10-NEXT: v_max_f32_e32 v7, v7, v23
-; GFX10-NEXT: v_lshlrev_b32_e32 v23, 16, v17
-; GFX10-NEXT: v_max_f32_e32 v28, v48, v28
-; GFX10-NEXT: v_lshlrev_b32_e32 v48, 16, v1
+; GFX10-NEXT: v_lshlrev_b32_e32 v48, 16, v17
+; GFX10-NEXT: v_lshlrev_b32_e32 v27, 16, v1
; GFX10-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX10-NEXT: v_max_f32_e32 v0, v0, v16
-; GFX10-NEXT: v_bfe_u32 v16, v33, 16, 1
-; GFX10-NEXT: v_max_f32_e32 v10, v10, v26
-; GFX10-NEXT: v_lshlrev_b32_e32 v26, 16, v20
-; GFX10-NEXT: v_max_f32_e32 v34, v34, v51
-; GFX10-NEXT: v_lshlrev_b32_e32 v51, 16, v4
-; GFX10-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
-; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX10-NEXT: v_lshlrev_b32_e32 v50, 16, v16
+; GFX10-NEXT: v_lshlrev_b32_e32 v26, 16, v0
+; GFX10-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
+; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX10-NEXT: v_max_f32_e32 v9, v9, v25
-; GFX10-NEXT: v_lshlrev_b32_e32 v25, 16, v19
-; GFX10-NEXT: v_max_f32_e32 v30, v36, v30
-; GFX10-NEXT: v_lshlrev_b32_e32 v36, 16, v3
+; GFX10-NEXT: v_max_f32_e32 v25, v54, v53
+; GFX10-NEXT: v_max_f32_e32 v8, v8, v24
+; GFX10-NEXT: v_max_f32_e32 v24, v64, v55
+; GFX10-NEXT: v_max_f32_e32 v7, v7, v23
+; GFX10-NEXT: v_max_f32_e32 v23, v66, v65
+; GFX10-NEXT: v_max_f32_e32 v6, v6, v22
+; GFX10-NEXT: v_max_f32_e32 v22, v68, v67
+; GFX10-NEXT: v_bfe_u32 v53, v39, 16, 1
+; GFX10-NEXT: v_bfe_u32 v55, v11, 16, 1
+; GFX10-NEXT: v_bfe_u32 v65, v49, 16, 1
+; GFX10-NEXT: v_bfe_u32 v67, v10, 16, 1
+; GFX10-NEXT: v_lshlrev_b32_e32 v33, 16, v30
+; GFX10-NEXT: v_lshlrev_b32_e32 v34, 16, v14
+; GFX10-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
+; GFX10-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
+; GFX10-NEXT: v_max_f32_e32 v35, v36, v35
+; GFX10-NEXT: v_lshlrev_b32_e32 v36, 16, v19
+; GFX10-NEXT: v_max_f32_e32 v13, v13, v29
+; GFX10-NEXT: v_lshlrev_b32_e32 v29, 16, v3
; GFX10-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX10-NEXT: v_max_f32_e32 v2, v2, v18
-; GFX10-NEXT: v_max_f32_e32 v18, v48, v23
+; GFX10-NEXT: v_max_f32_e32 v18, v27, v48
; GFX10-NEXT: v_max_f32_e32 v1, v1, v17
-; GFX10-NEXT: v_max_f32_e32 v17, v50, v22
-; GFX10-NEXT: v_or_b32_e32 v22, 0x400000, v33
-; GFX10-NEXT: v_bfe_u32 v23, v14, 16, 1
-; GFX10-NEXT: v_add3_u32 v16, v16, v33, 0x7fff
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33
+; GFX10-NEXT: v_max_f32_e32 v17, v26, v50
+; GFX10-NEXT: v_max_f32_e32 v0, v0, v16
+; GFX10-NEXT: v_or_b32_e32 v54, 0x400000, v39
+; GFX10-NEXT: v_or_b32_e32 v64, 0x400000, v11
+; GFX10-NEXT: v_or_b32_e32 v66, 0x400000, v49
+; GFX10-NEXT: v_or_b32_e32 v68, 0x400000, v10
+; GFX10-NEXT: v_cmp_u_f32_e64 s9, v39, v39
+; GFX10-NEXT: v_add3_u32 v39, v53, v39, 0x7fff
+; GFX10-NEXT: v_cmp_u_f32_e64 s10, v11, v11
+; GFX10-NEXT: v_add3_u32 v11, v55, v11, 0x7fff
+; GFX10-NEXT: v_cmp_u_f32_e64 s11, v49, v49
+; GFX10-NEXT: v_add3_u32 v49, v65, v49, 0x7fff
+; GFX10-NEXT: v_cmp_u_f32_e64 s12, v10, v10
+; GFX10-NEXT: v_add3_u32 v10, v67, v10, 0x7fff
; GFX10-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; GFX10-NEXT: v_max_f32_e32 v4, v4, v20
-; GFX10-NEXT: v_max_f32_e32 v20, v36, v25
+; GFX10-NEXT: v_max_f32_e32 v33, v34, v33
+; GFX10-NEXT: v_lshlrev_b32_e32 v34, 16, v20
+; GFX10-NEXT: v_max_f32_e32 v14, v14, v30
+; GFX10-NEXT: v_lshlrev_b32_e32 v30, 16, v4
+; GFX10-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
+; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GFX10-NEXT: v_max_f32_e32 v3, v3, v19
-; GFX10-NEXT: v_max_f32_e32 v19, v38, v24
-; GFX10-NEXT: v_or_b32_e32 v24, 0x400000, v14
-; GFX10-NEXT: v_bfe_u32 v25, v35, 16, 1
-; GFX10-NEXT: v_add3_u32 v23, v23, v14, 0x7fff
-; GFX10-NEXT: v_cndmask_b32_e32 v16, v16, v22, vcc_lo
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14
+; GFX10-NEXT: v_max_f32_e32 v19, v28, v38
+; GFX10-NEXT: v_bfe_u32 v38, v37, 16, 1
+; GFX10-NEXT: v_bfe_u32 v50, v12, 16, 1
+; GFX10-NEXT: v_cndmask_b32_e64 v39, v39, v54, s9
+; GFX10-NEXT: v_bfe_u32 v54, v18, 16, 1
+; GFX10-NEXT: v_cndmask_b32_e64 v11, v11, v64, s10
+; GFX10-NEXT: v_bfe_u32 v64, v1, 16, 1
+; GFX10-NEXT: v_cndmask_b32_e64 v49, v49, v66, s11
+; GFX10-NEXT: v_bfe_u32 v66, v17, 16, 1
+; GFX10-NEXT: v_cndmask_b32_e64 v10, v10, v68, s12
+; GFX10-NEXT: v_bfe_u32 v68, v0, 16, 1
+; GFX10-NEXT: v_max_f32_e32 v51, v52, v51
; GFX10-NEXT: v_max_f32_e32 v5, v5, v21
-; GFX10-NEXT: v_max_f32_e32 v21, v51, v26
-; GFX10-NEXT: v_or_b32_e32 v26, 0x400000, v35
-; GFX10-NEXT: v_bfe_u32 v36, v13, 16, 1
-; GFX10-NEXT: v_add3_u32 v25, v25, v35, 0x7fff
-; GFX10-NEXT: v_cndmask_b32_e32 v23, v23, v24, vcc_lo
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35
-; GFX10-NEXT: v_or_b32_e32 v38, 0x400000, v13
-; GFX10-NEXT: v_bfe_u32 v48, v37, 16, 1
-; GFX10-NEXT: v_add3_u32 v36, v36, v13, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v50, 0x400000, v37
-; GFX10-NEXT: v_cndmask_b32_e32 v25, v25, v26, vcc_lo
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
-; GFX10-NEXT: v_bfe_u32 v51, v12, 16, 1
-; GFX10-NEXT: v_add3_u32 v48, v48, v37, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v33, 0x400000, v12
-; GFX10-NEXT: v_bfe_u32 v22, v39, 16, 1
-; GFX10-NEXT: v_cndmask_b32_e32 v36, v36, v38, vcc_lo
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37
-; GFX10-NEXT: v_add3_u32 v51, v51, v12, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v14, 0x400000, v39
-; GFX10-NEXT: v_bfe_u32 v24, v11, 16, 1
-; GFX10-NEXT: v_add3_u32 v22, v22, v39, 0x7fff
-; GFX10-NEXT: v_cndmask_b32_e32 v48, v48, v50, vcc_lo
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
-; GFX10-NEXT: v_or_b32_e32 v35, 0x400000, v11
-; GFX10-NEXT: v_bfe_u32 v26, v49, 16, 1
-; GFX10-NEXT: v_add3_u32 v24, v24, v11, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v13, 0x400000, v49
-; GFX10-NEXT: v_cndmask_b32_e32 v33, v51, v33, vcc_lo
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v39, v39
-; GFX10-NEXT: v_bfe_u32 v38, v10, 16, 1
-; GFX10-NEXT: v_add3_u32 v26, v26, v49, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v37, 0x400000, v10
-; GFX10-NEXT: v_bfe_u32 v50, v34, 16, 1
-; GFX10-NEXT: v_cndmask_b32_e32 v14, v22, v14, vcc_lo
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
-; GFX10-NEXT: v_add3_u32 v38, v38, v10, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v12, 0x400000, v34
-; GFX10-NEXT: v_bfe_u32 v51, v9, 16, 1
-; GFX10-NEXT: v_add3_u32 v50, v50, v34, 0x7fff
-; GFX10-NEXT: v_cndmask_b32_e32 v24, v24, v35, vcc_lo
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v49, v49
-; GFX10-NEXT: v_or_b32_e32 v39, 0x400000, v9
-; GFX10-NEXT: v_bfe_u32 v22, v30, 16, 1
-; GFX10-NEXT: v_add3_u32 v51, v51, v9, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v11, 0x400000, v30
-; GFX10-NEXT: v_cndmask_b32_e32 v13, v26, v13, vcc_lo
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
-; GFX10-NEXT: v_bfe_u32 v35, v8, 16, 1
-; GFX10-NEXT: v_add3_u32 v22, v22, v30, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v49, 0x400000, v8
-; GFX10-NEXT: v_bfe_u32 v26, v29, 16, 1
-; GFX10-NEXT: v_cndmask_b32_e32 v37, v38, v37, vcc_lo
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34
-; GFX10-NEXT: v_add3_u32 v35, v35, v8, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v29
-; GFX10-NEXT: v_bfe_u32 v38, v7, 16, 1
-; GFX10-NEXT: v_add3_u32 v26, v26, v29, 0x7fff
-; GFX10-NEXT: v_cndmask_b32_e32 v12, v50, v12, vcc_lo
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
-; GFX10-NEXT: v_or_b32_e32 v34, 0x400000, v7
-; GFX10-NEXT: v_bfe_u32 v50, v28, 16, 1
-; GFX10-NEXT: v_add3_u32 v38, v38, v7, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v28
-; GFX10-NEXT: v_cndmask_b32_e32 v39, v51, v39, vcc_lo
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30
-; GFX10-NEXT: v_bfe_u32 v51, v6, 16, 1
-; GFX10-NEXT: v_add3_u32 v50, v50, v28, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v30, 0x400000, v6
+; GFX10-NEXT: v_max_f32_e32 v21, v30, v34
+; GFX10-NEXT: v_max_f32_e32 v4, v4, v20
+; GFX10-NEXT: v_max_f32_e32 v20, v29, v36
+; GFX10-NEXT: v_bfe_u32 v16, v33, 16, 1
+; GFX10-NEXT: v_bfe_u32 v27, v14, 16, 1
+; GFX10-NEXT: v_bfe_u32 v29, v35, 16, 1
+; GFX10-NEXT: v_bfe_u32 v34, v13, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v48, 0x400000, v37
+; GFX10-NEXT: v_or_b32_e32 v52, 0x400000, v12
+; GFX10-NEXT: v_cmp_u_f32_e64 s7, v37, v37
+; GFX10-NEXT: v_add3_u32 v37, v38, v37, 0x7fff
+; GFX10-NEXT: v_cmp_u_f32_e64 s8, v12, v12
+; GFX10-NEXT: v_add3_u32 v12, v50, v12, 0x7fff
+; GFX10-NEXT: v_cmp_u_f32_e64 s10, v18, v18
+; GFX10-NEXT: v_add3_u32 v54, v54, v18, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v18, 0x400000, v18
+; GFX10-NEXT: v_cmp_u_f32_e64 s11, v1, v1
+; GFX10-NEXT: v_add3_u32 v64, v64, v1, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v1, 0x400000, v1
+; GFX10-NEXT: v_cmp_u_f32_e64 s12, v17, v17
+; GFX10-NEXT: v_add3_u32 v66, v66, v17, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v17, 0x400000, v17
+; GFX10-NEXT: v_cmp_u_f32_e64 s22, v0, v0
+; GFX10-NEXT: v_add3_u32 v68, v68, v0, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v0, 0x400000, v0
+; GFX10-NEXT: v_or_b32_e32 v26, 0x400000, v33
+; GFX10-NEXT: v_or_b32_e32 v28, 0x400000, v14
+; GFX10-NEXT: v_or_b32_e32 v30, 0x400000, v35
+; GFX10-NEXT: v_or_b32_e32 v36, 0x400000, v13
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33
+; GFX10-NEXT: v_add3_u32 v16, v16, v33, 0x7fff
+; GFX10-NEXT: v_bfe_u32 v33, v51, 16, 1
+; GFX10-NEXT: v_cmp_u_f32_e64 s4, v14, v14
+; GFX10-NEXT: v_add3_u32 v14, v27, v14, 0x7fff
+; GFX10-NEXT: v_cmp_u_f32_e64 s5, v35, v35
+; GFX10-NEXT: v_add3_u32 v29, v29, v35, 0x7fff
+; GFX10-NEXT: v_cmp_u_f32_e64 s6, v13, v13
+; GFX10-NEXT: v_add3_u32 v13, v34, v13, 0x7fff
+; GFX10-NEXT: v_bfe_u32 v65, v24, 16, 1
+; GFX10-NEXT: v_cndmask_b32_e64 v37, v37, v48, s7
+; GFX10-NEXT: v_bfe_u32 v48, v19, 16, 1
+; GFX10-NEXT: v_cndmask_b32_e64 v12, v12, v52, s8
+; GFX10-NEXT: v_bfe_u32 v52, v2, 16, 1
+; GFX10-NEXT: v_cndmask_b32_e64 v18, v54, v18, s10
+; GFX10-NEXT: v_cndmask_b32_e64 v17, v66, v17, s12
+; GFX10-NEXT: v_cndmask_b32_e64 v0, v68, v0, s22
+; GFX10-NEXT: v_cndmask_b32_e64 v1, v64, v1, s11
; GFX10-NEXT: v_lshlrev_b32_e32 v31, 16, v15
-; GFX10-NEXT: v_cndmask_b32_e32 v11, v22, v11, vcc_lo
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
-; GFX10-NEXT: v_bfe_u32 v22, v27, 16, 1
-; GFX10-NEXT: v_add3_u32 v51, v51, v6, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v27
; GFX10-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
-; GFX10-NEXT: v_cndmask_b32_e32 v35, v35, v49, vcc_lo
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29
-; GFX10-NEXT: v_bfe_u32 v49, v5, 16, 1
-; GFX10-NEXT: v_add3_u32 v22, v22, v27, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v29, 0x400000, v5
-; GFX10-NEXT: v_cndmask_b32_e32 v10, v26, v10, vcc_lo
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX10-NEXT: v_or_b32_e32 v27, 0x400000, v51
+; GFX10-NEXT: v_bfe_u32 v35, v9, 16, 1
+; GFX10-NEXT: v_bfe_u32 v38, v25, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v67, 0x400000, v24
+; GFX10-NEXT: v_cmp_u_f32_e64 s13, v51, v51
+; GFX10-NEXT: v_add3_u32 v33, v33, v51, 0x7fff
+; GFX10-NEXT: v_bfe_u32 v51, v7, 16, 1
+; GFX10-NEXT: v_cmp_u_f32_e64 s17, v24, v24
+; GFX10-NEXT: v_add3_u32 v24, v65, v24, 0x7fff
+; GFX10-NEXT: v_bfe_u32 v65, v6, 16, 1
+; GFX10-NEXT: v_cndmask_b32_e32 v16, v16, v26, vcc_lo
; GFX10-NEXT: v_bfe_u32 v26, v21, 16, 1
-; GFX10-NEXT: v_add3_u32 v49, v49, v5, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v21
-; GFX10-NEXT: v_cndmask_b32_e32 v34, v38, v34, vcc_lo
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28
-; GFX10-NEXT: v_bfe_u32 v38, v4, 16, 1
+; GFX10-NEXT: v_cndmask_b32_e64 v14, v14, v28, s4
+; GFX10-NEXT: v_bfe_u32 v28, v4, 16, 1
+; GFX10-NEXT: v_cndmask_b32_e64 v29, v29, v30, s5
+; GFX10-NEXT: v_bfe_u32 v30, v20, 16, 1
+; GFX10-NEXT: v_cndmask_b32_e64 v13, v13, v36, s6
+; GFX10-NEXT: v_bfe_u32 v36, v3, 16, 1
+; GFX10-NEXT: v_cmp_u_f32_e64 s8, v19, v19
+; GFX10-NEXT: v_add3_u32 v48, v48, v19, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v19, 0x400000, v19
+; GFX10-NEXT: v_cmp_u_f32_e64 s9, v2, v2
+; GFX10-NEXT: v_add3_u32 v52, v52, v2, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v2
+; GFX10-NEXT: v_perm_b32 v0, v0, v17, 0x7060302
+; GFX10-NEXT: v_perm_b32 v1, v1, v18, 0x7060302
+; GFX10-NEXT: v_or_b32_e32 v34, 0x400000, v9
+; GFX10-NEXT: v_or_b32_e32 v50, 0x400000, v25
+; GFX10-NEXT: v_bfe_u32 v53, v8, 16, 1
+; GFX10-NEXT: v_cmp_u_f32_e64 s14, v9, v9
+; GFX10-NEXT: v_add3_u32 v9, v35, v9, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v35, 0x400000, v7
+; GFX10-NEXT: v_cmp_u_f32_e64 s15, v25, v25
+; GFX10-NEXT: v_add3_u32 v25, v38, v25, 0x7fff
+; GFX10-NEXT: v_bfe_u32 v38, v23, 16, 1
+; GFX10-NEXT: v_cmp_u_f32_e64 s18, v7, v7
+; GFX10-NEXT: v_add3_u32 v7, v51, v7, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v51, 0x400000, v6
+; GFX10-NEXT: v_cmp_u_f32_e64 s20, v6, v6
+; GFX10-NEXT: v_add3_u32 v6, v65, v6, 0x7fff
+; GFX10-NEXT: v_bfe_u32 v65, v5, 16, 1
+; GFX10-NEXT: v_cmp_u_f32_e64 s4, v21, v21
; GFX10-NEXT: v_add3_u32 v26, v26, v21, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v28, 0x400000, v4
-; GFX10-NEXT: v_cndmask_b32_e32 v9, v50, v9, vcc_lo
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX10-NEXT: v_bfe_u32 v50, v20, 16, 1
-; GFX10-NEXT: v_add3_u32 v38, v38, v4, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v20
-; GFX10-NEXT: v_cndmask_b32_e32 v30, v51, v30, vcc_lo
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v27, v27
-; GFX10-NEXT: v_add3_u32 v50, v50, v20, 0x7fff
-; GFX10-NEXT: v_bfe_u32 v51, v3, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v27, 0x400000, v3
-; GFX10-NEXT: v_cndmask_b32_e32 v8, v22, v8, vcc_lo
+; GFX10-NEXT: v_or_b32_e32 v21, 0x400000, v21
+; GFX10-NEXT: v_cmp_u_f32_e64 s5, v4, v4
+; GFX10-NEXT: v_add3_u32 v28, v28, v4, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v4
+; GFX10-NEXT: v_cmp_u_f32_e64 s6, v20, v20
+; GFX10-NEXT: v_add3_u32 v30, v30, v20, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v20, 0x400000, v20
+; GFX10-NEXT: v_cmp_u_f32_e64 s7, v3, v3
+; GFX10-NEXT: v_add3_u32 v36, v36, v3, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v3, 0x400000, v3
+; GFX10-NEXT: v_cndmask_b32_e64 v19, v48, v19, s8
+; GFX10-NEXT: v_cndmask_b32_e64 v2, v52, v2, s9
+; GFX10-NEXT: v_or_b32_e32 v55, 0x400000, v8
+; GFX10-NEXT: v_cmp_u_f32_e64 s16, v8, v8
+; GFX10-NEXT: v_add3_u32 v8, v53, v8, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v53, 0x400000, v23
+; GFX10-NEXT: v_cmp_u_f32_e64 s19, v23, v23
+; GFX10-NEXT: v_add3_u32 v23, v38, v23, 0x7fff
+; GFX10-NEXT: v_bfe_u32 v38, v22, 16, 1
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX10-NEXT: v_bfe_u32 v22, v19, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v19
-; GFX10-NEXT: v_add3_u32 v51, v51, v3, 0x7fff
-; GFX10-NEXT: v_cndmask_b32_e32 v29, v49, v29, vcc_lo
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21
-; GFX10-NEXT: v_add3_u32 v22, v22, v19, 0x7fff
-; GFX10-NEXT: v_bfe_u32 v49, v2, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v21, 0x400000, v2
-; GFX10-NEXT: v_cndmask_b32_e32 v7, v26, v7, vcc_lo
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX10-NEXT: v_bfe_u32 v26, v18, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v18
-; GFX10-NEXT: v_add3_u32 v49, v49, v2, 0x7fff
-; GFX10-NEXT: v_cndmask_b32_e32 v28, v38, v28, vcc_lo
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20
-; GFX10-NEXT: v_bfe_u32 v38, v1, 16, 1
-; GFX10-NEXT: v_add3_u32 v26, v26, v18, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v20, 0x400000, v1
-; GFX10-NEXT: v_cndmask_b32_e32 v6, v50, v6, vcc_lo
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19
-; GFX10-NEXT: v_bfe_u32 v50, v17, 16, 1
-; GFX10-NEXT: v_add3_u32 v38, v38, v1, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v19, 0x400000, v17
-; GFX10-NEXT: v_cndmask_b32_e32 v5, v22, v5, vcc_lo
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18
-; GFX10-NEXT: v_bfe_u32 v22, v0, 16, 1
-; GFX10-NEXT: v_add3_u32 v50, v50, v17, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v18, 0x400000, v0
-; GFX10-NEXT: v_cndmask_b32_e32 v4, v26, v4, vcc_lo
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX10-NEXT: v_add3_u32 v22, v22, v0, 0x7fff
-; GFX10-NEXT: v_cndmask_b32_e32 v1, v38, v20, vcc_lo
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
-; GFX10-NEXT: v_perm_b32 v1, v1, v4, 0x7060302
-; GFX10-NEXT: v_cndmask_b32_e32 v17, v50, v19, vcc_lo
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX10-NEXT: v_perm_b32 v4, v28, v7, 0x7060302
-; GFX10-NEXT: v_perm_b32 v7, v34, v10, 0x7060302
-; GFX10-NEXT: v_cndmask_b32_e32 v0, v22, v18, vcc_lo
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX10-NEXT: v_perm_b32 v0, v0, v17, 0x7060302
-; GFX10-NEXT: v_cndmask_b32_e32 v2, v49, v21, vcc_lo
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX10-NEXT: v_perm_b32 v2, v2, v5, 0x7060302
-; GFX10-NEXT: v_cndmask_b32_e32 v3, v51, v27, vcc_lo
-; GFX10-NEXT: v_perm_b32 v5, v29, v8, 0x7060302
-; GFX10-NEXT: v_perm_b32 v8, v35, v11, 0x7060302
-; GFX10-NEXT: v_perm_b32 v3, v3, v6, 0x7060302
-; GFX10-NEXT: v_perm_b32 v6, v30, v9, 0x7060302
-; GFX10-NEXT: v_perm_b32 v9, v39, v12, 0x7060302
+; GFX10-NEXT: v_add3_u32 v65, v65, v5, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v5
+; GFX10-NEXT: v_cndmask_b32_e64 v21, v26, v21, s4
+; GFX10-NEXT: v_cndmask_b32_e64 v4, v28, v4, s5
+; GFX10-NEXT: v_cndmask_b32_e64 v20, v30, v20, s6
+; GFX10-NEXT: v_cndmask_b32_e64 v3, v36, v3, s7
+; GFX10-NEXT: v_perm_b32 v2, v2, v19, 0x7060302
+; GFX10-NEXT: v_cmp_u_f32_e64 s21, v22, v22
+; GFX10-NEXT: v_add3_u32 v38, v38, v22, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v22, 0x400000, v22
+; GFX10-NEXT: v_cndmask_b32_e32 v5, v65, v5, vcc_lo
+; GFX10-NEXT: v_perm_b32 v3, v3, v20, 0x7060302
+; GFX10-NEXT: v_perm_b32 v4, v4, v21, 0x7060302
+; GFX10-NEXT: v_cndmask_b32_e64 v27, v33, v27, s13
+; GFX10-NEXT: v_cndmask_b32_e64 v9, v9, v34, s14
+; GFX10-NEXT: v_cndmask_b32_e64 v25, v25, v50, s15
+; GFX10-NEXT: v_cndmask_b32_e64 v8, v8, v55, s16
+; GFX10-NEXT: v_cndmask_b32_e64 v24, v24, v67, s17
+; GFX10-NEXT: v_cndmask_b32_e64 v7, v7, v35, s18
+; GFX10-NEXT: v_cndmask_b32_e64 v23, v23, v53, s19
+; GFX10-NEXT: v_cndmask_b32_e64 v6, v6, v51, s20
+; GFX10-NEXT: v_cndmask_b32_e64 v22, v38, v22, s21
+; GFX10-NEXT: v_perm_b32 v8, v8, v25, 0x7060302
+; GFX10-NEXT: v_perm_b32 v7, v7, v24, 0x7060302
+; GFX10-NEXT: v_perm_b32 v9, v9, v27, 0x7060302
+; GFX10-NEXT: v_perm_b32 v6, v6, v23, 0x7060302
+; GFX10-NEXT: v_perm_b32 v5, v5, v22, 0x7060302
+; GFX10-NEXT: v_perm_b32 v10, v10, v49, 0x7060302
+; GFX10-NEXT: v_perm_b32 v11, v11, v39, 0x7060302
+; GFX10-NEXT: v_perm_b32 v12, v12, v37, 0x7060302
+; GFX10-NEXT: v_perm_b32 v13, v13, v29, 0x7060302
+; GFX10-NEXT: v_perm_b32 v14, v14, v16, 0x7060302
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v32
; GFX10-NEXT: v_and_b32_e32 v18, 0xffff0000, v32
; GFX10-NEXT: v_max_f32_e32 v17, v31, v17
; GFX10-NEXT: v_max_f32_e32 v15, v15, v18
-; GFX10-NEXT: v_bfe_u32 v10, v17, 16, 1
-; GFX10-NEXT: v_bfe_u32 v11, v15, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v12, 0x400000, v17
+; GFX10-NEXT: v_bfe_u32 v18, v17, 16, 1
+; GFX10-NEXT: v_bfe_u32 v19, v15, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v20, 0x400000, v17
+; GFX10-NEXT: v_or_b32_e32 v21, 0x400000, v15
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
-; GFX10-NEXT: v_or_b32_e32 v19, 0x400000, v15
-; GFX10-NEXT: v_add3_u32 v18, v10, v17, 0x7fff
-; GFX10-NEXT: v_add3_u32 v11, v11, v15, 0x7fff
-; GFX10-NEXT: v_perm_b32 v10, v37, v13, 0x7060302
-; GFX10-NEXT: v_perm_b32 v13, v36, v25, 0x7060302
-; GFX10-NEXT: v_cndmask_b32_e32 v17, v18, v12, vcc_lo
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15
-; GFX10-NEXT: v_perm_b32 v12, v33, v48, 0x7060302
-; GFX10-NEXT: v_cndmask_b32_e32 v15, v11, v19, vcc_lo
-; GFX10-NEXT: v_perm_b32 v11, v24, v14, 0x7060302
-; GFX10-NEXT: v_perm_b32 v14, v23, v16, 0x7060302
+; GFX10-NEXT: v_cmp_u_f32_e64 s4, v15, v15
+; GFX10-NEXT: v_add3_u32 v17, v18, v17, 0x7fff
+; GFX10-NEXT: v_add3_u32 v15, v19, v15, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v17, v17, v20, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v15, v15, v21, s4
; GFX10-NEXT: v_perm_b32 v15, v15, v17, 0x7060302
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
@@ -25052,26 +25054,26 @@ define bfloat @v_log_bf16(bfloat %a) {
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: s_mov_b32 s4, 0x800000
+; GCN-NEXT: v_mov_b32_e32 v1, 0x4f800000
; GCN-NEXT: s_mov_b32 s5, 0x7f800000
-; GCN-NEXT: v_mov_b32_e32 v1, 0x41b17218
+; GCN-NEXT: v_mov_b32_e32 v2, 0x41b17218
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; GCN-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GCN-NEXT: v_lshlrev_b32_e32 v2, 5, v2
-; GCN-NEXT: v_ldexp_f32_e32 v0, v0, v2
+; GCN-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc
+; GCN-NEXT: v_mul_f32_e32 v0, v0, v1
; GCN-NEXT: v_log_f32_e32 v0, v0
-; GCN-NEXT: v_and_b32_e32 v2, 0xfffff000, v0
-; GCN-NEXT: v_sub_f32_e32 v3, v0, v2
-; GCN-NEXT: v_mul_f32_e32 v4, 0x3805fdf4, v2
-; GCN-NEXT: v_mul_f32_e32 v2, 0x3f317000, v2
+; GCN-NEXT: v_and_b32_e32 v1, 0xfffff000, v0
+; GCN-NEXT: v_sub_f32_e32 v3, v0, v1
+; GCN-NEXT: v_mul_f32_e32 v4, 0x3805fdf4, v1
+; GCN-NEXT: v_mul_f32_e32 v1, 0x3f317000, v1
; GCN-NEXT: v_mul_f32_e32 v5, 0x3f317000, v3
; GCN-NEXT: v_mul_f32_e32 v3, 0x3805fdf4, v3
; GCN-NEXT: v_add_f32_e32 v3, v4, v3
; GCN-NEXT: v_add_f32_e32 v3, v5, v3
-; GCN-NEXT: v_add_f32_e32 v2, v2, v3
+; GCN-NEXT: v_add_f32_e32 v1, v1, v3
; GCN-NEXT: v_cmp_lt_f32_e64 s[4:5], |v0|, s5
-; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[4:5]
-; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[4:5]
+; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc
; GCN-NEXT: v_sub_f32_e32 v0, v0, v1
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
@@ -25082,10 +25084,10 @@ define bfloat @v_log_bf16(bfloat %a) {
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: s_mov_b32 s4, 0x800000
+; GFX7-NEXT: v_mov_b32_e32 v1, 0x4f800000
; GFX7-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; GFX7-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; GFX7-NEXT: v_lshlrev_b32_e32 v1, 5, v1
-; GFX7-NEXT: v_ldexp_f32_e32 v0, v0, v1
+; GFX7-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc
+; GFX7-NEXT: v_mul_f32_e32 v0, v0, v1
; GFX7-NEXT: v_log_f32_e32 v0, v0
; GFX7-NEXT: s_mov_b32 s4, 0x3f317217
; GFX7-NEXT: v_mul_f32_e32 v1, 0x3f317217, v0
@@ -25107,10 +25109,10 @@ define bfloat @v_log_bf16(bfloat %a) {
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX8-NEXT: s_mov_b32 s4, 0x800000
+; GFX8-NEXT: v_mov_b32_e32 v1, 0x4f800000
; GFX8-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; GFX8-NEXT: v_lshlrev_b32_e32 v1, 5, v1
-; GFX8-NEXT: v_ldexp_f32 v0, v0, v1
+; GFX8-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc
+; GFX8-NEXT: v_mul_f32_e32 v0, v0, v1
; GFX8-NEXT: v_log_f32_e32 v0, v0
; GFX8-NEXT: s_mov_b32 s4, 0x7f800000
; GFX8-NEXT: v_and_b32_e32 v1, 0xfffff000, v0
@@ -25141,10 +25143,10 @@ define bfloat @v_log_bf16(bfloat %a) {
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX9-NEXT: s_mov_b32 s4, 0x800000
+; GFX9-NEXT: v_mov_b32_e32 v1, 0x4f800000
; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; GFX9-NEXT: v_lshlrev_b32_e32 v1, 5, v1
-; GFX9-NEXT: v_ldexp_f32 v0, v0, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc
+; GFX9-NEXT: v_mul_f32_e32 v0, v0, v1
; GFX9-NEXT: v_log_f32_e32 v0, v0
; GFX9-NEXT: s_mov_b32 s4, 0x3f317217
; GFX9-NEXT: v_mul_f32_e32 v1, 0x3f317217, v0
@@ -25172,9 +25174,8 @@ define bfloat @v_log_bf16(bfloat %a) {
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
-; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo
-; GFX10-NEXT: v_lshlrev_b32_e32 v1, 5, v1
-; GFX10-NEXT: v_ldexp_f32 v0, v0, v1
+; GFX10-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo
+; GFX10-NEXT: v_mul_f32_e32 v0, v0, v1
; GFX10-NEXT: v_log_f32_e32 v0, v0
; GFX10-NEXT: v_mul_f32_e32 v1, 0x3f317217, v0
; GFX10-NEXT: v_fma_f32 v2, 0x3f317217, v0, -v1
@@ -25198,30 +25199,28 @@ define bfloat @v_log_bf16(bfloat %a) {
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
-; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo
-; GFX11-NEXT: v_lshlrev_b32_e32 v1, 5, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_ldexp_f32 v0, v0, v1
+; GFX11-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo
+; GFX11-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_log_f32_e32 v0, v0
; GFX11-NEXT: s_waitcnt_depctr 0xfff
; GFX11-NEXT: v_mul_f32_e32 v1, 0x3f317217, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_fma_f32 v2, 0x3f317217, v0, -v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_fmamk_f32 v2, v0, 0x3377d1cf, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_add_f32_e32 v1, v1, v2
; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 0x41b17218, vcc_lo
; GFX11-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v0|
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_sub_f32_e32 v0, v0, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX11-NEXT: s_setpc_b64 s[30:31]
%op = call bfloat @llvm.log.bf16(bfloat %a)
@@ -25234,14 +25233,14 @@ define bfloat @v_log2_bf16(bfloat %a) {
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: s_mov_b32 s4, 0x800000
-; GCN-NEXT: v_mov_b32_e32 v1, 0x42000000
+; GCN-NEXT: v_mov_b32_e32 v1, 0x4f800000
+; GCN-NEXT: v_mov_b32_e32 v2, 0x42000000
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; GCN-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GCN-NEXT: v_lshlrev_b32_e32 v2, 5, v2
-; GCN-NEXT: v_ldexp_f32_e32 v0, v0, v2
+; GCN-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc
+; GCN-NEXT: v_mul_f32_e32 v0, v0, v1
; GCN-NEXT: v_log_f32_e32 v0, v0
-; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc
; GCN-NEXT: v_sub_f32_e32 v0, v0, v1
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
@@ -25252,10 +25251,10 @@ define bfloat @v_log2_bf16(bfloat %a) {
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: s_mov_b32 s4, 0x800000
+; GFX7-NEXT: v_mov_b32_e32 v1, 0x4f800000
; GFX7-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; GFX7-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; GFX7-NEXT: v_lshlrev_b32_e32 v1, 5, v1
-; GFX7-NEXT: v_ldexp_f32_e32 v0, v0, v1
+; GFX7-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc
+; GFX7-NEXT: v_mul_f32_e32 v0, v0, v1
; GFX7-NEXT: v_log_f32_e32 v0, v0
; GFX7-NEXT: v_mov_b32_e32 v1, 0x42000000
; GFX7-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
@@ -25268,10 +25267,10 @@ define bfloat @v_log2_bf16(bfloat %a) {
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX8-NEXT: s_mov_b32 s4, 0x800000
+; GFX8-NEXT: v_mov_b32_e32 v1, 0x4f800000
; GFX8-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; GFX8-NEXT: v_lshlrev_b32_e32 v1, 5, v1
-; GFX8-NEXT: v_ldexp_f32 v0, v0, v1
+; GFX8-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc
+; GFX8-NEXT: v_mul_f32_e32 v0, v0, v1
; GFX8-NEXT: v_log_f32_e32 v0, v0
; GFX8-NEXT: v_mov_b32_e32 v1, 0x42000000
; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
@@ -25291,9 +25290,9 @@ define bfloat @v_log2_bf16(bfloat %a) {
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX9-NEXT: s_mov_b32 s4, 0x800000
; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX9-NEXT: v_lshlrev_b32_e32 v2, 5, v2
-; GFX9-NEXT: v_ldexp_f32 v0, v0, v2
+; GFX9-NEXT: v_mov_b32_e32 v2, 0x4f800000
+; GFX9-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc
+; GFX9-NEXT: v_mul_f32_e32 v0, v0, v2
; GFX9-NEXT: v_log_f32_e32 v0, v0
; GFX9-NEXT: v_mov_b32_e32 v1, 0x42000000
; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
@@ -25312,10 +25311,9 @@ define bfloat @v_log2_bf16(bfloat %a) {
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
-; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo
-; GFX10-NEXT: v_lshlrev_b32_e32 v2, 5, v2
-; GFX10-NEXT: v_ldexp_f32 v0, v0, v2
+; GFX10-NEXT: v_mul_f32_e32 v0, v0, v2
; GFX10-NEXT: v_log_f32_e32 v0, v0
; GFX10-NEXT: v_sub_f32_e32 v0, v0, v1
; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1
@@ -25332,21 +25330,20 @@ define bfloat @v_log2_bf16(bfloat %a) {
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
-; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo
+; GFX11-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, vcc_lo
; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, 5, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_ldexp_f32 v0, v0, v2
+; GFX11-NEXT: v_mul_f32_e32 v0, v0, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_log_f32_e32 v0, v0
; GFX11-NEXT: s_waitcnt_depctr 0xfff
; GFX11-NEXT: v_sub_f32_e32 v0, v0, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX11-NEXT: s_setpc_b64 s[30:31]
%op = call bfloat @llvm.log2.bf16(bfloat %a)
@@ -25359,26 +25356,26 @@ define bfloat @v_log10_bf16(bfloat %a) {
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: s_mov_b32 s4, 0x800000
+; GCN-NEXT: v_mov_b32_e32 v1, 0x4f800000
; GCN-NEXT: s_mov_b32 s5, 0x7f800000
-; GCN-NEXT: v_mov_b32_e32 v1, 0x411a209b
+; GCN-NEXT: v_mov_b32_e32 v2, 0x411a209b
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; GCN-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GCN-NEXT: v_lshlrev_b32_e32 v2, 5, v2
-; GCN-NEXT: v_ldexp_f32_e32 v0, v0, v2
+; GCN-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc
+; GCN-NEXT: v_mul_f32_e32 v0, v0, v1
; GCN-NEXT: v_log_f32_e32 v0, v0
-; GCN-NEXT: v_and_b32_e32 v2, 0xfffff000, v0
-; GCN-NEXT: v_sub_f32_e32 v3, v0, v2
-; GCN-NEXT: v_mul_f32_e32 v4, 0x369a84fb, v2
-; GCN-NEXT: v_mul_f32_e32 v2, 0x3e9a2000, v2
+; GCN-NEXT: v_and_b32_e32 v1, 0xfffff000, v0
+; GCN-NEXT: v_sub_f32_e32 v3, v0, v1
+; GCN-NEXT: v_mul_f32_e32 v4, 0x369a84fb, v1
+; GCN-NEXT: v_mul_f32_e32 v1, 0x3e9a2000, v1
; GCN-NEXT: v_mul_f32_e32 v5, 0x3e9a2000, v3
; GCN-NEXT: v_mul_f32_e32 v3, 0x369a84fb, v3
; GCN-NEXT: v_add_f32_e32 v3, v4, v3
; GCN-NEXT: v_add_f32_e32 v3, v5, v3
-; GCN-NEXT: v_add_f32_e32 v2, v2, v3
+; GCN-NEXT: v_add_f32_e32 v1, v1, v3
; GCN-NEXT: v_cmp_lt_f32_e64 s[4:5], |v0|, s5
-; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[4:5]
-; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[4:5]
+; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc
; GCN-NEXT: v_sub_f32_e32 v0, v0, v1
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
@@ -25389,10 +25386,10 @@ define bfloat @v_log10_bf16(bfloat %a) {
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: s_mov_b32 s4, 0x800000
+; GFX7-NEXT: v_mov_b32_e32 v1, 0x4f800000
; GFX7-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; GFX7-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; GFX7-NEXT: v_lshlrev_b32_e32 v1, 5, v1
-; GFX7-NEXT: v_ldexp_f32_e32 v0, v0, v1
+; GFX7-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc
+; GFX7-NEXT: v_mul_f32_e32 v0, v0, v1
; GFX7-NEXT: v_log_f32_e32 v0, v0
; GFX7-NEXT: s_mov_b32 s4, 0x3e9a209a
; GFX7-NEXT: v_mul_f32_e32 v1, 0x3e9a209a, v0
@@ -25414,10 +25411,10 @@ define bfloat @v_log10_bf16(bfloat %a) {
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX8-NEXT: s_mov_b32 s4, 0x800000
+; GFX8-NEXT: v_mov_b32_e32 v1, 0x4f800000
; GFX8-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; GFX8-NEXT: v_lshlrev_b32_e32 v1, 5, v1
-; GFX8-NEXT: v_ldexp_f32 v0, v0, v1
+; GFX8-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc
+; GFX8-NEXT: v_mul_f32_e32 v0, v0, v1
; GFX8-NEXT: v_log_f32_e32 v0, v0
; GFX8-NEXT: s_mov_b32 s4, 0x7f800000
; GFX8-NEXT: v_and_b32_e32 v1, 0xfffff000, v0
@@ -25448,10 +25445,10 @@ define bfloat @v_log10_bf16(bfloat %a) {
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX9-NEXT: s_mov_b32 s4, 0x800000
+; GFX9-NEXT: v_mov_b32_e32 v1, 0x4f800000
; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; GFX9-NEXT: v_lshlrev_b32_e32 v1, 5, v1
-; GFX9-NEXT: v_ldexp_f32 v0, v0, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc
+; GFX9-NEXT: v_mul_f32_e32 v0, v0, v1
; GFX9-NEXT: v_log_f32_e32 v0, v0
; GFX9-NEXT: s_mov_b32 s4, 0x3e9a209a
; GFX9-NEXT: v_mul_f32_e32 v1, 0x3e9a209a, v0
@@ -25479,9 +25476,8 @@ define bfloat @v_log10_bf16(bfloat %a) {
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
-; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo
-; GFX10-NEXT: v_lshlrev_b32_e32 v1, 5, v1
-; GFX10-NEXT: v_ldexp_f32 v0, v0, v1
+; GFX10-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo
+; GFX10-NEXT: v_mul_f32_e32 v0, v0, v1
; GFX10-NEXT: v_log_f32_e32 v0, v0
; GFX10-NEXT: v_mul_f32_e32 v1, 0x3e9a209a, v0
; GFX10-NEXT: v_fma_f32 v2, 0x3e9a209a, v0, -v1
@@ -25505,30 +25501,28 @@ define bfloat @v_log10_bf16(bfloat %a) {
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
-; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo
-; GFX11-NEXT: v_lshlrev_b32_e32 v1, 5, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_ldexp_f32 v0, v0, v1
+; GFX11-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo
+; GFX11-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_log_f32_e32 v0, v0
; GFX11-NEXT: s_waitcnt_depctr 0xfff
; GFX11-NEXT: v_mul_f32_e32 v1, 0x3e9a209a, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_fma_f32 v2, 0x3e9a209a, v0, -v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_fmamk_f32 v2, v0, 0x3284fbcf, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_add_f32_e32 v1, v1, v2
; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 0x411a209b, vcc_lo
; GFX11-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v0|
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_sub_f32_e32 v0, v0, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX11-NEXT: s_setpc_b64 s[30:31]
%op = call bfloat @llvm.log10.bf16(bfloat %a)
@@ -25725,14 +25719,14 @@ define bfloat @v_exp2_bf16(bfloat %a) {
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: s_mov_b32 s4, 0xc2fc0000
; GCN-NEXT: v_mov_b32_e32 v1, 0x42800000
-; GCN-NEXT: v_not_b32_e32 v2, 63
+; GCN-NEXT: v_mov_b32_e32 v2, 0x1f800000
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
; GCN-NEXT: v_add_f32_e32 v0, v0, v1
; GCN-NEXT: v_exp_f32_e32 v0, v0
-; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc
-; GCN-NEXT: v_ldexp_f32_e32 v0, v0, v1
+; GCN-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc
+; GCN-NEXT: v_mul_f32_e32 v0, v0, v1
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
;
@@ -25747,9 +25741,9 @@ define bfloat @v_exp2_bf16(bfloat %a) {
; GFX7-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
; GFX7-NEXT: v_add_f32_e32 v0, v0, v1
; GFX7-NEXT: v_exp_f32_e32 v0, v0
-; GFX7-NEXT: v_not_b32_e32 v1, 63
-; GFX7-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
-; GFX7-NEXT: v_ldexp_f32_e32 v0, v0, v1
+; GFX7-NEXT: v_mov_b32_e32 v1, 0x1f800000
+; GFX7-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc
+; GFX7-NEXT: v_mul_f32_e32 v0, v0, v1
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
@@ -25763,9 +25757,9 @@ define bfloat @v_exp2_bf16(bfloat %a) {
; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
; GFX8-NEXT: v_add_f32_e32 v0, v0, v1
; GFX8-NEXT: v_exp_f32_e32 v0, v0
-; GFX8-NEXT: v_not_b32_e32 v1, 63
-; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
-; GFX8-NEXT: v_ldexp_f32 v0, v0, v1
+; GFX8-NEXT: v_mov_b32_e32 v1, 0x1f800000
+; GFX8-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc
+; GFX8-NEXT: v_mul_f32_e32 v0, v0, v1
; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0
; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
@@ -25785,10 +25779,10 @@ define bfloat @v_exp2_bf16(bfloat %a) {
; GFX9-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
; GFX9-NEXT: v_add_f32_e32 v0, v0, v2
; GFX9-NEXT: v_exp_f32_e32 v0, v0
-; GFX9-NEXT: v_not_b32_e32 v1, 63
-; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX9-NEXT: v_mov_b32_e32 v1, 0x1f800000
+; GFX9-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc
; GFX9-NEXT: s_movk_i32 s4, 0x7fff
-; GFX9-NEXT: v_ldexp_f32 v0, v0, v1
+; GFX9-NEXT: v_mul_f32_e32 v0, v0, v1
; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0
@@ -25803,10 +25797,10 @@ define bfloat @v_exp2_bf16(bfloat %a) {
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0
; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 0x42800000, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 0xffffffc0, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x1f800000, vcc_lo
; GFX10-NEXT: v_add_f32_e32 v0, v0, v2
; GFX10-NEXT: v_exp_f32_e32 v0, v0
-; GFX10-NEXT: v_ldexp_f32 v0, v0, v1
+; GFX10-NEXT: v_mul_f32_e32 v0, v0, v1
; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
@@ -25822,12 +25816,12 @@ define bfloat @v_exp2_bf16(bfloat %a) {
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0
; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 0x42800000, vcc_lo
-; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 0xffffffc0, vcc_lo
+; GFX11-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x1f800000, vcc_lo
; GFX11-NEXT: v_add_f32_e32 v0, v0, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_exp_f32_e32 v0, v0
; GFX11-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-NEXT: v_ldexp_f32 v0, v0, v1
+; GFX11-NEXT: v_mul_f32_e32 v0, v0, v1
; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
@@ -28676,8 +28670,8 @@ define <2 x i16> @v_fptosi_v2bf16_to_v2i16(<2 x bfloat> %x) {
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11TRUE16-NEXT: v_cvt_i32_f32_e32 v1, v1
; GFX11TRUE16-NEXT: v_cvt_i32_f32_e32 v0, v0
-; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11TRUE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l
; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11FAKE16-LABEL: v_fptosi_v2bf16_to_v2i16:
@@ -28780,9 +28774,9 @@ define <3 x i16> @v_fptosi_v3bf16_to_v3i16(<3 x bfloat> %x) {
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11TRUE16-NEXT: v_cvt_i32_f32_e32 v2, v2
; GFX11TRUE16-NEXT: v_cvt_i32_f32_e32 v0, v0
-; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11TRUE16-NEXT: v_cvt_i32_f32_e32 v1, v1
-; GFX11TRUE16-NEXT: v_perm_b32 v0, v2, v0, 0x5040100
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.h, v2.l
; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11FAKE16-LABEL: v_fptosi_v3bf16_to_v3i16:
@@ -28903,18 +28897,18 @@ define <4 x i16> @v_fptosi_v4bf16_to_v4i16(<4 x bfloat> %x) {
; GFX11TRUE16: ; %bb.0:
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v0
-; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX11TRUE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11TRUE16-NEXT: v_cvt_i32_f32_e32 v2, v2
-; GFX11TRUE16-NEXT: v_cvt_i32_f32_e32 v0, v0
-; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11TRUE16-NEXT: v_cvt_i32_f32_e32 v3, v3
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11TRUE16-NEXT: v_cvt_i32_f32_e32 v1, v1
-; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11TRUE16-NEXT: v_perm_b32 v0, v2, v0, 0x5040100
-; GFX11TRUE16-NEXT: v_perm_b32 v1, v3, v1, 0x5040100
+; GFX11TRUE16-NEXT: v_cvt_i32_f32_e32 v0, v0
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.h, v2.l
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.h, v3.l
; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11FAKE16-LABEL: v_fptosi_v4bf16_to_v4i16:
@@ -30486,8 +30480,8 @@ define <3 x bfloat> @v_sitofp_v3i16_to_v3bf16(<3 x i16> %x) {
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo
; GFX11TRUE16-NEXT: v_perm_b32 v0, v0, v2, 0x7060302
-; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11TRUE16-NEXT: v_alignbit_b32 v1, v0, v1, 16
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11FAKE16-LABEL: v_sitofp_v3i16_to_v3bf16:
@@ -31007,7 +31001,7 @@ define <3 x bfloat> @v_sitofp_v3i32_to_v3bf16(<3 x i32> %x) {
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11TRUE16-NEXT: v_perm_b32 v0, v1, v0, 0x7060302
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v2, v4, v6, vcc_lo
-; GFX11TRUE16-NEXT: v_alignbit_b32 v1, v0, v2, 16
+; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v2
; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11FAKE16-LABEL: v_sitofp_v3i32_to_v3bf16:
@@ -31927,7 +31921,7 @@ define <3 x bfloat> @v_sitofp_v3i64_to_v3bf16(<3 x i64> %x) {
; GFX11TRUE16-NEXT: v_perm_b32 v0, v2, v0, 0x7060302
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11TRUE16-NEXT: v_alignbit_b32 v1, v0, v1, 16
+; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11FAKE16-LABEL: v_sitofp_v3i64_to_v3bf16:
@@ -32761,7 +32755,7 @@ define <3 x bfloat> @v_uitofp_v3i16_to_v3bf16(<3 x i16> %x) {
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11TRUE16-NEXT: v_perm_b32 v0, v0, v2, 0x7060302
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo
-; GFX11TRUE16-NEXT: v_alignbit_b32 v1, v0, v1, 16
+; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11FAKE16-LABEL: v_uitofp_v3i16_to_v3bf16:
@@ -33284,7 +33278,7 @@ define <3 x bfloat> @v_uitofp_v3i32_to_v3bf16(<3 x i32> %x) {
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11TRUE16-NEXT: v_perm_b32 v0, v1, v0, 0x7060302
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v2, v4, v6, vcc_lo
-; GFX11TRUE16-NEXT: v_alignbit_b32 v1, v0, v2, 16
+; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v2
; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11FAKE16-LABEL: v_uitofp_v3i32_to_v3bf16:
@@ -34053,8 +34047,8 @@ define <3 x bfloat> @v_uitofp_v3i64_to_v3bf16(<3 x i64> %x) {
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo
; GFX11TRUE16-NEXT: v_perm_b32 v0, v2, v0, 0x7060302
-; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11TRUE16-NEXT: v_alignbit_b32 v1, v0, v1, 16
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11FAKE16-LABEL: v_uitofp_v3i64_to_v3bf16:
@@ -34506,25 +34500,14 @@ define bfloat @v_select_bf16(i1 %cond, bfloat %a, bfloat %b) {
; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11TRUE16-LABEL: v_select_bf16:
-; GFX11TRUE16: ; %bb.0:
-; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11TRUE16-NEXT: v_and_b32_e32 v3, 1, v0
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l
-; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3
-; GFX11TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v0.h, vcc_lo
-; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11FAKE16-LABEL: v_select_bf16:
-; GFX11FAKE16: ; %bb.0:
-; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11FAKE16-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo
-; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-LABEL: v_select_bf16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
+; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo
+; GFX11-NEXT: s_setpc_b64 s[30:31]
%op = select i1 %cond, bfloat %a, bfloat %b
ret bfloat %op
}
@@ -34582,14 +34565,11 @@ define bfloat @v_select_fneg_lhs_bf16(i1 %cond, bfloat %a, bfloat %b) {
; GFX11TRUE16-LABEL: v_select_fneg_lhs_bf16:
; GFX11TRUE16: ; %bb.0:
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11TRUE16-NEXT: v_and_b32_e32 v3, 1, v0
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.l
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.h, v2.l
-; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3
-; GFX11TRUE16-NEXT: v_xor_b16 v0.l, 0x8000, v0.l
-; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11TRUE16-NEXT: v_cndmask_b16 v0.l, v0.h, v0.l, vcc_lo
+; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX11TRUE16-NEXT: v_xor_b16 v1.l, 0x8000, v1.l
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
+; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo
; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11FAKE16-LABEL: v_select_fneg_lhs_bf16:
@@ -34659,14 +34639,11 @@ define bfloat @v_select_fneg_rhs_bf16(i1 %cond, bfloat %a, bfloat %b) {
; GFX11TRUE16-LABEL: v_select_fneg_rhs_bf16:
; GFX11TRUE16: ; %bb.0:
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11TRUE16-NEXT: v_and_b32_e32 v3, 1, v0
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l
-; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3
-; GFX11TRUE16-NEXT: v_xor_b16 v0.l, 0x8000, v0.l
-; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v0.h, vcc_lo
+; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX11TRUE16-NEXT: v_xor_b16 v2.l, 0x8000, v2.l
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
+; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo
; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11FAKE16-LABEL: v_select_fneg_rhs_bf16:
@@ -34764,17 +34741,13 @@ define <2 x bfloat> @v_select_v2bf16(i1 %cond, <2 x bfloat> %a, <2 x bfloat> %b)
; GFX11TRUE16: ; %bb.0:
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v1
-; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v2
+; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v2
+; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v1
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX11TRUE16-NEXT: v_cndmask_b16 v0.l, v4.l, v3.l, vcc_lo
-; GFX11TRUE16-NEXT: v_cndmask_b16 v0.h, v2.l, v1.l, vcc_lo
-; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
+; GFX11TRUE16-NEXT: v_dual_cndmask_b32 v3, v3, v4 :: v_dual_cndmask_b32 v0, v2, v1
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11TRUE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.h, v3.l
; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11FAKE16-LABEL: v_select_v2bf16:
@@ -34875,21 +34848,17 @@ define <2 x bfloat> @v_vselect_v2bf16(<2 x i1> %cond, <2 x bfloat> %a, <2 x bflo
; GFX11TRUE16-LABEL: v_vselect_v2bf16:
; GFX11TRUE16: ; %bb.0:
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11TRUE16-NEXT: v_and_b32_e32 v1, 1, v1
+; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v3
+; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v5, 16, v2
; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v2
-; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v5, 16, v3
-; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
+; GFX11TRUE16-NEXT: v_and_b32_e32 v1, 1, v1
+; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s0, 1, v0
-; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11TRUE16-NEXT: v_cndmask_b16 v0.l, v5.l, v4.l, vcc_lo
-; GFX11TRUE16-NEXT: v_cndmask_b16 v0.h, v3.l, v2.l, s0
-; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
+; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v1, v4, v5, vcc_lo
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11TRUE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l
; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11FAKE16-LABEL: v_vselect_v2bf16:
@@ -34960,27 +34929,16 @@ define amdgpu_ps i32 @s_select_bf16(bfloat inreg %a, bfloat inreg %b, i32 %c) {
; GFX10-NEXT: v_readfirstlane_b32 s0, v0
; GFX10-NEXT: ; return to shader part epilog
;
-; GFX11TRUE16-LABEL: s_select_bf16:
-; GFX11TRUE16: ; %bb.0:
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, s0
-; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11TRUE16-NEXT: v_cndmask_b16 v0.l, s1, v0.l, vcc_lo
-; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11TRUE16-NEXT: v_readfirstlane_b32 s0, v0
-; GFX11TRUE16-NEXT: ; return to shader part epilog
-;
-; GFX11FAKE16-LABEL: s_select_bf16:
-; GFX11FAKE16: ; %bb.0:
-; GFX11FAKE16-NEXT: v_mov_b32_e32 v1, s0
-; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, s1, v1, vcc_lo
-; GFX11FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11FAKE16-NEXT: v_readfirstlane_b32 s0, v0
-; GFX11FAKE16-NEXT: ; return to shader part epilog
+; GFX11-LABEL: s_select_bf16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: v_mov_b32_e32 v1, s0
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_cndmask_b32_e32 v0, s1, v1, vcc_lo
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_readfirstlane_b32 s0, v0
+; GFX11-NEXT: ; return to shader part epilog
%cond = icmp eq i32 %c, 0
%op = select i1 %cond, bfloat %a, bfloat %b
%cast = bitcast bfloat %op to i16
@@ -35073,22 +35031,18 @@ define amdgpu_ps i32 @s_select_v2bf16(<2 x bfloat> inreg %a, <2 x bfloat> inreg
;
; GFX11TRUE16-LABEL: s_select_v2bf16:
; GFX11TRUE16: ; %bb.0:
-; GFX11TRUE16-NEXT: s_lshr_b32 s2, s0, 16
-; GFX11TRUE16-NEXT: s_lshr_b32 s3, s1, 16
+; GFX11TRUE16-NEXT: s_lshr_b32 s2, s1, 16
+; GFX11TRUE16-NEXT: s_lshr_b32 s3, s0, 16
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, s2
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.l, s3
; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, s3
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.h, s2
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, s1
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.h, s0
-; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v0.h, vcc_lo
-; GFX11TRUE16-NEXT: v_cndmask_b16 v0.h, v1.l, v1.h, vcc_lo
-; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v3.l, s1
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v4.l, s0
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11TRUE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
-; GFX11TRUE16-NEXT: v_readfirstlane_b32 s0, v0
+; GFX11TRUE16-NEXT: v_dual_cndmask_b32 v0, v1, v2 :: v_dual_cndmask_b32 v1, v3, v4
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.h, v0.l
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11TRUE16-NEXT: v_readfirstlane_b32 s0, v1
; GFX11TRUE16-NEXT: ; return to shader part epilog
;
; GFX11FAKE16-LABEL: s_select_v2bf16:
@@ -35195,22 +35149,19 @@ define amdgpu_ps i32 @s_vselect_v2bf16(<2 x bfloat> inreg %a, <2 x bfloat> inreg
;
; GFX11TRUE16-LABEL: s_vselect_v2bf16:
; GFX11TRUE16: ; %bb.0:
-; GFX11TRUE16-NEXT: s_lshr_b32 s3, s1, 16
-; GFX11TRUE16-NEXT: s_lshr_b32 s4, s0, 16
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.l, s1
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v3.l, s0
; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s2, 0, v1
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, s3
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.h, s4
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, s1
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.h, s0
-; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v0.h, s2
-; GFX11TRUE16-NEXT: v_cndmask_b16 v0.h, v1.l, v1.h, vcc_lo
-; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
-; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11TRUE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
+; GFX11TRUE16-NEXT: s_lshr_b32 s2, s0, 16
+; GFX11TRUE16-NEXT: s_lshr_b32 s3, s1, 16
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v5.l, s2
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v4.l, s3
+; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo
+; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v1, v4, v5, vcc_lo
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11TRUE16-NEXT: v_readfirstlane_b32 s0, v0
; GFX11TRUE16-NEXT: ; return to shader part epilog
;
@@ -35699,81 +35650,81 @@ define <16 x bfloat> @v_select_v16bf16(i1 %cond, <16 x bfloat> %a, <16 x bfloat>
; GCN-LABEL: v_select_v16bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GCN-NEXT: v_and_b32_e32 v0, 1, v0
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v2
; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GCN-NEXT: v_alignbit_b32 v1, v2, v1, 16
-; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v18
-; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17
-; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GCN-NEXT: v_alignbit_b32 v2, v2, v17, 16
-; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_alignbit_b32 v0, v0, v1, 16
+; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v18
+; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v17
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v4
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GCN-NEXT: v_alignbit_b32 v3, v4, v3, 16
-; GCN-NEXT: v_and_b32_e32 v0, 1, v0
-; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v20
-; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v19
-; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
+; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GCN-NEXT: v_alignbit_b32 v2, v2, v3, 16
+; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v20
+; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v19
+; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_alignbit_b32 v3, v3, v4, 16
+; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v6
; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
-; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v22
-; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v21
-; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8
+; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GCN-NEXT: v_alignbit_b32 v4, v4, v5, 16
+; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v22
+; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v21
+; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GCN-NEXT: v_alignbit_b32 v5, v5, v6, 16
+; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v8
; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
-; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v24
-; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v23
-; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10
+; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GCN-NEXT: v_alignbit_b32 v6, v6, v7, 16
+; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v24
+; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v23
+; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GCN-NEXT: v_alignbit_b32 v7, v7, v8, 16
+; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v10
; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
-; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v26
-; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v25
+; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; GCN-NEXT: v_alignbit_b32 v8, v8, v9, 16
+; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v26
+; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v25
; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12
; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11
-; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v28
-; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v27
+; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v28
+; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v27
; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14
; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13
-; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v30
-; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v29
+; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v30
+; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v29
; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16
; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15
-; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6
-; GCN-NEXT: v_alignbit_b32 v4, v4, v17, 16
-; GCN-NEXT: v_alignbit_b32 v5, v6, v5, 16
-; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:4
-; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32
-; GCN-NEXT: v_lshrrev_b32_e32 v18, 16, v18
-; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8
-; GCN-NEXT: v_lshrrev_b32_e32 v20, 16, v20
-; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v10
-; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v22
+; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GCN-NEXT: v_alignbit_b32 v9, v9, v10, 16
+; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:4
; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v12
-; GCN-NEXT: v_lshrrev_b32_e32 v24, 16, v24
+; GCN-NEXT: v_alignbit_b32 v11, v12, v11, 16
+; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32
+; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v17
; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v14
-; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v26
+; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v19
; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v16
-; GCN-NEXT: v_alignbit_b32 v18, v18, v19, 16
-; GCN-NEXT: v_alignbit_b32 v7, v8, v7, 16
-; GCN-NEXT: v_alignbit_b32 v8, v20, v21, 16
-; GCN-NEXT: v_alignbit_b32 v9, v10, v9, 16
-; GCN-NEXT: v_alignbit_b32 v10, v22, v23, 16
-; GCN-NEXT: v_alignbit_b32 v11, v12, v11, 16
-; GCN-NEXT: v_alignbit_b32 v12, v24, v25, 16
+; GCN-NEXT: v_alignbit_b32 v17, v17, v18, 16
; GCN-NEXT: v_alignbit_b32 v13, v14, v13, 16
-; GCN-NEXT: v_alignbit_b32 v14, v26, v27, 16
+; GCN-NEXT: v_alignbit_b32 v14, v19, v20, 16
; GCN-NEXT: v_alignbit_b32 v15, v16, v15, 16
-; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
; GCN-NEXT: v_cndmask_b32_e32 v13, v14, v13, vcc
-; GCN-NEXT: v_cndmask_b32_e32 v11, v12, v11, vcc
-; GCN-NEXT: v_cndmask_b32_e32 v9, v10, v9, vcc
-; GCN-NEXT: v_cndmask_b32_e32 v7, v8, v7, vcc
-; GCN-NEXT: v_cndmask_b32_e32 v5, v18, v5, vcc
-; GCN-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc
-; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
+; GCN-NEXT: v_cndmask_b32_e32 v11, v17, v11, vcc
+; GCN-NEXT: v_cndmask_b32_e32 v9, v9, v8, vcc
+; GCN-NEXT: v_cndmask_b32_e32 v7, v7, v6, vcc
+; GCN-NEXT: v_cndmask_b32_e32 v5, v5, v4, vcc
+; GCN-NEXT: v_cndmask_b32_e32 v3, v3, v2, vcc
+; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc
; GCN-NEXT: s_waitcnt vmcnt(1)
-; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v6
+; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v10
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v17
+; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v12
; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v1
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v3
@@ -35806,67 +35757,67 @@ define <16 x bfloat> @v_select_v16bf16(i1 %cond, <16 x bfloat> %a, <16 x bfloat>
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v18
; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4
; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17
; GFX7-NEXT: v_alignbit_b32 v3, v4, v3, 16
; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v20
+; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8
; GFX7-NEXT: v_alignbit_b32 v2, v2, v17, 16
; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4
; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v19
-; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
-; GFX7-NEXT: v_alignbit_b32 v4, v4, v17, 16
-; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6
-; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
-; GFX7-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:4
; GFX7-NEXT: v_alignbit_b32 v5, v6, v5, 16
-; GFX7-NEXT: buffer_load_dword v6, off, s[0:3], s32
-; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8
-; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v22
+; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v22
; GFX7-NEXT: v_lshrrev_b32_e32 v8, 16, v8
; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10
-; GFX7-NEXT: v_lshrrev_b32_e32 v18, 16, v18
-; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v21
+; GFX7-NEXT: v_alignbit_b32 v4, v4, v17, 16
+; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v21
; GFX7-NEXT: v_alignbit_b32 v7, v8, v7, 16
; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v24
; GFX7-NEXT: v_lshrrev_b32_e32 v10, 16, v10
; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9
-; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12
-; GFX7-NEXT: v_alignbit_b32 v18, v18, v19, 16
+; GFX7-NEXT: v_alignbit_b32 v6, v6, v17, 16
; GFX7-NEXT: v_lshrrev_b32_e32 v8, 16, v8
-; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v23
+; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v23
; GFX7-NEXT: v_alignbit_b32 v9, v10, v9, 16
; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v26
+; GFX7-NEXT: v_alignbit_b32 v8, v8, v17, 16
+; GFX7-NEXT: v_lshrrev_b32_e32 v10, 16, v10
+; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v25
+; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12
+; GFX7-NEXT: v_alignbit_b32 v10, v10, v17, 16
; GFX7-NEXT: v_lshrrev_b32_e32 v12, 16, v12
; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11
-; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14
-; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16
-; GFX7-NEXT: v_alignbit_b32 v8, v8, v19, 16
-; GFX7-NEXT: v_lshrrev_b32_e32 v10, 16, v10
-; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v25
+; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v28
; GFX7-NEXT: v_alignbit_b32 v11, v12, v11, 16
-; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v28
+; GFX7-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:4
+; GFX7-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v27
+; GFX7-NEXT: v_alignbit_b32 v17, v17, v18, 16
+; GFX7-NEXT: buffer_load_dword v18, off, s[0:3], s32
+; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14
; GFX7-NEXT: v_lshrrev_b32_e32 v14, 16, v14
; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13
-; GFX7-NEXT: v_lshrrev_b32_e32 v16, 16, v16
-; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15
-; GFX7-NEXT: v_alignbit_b32 v10, v10, v19, 16
-; GFX7-NEXT: v_lshrrev_b32_e32 v12, 16, v12
-; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v27
+; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16
; GFX7-NEXT: v_alignbit_b32 v13, v14, v13, 16
; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v30
-; GFX7-NEXT: v_alignbit_b32 v15, v16, v15, 16
-; GFX7-NEXT: v_alignbit_b32 v12, v12, v19, 16
+; GFX7-NEXT: v_lshrrev_b32_e32 v16, 16, v16
+; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15
; GFX7-NEXT: v_lshrrev_b32_e32 v14, 16, v14
; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v29
+; GFX7-NEXT: v_alignbit_b32 v15, v16, v15, 16
; GFX7-NEXT: v_and_b32_e32 v0, 1, v0
; GFX7-NEXT: v_alignbit_b32 v14, v14, v19, 16
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
; GFX7-NEXT: v_cndmask_b32_e32 v13, v14, v13, vcc
-; GFX7-NEXT: v_cndmask_b32_e32 v11, v12, v11, vcc
+; GFX7-NEXT: v_cndmask_b32_e32 v11, v17, v11, vcc
; GFX7-NEXT: v_cndmask_b32_e32 v9, v10, v9, vcc
; GFX7-NEXT: v_cndmask_b32_e32 v7, v8, v7, vcc
-; GFX7-NEXT: v_cndmask_b32_e32 v5, v18, v5, vcc
+; GFX7-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc
; GFX7-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc
; GFX7-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v1
@@ -35875,21 +35826,21 @@ define <16 x bfloat> @v_select_v16bf16(i1 %cond, <16 x bfloat> %a, <16 x bfloat>
; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v5
; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v7
+; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
; GFX7-NEXT: v_lshlrev_b32_e32 v8, 16, v9
; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
; GFX7-NEXT: v_lshlrev_b32_e32 v10, 16, v11
; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
-; GFX7-NEXT: v_lshlrev_b32_e32 v12, 16, v13
-; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
; GFX7-NEXT: s_waitcnt vmcnt(1)
-; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v17
-; GFX7-NEXT: v_lshrrev_b32_e32 v16, 16, v16
+; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12
+; GFX7-NEXT: v_lshrrev_b32_e32 v12, 16, v12
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
-; GFX7-NEXT: v_alignbit_b32 v6, v16, v6, 16
-; GFX7-NEXT: v_cndmask_b32_e32 v15, v6, v15, vcc
-; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v7
-; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v18
+; GFX7-NEXT: v_alignbit_b32 v12, v12, v16, 16
+; GFX7-NEXT: v_cndmask_b32_e32 v15, v12, v15, vcc
+; GFX7-NEXT: v_lshlrev_b32_e32 v12, 16, v13
+; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
; GFX7-NEXT: v_lshlrev_b32_e32 v14, 16, v15
; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
; GFX7-NEXT: s_setpc_b64 s[30:31]
@@ -36918,38 +36869,32 @@ define amdgpu_ps <2 x i32> @s_vselect_v4bf16(<4 x bfloat> inreg %a, <4 x bfloat>
;
; GFX11TRUE16-LABEL: s_vselect_v4bf16:
; GFX11TRUE16: ; %bb.0:
-; GFX11TRUE16-NEXT: s_lshr_b32 s7, s3, 16
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v6.l, s3
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v7.l, s1
+; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2
+; GFX11TRUE16-NEXT: s_lshr_b32 s4, s3, 16
+; GFX11TRUE16-NEXT: s_lshr_b32 s5, s1, 16
+; GFX11TRUE16-NEXT: s_lshr_b32 s1, s2, 16
+; GFX11TRUE16-NEXT: s_lshr_b32 s3, s0, 16
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v8.l, s1
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v9.l, s3
+; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v2, v6, v7, vcc_lo
+; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v10.l, s2
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v11.l, s0
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v4.l, s4
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v5.l, s5
+; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s4, 0, v1
-; GFX11TRUE16-NEXT: s_lshr_b32 s8, s1, 16
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, s7
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, s3
-; GFX11TRUE16-NEXT: s_lshr_b32 s3, s2, 16
-; GFX11TRUE16-NEXT: s_lshr_b32 s7, s0, 16
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s5, 0, v2
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s6, 0, v3
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.h, s8
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.h, s3
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.l, s7
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.h, s2
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v3.l, s0
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v3.h, s1
-; GFX11TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v0.h, s6
-; GFX11TRUE16-NEXT: v_cndmask_b16 v0.h, v1.h, v2.l, s4
-; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11TRUE16-NEXT: v_cndmask_b16 v1.h, v2.h, v3.l, vcc_lo
-; GFX11TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v3.h, s5
-; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
-; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v3.l, v1.h
-; GFX11TRUE16-NEXT: v_perm_b32 v1, v2, v1, 0x5040100
-; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11TRUE16-NEXT: v_perm_b32 v0, v0, v3, 0x5040100
-; GFX11TRUE16-NEXT: v_readfirstlane_b32 s1, v1
-; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v10, v11, vcc_lo
+; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l
+; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc_lo
; GFX11TRUE16-NEXT: v_readfirstlane_b32 s0, v0
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.h, v3.l
+; GFX11TRUE16-NEXT: v_readfirstlane_b32 s1, v2
; GFX11TRUE16-NEXT: ; return to shader part epilog
;
; GFX11FAKE16-LABEL: s_vselect_v4bf16:
@@ -37125,33 +37070,28 @@ define <4 x bfloat> @v_vselect_v4bf16(<4 x i1> %cond, <4 x bfloat> %a, <4 x bflo
; GFX11TRUE16-LABEL: v_vselect_v4bf16:
; GFX11TRUE16: ; %bb.0:
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v8, 16, v7
+; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v9, 16, v5
+; GFX11TRUE16-NEXT: v_and_b32_e32 v2, 1, v2
+; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v10, 16, v6
+; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v11, 16, v4
; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX11TRUE16-NEXT: v_and_b32_e32 v1, 1, v1
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2
; GFX11TRUE16-NEXT: v_and_b32_e32 v3, 1, v3
-; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v8, 16, v7
-; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s0, 1, v0
-; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 1, v2
+; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v5, vcc_lo
+; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
+; GFX11TRUE16-NEXT: v_and_b32_e32 v1, 1, v1
+; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v6, v4, vcc_lo
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s1, 1, v3
-; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v4
-; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v6
-; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v5
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s2, 1, v0
-; GFX11TRUE16-NEXT: v_cndmask_b16 v0.h, v6.l, v4.l, s0
-; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11TRUE16-NEXT: v_cndmask_b16 v0.l, v2.l, v1.l, vcc_lo
-; GFX11TRUE16-NEXT: v_cndmask_b16 v1.l, v8.l, v3.l, s1
-; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11TRUE16-NEXT: v_cndmask_b16 v1.h, v7.l, v5.l, s2
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
-; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v3.l, v1.l
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h
-; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11TRUE16-NEXT: v_perm_b32 v0, v2, v0, 0x5040100
-; GFX11TRUE16-NEXT: v_perm_b32 v1, v3, v1, 0x5040100
+; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v1, v10, v11, vcc_lo
+; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l
+; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc_lo
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.l
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.h, v3.l
; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11FAKE16-LABEL: v_vselect_v4bf16:
@@ -37185,30 +37125,30 @@ define <8 x bfloat> @v_vselect_v8bf16(<8 x i1> %cond, <8 x bfloat> %a, <8 x bflo
; GCN-LABEL: v_vselect_v8bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_and_b32_e32 v7, 1, v7
-; GCN-NEXT: v_and_b32_e32 v6, 1, v6
-; GCN-NEXT: v_and_b32_e32 v5, 1, v5
-; GCN-NEXT: v_and_b32_e32 v4, 1, v4
-; GCN-NEXT: v_and_b32_e32 v3, 1, v3
-; GCN-NEXT: v_and_b32_e32 v2, 1, v2
-; GCN-NEXT: v_and_b32_e32 v1, 1, v1
-; GCN-NEXT: v_and_b32_e32 v0, 1, v0
-; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15
-; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23
-; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14
-; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22
-; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13
-; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21
; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8
; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16
+; GCN-NEXT: v_and_b32_e32 v0, 1, v0
; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17
+; GCN-NEXT: v_and_b32_e32 v1, 1, v1
; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10
; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18
+; GCN-NEXT: v_and_b32_e32 v2, 1, v2
; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11
; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19
+; GCN-NEXT: v_and_b32_e32 v3, 1, v3
; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12
; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20
+; GCN-NEXT: v_and_b32_e32 v4, 1, v4
+; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13
+; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21
+; GCN-NEXT: v_and_b32_e32 v5, 1, v5
+; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14
+; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22
+; GCN-NEXT: v_and_b32_e32 v6, 1, v6
+; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15
+; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23
+; GCN-NEXT: v_and_b32_e32 v7, 1, v7
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7
; GCN-NEXT: v_cndmask_b32_e32 v7, v23, v15, vcc
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6
@@ -37239,45 +37179,45 @@ define <8 x bfloat> @v_vselect_v8bf16(<8 x i1> %cond, <8 x bfloat> %a, <8 x bflo
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_and_b32_e32 v7, 1, v7
+; GFX7-NEXT: v_and_b32_e32 v6, 1, v6
; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15
; GFX7-NEXT: v_mul_f32_e32 v23, 1.0, v23
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7
-; GFX7-NEXT: v_and_b32_e32 v6, 1, v6
-; GFX7-NEXT: v_cndmask_b32_e32 v7, v23, v15, vcc
+; GFX7-NEXT: v_and_b32_e32 v5, 1, v5
; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14
-; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v22
+; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22
+; GFX7-NEXT: v_cndmask_b32_e32 v7, v23, v15, vcc
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6
-; GFX7-NEXT: v_and_b32_e32 v5, 1, v5
-; GFX7-NEXT: v_cndmask_b32_e32 v6, v15, v14, vcc
+; GFX7-NEXT: v_and_b32_e32 v4, 1, v4
; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13
-; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v21
+; GFX7-NEXT: v_mul_f32_e32 v21, 1.0, v21
+; GFX7-NEXT: v_cndmask_b32_e32 v6, v22, v14, vcc
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v5
-; GFX7-NEXT: v_and_b32_e32 v4, 1, v4
-; GFX7-NEXT: v_cndmask_b32_e32 v5, v14, v13, vcc
+; GFX7-NEXT: v_and_b32_e32 v3, 1, v3
; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12
-; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v20
+; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v20
+; GFX7-NEXT: v_cndmask_b32_e32 v5, v21, v13, vcc
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4
-; GFX7-NEXT: v_and_b32_e32 v3, 1, v3
-; GFX7-NEXT: v_cndmask_b32_e32 v4, v13, v12, vcc
+; GFX7-NEXT: v_and_b32_e32 v2, 1, v2
; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11
-; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v19
+; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19
+; GFX7-NEXT: v_cndmask_b32_e32 v4, v20, v12, vcc
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3
-; GFX7-NEXT: v_and_b32_e32 v2, 1, v2
-; GFX7-NEXT: v_cndmask_b32_e32 v3, v12, v11, vcc
; GFX7-NEXT: v_and_b32_e32 v1, 1, v1
; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10
-; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v18
+; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18
+; GFX7-NEXT: v_cndmask_b32_e32 v3, v19, v11, vcc
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2
; GFX7-NEXT: v_and_b32_e32 v0, 1, v0
; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9
-; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v17
-; GFX7-NEXT: v_cndmask_b32_e32 v2, v13, v10, vcc
+; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17
+; GFX7-NEXT: v_cndmask_b32_e32 v2, v18, v10, vcc
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8
-; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v16
-; GFX7-NEXT: v_cndmask_b32_e32 v1, v12, v9, vcc
+; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16
+; GFX7-NEXT: v_cndmask_b32_e32 v1, v17, v9, vcc
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX7-NEXT: v_cndmask_b32_e32 v0, v11, v8, vcc
+; GFX7-NEXT: v_cndmask_b32_e32 v0, v16, v8, vcc
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
@@ -37419,51 +37359,52 @@ define <8 x bfloat> @v_vselect_v8bf16(<8 x i1> %cond, <8 x bfloat> %a, <8 x bflo
; GFX11TRUE16-LABEL: v_vselect_v8bf16:
; GFX11TRUE16: ; %bb.0:
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11TRUE16-NEXT: v_and_b32_e32 v1, 1, v1
+; GFX11TRUE16-NEXT: v_and_b32_e32 v7, 1, v7
+; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v16, 16, v15
+; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v17, 16, v11
+; GFX11TRUE16-NEXT: v_and_b32_e32 v6, 1, v6
+; GFX11TRUE16-NEXT: v_and_b32_e32 v5, 1, v5
+; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v7
; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v18, 16, v14
+; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v19, 16, v10
; GFX11TRUE16-NEXT: v_and_b32_e32 v3, 1, v3
+; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v7, v16, v17, vcc_lo
+; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v6
+; GFX11TRUE16-NEXT: v_and_b32_e32 v1, 1, v1
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v16.l, v18.l
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v17.l, v19.l
+; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v6, v15, v11, vcc_lo
+; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v5
; GFX11TRUE16-NEXT: v_and_b32_e32 v2, 1, v2
+; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v11, 16, v13
+; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v15, 16, v9
+; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v5, v16, v17, vcc_lo
+; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v16, 16, v12
+; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v17, 16, v8
+; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3
; GFX11TRUE16-NEXT: v_and_b32_e32 v4, 1, v4
+; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v3, v11, v15, vcc_lo
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v11.l, v13.l
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v13.l, v16.l
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v15.l, v17.l
; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s0, 1, v0
-; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 1, v7
-; GFX11TRUE16-NEXT: v_and_b32_e32 v1, 1, v6
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s1, 1, v3
-; GFX11TRUE16-NEXT: v_and_b32_e32 v3, 1, v5
-; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v5, 16, v15
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s2, 1, v0
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s3, 1, v1
-; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v11
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s4, 1, v4
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s5, 1, v2
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s6, 1, v3
-; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v8
-; GFX11TRUE16-NEXT: v_cndmask_b16 v0.h, v5.l, v1.l, s2
-; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v12
-; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v9
-; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v5, 16, v13
-; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v10
-; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v7, 16, v14
-; GFX11TRUE16-NEXT: v_cndmask_b16 v0.l, v15.l, v11.l, s3
-; GFX11TRUE16-NEXT: v_cndmask_b16 v1.l, v14.l, v10.l, s4
-; GFX11TRUE16-NEXT: v_cndmask_b16 v1.h, v3.l, v2.l, vcc_lo
-; GFX11TRUE16-NEXT: v_cndmask_b16 v2.l, v12.l, v8.l, s0
-; GFX11TRUE16-NEXT: v_cndmask_b16 v2.h, v5.l, v4.l, s1
-; GFX11TRUE16-NEXT: v_cndmask_b16 v3.l, v13.l, v9.l, s5
-; GFX11TRUE16-NEXT: v_cndmask_b16 v3.h, v7.l, v6.l, s6
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v4.l, v1.h
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v5.l, v2.l
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v6.l, v3.l
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v7.l, v1.l
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v8.l, v0.h
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v9.l, v0.l
-; GFX11TRUE16-NEXT: v_perm_b32 v0, v4, v5, 0x5040100
-; GFX11TRUE16-NEXT: v_perm_b32 v1, v2, v6, 0x5040100
-; GFX11TRUE16-NEXT: v_perm_b32 v2, v3, v7, 0x5040100
-; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11TRUE16-NEXT: v_perm_b32 v3, v8, v9, 0x5040100
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v3.h, v7.l
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v1, v13, v15, vcc_lo
+; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.h, v3.l
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v3.l, v6.l
+; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v12, v8, vcc_lo
+; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l
+; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v2, v11, v9, vcc_lo
+; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v4
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.h, v5.l
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.l
+; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v4, v14, v10, vcc_lo
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.l, v4.l
; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11FAKE16-LABEL: v_vselect_v8bf16:
@@ -37546,16 +37487,16 @@ define <16 x bfloat> @v_vselect_v16bf16(<16 x i1> %cond, <16 x bfloat> %a, <16 x
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v16
; GCN-NEXT: v_and_b32_e32 v1, 1, v10
; GCN-NEXT: v_cmp_eq_u32_e64 s[22:23], 1, v1
-; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:4
+; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:4
; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v17
-; GCN-NEXT: v_and_b32_e32 v2, 1, v11
-; GCN-NEXT: v_cmp_eq_u32_e64 s[24:25], 1, v2
-; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:8
-; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v18
-; GCN-NEXT: v_and_b32_e32 v3, 1, v12
-; GCN-NEXT: v_cmp_eq_u32_e64 s[26:27], 1, v3
+; GCN-NEXT: v_and_b32_e32 v3, 1, v11
+; GCN-NEXT: v_cmp_eq_u32_e64 s[24:25], 1, v3
+; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8
+; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v18
+; GCN-NEXT: v_and_b32_e32 v5, 1, v12
+; GCN-NEXT: v_cmp_eq_u32_e64 s[26:27], 1, v5
; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:12
-; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v19
+; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v19
; GCN-NEXT: v_and_b32_e32 v7, 1, v13
; GCN-NEXT: v_and_b32_e32 v8, 1, v14
; GCN-NEXT: v_cmp_eq_u32_e64 s[28:29], 1, v7
@@ -37622,22 +37563,22 @@ define <16 x bfloat> @v_vselect_v16bf16(<16 x i1> %cond, <16 x bfloat> %a, <16 x
; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17
; GCN-NEXT: v_cndmask_b32_e64 v17, v17, v20, s[12:13]
; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:20
+; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
-; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
; GCN-NEXT: s_waitcnt vmcnt(1)
; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20
; GCN-NEXT: v_cndmask_b32_e64 v19, v20, v19, s[10:11]
-; GCN-NEXT: v_cndmask_b32_e64 v3, v18, v3, s[8:9]
-; GCN-NEXT: v_cndmask_b32_e64 v2, v6, v2, s[6:7]
-; GCN-NEXT: v_cndmask_b32_e64 v1, v5, v1, s[4:5]
-; GCN-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
+; GCN-NEXT: v_cndmask_b32_e64 v5, v18, v5, s[8:9]
+; GCN-NEXT: v_cndmask_b32_e64 v3, v6, v3, s[6:7]
+; GCN-NEXT: v_cndmask_b32_e64 v1, v4, v1, s[4:5]
+; GCN-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
+; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v5
; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v19
; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v17
; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v16
@@ -37663,136 +37604,151 @@ define <16 x bfloat> @v_vselect_v16bf16(<16 x i1> %cond, <16 x bfloat> %a, <16 x
; GFX7-LABEL: v_vselect_v16bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_and_b32_e32 v8, 1, v8
-; GFX7-NEXT: v_and_b32_e32 v7, 1, v7
-; GFX7-NEXT: v_cmp_eq_u32_e64 s[16:17], 1, v8
-; GFX7-NEXT: v_cmp_eq_u32_e64 s[14:15], 1, v7
-; GFX7-NEXT: buffer_load_dword v7, off, s[0:3], s32
-; GFX7-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:64
-; GFX7-NEXT: v_and_b32_e32 v15, 1, v15
-; GFX7-NEXT: v_cmp_eq_u32_e64 s[12:13], 1, v15
-; GFX7-NEXT: v_and_b32_e32 v14, 1, v14
-; GFX7-NEXT: v_cmp_eq_u32_e64 s[10:11], 1, v14
-; GFX7-NEXT: v_and_b32_e32 v13, 1, v13
-; GFX7-NEXT: v_cmp_eq_u32_e64 s[8:9], 1, v13
-; GFX7-NEXT: v_and_b32_e32 v12, 1, v12
-; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], 1, v12
-; GFX7-NEXT: v_and_b32_e32 v11, 1, v11
-; GFX7-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v11
-; GFX7-NEXT: v_and_b32_e32 v10, 1, v10
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v10
-; GFX7-NEXT: v_and_b32_e32 v6, 1, v6
-; GFX7-NEXT: v_and_b32_e32 v5, 1, v5
-; GFX7-NEXT: v_and_b32_e32 v9, 1, v9
-; GFX7-NEXT: v_cmp_eq_u32_e64 s[18:19], 1, v9
-; GFX7-NEXT: v_and_b32_e32 v4, 1, v4
-; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v20
-; GFX7-NEXT: v_and_b32_e32 v3, 1, v3
-; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19
-; GFX7-NEXT: v_and_b32_e32 v2, 1, v2
-; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18
-; GFX7-NEXT: v_and_b32_e32 v1, 1, v1
+; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX7-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; GFX7-NEXT: s_mov_b64 exec, s[4:5]
; GFX7-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
+; GFX7-NEXT: v_and_b32_e32 v0, 1, v1
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v0
+; GFX7-NEXT: v_and_b32_e32 v0, 1, v2
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], 1, v0
+; GFX7-NEXT: v_and_b32_e32 v0, 1, v3
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[8:9], 1, v0
+; GFX7-NEXT: v_and_b32_e32 v0, 1, v4
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[10:11], 1, v0
+; GFX7-NEXT: v_and_b32_e32 v0, 1, v5
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[12:13], 1, v0
+; GFX7-NEXT: v_and_b32_e32 v0, 1, v6
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[14:15], 1, v0
+; GFX7-NEXT: v_and_b32_e32 v0, 1, v7
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[16:17], 1, v0
+; GFX7-NEXT: v_and_b32_e32 v0, 1, v8
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[18:19], 1, v0
+; GFX7-NEXT: v_and_b32_e32 v0, 1, v9
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[20:21], 1, v0
+; GFX7-NEXT: v_and_b32_e32 v0, 1, v10
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[22:23], 1, v0
+; GFX7-NEXT: v_and_b32_e32 v0, 1, v11
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[24:25], 1, v0
+; GFX7-NEXT: buffer_load_dword v0, off, s[0:3], s32
+; GFX7-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:64
+; GFX7-NEXT: v_and_b32_e32 v2, 1, v12
+; GFX7-NEXT: v_writelane_b32 v31, s30, 0
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[26:27], 1, v2
+; GFX7-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:60
+; GFX7-NEXT: v_and_b32_e32 v3, 1, v13
+; GFX7-NEXT: v_writelane_b32 v31, s31, 1
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[28:29], 1, v3
+; GFX7-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:56
+; GFX7-NEXT: v_and_b32_e32 v4, 1, v14
+; GFX7-NEXT: v_writelane_b32 v31, s34, 2
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[30:31], 1, v4
+; GFX7-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:52
+; GFX7-NEXT: v_and_b32_e32 v5, 1, v15
+; GFX7-NEXT: v_writelane_b32 v31, s35, 3
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[34:35], 1, v5
+; GFX7-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:48
; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16
-; GFX7-NEXT: s_waitcnt vmcnt(1)
-; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8
-; GFX7-NEXT: v_cndmask_b32_e64 v15, v8, v7, s[12:13]
-; GFX7-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:60
-; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v30
-; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8
-; GFX7-NEXT: v_cndmask_b32_e64 v14, v8, v7, s[10:11]
-; GFX7-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:56
-; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v29
-; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8
-; GFX7-NEXT: v_cndmask_b32_e64 v13, v8, v7, s[8:9]
-; GFX7-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:52
-; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v28
-; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8
-; GFX7-NEXT: v_cndmask_b32_e64 v12, v8, v7, s[6:7]
-; GFX7-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:48
-; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v27
-; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8
-; GFX7-NEXT: v_cndmask_b32_e64 v11, v8, v7, s[4:5]
-; GFX7-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:44
-; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v26
+; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17
+; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18
+; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19
+; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v20
+; GFX7-NEXT: s_waitcnt vmcnt(5)
+; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX7-NEXT: s_waitcnt vmcnt(4)
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GFX7-NEXT: v_cndmask_b32_e64 v15, v1, v0, s[34:35]
+; GFX7-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:44
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v30
+; GFX7-NEXT: s_waitcnt vmcnt(4)
+; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX7-NEXT: v_cndmask_b32_e64 v14, v2, v1, s[30:31]
+; GFX7-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:40
+; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v29
+; GFX7-NEXT: s_waitcnt vmcnt(4)
+; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GFX7-NEXT: v_cndmask_b32_e64 v13, v3, v2, s[28:29]
+; GFX7-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:36
+; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v28
+; GFX7-NEXT: s_waitcnt vmcnt(4)
+; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GFX7-NEXT: v_cndmask_b32_e64 v12, v4, v3, s[26:27]
+; GFX7-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:32
+; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v27
+; GFX7-NEXT: s_waitcnt vmcnt(4)
+; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GFX7-NEXT: v_cndmask_b32_e64 v11, v5, v4, s[24:25]
+; GFX7-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:28
+; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v26
; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8
-; GFX7-NEXT: v_cndmask_b32_e32 v10, v8, v7, vcc
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6
-; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v22
-; GFX7-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:28
-; GFX7-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:40
-; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v25
-; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
-; GFX7-NEXT: s_waitcnt vmcnt(1)
-; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22
-; GFX7-NEXT: v_cndmask_b32_e32 v6, v22, v6, vcc
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v5
+; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
+; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
+; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
+; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; GFX7-NEXT: v_readlane_b32 s35, v31, 3
+; GFX7-NEXT: v_readlane_b32 s34, v31, 2
+; GFX7-NEXT: v_readlane_b32 s31, v31, 1
+; GFX7-NEXT: v_readlane_b32 s30, v31, 0
+; GFX7-NEXT: s_waitcnt vmcnt(4)
+; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX7-NEXT: v_cndmask_b32_e64 v10, v0, v5, s[22:23]
+; GFX7-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:24
+; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v25
+; GFX7-NEXT: s_waitcnt vmcnt(4)
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GFX7-NEXT: v_cndmask_b32_e64 v9, v1, v5, s[20:21]
+; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v24
+; GFX7-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4
+; GFX7-NEXT: s_waitcnt vmcnt(4)
+; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX7-NEXT: v_cndmask_b32_e64 v8, v2, v5, s[18:19]
+; GFX7-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8
+; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v23
+; GFX7-NEXT: s_waitcnt vmcnt(4)
+; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GFX7-NEXT: v_cndmask_b32_e64 v7, v3, v5, s[16:17]
+; GFX7-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:12
+; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v22
+; GFX7-NEXT: s_waitcnt vmcnt(4)
+; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GFX7-NEXT: v_cndmask_b32_e64 v6, v4, v5, s[14:15]
+; GFX7-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16
; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v21
-; GFX7-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:24
-; GFX7-NEXT: s_waitcnt vmcnt(1)
-; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8
-; GFX7-NEXT: v_cndmask_b32_e64 v9, v8, v7, s[18:19]
-; GFX7-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:36
-; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v24
; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
-; GFX7-NEXT: s_waitcnt vmcnt(1)
-; GFX7-NEXT: v_mul_f32_e32 v21, 1.0, v21
-; GFX7-NEXT: v_cndmask_b32_e32 v5, v21, v5, vcc
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4
-; GFX7-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:20
-; GFX7-NEXT: s_waitcnt vmcnt(1)
-; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8
-; GFX7-NEXT: v_cndmask_b32_e64 v8, v8, v7, s[16:17]
-; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v23
-; GFX7-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:32
+; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; GFX7-NEXT: s_waitcnt vmcnt(4)
+; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX7-NEXT: v_cndmask_b32_e64 v5, v0, v5, s[12:13]
+; GFX7-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:20
; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
-; GFX7-NEXT: s_waitcnt vmcnt(1)
-; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
-; GFX7-NEXT: v_cndmask_b32_e32 v4, v4, v20, vcc
-; GFX7-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:16
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3
-; GFX7-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:12
-; GFX7-NEXT: s_waitcnt vmcnt(2)
-; GFX7-NEXT: v_mul_f32_e32 v23, 1.0, v23
-; GFX7-NEXT: v_cndmask_b32_e64 v7, v23, v7, s[14:15]
-; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
-; GFX7-NEXT: s_waitcnt vmcnt(1)
-; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v20
-; GFX7-NEXT: v_cndmask_b32_e32 v19, v20, v19, vcc
-; GFX7-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:4
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2
-; GFX7-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8
+; GFX7-NEXT: s_waitcnt vmcnt(4)
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GFX7-NEXT: s_waitcnt vmcnt(3)
+; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v17, s[4:5]
; GFX7-NEXT: s_waitcnt vmcnt(2)
; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GFX7-NEXT: v_cndmask_b32_e32 v3, v3, v18, vcc
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
+; GFX7-NEXT: v_cndmask_b32_e64 v3, v3, v18, s[6:7]
; GFX7-NEXT: s_waitcnt vmcnt(1)
-; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v20
+; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GFX7-NEXT: v_cndmask_b32_e64 v4, v4, v19, s[8:9]
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: v_cndmask_b32_e32 v1, v2, v17, vcc
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX7-NEXT: v_cndmask_b32_e32 v0, v18, v16, vcc
+; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX7-NEXT: v_cndmask_b32_e64 v20, v0, v20, s[10:11]
+; GFX7-NEXT: v_cndmask_b32_e32 v0, v1, v16, vcc
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v2
; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
-; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v19
+; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v4
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v20
+; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
+; GFX7-NEXT: s_mov_b64 exec, s[4:5]
+; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_vselect_v16bf16:
@@ -37823,51 +37779,53 @@ define <16 x bfloat> @v_vselect_v16bf16(<16 x i1> %cond, <16 x bfloat> %a, <16 x
; GFX8-NEXT: v_cmp_eq_u32_e64 s[20:21], 1, v0
; GFX8-NEXT: v_and_b32_e32 v0, 1, v10
; GFX8-NEXT: v_cmp_eq_u32_e64 s[22:23], 1, v0
-; GFX8-NEXT: v_and_b32_e32 v0, 1, v11
-; GFX8-NEXT: v_cmp_eq_u32_e64 s[24:25], 1, v0
-; GFX8-NEXT: v_and_b32_e32 v0, 1, v12
+; GFX8-NEXT: buffer_load_dword v0, off, s[0:3], s32
; GFX8-NEXT: v_writelane_b32 v31, s30, 0
-; GFX8-NEXT: v_cmp_eq_u32_e64 s[26:27], 1, v0
-; GFX8-NEXT: v_and_b32_e32 v0, 1, v13
+; GFX8-NEXT: v_and_b32_e32 v2, 1, v12
+; GFX8-NEXT: v_and_b32_e32 v3, 1, v13
; GFX8-NEXT: v_writelane_b32 v31, s31, 1
-; GFX8-NEXT: v_cmp_eq_u32_e64 s[28:29], 1, v0
-; GFX8-NEXT: v_and_b32_e32 v0, 1, v14
+; GFX8-NEXT: v_cmp_eq_u32_e64 s[26:27], 1, v2
+; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v22
+; GFX8-NEXT: v_cmp_eq_u32_e64 s[28:29], 1, v3
+; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v30
; GFX8-NEXT: v_writelane_b32 v31, s34, 2
-; GFX8-NEXT: v_cmp_eq_u32_e64 s[30:31], 1, v0
-; GFX8-NEXT: v_and_b32_e32 v0, 1, v15
+; GFX8-NEXT: v_and_b32_e32 v1, 1, v11
+; GFX8-NEXT: v_and_b32_e32 v4, 1, v14
+; GFX8-NEXT: v_and_b32_e32 v5, 1, v15
+; GFX8-NEXT: v_cndmask_b32_e64 v6, v3, v2, s[28:29]
+; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v20
+; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v28
; GFX8-NEXT: v_writelane_b32 v31, s35, 3
-; GFX8-NEXT: v_cmp_eq_u32_e64 s[34:35], 1, v0
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v22
-; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v30
-; GFX8-NEXT: v_cndmask_b32_e64 v6, v1, v0, s[28:29]
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v21
-; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v29
-; GFX8-NEXT: v_cndmask_b32_e64 v5, v1, v0, s[24:25]
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v20
-; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v28
-; GFX8-NEXT: v_cndmask_b32_e64 v4, v1, v0, s[20:21]
-; GFX8-NEXT: buffer_load_dword v0, off, s[0:3], s32
+; GFX8-NEXT: v_cmp_eq_u32_e64 s[24:25], 1, v1
; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v23
+; GFX8-NEXT: v_cmp_eq_u32_e64 s[30:31], 1, v4
+; GFX8-NEXT: v_cmp_eq_u32_e64 s[34:35], 1, v5
+; GFX8-NEXT: v_cndmask_b32_e64 v10, v3, v2, s[20:21]
+; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v21
+; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v29
+; GFX8-NEXT: v_cndmask_b32_e64 v5, v5, v4, s[24:25]
+; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v19
+; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v27
+; GFX8-NEXT: v_cndmask_b32_e64 v3, v9, v4, s[16:17]
; GFX8-NEXT: v_lshrrev_b32_e32 v15, 16, v24
; GFX8-NEXT: v_cndmask_b32_e64 v7, v30, v22, s[26:27]
+; GFX8-NEXT: v_cndmask_b32_e64 v4, v27, v19, s[14:15]
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v6
; GFX8-NEXT: v_cndmask_b32_e64 v8, v29, v21, s[22:23]
-; GFX8-NEXT: v_cndmask_b32_e64 v9, v28, v20, s[18:19]
-; GFX8-NEXT: v_cndmask_b32_e64 v12, v27, v19, s[14:15]
-; GFX8-NEXT: v_cndmask_b32_e64 v13, v26, v18, s[10:11]
+; GFX8-NEXT: v_cndmask_b32_e64 v11, v28, v20, s[18:19]
+; GFX8-NEXT: v_cndmask_b32_e64 v9, v26, v18, s[10:11]
; GFX8-NEXT: v_cndmask_b32_e64 v14, v25, v17, s[6:7]
-; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX8-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v10
; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; GFX8-NEXT: v_or_b32_sdwa v6, v7, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX8-NEXT: v_or_b32_sdwa v4, v9, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: v_or_b32_sdwa v4, v11, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX8-NEXT: v_or_b32_sdwa v5, v8, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_cndmask_b32_e64 v10, v0, v23, s[30:31]
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX8-NEXT: v_cndmask_b32_e64 v11, v0, v1, s[34:35]
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v19
-; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v27
-; GFX8-NEXT: v_cndmask_b32_e64 v3, v1, v0, s[16:17]
+; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; GFX8-NEXT: v_cndmask_b32_e64 v12, v0, v23, s[30:31]
+; GFX8-NEXT: v_cndmask_b32_e64 v13, v2, v1, s[34:35]
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v18
; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v26
; GFX8-NEXT: v_cndmask_b32_e64 v2, v1, v0, s[12:13]
@@ -37880,13 +37838,11 @@ define <16 x bfloat> @v_vselect_v16bf16(<16 x i1> %cond, <16 x bfloat> %a, <16 x
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v11
+; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v13
; GFX8-NEXT: v_or_b32_sdwa v0, v15, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX8-NEXT: v_or_b32_sdwa v1, v14, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX8-NEXT: v_or_b32_sdwa v2, v13, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX8-NEXT: v_or_b32_sdwa v3, v12, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX8-NEXT: v_or_b32_sdwa v7, v10, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: v_or_b32_sdwa v2, v9, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: v_or_b32_sdwa v7, v12, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX8-NEXT: v_readlane_b32 s35, v31, 3
; GFX8-NEXT: v_readlane_b32 s34, v31, 2
; GFX8-NEXT: v_readlane_b32 s31, v31, 1
@@ -37900,81 +37856,81 @@ define <16 x bfloat> @v_vselect_v16bf16(<16 x i1> %cond, <16 x bfloat> %a, <16 x
; GFX9-LABEL: v_vselect_v16bf16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_and_b32_e32 v4, 1, v4
+; GFX9-NEXT: v_cmp_eq_u32_e64 s[18:19], 1, v4
+; GFX9-NEXT: v_and_b32_e32 v4, 1, v14
+; GFX9-NEXT: v_cmp_eq_u32_e64 s[20:21], 1, v4
+; GFX9-NEXT: v_and_b32_e32 v4, 1, v15
+; GFX9-NEXT: v_cmp_eq_u32_e64 s[22:23], 1, v4
+; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32
; GFX9-NEXT: v_and_b32_e32 v12, 1, v12
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v12
-; GFX9-NEXT: v_and_b32_e32 v13, 1, v13
-; GFX9-NEXT: v_cndmask_b32_e32 v12, v30, v22, vcc
-; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v22
-; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v30
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v13
+; GFX9-NEXT: v_and_b32_e32 v12, 1, v13
; GFX9-NEXT: v_and_b32_e32 v10, 1, v10
-; GFX9-NEXT: v_cndmask_b32_e32 v13, v30, v22, vcc
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v10
-; GFX9-NEXT: v_and_b32_e32 v10, 1, v11
-; GFX9-NEXT: v_cndmask_b32_e32 v11, v29, v21, vcc
-; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v21
-; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v29
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v10
-; GFX9-NEXT: v_cndmask_b32_e32 v10, v22, v21, vcc
-; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32
-; GFX9-NEXT: v_and_b32_e32 v8, 1, v8
-; GFX9-NEXT: v_and_b32_e32 v9, 1, v9
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v8
-; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v20
-; GFX9-NEXT: v_cndmask_b32_e32 v20, v28, v20, vcc
-; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v28
; GFX9-NEXT: v_and_b32_e32 v6, 1, v6
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v9
-; GFX9-NEXT: v_and_b32_e32 v7, 1, v7
-; GFX9-NEXT: v_cndmask_b32_e32 v8, v22, v8, vcc
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6
-; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v19
-; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v27
-; GFX9-NEXT: v_and_b32_e32 v4, 1, v4
-; GFX9-NEXT: v_cndmask_b32_e32 v19, v27, v19, vcc
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7
; GFX9-NEXT: v_and_b32_e32 v5, 1, v5
-; GFX9-NEXT: v_cndmask_b32_e32 v9, v22, v9, vcc
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4
-; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v18
-; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v26
-; GFX9-NEXT: v_and_b32_e32 v14, 1, v14
-; GFX9-NEXT: v_cndmask_b32_e32 v4, v26, v18, vcc
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v5
-; GFX9-NEXT: v_and_b32_e32 v15, 1, v15
-; GFX9-NEXT: v_cndmask_b32_e32 v5, v27, v6, vcc
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v14
+; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v12
+; GFX9-NEXT: v_cmp_eq_u32_e64 s[6:7], 1, v10
+; GFX9-NEXT: v_and_b32_e32 v10, 1, v11
+; GFX9-NEXT: v_and_b32_e32 v8, 1, v8
+; GFX9-NEXT: v_cmp_eq_u32_e64 s[14:15], 1, v6
+; GFX9-NEXT: v_and_b32_e32 v6, 1, v7
+; GFX9-NEXT: v_cmp_eq_u32_e64 s[24:25], 1, v5
+; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v22
+; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v30
+; GFX9-NEXT: v_cmp_eq_u32_e64 s[8:9], 1, v10
+; GFX9-NEXT: v_cmp_eq_u32_e64 s[10:11], 1, v8
+; GFX9-NEXT: v_and_b32_e32 v8, 1, v9
+; GFX9-NEXT: v_cndmask_b32_e64 v7, v7, v5, s[4:5]
+; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v21
+; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v29
+; GFX9-NEXT: v_cmp_eq_u32_e64 s[12:13], 1, v8
+; GFX9-NEXT: v_cndmask_b32_e64 v5, v9, v5, s[8:9]
+; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v20
+; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v28
+; GFX9-NEXT: v_cmp_eq_u32_e64 s[16:17], 1, v6
+; GFX9-NEXT: v_cndmask_b32_e64 v9, v11, v9, s[12:13]
+; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v19
+; GFX9-NEXT: v_lshrrev_b32_e32 v13, 16, v27
; GFX9-NEXT: v_and_b32_e32 v2, 1, v2
-; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v23
+; GFX9-NEXT: v_cndmask_b32_e64 v11, v13, v11, s[16:17]
+; GFX9-NEXT: v_lshrrev_b32_e32 v13, 16, v23
+; GFX9-NEXT: v_cndmask_b32_e32 v6, v30, v22, vcc
; GFX9-NEXT: v_and_b32_e32 v3, 1, v3
+; GFX9-NEXT: v_cndmask_b32_e64 v15, v26, v18, s[18:19]
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2
; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v25, v17, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3
; GFX9-NEXT: v_and_b32_e32 v1, 1, v1
+; GFX9-NEXT: v_cndmask_b32_e64 v8, v29, v21, s[6:7]
+; GFX9-NEXT: v_cndmask_b32_e64 v10, v28, v20, s[10:11]
+; GFX9-NEXT: v_cndmask_b32_e64 v12, v27, v19, s[14:15]
; GFX9-NEXT: s_mov_b32 s4, 0x5040100
+; GFX9-NEXT: v_perm_b32 v5, v5, v8, s4
+; GFX9-NEXT: v_perm_b32 v6, v7, v6, s4
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_cndmask_b32_e32 v14, v21, v23, vcc
-; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v21
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v15
-; GFX9-NEXT: v_cndmask_b32_e32 v7, v6, v7, vcc
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2
-; GFX9-NEXT: v_cndmask_b32_e32 v2, v25, v17, vcc
-; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v17
-; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v25
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3
-; GFX9-NEXT: v_cndmask_b32_e32 v3, v15, v6, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v14, v4, v23, s[20:21]
+; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX9-NEXT: v_cndmask_b32_e64 v13, v4, v13, s[22:23]
+; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v18
+; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v26
+; GFX9-NEXT: v_cndmask_b32_e64 v4, v18, v4, s[24:25]
+; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v25
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v18, v17, vcc
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v24, v16, vcc
-; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v16
-; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v24
+; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v16
+; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v24
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v15, v6, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v17, v16, vcc
; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4
; GFX9-NEXT: v_perm_b32 v1, v3, v2, s4
-; GFX9-NEXT: v_perm_b32 v2, v5, v4, s4
-; GFX9-NEXT: v_perm_b32 v3, v9, v19, s4
-; GFX9-NEXT: v_perm_b32 v4, v8, v20, s4
-; GFX9-NEXT: v_perm_b32 v5, v10, v11, s4
-; GFX9-NEXT: v_perm_b32 v6, v13, v12, s4
-; GFX9-NEXT: v_perm_b32 v7, v7, v14, s4
+; GFX9-NEXT: v_perm_b32 v2, v4, v15, s4
+; GFX9-NEXT: v_perm_b32 v3, v11, v12, s4
+; GFX9-NEXT: v_perm_b32 v4, v9, v10, s4
+; GFX9-NEXT: v_perm_b32 v7, v13, v14, s4
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_vselect_v16bf16:
@@ -37991,13 +37947,13 @@ define <16 x bfloat> @v_vselect_v16bf16(<16 x i1> %cond, <16 x bfloat> %a, <16 x
; GFX10-NEXT: v_and_b32_e32 v8, 1, v8
; GFX10-NEXT: v_lshrrev_b32_e32 v35, 16, v21
; GFX10-NEXT: v_lshrrev_b32_e32 v36, 16, v29
-; GFX10-NEXT: v_cndmask_b32_e32 v22, v30, v22, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v12, v30, v22, vcc_lo
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v13
; GFX10-NEXT: v_and_b32_e32 v9, 1, v9
; GFX10-NEXT: v_and_b32_e32 v6, 1, v6
; GFX10-NEXT: v_lshrrev_b32_e32 v37, 16, v20
; GFX10-NEXT: v_lshrrev_b32_e32 v38, 16, v28
-; GFX10-NEXT: v_cndmask_b32_e32 v33, v34, v33, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v13, v34, v33, vcc_lo
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v10
; GFX10-NEXT: v_and_b32_e32 v4, 1, v4
; GFX10-NEXT: v_and_b32_e32 v2, 1, v2
@@ -38006,13 +37962,13 @@ define <16 x bfloat> @v_vselect_v16bf16(<16 x i1> %cond, <16 x bfloat> %a, <16 x
; GFX10-NEXT: v_cndmask_b32_e32 v10, v29, v21, vcc_lo
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v11
; GFX10-NEXT: v_lshrrev_b32_e32 v51, 16, v17
-; GFX10-NEXT: v_lshrrev_b32_e32 v12, 16, v25
+; GFX10-NEXT: v_lshrrev_b32_e32 v52, 16, v25
; GFX10-NEXT: v_and_b32_e32 v1, 1, v1
; GFX10-NEXT: v_and_b32_e32 v5, 1, v5
; GFX10-NEXT: v_cndmask_b32_e32 v11, v36, v35, vcc_lo
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v8
-; GFX10-NEXT: v_lshrrev_b32_e32 v30, 16, v16
-; GFX10-NEXT: v_lshrrev_b32_e32 v13, 16, v24
+; GFX10-NEXT: v_lshrrev_b32_e32 v53, 16, v16
+; GFX10-NEXT: v_lshrrev_b32_e32 v54, 16, v24
; GFX10-NEXT: v_and_b32_e32 v7, 1, v7
; GFX10-NEXT: v_lshrrev_b32_e32 v49, 16, v18
; GFX10-NEXT: v_cndmask_b32_e32 v8, v28, v20, vcc_lo
@@ -38031,11 +37987,11 @@ define <16 x bfloat> @v_vselect_v16bf16(<16 x i1> %cond, <16 x bfloat> %a, <16 x
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2
; GFX10-NEXT: v_cndmask_b32_e32 v2, v25, v17, vcc_lo
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3
-; GFX10-NEXT: v_cndmask_b32_e32 v3, v12, v51, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v3, v52, v51, vcc_lo
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
; GFX10-NEXT: v_cndmask_b32_e32 v0, v24, v16, vcc_lo
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1
-; GFX10-NEXT: v_cndmask_b32_e32 v1, v13, v30, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v1, v54, v53, vcc_lo
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v5
; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
; GFX10-NEXT: v_cndmask_b32_e32 v5, v50, v49, vcc_lo
@@ -38048,108 +38004,113 @@ define <16 x bfloat> @v_vselect_v16bf16(<16 x i1> %cond, <16 x bfloat> %a, <16 x
; GFX10-NEXT: v_perm_b32 v5, v11, v10, 0x5040100
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v31
-; GFX10-NEXT: v_cndmask_b32_e32 v12, v31, v23, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v14, v31, v23, vcc_lo
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v15
-; GFX10-NEXT: v_cndmask_b32_e32 v13, v3, v32, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v15, v3, v32, vcc_lo
; GFX10-NEXT: v_perm_b32 v3, v7, v6, 0x5040100
-; GFX10-NEXT: v_perm_b32 v6, v33, v22, 0x5040100
-; GFX10-NEXT: v_perm_b32 v7, v13, v12, 0x5040100
+; GFX10-NEXT: v_perm_b32 v6, v13, v12, 0x5040100
+; GFX10-NEXT: v_perm_b32 v7, v15, v14, 0x5040100
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11TRUE16-LABEL: v_vselect_v16bf16:
; GFX11TRUE16: ; %bb.0:
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11TRUE16-NEXT: scratch_load_b32 v31, off, s32
-; GFX11TRUE16-NEXT: v_and_b32_e32 v9, 1, v9
+; GFX11TRUE16-NEXT: v_and_b32_e32 v12, 1, v12
+; GFX11TRUE16-NEXT: v_and_b32_e32 v10, 1, v10
; GFX11TRUE16-NEXT: v_and_b32_e32 v8, 1, v8
+; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v19
+; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v27
+; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v12
; GFX11TRUE16-NEXT: v_and_b32_e32 v1, 1, v1
-; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX11TRUE16-NEXT: v_and_b32_e32 v2, 1, v2
-; GFX11TRUE16-NEXT: v_and_b32_e32 v7, 1, v7
-; GFX11TRUE16-NEXT: v_and_b32_e32 v6, 1, v6
+; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v21
+; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v29
; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v20
-; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v28
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s7, 1, v9
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s8, 1, v8
+; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v12, v30, v22, vcc_lo
+; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v10
; GFX11TRUE16-NEXT: v_and_b32_e32 v3, 1, v3
-; GFX11TRUE16-NEXT: v_and_b32_e32 v5, 1, v5
-; GFX11TRUE16-NEXT: v_and_b32_e32 v4, 1, v4
-; GFX11TRUE16-NEXT: v_and_b32_e32 v11, 1, v11
-; GFX11TRUE16-NEXT: v_and_b32_e32 v10, 1, v10
-; GFX11TRUE16-NEXT: v_and_b32_e32 v13, 1, v13
-; GFX11TRUE16-NEXT: v_and_b32_e32 v12, 1, v12
-; GFX11TRUE16-NEXT: v_and_b32_e32 v15, 1, v15
-; GFX11TRUE16-NEXT: v_and_b32_e32 v14, 1, v14
-; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v19
-; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v27
-; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v53, 16, v16
-; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v54, 16, v24
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s0, 1, v0
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s2, 1, v2
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s5, 1, v7
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s6, 1, v6
-; GFX11TRUE16-NEXT: v_cndmask_b16 v2.l, v28.l, v20.l, s8
-; GFX11TRUE16-NEXT: v_cndmask_b16 v2.h, v38.l, v37.l, s7
-; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v23
+; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v28
; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v22
; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v30
-; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v21
-; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v29
+; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v10, v29, v21, vcc_lo
+; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v8
+; GFX11TRUE16-NEXT: v_and_b32_e32 v5, 1, v5
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v22.l, v36.l
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v30.l, v35.l
+; GFX11TRUE16-NEXT: v_and_b32_e32 v4, 1, v4
+; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v8, v28, v20, vcc_lo
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v20.l, v48.l
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v28.l, v39.l
+; GFX11TRUE16-NEXT: v_and_b32_e32 v6, 1, v6
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v21.l, v38.l
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v29.l, v37.l
+; GFX11TRUE16-NEXT: v_and_b32_e32 v2, 1, v2
+; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v6
+; GFX11TRUE16-NEXT: v_and_b32_e32 v9, 1, v9
; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v18
; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v26
; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v17
+; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v6, v27, v19, vcc_lo
+; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v4
+; GFX11TRUE16-NEXT: v_and_b32_e32 v7, 1, v7
; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v52, 16, v25
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s1, 1, v3
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s3, 1, v5
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s4, 1, v4
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s9, 1, v11
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s10, 1, v12
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s11, 1, v13
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s12, 1, v10
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s13, 1, v15
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s14, 1, v14
-; GFX11TRUE16-NEXT: v_cndmask_b16 v3.l, v27.l, v19.l, s6
-; GFX11TRUE16-NEXT: v_cndmask_b16 v3.h, v48.l, v39.l, s5
-; GFX11TRUE16-NEXT: v_cndmask_b16 v4.h, v54.l, v53.l, vcc_lo
-; GFX11TRUE16-NEXT: v_cndmask_b16 v5.l, v24.l, v16.l, s0
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v12.l, v2.h
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v13.l, v2.l
-; GFX11TRUE16-NEXT: v_cndmask_b16 v0.l, v30.l, v22.l, s10
-; GFX11TRUE16-NEXT: v_cndmask_b16 v0.h, v34.l, v33.l, s11
-; GFX11TRUE16-NEXT: v_cndmask_b16 v1.l, v29.l, v21.l, s12
-; GFX11TRUE16-NEXT: v_cndmask_b16 v1.h, v36.l, v35.l, s9
-; GFX11TRUE16-NEXT: v_cndmask_b16 v5.h, v52.l, v51.l, s1
-; GFX11TRUE16-NEXT: v_cndmask_b16 v6.l, v25.l, v17.l, s2
-; GFX11TRUE16-NEXT: v_cndmask_b16 v6.h, v50.l, v49.l, s3
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v7.l, v4.h
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v8.l, v5.l
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v10.l, v3.h
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v11.l, v3.l
-; GFX11TRUE16-NEXT: v_cndmask_b16 v4.l, v26.l, v18.l, s4
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v5.l, v5.h
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v9.l, v6.l
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v6.l, v6.h
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v14.l, v1.h
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v15.l, v1.l
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v16.l, v0.h
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v17.l, v0.l
-; GFX11TRUE16-NEXT: v_perm_b32 v0, v7, v8, 0x5040100
-; GFX11TRUE16-NEXT: v_perm_b32 v1, v5, v9, 0x5040100
-; GFX11TRUE16-NEXT: v_perm_b32 v5, v14, v15, 0x5040100
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v19.l, v50.l
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v27.l, v49.l
+; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v4, v26, v18, vcc_lo
+; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2
+; GFX11TRUE16-NEXT: v_and_b32_e32 v11, 1, v11
+; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v53, 16, v16
+; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v54, 16, v24
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v18.l, v52.l
+; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v2, v25, v17, vcc_lo
+; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
+; GFX11TRUE16-NEXT: v_and_b32_e32 v13, 1, v13
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v26.l, v51.l
+; GFX11TRUE16-NEXT: v_and_b32_e32 v14, 1, v14
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v17.l, v54.l
+; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v24, v16, vcc_lo
+; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v13
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v25.l, v53.l
+; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v23
+; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v13, v34, v33, vcc_lo
+; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v11
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2)
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v6.h, v13.l
+; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v11, v22, v30, vcc_lo
+; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v9
+; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v9, v21, v29, vcc_lo
+; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v7
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v4.h, v9.l
+; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v7, v20, v28, vcc_lo
+; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v5
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v5.l, v10.l
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v5.h, v11.l
+; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v16, v19, v27, vcc_lo
+; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3
+; GFX11TRUE16-NEXT: v_and_b32_e32 v15, 1, v15
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v3.h, v7.l
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v3.l, v6.l
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v6.l, v12.l
+; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v18, v18, v26, vcc_lo
+; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.l
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.l, v4.l
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v4.l, v8.l
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.h, v18.l
+; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v17, v17, v25, vcc_lo
+; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v14
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.h, v16.l
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.h, v17.l
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v31
-; GFX11TRUE16-NEXT: v_cndmask_b16 v3.l, v31.l, v23.l, s14
-; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11TRUE16-NEXT: v_cndmask_b16 v3.h, v2.l, v32.l, s13
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v7.l, v3.l
-; GFX11TRUE16-NEXT: v_perm_b32 v2, v6, v4, 0x5040100
-; GFX11TRUE16-NEXT: v_perm_b32 v4, v12, v13, 0x5040100
-; GFX11TRUE16-NEXT: v_perm_b32 v6, v16, v17, 0x5040100
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v8.l, v3.h
-; GFX11TRUE16-NEXT: v_perm_b32 v3, v10, v11, 0x5040100
-; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11TRUE16-NEXT: v_perm_b32 v7, v8, v7, 0x5040100
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v10.l, v31.l
+; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v8, 16, v31
+; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v7, v10, v23, vcc_lo
+; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v15
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v8, v8, v32, vcc_lo
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v7.h, v8.l
; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11FAKE16-LABEL: v_vselect_v16bf16:
@@ -39439,206 +39400,219 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x
; GFX10-LABEL: v_vselect_v32bf16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: s_clause 0xa
-; GFX10-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:28
-; GFX10-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:92
-; GFX10-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:40
-; GFX10-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:104
-; GFX10-NEXT: buffer_load_ushort v35, off, s[0:3], s32
-; GFX10-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:128
-; GFX10-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:64
-; GFX10-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:96
-; GFX10-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:108
-; GFX10-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:44
-; GFX10-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:112
+; GFX10-NEXT: s_or_saveexec_b32 s4, -1
+; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
+; GFX10-NEXT: s_waitcnt_depctr 0xffe3
+; GFX10-NEXT: s_mov_b32 exec_lo, s4
+; GFX10-NEXT: v_and_b32_e32 v29, 1, v29
; GFX10-NEXT: v_and_b32_e32 v30, 1, v30
-; GFX10-NEXT: v_and_b32_e32 v18, 1, v18
-; GFX10-NEXT: v_and_b32_e32 v12, 1, v12
-; GFX10-NEXT: v_and_b32_e32 v13, 1, v13
-; GFX10-NEXT: v_and_b32_e32 v19, 1, v19
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v30
-; GFX10-NEXT: v_cmp_eq_u32_e64 s6, 1, v18
; GFX10-NEXT: v_and_b32_e32 v28, 1, v28
-; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 1, v13
-; GFX10-NEXT: v_cmp_eq_u32_e64 s5, 1, v19
; GFX10-NEXT: v_and_b32_e32 v26, 1, v26
; GFX10-NEXT: v_and_b32_e32 v24, 1, v24
; GFX10-NEXT: v_and_b32_e32 v22, 1, v22
; GFX10-NEXT: v_and_b32_e32 v20, 1, v20
-; GFX10-NEXT: v_and_b32_e32 v21, 1, v21
+; GFX10-NEXT: v_and_b32_e32 v18, 1, v18
; GFX10-NEXT: v_and_b32_e32 v16, 1, v16
; GFX10-NEXT: v_and_b32_e32 v14, 1, v14
-; GFX10-NEXT: v_and_b32_e32 v17, 1, v17
-; GFX10-NEXT: v_and_b32_e32 v15, 1, v15
-; GFX10-NEXT: v_and_b32_e32 v10, 1, v10
-; GFX10-NEXT: v_and_b32_e32 v8, 1, v8
-; GFX10-NEXT: v_and_b32_e32 v6, 1, v6
-; GFX10-NEXT: v_and_b32_e32 v4, 1, v4
-; GFX10-NEXT: v_and_b32_e32 v2, 1, v2
+; GFX10-NEXT: v_and_b32_e32 v12, 1, v12
+; GFX10-NEXT: s_clause 0x14
+; GFX10-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:60
+; GFX10-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:124
+; GFX10-NEXT: buffer_load_ushort v33, off, s[0:3], s32
+; GFX10-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:128
+; GFX10-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:64
+; GFX10-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:48
+; GFX10-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:116
+; GFX10-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:52
+; GFX10-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:120
+; GFX10-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:56
+; GFX10-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:32
+; GFX10-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:100
+; GFX10-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:36
+; GFX10-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:104
+; GFX10-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:40
+; GFX10-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:108
+; GFX10-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:44
+; GFX10-NEXT: buffer_load_dword v64, off, s[0:3], s32 offset:112
+; GFX10-NEXT: buffer_load_dword v65, off, s[0:3], s32 offset:72
+; GFX10-NEXT: buffer_load_dword v66, off, s[0:3], s32 offset:76
+; GFX10-NEXT: buffer_load_dword v67, off, s[0:3], s32 offset:80
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v29
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:92
+; GFX10-NEXT: buffer_load_dword v68, off, s[0:3], s32 offset:28
+; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 1, v30
+; GFX10-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:96
+; GFX10-NEXT: v_cmp_eq_u32_e64 s5, 1, v28
+; GFX10-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:88
+; GFX10-NEXT: v_cmp_eq_u32_e64 s6, 1, v26
+; GFX10-NEXT: v_cmp_eq_u32_e64 s7, 1, v24
+; GFX10-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84
+; GFX10-NEXT: v_cmp_eq_u32_e64 s8, 1, v22
+; GFX10-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:20
+; GFX10-NEXT: v_cmp_eq_u32_e64 s9, 1, v20
+; GFX10-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:16
+; GFX10-NEXT: v_cmp_eq_u32_e64 s10, 1, v18
+; GFX10-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:12
+; GFX10-NEXT: v_cmp_eq_u32_e64 s11, 1, v16
+; GFX10-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:8
+; GFX10-NEXT: v_cmp_eq_u32_e64 s12, 1, v14
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:68
+; GFX10-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:24
+; GFX10-NEXT: v_cmp_eq_u32_e64 s13, 1, v12
+; GFX10-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:4
+; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX10-NEXT: v_and_b32_e32 v11, 1, v11
-; GFX10-NEXT: v_and_b32_e32 v7, 1, v7
-; GFX10-NEXT: v_and_b32_e32 v3, 1, v3
+; GFX10-NEXT: v_and_b32_e32 v2, 1, v2
+; GFX10-NEXT: v_and_b32_e32 v4, 1, v4
+; GFX10-NEXT: v_and_b32_e32 v6, 1, v6
+; GFX10-NEXT: v_and_b32_e32 v8, 1, v8
+; GFX10-NEXT: v_and_b32_e32 v10, 1, v10
+; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: v_and_b32_e32 v1, 1, v1
+; GFX10-NEXT: v_and_b32_e32 v3, 1, v3
; GFX10-NEXT: v_and_b32_e32 v5, 1, v5
+; GFX10-NEXT: v_and_b32_e32 v7, 1, v7
; GFX10-NEXT: v_and_b32_e32 v9, 1, v9
-; GFX10-NEXT: s_waitcnt vmcnt(10)
-; GFX10-NEXT: v_lshrrev_b32_e32 v30, 16, v31
+; GFX10-NEXT: v_and_b32_e32 v11, 1, v11
+; GFX10-NEXT: v_and_b32_e32 v13, 1, v13
+; GFX10-NEXT: v_and_b32_e32 v15, 1, v15
+; GFX10-NEXT: v_and_b32_e32 v17, 1, v17
+; GFX10-NEXT: v_and_b32_e32 v19, 1, v19
+; GFX10-NEXT: v_and_b32_e32 v21, 1, v21
+; GFX10-NEXT: v_and_b32_e32 v23, 1, v23
+; GFX10-NEXT: v_and_b32_e32 v25, 1, v25
+; GFX10-NEXT: v_and_b32_e32 v27, 1, v27
+; GFX10-NEXT: v_cmp_eq_u32_e64 s14, 1, v10
+; GFX10-NEXT: v_cmp_eq_u32_e64 s15, 1, v8
+; GFX10-NEXT: v_cmp_eq_u32_e64 s16, 1, v6
+; GFX10-NEXT: v_cmp_eq_u32_e64 s17, 1, v4
+; GFX10-NEXT: v_cmp_eq_u32_e64 s18, 1, v2
+; GFX10-NEXT: v_cmp_eq_u32_e64 s19, 1, v0
+; GFX10-NEXT: v_writelane_b32 v40, s34, 2
+; GFX10-NEXT: v_cmp_eq_u32_e64 s20, 1, v27
+; GFX10-NEXT: v_cmp_eq_u32_e64 s21, 1, v25
+; GFX10-NEXT: v_cmp_eq_u32_e64 s22, 1, v23
+; GFX10-NEXT: v_cmp_eq_u32_e64 s23, 1, v21
+; GFX10-NEXT: v_cmp_eq_u32_e64 s24, 1, v19
+; GFX10-NEXT: v_cmp_eq_u32_e64 s25, 1, v17
+; GFX10-NEXT: v_cmp_eq_u32_e64 s26, 1, v15
+; GFX10-NEXT: v_cmp_eq_u32_e64 s27, 1, v13
+; GFX10-NEXT: v_cmp_eq_u32_e64 s28, 1, v11
+; GFX10-NEXT: v_cmp_eq_u32_e64 s29, 1, v7
+; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_hi, 1, v3
+; GFX10-NEXT: v_cmp_eq_u32_e64 s30, 1, v1
+; GFX10-NEXT: v_cmp_eq_u32_e64 s31, 1, v5
+; GFX10-NEXT: v_cmp_eq_u32_e64 s34, 1, v9
+; GFX10-NEXT: s_waitcnt vmcnt(32)
+; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v31
+; GFX10-NEXT: s_waitcnt vmcnt(31)
+; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v32
+; GFX10-NEXT: s_waitcnt vmcnt(30)
+; GFX10-NEXT: v_and_b32_e32 v2, 1, v33
+; GFX10-NEXT: s_waitcnt vmcnt(29)
+; GFX10-NEXT: v_lshrrev_b32_e32 v4, 16, v34
+; GFX10-NEXT: s_waitcnt vmcnt(28)
+; GFX10-NEXT: v_cndmask_b32_e64 v15, v34, v35, s4
+; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v35
+; GFX10-NEXT: v_cndmask_b32_e64 v17, v32, v31, s5
+; GFX10-NEXT: s_waitcnt vmcnt(25)
+; GFX10-NEXT: v_cndmask_b32_e64 v19, v37, v38, s7
+; GFX10-NEXT: s_waitcnt vmcnt(24)
+; GFX10-NEXT: v_lshrrev_b32_e32 v6, 16, v39
+; GFX10-NEXT: s_waitcnt vmcnt(23)
+; GFX10-NEXT: v_cndmask_b32_e64 v13, v39, v48, s6
+; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v48
+; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v38
+; GFX10-NEXT: v_lshrrev_b32_e32 v8, 16, v37
+; GFX10-NEXT: v_lshrrev_b32_e32 v9, 16, v36
+; GFX10-NEXT: s_waitcnt vmcnt(18)
+; GFX10-NEXT: v_cndmask_b32_e64 v27, v52, v53, s10
+; GFX10-NEXT: s_waitcnt vmcnt(17)
+; GFX10-NEXT: v_lshrrev_b32_e32 v25, 16, v54
+; GFX10-NEXT: s_waitcnt vmcnt(16)
+; GFX10-NEXT: v_cndmask_b32_e64 v21, v54, v55, s9
+; GFX10-NEXT: s_waitcnt vmcnt(15)
+; GFX10-NEXT: v_cndmask_b32_e64 v11, v64, v36, s8
+; GFX10-NEXT: v_lshrrev_b32_e32 v10, 16, v64
+; GFX10-NEXT: v_lshrrev_b32_e32 v23, 16, v55
+; GFX10-NEXT: v_lshrrev_b32_e32 v31, 16, v53
+; GFX10-NEXT: v_lshrrev_b32_e32 v32, 16, v52
+; GFX10-NEXT: v_cndmask_b32_e64 v33, v50, v51, s11
+; GFX10-NEXT: v_lshrrev_b32_e32 v34, 16, v51
+; GFX10-NEXT: v_lshrrev_b32_e32 v35, 16, v50
; GFX10-NEXT: s_waitcnt vmcnt(9)
-; GFX10-NEXT: v_lshrrev_b32_e32 v50, 16, v32
-; GFX10-NEXT: s_waitcnt vmcnt(8)
-; GFX10-NEXT: v_lshrrev_b32_e32 v13, 16, v33
-; GFX10-NEXT: s_waitcnt vmcnt(7)
-; GFX10-NEXT: v_cndmask_b32_e64 v18, v34, v33, s6
+; GFX10-NEXT: v_cndmask_b32_e64 v36, v30, v49, s12
+; GFX10-NEXT: v_lshrrev_b32_e32 v37, 16, v49
+; GFX10-NEXT: v_lshrrev_b32_e32 v30, 16, v30
+; GFX10-NEXT: v_cndmask_b32_e64 v38, v29, v68, s13
+; GFX10-NEXT: v_lshrrev_b32_e32 v39, 16, v68
+; GFX10-NEXT: v_lshrrev_b32_e32 v29, 16, v29
; GFX10-NEXT: s_waitcnt vmcnt(6)
-; GFX10-NEXT: v_and_b32_e32 v35, 1, v35
-; GFX10-NEXT: v_cmp_eq_u32_e64 s6, 1, v12
+; GFX10-NEXT: v_cndmask_b32_e64 v49, v24, v22, s15
+; GFX10-NEXT: v_lshrrev_b32_e32 v22, 16, v22
+; GFX10-NEXT: v_lshrrev_b32_e32 v24, 16, v24
+; GFX10-NEXT: s_waitcnt vmcnt(5)
+; GFX10-NEXT: v_cndmask_b32_e64 v50, v67, v20, s16
+; GFX10-NEXT: v_lshrrev_b32_e32 v20, 16, v20
+; GFX10-NEXT: v_lshrrev_b32_e32 v51, 16, v67
; GFX10-NEXT: s_waitcnt vmcnt(4)
-; GFX10-NEXT: v_cndmask_b32_e32 v54, v36, v37, vcc_lo
-; GFX10-NEXT: v_lshrrev_b32_e32 v37, 16, v37
-; GFX10-NEXT: v_lshrrev_b32_e32 v36, 16, v36
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v35
-; GFX10-NEXT: v_lshrrev_b32_e32 v51, 16, v34
-; GFX10-NEXT: v_cndmask_b32_e64 v12, v32, v31, s6
-; GFX10-NEXT: s_clause 0x6
-; GFX10-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:68
-; GFX10-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4
-; GFX10-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:72
-; GFX10-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
-; GFX10-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:76
-; GFX10-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:12
-; GFX10-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:80
-; GFX10-NEXT: v_cndmask_b32_e64 v30, v50, v30, s4
-; GFX10-NEXT: v_cndmask_b32_e32 v35, v36, v37, vcc_lo
-; GFX10-NEXT: s_clause 0x1
-; GFX10-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:124
-; GFX10-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:60
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v28
-; GFX10-NEXT: v_and_b32_e32 v28, 1, v29
-; GFX10-NEXT: v_cndmask_b32_e64 v13, v51, v13, s5
-; GFX10-NEXT: s_waitcnt vmcnt(3)
-; GFX10-NEXT: v_lshrrev_b32_e32 v50, 16, v52
-; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_cndmask_b32_e32 v29, v36, v37, vcc_lo
-; GFX10-NEXT: v_lshrrev_b32_e32 v37, 16, v37
-; GFX10-NEXT: v_lshrrev_b32_e32 v36, 16, v36
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v28
-; GFX10-NEXT: v_cndmask_b32_e32 v28, v36, v37, vcc_lo
-; GFX10-NEXT: s_clause 0x1
-; GFX10-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:120
-; GFX10-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:56
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v26
-; GFX10-NEXT: v_and_b32_e32 v26, 1, v27
-; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_cndmask_b32_e32 v27, v36, v37, vcc_lo
-; GFX10-NEXT: v_lshrrev_b32_e32 v37, 16, v37
-; GFX10-NEXT: v_lshrrev_b32_e32 v36, 16, v36
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v26
-; GFX10-NEXT: v_cndmask_b32_e32 v26, v36, v37, vcc_lo
-; GFX10-NEXT: s_clause 0x1
-; GFX10-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:116
-; GFX10-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:52
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v24
-; GFX10-NEXT: v_and_b32_e32 v24, 1, v25
-; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_cndmask_b32_e32 v25, v36, v37, vcc_lo
-; GFX10-NEXT: v_lshrrev_b32_e32 v37, 16, v37
-; GFX10-NEXT: v_lshrrev_b32_e32 v36, 16, v36
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v24
-; GFX10-NEXT: v_cndmask_b32_e32 v24, v36, v37, vcc_lo
-; GFX10-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:48
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v22
-; GFX10-NEXT: v_and_b32_e32 v22, 1, v23
-; GFX10-NEXT: v_lshrrev_b32_e32 v37, 16, v49
-; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_cndmask_b32_e32 v23, v49, v36, vcc_lo
-; GFX10-NEXT: v_lshrrev_b32_e32 v36, 16, v36
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v22
-; GFX10-NEXT: v_lshrrev_b32_e32 v49, 16, v53
-; GFX10-NEXT: v_cndmask_b32_e32 v22, v37, v36, vcc_lo
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v20
-; GFX10-NEXT: v_lshrrev_b32_e32 v36, 16, v48
-; GFX10-NEXT: v_lshrrev_b32_e32 v37, 16, v39
-; GFX10-NEXT: v_cndmask_b32_e32 v20, v39, v48, vcc_lo
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v21
-; GFX10-NEXT: s_clause 0x1
-; GFX10-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:32
-; GFX10-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:16
-; GFX10-NEXT: v_cndmask_b32_e32 v21, v37, v36, vcc_lo
-; GFX10-NEXT: s_clause 0x1
-; GFX10-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:100
-; GFX10-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:36
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v16
+; GFX10-NEXT: v_cndmask_b32_e64 v52, v66, v18, s17
+; GFX10-NEXT: v_lshrrev_b32_e32 v18, 16, v18
+; GFX10-NEXT: s_waitcnt vmcnt(1)
+; GFX10-NEXT: v_cndmask_b32_e64 v48, v28, v26, s14
+; GFX10-NEXT: v_lshrrev_b32_e32 v26, 16, v26
+; GFX10-NEXT: v_lshrrev_b32_e32 v28, 16, v28
+; GFX10-NEXT: v_lshrrev_b32_e32 v53, 16, v66
+; GFX10-NEXT: v_cndmask_b32_e64 v54, v65, v16, s18
+; GFX10-NEXT: v_lshrrev_b32_e32 v16, 16, v16
+; GFX10-NEXT: v_lshrrev_b32_e32 v55, 16, v65
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_cndmask_b32_e32 v16, v36, v37, vcc_lo
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v14
-; GFX10-NEXT: v_lshrrev_b32_e32 v37, 16, v37
-; GFX10-NEXT: v_lshrrev_b32_e32 v36, 16, v36
-; GFX10-NEXT: v_cndmask_b32_e32 v14, v38, v39, vcc_lo
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v17
-; GFX10-NEXT: v_lshrrev_b32_e32 v39, 16, v39
-; GFX10-NEXT: v_lshrrev_b32_e32 v38, 16, v38
-; GFX10-NEXT: v_cndmask_b32_e32 v17, v36, v37, vcc_lo
-; GFX10-NEXT: s_clause 0x1
-; GFX10-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:88
-; GFX10-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:24
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v15
-; GFX10-NEXT: v_cndmask_b32_e32 v15, v38, v39, vcc_lo
-; GFX10-NEXT: s_clause 0x1
-; GFX10-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:84
-; GFX10-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:20
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v10
-; GFX10-NEXT: s_waitcnt vmcnt(2)
-; GFX10-NEXT: v_cndmask_b32_e32 v10, v36, v37, vcc_lo
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v8
-; GFX10-NEXT: v_lshrrev_b32_e32 v37, 16, v37
-; GFX10-NEXT: v_lshrrev_b32_e32 v36, 16, v36
+; GFX10-NEXT: v_cndmask_b32_e64 v64, v14, v12, s19
+; GFX10-NEXT: v_lshrrev_b32_e32 v12, 16, v12
+; GFX10-NEXT: v_lshrrev_b32_e32 v14, 16, v14
+; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 1, v2
+; GFX10-NEXT: v_cndmask_b32_e32 v65, v1, v0, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v66, v6, v5, s20
+; GFX10-NEXT: v_cndmask_b32_e64 v67, v8, v7, s21
+; GFX10-NEXT: v_cndmask_b32_e64 v68, v10, v9, s22
+; GFX10-NEXT: v_cndmask_b32_e64 v10, v25, v23, s23
+; GFX10-NEXT: v_cndmask_b32_e64 v9, v32, v31, s24
+; GFX10-NEXT: v_cndmask_b32_e64 v8, v35, v34, s25
+; GFX10-NEXT: v_cndmask_b32_e64 v7, v30, v37, s26
+; GFX10-NEXT: v_cndmask_b32_e64 v6, v29, v39, s27
+; GFX10-NEXT: v_cndmask_b32_e64 v5, v28, v26, s28
+; GFX10-NEXT: v_cndmask_b32_e64 v20, v51, v20, s29
+; GFX10-NEXT: v_cndmask_b32_e64 v0, v14, v12, s30
+; GFX10-NEXT: v_cndmask_b32_e64 v1, v55, v16, vcc_hi
+; GFX10-NEXT: v_cndmask_b32_e64 v2, v53, v18, s31
+; GFX10-NEXT: v_cndmask_b32_e64 v12, v24, v22, s34
+; GFX10-NEXT: v_cndmask_b32_e64 v16, v4, v3, s4
+; GFX10-NEXT: v_perm_b32 v0, v0, v64, 0x5040100
+; GFX10-NEXT: v_perm_b32 v1, v1, v54, 0x5040100
+; GFX10-NEXT: v_perm_b32 v2, v2, v52, 0x5040100
+; GFX10-NEXT: v_perm_b32 v3, v20, v50, 0x5040100
+; GFX10-NEXT: v_perm_b32 v4, v12, v49, 0x5040100
+; GFX10-NEXT: v_perm_b32 v5, v5, v48, 0x5040100
+; GFX10-NEXT: v_perm_b32 v6, v6, v38, 0x5040100
+; GFX10-NEXT: v_perm_b32 v7, v7, v36, 0x5040100
+; GFX10-NEXT: v_perm_b32 v8, v8, v33, 0x5040100
+; GFX10-NEXT: v_perm_b32 v9, v9, v27, 0x5040100
+; GFX10-NEXT: v_perm_b32 v10, v10, v21, 0x5040100
+; GFX10-NEXT: v_perm_b32 v11, v68, v11, 0x5040100
+; GFX10-NEXT: v_perm_b32 v12, v67, v19, 0x5040100
+; GFX10-NEXT: v_perm_b32 v13, v66, v13, 0x5040100
+; GFX10-NEXT: v_perm_b32 v14, v65, v17, 0x5040100
+; GFX10-NEXT: v_perm_b32 v15, v16, v15, 0x5040100
+; GFX10-NEXT: v_readlane_b32 s34, v40, 2
+; GFX10-NEXT: v_readlane_b32 s31, v40, 1
+; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: s_or_saveexec_b32 s4, -1
+; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
+; GFX10-NEXT: s_waitcnt_depctr 0xffe3
+; GFX10-NEXT: s_mov_b32 exec_lo, s4
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_cndmask_b32_e32 v8, v38, v39, vcc_lo
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v6
-; GFX10-NEXT: v_lshrrev_b32_e32 v39, 16, v39
-; GFX10-NEXT: v_lshrrev_b32_e32 v38, 16, v38
-; GFX10-NEXT: v_cndmask_b32_e32 v6, v53, v48, vcc_lo
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v4
-; GFX10-NEXT: v_lshrrev_b32_e32 v48, 16, v48
-; GFX10-NEXT: v_cndmask_b32_e32 v4, v34, v52, vcc_lo
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2
-; GFX10-NEXT: v_lshrrev_b32_e32 v34, 16, v34
-; GFX10-NEXT: v_cndmask_b32_e32 v2, v32, v33, vcc_lo
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX10-NEXT: v_lshrrev_b32_e32 v33, 16, v33
-; GFX10-NEXT: v_lshrrev_b32_e32 v32, 16, v32
-; GFX10-NEXT: v_cndmask_b32_e32 v0, v19, v31, vcc_lo
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v11
-; GFX10-NEXT: v_lshrrev_b32_e32 v31, 16, v31
-; GFX10-NEXT: v_lshrrev_b32_e32 v19, 16, v19
-; GFX10-NEXT: v_cndmask_b32_e32 v11, v36, v37, vcc_lo
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v7
-; GFX10-NEXT: v_cndmask_b32_e32 v7, v49, v48, vcc_lo
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3
-; GFX10-NEXT: v_cndmask_b32_e32 v3, v32, v33, vcc_lo
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1
-; GFX10-NEXT: v_cndmask_b32_e32 v1, v19, v31, vcc_lo
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v5
-; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
-; GFX10-NEXT: v_cndmask_b32_e32 v5, v34, v50, vcc_lo
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v9
-; GFX10-NEXT: v_perm_b32 v1, v3, v2, 0x5040100
-; GFX10-NEXT: v_perm_b32 v3, v7, v6, 0x5040100
-; GFX10-NEXT: v_perm_b32 v6, v30, v12, 0x5040100
-; GFX10-NEXT: v_perm_b32 v2, v5, v4, 0x5040100
-; GFX10-NEXT: v_cndmask_b32_e32 v9, v38, v39, vcc_lo
-; GFX10-NEXT: v_perm_b32 v5, v11, v10, 0x5040100
-; GFX10-NEXT: v_perm_b32 v7, v15, v14, 0x5040100
-; GFX10-NEXT: v_perm_b32 v10, v21, v20, 0x5040100
-; GFX10-NEXT: v_perm_b32 v11, v22, v23, 0x5040100
-; GFX10-NEXT: v_perm_b32 v4, v9, v8, 0x5040100
-; GFX10-NEXT: v_perm_b32 v8, v17, v16, 0x5040100
-; GFX10-NEXT: v_perm_b32 v9, v13, v18, 0x5040100
-; GFX10-NEXT: v_perm_b32 v12, v24, v25, 0x5040100
-; GFX10-NEXT: v_perm_b32 v13, v26, v27, 0x5040100
-; GFX10-NEXT: v_perm_b32 v14, v28, v29, 0x5040100
-; GFX10-NEXT: v_perm_b32 v15, v35, v54, 0x5040100
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11TRUE16-LABEL: v_vselect_v32bf16:
@@ -39646,229 +39620,241 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11TRUE16-NEXT: s_clause 0x1f
; GFX11TRUE16-NEXT: scratch_load_u16 v31, off, s32
-; GFX11TRUE16-NEXT: scratch_load_b32 v32, off, s32 offset:128
-; GFX11TRUE16-NEXT: scratch_load_b32 v33, off, s32 offset:64
-; GFX11TRUE16-NEXT: scratch_load_b32 v34, off, s32 offset:124
-; GFX11TRUE16-NEXT: scratch_load_b32 v35, off, s32 offset:60
-; GFX11TRUE16-NEXT: scratch_load_b32 v36, off, s32 offset:120
-; GFX11TRUE16-NEXT: scratch_load_b32 v37, off, s32 offset:56
-; GFX11TRUE16-NEXT: scratch_load_b32 v38, off, s32 offset:116
-; GFX11TRUE16-NEXT: scratch_load_b32 v39, off, s32 offset:52
-; GFX11TRUE16-NEXT: scratch_load_b32 v48, off, s32 offset:112
-; GFX11TRUE16-NEXT: scratch_load_b32 v49, off, s32 offset:48
-; GFX11TRUE16-NEXT: scratch_load_b32 v50, off, s32 offset:108
-; GFX11TRUE16-NEXT: scratch_load_b32 v51, off, s32 offset:44
-; GFX11TRUE16-NEXT: scratch_load_b32 v52, off, s32 offset:104
-; GFX11TRUE16-NEXT: scratch_load_b32 v53, off, s32 offset:40
-; GFX11TRUE16-NEXT: scratch_load_b32 v54, off, s32 offset:100
-; GFX11TRUE16-NEXT: scratch_load_b32 v55, off, s32 offset:36
-; GFX11TRUE16-NEXT: scratch_load_b32 v64, off, s32 offset:96
-; GFX11TRUE16-NEXT: scratch_load_b32 v65, off, s32 offset:32
-; GFX11TRUE16-NEXT: scratch_load_b32 v66, off, s32 offset:92
-; GFX11TRUE16-NEXT: scratch_load_b32 v67, off, s32 offset:28
-; GFX11TRUE16-NEXT: scratch_load_b32 v68, off, s32 offset:88
-; GFX11TRUE16-NEXT: scratch_load_b32 v69, off, s32 offset:24
-; GFX11TRUE16-NEXT: scratch_load_b32 v70, off, s32 offset:84
-; GFX11TRUE16-NEXT: scratch_load_b32 v71, off, s32 offset:20
-; GFX11TRUE16-NEXT: scratch_load_b32 v80, off, s32 offset:80
-; GFX11TRUE16-NEXT: scratch_load_b32 v81, off, s32 offset:16
-; GFX11TRUE16-NEXT: scratch_load_b32 v82, off, s32 offset:76
-; GFX11TRUE16-NEXT: scratch_load_b32 v83, off, s32 offset:12
-; GFX11TRUE16-NEXT: scratch_load_b32 v84, off, s32 offset:72
-; GFX11TRUE16-NEXT: scratch_load_b32 v85, off, s32 offset:8
-; GFX11TRUE16-NEXT: scratch_load_b32 v86, off, s32 offset:68
-; GFX11TRUE16-NEXT: scratch_load_b32 v87, off, s32 offset:4
-; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX11TRUE16-NEXT: v_and_b32_e32 v8, 1, v8
-; GFX11TRUE16-NEXT: v_and_b32_e32 v22, 1, v22
-; GFX11TRUE16-NEXT: v_and_b32_e32 v24, 1, v24
-; GFX11TRUE16-NEXT: v_and_b32_e32 v26, 1, v26
-; GFX11TRUE16-NEXT: v_and_b32_e32 v28, 1, v28
+; GFX11TRUE16-NEXT: scratch_load_b32 v32, off, s32 offset:64
+; GFX11TRUE16-NEXT: scratch_load_b32 v33, off, s32 offset:128
+; GFX11TRUE16-NEXT: scratch_load_b32 v34, off, s32 offset:60
+; GFX11TRUE16-NEXT: scratch_load_b32 v35, off, s32 offset:124
+; GFX11TRUE16-NEXT: scratch_load_b32 v36, off, s32 offset:56
+; GFX11TRUE16-NEXT: scratch_load_b32 v37, off, s32 offset:120
+; GFX11TRUE16-NEXT: scratch_load_b32 v38, off, s32 offset:52
+; GFX11TRUE16-NEXT: scratch_load_b32 v39, off, s32 offset:116
+; GFX11TRUE16-NEXT: scratch_load_b32 v48, off, s32 offset:48
+; GFX11TRUE16-NEXT: scratch_load_b32 v49, off, s32 offset:112
+; GFX11TRUE16-NEXT: scratch_load_b32 v50, off, s32 offset:44
+; GFX11TRUE16-NEXT: scratch_load_b32 v51, off, s32 offset:108
+; GFX11TRUE16-NEXT: scratch_load_b32 v52, off, s32 offset:40
+; GFX11TRUE16-NEXT: scratch_load_b32 v53, off, s32 offset:104
+; GFX11TRUE16-NEXT: scratch_load_b32 v54, off, s32 offset:36
+; GFX11TRUE16-NEXT: scratch_load_b32 v55, off, s32 offset:100
+; GFX11TRUE16-NEXT: scratch_load_b32 v64, off, s32 offset:32
+; GFX11TRUE16-NEXT: scratch_load_b32 v65, off, s32 offset:96
+; GFX11TRUE16-NEXT: scratch_load_b32 v66, off, s32 offset:28
+; GFX11TRUE16-NEXT: scratch_load_b32 v67, off, s32 offset:92
+; GFX11TRUE16-NEXT: scratch_load_b32 v68, off, s32 offset:24
+; GFX11TRUE16-NEXT: scratch_load_b32 v69, off, s32 offset:88
+; GFX11TRUE16-NEXT: scratch_load_b32 v70, off, s32 offset:20
+; GFX11TRUE16-NEXT: scratch_load_b32 v71, off, s32 offset:84
+; GFX11TRUE16-NEXT: scratch_load_b32 v80, off, s32 offset:16
+; GFX11TRUE16-NEXT: scratch_load_b32 v81, off, s32 offset:80
+; GFX11TRUE16-NEXT: scratch_load_b32 v82, off, s32 offset:12
+; GFX11TRUE16-NEXT: scratch_load_b32 v83, off, s32 offset:76
+; GFX11TRUE16-NEXT: scratch_load_b32 v84, off, s32 offset:8
+; GFX11TRUE16-NEXT: scratch_load_b32 v85, off, s32 offset:72
+; GFX11TRUE16-NEXT: scratch_load_b32 v86, off, s32 offset:4
+; GFX11TRUE16-NEXT: scratch_load_b32 v87, off, s32 offset:68
; GFX11TRUE16-NEXT: v_and_b32_e32 v30, 1, v30
+; GFX11TRUE16-NEXT: v_and_b32_e32 v28, 1, v28
+; GFX11TRUE16-NEXT: v_and_b32_e32 v26, 1, v26
+; GFX11TRUE16-NEXT: v_and_b32_e32 v24, 1, v24
+; GFX11TRUE16-NEXT: v_and_b32_e32 v22, 1, v22
+; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v30
; GFX11TRUE16-NEXT: v_and_b32_e32 v1, 1, v1
+; GFX11TRUE16-NEXT: v_and_b32_e32 v20, 1, v20
+; GFX11TRUE16-NEXT: v_and_b32_e32 v18, 1, v18
+; GFX11TRUE16-NEXT: v_and_b32_e32 v16, 1, v16
+; GFX11TRUE16-NEXT: s_waitcnt vmcnt(31)
+; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v96, 16, v32
+; GFX11TRUE16-NEXT: s_waitcnt vmcnt(30)
+; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v32, v33, v32, vcc_lo
+; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v28
; GFX11TRUE16-NEXT: v_and_b32_e32 v3, 1, v3
-; GFX11TRUE16-NEXT: v_and_b32_e32 v2, 1, v2
+; GFX11TRUE16-NEXT: s_waitcnt vmcnt(29)
+; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v98, 16, v34
+; GFX11TRUE16-NEXT: s_waitcnt vmcnt(27)
+; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v100, 16, v36
+; GFX11TRUE16-NEXT: s_waitcnt vmcnt(25)
+; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v102, 16, v38
+; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v34, v35, v34, vcc_lo
+; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v26
; GFX11TRUE16-NEXT: v_and_b32_e32 v5, 1, v5
-; GFX11TRUE16-NEXT: v_and_b32_e32 v4, 1, v4
+; GFX11TRUE16-NEXT: s_waitcnt vmcnt(21)
+; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v114, 16, v50
+; GFX11TRUE16-NEXT: s_waitcnt vmcnt(20)
+; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v115, 16, v51
+; GFX11TRUE16-NEXT: s_waitcnt vmcnt(19)
+; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v116, 16, v52
+; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v26, v37, v36, vcc_lo
+; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v24
; GFX11TRUE16-NEXT: v_and_b32_e32 v7, 1, v7
+; GFX11TRUE16-NEXT: s_waitcnt vmcnt(17)
+; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v118, 16, v54
+; GFX11TRUE16-NEXT: s_waitcnt vmcnt(16)
+; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v119, 16, v55
+; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v117, 16, v53
+; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v24, v39, v38, vcc_lo
+; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v22
; GFX11TRUE16-NEXT: v_and_b32_e32 v9, 1, v9
+; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v112, 16, v48
+; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v113, 16, v49
+; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v103, 16, v39
+; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v22, v49, v48, vcc_lo
+; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v20
; GFX11TRUE16-NEXT: v_and_b32_e32 v11, 1, v11
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v48.l, v115.l
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v49.l, v114.l
; GFX11TRUE16-NEXT: v_and_b32_e32 v10, 1, v10
+; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v20, v51, v50, vcc_lo
+; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v18
; GFX11TRUE16-NEXT: v_and_b32_e32 v13, 1, v13
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v50.l, v117.l
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v51.l, v116.l
; GFX11TRUE16-NEXT: v_and_b32_e32 v12, 1, v12
-; GFX11TRUE16-NEXT: v_and_b32_e32 v15, 1, v15
+; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v18, v53, v52, vcc_lo
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v52.l, v119.l
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v53.l, v118.l
; GFX11TRUE16-NEXT: v_and_b32_e32 v14, 1, v14
+; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v16
+; GFX11TRUE16-NEXT: v_and_b32_e32 v15, 1, v15
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v38.l, v113.l
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v39.l, v112.l
+; GFX11TRUE16-NEXT: v_and_b32_e32 v8, 1, v8
+; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v16, v55, v54, vcc_lo
+; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v14
; GFX11TRUE16-NEXT: v_and_b32_e32 v17, 1, v17
-; GFX11TRUE16-NEXT: v_and_b32_e32 v16, 1, v16
-; GFX11TRUE16-NEXT: v_and_b32_e32 v19, 1, v19
-; GFX11TRUE16-NEXT: v_and_b32_e32 v18, 1, v18
-; GFX11TRUE16-NEXT: v_and_b32_e32 v21, 1, v21
-; GFX11TRUE16-NEXT: v_and_b32_e32 v20, 1, v20
-; GFX11TRUE16-NEXT: v_and_b32_e32 v23, 1, v23
-; GFX11TRUE16-NEXT: v_and_b32_e32 v25, 1, v25
-; GFX11TRUE16-NEXT: v_and_b32_e32 v27, 1, v27
-; GFX11TRUE16-NEXT: v_and_b32_e32 v29, 1, v29
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s0, 1, v0
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s8, 1, v8
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s22, 1, v22
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s24, 1, v24
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s26, 1, v30
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s27, 1, v26
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s29, 1, v28
+; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v101, 16, v37
; GFX11TRUE16-NEXT: v_and_b32_e32 v6, 1, v6
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s1, 1, v3
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s2, 1, v2
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s3, 1, v5
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s4, 1, v4
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s5, 1, v7
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s7, 1, v9
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s9, 1, v11
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s10, 1, v10
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s11, 1, v13
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s12, 1, v12
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s13, 1, v15
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s14, 1, v14
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s15, 1, v17
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s16, 1, v16
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s17, 1, v19
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s18, 1, v18
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s19, 1, v21
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s20, 1, v20
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s21, 1, v23
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s23, 1, v25
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s25, 1, v27
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s28, 1, v29
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s6, 1, v6
-; GFX11TRUE16-NEXT: s_waitcnt vmcnt(32)
-; GFX11TRUE16-NEXT: v_and_b32_e32 v8, 1, v31
-; GFX11TRUE16-NEXT: s_waitcnt vmcnt(31)
-; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v17, 16, v32
-; GFX11TRUE16-NEXT: s_waitcnt vmcnt(30)
-; GFX11TRUE16-NEXT: v_cndmask_b16 v0.l, v32.l, v33.l, s26
-; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v16, 16, v33
-; GFX11TRUE16-NEXT: s_waitcnt vmcnt(28)
-; GFX11TRUE16-NEXT: v_cndmask_b16 v0.h, v34.l, v35.l, s29
-; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v9, 16, v35
-; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v10, 16, v34
-; GFX11TRUE16-NEXT: s_waitcnt vmcnt(26)
-; GFX11TRUE16-NEXT: v_cndmask_b16 v1.l, v36.l, v37.l, s27
-; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v11, 16, v37
-; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v12, 16, v36
-; GFX11TRUE16-NEXT: s_waitcnt vmcnt(24)
-; GFX11TRUE16-NEXT: v_cndmask_b16 v1.h, v38.l, v39.l, s24
-; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v13, 16, v39
-; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v14, 16, v38
-; GFX11TRUE16-NEXT: s_waitcnt vmcnt(22)
-; GFX11TRUE16-NEXT: v_cndmask_b16 v2.l, v48.l, v49.l, s22
-; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v15, 16, v49
-; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v18, 16, v48
-; GFX11TRUE16-NEXT: s_waitcnt vmcnt(18)
-; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v21, 16, v53
-; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v22, 16, v52
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v36.l, v103.l
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(14)
-; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v25, 16, v65
-; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v64
-; GFX11TRUE16-NEXT: v_cndmask_b16 v2.h, v50.l, v51.l, s20
-; GFX11TRUE16-NEXT: s_waitcnt vmcnt(11)
-; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v68
+; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v14, v65, v64, vcc_lo
+; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v12
+; GFX11TRUE16-NEXT: v_and_b32_e32 v19, 1, v19
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v37.l, v102.l
+; GFX11TRUE16-NEXT: v_and_b32_e32 v4, 1, v4
+; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v97, 16, v33
+; GFX11TRUE16-NEXT: s_waitcnt vmcnt(12)
+; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v12, v67, v66, vcc_lo
+; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v10
+; GFX11TRUE16-NEXT: v_and_b32_e32 v21, 1, v21
+; GFX11TRUE16-NEXT: v_and_b32_e32 v2, 1, v2
+; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v99, 16, v35
+; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 1, v0
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(10)
-; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v69
-; GFX11TRUE16-NEXT: s_waitcnt vmcnt(9)
-; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v70
+; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v10, v69, v68, vcc_lo
+; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v8
+; GFX11TRUE16-NEXT: v_and_b32_e32 v23, 1, v23
+; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v128, 16, v64
+; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v129, 16, v65
+; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v130, 16, v66
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(8)
-; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v71
-; GFX11TRUE16-NEXT: s_waitcnt vmcnt(7)
-; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v80
+; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v8, v71, v70, vcc_lo
+; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v6
+; GFX11TRUE16-NEXT: v_and_b32_e32 v25, 1, v25
+; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v131, 16, v67
+; GFX11TRUE16-NEXT: v_mov_b16_e64 v54.l, v129.l
+; GFX11TRUE16-NEXT: v_mov_b16_e64 v55.l, v128.l
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(6)
-; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v81
-; GFX11TRUE16-NEXT: s_waitcnt vmcnt(5)
-; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v82
+; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v6, v81, v80, vcc_lo
+; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v4
+; GFX11TRUE16-NEXT: v_and_b32_e32 v27, 1, v27
+; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v132, 16, v68
+; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v133, 16, v69
+; GFX11TRUE16-NEXT: v_mov_b16_e64 v64.l, v131.l
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(4)
-; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v83
-; GFX11TRUE16-NEXT: s_waitcnt vmcnt(3)
-; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v84
+; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v4, v83, v82, vcc_lo
+; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2
+; GFX11TRUE16-NEXT: v_and_b32_e32 v29, 1, v29
+; GFX11TRUE16-NEXT: v_mov_b16_e64 v65.l, v130.l
+; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v134, 16, v70
+; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v135, 16, v71
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(2)
-; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v85
-; GFX11TRUE16-NEXT: s_waitcnt vmcnt(1)
-; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v86
+; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v2, v85, v84, vcc_lo
+; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
+; GFX11TRUE16-NEXT: v_and_b32_e32 v31, 1, v31
+; GFX11TRUE16-NEXT: v_mov_b16_e64 v66.l, v133.l
+; GFX11TRUE16-NEXT: v_mov_b16_e64 v67.l, v132.l
+; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v144, 16, v80
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11TRUE16-NEXT: v_cndmask_b16 v7.h, v86.l, v87.l, s0
-; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v87
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s0, 1, v8
-; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v19, 16, v51
-; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v20, 16, v50
-; GFX11TRUE16-NEXT: v_cndmask_b16 v3.l, v52.l, v53.l, s18
-; GFX11TRUE16-NEXT: v_cndmask_b16 v3.h, v54.l, v55.l, s16
-; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v23, 16, v55
-; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v24, 16, v54
-; GFX11TRUE16-NEXT: v_cndmask_b16 v4.l, v64.l, v65.l, s14
-; GFX11TRUE16-NEXT: v_cndmask_b16 v4.h, v66.l, v67.l, s12
-; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v27, 16, v67
-; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v66
-; GFX11TRUE16-NEXT: v_cndmask_b16 v5.h, v70.l, v71.l, s8
-; GFX11TRUE16-NEXT: v_cndmask_b16 v6.h, v82.l, v83.l, s4
-; GFX11TRUE16-NEXT: v_cndmask_b16 v8.l, v10.l, v9.l, s28
-; GFX11TRUE16-NEXT: v_cndmask_b16 v8.h, v12.l, v11.l, s25
-; GFX11TRUE16-NEXT: v_cndmask_b16 v9.l, v14.l, v13.l, s23
-; GFX11TRUE16-NEXT: v_cndmask_b16 v9.h, v18.l, v15.l, s21
-; GFX11TRUE16-NEXT: v_cndmask_b16 v10.h, v22.l, v21.l, s17
-; GFX11TRUE16-NEXT: v_cndmask_b16 v11.h, v26.l, v25.l, s13
-; GFX11TRUE16-NEXT: v_cndmask_b16 v12.h, v30.l, v29.l, s9
-; GFX11TRUE16-NEXT: v_cndmask_b16 v13.l, v32.l, v31.l, s7
-; GFX11TRUE16-NEXT: v_cndmask_b16 v13.h, v34.l, v33.l, s5
-; GFX11TRUE16-NEXT: v_cndmask_b16 v14.l, v36.l, v35.l, s3
-; GFX11TRUE16-NEXT: v_cndmask_b16 v14.h, v38.l, v37.l, s1
-; GFX11TRUE16-NEXT: v_cndmask_b16 v15.l, v48.l, v39.l, vcc_lo
-; GFX11TRUE16-NEXT: v_cndmask_b16 v15.h, v17.l, v16.l, s0
-; GFX11TRUE16-NEXT: v_cndmask_b16 v5.l, v68.l, v69.l, s10
-; GFX11TRUE16-NEXT: v_cndmask_b16 v6.l, v80.l, v81.l, s6
-; GFX11TRUE16-NEXT: v_cndmask_b16 v7.l, v84.l, v85.l, s2
-; GFX11TRUE16-NEXT: v_cndmask_b16 v10.l, v20.l, v19.l, s19
-; GFX11TRUE16-NEXT: v_cndmask_b16 v11.l, v24.l, v23.l, s15
-; GFX11TRUE16-NEXT: v_cndmask_b16 v12.l, v28.l, v27.l, s11
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v18.l, v7.h
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v19.l, v6.h
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v20.l, v5.h
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v21.l, v4.h
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v22.l, v4.l
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v23.l, v3.h
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v24.l, v3.l
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v25.l, v2.h
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v26.l, v2.l
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v27.l, v1.h
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v28.l, v1.l
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v29.l, v0.h
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v30.l, v0.l
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v15.l
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, v14.h
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.l, v14.l
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v3.l, v13.h
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v4.l, v13.l
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v13.l, v12.h
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v14.l, v11.h
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v16.l, v10.h
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v17.l, v9.h
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v31.l, v9.l
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v32.l, v8.h
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v33.l, v8.l
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v15.l, v15.h
-; GFX11TRUE16-NEXT: v_perm_b32 v0, v0, v18, 0x5040100
-; GFX11TRUE16-NEXT: v_perm_b32 v1, v1, v7, 0x5040100
-; GFX11TRUE16-NEXT: v_perm_b32 v2, v2, v19, 0x5040100
-; GFX11TRUE16-NEXT: v_perm_b32 v3, v3, v6, 0x5040100
-; GFX11TRUE16-NEXT: v_perm_b32 v4, v4, v20, 0x5040100
-; GFX11TRUE16-NEXT: v_perm_b32 v5, v13, v5, 0x5040100
-; GFX11TRUE16-NEXT: v_perm_b32 v6, v12, v21, 0x5040100
-; GFX11TRUE16-NEXT: v_perm_b32 v7, v14, v22, 0x5040100
-; GFX11TRUE16-NEXT: v_perm_b32 v8, v11, v23, 0x5040100
-; GFX11TRUE16-NEXT: v_perm_b32 v9, v16, v24, 0x5040100
-; GFX11TRUE16-NEXT: v_perm_b32 v10, v10, v25, 0x5040100
-; GFX11TRUE16-NEXT: v_perm_b32 v11, v17, v26, 0x5040100
-; GFX11TRUE16-NEXT: v_perm_b32 v12, v31, v27, 0x5040100
-; GFX11TRUE16-NEXT: v_perm_b32 v13, v32, v28, 0x5040100
-; GFX11TRUE16-NEXT: v_perm_b32 v14, v33, v29, 0x5040100
-; GFX11TRUE16-NEXT: v_perm_b32 v15, v15, v30, 0x5040100
+; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v87, v86, vcc_lo
+; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v31
+; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v145, 16, v81
+; GFX11TRUE16-NEXT: v_mov_b16_e64 v68.l, v135.l
+; GFX11TRUE16-NEXT: v_mov_b16_e64 v69.l, v134.l
+; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v146, 16, v82
+; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v31, v97, v96, vcc_lo
+; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v29
+; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v147, 16, v83
+; GFX11TRUE16-NEXT: v_mov_b16_e64 v70.l, v145.l
+; GFX11TRUE16-NEXT: v_mov_b16_e64 v71.l, v144.l
+; GFX11TRUE16-NEXT: v_mov_b16_e64 v81.l, v146.l
+; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v29, v99, v98, vcc_lo
+; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v27
+; GFX11TRUE16-NEXT: v_mov_b16_e64 v80.l, v147.l
+; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v84
+; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v85
+; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v86
+; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v27, v101, v100, vcc_lo
+; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v25
+; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v87
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v14.h, v29.l
+; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v25, v36, v37, vcc_lo
+; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v23
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2)
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v12.h, v25.l
+; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v23, v38, v39, vcc_lo
+; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v21
+; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v21, v48, v49, vcc_lo
+; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v19
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v10.h, v21.l
+; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v19, v50, v51, vcc_lo
+; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v17
+; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v17, v52, v53, vcc_lo
+; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v15
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v15.l, v32.l
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v15.h, v31.l
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v8.h, v17.l
+; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v36, v54, v55, vcc_lo
+; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v13
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v13.l, v26.l
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v13.h, v27.l
+; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v37, v64, v65, vcc_lo
+; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v11
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v11.l, v22.l
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v11.h, v23.l
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v6.h, v37.l
+; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v38, v66, v67, vcc_lo
+; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v9
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v9.l, v18.l
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v9.h, v19.l
+; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v39, v68, v69, vcc_lo
+; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v7
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v7.l, v14.l
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v14.l, v34.l
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v7.h, v36.l
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v4.h, v39.l
+; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v48, v70, v71, vcc_lo
+; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v5
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v5.l, v10.l
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v10.l, v20.l
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v5.h, v38.l
+; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v49, v80, v81, vcc_lo
+; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v3.l, v6.l
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v6.l, v12.l
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v12.l, v24.l
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.h, v49.l
+; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v30, v33, v30, vcc_lo
+; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.l
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.l, v4.l
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v4.l, v8.l
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v8.l, v16.l
+; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v28, v35, v28, vcc_lo
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.h, v30.l
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v3.h, v48.l
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.h, v28.l
; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11FAKE16-LABEL: v_vselect_v32bf16:
@@ -40527,8 +40513,8 @@ define <3 x bfloat> @v_fma_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b, <3 x bfloat>
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v3, v0, v8, vcc_lo
; GFX11TRUE16-NEXT: v_perm_b32 v0, v2, v1, 0x7060302
-; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11TRUE16-NEXT: v_alignbit_b32 v1, v0, v3, 16
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v3
; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11FAKE16-LABEL: v_fma_v3bf16:
@@ -41498,7 +41484,7 @@ define <3 x bfloat> @v_fmuladd_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b, <3 x bfl
; GFX11TRUE16-NEXT: v_perm_b32 v0, v0, v2, 0x7060302
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11TRUE16-NEXT: v_alignbit_b32 v1, v0, v1, 16
+; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11FAKE16-LABEL: v_fmuladd_v3bf16:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.ldexp.ll b/llvm/test/CodeGen/AMDGPU/llvm.ldexp.ll
index ff1c3da1d5fe5a..e5df5d3e77a239 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.ldexp.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.ldexp.ll
@@ -480,8 +480,11 @@ define <2 x half> @test_ldexp_v2f16_v2i32(<2 x half> %a, <2 x i32> %b) {
; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v0.h, v3.l, v2.l
; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v0.l, v0.l, v1.l
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT: v_pack_b32_f16 v0, v0.l, v0.h
+; GFX11-SDAG-TRUE16-NEXT: v_pack_b32_f16 v0, v1, v0
; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-SDAG-FAKE16-LABEL: test_ldexp_v2f16_v2i32:
@@ -542,14 +545,8 @@ define <2 x half> @test_ldexp_v2f16_v2i32(<2 x half> %a, <2 x i32> %b) {
; GFX11-GISEL-TRUE16-NEXT: v_med3_i32 v1, 0xffff8000, v1, v3
; GFX11-GISEL-TRUE16-NEXT: v_med3_i32 v2, 0xffff8000, v2, v3
; GFX11-GISEL-TRUE16-NEXT: v_ldexp_f16_e32 v0.l, v0.l, v1.l
-; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-GISEL-TRUE16-NEXT: v_ldexp_f16_e32 v0.h, v4.l, v2.l
-; GFX11-GISEL-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l
-; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-GISEL-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
-; GFX11-GISEL-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-GISEL-TRUE16-NEXT: v_lshl_or_b32 v0, v0, 16, v1
; GFX11-GISEL-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-GISEL-FAKE16-LABEL: test_ldexp_v2f16_v2i32:
@@ -607,9 +604,12 @@ define <2 x half> @test_ldexp_v2f16_v2i16(<2 x half> %a, <2 x i16> %b) {
; GFX11-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v1
; GFX11-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v0
; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v0.l, v0.l, v1.l
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v0.h, v3.l, v2.l
-; GFX11-SDAG-TRUE16-NEXT: v_pack_b32_f16 v0, v0.l, v0.h
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
+; GFX11-SDAG-TRUE16-NEXT: v_pack_b32_f16 v0, v1, v0
; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-SDAG-FAKE16-LABEL: test_ldexp_v2f16_v2i16:
@@ -658,14 +658,8 @@ define <2 x half> @test_ldexp_v2f16_v2i16(<2 x half> %a, <2 x i16> %b) {
; GFX11-GISEL-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v0
; GFX11-GISEL-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v1
; GFX11-GISEL-TRUE16-NEXT: v_ldexp_f16_e32 v0.l, v0.l, v1.l
-; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-GISEL-TRUE16-NEXT: v_ldexp_f16_e32 v0.h, v2.l, v3.l
-; GFX11-GISEL-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l
-; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-GISEL-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
-; GFX11-GISEL-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-GISEL-TRUE16-NEXT: v_lshl_or_b32 v0, v0, 16, v1
; GFX11-GISEL-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-GISEL-FAKE16-LABEL: test_ldexp_v2f16_v2i16:
@@ -734,13 +728,16 @@ define <3 x half> @test_ldexp_v3f16_v3i32(<3 x half> %a, <3 x i32> %b) {
; GFX11-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 16, v0
; GFX11-SDAG-TRUE16-NEXT: v_med3_i32 v3, v3, s0, 0x7fff
; GFX11-SDAG-TRUE16-NEXT: v_med3_i32 v2, v2, s0, 0x7fff
-; GFX11-SDAG-TRUE16-NEXT: v_med3_i32 v4, v4, s0, 0x7fff
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v0.h, v5.l, v3.l
; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v0.l, v0.l, v2.l
+; GFX11-SDAG-TRUE16-NEXT: v_med3_i32 v3, v4, s0, 0x7fff
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v1.l, v1.l, v4.l
-; GFX11-SDAG-TRUE16-NEXT: v_pack_b32_f16 v0, v0.l, v0.h
+; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v1.l, v1.l, v3.l
+; GFX11-SDAG-TRUE16-NEXT: v_pack_b32_f16 v0, v2, v0
; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-SDAG-FAKE16-LABEL: test_ldexp_v3f16_v3i32:
@@ -806,20 +803,14 @@ define <3 x half> @test_ldexp_v3f16_v3i32(<3 x half> %a, <3 x i32> %b) {
; GFX11-GISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-GISEL-TRUE16-NEXT: v_mov_b32_e32 v5, 0x7fff
; GFX11-GISEL-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v0
-; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX11-GISEL-TRUE16-NEXT: v_med3_i32 v2, 0xffff8000, v2, v5
; GFX11-GISEL-TRUE16-NEXT: v_med3_i32 v3, 0xffff8000, v3, v5
+; GFX11-GISEL-TRUE16-NEXT: v_med3_i32 v4, 0xffff8000, v4, v5
; GFX11-GISEL-TRUE16-NEXT: v_ldexp_f16_e32 v0.l, v0.l, v2.l
-; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-GISEL-TRUE16-NEXT: v_ldexp_f16_e32 v0.h, v6.l, v3.l
-; GFX11-GISEL-TRUE16-NEXT: v_med3_i32 v3, 0xffff8000, v4, v5
-; GFX11-GISEL-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l
; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-GISEL-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
-; GFX11-GISEL-TRUE16-NEXT: v_ldexp_f16_e32 v1.l, v1.l, v3.l
-; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-GISEL-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-GISEL-TRUE16-NEXT: v_lshl_or_b32 v0, v0, 16, v2
+; GFX11-GISEL-TRUE16-NEXT: v_ldexp_f16_e32 v0.h, v6.l, v3.l
+; GFX11-GISEL-TRUE16-NEXT: v_ldexp_f16_e32 v1.l, v1.l, v4.l
; GFX11-GISEL-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-GISEL-FAKE16-LABEL: test_ldexp_v3f16_v3i32:
@@ -886,9 +877,12 @@ define <3 x half> @test_ldexp_v3f16_v3i16(<3 x half> %a, <3 x i16> %b) {
; GFX11-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 16, v0
; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v0.l, v0.l, v2.l
; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v1.l, v1.l, v3.l
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v0.h, v5.l, v4.l
-; GFX11-SDAG-TRUE16-NEXT: v_pack_b32_f16 v0, v0.l, v0.h
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
+; GFX11-SDAG-TRUE16-NEXT: v_pack_b32_f16 v0, v2, v0
; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-SDAG-FAKE16-LABEL: test_ldexp_v3f16_v3i16:
@@ -945,14 +939,8 @@ define <3 x half> @test_ldexp_v3f16_v3i16(<3 x half> %a, <3 x i16> %b) {
; GFX11-GISEL-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 16, v2
; GFX11-GISEL-TRUE16-NEXT: v_ldexp_f16_e32 v0.l, v0.l, v2.l
; GFX11-GISEL-TRUE16-NEXT: v_ldexp_f16_e32 v1.l, v1.l, v3.l
-; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-GISEL-TRUE16-NEXT: v_ldexp_f16_e32 v0.h, v4.l, v5.l
-; GFX11-GISEL-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l
-; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-GISEL-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
-; GFX11-GISEL-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-GISEL-TRUE16-NEXT: v_lshl_or_b32 v0, v0, 16, v2
; GFX11-GISEL-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-GISEL-FAKE16-LABEL: test_ldexp_v3f16_v3i16:
@@ -1028,21 +1016,27 @@ define <4 x half> @test_ldexp_v4f16_v4i32(<4 x half> %a, <4 x i32> %b) {
; GFX11-SDAG-TRUE16: ; %bb.0:
; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-SDAG-TRUE16-NEXT: s_movk_i32 s0, 0x8000
-; GFX11-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v1
+; GFX11-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v0
; GFX11-SDAG-TRUE16-NEXT: v_med3_i32 v5, v5, s0, 0x7fff
; GFX11-SDAG-TRUE16-NEXT: v_med3_i32 v3, v3, s0, 0x7fff
-; GFX11-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 16, v0
; GFX11-SDAG-TRUE16-NEXT: v_med3_i32 v2, v2, s0, 0x7fff
; GFX11-SDAG-TRUE16-NEXT: v_med3_i32 v4, v4, s0, 0x7fff
-; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v1.h, v6.l, v5.l
+; GFX11-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 16, v1
; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v0.h, v7.l, v3.l
+; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v0.h, v6.l, v3.l
; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v0.l, v0.l, v2.l
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v1.l, v1.l, v4.l
-; GFX11-SDAG-TRUE16-NEXT: v_pack_b32_f16 v0, v0.l, v0.h
+; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v1.h, v7.l, v5.l
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.l, v1.l
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h
+; GFX11-SDAG-TRUE16-NEXT: v_pack_b32_f16 v0, v2, v0
; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-SDAG-TRUE16-NEXT: v_pack_b32_f16 v1, v1.l, v1.h
+; GFX11-SDAG-TRUE16-NEXT: v_pack_b32_f16 v1, v3, v1
; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-SDAG-FAKE16-LABEL: test_ldexp_v4f16_v4i32:
@@ -1125,27 +1119,15 @@ define <4 x half> @test_ldexp_v4f16_v4i32(<4 x half> %a, <4 x i32> %b) {
; GFX11-GISEL-TRUE16-NEXT: v_lshrrev_b32_e32 v8, 16, v1
; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4)
; GFX11-GISEL-TRUE16-NEXT: v_med3_i32 v2, 0xffff8000, v2, v6
-; GFX11-GISEL-TRUE16-NEXT: v_med3_i32 v4, 0xffff8000, v4, v6
; GFX11-GISEL-TRUE16-NEXT: v_med3_i32 v3, 0xffff8000, v3, v6
+; GFX11-GISEL-TRUE16-NEXT: v_med3_i32 v4, 0xffff8000, v4, v6
; GFX11-GISEL-TRUE16-NEXT: v_med3_i32 v5, 0xffff8000, v5, v6
; GFX11-GISEL-TRUE16-NEXT: v_ldexp_f16_e32 v0.l, v0.l, v2.l
; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-GISEL-TRUE16-NEXT: v_ldexp_f16_e32 v0.h, v1.l, v4.l
-; GFX11-GISEL-TRUE16-NEXT: v_ldexp_f16_e32 v1.l, v7.l, v3.l
-; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-GISEL-TRUE16-NEXT: v_ldexp_f16_e32 v0.h, v7.l, v3.l
+; GFX11-GISEL-TRUE16-NEXT: v_ldexp_f16_e32 v1.l, v1.l, v4.l
+; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GFX11-GISEL-TRUE16-NEXT: v_ldexp_f16_e32 v1.h, v8.l, v5.l
-; GFX11-GISEL-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l
-; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-GISEL-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
-; GFX11-GISEL-TRUE16-NEXT: v_mov_b16_e32 v3.l, v1.l
-; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-GISEL-TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h
-; GFX11-GISEL-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-GISEL-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v0
-; GFX11-GISEL-TRUE16-NEXT: v_lshl_or_b32 v0, v3, 16, v2
-; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-GISEL-TRUE16-NEXT: v_lshl_or_b32 v1, v1, 16, v4
; GFX11-GISEL-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-GISEL-FAKE16-LABEL: test_ldexp_v4f16_v4i32:
@@ -1227,14 +1209,20 @@ define <4 x half> @test_ldexp_v4f16_v4i16(<4 x half> %a, <4 x i16> %b) {
; GFX11-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 16, v2
; GFX11-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v0
; GFX11-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 16, v1
-; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v1.l, v1.l, v3.l
; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v0.l, v0.l, v2.l
+; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v0.h, v1.l, v3.l
; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v0.h, v6.l, v5.l
+; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v1.l, v6.l, v5.l
; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v1.h, v7.l, v4.l
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.h
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.l
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h
; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-SDAG-TRUE16-NEXT: v_pack_b32_f16 v0, v0.l, v0.h
-; GFX11-SDAG-TRUE16-NEXT: v_pack_b32_f16 v1, v1.l, v1.h
+; GFX11-SDAG-TRUE16-NEXT: v_pack_b32_f16 v0, v2, v0
+; GFX11-SDAG-TRUE16-NEXT: v_pack_b32_f16 v1, v3, v1
; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-SDAG-FAKE16-LABEL: test_ldexp_v4f16_v4i16:
@@ -1305,22 +1293,10 @@ define <4 x half> @test_ldexp_v4f16_v4i16(<4 x half> %a, <4 x i16> %b) {
; GFX11-GISEL-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v2
; GFX11-GISEL-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 16, v3
; GFX11-GISEL-TRUE16-NEXT: v_ldexp_f16_e32 v0.l, v0.l, v2.l
-; GFX11-GISEL-TRUE16-NEXT: v_ldexp_f16_e32 v0.h, v1.l, v3.l
+; GFX11-GISEL-TRUE16-NEXT: v_ldexp_f16_e32 v1.l, v1.l, v3.l
; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-GISEL-TRUE16-NEXT: v_ldexp_f16_e32 v1.l, v4.l, v6.l
+; GFX11-GISEL-TRUE16-NEXT: v_ldexp_f16_e32 v0.h, v4.l, v6.l
; GFX11-GISEL-TRUE16-NEXT: v_ldexp_f16_e32 v1.h, v5.l, v7.l
-; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-GISEL-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l
-; GFX11-GISEL-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
-; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-GISEL-TRUE16-NEXT: v_mov_b16_e32 v3.l, v1.l
-; GFX11-GISEL-TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h
-; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-GISEL-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-GISEL-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v0
-; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-GISEL-TRUE16-NEXT: v_lshl_or_b32 v0, v3, 16, v2
-; GFX11-GISEL-TRUE16-NEXT: v_lshl_or_b32 v1, v1, 16, v4
; GFX11-GISEL-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-GISEL-FAKE16-LABEL: test_ldexp_v4f16_v4i16:
>From b9bbb433cf5e13b2c439e5eccba28dc7ac3f4501 Mon Sep 17 00:00:00 2001
From: guochen2 <guochen2 at amd.com>
Date: Mon, 3 Feb 2025 18:08:16 -0500
Subject: [PATCH 2/2] patch2
---
.../AMDGPU/AMDGPUInstructionSelector.cpp | 15 +-
llvm/lib/Target/AMDGPU/SIInstructions.td | 3 +-
llvm/test/CodeGen/AMDGPU/bf16.ll | 7197 ++++++++---------
llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll | 49 +-
llvm/test/CodeGen/AMDGPU/llvm.ldexp.ll | 72 +-
.../CodeGen/AMDGPU/shrink-add-sub-constant.ll | 51 +-
llvm/test/CodeGen/AMDGPU/strict_fptrunc.ll | 20 +-
7 files changed, 3662 insertions(+), 3745 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index 4b265450d38b51..12596895ace71a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -782,22 +782,9 @@ bool AMDGPUInstructionSelector::selectG_BUILD_VECTOR(MachineInstr &MI) const {
return true;
// TODO: This should probably be a combine somewhere
+ // (build_vector $src0, undef) -> copy $src0
MachineInstr *Src1Def = getDefIgnoringCopies(Src1, *MRI);
if (Src1Def->getOpcode() == AMDGPU::G_IMPLICIT_DEF) {
- if (Subtarget->useRealTrue16Insts() && IsVector) {
- // (vecTy (DivergentBinFrag<build_vector> Ty:$src0, (Ty undef))),
- // -> (vecTy (INSERT_SUBREG (IMPLICIT_DEF), VGPR_16:$src0, lo16))
- Register Undef = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
- BuildMI(*BB, &MI, DL, TII.get(AMDGPU::IMPLICIT_DEF), Undef);
- BuildMI(*BB, &MI, DL, TII.get(TargetOpcode::INSERT_SUBREG), Dst)
- .addReg(Undef)
- .addReg(Src0)
- .addImm(AMDGPU::lo16);
- MI.eraseFromParent();
- return RBI.constrainGenericRegister(Dst, AMDGPU::VGPR_32RegClass, *MRI) &&
- RBI.constrainGenericRegister(Src0, AMDGPU::VGPR_16RegClass, *MRI);
- }
- // (build_vector $src0, undef) -> copy $src0
MI.setDesc(TII.get(AMDGPU::COPY));
MI.removeOperand(2);
const auto &RC =
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 50b9f895b3b476..d113deea501581 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -3411,9 +3411,8 @@ def : GCNPat <
(vecTy (DivergentBinFrag<build_vector> (Ty VGPR_16:$a), (Ty VGPR_16:$b))),
(REG_SEQUENCE VGPR_32, VGPR_16:$a, lo16, VGPR_16:$b, hi16)
>;
-// GISel ignores this Pat, but the equivalent is done in selectG_BUILD_VECTOR
def : GCNPat <
- (vecTy (build_vector (Ty VGPR_16:$src0), (Ty undef))),
+ (vecTy (DivergentBinFrag<build_vector> (Ty VGPR_16:$src0), (Ty undef))),
(REG_SEQUENCE VGPR_32, $src0, lo16, (IMPLICIT_DEF), hi16)
>;
}
diff --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll
index 230a62f447938c..d10491e2fc8799 100644
--- a/llvm/test/CodeGen/AMDGPU/bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/bf16.ll
@@ -662,14 +662,12 @@ define <64 x bfloat> @v_load_global_v64bf16(ptr addrspace(1) %ptr) {
; GCN-NEXT: v_add_i32_e32 v10, vcc, 0x70, v0
; GCN-NEXT: v_add_i32_e32 v11, vcc, 0x6c, v0
; GCN-NEXT: v_add_i32_e32 v12, vcc, 0x68, v0
-; GCN-NEXT: v_add_i32_e32 v13, vcc, 0x64, v0
-; GCN-NEXT: v_add_i32_e32 v14, vcc, 0x60, v0
; GCN-NEXT: s_mov_b32 s4, s6
; GCN-NEXT: s_mov_b32 s5, s6
; GCN-NEXT: buffer_load_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:112
+; GCN-NEXT: v_add_i32_e32 v13, vcc, 0x64, v0
+; GCN-NEXT: v_add_i32_e32 v14, vcc, 0x60, v0
; GCN-NEXT: v_add_i32_e32 v15, vcc, 0x5c, v0
-; GCN-NEXT: v_add_i32_e32 v16, vcc, 0x58, v0
-; GCN-NEXT: v_add_i32_e32 v17, vcc, 0x54, v0
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen
; GCN-NEXT: buffer_store_dword v5, v8, s[0:3], 0 offen
@@ -677,9 +675,9 @@ define <64 x bfloat> @v_load_global_v64bf16(ptr addrspace(1) %ptr) {
; GCN-NEXT: buffer_store_dword v3, v10, s[0:3], 0 offen
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:96
-; GCN-NEXT: v_add_i32_e32 v7, vcc, 0x50, v0
-; GCN-NEXT: v_add_i32_e32 v8, vcc, 0x4c, v0
-; GCN-NEXT: v_add_i32_e32 v9, vcc, 0x48, v0
+; GCN-NEXT: v_add_i32_e32 v7, vcc, 0x58, v0
+; GCN-NEXT: v_add_i32_e32 v8, vcc, 0x54, v0
+; GCN-NEXT: v_add_i32_e32 v9, vcc, 0x50, v0
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v6, v11, s[0:3], 0 offen
; GCN-NEXT: buffer_store_dword v5, v12, s[0:3], 0 offen
@@ -687,60 +685,63 @@ define <64 x bfloat> @v_load_global_v64bf16(ptr addrspace(1) %ptr) {
; GCN-NEXT: buffer_store_dword v3, v14, s[0:3], 0 offen
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:80
-; GCN-NEXT: v_add_i32_e32 v10, vcc, 0x44, v0
-; GCN-NEXT: v_add_i32_e32 v11, vcc, 64, v0
-; GCN-NEXT: v_add_i32_e32 v19, vcc, 60, v0
+; GCN-NEXT: v_add_i32_e32 v10, vcc, 0x4c, v0
+; GCN-NEXT: v_add_i32_e32 v11, vcc, 0x48, v0
+; GCN-NEXT: v_add_i32_e32 v12, vcc, 0x44, v0
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v6, v15, s[0:3], 0 offen
-; GCN-NEXT: buffer_store_dword v5, v16, s[0:3], 0 offen
-; GCN-NEXT: buffer_store_dword v4, v17, s[0:3], 0 offen
-; GCN-NEXT: buffer_store_dword v3, v7, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v5, v7, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v4, v8, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v3, v9, s[0:3], 0 offen
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: buffer_load_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:64
+; GCN-NEXT: v_add_i32_e32 v7, vcc, 64, v0
+; GCN-NEXT: v_add_i32_e32 v19, vcc, 60, v0
; GCN-NEXT: v_add_i32_e32 v20, vcc, 56, v0
-; GCN-NEXT: v_add_i32_e32 v21, vcc, 52, v0
-; GCN-NEXT: v_add_i32_e32 v22, vcc, 48, v0
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: buffer_store_dword v6, v8, s[0:3], 0 offen
-; GCN-NEXT: buffer_store_dword v5, v9, s[0:3], 0 offen
-; GCN-NEXT: buffer_store_dword v4, v10, s[0:3], 0 offen
-; GCN-NEXT: buffer_store_dword v3, v11, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v6, v10, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v5, v11, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v4, v12, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v3, v7, s[0:3], 0 offen
; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: buffer_load_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:48
-; GCN-NEXT: buffer_load_dwordx4 v[7:10], v[1:2], s[4:7], 0 addr64 offset:32
+; GCN-NEXT: buffer_load_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:32
+; GCN-NEXT: buffer_load_dwordx4 v[7:10], v[1:2], s[4:7], 0 addr64 offset:48
+; GCN-NEXT: v_add_i32_e32 v21, vcc, 52, v0
; GCN-NEXT: buffer_load_dwordx4 v[11:14], v[1:2], s[4:7], 0 addr64
; GCN-NEXT: buffer_load_dwordx4 v[15:18], v[1:2], s[4:7], 0 addr64 offset:16
-; GCN-NEXT: s_waitcnt vmcnt(3)
-; GCN-NEXT: buffer_store_dword v6, v19, s[0:3], 0 offen
-; GCN-NEXT: v_add_i32_e32 v1, vcc, 44, v0
-; GCN-NEXT: buffer_store_dword v5, v20, s[0:3], 0 offen
-; GCN-NEXT: v_add_i32_e32 v2, vcc, 40, v0
-; GCN-NEXT: buffer_store_dword v4, v21, s[0:3], 0 offen
+; GCN-NEXT: s_waitcnt vmcnt(2)
+; GCN-NEXT: buffer_store_dword v10, v19, s[0:3], 0 offen
+; GCN-NEXT: v_add_i32_e32 v1, vcc, 48, v0
+; GCN-NEXT: buffer_store_dword v9, v20, s[0:3], 0 offen
+; GCN-NEXT: v_add_i32_e32 v2, vcc, 44, v0
+; GCN-NEXT: buffer_store_dword v8, v21, s[0:3], 0 offen
; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_add_i32_e32 v4, vcc, 36, v0
-; GCN-NEXT: buffer_store_dword v3, v22, s[0:3], 0 offen
+; GCN-NEXT: v_add_i32_e32 v8, vcc, 40, v0
+; GCN-NEXT: buffer_store_dword v7, v1, s[0:3], 0 offen
+; GCN-NEXT: v_add_i32_e32 v1, vcc, 36, v0
; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_add_i32_e32 v3, vcc, 32, v0
-; GCN-NEXT: v_add_i32_e32 v5, vcc, 28, v0
-; GCN-NEXT: v_add_i32_e32 v6, vcc, 24, v0
+; GCN-NEXT: v_add_i32_e32 v7, vcc, 32, v0
+; GCN-NEXT: v_add_i32_e32 v9, vcc, 28, v0
+; GCN-NEXT: v_add_i32_e32 v10, vcc, 24, v0
; GCN-NEXT: v_add_i32_e32 v19, vcc, 20, v0
-; GCN-NEXT: v_add_i32_e32 v20, vcc, 16, v0
-; GCN-NEXT: s_waitcnt vmcnt(6)
-; GCN-NEXT: buffer_store_dword v10, v1, s[0:3], 0 offen
-; GCN-NEXT: v_add_i32_e32 v1, vcc, 12, v0
-; GCN-NEXT: buffer_store_dword v9, v2, s[0:3], 0 offen
-; GCN-NEXT: v_add_i32_e32 v2, vcc, 8, v0
-; GCN-NEXT: buffer_store_dword v8, v4, s[0:3], 0 offen
-; GCN-NEXT: v_add_i32_e32 v4, vcc, 4, v0
-; GCN-NEXT: buffer_store_dword v7, v3, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v6, v2, s[0:3], 0 offen
+; GCN-NEXT: v_add_i32_e32 v2, vcc, 16, v0
+; GCN-NEXT: buffer_store_dword v5, v8, s[0:3], 0 offen
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_add_i32_e32 v5, vcc, 12, v0
+; GCN-NEXT: buffer_store_dword v4, v1, s[0:3], 0 offen
+; GCN-NEXT: v_add_i32_e32 v1, vcc, 8, v0
+; GCN-NEXT: buffer_store_dword v3, v7, s[0:3], 0 offen
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_add_i32_e32 v3, vcc, 4, v0
; GCN-NEXT: s_waitcnt vmcnt(8)
-; GCN-NEXT: buffer_store_dword v18, v5, s[0:3], 0 offen
-; GCN-NEXT: buffer_store_dword v17, v6, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v18, v9, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v17, v10, s[0:3], 0 offen
; GCN-NEXT: buffer_store_dword v16, v19, s[0:3], 0 offen
-; GCN-NEXT: buffer_store_dword v15, v20, s[0:3], 0 offen
-; GCN-NEXT: buffer_store_dword v14, v1, s[0:3], 0 offen
-; GCN-NEXT: buffer_store_dword v13, v2, s[0:3], 0 offen
-; GCN-NEXT: buffer_store_dword v12, v4, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v15, v2, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v14, v5, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v13, v1, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v12, v3, s[0:3], 0 offen
; GCN-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
@@ -758,14 +759,6 @@ define <64 x bfloat> @v_load_global_v64bf16(ptr addrspace(1) %ptr) {
; GFX7-NEXT: v_add_i32_e32 v9, vcc, 0x74, v0
; GFX7-NEXT: v_add_i32_e32 v10, vcc, 0x70, v0
; GFX7-NEXT: v_add_i32_e32 v19, vcc, 52, v0
-; GFX7-NEXT: v_add_i32_e32 v20, vcc, 48, v0
-; GFX7-NEXT: v_add_i32_e32 v21, vcc, 44, v0
-; GFX7-NEXT: v_add_i32_e32 v22, vcc, 40, v0
-; GFX7-NEXT: v_add_i32_e32 v23, vcc, 36, v0
-; GFX7-NEXT: v_add_i32_e32 v24, vcc, 32, v0
-; GFX7-NEXT: v_add_i32_e32 v25, vcc, 28, v0
-; GFX7-NEXT: v_add_i32_e32 v26, vcc, 24, v0
-; GFX7-NEXT: v_add_i32_e32 v27, vcc, 20, v0
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen
; GFX7-NEXT: buffer_store_dword v5, v8, s[0:3], 0 offen
@@ -809,26 +802,34 @@ define <64 x bfloat> @v_load_global_v64bf16(ptr addrspace(1) %ptr) {
; GFX7-NEXT: v_add_i32_e32 v2, vcc, 56, v0
; GFX7-NEXT: s_waitcnt vmcnt(3)
; GFX7-NEXT: buffer_store_dword v6, v1, s[0:3], 0 offen
-; GFX7-NEXT: v_add_i32_e32 v1, vcc, 16, v0
+; GFX7-NEXT: v_add_i32_e32 v1, vcc, 48, v0
; GFX7-NEXT: buffer_store_dword v5, v2, s[0:3], 0 offen
-; GFX7-NEXT: v_add_i32_e32 v2, vcc, 12, v0
+; GFX7-NEXT: v_add_i32_e32 v2, vcc, 44, v0
; GFX7-NEXT: buffer_store_dword v4, v19, s[0:3], 0 offen
-; GFX7-NEXT: v_add_i32_e32 v4, vcc, 8, v0
-; GFX7-NEXT: buffer_store_dword v3, v20, s[0:3], 0 offen
-; GFX7-NEXT: v_add_i32_e32 v3, vcc, 4, v0
+; GFX7-NEXT: v_add_i32_e32 v4, vcc, 40, v0
+; GFX7-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen
+; GFX7-NEXT: v_add_i32_e32 v1, vcc, 36, v0
+; GFX7-NEXT: v_add_i32_e32 v3, vcc, 32, v0
+; GFX7-NEXT: v_add_i32_e32 v5, vcc, 28, v0
+; GFX7-NEXT: v_add_i32_e32 v6, vcc, 24, v0
+; GFX7-NEXT: v_add_i32_e32 v19, vcc, 20, v0
; GFX7-NEXT: s_waitcnt vmcnt(6)
-; GFX7-NEXT: buffer_store_dword v10, v21, s[0:3], 0 offen
-; GFX7-NEXT: buffer_store_dword v9, v22, s[0:3], 0 offen
-; GFX7-NEXT: buffer_store_dword v8, v23, s[0:3], 0 offen
-; GFX7-NEXT: buffer_store_dword v7, v24, s[0:3], 0 offen
+; GFX7-NEXT: buffer_store_dword v10, v2, s[0:3], 0 offen
+; GFX7-NEXT: v_add_i32_e32 v2, vcc, 16, v0
+; GFX7-NEXT: buffer_store_dword v9, v4, s[0:3], 0 offen
+; GFX7-NEXT: v_add_i32_e32 v4, vcc, 12, v0
+; GFX7-NEXT: buffer_store_dword v8, v1, s[0:3], 0 offen
+; GFX7-NEXT: v_add_i32_e32 v1, vcc, 8, v0
+; GFX7-NEXT: buffer_store_dword v7, v3, s[0:3], 0 offen
+; GFX7-NEXT: v_add_i32_e32 v3, vcc, 4, v0
; GFX7-NEXT: s_waitcnt vmcnt(9)
-; GFX7-NEXT: buffer_store_dword v14, v25, s[0:3], 0 offen
-; GFX7-NEXT: buffer_store_dword v13, v26, s[0:3], 0 offen
-; GFX7-NEXT: buffer_store_dword v12, v27, s[0:3], 0 offen
-; GFX7-NEXT: buffer_store_dword v11, v1, s[0:3], 0 offen
+; GFX7-NEXT: buffer_store_dword v14, v5, s[0:3], 0 offen
+; GFX7-NEXT: buffer_store_dword v13, v6, s[0:3], 0 offen
+; GFX7-NEXT: buffer_store_dword v12, v19, s[0:3], 0 offen
+; GFX7-NEXT: buffer_store_dword v11, v2, s[0:3], 0 offen
; GFX7-NEXT: s_waitcnt vmcnt(12)
-; GFX7-NEXT: buffer_store_dword v18, v2, s[0:3], 0 offen
-; GFX7-NEXT: buffer_store_dword v17, v4, s[0:3], 0 offen
+; GFX7-NEXT: buffer_store_dword v18, v4, s[0:3], 0 offen
+; GFX7-NEXT: buffer_store_dword v17, v1, s[0:3], 0 offen
; GFX7-NEXT: buffer_store_dword v16, v3, s[0:3], 0 offen
; GFX7-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen
; GFX7-NEXT: s_waitcnt vmcnt(0)
@@ -1335,83 +1336,83 @@ define void @v_store_global_v32bf16(<32 x bfloat> %val, ptr addrspace(1) %ptr) {
; GCN-LABEL: v_store_global_v32bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23
+; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22
+; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21
+; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20
+; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v23
+; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v21
+; GCN-NEXT: v_alignbit_b32 v21, v23, v22, 16
+; GCN-NEXT: v_alignbit_b32 v20, v31, v20, 16
+; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19
+; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18
+; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v19
+; GCN-NEXT: v_alignbit_b32 v19, v19, v18, 16
+; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17
+; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16
+; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GCN-NEXT: v_alignbit_b32 v18, v17, v16, 16
; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7
-; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v5
+; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v5
; GCN-NEXT: v_alignbit_b32 v5, v7, v6, 16
-; GCN-NEXT: v_alignbit_b32 v4, v31, v4, 16
+; GCN-NEXT: v_alignbit_b32 v4, v16, v4, 16
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GCN-NEXT: v_alignbit_b32 v3, v3, v2, 16
-; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GCN-NEXT: v_alignbit_b32 v2, v1, v0, 16
-; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v15
-; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v14
-; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v13
-; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v12
-; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6
-; GCN-NEXT: v_alignbit_b32 v13, v0, v1, 16
-; GCN-NEXT: v_alignbit_b32 v12, v6, v7, 16
-; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v11
-; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v10
-; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GCN-NEXT: v_alignbit_b32 v11, v0, v1, 16
-; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v9
-; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v8
-; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GCN-NEXT: v_alignbit_b32 v10, v0, v1, 16
-; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v23
-; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v22
-; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v21
-; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v20
-; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6
-; GCN-NEXT: v_alignbit_b32 v9, v0, v1, 16
-; GCN-NEXT: v_alignbit_b32 v8, v6, v7, 16
-; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v19
-; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v18
-; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GCN-NEXT: v_alignbit_b32 v7, v0, v1, 16
-; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v17
-; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v16
-; GCN-NEXT: s_mov_b32 s7, 0xf000
-; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v29
-; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v28
-; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v27
-; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v26
-; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v6
-; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v15
-; GCN-NEXT: v_alignbit_b32 v6, v0, v1, 16
-; GCN-NEXT: v_alignbit_b32 v16, v16, v14, 16
-; GCN-NEXT: v_alignbit_b32 v15, v15, v17, 16
+; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v1
+; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v0
+; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15
+; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14
+; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13
+; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12
+; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11
+; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10
+; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
+; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v8
+; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v29
+; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v28
+; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v27
+; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v26
+; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25
+; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v24
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v3
+; GCN-NEXT: v_alignbit_b32 v3, v0, v2, 16
; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8
-; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v25
-; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v24
-; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GCN-NEXT: v_alignbit_b32 v14, v0, v14, 16
-; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4
-; GCN-NEXT: s_mov_b32 s6, 0
-; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v30
+; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32
+; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v30
; GCN-NEXT: s_mov_b32 s4, s6
; GCN-NEXT: s_mov_b32 s5, s6
+; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v6
+; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v15
+; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v13
+; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v11
+; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v9
+; GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v8
+; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v22
+; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v25
+; GCN-NEXT: v_alignbit_b32 v2, v2, v7, 16
+; GCN-NEXT: v_alignbit_b32 v9, v6, v14, 16
+; GCN-NEXT: v_alignbit_b32 v8, v13, v12, 16
+; GCN-NEXT: v_alignbit_b32 v7, v11, v10, 16
+; GCN-NEXT: v_alignbit_b32 v6, v15, v16, 16
+; GCN-NEXT: v_alignbit_b32 v12, v28, v17, 16
+; GCN-NEXT: v_alignbit_b32 v11, v22, v23, 16
+; GCN-NEXT: v_alignbit_b32 v10, v25, v24, 16
; GCN-NEXT: s_waitcnt vmcnt(1)
-; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: buffer_store_dwordx4 v[6:9], v[0:1], s[4:7], 0 addr64 offset:32
-; GCN-NEXT: buffer_store_dwordx4 v[10:13], v[0:1], s[4:7], 0 addr64 offset:16
-; GCN-NEXT: s_waitcnt expcnt(1)
-; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v17
-; GCN-NEXT: v_alignbit_b32 v17, v6, v18, 16
-; GCN-NEXT: buffer_store_dwordx4 v[14:17], v[0:1], s[4:7], 0 addr64 offset:48
+; GCN-NEXT: buffer_store_dwordx4 v[18:21], v[0:1], s[4:7], 0 addr64 offset:32
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v26
+; GCN-NEXT: buffer_store_dwordx4 v[6:9], v[0:1], s[4:7], 0 addr64 offset:16
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v13
+; GCN-NEXT: v_alignbit_b32 v13, v6, v27, 16
+; GCN-NEXT: buffer_store_dwordx4 v[10:13], v[0:1], s[4:7], 0 addr64 offset:48
; GCN-NEXT: buffer_store_dwordx4 v[2:5], v[0:1], s[4:7], 0 addr64
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
@@ -1421,78 +1422,78 @@ define void @v_store_global_v32bf16(<32 x bfloat> %val, ptr addrspace(1) %ptr) {
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX7-NEXT: v_alignbit_b32 v3, v3, v2, 16
; GFX7-NEXT: v_alignbit_b32 v2, v1, v0, 16
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v14
; GFX7-NEXT: buffer_load_dword v14, off, s[0:3], s32
+; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25
; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v15
-; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GFX7-NEXT: v_lshrrev_b32_e32 v25, 16, v25
+; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24
; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7
; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX7-NEXT: v_lshrrev_b32_e32 v31, 16, v5
+; GFX7-NEXT: v_alignbit_b32 v25, v25, v24, 16
+; GFX7-NEXT: v_lshrrev_b32_e32 v24, 16, v5
; GFX7-NEXT: v_alignbit_b32 v5, v7, v6, 16
; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v13
; GFX7-NEXT: v_alignbit_b32 v13, v0, v1, 16
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v11
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v10
-; GFX7-NEXT: v_alignbit_b32 v11, v0, v1, 16
; GFX7-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8
; GFX7-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4
-; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6
; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v12
-; GFX7-NEXT: v_alignbit_b32 v12, v6, v7, 16
-; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v9
; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX7-NEXT: v_alignbit_b32 v12, v6, v7, 16
+; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v11
+; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10
+; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GFX7-NEXT: v_mul_f32_e32 v29, 1.0, v29
+; GFX7-NEXT: v_alignbit_b32 v11, v7, v10, 16
+; GFX7-NEXT: v_lshrrev_b32_e32 v29, 16, v29
+; GFX7-NEXT: v_mul_f32_e32 v28, 1.0, v28
+; GFX7-NEXT: v_mul_f32_e32 v27, 1.0, v27
+; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v30
+; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9
+; GFX7-NEXT: v_lshrrev_b32_e32 v31, 16, v27
+; GFX7-NEXT: v_alignbit_b32 v27, v29, v28, 16
+; GFX7-NEXT: v_mul_f32_e32 v26, 1.0, v26
+; GFX7-NEXT: s_mov_b32 s6, 0
+; GFX7-NEXT: v_alignbit_b32 v26, v31, v26, 16
+; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GFX7-NEXT: s_mov_b32 s7, 0xf000
+; GFX7-NEXT: s_mov_b32 s4, s6
+; GFX7-NEXT: s_mov_b32 s5, s6
+; GFX7-NEXT: v_alignbit_b32 v4, v24, v4, 16
+; GFX7-NEXT: s_waitcnt vmcnt(2)
+; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v14
+; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GFX7-NEXT: v_alignbit_b32 v28, v7, v6, 16
+; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v9
; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v8
; GFX7-NEXT: v_alignbit_b32 v10, v6, v7, 16
; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v23
; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6
; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v22
; GFX7-NEXT: v_alignbit_b32 v9, v6, v7, 16
-; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v21
-; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6
-; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v20
-; GFX7-NEXT: v_alignbit_b32 v8, v6, v7, 16
; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v19
+; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v21
; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6
; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v18
+; GFX7-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v20
; GFX7-NEXT: v_alignbit_b32 v7, v6, v7, 16
; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v17
+; GFX7-NEXT: v_alignbit_b32 v8, v8, v14, 16
; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6
-; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v16
-; GFX7-NEXT: v_alignbit_b32 v6, v6, v15, 16
-; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v30
-; GFX7-NEXT: s_mov_b32 s6, 0
-; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v24
-; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
-; GFX7-NEXT: s_mov_b32 s7, 0xf000
-; GFX7-NEXT: s_mov_b32 s4, s6
-; GFX7-NEXT: s_mov_b32 s5, s6
-; GFX7-NEXT: v_alignbit_b32 v4, v31, v4, 16
-; GFX7-NEXT: s_waitcnt vmcnt(2)
-; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14
-; GFX7-NEXT: v_lshrrev_b32_e32 v14, 16, v14
-; GFX7-NEXT: v_alignbit_b32 v17, v14, v15, 16
-; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v29
-; GFX7-NEXT: v_lshrrev_b32_e32 v14, 16, v14
-; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v28
-; GFX7-NEXT: v_alignbit_b32 v16, v14, v15, 16
-; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v27
-; GFX7-NEXT: v_lshrrev_b32_e32 v14, 16, v14
-; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v26
-; GFX7-NEXT: v_alignbit_b32 v15, v14, v15, 16
-; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v25
-; GFX7-NEXT: v_lshrrev_b32_e32 v14, 16, v14
-; GFX7-NEXT: v_alignbit_b32 v14, v14, v18, 16
+; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v16
+; GFX7-NEXT: v_alignbit_b32 v6, v6, v14, 16
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: buffer_store_dwordx4 v[14:17], v[0:1], s[4:7], 0 addr64 offset:48
+; GFX7-NEXT: buffer_store_dwordx4 v[25:28], v[0:1], s[4:7], 0 addr64 offset:48
; GFX7-NEXT: buffer_store_dwordx4 v[6:9], v[0:1], s[4:7], 0 addr64 offset:32
; GFX7-NEXT: buffer_store_dwordx4 v[10:13], v[0:1], s[4:7], 0 addr64 offset:16
; GFX7-NEXT: buffer_store_dwordx4 v[2:5], v[0:1], s[4:7], 0 addr64
@@ -1564,207 +1565,203 @@ define void @v_store_global_v64bf16(<64 x bfloat> %val, ptr addrspace(1) %ptr) {
; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18
; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v19
; GCN-NEXT: v_alignbit_b32 v19, v19, v18, 16
-; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v17
-; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v16
-; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:136
-; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:132
-; GCN-NEXT: v_lshrrev_b32_e32 v18, 16, v18
-; GCN-NEXT: v_alignbit_b32 v18, v18, v22, 16
-; GCN-NEXT: s_mov_b32 s6, 0
-; GCN-NEXT: s_mov_b32 s7, 0xf000
-; GCN-NEXT: s_mov_b32 s4, s6
-; GCN-NEXT: s_mov_b32 s5, s6
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: buffer_store_dwordx4 v[18:21], v[16:17], s[4:7], 0 addr64 offset:32
+; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17
+; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16
+; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GCN-NEXT: v_alignbit_b32 v18, v17, v16, 16
; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15
; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14
; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13
; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12
; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v15
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_lshrrev_b32_e32 v18, 16, v13
+; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v13
; GCN-NEXT: v_alignbit_b32 v13, v15, v14, 16
-; GCN-NEXT: v_alignbit_b32 v12, v18, v12, 16
+; GCN-NEXT: v_alignbit_b32 v12, v16, v12, 16
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11
; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10
-; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v11
-; GCN-NEXT: v_alignbit_b32 v11, v11, v10, 16
; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8
+; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
+; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
+; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v2
+; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v29
+; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v28
+; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v27
+; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v26
+; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v11
; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v3
+; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v1
+; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v2
+; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v16
+; GCN-NEXT: v_alignbit_b32 v11, v11, v10, 16
; GCN-NEXT: v_alignbit_b32 v10, v9, v8, 16
-; GCN-NEXT: buffer_store_dwordx4 v[10:13], v[16:17], s[4:7], 0 addr64 offset:16
-; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:128
-; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:124
+; GCN-NEXT: v_alignbit_b32 v3, v7, v6, 16
+; GCN-NEXT: v_alignbit_b32 v2, v5, v4, 16
+; GCN-NEXT: v_alignbit_b32 v1, v22, v14, 16
+; GCN-NEXT: v_alignbit_b32 v0, v23, v0, 16
+; GCN-NEXT: v_alignbit_b32 v6, v26, v15, 16
+; GCN-NEXT: v_alignbit_b32 v5, v16, v17, 16
+; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:136
+; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:132
+; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:128
+; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:124
+; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:120
+; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:116
+; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:112
+; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:108
+; GCN-NEXT: s_mov_b32 s4, s6
+; GCN-NEXT: s_mov_b32 s5, s6
+; GCN-NEXT: s_waitcnt vmcnt(6)
+; GCN-NEXT: buffer_store_dwordx4 v[18:21], v[8:9], s[4:7], 0 addr64 offset:32
+; GCN-NEXT: buffer_store_dwordx4 v[10:13], v[8:9], s[4:7], 0 addr64 offset:16
; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:120
-; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:116
+; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:104
+; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:100
+; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:96
+; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:92
+; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:88
+; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:84
+; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:80
+; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:76
+; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v25
+; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v24
+; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v30
+; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GCN-NEXT: v_alignbit_b32 v4, v4, v23, 16
+; GCN-NEXT: s_waitcnt vmcnt(14)
+; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
+; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14
+; GCN-NEXT: s_waitcnt vmcnt(13)
+; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15
+; GCN-NEXT: s_waitcnt vmcnt(12)
+; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16
+; GCN-NEXT: s_waitcnt vmcnt(11)
+; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17
+; GCN-NEXT: s_waitcnt vmcnt(10)
+; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22
+; GCN-NEXT: s_waitcnt vmcnt(7)
+; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10
+; GCN-NEXT: s_waitcnt vmcnt(6)
+; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v11
+; GCN-NEXT: s_waitcnt vmcnt(5)
+; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v12
+; GCN-NEXT: s_waitcnt vmcnt(4)
+; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v13
; GCN-NEXT: s_waitcnt vmcnt(3)
-; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8
+; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v18
; GCN-NEXT: s_waitcnt vmcnt(2)
-; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
+; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v19
; GCN-NEXT: s_waitcnt vmcnt(1)
-; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10
+; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v20
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v11
-; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v21
+; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v15
+; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v17
; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v10
-; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:112
-; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:108
-; GCN-NEXT: v_alignbit_b32 v11, v8, v9, 16
-; GCN-NEXT: v_alignbit_b32 v10, v10, v12, 16
-; GCN-NEXT: s_waitcnt vmcnt(1)
-; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v13
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v14
-; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:104
-; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:100
-; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8
-; GCN-NEXT: v_alignbit_b32 v9, v8, v9, 16
-; GCN-NEXT: s_waitcnt vmcnt(1)
-; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v12
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v13
-; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8
-; GCN-NEXT: v_alignbit_b32 v8, v8, v12, 16
-; GCN-NEXT: buffer_store_dwordx4 v[8:11], v[16:17], s[4:7], 0 addr64 offset:112
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:96
-; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:92
-; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:88
-; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:84
+; GCN-NEXT: v_lshrrev_b32_e32 v20, 16, v11
+; GCN-NEXT: v_lshrrev_b32_e32 v21, 16, v12
+; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v13
+; GCN-NEXT: v_alignbit_b32 v13, v7, v14, 16
+; GCN-NEXT: v_alignbit_b32 v12, v15, v16, 16
+; GCN-NEXT: v_alignbit_b32 v11, v17, v22, 16
+; GCN-NEXT: v_alignbit_b32 v10, v10, v23, 16
+; GCN-NEXT: v_alignbit_b32 v17, v20, v25, 16
+; GCN-NEXT: v_alignbit_b32 v16, v21, v18, 16
+; GCN-NEXT: v_alignbit_b32 v15, v26, v19, 16
+; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:72
+; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:68
+; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32
+; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:32
+; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:28
+; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:24
+; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:20
+; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:16
+; GCN-NEXT: s_waitcnt vmcnt(7)
+; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
+; GCN-NEXT: s_waitcnt vmcnt(6)
+; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14
+; GCN-NEXT: s_waitcnt vmcnt(5)
+; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18
+; GCN-NEXT: s_waitcnt vmcnt(4)
+; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19
; GCN-NEXT: s_waitcnt vmcnt(3)
-; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8
+; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20
; GCN-NEXT: s_waitcnt vmcnt(2)
-; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
-; GCN-NEXT: s_waitcnt vmcnt(1)
-; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v11
-; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8
-; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v10
-; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:80
-; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:76
-; GCN-NEXT: v_alignbit_b32 v11, v8, v9, 16
-; GCN-NEXT: v_alignbit_b32 v10, v10, v12, 16
-; GCN-NEXT: s_waitcnt vmcnt(1)
-; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v13
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v14
-; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:72
-; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:68
-; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8
-; GCN-NEXT: v_alignbit_b32 v9, v8, v9, 16
+; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21
; GCN-NEXT: s_waitcnt vmcnt(1)
-; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v12
+; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v13
-; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8
-; GCN-NEXT: v_alignbit_b32 v8, v8, v12, 16
-; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
-; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
-; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
-; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
-; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v3
-; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v2
-; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v1
-; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v30
-; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v29
-; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v28
-; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v27
-; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v26
-; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v25
-; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v24
-; GCN-NEXT: buffer_store_dwordx4 v[8:11], v[16:17], s[4:7], 0 addr64 offset:96
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32
-; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:32
-; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v7
-; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v5
-; GCN-NEXT: v_alignbit_b32 v3, v1, v6, 16
-; GCN-NEXT: v_alignbit_b32 v2, v2, v4, 16
-; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:28
-; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:24
-; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v12
-; GCN-NEXT: v_alignbit_b32 v1, v1, v13, 16
-; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:20
-; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:16
-; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v14
-; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v18
-; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v20
-; GCN-NEXT: v_alignbit_b32 v0, v4, v0, 16
-; GCN-NEXT: v_alignbit_b32 v6, v5, v19, 16
-; GCN-NEXT: v_alignbit_b32 v5, v13, v21, 16
-; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:12
-; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:8
-; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v22
-; GCN-NEXT: v_alignbit_b32 v4, v4, v23, 16
-; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:4
-; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:64
-; GCN-NEXT: s_waitcnt vmcnt(9)
-; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8
-; GCN-NEXT: s_waitcnt vmcnt(8)
-; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
+; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23
+; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GCN-NEXT: v_lshrrev_b32_e32 v18, 16, v18
+; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v19
+; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v21
+; GCN-NEXT: v_alignbit_b32 v14, v7, v14, 16
+; GCN-NEXT: v_alignbit_b32 v7, v18, v24, 16
+; GCN-NEXT: v_alignbit_b32 v21, v19, v20, 16
+; GCN-NEXT: v_alignbit_b32 v20, v25, v22, 16
+; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:12
+; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:8
+; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:4
+; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:64
+; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:60
+; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:56
+; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:52
+; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:48
; GCN-NEXT: s_waitcnt vmcnt(7)
-; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v7
-; GCN-NEXT: s_waitcnt vmcnt(6)
-; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v10
-; GCN-NEXT: s_waitcnt vmcnt(5)
-; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v11
-; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8
-; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9
-; GCN-NEXT: v_lshrrev_b32_e32 v21, 16, v7
-; GCN-NEXT: v_alignbit_b32 v7, v8, v15, 16
-; GCN-NEXT: v_alignbit_b32 v11, v9, v20, 16
-; GCN-NEXT: v_alignbit_b32 v10, v21, v10, 16
-; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:60
-; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:56
-; GCN-NEXT: s_waitcnt vmcnt(6)
-; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v12
-; GCN-NEXT: s_waitcnt vmcnt(5)
-; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v13
-; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8
-; GCN-NEXT: v_alignbit_b32 v9, v8, v9, 16
-; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:52
-; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:48
+; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18
+; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v23
+; GCN-NEXT: v_alignbit_b32 v19, v19, v18, 16
; GCN-NEXT: s_waitcnt vmcnt(6)
-; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v14
-; GCN-NEXT: s_waitcnt vmcnt(5)
-; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v18
-; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8
-; GCN-NEXT: v_alignbit_b32 v8, v8, v14, 16
-; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:44
+; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v22
; GCN-NEXT: s_waitcnt vmcnt(5)
-; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v19
+; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v24
+; GCN-NEXT: v_lshrrev_b32_e32 v18, 16, v18
+; GCN-NEXT: v_alignbit_b32 v18, v18, v22, 16
; GCN-NEXT: s_waitcnt vmcnt(4)
-; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15
+; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v25
; GCN-NEXT: s_waitcnt vmcnt(3)
-; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v20
+; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v26
; GCN-NEXT: s_waitcnt vmcnt(2)
-; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12
-; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v14
-; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v19
-; GCN-NEXT: v_alignbit_b32 v15, v14, v15, 16
-; GCN-NEXT: v_alignbit_b32 v14, v19, v12, 16
-; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:40
-; GCN-NEXT: s_waitcnt vmcnt(2)
-; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13
+; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v27
; GCN-NEXT: s_waitcnt vmcnt(1)
-; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18
-; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:36
-; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v13
-; GCN-NEXT: v_alignbit_b32 v13, v13, v18, 16
+; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v28
+; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v22
+; GCN-NEXT: v_lshrrev_b32_e32 v24, 16, v24
+; GCN-NEXT: v_alignbit_b32 v25, v22, v23, 16
+; GCN-NEXT: v_alignbit_b32 v24, v24, v26, 16
+; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:44
; GCN-NEXT: s_waitcnt vmcnt(1)
-; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12
+; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v29
+; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:40
+; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:36
+; GCN-NEXT: s_waitcnt vmcnt(2)
+; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22
+; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v23
+; GCN-NEXT: v_alignbit_b32 v23, v23, v22, 16
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v26
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v19
-; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v12
-; GCN-NEXT: v_alignbit_b32 v12, v12, v18, 16
-; GCN-NEXT: buffer_store_dwordx4 v[12:15], v[16:17], s[4:7], 0 addr64 offset:80
-; GCN-NEXT: buffer_store_dwordx4 v[8:11], v[16:17], s[4:7], 0 addr64 offset:64
-; GCN-NEXT: buffer_store_dwordx4 v[4:7], v[16:17], s[4:7], 0 addr64 offset:48
-; GCN-NEXT: buffer_store_dwordx4 v[0:3], v[16:17], s[4:7], 0 addr64
+; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v27
+; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v22
+; GCN-NEXT: v_alignbit_b32 v22, v22, v26, 16
+; GCN-NEXT: buffer_store_dwordx4 v[10:13], v[8:9], s[4:7], 0 addr64 offset:112
+; GCN-NEXT: buffer_store_dwordx4 v[14:17], v[8:9], s[4:7], 0 addr64 offset:96
+; GCN-NEXT: buffer_store_dwordx4 v[22:25], v[8:9], s[4:7], 0 addr64 offset:80
+; GCN-NEXT: buffer_store_dwordx4 v[18:21], v[8:9], s[4:7], 0 addr64 offset:64
+; GCN-NEXT: buffer_store_dwordx4 v[4:7], v[8:9], s[4:7], 0 addr64 offset:48
+; GCN-NEXT: buffer_store_dwordx4 v[0:3], v[8:9], s[4:7], 0 addr64
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
;
@@ -1780,24 +1777,27 @@ define void @v_store_global_v64bf16(<64 x bfloat> %val, ptr addrspace(1) %ptr) {
; GFX7-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:104
; GFX7-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:100
; GFX7-NEXT: s_mov_b32 s6, 0
-; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
-; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
-; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
-; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7
; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
; GFX7-NEXT: v_alignbit_b32 v3, v3, v2, 16
; GFX7-NEXT: v_alignbit_b32 v2, v1, v0, 16
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v15
-; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v14
+; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
+; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v14
+; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v29
+; GFX7-NEXT: v_lshrrev_b32_e32 v14, 16, v14
+; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v28
; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GFX7-NEXT: s_waitcnt vmcnt(7)
; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31
@@ -1832,16 +1832,97 @@ define void @v_store_global_v64bf16(<64 x bfloat> %val, ptr addrspace(1) %ptr) {
; GFX7-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:76
; GFX7-NEXT: s_waitcnt vmcnt(6)
; GFX7-NEXT: buffer_store_dwordx4 v[33:36], v[31:32], s[4:7], 0 addr64 offset:112
+; GFX7-NEXT: s_waitcnt vmcnt(6)
+; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v37
+; GFX7-NEXT: s_waitcnt vmcnt(5)
+; GFX7-NEXT: v_mul_f32_e32 v34, 1.0, v38
+; GFX7-NEXT: v_lshrrev_b32_e32 v33, 16, v33
+; GFX7-NEXT: s_waitcnt vmcnt(4)
+; GFX7-NEXT: v_mul_f32_e32 v35, 1.0, v39
+; GFX7-NEXT: v_alignbit_b32 v36, v33, v34, 16
+; GFX7-NEXT: s_waitcnt vmcnt(2)
+; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v49
+; GFX7-NEXT: v_mul_f32_e32 v37, 1.0, v48
+; GFX7-NEXT: v_lshrrev_b32_e32 v35, 16, v35
+; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: v_mul_f32_e32 v34, 1.0, v50
+; GFX7-NEXT: v_lshrrev_b32_e32 v33, 16, v33
+; GFX7-NEXT: v_alignbit_b32 v35, v35, v37, 16
+; GFX7-NEXT: v_alignbit_b32 v34, v33, v34, 16
+; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:72
+; GFX7-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:68
+; GFX7-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:64
+; GFX7-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:60
+; GFX7-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:56
+; GFX7-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:52
+; GFX7-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:48
+; GFX7-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:44
+; GFX7-NEXT: s_waitcnt vmcnt(7)
+; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33
+; GFX7-NEXT: v_lshrrev_b32_e32 v33, 16, v33
+; GFX7-NEXT: s_waitcnt vmcnt(6)
+; GFX7-NEXT: v_mul_f32_e32 v37, 1.0, v37
+; GFX7-NEXT: v_alignbit_b32 v33, v33, v37, 16
+; GFX7-NEXT: buffer_store_dwordx4 v[33:36], v[31:32], s[4:7], 0 addr64 offset:96
+; GFX7-NEXT: s_waitcnt vmcnt(3)
+; GFX7-NEXT: v_mul_f32_e32 v37, 1.0, v49
+; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v38
+; GFX7-NEXT: v_lshrrev_b32_e32 v33, 16, v33
+; GFX7-NEXT: v_mul_f32_e32 v34, 1.0, v39
+; GFX7-NEXT: v_mul_f32_e32 v35, 1.0, v48
+; GFX7-NEXT: v_alignbit_b32 v36, v33, v34, 16
+; GFX7-NEXT: s_waitcnt vmcnt(2)
+; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v50
+; GFX7-NEXT: v_lshrrev_b32_e32 v35, 16, v35
+; GFX7-NEXT: v_lshrrev_b32_e32 v33, 16, v33
+; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: v_mul_f32_e32 v34, 1.0, v51
+; GFX7-NEXT: v_alignbit_b32 v35, v35, v37, 16
+; GFX7-NEXT: v_alignbit_b32 v34, v33, v34, 16
+; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:40
+; GFX7-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:36
+; GFX7-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:32
+; GFX7-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:28
+; GFX7-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:24
+; GFX7-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:20
+; GFX7-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:16
+; GFX7-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:12
+; GFX7-NEXT: s_waitcnt vmcnt(7)
+; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33
+; GFX7-NEXT: v_lshrrev_b32_e32 v33, 16, v33
+; GFX7-NEXT: s_waitcnt vmcnt(6)
+; GFX7-NEXT: v_mul_f32_e32 v37, 1.0, v37
+; GFX7-NEXT: v_alignbit_b32 v33, v33, v37, 16
+; GFX7-NEXT: buffer_store_dwordx4 v[33:36], v[31:32], s[4:7], 0 addr64 offset:80
+; GFX7-NEXT: s_waitcnt vmcnt(3)
+; GFX7-NEXT: v_mul_f32_e32 v37, 1.0, v49
+; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v38
+; GFX7-NEXT: v_lshrrev_b32_e32 v33, 16, v33
+; GFX7-NEXT: v_mul_f32_e32 v34, 1.0, v39
+; GFX7-NEXT: v_mul_f32_e32 v35, 1.0, v48
+; GFX7-NEXT: v_alignbit_b32 v36, v33, v34, 16
+; GFX7-NEXT: s_waitcnt vmcnt(2)
+; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v50
+; GFX7-NEXT: v_lshrrev_b32_e32 v35, 16, v35
+; GFX7-NEXT: v_lshrrev_b32_e32 v33, 16, v33
+; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: v_mul_f32_e32 v34, 1.0, v51
+; GFX7-NEXT: v_alignbit_b32 v35, v35, v37, 16
+; GFX7-NEXT: v_alignbit_b32 v34, v33, v34, 16
+; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
+; GFX7-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:4
+; GFX7-NEXT: buffer_load_dword v38, off, s[0:3], s32
+; GFX7-NEXT: s_waitcnt vmcnt(2)
+; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33
+; GFX7-NEXT: v_lshrrev_b32_e32 v33, 16, v33
+; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: v_mul_f32_e32 v37, 1.0, v37
+; GFX7-NEXT: v_alignbit_b32 v33, v33, v37, 16
+; GFX7-NEXT: buffer_store_dwordx4 v[33:36], v[31:32], s[4:7], 0 addr64 offset:64
; GFX7-NEXT: s_nop 0
; GFX7-NEXT: v_lshrrev_b32_e32 v33, 16, v5
; GFX7-NEXT: v_alignbit_b32 v5, v7, v6, 16
; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v13
-; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v12
-; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6
-; GFX7-NEXT: v_alignbit_b32 v12, v6, v7, 16
-; GFX7-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:72
-; GFX7-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:68
-; GFX7-NEXT: buffer_load_dword v15, off, s[0:3], s32
; GFX7-NEXT: v_alignbit_b32 v13, v0, v1, 16
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v11
; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
@@ -1852,124 +1933,39 @@ define void @v_store_global_v64bf16(<64 x bfloat> %val, ptr addrspace(1) %ptr) {
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v8
; GFX7-NEXT: v_alignbit_b32 v10, v0, v1, 16
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v23
+; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v12
; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v22
+; GFX7-NEXT: v_alignbit_b32 v12, v6, v7, 16
+; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v21
; GFX7-NEXT: v_alignbit_b32 v9, v0, v1, 16
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v21
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v20
-; GFX7-NEXT: v_alignbit_b32 v8, v0, v1, 16
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v19
+; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v20
; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v18
+; GFX7-NEXT: v_alignbit_b32 v8, v6, v7, 16
; GFX7-NEXT: v_alignbit_b32 v7, v0, v1, 16
-; GFX7-NEXT: s_waitcnt vmcnt(9)
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v37
-; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v28
-; GFX7-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:64
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX7-NEXT: s_waitcnt vmcnt(9)
-; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v38
-; GFX7-NEXT: v_alignbit_b32 v4, v33, v4, 16
-; GFX7-NEXT: s_waitcnt vmcnt(8)
-; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v39
-; GFX7-NEXT: v_alignbit_b32 v36, v0, v1, 16
-; GFX7-NEXT: s_waitcnt vmcnt(6)
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v49
-; GFX7-NEXT: v_lshrrev_b32_e32 v18, 16, v18
-; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v48
+; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v17
; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX7-NEXT: s_waitcnt vmcnt(5)
-; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v50
-; GFX7-NEXT: v_alignbit_b32 v35, v18, v19, 16
-; GFX7-NEXT: v_alignbit_b32 v34, v0, v1, 16
-; GFX7-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:32
-; GFX7-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:28
-; GFX7-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:24
-; GFX7-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:20
-; GFX7-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:16
-; GFX7-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:12
-; GFX7-NEXT: s_waitcnt vmcnt(8)
-; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14
-; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
-; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6
-; GFX7-NEXT: v_alignbit_b32 v33, v6, v14, 16
-; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v17
-; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6
-; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v16
-; GFX7-NEXT: v_alignbit_b32 v6, v6, v14, 16
-; GFX7-NEXT: s_waitcnt vmcnt(7)
-; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v15
-; GFX7-NEXT: v_lshrrev_b32_e32 v14, 16, v14
-; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v30
-; GFX7-NEXT: buffer_store_dwordx4 v[33:36], v[31:32], s[4:7], 0 addr64 offset:96
-; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v29
-; GFX7-NEXT: v_alignbit_b32 v17, v14, v15, 16
-; GFX7-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:52
-; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v27
-; GFX7-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:48
-; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v26
-; GFX7-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:44
-; GFX7-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:8
-; GFX7-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:4
-; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:60
-; GFX7-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:56
-; GFX7-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:40
-; GFX7-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:36
-; GFX7-NEXT: v_lshrrev_b32_e32 v14, 16, v14
-; GFX7-NEXT: v_lshrrev_b32_e32 v16, 16, v16
-; GFX7-NEXT: v_alignbit_b32 v15, v14, v15, 16
-; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v25
-; GFX7-NEXT: v_alignbit_b32 v16, v16, v20, 16
-; GFX7-NEXT: v_lshrrev_b32_e32 v14, 16, v14
-; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v24
-; GFX7-NEXT: v_alignbit_b32 v14, v14, v20, 16
-; GFX7-NEXT: s_waitcnt vmcnt(14)
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GFX7-NEXT: v_alignbit_b32 v21, v0, v1, 16
-; GFX7-NEXT: s_waitcnt vmcnt(13)
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v18
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX7-NEXT: s_waitcnt vmcnt(12)
-; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v19
-; GFX7-NEXT: v_alignbit_b32 v20, v0, v1, 16
-; GFX7-NEXT: s_waitcnt vmcnt(11)
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v22
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX7-NEXT: s_waitcnt vmcnt(10)
-; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v23
-; GFX7-NEXT: v_alignbit_b32 v19, v0, v1, 16
-; GFX7-NEXT: s_waitcnt vmcnt(8)
-; GFX7-NEXT: v_mul_f32_e32 v23, 1.0, v35
-; GFX7-NEXT: s_waitcnt vmcnt(5)
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v29
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v16
+; GFX7-NEXT: v_alignbit_b32 v6, v0, v1, 16
+; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v38
; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX7-NEXT: s_waitcnt vmcnt(4)
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v30
-; GFX7-NEXT: v_alignbit_b32 v18, v0, v1, 16
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v28
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX7-NEXT: s_waitcnt vmcnt(3)
-; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v33
-; GFX7-NEXT: s_waitcnt vmcnt(2)
-; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v34
-; GFX7-NEXT: v_alignbit_b32 v25, v0, v1, 16
+; GFX7-NEXT: v_alignbit_b32 v17, v0, v1, 16
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v27
-; GFX7-NEXT: v_lshrrev_b32_e32 v22, 16, v22
; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v26
-; GFX7-NEXT: v_alignbit_b32 v24, v22, v23, 16
-; GFX7-NEXT: v_alignbit_b32 v23, v0, v1, 16
-; GFX7-NEXT: s_waitcnt vmcnt(1)
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v36
+; GFX7-NEXT: v_alignbit_b32 v16, v14, v15, 16
+; GFX7-NEXT: v_alignbit_b32 v15, v0, v1, 16
+; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v25
; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v37
-; GFX7-NEXT: v_alignbit_b32 v22, v0, v1, 16
-; GFX7-NEXT: buffer_store_dwordx4 v[22:25], v[31:32], s[4:7], 0 addr64 offset:80
-; GFX7-NEXT: buffer_store_dwordx4 v[18:21], v[31:32], s[4:7], 0 addr64 offset:64
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v24
+; GFX7-NEXT: v_alignbit_b32 v14, v0, v1, 16
+; GFX7-NEXT: v_alignbit_b32 v4, v33, v4, 16
; GFX7-NEXT: buffer_store_dwordx4 v[14:17], v[31:32], s[4:7], 0 addr64 offset:48
; GFX7-NEXT: buffer_store_dwordx4 v[6:9], v[31:32], s[4:7], 0 addr64 offset:32
; GFX7-NEXT: buffer_store_dwordx4 v[10:13], v[31:32], s[4:7], 0 addr64 offset:16
@@ -3802,10 +3798,10 @@ define void @test_call(bfloat %in, ptr addrspace(5) %out) {
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_readlane_b32 s31, v2, 1
; GCN-NEXT: v_readlane_b32 s30, v2, 0
+; GCN-NEXT: s_mov_b32 s32, s33
; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload
; GCN-NEXT: s_mov_b64 exec, s[4:5]
-; GCN-NEXT: s_addk_i32 s32, 0xfc00
; GCN-NEXT: s_mov_b32 s33, s18
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
@@ -3833,10 +3829,10 @@ define void @test_call(bfloat %in, ptr addrspace(5) %out) {
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_readlane_b32 s31, v2, 1
; GFX7-NEXT: v_readlane_b32 s30, v2, 0
+; GFX7-NEXT: s_mov_b32 s32, s33
; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX7-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX7-NEXT: s_mov_b64 exec, s[4:5]
-; GFX7-NEXT: s_addk_i32 s32, 0xfc00
; GFX7-NEXT: s_mov_b32 s33, s18
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: s_setpc_b64 s[30:31]
@@ -3862,10 +3858,10 @@ define void @test_call(bfloat %in, ptr addrspace(5) %out) {
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_readlane_b32 s31, v2, 1
; GFX8-NEXT: v_readlane_b32 s30, v2, 0
+; GFX8-NEXT: s_mov_b32 s32, s33
; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX8-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX8-NEXT: s_mov_b64 exec, s[4:5]
-; GFX8-NEXT: s_addk_i32 s32, 0xfc00
; GFX8-NEXT: s_mov_b32 s33, s18
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_setpc_b64 s[30:31]
@@ -3891,10 +3887,10 @@ define void @test_call(bfloat %in, ptr addrspace(5) %out) {
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_readlane_b32 s31, v2, 1
; GFX9-NEXT: v_readlane_b32 s30, v2, 0
+; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[4:5]
-; GFX9-NEXT: s_addk_i32 s32, 0xfc00
; GFX9-NEXT: s_mov_b32 s33, s18
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -3921,11 +3917,11 @@ define void @test_call(bfloat %in, ptr addrspace(5) %out) {
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_readlane_b32 s31, v2, 1
; GFX10-NEXT: v_readlane_b32 s30, v2, 0
+; GFX10-NEXT: s_mov_b32 s32, s33
; GFX10-NEXT: s_xor_saveexec_b32 s4, -1
; GFX10-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s4
-; GFX10-NEXT: s_addk_i32 s32, 0xfe00
; GFX10-NEXT: s_mov_b32 s33, s18
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -3951,10 +3947,10 @@ define void @test_call(bfloat %in, ptr addrspace(5) %out) {
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: v_readlane_b32 s31, v2, 1
; GFX11-NEXT: v_readlane_b32 s30, v2, 0
+; GFX11-NEXT: s_mov_b32 s32, s33
; GFX11-NEXT: s_xor_saveexec_b32 s0, -1
; GFX11-NEXT: scratch_load_b32 v2, off, s33 ; 4-byte Folded Reload
; GFX11-NEXT: s_mov_b32 exec_lo, s0
-; GFX11-NEXT: s_add_i32 s32, s32, -16
; GFX11-NEXT: s_mov_b32 s33, s2
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -3994,10 +3990,10 @@ define void @test_call_v2bf16(<2 x bfloat> %in, ptr addrspace(5) %out) {
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_readlane_b32 s31, v4, 1
; GCN-NEXT: v_readlane_b32 s30, v4, 0
+; GCN-NEXT: s_mov_b32 s32, s33
; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s33 ; 4-byte Folded Reload
; GCN-NEXT: s_mov_b64 exec, s[4:5]
-; GCN-NEXT: s_addk_i32 s32, 0xfc00
; GCN-NEXT: s_mov_b32 s33, s18
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
@@ -4030,10 +4026,10 @@ define void @test_call_v2bf16(<2 x bfloat> %in, ptr addrspace(5) %out) {
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_readlane_b32 s31, v4, 1
; GFX7-NEXT: v_readlane_b32 s30, v4, 0
+; GFX7-NEXT: s_mov_b32 s32, s33
; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX7-NEXT: buffer_load_dword v4, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX7-NEXT: s_mov_b64 exec, s[4:5]
-; GFX7-NEXT: s_addk_i32 s32, 0xfc00
; GFX7-NEXT: s_mov_b32 s33, s18
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: s_setpc_b64 s[30:31]
@@ -4059,10 +4055,10 @@ define void @test_call_v2bf16(<2 x bfloat> %in, ptr addrspace(5) %out) {
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_readlane_b32 s31, v2, 1
; GFX8-NEXT: v_readlane_b32 s30, v2, 0
+; GFX8-NEXT: s_mov_b32 s32, s33
; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX8-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX8-NEXT: s_mov_b64 exec, s[4:5]
-; GFX8-NEXT: s_addk_i32 s32, 0xfc00
; GFX8-NEXT: s_mov_b32 s33, s18
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_setpc_b64 s[30:31]
@@ -4088,10 +4084,10 @@ define void @test_call_v2bf16(<2 x bfloat> %in, ptr addrspace(5) %out) {
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_readlane_b32 s31, v2, 1
; GFX9-NEXT: v_readlane_b32 s30, v2, 0
+; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[4:5]
-; GFX9-NEXT: s_addk_i32 s32, 0xfc00
; GFX9-NEXT: s_mov_b32 s33, s18
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -4118,11 +4114,11 @@ define void @test_call_v2bf16(<2 x bfloat> %in, ptr addrspace(5) %out) {
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_readlane_b32 s31, v2, 1
; GFX10-NEXT: v_readlane_b32 s30, v2, 0
+; GFX10-NEXT: s_mov_b32 s32, s33
; GFX10-NEXT: s_xor_saveexec_b32 s4, -1
; GFX10-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s4
-; GFX10-NEXT: s_addk_i32 s32, 0xfe00
; GFX10-NEXT: s_mov_b32 s33, s18
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -4148,10 +4144,10 @@ define void @test_call_v2bf16(<2 x bfloat> %in, ptr addrspace(5) %out) {
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: v_readlane_b32 s31, v2, 1
; GFX11-NEXT: v_readlane_b32 s30, v2, 0
+; GFX11-NEXT: s_mov_b32 s32, s33
; GFX11-NEXT: s_xor_saveexec_b32 s0, -1
; GFX11-NEXT: scratch_load_b32 v2, off, s33 ; 4-byte Folded Reload
; GFX11-NEXT: s_mov_b32 exec_lo, s0
-; GFX11-NEXT: s_add_i32 s32, s32, -16
; GFX11-NEXT: s_mov_b32 s33, s2
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -4193,10 +4189,10 @@ define void @test_call_v3bf16(<3 x bfloat> %in, ptr addrspace(5) %out) {
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_readlane_b32 s31, v5, 1
; GCN-NEXT: v_readlane_b32 s30, v5, 0
+; GCN-NEXT: s_mov_b32 s32, s33
; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s33 ; 4-byte Folded Reload
; GCN-NEXT: s_mov_b64 exec, s[4:5]
-; GCN-NEXT: s_addk_i32 s32, 0xfc00
; GCN-NEXT: s_mov_b32 s33, s18
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
@@ -4231,10 +4227,10 @@ define void @test_call_v3bf16(<3 x bfloat> %in, ptr addrspace(5) %out) {
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_readlane_b32 s31, v4, 1
; GFX7-NEXT: v_readlane_b32 s30, v4, 0
+; GFX7-NEXT: s_mov_b32 s32, s33
; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX7-NEXT: buffer_load_dword v4, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX7-NEXT: s_mov_b64 exec, s[4:5]
-; GFX7-NEXT: s_addk_i32 s32, 0xfc00
; GFX7-NEXT: s_mov_b32 s33, s18
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: s_setpc_b64 s[30:31]
@@ -4263,10 +4259,10 @@ define void @test_call_v3bf16(<3 x bfloat> %in, ptr addrspace(5) %out) {
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_readlane_b32 s31, v4, 1
; GFX8-NEXT: v_readlane_b32 s30, v4, 0
+; GFX8-NEXT: s_mov_b32 s32, s33
; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX8-NEXT: buffer_load_dword v4, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX8-NEXT: s_mov_b64 exec, s[4:5]
-; GFX8-NEXT: s_addk_i32 s32, 0xfc00
; GFX8-NEXT: s_mov_b32 s33, s18
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_setpc_b64 s[30:31]
@@ -4294,10 +4290,10 @@ define void @test_call_v3bf16(<3 x bfloat> %in, ptr addrspace(5) %out) {
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_readlane_b32 s31, v3, 1
; GFX9-NEXT: v_readlane_b32 s30, v3, 0
+; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[4:5]
-; GFX9-NEXT: s_addk_i32 s32, 0xfc00
; GFX9-NEXT: s_mov_b32 s33, s18
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -4326,11 +4322,11 @@ define void @test_call_v3bf16(<3 x bfloat> %in, ptr addrspace(5) %out) {
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_readlane_b32 s31, v3, 1
; GFX10-NEXT: v_readlane_b32 s30, v3, 0
+; GFX10-NEXT: s_mov_b32 s32, s33
; GFX10-NEXT: s_xor_saveexec_b32 s4, -1
; GFX10-NEXT: buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s4
-; GFX10-NEXT: s_addk_i32 s32, 0xfe00
; GFX10-NEXT: s_mov_b32 s33, s18
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -4358,10 +4354,10 @@ define void @test_call_v3bf16(<3 x bfloat> %in, ptr addrspace(5) %out) {
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: v_readlane_b32 s31, v3, 1
; GFX11-NEXT: v_readlane_b32 s30, v3, 0
+; GFX11-NEXT: s_mov_b32 s32, s33
; GFX11-NEXT: s_xor_saveexec_b32 s0, -1
; GFX11-NEXT: scratch_load_b32 v3, off, s33 ; 4-byte Folded Reload
; GFX11-NEXT: s_mov_b32 exec_lo, s0
-; GFX11-NEXT: s_add_i32 s32, s32, -16
; GFX11-NEXT: s_mov_b32 s33, s2
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -4411,10 +4407,10 @@ define void @test_call_v4bf16(<4 x bfloat> %in, ptr addrspace(5) %out) {
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_readlane_b32 s31, v8, 1
; GCN-NEXT: v_readlane_b32 s30, v8, 0
+; GCN-NEXT: s_mov_b32 s32, s33
; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s33 ; 4-byte Folded Reload
; GCN-NEXT: s_mov_b64 exec, s[4:5]
-; GCN-NEXT: s_addk_i32 s32, 0xfc00
; GCN-NEXT: s_mov_b32 s33, s18
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
@@ -4457,10 +4453,10 @@ define void @test_call_v4bf16(<4 x bfloat> %in, ptr addrspace(5) %out) {
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_readlane_b32 s31, v6, 1
; GFX7-NEXT: v_readlane_b32 s30, v6, 0
+; GFX7-NEXT: s_mov_b32 s32, s33
; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX7-NEXT: buffer_load_dword v6, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX7-NEXT: s_mov_b64 exec, s[4:5]
-; GFX7-NEXT: s_addk_i32 s32, 0xfc00
; GFX7-NEXT: s_mov_b32 s33, s18
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: s_setpc_b64 s[30:31]
@@ -4489,10 +4485,10 @@ define void @test_call_v4bf16(<4 x bfloat> %in, ptr addrspace(5) %out) {
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_readlane_b32 s31, v4, 1
; GFX8-NEXT: v_readlane_b32 s30, v4, 0
+; GFX8-NEXT: s_mov_b32 s32, s33
; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX8-NEXT: buffer_load_dword v4, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX8-NEXT: s_mov_b64 exec, s[4:5]
-; GFX8-NEXT: s_addk_i32 s32, 0xfc00
; GFX8-NEXT: s_mov_b32 s33, s18
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_setpc_b64 s[30:31]
@@ -4520,10 +4516,10 @@ define void @test_call_v4bf16(<4 x bfloat> %in, ptr addrspace(5) %out) {
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_readlane_b32 s31, v3, 1
; GFX9-NEXT: v_readlane_b32 s30, v3, 0
+; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[4:5]
-; GFX9-NEXT: s_addk_i32 s32, 0xfc00
; GFX9-NEXT: s_mov_b32 s33, s18
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -4552,11 +4548,11 @@ define void @test_call_v4bf16(<4 x bfloat> %in, ptr addrspace(5) %out) {
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_readlane_b32 s31, v3, 1
; GFX10-NEXT: v_readlane_b32 s30, v3, 0
+; GFX10-NEXT: s_mov_b32 s32, s33
; GFX10-NEXT: s_xor_saveexec_b32 s4, -1
; GFX10-NEXT: buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s4
-; GFX10-NEXT: s_addk_i32 s32, 0xfe00
; GFX10-NEXT: s_mov_b32 s33, s18
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -4582,10 +4578,10 @@ define void @test_call_v4bf16(<4 x bfloat> %in, ptr addrspace(5) %out) {
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: v_readlane_b32 s31, v3, 1
; GFX11-NEXT: v_readlane_b32 s30, v3, 0
+; GFX11-NEXT: s_mov_b32 s32, s33
; GFX11-NEXT: s_xor_saveexec_b32 s0, -1
; GFX11-NEXT: scratch_load_b32 v3, off, s33 ; 4-byte Folded Reload
; GFX11-NEXT: s_mov_b32 exec_lo, s0
-; GFX11-NEXT: s_add_i32 s32, s32, -16
; GFX11-NEXT: s_mov_b32 s33, s2
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -4655,10 +4651,10 @@ define void @test_call_v8bf16(<8 x bfloat> %in, ptr addrspace(5) %out) {
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_readlane_b32 s31, v16, 1
; GCN-NEXT: v_readlane_b32 s30, v16, 0
+; GCN-NEXT: s_mov_b32 s32, s33
; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s33 ; 4-byte Folded Reload
; GCN-NEXT: s_mov_b64 exec, s[4:5]
-; GCN-NEXT: s_addk_i32 s32, 0xfc00
; GCN-NEXT: s_mov_b32 s33, s18
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
@@ -4721,10 +4717,10 @@ define void @test_call_v8bf16(<8 x bfloat> %in, ptr addrspace(5) %out) {
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_readlane_b32 s31, v10, 1
; GFX7-NEXT: v_readlane_b32 s30, v10, 0
+; GFX7-NEXT: s_mov_b32 s32, s33
; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX7-NEXT: buffer_load_dword v10, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX7-NEXT: s_mov_b64 exec, s[4:5]
-; GFX7-NEXT: s_addk_i32 s32, 0xfc00
; GFX7-NEXT: s_mov_b32 s33, s18
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: s_setpc_b64 s[30:31]
@@ -4759,10 +4755,10 @@ define void @test_call_v8bf16(<8 x bfloat> %in, ptr addrspace(5) %out) {
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_readlane_b32 s31, v6, 1
; GFX8-NEXT: v_readlane_b32 s30, v6, 0
+; GFX8-NEXT: s_mov_b32 s32, s33
; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX8-NEXT: buffer_load_dword v6, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX8-NEXT: s_mov_b64 exec, s[4:5]
-; GFX8-NEXT: s_addk_i32 s32, 0xfc00
; GFX8-NEXT: s_mov_b32 s33, s18
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_setpc_b64 s[30:31]
@@ -4794,10 +4790,10 @@ define void @test_call_v8bf16(<8 x bfloat> %in, ptr addrspace(5) %out) {
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_readlane_b32 s31, v5, 1
; GFX9-NEXT: v_readlane_b32 s30, v5, 0
+; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[4:5]
-; GFX9-NEXT: s_addk_i32 s32, 0xfc00
; GFX9-NEXT: s_mov_b32 s33, s18
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -4830,11 +4826,11 @@ define void @test_call_v8bf16(<8 x bfloat> %in, ptr addrspace(5) %out) {
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_readlane_b32 s31, v5, 1
; GFX10-NEXT: v_readlane_b32 s30, v5, 0
+; GFX10-NEXT: s_mov_b32 s32, s33
; GFX10-NEXT: s_xor_saveexec_b32 s4, -1
; GFX10-NEXT: buffer_load_dword v5, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s4
-; GFX10-NEXT: s_addk_i32 s32, 0xfe00
; GFX10-NEXT: s_mov_b32 s33, s18
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -4860,10 +4856,10 @@ define void @test_call_v8bf16(<8 x bfloat> %in, ptr addrspace(5) %out) {
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: v_readlane_b32 s31, v5, 1
; GFX11-NEXT: v_readlane_b32 s30, v5, 0
+; GFX11-NEXT: s_mov_b32 s32, s33
; GFX11-NEXT: s_xor_saveexec_b32 s0, -1
; GFX11-NEXT: scratch_load_b32 v5, off, s33 ; 4-byte Folded Reload
; GFX11-NEXT: s_mov_b32 exec_lo, s0
-; GFX11-NEXT: s_add_i32 s32, s32, -16
; GFX11-NEXT: s_mov_b32 s33, s2
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -4880,12 +4876,12 @@ define void @test_call_v16bf16(<16 x bfloat> %in, ptr addrspace(5) %out) {
; GCN-NEXT: s_mov_b32 s18, s33
; GCN-NEXT: s_mov_b32 s33, s32
; GCN-NEXT: s_xor_saveexec_b64 s[16:17], -1
-; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s33 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s33 ; 4-byte Folded Spill
; GCN-NEXT: s_mov_b64 exec, s[16:17]
; GCN-NEXT: s_addk_i32 s32, 0x400
; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_writelane_b32 v21, s30, 0
-; GCN-NEXT: v_writelane_b32 v21, s31, 1
+; GCN-NEXT: v_writelane_b32 v20, s30, 0
+; GCN-NEXT: v_writelane_b32 v20, s31, 1
; GCN-NEXT: s_getpc_b64 s[16:17]
; GCN-NEXT: s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4
; GCN-NEXT: s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12
@@ -4911,36 +4907,36 @@ define void @test_call_v16bf16(<16 x bfloat> %in, ptr addrspace(5) %out) {
; GCN-NEXT: v_add_i32_e32 v17, vcc, 30, v16
; GCN-NEXT: v_add_i32_e32 v18, vcc, 28, v16
; GCN-NEXT: v_add_i32_e32 v19, vcc, 26, v16
-; GCN-NEXT: v_add_i32_e32 v20, vcc, 24, v16
; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v15
; GCN-NEXT: buffer_store_short v15, v17, s[0:3], 0 offen
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
-; GCN-NEXT: v_add_i32_e32 v15, vcc, 22, v16
-; GCN-NEXT: v_add_i32_e32 v17, vcc, 20, v16
+; GCN-NEXT: v_add_i32_e32 v15, vcc, 24, v16
+; GCN-NEXT: v_add_i32_e32 v17, vcc, 22, v16
; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v14
; GCN-NEXT: buffer_store_short v14, v18, s[0:3], 0 offen
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
-; GCN-NEXT: v_add_i32_e32 v14, vcc, 18, v16
-; GCN-NEXT: v_add_i32_e32 v18, vcc, 16, v16
+; GCN-NEXT: v_add_i32_e32 v14, vcc, 20, v16
+; GCN-NEXT: v_add_i32_e32 v18, vcc, 18, v16
; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v13
; GCN-NEXT: buffer_store_short v13, v19, s[0:3], 0 offen
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
-; GCN-NEXT: v_add_i32_e32 v13, vcc, 14, v16
-; GCN-NEXT: v_add_i32_e32 v19, vcc, 12, v16
+; GCN-NEXT: v_add_i32_e32 v13, vcc, 16, v16
+; GCN-NEXT: v_add_i32_e32 v19, vcc, 14, v16
; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v12
-; GCN-NEXT: buffer_store_short v12, v20, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_short v12, v15, s[0:3], 0 offen
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
-; GCN-NEXT: v_add_i32_e32 v12, vcc, 10, v16
-; GCN-NEXT: v_add_i32_e32 v20, vcc, 8, v16
+; GCN-NEXT: v_add_i32_e32 v12, vcc, 12, v16
+; GCN-NEXT: v_add_i32_e32 v15, vcc, 10, v16
; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v11
-; GCN-NEXT: buffer_store_short v11, v15, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_short v11, v17, s[0:3], 0 offen
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
-; GCN-NEXT: v_add_i32_e32 v11, vcc, 6, v16
-; GCN-NEXT: v_add_i32_e32 v15, vcc, 4, v16
+; GCN-NEXT: v_add_i32_e32 v11, vcc, 8, v16
+; GCN-NEXT: v_add_i32_e32 v17, vcc, 6, v16
; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v10
-; GCN-NEXT: buffer_store_short v10, v17, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_short v10, v14, s[0:3], 0 offen
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
-; GCN-NEXT: v_add_i32_e32 v10, vcc, 2, v16
+; GCN-NEXT: v_add_i32_e32 v10, vcc, 4, v16
+; GCN-NEXT: v_add_i32_e32 v14, vcc, 2, v16
; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2
@@ -4951,32 +4947,32 @@ define void @test_call_v16bf16(<16 x bfloat> %in, ptr addrspace(5) %out) {
; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7
; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8
; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9
-; GCN-NEXT: buffer_store_short v9, v14, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_short v9, v18, s[0:3], 0 offen
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: buffer_store_short v8, v18, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_short v8, v13, s[0:3], 0 offen
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: buffer_store_short v7, v13, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_short v7, v19, s[0:3], 0 offen
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: buffer_store_short v6, v19, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_short v6, v12, s[0:3], 0 offen
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: buffer_store_short v5, v12, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_short v5, v15, s[0:3], 0 offen
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: buffer_store_short v4, v20, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_short v4, v11, s[0:3], 0 offen
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: buffer_store_short v3, v11, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_short v3, v17, s[0:3], 0 offen
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: buffer_store_short v2, v15, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_short v2, v10, s[0:3], 0 offen
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: buffer_store_short v1, v10, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_short v1, v14, s[0:3], 0 offen
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_short v0, v16, s[0:3], 0 offen
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_readlane_b32 s31, v21, 1
-; GCN-NEXT: v_readlane_b32 s30, v21, 0
+; GCN-NEXT: v_readlane_b32 s31, v20, 1
+; GCN-NEXT: v_readlane_b32 s30, v20, 0
+; GCN-NEXT: s_mov_b32 s32, s33
; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1
-; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s33 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s33 ; 4-byte Folded Reload
; GCN-NEXT: s_mov_b64 exec, s[4:5]
-; GCN-NEXT: s_addk_i32 s32, 0xfc00
; GCN-NEXT: s_mov_b32 s33, s18
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
@@ -5079,10 +5075,10 @@ define void @test_call_v16bf16(<16 x bfloat> %in, ptr addrspace(5) %out) {
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_readlane_b32 s31, v18, 1
; GFX7-NEXT: v_readlane_b32 s30, v18, 0
+; GFX7-NEXT: s_mov_b32 s32, s33
; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX7-NEXT: buffer_load_dword v18, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX7-NEXT: s_mov_b64 exec, s[4:5]
-; GFX7-NEXT: s_addk_i32 s32, 0xfc00
; GFX7-NEXT: s_mov_b32 s33, s18
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: s_setpc_b64 s[30:31]
@@ -5129,10 +5125,10 @@ define void @test_call_v16bf16(<16 x bfloat> %in, ptr addrspace(5) %out) {
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_readlane_b32 s31, v10, 1
; GFX8-NEXT: v_readlane_b32 s30, v10, 0
+; GFX8-NEXT: s_mov_b32 s32, s33
; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX8-NEXT: buffer_load_dword v10, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX8-NEXT: s_mov_b64 exec, s[4:5]
-; GFX8-NEXT: s_addk_i32 s32, 0xfc00
; GFX8-NEXT: s_mov_b32 s33, s18
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_setpc_b64 s[30:31]
@@ -5172,10 +5168,10 @@ define void @test_call_v16bf16(<16 x bfloat> %in, ptr addrspace(5) %out) {
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_readlane_b32 s31, v9, 1
; GFX9-NEXT: v_readlane_b32 s30, v9, 0
+; GFX9-NEXT: s_mov_b32 s32, s33
; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[4:5]
-; GFX9-NEXT: s_addk_i32 s32, 0xfc00
; GFX9-NEXT: s_mov_b32 s33, s18
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -5216,11 +5212,11 @@ define void @test_call_v16bf16(<16 x bfloat> %in, ptr addrspace(5) %out) {
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_readlane_b32 s31, v9, 1
; GFX10-NEXT: v_readlane_b32 s30, v9, 0
+; GFX10-NEXT: s_mov_b32 s32, s33
; GFX10-NEXT: s_xor_saveexec_b32 s4, -1
; GFX10-NEXT: buffer_load_dword v9, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s4
-; GFX10-NEXT: s_addk_i32 s32, 0xfe00
; GFX10-NEXT: s_mov_b32 s33, s18
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -5248,10 +5244,10 @@ define void @test_call_v16bf16(<16 x bfloat> %in, ptr addrspace(5) %out) {
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: v_readlane_b32 s31, v9, 1
; GFX11-NEXT: v_readlane_b32 s30, v9, 0
+; GFX11-NEXT: s_mov_b32 s32, s33
; GFX11-NEXT: s_xor_saveexec_b32 s0, -1
; GFX11-NEXT: scratch_load_b32 v9, off, s33 ; 4-byte Folded Reload
; GFX11-NEXT: s_mov_b32 exec_lo, s0
-; GFX11-NEXT: s_add_i32 s32, s32, -16
; GFX11-NEXT: s_mov_b32 s33, s2
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -5365,10 +5361,10 @@ define { <32 x i32>, bfloat } @test_overflow_stack(bfloat %a, <32 x i32> %b) {
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_add_i32_e32 v27, vcc, 0x50, v0
; GCN-NEXT: v_add_i32_e32 v30, vcc, 0x4c, v0
-; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GCN-NEXT: buffer_store_dword v26, v29, s[0:3], 0 offen
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_add_i32_e32 v26, vcc, 0x48, v0
+; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GCN-NEXT: v_add_i32_e32 v29, vcc, 0x44, v0
; GCN-NEXT: buffer_store_dword v25, v31, s[0:3], 0 offen
; GCN-NEXT: s_waitcnt expcnt(0)
@@ -5587,20 +5583,20 @@ define { <32 x i32>, bfloat } @test_overflow_stack(bfloat %a, <32 x i32> %b) {
; GFX9-NEXT: buffer_store_dword v28, v0, s[0:3], 0 offen offset:104
; GFX9-NEXT: buffer_store_dword v27, v0, s[0:3], 0 offen offset:100
; GFX9-NEXT: buffer_store_dword v26, v0, s[0:3], 0 offen offset:96
+; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:4
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:8
+; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v25, v0, s[0:3], 0 offen offset:92
+; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32
+; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v24, v0, s[0:3], 0 offen offset:88
; GFX9-NEXT: buffer_store_dword v23, v0, s[0:3], 0 offen offset:84
; GFX9-NEXT: buffer_store_dword v22, v0, s[0:3], 0 offen offset:80
; GFX9-NEXT: buffer_store_dword v21, v0, s[0:3], 0 offen offset:76
; GFX9-NEXT: buffer_store_dword v20, v0, s[0:3], 0 offen offset:72
-; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:4
-; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen offset:68
-; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:8
-; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v18, v0, s[0:3], 0 offen offset:64
-; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32
-; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v17, v0, s[0:3], 0 offen offset:60
; GFX9-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen offset:56
; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:52
@@ -5617,11 +5613,11 @@ define { <32 x i32>, bfloat } @test_overflow_stack(bfloat %a, <32 x i32> %b) {
; GFX9-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:8
; GFX9-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4
; GFX9-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
-; GFX9-NEXT: s_waitcnt vmcnt(18)
-; GFX9-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen offset:124
-; GFX9-NEXT: buffer_store_dword v20, v0, s[0:3], 0 offen offset:120
-; GFX9-NEXT: s_waitcnt vmcnt(18)
-; GFX9-NEXT: buffer_store_dword v18, v0, s[0:3], 0 offen offset:116
+; GFX9-NEXT: s_waitcnt vmcnt(25)
+; GFX9-NEXT: buffer_store_dword v27, v0, s[0:3], 0 offen offset:124
+; GFX9-NEXT: buffer_store_dword v26, v0, s[0:3], 0 offen offset:120
+; GFX9-NEXT: s_waitcnt vmcnt(25)
+; GFX9-NEXT: buffer_store_dword v25, v0, s[0:3], 0 offen offset:116
; GFX9-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen offset:128
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -7618,197 +7614,197 @@ define <32 x double> @global_extload_v32bf16_to_v32f64(ptr addrspace(1) %ptr) {
; GCN-NEXT: buffer_load_ushort v16, v[1:2], s[4:7], 0 addr64 offset:26
; GCN-NEXT: buffer_load_ushort v17, v[1:2], s[4:7], 0 addr64 offset:28
; GCN-NEXT: buffer_load_ushort v18, v[1:2], s[4:7], 0 addr64 offset:30
-; GCN-NEXT: buffer_load_ushort v25, v[1:2], s[4:7], 0 addr64 offset:48
-; GCN-NEXT: buffer_load_ushort v26, v[1:2], s[4:7], 0 addr64 offset:50
-; GCN-NEXT: buffer_load_ushort v27, v[1:2], s[4:7], 0 addr64 offset:52
-; GCN-NEXT: buffer_load_ushort v28, v[1:2], s[4:7], 0 addr64 offset:54
-; GCN-NEXT: buffer_load_ushort v29, v[1:2], s[4:7], 0 addr64 offset:56
-; GCN-NEXT: buffer_load_ushort v30, v[1:2], s[4:7], 0 addr64 offset:58
-; GCN-NEXT: buffer_load_ushort v31, v[1:2], s[4:7], 0 addr64 offset:60
-; GCN-NEXT: buffer_load_ushort v32, v[1:2], s[4:7], 0 addr64 offset:62
+; GCN-NEXT: buffer_load_ushort v23, v[1:2], s[4:7], 0 addr64 offset:48
+; GCN-NEXT: buffer_load_ushort v24, v[1:2], s[4:7], 0 addr64 offset:50
+; GCN-NEXT: buffer_load_ushort v25, v[1:2], s[4:7], 0 addr64 offset:52
+; GCN-NEXT: buffer_load_ushort v26, v[1:2], s[4:7], 0 addr64 offset:54
+; GCN-NEXT: buffer_load_ushort v27, v[1:2], s[4:7], 0 addr64 offset:56
+; GCN-NEXT: buffer_load_ushort v28, v[1:2], s[4:7], 0 addr64 offset:58
+; GCN-NEXT: buffer_load_ushort v29, v[1:2], s[4:7], 0 addr64 offset:60
+; GCN-NEXT: buffer_load_ushort v30, v[1:2], s[4:7], 0 addr64 offset:62
; GCN-NEXT: buffer_load_ushort v19, v[1:2], s[4:7], 0 addr64 offset:32
; GCN-NEXT: buffer_load_ushort v20, v[1:2], s[4:7], 0 addr64 offset:34
; GCN-NEXT: buffer_load_ushort v21, v[1:2], s[4:7], 0 addr64 offset:36
; GCN-NEXT: buffer_load_ushort v22, v[1:2], s[4:7], 0 addr64 offset:38
-; GCN-NEXT: buffer_load_ushort v23, v[1:2], s[4:7], 0 addr64 offset:40
-; GCN-NEXT: buffer_load_ushort v24, v[1:2], s[4:7], 0 addr64 offset:42
+; GCN-NEXT: buffer_load_ushort v31, v[1:2], s[4:7], 0 addr64 offset:40
+; GCN-NEXT: buffer_load_ushort v32, v[1:2], s[4:7], 0 addr64 offset:42
; GCN-NEXT: buffer_load_ushort v33, v[1:2], s[4:7], 0 addr64 offset:44
; GCN-NEXT: buffer_load_ushort v34, v[1:2], s[4:7], 0 addr64 offset:46
; GCN-NEXT: s_waitcnt vmcnt(8)
-; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v32
-; GCN-NEXT: v_add_i32_e32 v32, vcc, 0xfc, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v30
+; GCN-NEXT: v_add_i32_e32 v30, vcc, 0xfc, v0
; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1
-; GCN-NEXT: buffer_store_dword v2, v32, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v2, v30, s[0:3], 0 offen
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_add_i32_e32 v2, vcc, 0xf8, v0
; GCN-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v31
-; GCN-NEXT: v_add_i32_e32 v31, vcc, 0xf4, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v29
+; GCN-NEXT: v_add_i32_e32 v29, vcc, 0xf4, v0
; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1
-; GCN-NEXT: buffer_store_dword v2, v31, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v2, v29, s[0:3], 0 offen
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_add_i32_e32 v2, vcc, 0xf0, v0
; GCN-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; GCN-NEXT: v_add_i32_e32 v31, vcc, 0xec, v0
+; GCN-NEXT: v_add_i32_e32 v29, vcc, 0xec, v0
; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v30
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v28
; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1
-; GCN-NEXT: buffer_store_dword v2, v31, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v2, v29, s[0:3], 0 offen
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_add_i32_e32 v2, vcc, 0xe8, v0
; GCN-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; GCN-NEXT: v_add_i32_e32 v30, vcc, 0xe4, v0
+; GCN-NEXT: v_add_i32_e32 v28, vcc, 0xe4, v0
; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v29
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v27
; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1
-; GCN-NEXT: buffer_store_dword v2, v30, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v2, v28, s[0:3], 0 offen
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_add_i32_e32 v2, vcc, 0xe0, v0
; GCN-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; GCN-NEXT: v_add_i32_e32 v29, vcc, 0xdc, v0
-; GCN-NEXT: v_add_i32_e32 v30, vcc, 0xd8, v0
+; GCN-NEXT: v_add_i32_e32 v27, vcc, 0xdc, v0
+; GCN-NEXT: v_add_i32_e32 v28, vcc, 0xd8, v0
; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v28
-; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1
-; GCN-NEXT: buffer_store_dword v2, v29, s[0:3], 0 offen
-; GCN-NEXT: v_add_i32_e32 v28, vcc, 0xd4, v0
-; GCN-NEXT: buffer_store_dword v1, v30, s[0:3], 0 offen
-; GCN-NEXT: v_add_i32_e32 v29, vcc, 0xd0, v0
-; GCN-NEXT: v_add_i32_e32 v30, vcc, 0xcc, v0
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v27
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v26
; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1
-; GCN-NEXT: buffer_store_dword v2, v28, s[0:3], 0 offen
-; GCN-NEXT: v_add_i32_e32 v27, vcc, 0xc8, v0
-; GCN-NEXT: buffer_store_dword v1, v29, s[0:3], 0 offen
-; GCN-NEXT: v_add_i32_e32 v28, vcc, 0xc4, v0
-; GCN-NEXT: v_add_i32_e32 v29, vcc, 0xc0, v0
+; GCN-NEXT: buffer_store_dword v2, v27, s[0:3], 0 offen
+; GCN-NEXT: v_add_i32_e32 v26, vcc, 0xd4, v0
+; GCN-NEXT: buffer_store_dword v1, v28, s[0:3], 0 offen
+; GCN-NEXT: v_add_i32_e32 v27, vcc, 0xd0, v0
+; GCN-NEXT: v_add_i32_e32 v28, vcc, 0xcc, v0
; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v26
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v25
; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1
-; GCN-NEXT: buffer_store_dword v2, v30, s[0:3], 0 offen
-; GCN-NEXT: v_add_i32_e32 v26, vcc, 0xbc, v0
+; GCN-NEXT: buffer_store_dword v2, v26, s[0:3], 0 offen
+; GCN-NEXT: v_add_i32_e32 v25, vcc, 0xc8, v0
; GCN-NEXT: buffer_store_dword v1, v27, s[0:3], 0 offen
-; GCN-NEXT: v_add_i32_e32 v27, vcc, 0xb8, v0
-; GCN-NEXT: v_add_i32_e32 v30, vcc, 0xb4, v0
+; GCN-NEXT: v_add_i32_e32 v26, vcc, 0xc4, v0
+; GCN-NEXT: v_add_i32_e32 v27, vcc, 0xc0, v0
; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v25
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v24
; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1
; GCN-NEXT: buffer_store_dword v2, v28, s[0:3], 0 offen
-; GCN-NEXT: v_add_i32_e32 v25, vcc, 0xb0, v0
-; GCN-NEXT: buffer_store_dword v1, v29, s[0:3], 0 offen
-; GCN-NEXT: v_add_i32_e32 v28, vcc, 0xac, v0
-; GCN-NEXT: v_add_i32_e32 v29, vcc, 0xa8, v0
-; GCN-NEXT: s_waitcnt vmcnt(14) expcnt(0)
-; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v34
+; GCN-NEXT: v_add_i32_e32 v24, vcc, 0xbc, v0
+; GCN-NEXT: buffer_store_dword v1, v25, s[0:3], 0 offen
+; GCN-NEXT: v_add_i32_e32 v25, vcc, 0xb8, v0
+; GCN-NEXT: v_add_i32_e32 v28, vcc, 0xb4, v0
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v23
; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1
; GCN-NEXT: buffer_store_dword v2, v26, s[0:3], 0 offen
-; GCN-NEXT: v_add_i32_e32 v26, vcc, 0xa4, v0
+; GCN-NEXT: v_add_i32_e32 v23, vcc, 0xb0, v0
; GCN-NEXT: buffer_store_dword v1, v27, s[0:3], 0 offen
-; GCN-NEXT: v_add_i32_e32 v27, vcc, 0xa0, v0
-; GCN-NEXT: v_add_i32_e32 v31, vcc, 0x9c, v0
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v33
+; GCN-NEXT: v_add_i32_e32 v26, vcc, 0xac, v0
+; GCN-NEXT: v_add_i32_e32 v27, vcc, 0xa8, v0
+; GCN-NEXT: s_waitcnt vmcnt(14) expcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v34
; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1
-; GCN-NEXT: buffer_store_dword v2, v30, s[0:3], 0 offen
-; GCN-NEXT: v_add_i32_e32 v30, vcc, 0x98, v0
+; GCN-NEXT: buffer_store_dword v2, v24, s[0:3], 0 offen
+; GCN-NEXT: v_add_i32_e32 v24, vcc, 0xa4, v0
; GCN-NEXT: buffer_store_dword v1, v25, s[0:3], 0 offen
-; GCN-NEXT: v_add_i32_e32 v25, vcc, 0x94, v0
-; GCN-NEXT: v_add_i32_e32 v32, vcc, 0x90, v0
+; GCN-NEXT: v_add_i32_e32 v25, vcc, 0xa0, v0
+; GCN-NEXT: v_add_i32_e32 v29, vcc, 0x9c, v0
; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v24
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v33
; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1
; GCN-NEXT: buffer_store_dword v2, v28, s[0:3], 0 offen
-; GCN-NEXT: v_add_i32_e32 v24, vcc, 0x8c, v0
-; GCN-NEXT: buffer_store_dword v1, v29, s[0:3], 0 offen
-; GCN-NEXT: v_add_i32_e32 v28, vcc, 0x88, v0
-; GCN-NEXT: v_add_i32_e32 v29, vcc, 0x84, v0
+; GCN-NEXT: v_add_i32_e32 v28, vcc, 0x98, v0
+; GCN-NEXT: buffer_store_dword v1, v23, s[0:3], 0 offen
+; GCN-NEXT: v_add_i32_e32 v23, vcc, 0x94, v0
+; GCN-NEXT: v_add_i32_e32 v30, vcc, 0x90, v0
; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v23
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v32
; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1
; GCN-NEXT: buffer_store_dword v2, v26, s[0:3], 0 offen
-; GCN-NEXT: v_add_i32_e32 v23, vcc, 0x80, v0
+; GCN-NEXT: v_add_i32_e32 v26, vcc, 0x8c, v0
; GCN-NEXT: buffer_store_dword v1, v27, s[0:3], 0 offen
-; GCN-NEXT: v_add_i32_e32 v26, vcc, 0x7c, v0
-; GCN-NEXT: v_add_i32_e32 v27, vcc, 0x78, v0
+; GCN-NEXT: v_add_i32_e32 v27, vcc, 0x88, v0
+; GCN-NEXT: v_add_i32_e32 v32, vcc, 0x84, v0
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v31
+; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1
+; GCN-NEXT: buffer_store_dword v2, v24, s[0:3], 0 offen
+; GCN-NEXT: v_add_i32_e32 v24, vcc, 0x80, v0
+; GCN-NEXT: buffer_store_dword v1, v25, s[0:3], 0 offen
+; GCN-NEXT: v_add_i32_e32 v25, vcc, 0x7c, v0
+; GCN-NEXT: v_add_i32_e32 v31, vcc, 0x78, v0
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v22
; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1
-; GCN-NEXT: buffer_store_dword v2, v31, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v2, v29, s[0:3], 0 offen
; GCN-NEXT: v_add_i32_e32 v22, vcc, 0x74, v0
-; GCN-NEXT: buffer_store_dword v1, v30, s[0:3], 0 offen
-; GCN-NEXT: v_add_i32_e32 v30, vcc, 0x70, v0
-; GCN-NEXT: v_add_i32_e32 v31, vcc, 0x6c, v0
+; GCN-NEXT: buffer_store_dword v1, v28, s[0:3], 0 offen
+; GCN-NEXT: v_add_i32_e32 v28, vcc, 0x70, v0
+; GCN-NEXT: v_add_i32_e32 v29, vcc, 0x6c, v0
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v21
; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1
-; GCN-NEXT: buffer_store_dword v2, v25, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v2, v23, s[0:3], 0 offen
; GCN-NEXT: v_add_i32_e32 v21, vcc, 0x68, v0
-; GCN-NEXT: buffer_store_dword v1, v32, s[0:3], 0 offen
-; GCN-NEXT: v_add_i32_e32 v25, vcc, 0x64, v0
-; GCN-NEXT: v_add_i32_e32 v32, vcc, 0x60, v0
+; GCN-NEXT: buffer_store_dword v1, v30, s[0:3], 0 offen
+; GCN-NEXT: v_add_i32_e32 v23, vcc, 0x64, v0
+; GCN-NEXT: v_add_i32_e32 v30, vcc, 0x60, v0
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v20
; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1
-; GCN-NEXT: buffer_store_dword v2, v24, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v2, v26, s[0:3], 0 offen
; GCN-NEXT: v_add_i32_e32 v20, vcc, 0x5c, v0
-; GCN-NEXT: buffer_store_dword v1, v28, s[0:3], 0 offen
-; GCN-NEXT: v_add_i32_e32 v24, vcc, 0x58, v0
-; GCN-NEXT: v_add_i32_e32 v28, vcc, 0x54, v0
+; GCN-NEXT: buffer_store_dword v1, v27, s[0:3], 0 offen
+; GCN-NEXT: v_add_i32_e32 v26, vcc, 0x58, v0
+; GCN-NEXT: v_add_i32_e32 v27, vcc, 0x54, v0
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v19
; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1
-; GCN-NEXT: buffer_store_dword v2, v29, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v2, v32, s[0:3], 0 offen
; GCN-NEXT: v_add_i32_e32 v19, vcc, 0x50, v0
-; GCN-NEXT: buffer_store_dword v1, v23, s[0:3], 0 offen
-; GCN-NEXT: v_add_i32_e32 v23, vcc, 0x4c, v0
-; GCN-NEXT: v_add_i32_e32 v29, vcc, 0x48, v0
+; GCN-NEXT: buffer_store_dword v1, v24, s[0:3], 0 offen
+; GCN-NEXT: v_add_i32_e32 v24, vcc, 0x4c, v0
+; GCN-NEXT: v_add_i32_e32 v32, vcc, 0x48, v0
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v18
; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1
-; GCN-NEXT: buffer_store_dword v2, v26, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v2, v25, s[0:3], 0 offen
; GCN-NEXT: v_add_i32_e32 v18, vcc, 0x44, v0
-; GCN-NEXT: buffer_store_dword v1, v27, s[0:3], 0 offen
-; GCN-NEXT: v_add_i32_e32 v26, vcc, 64, v0
-; GCN-NEXT: v_add_i32_e32 v27, vcc, 60, v0
+; GCN-NEXT: buffer_store_dword v1, v31, s[0:3], 0 offen
+; GCN-NEXT: v_add_i32_e32 v25, vcc, 64, v0
+; GCN-NEXT: v_add_i32_e32 v31, vcc, 60, v0
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v17
; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1
; GCN-NEXT: buffer_store_dword v2, v22, s[0:3], 0 offen
; GCN-NEXT: v_add_i32_e32 v17, vcc, 56, v0
-; GCN-NEXT: buffer_store_dword v1, v30, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v1, v28, s[0:3], 0 offen
; GCN-NEXT: v_add_i32_e32 v22, vcc, 52, v0
-; GCN-NEXT: v_add_i32_e32 v30, vcc, 48, v0
+; GCN-NEXT: v_add_i32_e32 v28, vcc, 48, v0
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v16
; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1
-; GCN-NEXT: buffer_store_dword v2, v31, s[0:3], 0 offen
-; GCN-NEXT: v_add_i32_e32 v31, vcc, 44, v0
+; GCN-NEXT: buffer_store_dword v2, v29, s[0:3], 0 offen
+; GCN-NEXT: v_add_i32_e32 v29, vcc, 44, v0
; GCN-NEXT: buffer_store_dword v1, v21, s[0:3], 0 offen
; GCN-NEXT: v_add_i32_e32 v21, vcc, 40, v0
; GCN-NEXT: v_add_i32_e32 v33, vcc, 36, v0
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v15
; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1
-; GCN-NEXT: buffer_store_dword v2, v25, s[0:3], 0 offen
-; GCN-NEXT: v_add_i32_e32 v25, vcc, 32, v0
-; GCN-NEXT: buffer_store_dword v1, v32, s[0:3], 0 offen
-; GCN-NEXT: v_add_i32_e32 v32, vcc, 28, v0
+; GCN-NEXT: buffer_store_dword v2, v23, s[0:3], 0 offen
+; GCN-NEXT: v_add_i32_e32 v23, vcc, 32, v0
+; GCN-NEXT: buffer_store_dword v1, v30, s[0:3], 0 offen
+; GCN-NEXT: v_add_i32_e32 v30, vcc, 28, v0
; GCN-NEXT: v_add_i32_e32 v34, vcc, 24, v0
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v14
; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1
; GCN-NEXT: buffer_store_dword v2, v20, s[0:3], 0 offen
; GCN-NEXT: v_add_i32_e32 v20, vcc, 20, v0
-; GCN-NEXT: buffer_store_dword v1, v24, s[0:3], 0 offen
-; GCN-NEXT: v_add_i32_e32 v24, vcc, 16, v0
+; GCN-NEXT: buffer_store_dword v1, v26, s[0:3], 0 offen
+; GCN-NEXT: v_add_i32_e32 v26, vcc, 16, v0
; GCN-NEXT: v_add_i32_e32 v35, vcc, 12, v0
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v13
; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1
-; GCN-NEXT: buffer_store_dword v2, v28, s[0:3], 0 offen
-; GCN-NEXT: v_add_i32_e32 v28, vcc, 8, v0
+; GCN-NEXT: buffer_store_dword v2, v27, s[0:3], 0 offen
+; GCN-NEXT: v_add_i32_e32 v27, vcc, 8, v0
; GCN-NEXT: buffer_store_dword v1, v19, s[0:3], 0 offen
; GCN-NEXT: v_add_i32_e32 v19, vcc, 4, v0
; GCN-NEXT: s_waitcnt expcnt(0)
@@ -7824,34 +7820,34 @@ define <32 x double> @global_extload_v32bf16_to_v32f64(ptr addrspace(1) %ptr) {
; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v8
; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1
; GCN-NEXT: v_cvt_f64_f32_e32 v[3:4], v11
-; GCN-NEXT: buffer_store_dword v2, v23, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v2, v24, s[0:3], 0 offen
; GCN-NEXT: v_cvt_f64_f32_e32 v[5:6], v10
-; GCN-NEXT: buffer_store_dword v1, v29, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v1, v32, s[0:3], 0 offen
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v9
; GCN-NEXT: v_cvt_f64_f32_e32 v[7:8], v12
-; GCN-NEXT: v_cvt_f64_f32_e32 v[9:10], v13
+; GCN-NEXT: v_cvt_f64_f32_e32 v[9:10], v36
; GCN-NEXT: buffer_store_dword v4, v18, s[0:3], 0 offen
-; GCN-NEXT: v_cvt_f64_f32_e32 v[11:12], v36
-; GCN-NEXT: buffer_store_dword v3, v26, s[0:3], 0 offen
+; GCN-NEXT: v_cvt_f64_f32_e32 v[11:12], v13
+; GCN-NEXT: buffer_store_dword v3, v25, s[0:3], 0 offen
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_cvt_f64_f32_e32 v[3:4], v14
; GCN-NEXT: v_cvt_f64_f32_e32 v[13:14], v15
; GCN-NEXT: v_cvt_f64_f32_e32 v[15:16], v16
-; GCN-NEXT: buffer_store_dword v6, v27, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v6, v31, s[0:3], 0 offen
; GCN-NEXT: buffer_store_dword v5, v17, s[0:3], 0 offen
; GCN-NEXT: buffer_store_dword v2, v22, s[0:3], 0 offen
-; GCN-NEXT: buffer_store_dword v1, v30, s[0:3], 0 offen
-; GCN-NEXT: buffer_store_dword v12, v31, s[0:3], 0 offen
-; GCN-NEXT: buffer_store_dword v11, v21, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v1, v28, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v10, v29, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v9, v21, s[0:3], 0 offen
; GCN-NEXT: buffer_store_dword v16, v33, s[0:3], 0 offen
-; GCN-NEXT: buffer_store_dword v15, v25, s[0:3], 0 offen
-; GCN-NEXT: buffer_store_dword v14, v32, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v15, v23, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v14, v30, s[0:3], 0 offen
; GCN-NEXT: buffer_store_dword v13, v34, s[0:3], 0 offen
; GCN-NEXT: buffer_store_dword v4, v20, s[0:3], 0 offen
-; GCN-NEXT: buffer_store_dword v3, v24, s[0:3], 0 offen
-; GCN-NEXT: buffer_store_dword v10, v35, s[0:3], 0 offen
-; GCN-NEXT: buffer_store_dword v9, v28, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v3, v26, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v12, v35, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v11, v27, s[0:3], 0 offen
; GCN-NEXT: buffer_store_dword v8, v19, s[0:3], 0 offen
; GCN-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
@@ -7864,258 +7860,258 @@ define <32 x double> @global_extload_v32bf16_to_v32f64(ptr addrspace(1) %ptr) {
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
-; GFX7-NEXT: buffer_load_ushort v20, v[1:2], s[4:7], 0 addr64 offset:62
-; GFX7-NEXT: buffer_load_ushort v22, v[1:2], s[4:7], 0 addr64 offset:60
-; GFX7-NEXT: buffer_load_ushort v23, v[1:2], s[4:7], 0 addr64 offset:58
-; GFX7-NEXT: buffer_load_ushort v24, v[1:2], s[4:7], 0 addr64 offset:56
-; GFX7-NEXT: buffer_load_ushort v25, v[1:2], s[4:7], 0 addr64 offset:54
-; GFX7-NEXT: buffer_load_ushort v26, v[1:2], s[4:7], 0 addr64 offset:52
-; GFX7-NEXT: buffer_load_ushort v27, v[1:2], s[4:7], 0 addr64 offset:50
-; GFX7-NEXT: buffer_load_ushort v28, v[1:2], s[4:7], 0 addr64 offset:48
-; GFX7-NEXT: buffer_load_ushort v15, v[1:2], s[4:7], 0 addr64 offset:32
-; GFX7-NEXT: buffer_load_ushort v18, v[1:2], s[4:7], 0 addr64 offset:34
-; GFX7-NEXT: buffer_load_ushort v29, v[1:2], s[4:7], 0 addr64 offset:36
-; GFX7-NEXT: buffer_load_ushort v30, v[1:2], s[4:7], 0 addr64 offset:38
-; GFX7-NEXT: buffer_load_ushort v31, v[1:2], s[4:7], 0 addr64 offset:40
-; GFX7-NEXT: buffer_load_ushort v32, v[1:2], s[4:7], 0 addr64 offset:42
-; GFX7-NEXT: buffer_load_ushort v33, v[1:2], s[4:7], 0 addr64 offset:44
-; GFX7-NEXT: buffer_load_ushort v34, v[1:2], s[4:7], 0 addr64 offset:46
-; GFX7-NEXT: buffer_load_ushort v19, v[1:2], s[4:7], 0 addr64
-; GFX7-NEXT: buffer_load_ushort v17, v[1:2], s[4:7], 0 addr64 offset:2
-; GFX7-NEXT: buffer_load_ushort v14, v[1:2], s[4:7], 0 addr64 offset:4
-; GFX7-NEXT: buffer_load_ushort v12, v[1:2], s[4:7], 0 addr64 offset:6
-; GFX7-NEXT: buffer_load_ushort v10, v[1:2], s[4:7], 0 addr64 offset:8
-; GFX7-NEXT: buffer_load_ushort v9, v[1:2], s[4:7], 0 addr64 offset:10
-; GFX7-NEXT: buffer_load_ushort v7, v[1:2], s[4:7], 0 addr64 offset:12
+; GFX7-NEXT: buffer_load_ushort v17, v[1:2], s[4:7], 0 addr64 offset:62
+; GFX7-NEXT: buffer_load_ushort v18, v[1:2], s[4:7], 0 addr64 offset:60
+; GFX7-NEXT: buffer_load_ushort v19, v[1:2], s[4:7], 0 addr64 offset:58
+; GFX7-NEXT: buffer_load_ushort v20, v[1:2], s[4:7], 0 addr64 offset:56
+; GFX7-NEXT: buffer_load_ushort v21, v[1:2], s[4:7], 0 addr64 offset:54
+; GFX7-NEXT: buffer_load_ushort v22, v[1:2], s[4:7], 0 addr64 offset:52
+; GFX7-NEXT: buffer_load_ushort v23, v[1:2], s[4:7], 0 addr64 offset:50
+; GFX7-NEXT: buffer_load_ushort v24, v[1:2], s[4:7], 0 addr64 offset:48
+; GFX7-NEXT: buffer_load_ushort v16, v[1:2], s[4:7], 0 addr64 offset:32
+; GFX7-NEXT: buffer_load_ushort v25, v[1:2], s[4:7], 0 addr64 offset:34
+; GFX7-NEXT: buffer_load_ushort v26, v[1:2], s[4:7], 0 addr64 offset:36
+; GFX7-NEXT: buffer_load_ushort v27, v[1:2], s[4:7], 0 addr64 offset:38
+; GFX7-NEXT: buffer_load_ushort v28, v[1:2], s[4:7], 0 addr64 offset:40
+; GFX7-NEXT: buffer_load_ushort v29, v[1:2], s[4:7], 0 addr64 offset:42
+; GFX7-NEXT: buffer_load_ushort v30, v[1:2], s[4:7], 0 addr64 offset:44
+; GFX7-NEXT: buffer_load_ushort v31, v[1:2], s[4:7], 0 addr64 offset:46
+; GFX7-NEXT: buffer_load_ushort v32, v[1:2], s[4:7], 0 addr64
+; GFX7-NEXT: buffer_load_ushort v15, v[1:2], s[4:7], 0 addr64 offset:2
+; GFX7-NEXT: buffer_load_ushort v13, v[1:2], s[4:7], 0 addr64 offset:4
+; GFX7-NEXT: buffer_load_ushort v11, v[1:2], s[4:7], 0 addr64 offset:6
+; GFX7-NEXT: buffer_load_ushort v9, v[1:2], s[4:7], 0 addr64 offset:8
+; GFX7-NEXT: buffer_load_ushort v8, v[1:2], s[4:7], 0 addr64 offset:10
+; GFX7-NEXT: buffer_load_ushort v6, v[1:2], s[4:7], 0 addr64 offset:12
; GFX7-NEXT: buffer_load_ushort v4, v[1:2], s[4:7], 0 addr64 offset:14
-; GFX7-NEXT: buffer_load_ushort v5, v[1:2], s[4:7], 0 addr64 offset:16
-; GFX7-NEXT: buffer_load_ushort v3, v[1:2], s[4:7], 0 addr64 offset:18
-; GFX7-NEXT: buffer_load_ushort v6, v[1:2], s[4:7], 0 addr64 offset:20
-; GFX7-NEXT: buffer_load_ushort v8, v[1:2], s[4:7], 0 addr64 offset:22
-; GFX7-NEXT: buffer_load_ushort v11, v[1:2], s[4:7], 0 addr64 offset:24
-; GFX7-NEXT: buffer_load_ushort v13, v[1:2], s[4:7], 0 addr64 offset:26
-; GFX7-NEXT: buffer_load_ushort v16, v[1:2], s[4:7], 0 addr64 offset:28
-; GFX7-NEXT: buffer_load_ushort v1, v[1:2], s[4:7], 0 addr64 offset:30
+; GFX7-NEXT: buffer_load_ushort v3, v[1:2], s[4:7], 0 addr64 offset:16
+; GFX7-NEXT: buffer_load_ushort v5, v[1:2], s[4:7], 0 addr64 offset:18
+; GFX7-NEXT: buffer_load_ushort v7, v[1:2], s[4:7], 0 addr64 offset:20
+; GFX7-NEXT: buffer_load_ushort v10, v[1:2], s[4:7], 0 addr64 offset:22
+; GFX7-NEXT: buffer_load_ushort v12, v[1:2], s[4:7], 0 addr64 offset:24
+; GFX7-NEXT: buffer_load_ushort v14, v[1:2], s[4:7], 0 addr64 offset:26
+; GFX7-NEXT: buffer_load_ushort v33, v[1:2], s[4:7], 0 addr64 offset:28
+; GFX7-NEXT: buffer_load_ushort v34, v[1:2], s[4:7], 0 addr64 offset:30
; GFX7-NEXT: s_waitcnt vmcnt(14)
-; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v20
-; GFX7-NEXT: v_cvt_f64_f32_e32 v[20:21], v2
-; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0xfc, v0
-; GFX7-NEXT: buffer_store_dword v21, v2, s[0:3], 0 offen
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v17
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[1:2], v1
+; GFX7-NEXT: v_add_i32_e32 v17, vcc, 0xfc, v0
+; GFX7-NEXT: buffer_store_dword v2, v17, s[0:3], 0 offen
; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0xf8, v0
-; GFX7-NEXT: buffer_store_dword v20, v2, s[0:3], 0 offen
-; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v22
-; GFX7-NEXT: v_cvt_f64_f32_e32 v[20:21], v2
-; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0xf4, v0
-; GFX7-NEXT: v_add_i32_e32 v22, vcc, 0xd8, v0
-; GFX7-NEXT: buffer_store_dword v21, v2, s[0:3], 0 offen
+; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v18
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[1:2], v1
+; GFX7-NEXT: v_add_i32_e32 v17, vcc, 0xf4, v0
+; GFX7-NEXT: v_add_i32_e32 v18, vcc, 0xd8, v0
+; GFX7-NEXT: buffer_store_dword v2, v17, s[0:3], 0 offen
; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0xf0, v0
-; GFX7-NEXT: buffer_store_dword v20, v2, s[0:3], 0 offen
-; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v23
-; GFX7-NEXT: v_cvt_f64_f32_e32 v[20:21], v2
-; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0xec, v0
-; GFX7-NEXT: s_waitcnt vmcnt(14)
-; GFX7-NEXT: v_lshlrev_b32_e32 v14, 16, v14
-; GFX7-NEXT: buffer_store_dword v21, v2, s[0:3], 0 offen
+; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v19
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[1:2], v1
+; GFX7-NEXT: v_add_i32_e32 v17, vcc, 0xec, v0
+; GFX7-NEXT: v_add_i32_e32 v19, vcc, 0xd4, v0
+; GFX7-NEXT: buffer_store_dword v2, v17, s[0:3], 0 offen
; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0xe8, v0
-; GFX7-NEXT: buffer_store_dword v20, v2, s[0:3], 0 offen
-; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v24
-; GFX7-NEXT: v_cvt_f64_f32_e32 v[20:21], v2
-; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0xe4, v0
-; GFX7-NEXT: v_add_i32_e32 v24, vcc, 0xd0, v0
-; GFX7-NEXT: buffer_store_dword v21, v2, s[0:3], 0 offen
+; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v20
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[1:2], v1
+; GFX7-NEXT: v_add_i32_e32 v17, vcc, 0xe4, v0
+; GFX7-NEXT: v_add_i32_e32 v20, vcc, 0xd0, v0
+; GFX7-NEXT: buffer_store_dword v2, v17, s[0:3], 0 offen
; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0xe0, v0
-; GFX7-NEXT: v_lshlrev_b32_e32 v21, 16, v25
-; GFX7-NEXT: buffer_store_dword v20, v2, s[0:3], 0 offen
-; GFX7-NEXT: v_cvt_f64_f32_e32 v[20:21], v21
-; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0xdc, v0
-; GFX7-NEXT: s_waitcnt vmcnt(8)
-; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX7-NEXT: buffer_store_dword v21, v2, s[0:3], 0 offen
-; GFX7-NEXT: v_lshlrev_b32_e32 v21, 16, v26
-; GFX7-NEXT: buffer_store_dword v20, v22, s[0:3], 0 offen
-; GFX7-NEXT: v_cvt_f64_f32_e32 v[20:21], v21
-; GFX7-NEXT: v_lshlrev_b32_e32 v22, 16, v27
-; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0xd4, v0
-; GFX7-NEXT: v_cvt_f64_f32_e32 v[22:23], v22
-; GFX7-NEXT: buffer_store_dword v21, v2, s[0:3], 0 offen
-; GFX7-NEXT: buffer_store_dword v20, v24, s[0:3], 0 offen
-; GFX7-NEXT: v_lshlrev_b32_e32 v20, 16, v28
-; GFX7-NEXT: v_cvt_f64_f32_e32 v[20:21], v20
-; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0xcc, v0
-; GFX7-NEXT: buffer_store_dword v23, v2, s[0:3], 0 offen
-; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0xc8, v0
-; GFX7-NEXT: buffer_store_dword v22, v2, s[0:3], 0 offen
-; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0xc4, v0
-; GFX7-NEXT: buffer_store_dword v21, v2, s[0:3], 0 offen
-; GFX7-NEXT: v_lshlrev_b32_e32 v21, 16, v34
-; GFX7-NEXT: v_cvt_f64_f32_e32 v[21:22], v21
+; GFX7-NEXT: v_lshlrev_b32_e32 v17, 16, v21
+; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[1:2], v17
+; GFX7-NEXT: v_add_i32_e32 v17, vcc, 0xdc, v0
+; GFX7-NEXT: s_waitcnt vmcnt(14)
+; GFX7-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GFX7-NEXT: buffer_store_dword v2, v17, s[0:3], 0 offen
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v22
+; GFX7-NEXT: buffer_store_dword v1, v18, s[0:3], 0 offen
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[1:2], v2
+; GFX7-NEXT: v_lshlrev_b32_e32 v17, 16, v23
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[17:18], v17
+; GFX7-NEXT: buffer_store_dword v2, v19, s[0:3], 0 offen
+; GFX7-NEXT: buffer_store_dword v1, v20, s[0:3], 0 offen
+; GFX7-NEXT: v_add_i32_e32 v1, vcc, 0xcc, v0
+; GFX7-NEXT: buffer_store_dword v18, v1, s[0:3], 0 offen
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v24
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[1:2], v1
+; GFX7-NEXT: v_add_i32_e32 v18, vcc, 0xc8, v0
+; GFX7-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen
+; GFX7-NEXT: v_add_i32_e32 v17, vcc, 0xc4, v0
+; GFX7-NEXT: buffer_store_dword v2, v17, s[0:3], 0 offen
+; GFX7-NEXT: v_lshlrev_b32_e32 v17, 16, v31
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[17:18], v17
; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0xc0, v0
-; GFX7-NEXT: buffer_store_dword v20, v2, s[0:3], 0 offen
-; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0xbc, v0
-; GFX7-NEXT: v_lshlrev_b32_e32 v20, 16, v33
-; GFX7-NEXT: buffer_store_dword v22, v2, s[0:3], 0 offen
-; GFX7-NEXT: v_cvt_f64_f32_e32 v[22:23], v20
-; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0xb8, v0
-; GFX7-NEXT: v_lshlrev_b32_e32 v20, 16, v32
-; GFX7-NEXT: buffer_store_dword v21, v2, s[0:3], 0 offen
-; GFX7-NEXT: v_cvt_f64_f32_e32 v[20:21], v20
-; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0xb4, v0
-; GFX7-NEXT: buffer_store_dword v23, v2, s[0:3], 0 offen
+; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX7-NEXT: v_add_i32_e32 v1, vcc, 0xbc, v0
+; GFX7-NEXT: buffer_store_dword v18, v1, s[0:3], 0 offen
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v30
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[1:2], v1
+; GFX7-NEXT: v_add_i32_e32 v18, vcc, 0xb8, v0
+; GFX7-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen
+; GFX7-NEXT: v_add_i32_e32 v17, vcc, 0xb4, v0
+; GFX7-NEXT: buffer_store_dword v2, v17, s[0:3], 0 offen
+; GFX7-NEXT: v_lshlrev_b32_e32 v17, 16, v29
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[17:18], v17
; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0xb0, v0
-; GFX7-NEXT: buffer_store_dword v22, v2, s[0:3], 0 offen
-; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0xac, v0
-; GFX7-NEXT: buffer_store_dword v21, v2, s[0:3], 0 offen
-; GFX7-NEXT: v_lshlrev_b32_e32 v21, 16, v31
-; GFX7-NEXT: v_cvt_f64_f32_e32 v[21:22], v21
-; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0xa8, v0
-; GFX7-NEXT: buffer_store_dword v20, v2, s[0:3], 0 offen
-; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0xa4, v0
-; GFX7-NEXT: v_lshlrev_b32_e32 v20, 16, v30
-; GFX7-NEXT: buffer_store_dword v22, v2, s[0:3], 0 offen
-; GFX7-NEXT: v_cvt_f64_f32_e32 v[22:23], v20
+; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX7-NEXT: v_add_i32_e32 v1, vcc, 0xac, v0
+; GFX7-NEXT: buffer_store_dword v18, v1, s[0:3], 0 offen
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v28
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[1:2], v1
+; GFX7-NEXT: v_add_i32_e32 v18, vcc, 0xa8, v0
+; GFX7-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen
+; GFX7-NEXT: v_add_i32_e32 v17, vcc, 0xa4, v0
+; GFX7-NEXT: buffer_store_dword v2, v17, s[0:3], 0 offen
+; GFX7-NEXT: v_lshlrev_b32_e32 v17, 16, v27
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[17:18], v17
; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0xa0, v0
-; GFX7-NEXT: v_lshlrev_b32_e32 v20, 16, v29
-; GFX7-NEXT: buffer_store_dword v21, v2, s[0:3], 0 offen
-; GFX7-NEXT: v_cvt_f64_f32_e32 v[20:21], v20
-; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0x9c, v0
-; GFX7-NEXT: buffer_store_dword v23, v2, s[0:3], 0 offen
-; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0x98, v0
-; GFX7-NEXT: buffer_store_dword v22, v2, s[0:3], 0 offen
-; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0x94, v0
-; GFX7-NEXT: buffer_store_dword v21, v2, s[0:3], 0 offen
+; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX7-NEXT: v_add_i32_e32 v1, vcc, 0x9c, v0
+; GFX7-NEXT: buffer_store_dword v18, v1, s[0:3], 0 offen
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v26
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[1:2], v1
+; GFX7-NEXT: v_add_i32_e32 v18, vcc, 0x98, v0
+; GFX7-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen
+; GFX7-NEXT: v_add_i32_e32 v17, vcc, 0x94, v0
+; GFX7-NEXT: buffer_store_dword v2, v17, s[0:3], 0 offen
+; GFX7-NEXT: v_lshlrev_b32_e32 v17, 16, v25
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[17:18], v17
; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0x90, v0
-; GFX7-NEXT: buffer_store_dword v20, v2, s[0:3], 0 offen
-; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v18
-; GFX7-NEXT: v_cvt_f64_f32_e32 v[20:21], v2
-; GFX7-NEXT: v_add_i32_e32 v18, vcc, 0x8c, v0
-; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v19
-; GFX7-NEXT: buffer_store_dword v21, v18, s[0:3], 0 offen
-; GFX7-NEXT: v_add_i32_e32 v18, vcc, 0x88, v0
-; GFX7-NEXT: buffer_store_dword v20, v18, s[0:3], 0 offen
-; GFX7-NEXT: v_cvt_f64_f32_e32 v[18:19], v2
-; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v15
-; GFX7-NEXT: v_cvt_f64_f32_e32 v[20:21], v2
-; GFX7-NEXT: v_add_i32_e32 v15, vcc, 0x84, v0
-; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v17
-; GFX7-NEXT: buffer_store_dword v21, v15, s[0:3], 0 offen
-; GFX7-NEXT: v_add_i32_e32 v15, vcc, 0x80, v0
-; GFX7-NEXT: buffer_store_dword v20, v15, s[0:3], 0 offen
-; GFX7-NEXT: v_cvt_f64_f32_e32 v[20:21], v2
+; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX7-NEXT: v_add_i32_e32 v1, vcc, 0x8c, v0
+; GFX7-NEXT: buffer_store_dword v18, v1, s[0:3], 0 offen
+; GFX7-NEXT: v_add_i32_e32 v1, vcc, 0x88, v0
+; GFX7-NEXT: buffer_store_dword v17, v1, s[0:3], 0 offen
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v16
; GFX7-NEXT: v_cvt_f64_f32_e32 v[1:2], v1
-; GFX7-NEXT: v_add_i32_e32 v15, vcc, 0x7c, v0
-; GFX7-NEXT: v_lshlrev_b32_e32 v12, 16, v12
-; GFX7-NEXT: buffer_store_dword v2, v15, s[0:3], 0 offen
-; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0
+; GFX7-NEXT: v_add_i32_e32 v17, vcc, 0x84, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v16, 16, v32
+; GFX7-NEXT: buffer_store_dword v2, v17, s[0:3], 0 offen
+; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0x80, v0
; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; GFX7-NEXT: v_cvt_f64_f32_e32 v[1:2], v14
-; GFX7-NEXT: v_lshlrev_b32_e32 v14, 16, v16
-; GFX7-NEXT: v_cvt_f64_f32_e32 v[14:15], v14
-; GFX7-NEXT: v_add_i32_e32 v16, vcc, 0x74, v0
-; GFX7-NEXT: v_lshlrev_b32_e32 v10, 16, v10
-; GFX7-NEXT: buffer_store_dword v15, v16, s[0:3], 0 offen
-; GFX7-NEXT: v_add_i32_e32 v15, vcc, 0x70, v0
-; GFX7-NEXT: buffer_store_dword v14, v15, s[0:3], 0 offen
-; GFX7-NEXT: v_cvt_f64_f32_e32 v[14:15], v12
-; GFX7-NEXT: v_lshlrev_b32_e32 v12, 16, v13
-; GFX7-NEXT: v_cvt_f64_f32_e32 v[12:13], v12
-; GFX7-NEXT: v_add_i32_e32 v16, vcc, 0x6c, v0
-; GFX7-NEXT: v_lshlrev_b32_e32 v8, 16, v8
-; GFX7-NEXT: buffer_store_dword v13, v16, s[0:3], 0 offen
-; GFX7-NEXT: v_add_i32_e32 v13, vcc, 0x68, v0
-; GFX7-NEXT: buffer_store_dword v12, v13, s[0:3], 0 offen
-; GFX7-NEXT: v_cvt_f64_f32_e32 v[12:13], v10
-; GFX7-NEXT: v_lshlrev_b32_e32 v10, 16, v11
-; GFX7-NEXT: v_cvt_f64_f32_e32 v[10:11], v10
-; GFX7-NEXT: v_add_i32_e32 v16, vcc, 0x64, v0
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[1:2], v16
+; GFX7-NEXT: s_waitcnt vmcnt(14)
+; GFX7-NEXT: v_lshlrev_b32_e32 v16, 16, v34
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[16:17], v16
+; GFX7-NEXT: v_add_i32_e32 v18, vcc, 0x7c, v0
+; GFX7-NEXT: v_add_i32_e32 v19, vcc, 0x74, v0
+; GFX7-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen
+; GFX7-NEXT: v_add_i32_e32 v17, vcc, 0x78, v0
+; GFX7-NEXT: buffer_store_dword v16, v17, s[0:3], 0 offen
+; GFX7-NEXT: v_lshlrev_b32_e32 v17, 16, v33
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[17:18], v17
+; GFX7-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; GFX7-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX7-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX7-NEXT: buffer_store_dword v18, v19, s[0:3], 0 offen
+; GFX7-NEXT: v_add_i32_e32 v18, vcc, 0x70, v0
+; GFX7-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[17:18], v13
+; GFX7-NEXT: v_lshlrev_b32_e32 v13, 16, v14
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[13:14], v13
+; GFX7-NEXT: v_add_i32_e32 v19, vcc, 0x6c, v0
; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX7-NEXT: buffer_store_dword v11, v16, s[0:3], 0 offen
-; GFX7-NEXT: v_cvt_f64_f32_e32 v[16:17], v8
-; GFX7-NEXT: v_add_i32_e32 v11, vcc, 0x60, v0
-; GFX7-NEXT: v_add_i32_e32 v8, vcc, 0x5c, v0
-; GFX7-NEXT: buffer_store_dword v10, v11, s[0:3], 0 offen
-; GFX7-NEXT: buffer_store_dword v17, v8, s[0:3], 0 offen
-; GFX7-NEXT: v_add_i32_e32 v8, vcc, 0x58, v0
-; GFX7-NEXT: v_lshlrev_b32_e32 v11, 16, v4
-; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6
-; GFX7-NEXT: buffer_store_dword v16, v8, s[0:3], 0 offen
-; GFX7-NEXT: v_lshlrev_b32_e32 v16, 16, v5
+; GFX7-NEXT: buffer_store_dword v14, v19, s[0:3], 0 offen
+; GFX7-NEXT: v_add_i32_e32 v14, vcc, 0x68, v0
+; GFX7-NEXT: buffer_store_dword v13, v14, s[0:3], 0 offen
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[13:14], v11
+; GFX7-NEXT: v_lshlrev_b32_e32 v11, 16, v12
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[11:12], v11
+; GFX7-NEXT: v_add_i32_e32 v19, vcc, 0x64, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX7-NEXT: buffer_store_dword v12, v19, s[0:3], 0 offen
+; GFX7-NEXT: v_add_i32_e32 v12, vcc, 0x60, v0
+; GFX7-NEXT: buffer_store_dword v11, v12, s[0:3], 0 offen
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[11:12], v9
+; GFX7-NEXT: v_lshlrev_b32_e32 v9, 16, v10
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[9:10], v9
+; GFX7-NEXT: v_add_i32_e32 v19, vcc, 0x5c, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GFX7-NEXT: buffer_store_dword v10, v19, s[0:3], 0 offen
+; GFX7-NEXT: v_add_i32_e32 v10, vcc, 0x58, v0
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[19:20], v7
+; GFX7-NEXT: buffer_store_dword v9, v10, s[0:3], 0 offen
+; GFX7-NEXT: v_lshlrev_b32_e32 v10, 16, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v5
; GFX7-NEXT: v_cvt_f64_f32_e32 v[4:5], v4
-; GFX7-NEXT: v_add_i32_e32 v6, vcc, 0x54, v0
-; GFX7-NEXT: v_lshlrev_b32_e32 v7, 16, v7
-; GFX7-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen
-; GFX7-NEXT: v_add_i32_e32 v5, vcc, 0x50, v0
+; GFX7-NEXT: v_add_i32_e32 v7, vcc, 0x54, v0
+; GFX7-NEXT: buffer_store_dword v20, v7, s[0:3], 0 offen
+; GFX7-NEXT: v_add_i32_e32 v7, vcc, 0x50, v0
+; GFX7-NEXT: buffer_store_dword v19, v7, s[0:3], 0 offen
+; GFX7-NEXT: v_add_i32_e32 v19, vcc, 0x4c, v0
+; GFX7-NEXT: buffer_store_dword v5, v19, s[0:3], 0 offen
+; GFX7-NEXT: v_add_i32_e32 v5, vcc, 0x48, v0
; GFX7-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen
; GFX7-NEXT: v_cvt_f64_f32_e32 v[3:4], v3
-; GFX7-NEXT: v_cvt_f64_f32_e32 v[5:6], v16
-; GFX7-NEXT: v_add_i32_e32 v16, vcc, 0x4c, v0
-; GFX7-NEXT: buffer_store_dword v4, v16, s[0:3], 0 offen
-; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x48, v0
-; GFX7-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen
-; GFX7-NEXT: v_cvt_f64_f32_e32 v[3:4], v11
-; GFX7-NEXT: v_add_i32_e32 v11, vcc, 0x44, v0
-; GFX7-NEXT: v_cvt_f64_f32_e32 v[7:8], v7
-; GFX7-NEXT: buffer_store_dword v6, v11, s[0:3], 0 offen
-; GFX7-NEXT: v_add_i32_e32 v6, vcc, 64, v0
-; GFX7-NEXT: v_lshlrev_b32_e32 v9, 16, v9
-; GFX7-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen
-; GFX7-NEXT: v_add_i32_e32 v5, vcc, 60, v0
-; GFX7-NEXT: v_cvt_f64_f32_e32 v[9:10], v9
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[19:20], v10
+; GFX7-NEXT: v_add_i32_e32 v5, vcc, 0x44, v0
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[6:7], v6
; GFX7-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen
-; GFX7-NEXT: v_add_i32_e32 v4, vcc, 56, v0
+; GFX7-NEXT: v_add_i32_e32 v4, vcc, 64, v0
; GFX7-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen
+; GFX7-NEXT: v_add_i32_e32 v3, vcc, 60, v0
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[8:9], v8
+; GFX7-NEXT: buffer_store_dword v20, v3, s[0:3], 0 offen
+; GFX7-NEXT: v_add_i32_e32 v3, vcc, 56, v0
+; GFX7-NEXT: buffer_store_dword v19, v3, s[0:3], 0 offen
; GFX7-NEXT: v_add_i32_e32 v3, vcc, 52, v0
-; GFX7-NEXT: buffer_store_dword v8, v3, s[0:3], 0 offen
-; GFX7-NEXT: v_add_i32_e32 v3, vcc, 48, v0
; GFX7-NEXT: buffer_store_dword v7, v3, s[0:3], 0 offen
+; GFX7-NEXT: v_add_i32_e32 v3, vcc, 48, v0
+; GFX7-NEXT: buffer_store_dword v6, v3, s[0:3], 0 offen
; GFX7-NEXT: v_add_i32_e32 v3, vcc, 44, v0
-; GFX7-NEXT: buffer_store_dword v10, v3, s[0:3], 0 offen
-; GFX7-NEXT: v_add_i32_e32 v3, vcc, 40, v0
; GFX7-NEXT: buffer_store_dword v9, v3, s[0:3], 0 offen
+; GFX7-NEXT: v_add_i32_e32 v3, vcc, 40, v0
+; GFX7-NEXT: buffer_store_dword v8, v3, s[0:3], 0 offen
; GFX7-NEXT: v_add_i32_e32 v3, vcc, 36, v0
-; GFX7-NEXT: buffer_store_dword v13, v3, s[0:3], 0 offen
-; GFX7-NEXT: v_add_i32_e32 v3, vcc, 32, v0
; GFX7-NEXT: buffer_store_dword v12, v3, s[0:3], 0 offen
+; GFX7-NEXT: v_add_i32_e32 v3, vcc, 32, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v15, 16, v15
+; GFX7-NEXT: buffer_store_dword v11, v3, s[0:3], 0 offen
; GFX7-NEXT: v_add_i32_e32 v3, vcc, 28, v0
-; GFX7-NEXT: buffer_store_dword v15, v3, s[0:3], 0 offen
-; GFX7-NEXT: v_add_i32_e32 v3, vcc, 24, v0
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[15:16], v15
; GFX7-NEXT: buffer_store_dword v14, v3, s[0:3], 0 offen
+; GFX7-NEXT: v_add_i32_e32 v3, vcc, 24, v0
+; GFX7-NEXT: buffer_store_dword v13, v3, s[0:3], 0 offen
; GFX7-NEXT: v_add_i32_e32 v3, vcc, 20, v0
+; GFX7-NEXT: buffer_store_dword v18, v3, s[0:3], 0 offen
+; GFX7-NEXT: v_add_i32_e32 v3, vcc, 16, v0
+; GFX7-NEXT: buffer_store_dword v17, v3, s[0:3], 0 offen
+; GFX7-NEXT: v_add_i32_e32 v3, vcc, 12, v0
+; GFX7-NEXT: buffer_store_dword v16, v3, s[0:3], 0 offen
+; GFX7-NEXT: v_add_i32_e32 v3, vcc, 8, v0
+; GFX7-NEXT: buffer_store_dword v15, v3, s[0:3], 0 offen
+; GFX7-NEXT: v_add_i32_e32 v3, vcc, 4, v0
; GFX7-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen
-; GFX7-NEXT: v_add_i32_e32 v2, vcc, 16, v0
-; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; GFX7-NEXT: v_add_i32_e32 v1, vcc, 12, v0
-; GFX7-NEXT: buffer_store_dword v21, v1, s[0:3], 0 offen
-; GFX7-NEXT: v_add_i32_e32 v1, vcc, 8, v0
-; GFX7-NEXT: buffer_store_dword v20, v1, s[0:3], 0 offen
-; GFX7-NEXT: v_add_i32_e32 v1, vcc, 4, v0
-; GFX7-NEXT: buffer_store_dword v19, v1, s[0:3], 0 offen
-; GFX7-NEXT: buffer_store_dword v18, v0, s[0:3], 0 offen
+; GFX7-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: global_extload_v32bf16_to_v32f64:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_add_u32_e32 v9, vcc, 2, v1
-; GFX8-NEXT: v_addc_u32_e32 v10, vcc, 0, v2, vcc
-; GFX8-NEXT: v_add_u32_e32 v3, vcc, 4, v1
+; GFX8-NEXT: v_add_u32_e32 v3, vcc, 2, v1
; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
-; GFX8-NEXT: v_add_u32_e32 v5, vcc, 6, v1
+; GFX8-NEXT: v_add_u32_e32 v5, vcc, 4, v1
; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v2, vcc
-; GFX8-NEXT: v_add_u32_e32 v7, vcc, 8, v1
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, 6, v1
; GFX8-NEXT: v_addc_u32_e32 v8, vcc, 0, v2, vcc
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, 8, v1
+; GFX8-NEXT: v_addc_u32_e32 v10, vcc, 0, v2, vcc
; GFX8-NEXT: v_add_u32_e32 v11, vcc, 10, v1
; GFX8-NEXT: v_addc_u32_e32 v12, vcc, 0, v2, vcc
; GFX8-NEXT: v_add_u32_e32 v13, vcc, 12, v1
; GFX8-NEXT: v_addc_u32_e32 v14, vcc, 0, v2, vcc
-; GFX8-NEXT: v_add_u32_e32 v17, vcc, 14, v1
-; GFX8-NEXT: v_addc_u32_e32 v18, vcc, 0, v2, vcc
-; GFX8-NEXT: v_add_u32_e32 v21, vcc, 16, v1
-; GFX8-NEXT: v_addc_u32_e32 v22, vcc, 0, v2, vcc
-; GFX8-NEXT: v_add_u32_e32 v15, vcc, 18, v1
+; GFX8-NEXT: v_add_u32_e32 v15, vcc, 14, v1
; GFX8-NEXT: v_addc_u32_e32 v16, vcc, 0, v2, vcc
-; GFX8-NEXT: v_add_u32_e32 v19, vcc, 20, v1
+; GFX8-NEXT: v_add_u32_e32 v19, vcc, 16, v1
; GFX8-NEXT: v_addc_u32_e32 v20, vcc, 0, v2, vcc
+; GFX8-NEXT: v_add_u32_e32 v17, vcc, 18, v1
+; GFX8-NEXT: v_addc_u32_e32 v18, vcc, 0, v2, vcc
+; GFX8-NEXT: v_add_u32_e32 v21, vcc, 20, v1
+; GFX8-NEXT: v_addc_u32_e32 v22, vcc, 0, v2, vcc
; GFX8-NEXT: v_add_u32_e32 v23, vcc, 22, v1
; GFX8-NEXT: v_addc_u32_e32 v24, vcc, 0, v2, vcc
; GFX8-NEXT: v_add_u32_e32 v25, vcc, 24, v1
@@ -8126,469 +8122,473 @@ define <32 x double> @global_extload_v32bf16_to_v32f64(ptr addrspace(1) %ptr) {
; GFX8-NEXT: v_addc_u32_e32 v30, vcc, 0, v2, vcc
; GFX8-NEXT: v_add_u32_e32 v31, vcc, 30, v1
; GFX8-NEXT: v_addc_u32_e32 v32, vcc, 0, v2, vcc
-; GFX8-NEXT: v_add_u32_e32 v33, vcc, 32, v1
+; GFX8-NEXT: v_add_u32_e32 v33, vcc, 34, v1
; GFX8-NEXT: v_addc_u32_e32 v34, vcc, 0, v2, vcc
-; GFX8-NEXT: v_add_u32_e32 v35, vcc, 34, v1
+; GFX8-NEXT: v_add_u32_e32 v35, vcc, 36, v1
; GFX8-NEXT: v_addc_u32_e32 v36, vcc, 0, v2, vcc
-; GFX8-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
-; GFX8-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; GFX8-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; GFX8-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; GFX8-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
-; GFX8-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; GFX8-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; GFX8-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; GFX8-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
-; GFX8-NEXT: buffer_store_dword v57, off, s[0:3], s32 ; 4-byte Folded Spill
-; GFX8-NEXT: v_add_u32_e32 v37, vcc, 36, v1
-; GFX8-NEXT: flat_load_ushort v43, v[1:2]
+; GFX8-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; GFX8-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; GFX8-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; GFX8-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; GFX8-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; GFX8-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; GFX8-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; GFX8-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX8-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX8-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX8-NEXT: buffer_store_dword v58, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX8-NEXT: v_add_u32_e32 v37, vcc, 38, v1
+; GFX8-NEXT: flat_load_ushort v44, v[1:2]
; GFX8-NEXT: v_addc_u32_e32 v38, vcc, 0, v2, vcc
-; GFX8-NEXT: v_add_u32_e32 v48, vcc, 38, v1
+; GFX8-NEXT: v_add_u32_e32 v48, vcc, 40, v1
; GFX8-NEXT: v_addc_u32_e32 v49, vcc, 0, v2, vcc
; GFX8-NEXT: v_add_u32_e32 v50, vcc, 62, v1
; GFX8-NEXT: v_addc_u32_e32 v51, vcc, 0, v2, vcc
-; GFX8-NEXT: flat_load_ushort v44, v[50:51]
+; GFX8-NEXT: flat_load_ushort v45, v[50:51]
; GFX8-NEXT: v_add_u32_e32 v50, vcc, 60, v1
; GFX8-NEXT: v_addc_u32_e32 v51, vcc, 0, v2, vcc
-; GFX8-NEXT: flat_load_ushort v45, v[50:51]
-; GFX8-NEXT: v_add_u32_e32 v50, vcc, 40, v1
+; GFX8-NEXT: flat_load_ushort v46, v[50:51]
+; GFX8-NEXT: v_add_u32_e32 v50, vcc, 42, v1
; GFX8-NEXT: v_addc_u32_e32 v51, vcc, 0, v2, vcc
; GFX8-NEXT: v_add_u32_e32 v52, vcc, 58, v1
; GFX8-NEXT: v_addc_u32_e32 v53, vcc, 0, v2, vcc
-; GFX8-NEXT: flat_load_ushort v46, v[52:53]
-; GFX8-NEXT: v_add_u32_e32 v52, vcc, 42, v1
+; GFX8-NEXT: flat_load_ushort v47, v[52:53]
+; GFX8-NEXT: v_add_u32_e32 v52, vcc, 44, v1
; GFX8-NEXT: v_addc_u32_e32 v53, vcc, 0, v2, vcc
; GFX8-NEXT: v_add_u32_e32 v54, vcc, 56, v1
; GFX8-NEXT: v_addc_u32_e32 v55, vcc, 0, v2, vcc
-; GFX8-NEXT: flat_load_ushort v47, v[54:55]
-; GFX8-NEXT: v_add_u32_e32 v54, vcc, 44, v1
+; GFX8-NEXT: flat_load_ushort v56, v[54:55]
+; GFX8-NEXT: v_add_u32_e32 v54, vcc, 46, v1
; GFX8-NEXT: v_addc_u32_e32 v55, vcc, 0, v2, vcc
; GFX8-NEXT: v_add_u32_e32 v39, vcc, 54, v1
; GFX8-NEXT: v_addc_u32_e32 v40, vcc, 0, v2, vcc
-; GFX8-NEXT: flat_load_ushort v56, v[39:40]
-; GFX8-NEXT: v_add_u32_e32 v39, vcc, 52, v1
-; GFX8-NEXT: v_addc_u32_e32 v40, vcc, 0, v2, vcc
; GFX8-NEXT: flat_load_ushort v57, v[39:40]
-; GFX8-NEXT: v_add_u32_e32 v39, vcc, 46, v1
+; GFX8-NEXT: v_add_u32_e32 v39, vcc, 52, v1
; GFX8-NEXT: v_addc_u32_e32 v40, vcc, 0, v2, vcc
-; GFX8-NEXT: v_add_u32_e32 v41, vcc, 50, v1
-; GFX8-NEXT: v_addc_u32_e32 v42, vcc, 0, v2, vcc
-; GFX8-NEXT: flat_load_ushort v41, v[41:42]
-; GFX8-NEXT: v_add_u32_e32 v1, vcc, 48, v1
-; GFX8-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
-; GFX8-NEXT: flat_load_ushort v42, v[9:10]
-; GFX8-NEXT: flat_load_ushort v9, v[35:36]
-; GFX8-NEXT: flat_load_ushort v10, v[37:38]
-; GFX8-NEXT: flat_load_ushort v35, v[48:49]
-; GFX8-NEXT: flat_load_ushort v36, v[50:51]
-; GFX8-NEXT: flat_load_ushort v37, v[52:53]
-; GFX8-NEXT: flat_load_ushort v48, v[54:55]
-; GFX8-NEXT: flat_load_ushort v39, v[39:40]
-; GFX8-NEXT: flat_load_ushort v49, v[1:2]
-; GFX8-NEXT: flat_load_ushort v50, v[3:4]
-; GFX8-NEXT: flat_load_ushort v51, v[5:6]
-; GFX8-NEXT: flat_load_ushort v52, v[7:8]
-; GFX8-NEXT: flat_load_ushort v53, v[11:12]
-; GFX8-NEXT: flat_load_ushort v38, v[13:14]
-; GFX8-NEXT: flat_load_ushort v14, v[17:18]
-; GFX8-NEXT: flat_load_ushort v11, v[21:22]
-; GFX8-NEXT: v_add_u32_e32 v3, vcc, 4, v0
-; GFX8-NEXT: flat_load_ushort v15, v[15:16]
-; GFX8-NEXT: flat_load_ushort v13, v[19:20]
-; GFX8-NEXT: flat_load_ushort v8, v[23:24]
-; GFX8-NEXT: flat_load_ushort v6, v[25:26]
-; GFX8-NEXT: flat_load_ushort v5, v[27:28]
-; GFX8-NEXT: flat_load_ushort v7, v[29:30]
-; GFX8-NEXT: flat_load_ushort v12, v[31:32]
-; GFX8-NEXT: flat_load_ushort v16, v[33:34]
-; GFX8-NEXT: v_add_u32_e32 v18, vcc, 0xc4, v0
-; GFX8-NEXT: v_add_u32_e32 v20, vcc, 0xbc, v0
-; GFX8-NEXT: v_add_u32_e32 v22, vcc, 0xb4, v0
-; GFX8-NEXT: v_add_u32_e32 v24, vcc, 0xac, v0
-; GFX8-NEXT: v_add_u32_e32 v26, vcc, 0xa4, v0
-; GFX8-NEXT: v_add_u32_e32 v27, vcc, 0x9c, v0
+; GFX8-NEXT: flat_load_ushort v58, v[39:40]
+; GFX8-NEXT: v_add_u32_e32 v40, vcc, 48, v1
+; GFX8-NEXT: v_addc_u32_e32 v41, vcc, 0, v2, vcc
+; GFX8-NEXT: v_add_u32_e32 v42, vcc, 50, v1
+; GFX8-NEXT: v_addc_u32_e32 v43, vcc, 0, v2, vcc
+; GFX8-NEXT: flat_load_ushort v42, v[42:43]
+; GFX8-NEXT: flat_load_ushort v34, v[33:34]
+; GFX8-NEXT: flat_load_ushort v36, v[35:36]
+; GFX8-NEXT: flat_load_ushort v38, v[37:38]
+; GFX8-NEXT: flat_load_ushort v39, v[48:49]
+; GFX8-NEXT: flat_load_ushort v48, v[50:51]
+; GFX8-NEXT: flat_load_ushort v51, v[52:53]
+; GFX8-NEXT: flat_load_ushort v52, v[54:55]
+; GFX8-NEXT: flat_load_ushort v53, v[40:41]
+; GFX8-NEXT: v_add_u32_e32 v49, vcc, 32, v1
+; GFX8-NEXT: v_addc_u32_e32 v50, vcc, 0, v2, vcc
+; GFX8-NEXT: flat_load_ushort v37, v[3:4]
+; GFX8-NEXT: flat_load_ushort v35, v[5:6]
+; GFX8-NEXT: flat_load_ushort v33, v[7:8]
+; GFX8-NEXT: flat_load_ushort v8, v[9:10]
+; GFX8-NEXT: flat_load_ushort v6, v[11:12]
+; GFX8-NEXT: flat_load_ushort v4, v[13:14]
+; GFX8-NEXT: flat_load_ushort v2, v[15:16]
+; GFX8-NEXT: flat_load_ushort v1, v[19:20]
+; GFX8-NEXT: v_add_u32_e32 v16, vcc, 4, v0
+; GFX8-NEXT: v_add_u32_e32 v19, vcc, 0x7c, v0
; GFX8-NEXT: s_waitcnt vmcnt(14)
-; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v43
-; GFX8-NEXT: v_cvt_f64_f32_e32 v[1:2], v1
-; GFX8-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen
-; GFX8-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0xfc, v0
-; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v44
-; GFX8-NEXT: v_cvt_f64_f32_e32 v[2:3], v2
-; GFX8-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen
-; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v45
-; GFX8-NEXT: v_cvt_f64_f32_e32 v[3:4], v3
-; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0xf8, v0
-; GFX8-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen
-; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0xf4, v0
-; GFX8-NEXT: buffer_store_dword v4, v1, s[0:3], 0 offen
-; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v46
-; GFX8-NEXT: v_cvt_f64_f32_e32 v[1:2], v1
-; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0xf0, v0
-; GFX8-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen
-; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0xec, v0
-; GFX8-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen
-; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0xe8, v0
-; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v47
-; GFX8-NEXT: v_cvt_f64_f32_e32 v[2:3], v2
-; GFX8-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen
-; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0xe4, v0
-; GFX8-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen
-; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0xe0, v0
-; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v56
-; GFX8-NEXT: v_cvt_f64_f32_e32 v[3:4], v3
-; GFX8-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen
-; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0xdc, v0
-; GFX8-NEXT: buffer_store_dword v4, v1, s[0:3], 0 offen
-; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v57
-; GFX8-NEXT: v_cvt_f64_f32_e32 v[1:2], v1
-; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0xd8, v0
-; GFX8-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen
-; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0xd4, v0
-; GFX8-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0xd0, v0
-; GFX8-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v41
-; GFX8-NEXT: v_cvt_f64_f32_e32 v[1:2], v1
-; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0xcc, v0
-; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v42
-; GFX8-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0xc8, v0
-; GFX8-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; GFX8-NEXT: v_cvt_f64_f32_e32 v[1:2], v3
-; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v49
-; GFX8-NEXT: v_cvt_f64_f32_e32 v[3:4], v3
-; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v50
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v44
+; GFX8-NEXT: v_cvt_f64_f32_e32 v[14:15], v3
+; GFX8-NEXT: flat_load_ushort v3, v[17:18]
+; GFX8-NEXT: flat_load_ushort v5, v[21:22]
+; GFX8-NEXT: flat_load_ushort v7, v[23:24]
+; GFX8-NEXT: flat_load_ushort v9, v[25:26]
+; GFX8-NEXT: flat_load_ushort v10, v[27:28]
+; GFX8-NEXT: flat_load_ushort v11, v[29:30]
+; GFX8-NEXT: flat_load_ushort v12, v[31:32]
+; GFX8-NEXT: flat_load_ushort v13, v[49:50]
+; GFX8-NEXT: v_add_u32_e32 v18, vcc, 0x84, v0
+; GFX8-NEXT: buffer_store_dword v15, v16, s[0:3], 0 offen
+; GFX8-NEXT: buffer_store_dword v14, v0, s[0:3], 0 offen
+; GFX8-NEXT: v_add_u32_e32 v14, vcc, 0xfc, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v15, 16, v45
+; GFX8-NEXT: v_cvt_f64_f32_e32 v[15:16], v15
+; GFX8-NEXT: buffer_store_dword v16, v14, s[0:3], 0 offen
+; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v46
+; GFX8-NEXT: v_cvt_f64_f32_e32 v[16:17], v16
+; GFX8-NEXT: v_add_u32_e32 v14, vcc, 0xf8, v0
+; GFX8-NEXT: buffer_store_dword v15, v14, s[0:3], 0 offen
+; GFX8-NEXT: v_add_u32_e32 v14, vcc, 0xf4, v0
+; GFX8-NEXT: buffer_store_dword v17, v14, s[0:3], 0 offen
+; GFX8-NEXT: v_lshlrev_b32_e32 v14, 16, v47
+; GFX8-NEXT: v_cvt_f64_f32_e32 v[14:15], v14
+; GFX8-NEXT: v_add_u32_e32 v17, vcc, 0xf0, v0
+; GFX8-NEXT: buffer_store_dword v16, v17, s[0:3], 0 offen
+; GFX8-NEXT: v_add_u32_e32 v16, vcc, 0xec, v0
+; GFX8-NEXT: buffer_store_dword v15, v16, s[0:3], 0 offen
+; GFX8-NEXT: v_add_u32_e32 v17, vcc, 0xe8, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v15, 16, v56
+; GFX8-NEXT: v_cvt_f64_f32_e32 v[15:16], v15
+; GFX8-NEXT: buffer_store_dword v14, v17, s[0:3], 0 offen
+; GFX8-NEXT: v_add_u32_e32 v14, vcc, 0xe4, v0
+; GFX8-NEXT: buffer_store_dword v16, v14, s[0:3], 0 offen
+; GFX8-NEXT: v_add_u32_e32 v14, vcc, 0xe0, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v57
+; GFX8-NEXT: v_cvt_f64_f32_e32 v[16:17], v16
+; GFX8-NEXT: buffer_store_dword v15, v14, s[0:3], 0 offen
+; GFX8-NEXT: v_add_u32_e32 v14, vcc, 0xdc, v0
+; GFX8-NEXT: buffer_store_dword v17, v14, s[0:3], 0 offen
+; GFX8-NEXT: v_lshlrev_b32_e32 v14, 16, v58
+; GFX8-NEXT: v_cvt_f64_f32_e32 v[14:15], v14
+; GFX8-NEXT: v_add_u32_e32 v17, vcc, 0xd8, v0
+; GFX8-NEXT: buffer_store_dword v16, v17, s[0:3], 0 offen
+; GFX8-NEXT: v_add_u32_e32 v16, vcc, 0xd4, v0
+; GFX8-NEXT: buffer_store_dword v15, v16, s[0:3], 0 offen
+; GFX8-NEXT: v_lshlrev_b32_e32 v15, 16, v42
+; GFX8-NEXT: v_cvt_f64_f32_e32 v[15:16], v15
+; GFX8-NEXT: v_add_u32_e32 v17, vcc, 0xd0, v0
+; GFX8-NEXT: buffer_store_dword v14, v17, s[0:3], 0 offen
+; GFX8-NEXT: v_add_u32_e32 v14, vcc, 0xcc, v0
+; GFX8-NEXT: buffer_store_dword v16, v14, s[0:3], 0 offen
; GFX8-NEXT: s_waitcnt vmcnt(14)
-; GFX8-NEXT: v_lshlrev_b32_e32 v19, 16, v51
-; GFX8-NEXT: v_lshlrev_b32_e32 v21, 16, v52
-; GFX8-NEXT: buffer_store_dword v4, v18, s[0:3], 0 offen
-; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0xc0, v0
-; GFX8-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen
-; GFX8-NEXT: v_cvt_f64_f32_e32 v[3:4], v17
-; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v39
-; GFX8-NEXT: v_cvt_f64_f32_e32 v[17:18], v17
-; GFX8-NEXT: v_lshlrev_b32_e32 v23, 16, v53
-; GFX8-NEXT: v_lshlrev_b32_e32 v25, 16, v38
-; GFX8-NEXT: v_lshlrev_b32_e32 v10, 16, v10
-; GFX8-NEXT: buffer_store_dword v18, v20, s[0:3], 0 offen
-; GFX8-NEXT: v_add_u32_e32 v18, vcc, 0xb8, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v53
+; GFX8-NEXT: v_cvt_f64_f32_e32 v[16:17], v16
+; GFX8-NEXT: v_add_u32_e32 v14, vcc, 0xc8, v0
+; GFX8-NEXT: buffer_store_dword v15, v14, s[0:3], 0 offen
+; GFX8-NEXT: v_add_u32_e32 v14, vcc, 0xc4, v0
+; GFX8-NEXT: buffer_store_dword v17, v14, s[0:3], 0 offen
+; GFX8-NEXT: v_lshlrev_b32_e32 v14, 16, v52
+; GFX8-NEXT: v_cvt_f64_f32_e32 v[14:15], v14
+; GFX8-NEXT: v_add_u32_e32 v17, vcc, 0xc0, v0
+; GFX8-NEXT: buffer_store_dword v16, v17, s[0:3], 0 offen
+; GFX8-NEXT: v_add_u32_e32 v16, vcc, 0xbc, v0
+; GFX8-NEXT: buffer_store_dword v15, v16, s[0:3], 0 offen
+; GFX8-NEXT: v_lshlrev_b32_e32 v15, 16, v51
+; GFX8-NEXT: v_cvt_f64_f32_e32 v[15:16], v15
+; GFX8-NEXT: v_add_u32_e32 v17, vcc, 0xb8, v0
+; GFX8-NEXT: buffer_store_dword v14, v17, s[0:3], 0 offen
+; GFX8-NEXT: v_add_u32_e32 v14, vcc, 0xb4, v0
+; GFX8-NEXT: buffer_store_dword v16, v14, s[0:3], 0 offen
+; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v48
+; GFX8-NEXT: v_cvt_f64_f32_e32 v[16:17], v16
+; GFX8-NEXT: v_add_u32_e32 v14, vcc, 0xb0, v0
+; GFX8-NEXT: buffer_store_dword v15, v14, s[0:3], 0 offen
+; GFX8-NEXT: v_add_u32_e32 v14, vcc, 0xac, v0
+; GFX8-NEXT: buffer_store_dword v17, v14, s[0:3], 0 offen
+; GFX8-NEXT: v_lshlrev_b32_e32 v14, 16, v39
+; GFX8-NEXT: v_cvt_f64_f32_e32 v[14:15], v14
+; GFX8-NEXT: v_add_u32_e32 v17, vcc, 0xa8, v0
+; GFX8-NEXT: buffer_store_dword v16, v17, s[0:3], 0 offen
+; GFX8-NEXT: v_add_u32_e32 v16, vcc, 0xa4, v0
+; GFX8-NEXT: buffer_store_dword v15, v16, s[0:3], 0 offen
+; GFX8-NEXT: v_lshlrev_b32_e32 v15, 16, v38
+; GFX8-NEXT: v_cvt_f64_f32_e32 v[15:16], v15
+; GFX8-NEXT: v_add_u32_e32 v17, vcc, 0xa0, v0
+; GFX8-NEXT: buffer_store_dword v14, v17, s[0:3], 0 offen
+; GFX8-NEXT: v_add_u32_e32 v14, vcc, 0x9c, v0
+; GFX8-NEXT: buffer_store_dword v16, v14, s[0:3], 0 offen
+; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v36
+; GFX8-NEXT: v_cvt_f64_f32_e32 v[16:17], v16
+; GFX8-NEXT: v_add_u32_e32 v14, vcc, 0x98, v0
+; GFX8-NEXT: buffer_store_dword v15, v14, s[0:3], 0 offen
+; GFX8-NEXT: v_add_u32_e32 v14, vcc, 0x94, v0
+; GFX8-NEXT: buffer_store_dword v17, v14, s[0:3], 0 offen
+; GFX8-NEXT: v_add_u32_e32 v14, vcc, 0x90, v0
+; GFX8-NEXT: buffer_store_dword v16, v14, s[0:3], 0 offen
+; GFX8-NEXT: v_lshlrev_b32_e32 v14, 16, v34
+; GFX8-NEXT: v_cvt_f64_f32_e32 v[14:15], v14
+; GFX8-NEXT: v_add_u32_e32 v17, vcc, 0x8c, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v37
+; GFX8-NEXT: buffer_store_dword v15, v17, s[0:3], 0 offen
+; GFX8-NEXT: v_add_u32_e32 v15, vcc, 0x88, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GFX8-NEXT: buffer_store_dword v14, v15, s[0:3], 0 offen
+; GFX8-NEXT: v_cvt_f64_f32_e32 v[14:15], v16
+; GFX8-NEXT: v_cvt_f64_f32_e32 v[16:17], v13
+; GFX8-NEXT: v_lshlrev_b32_e32 v13, 16, v35
+; GFX8-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; GFX8-NEXT: v_lshlrev_b32_e32 v11, 16, v11
; GFX8-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen
-; GFX8-NEXT: v_cvt_f64_f32_e32 v[17:18], v19
-; GFX8-NEXT: v_lshlrev_b32_e32 v19, 16, v48
-; GFX8-NEXT: v_cvt_f64_f32_e32 v[19:20], v19
-; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v9
-; GFX8-NEXT: v_lshlrev_b32_e32 v14, 16, v14
-; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX8-NEXT: buffer_store_dword v20, v22, s[0:3], 0 offen
-; GFX8-NEXT: v_add_u32_e32 v20, vcc, 0xb0, v0
-; GFX8-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen
-; GFX8-NEXT: v_cvt_f64_f32_e32 v[19:20], v21
-; GFX8-NEXT: v_lshlrev_b32_e32 v21, 16, v37
-; GFX8-NEXT: v_cvt_f64_f32_e32 v[21:22], v21
-; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v8
-; GFX8-NEXT: buffer_store_dword v22, v24, s[0:3], 0 offen
-; GFX8-NEXT: v_add_u32_e32 v22, vcc, 0xa8, v0
-; GFX8-NEXT: buffer_store_dword v21, v22, s[0:3], 0 offen
-; GFX8-NEXT: v_cvt_f64_f32_e32 v[21:22], v23
-; GFX8-NEXT: v_lshlrev_b32_e32 v23, 16, v36
-; GFX8-NEXT: v_cvt_f64_f32_e32 v[23:24], v23
-; GFX8-NEXT: buffer_store_dword v24, v26, s[0:3], 0 offen
-; GFX8-NEXT: v_add_u32_e32 v24, vcc, 0xa0, v0
-; GFX8-NEXT: buffer_store_dword v23, v24, s[0:3], 0 offen
-; GFX8-NEXT: v_cvt_f64_f32_e32 v[23:24], v25
-; GFX8-NEXT: v_lshlrev_b32_e32 v25, 16, v35
-; GFX8-NEXT: v_cvt_f64_f32_e32 v[25:26], v25
-; GFX8-NEXT: buffer_store_dword v26, v27, s[0:3], 0 offen
-; GFX8-NEXT: v_cvt_f64_f32_e32 v[27:28], v10
-; GFX8-NEXT: v_add_u32_e32 v26, vcc, 0x98, v0
-; GFX8-NEXT: v_lshlrev_b32_e32 v10, 16, v11
-; GFX8-NEXT: v_add_u32_e32 v11, vcc, 0x94, v0
-; GFX8-NEXT: buffer_store_dword v25, v26, s[0:3], 0 offen
-; GFX8-NEXT: buffer_store_dword v28, v11, s[0:3], 0 offen
-; GFX8-NEXT: v_add_u32_e32 v11, vcc, 0x90, v0
-; GFX8-NEXT: buffer_store_dword v27, v11, s[0:3], 0 offen
-; GFX8-NEXT: v_cvt_f64_f32_e32 v[27:28], v9
-; GFX8-NEXT: v_cvt_f64_f32_e32 v[25:26], v14
-; GFX8-NEXT: v_add_u32_e32 v14, vcc, 0x8c, v0
-; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v15
-; GFX8-NEXT: buffer_store_dword v28, v14, s[0:3], 0 offen
-; GFX8-NEXT: v_add_u32_e32 v14, vcc, 0x88, v0
-; GFX8-NEXT: buffer_store_dword v27, v14, s[0:3], 0 offen
-; GFX8-NEXT: v_cvt_f64_f32_e32 v[14:15], v9
-; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v16
-; GFX8-NEXT: v_cvt_f64_f32_e32 v[27:28], v9
-; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v13
-; GFX8-NEXT: v_add_u32_e32 v13, vcc, 0x84, v0
-; GFX8-NEXT: buffer_store_dword v28, v13, s[0:3], 0 offen
-; GFX8-NEXT: v_add_u32_e32 v13, vcc, 0x80, v0
-; GFX8-NEXT: buffer_store_dword v27, v13, s[0:3], 0 offen
-; GFX8-NEXT: v_cvt_f64_f32_e32 v[27:28], v9
-; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v12
-; GFX8-NEXT: v_cvt_f64_f32_e32 v[12:13], v9
-; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7c, v0
-; GFX8-NEXT: v_cvt_f64_f32_e32 v[10:11], v10
-; GFX8-NEXT: buffer_store_dword v13, v9, s[0:3], 0 offen
-; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x78, v0
-; GFX8-NEXT: buffer_store_dword v12, v9, s[0:3], 0 offen
-; GFX8-NEXT: v_lshlrev_b32_e32 v12, 16, v6
-; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v7
-; GFX8-NEXT: v_cvt_f64_f32_e32 v[6:7], v6
-; GFX8-NEXT: v_add_u32_e32 v13, vcc, 0x74, v0
-; GFX8-NEXT: v_cvt_f64_f32_e32 v[8:9], v8
-; GFX8-NEXT: buffer_store_dword v7, v13, s[0:3], 0 offen
-; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x70, v0
-; GFX8-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen
-; GFX8-NEXT: v_cvt_f64_f32_e32 v[5:6], v5
+; GFX8-NEXT: v_add_u32_e32 v17, vcc, 0x80, v0
+; GFX8-NEXT: buffer_store_dword v16, v17, s[0:3], 0 offen
+; GFX8-NEXT: v_cvt_f64_f32_e32 v[16:17], v13
; GFX8-NEXT: v_cvt_f64_f32_e32 v[12:13], v12
-; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x6c, v0
-; GFX8-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x68, v0
-; GFX8-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen
-; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x64, v0
-; GFX8-NEXT: buffer_store_dword v13, v5, s[0:3], 0 offen
-; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x60, v0
-; GFX8-NEXT: buffer_store_dword v12, v5, s[0:3], 0 offen
-; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x5c, v0
-; GFX8-NEXT: buffer_store_dword v9, v5, s[0:3], 0 offen
-; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x58, v0
-; GFX8-NEXT: buffer_store_dword v8, v5, s[0:3], 0 offen
+; GFX8-NEXT: v_lshlrev_b32_e32 v18, 16, v33
+; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX8-NEXT: buffer_store_dword v13, v19, s[0:3], 0 offen
+; GFX8-NEXT: v_add_u32_e32 v13, vcc, 0x78, v0
+; GFX8-NEXT: buffer_store_dword v12, v13, s[0:3], 0 offen
+; GFX8-NEXT: v_cvt_f64_f32_e32 v[12:13], v18
+; GFX8-NEXT: v_cvt_f64_f32_e32 v[18:19], v11
+; GFX8-NEXT: v_add_u32_e32 v11, vcc, 0x74, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX8-NEXT: buffer_store_dword v19, v11, s[0:3], 0 offen
+; GFX8-NEXT: v_add_u32_e32 v11, vcc, 0x70, v0
+; GFX8-NEXT: buffer_store_dword v18, v11, s[0:3], 0 offen
+; GFX8-NEXT: v_cvt_f64_f32_e32 v[18:19], v8
+; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v10
+; GFX8-NEXT: v_cvt_f64_f32_e32 v[10:11], v8
+; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x6c, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX8-NEXT: buffer_store_dword v11, v8, s[0:3], 0 offen
+; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x68, v0
+; GFX8-NEXT: buffer_store_dword v10, v8, s[0:3], 0 offen
+; GFX8-NEXT: v_cvt_f64_f32_e32 v[10:11], v6
+; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v9
+; GFX8-NEXT: v_cvt_f64_f32_e32 v[8:9], v6
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x64, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX8-NEXT: buffer_store_dword v9, v6, s[0:3], 0 offen
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x60, v0
+; GFX8-NEXT: buffer_store_dword v8, v6, s[0:3], 0 offen
+; GFX8-NEXT: v_cvt_f64_f32_e32 v[8:9], v4
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v7
+; GFX8-NEXT: v_cvt_f64_f32_e32 v[6:7], v4
+; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x5c, v0
+; GFX8-NEXT: buffer_store_dword v7, v4, s[0:3], 0 offen
+; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x58, v0
+; GFX8-NEXT: buffer_store_dword v6, v4, s[0:3], 0 offen
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v1
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v5
+; GFX8-NEXT: v_cvt_f64_f32_e32 v[6:7], v2
+; GFX8-NEXT: v_cvt_f64_f32_e32 v[1:2], v1
; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x54, v0
-; GFX8-NEXT: buffer_store_dword v28, v5, s[0:3], 0 offen
-; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x50, v0
-; GFX8-NEXT: buffer_store_dword v27, v5, s[0:3], 0 offen
+; GFX8-NEXT: buffer_store_dword v2, v5, s[0:3], 0 offen
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x50, v0
+; GFX8-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX8-NEXT: v_cvt_f64_f32_e32 v[1:2], v3
+; GFX8-NEXT: v_cvt_f64_f32_e32 v[3:4], v4
; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x4c, v0
-; GFX8-NEXT: buffer_store_dword v15, v5, s[0:3], 0 offen
-; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x48, v0
-; GFX8-NEXT: buffer_store_dword v14, v5, s[0:3], 0 offen
-; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x44, v0
-; GFX8-NEXT: buffer_store_dword v11, v5, s[0:3], 0 offen
-; GFX8-NEXT: v_add_u32_e32 v5, vcc, 64, v0
-; GFX8-NEXT: buffer_store_dword v10, v5, s[0:3], 0 offen
-; GFX8-NEXT: v_add_u32_e32 v5, vcc, 60, v0
-; GFX8-NEXT: buffer_store_dword v26, v5, s[0:3], 0 offen
-; GFX8-NEXT: v_add_u32_e32 v5, vcc, 56, v0
-; GFX8-NEXT: buffer_store_dword v25, v5, s[0:3], 0 offen
-; GFX8-NEXT: v_add_u32_e32 v5, vcc, 52, v0
-; GFX8-NEXT: buffer_store_dword v24, v5, s[0:3], 0 offen
-; GFX8-NEXT: v_add_u32_e32 v5, vcc, 48, v0
-; GFX8-NEXT: buffer_store_dword v23, v5, s[0:3], 0 offen
-; GFX8-NEXT: v_add_u32_e32 v5, vcc, 44, v0
-; GFX8-NEXT: buffer_store_dword v22, v5, s[0:3], 0 offen
-; GFX8-NEXT: v_add_u32_e32 v5, vcc, 40, v0
-; GFX8-NEXT: buffer_store_dword v21, v5, s[0:3], 0 offen
-; GFX8-NEXT: v_add_u32_e32 v5, vcc, 36, v0
-; GFX8-NEXT: buffer_store_dword v20, v5, s[0:3], 0 offen
-; GFX8-NEXT: v_add_u32_e32 v5, vcc, 32, v0
-; GFX8-NEXT: buffer_store_dword v19, v5, s[0:3], 0 offen
-; GFX8-NEXT: v_add_u32_e32 v5, vcc, 28, v0
-; GFX8-NEXT: buffer_store_dword v18, v5, s[0:3], 0 offen
-; GFX8-NEXT: v_add_u32_e32 v5, vcc, 24, v0
-; GFX8-NEXT: buffer_store_dword v17, v5, s[0:3], 0 offen
-; GFX8-NEXT: v_add_u32_e32 v5, vcc, 20, v0
-; GFX8-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen
-; GFX8-NEXT: v_add_u32_e32 v4, vcc, 16, v0
-; GFX8-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen
-; GFX8-NEXT: v_add_u32_e32 v3, vcc, 12, v0
+; GFX8-NEXT: buffer_store_dword v2, v5, s[0:3], 0 offen
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x48, v0
+; GFX8-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x44, v0
+; GFX8-NEXT: buffer_store_dword v4, v1, s[0:3], 0 offen
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, 64, v0
+; GFX8-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, 60, v0
+; GFX8-NEXT: buffer_store_dword v7, v1, s[0:3], 0 offen
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, 56, v0
+; GFX8-NEXT: buffer_store_dword v6, v1, s[0:3], 0 offen
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, 52, v0
+; GFX8-NEXT: buffer_store_dword v9, v1, s[0:3], 0 offen
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, 48, v0
+; GFX8-NEXT: buffer_store_dword v8, v1, s[0:3], 0 offen
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, 44, v0
+; GFX8-NEXT: buffer_store_dword v11, v1, s[0:3], 0 offen
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, 40, v0
+; GFX8-NEXT: buffer_store_dword v10, v1, s[0:3], 0 offen
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, 36, v0
+; GFX8-NEXT: buffer_store_dword v19, v1, s[0:3], 0 offen
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, 32, v0
+; GFX8-NEXT: buffer_store_dword v18, v1, s[0:3], 0 offen
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, 28, v0
+; GFX8-NEXT: buffer_store_dword v13, v1, s[0:3], 0 offen
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, 24, v0
+; GFX8-NEXT: buffer_store_dword v12, v1, s[0:3], 0 offen
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, 20, v0
+; GFX8-NEXT: buffer_store_dword v17, v1, s[0:3], 0 offen
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, 16, v0
+; GFX8-NEXT: buffer_store_dword v16, v1, s[0:3], 0 offen
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, 12, v0
; GFX8-NEXT: v_add_u32_e32 v0, vcc, 8, v0
-; GFX8-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen
-; GFX8-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; GFX8-NEXT: buffer_load_dword v57, off, s[0:3], s32 ; 4-byte Folded Reload
-; GFX8-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; GFX8-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
-; GFX8-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; GFX8-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
-; GFX8-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; GFX8-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
-; GFX8-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; GFX8-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
-; GFX8-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; GFX8-NEXT: buffer_store_dword v15, v1, s[0:3], 0 offen
+; GFX8-NEXT: buffer_store_dword v14, v0, s[0:3], 0 offen
+; GFX8-NEXT: buffer_load_dword v58, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX8-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; GFX8-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; GFX8-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; GFX8-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; GFX8-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; GFX8-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; GFX8-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; GFX8-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; GFX8-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; GFX8-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: global_extload_v32bf16_to_v32f64:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_ushort v21, v[1:2], off offset:62
-; GFX9-NEXT: global_load_ushort v23, v[1:2], off offset:60
-; GFX9-NEXT: global_load_ushort v24, v[1:2], off offset:58
-; GFX9-NEXT: global_load_ushort v25, v[1:2], off offset:56
-; GFX9-NEXT: global_load_ushort v26, v[1:2], off offset:54
-; GFX9-NEXT: global_load_ushort v27, v[1:2], off offset:52
-; GFX9-NEXT: global_load_ushort v28, v[1:2], off offset:50
-; GFX9-NEXT: global_load_ushort v29, v[1:2], off offset:48
-; GFX9-NEXT: global_load_ushort v30, v[1:2], off offset:46
-; GFX9-NEXT: global_load_ushort v31, v[1:2], off offset:44
-; GFX9-NEXT: global_load_ushort v32, v[1:2], off offset:42
-; GFX9-NEXT: global_load_ushort v33, v[1:2], off offset:40
-; GFX9-NEXT: global_load_ushort v34, v[1:2], off offset:38
-; GFX9-NEXT: global_load_ushort v19, v[1:2], off
-; GFX9-NEXT: global_load_ushort v20, v[1:2], off offset:36
-; GFX9-NEXT: global_load_ushort v17, v[1:2], off offset:2
-; GFX9-NEXT: global_load_ushort v18, v[1:2], off offset:4
-; GFX9-NEXT: global_load_ushort v16, v[1:2], off offset:34
-; GFX9-NEXT: global_load_ushort v11, v[1:2], off offset:32
-; GFX9-NEXT: global_load_ushort v13, v[1:2], off offset:6
-; GFX9-NEXT: global_load_ushort v14, v[1:2], off offset:8
-; GFX9-NEXT: global_load_ushort v15, v[1:2], off offset:30
+; GFX9-NEXT: global_load_ushort v8, v[1:2], off offset:62
+; GFX9-NEXT: global_load_ushort v10, v[1:2], off offset:60
+; GFX9-NEXT: global_load_ushort v11, v[1:2], off offset:58
+; GFX9-NEXT: global_load_ushort v12, v[1:2], off offset:56
+; GFX9-NEXT: global_load_ushort v13, v[1:2], off offset:54
+; GFX9-NEXT: global_load_ushort v14, v[1:2], off offset:52
+; GFX9-NEXT: global_load_ushort v15, v[1:2], off offset:50
+; GFX9-NEXT: global_load_ushort v16, v[1:2], off offset:48
+; GFX9-NEXT: global_load_ushort v17, v[1:2], off offset:46
+; GFX9-NEXT: global_load_ushort v18, v[1:2], off offset:44
+; GFX9-NEXT: global_load_ushort v19, v[1:2], off offset:42
+; GFX9-NEXT: global_load_ushort v20, v[1:2], off offset:40
+; GFX9-NEXT: global_load_ushort v21, v[1:2], off offset:38
+; GFX9-NEXT: global_load_ushort v22, v[1:2], off offset:36
+; GFX9-NEXT: global_load_ushort v23, v[1:2], off offset:34
+; GFX9-NEXT: global_load_ushort v24, v[1:2], off offset:32
+; GFX9-NEXT: global_load_ushort v25, v[1:2], off
+; GFX9-NEXT: global_load_ushort v26, v[1:2], off offset:2
+; GFX9-NEXT: global_load_ushort v27, v[1:2], off offset:30
; GFX9-NEXT: global_load_ushort v3, v[1:2], off offset:16
; GFX9-NEXT: global_load_ushort v4, v[1:2], off offset:18
; GFX9-NEXT: global_load_ushort v5, v[1:2], off offset:20
; GFX9-NEXT: global_load_ushort v6, v[1:2], off offset:22
-; GFX9-NEXT: global_load_ushort v8, v[1:2], off offset:24
-; GFX9-NEXT: global_load_ushort v10, v[1:2], off offset:26
-; GFX9-NEXT: global_load_ushort v12, v[1:2], off offset:28
-; GFX9-NEXT: global_load_ushort v9, v[1:2], off offset:10
+; GFX9-NEXT: global_load_ushort v28, v[1:2], off offset:24
+; GFX9-NEXT: global_load_ushort v29, v[1:2], off offset:26
+; GFX9-NEXT: global_load_ushort v30, v[1:2], off offset:28
+; GFX9-NEXT: global_load_ushort v31, v[1:2], off offset:4
+; GFX9-NEXT: global_load_ushort v32, v[1:2], off offset:6
+; GFX9-NEXT: global_load_ushort v33, v[1:2], off offset:8
+; GFX9-NEXT: global_load_ushort v34, v[1:2], off offset:10
; GFX9-NEXT: global_load_ushort v7, v[1:2], off offset:12
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: global_load_ushort v1, v[1:2], off offset:14
; GFX9-NEXT: s_waitcnt vmcnt(31)
-; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v21
-; GFX9-NEXT: v_cvt_f64_f32_e32 v[21:22], v2
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v8
+; GFX9-NEXT: v_cvt_f64_f32_e32 v[8:9], v2
; GFX9-NEXT: s_waitcnt vmcnt(30)
-; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v23
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v10
; GFX9-NEXT: s_waitcnt vmcnt(28)
-; GFX9-NEXT: v_lshlrev_b32_e32 v23, 16, v25
-; GFX9-NEXT: buffer_store_dword v22, v0, s[0:3], 0 offen offset:252
-; GFX9-NEXT: buffer_store_dword v21, v0, s[0:3], 0 offen offset:248
-; GFX9-NEXT: v_cvt_f64_f32_e32 v[21:22], v2
-; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v24
+; GFX9-NEXT: v_lshlrev_b32_e32 v10, 16, v12
+; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:252
+; GFX9-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:248
+; GFX9-NEXT: v_cvt_f64_f32_e32 v[8:9], v2
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v11
; GFX9-NEXT: s_waitcnt vmcnt(29)
-; GFX9-NEXT: v_lshlrev_b32_e32 v24, 16, v26
-; GFX9-NEXT: buffer_store_dword v22, v0, s[0:3], 0 offen offset:244
-; GFX9-NEXT: buffer_store_dword v21, v0, s[0:3], 0 offen offset:240
-; GFX9-NEXT: v_cvt_f64_f32_e32 v[21:22], v2
+; GFX9-NEXT: v_lshlrev_b32_e32 v11, 16, v13
+; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:244
+; GFX9-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:240
+; GFX9-NEXT: v_cvt_f64_f32_e32 v[8:9], v2
; GFX9-NEXT: s_waitcnt vmcnt(30)
-; GFX9-NEXT: v_lshlrev_b32_e32 v25, 16, v27
-; GFX9-NEXT: buffer_store_dword v22, v0, s[0:3], 0 offen offset:236
-; GFX9-NEXT: buffer_store_dword v21, v0, s[0:3], 0 offen offset:232
-; GFX9-NEXT: v_cvt_f64_f32_e32 v[21:22], v23
-; GFX9-NEXT: v_cvt_f64_f32_e32 v[23:24], v24
+; GFX9-NEXT: v_lshlrev_b32_e32 v12, 16, v14
+; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:236
+; GFX9-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:232
+; GFX9-NEXT: v_cvt_f64_f32_e32 v[8:9], v10
+; GFX9-NEXT: v_cvt_f64_f32_e32 v[10:11], v11
; GFX9-NEXT: s_waitcnt vmcnt(31)
-; GFX9-NEXT: v_lshlrev_b32_e32 v26, 16, v28
+; GFX9-NEXT: v_lshlrev_b32_e32 v13, 16, v15
; GFX9-NEXT: s_waitcnt vmcnt(30)
-; GFX9-NEXT: v_lshlrev_b32_e32 v27, 16, v29
-; GFX9-NEXT: s_waitcnt vmcnt(29)
-; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v30
-; GFX9-NEXT: buffer_store_dword v22, v0, s[0:3], 0 offen offset:228
-; GFX9-NEXT: buffer_store_dword v21, v0, s[0:3], 0 offen offset:224
-; GFX9-NEXT: v_cvt_f64_f32_e32 v[21:22], v25
-; GFX9-NEXT: v_cvt_f64_f32_e32 v[25:26], v26
-; GFX9-NEXT: buffer_store_dword v24, v0, s[0:3], 0 offen offset:220
-; GFX9-NEXT: buffer_store_dword v23, v0, s[0:3], 0 offen offset:216
-; GFX9-NEXT: v_cvt_f64_f32_e32 v[23:24], v27
-; GFX9-NEXT: v_cvt_f64_f32_e32 v[27:28], v2
+; GFX9-NEXT: v_lshlrev_b32_e32 v14, 16, v16
+; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:228
+; GFX9-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:224
+; GFX9-NEXT: v_cvt_f64_f32_e32 v[8:9], v12
+; GFX9-NEXT: s_waitcnt vmcnt(31)
+; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v17
+; GFX9-NEXT: v_cvt_f64_f32_e32 v[12:13], v13
+; GFX9-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:220
+; GFX9-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:216
+; GFX9-NEXT: v_cvt_f64_f32_e32 v[10:11], v14
+; GFX9-NEXT: v_cvt_f64_f32_e32 v[14:15], v15
+; GFX9-NEXT: s_waitcnt vmcnt(32)
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v18
+; GFX9-NEXT: s_waitcnt vmcnt(30)
+; GFX9-NEXT: v_lshlrev_b32_e32 v18, 16, v20
; GFX9-NEXT: s_waitcnt vmcnt(28)
-; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v19
-; GFX9-NEXT: s_waitcnt vmcnt(27)
-; GFX9-NEXT: v_lshlrev_b32_e32 v19, 16, v20
-; GFX9-NEXT: v_cvt_f64_f32_e32 v[19:20], v19
-; GFX9-NEXT: v_lshlrev_b32_e32 v29, 16, v31
-; GFX9-NEXT: v_lshlrev_b32_e32 v30, 16, v32
-; GFX9-NEXT: v_lshlrev_b32_e32 v31, 16, v33
-; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v34
-; GFX9-NEXT: buffer_store_dword v22, v0, s[0:3], 0 offen offset:212
-; GFX9-NEXT: buffer_store_dword v21, v0, s[0:3], 0 offen offset:208
-; GFX9-NEXT: v_cvt_f64_f32_e32 v[21:22], v29
-; GFX9-NEXT: s_waitcnt vmcnt(26)
-; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v16
-; GFX9-NEXT: v_cvt_f64_f32_e32 v[29:30], v30
-; GFX9-NEXT: buffer_store_dword v26, v0, s[0:3], 0 offen offset:204
-; GFX9-NEXT: buffer_store_dword v25, v0, s[0:3], 0 offen offset:200
-; GFX9-NEXT: v_cvt_f64_f32_e32 v[25:26], v31
-; GFX9-NEXT: v_cvt_f64_f32_e32 v[31:32], v32
-; GFX9-NEXT: buffer_store_dword v24, v0, s[0:3], 0 offen offset:196
-; GFX9-NEXT: buffer_store_dword v23, v0, s[0:3], 0 offen offset:192
-; GFX9-NEXT: buffer_store_dword v28, v0, s[0:3], 0 offen offset:188
-; GFX9-NEXT: buffer_store_dword v27, v0, s[0:3], 0 offen offset:184
-; GFX9-NEXT: buffer_store_dword v22, v0, s[0:3], 0 offen offset:180
-; GFX9-NEXT: buffer_store_dword v21, v0, s[0:3], 0 offen offset:176
-; GFX9-NEXT: buffer_store_dword v30, v0, s[0:3], 0 offen offset:172
-; GFX9-NEXT: buffer_store_dword v29, v0, s[0:3], 0 offen offset:168
-; GFX9-NEXT: buffer_store_dword v26, v0, s[0:3], 0 offen offset:164
-; GFX9-NEXT: buffer_store_dword v25, v0, s[0:3], 0 offen offset:160
-; GFX9-NEXT: buffer_store_dword v32, v0, s[0:3], 0 offen offset:156
-; GFX9-NEXT: buffer_store_dword v31, v0, s[0:3], 0 offen offset:152
-; GFX9-NEXT: v_lshlrev_b32_e32 v21, 16, v17
+; GFX9-NEXT: v_lshlrev_b32_e32 v20, 16, v22
+; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:212
+; GFX9-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:208
+; GFX9-NEXT: v_cvt_f64_f32_e32 v[8:9], v2
+; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v19
+; GFX9-NEXT: v_lshlrev_b32_e32 v19, 16, v21
+; GFX9-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:204
+; GFX9-NEXT: buffer_store_dword v12, v0, s[0:3], 0 offen offset:200
+; GFX9-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:196
+; GFX9-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:192
+; GFX9-NEXT: v_cvt_f64_f32_e32 v[10:11], v20
+; GFX9-NEXT: s_waitcnt vmcnt(33)
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v23
; GFX9-NEXT: v_cvt_f64_f32_e32 v[16:17], v16
-; GFX9-NEXT: s_waitcnt vmcnt(39)
-; GFX9-NEXT: v_lshlrev_b32_e32 v11, 16, v11
-; GFX9-NEXT: buffer_store_dword v20, v0, s[0:3], 0 offen offset:148
-; GFX9-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen offset:144
+; GFX9-NEXT: v_cvt_f64_f32_e32 v[12:13], v18
+; GFX9-NEXT: v_cvt_f64_f32_e32 v[18:19], v19
+; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:188
+; GFX9-NEXT: buffer_store_dword v14, v0, s[0:3], 0 offen offset:184
+; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:180
+; GFX9-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:176
+; GFX9-NEXT: buffer_store_dword v17, v0, s[0:3], 0 offen offset:172
+; GFX9-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen offset:168
+; GFX9-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:164
+; GFX9-NEXT: buffer_store_dword v12, v0, s[0:3], 0 offen offset:160
+; GFX9-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen offset:156
+; GFX9-NEXT: buffer_store_dword v18, v0, s[0:3], 0 offen offset:152
+; GFX9-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:148
+; GFX9-NEXT: v_cvt_f64_f32_e32 v[8:9], v2
+; GFX9-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:144
+; GFX9-NEXT: s_waitcnt vmcnt(44)
+; GFX9-NEXT: v_lshlrev_b32_e32 v10, 16, v24
+; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:140
+; GFX9-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:136
+; GFX9-NEXT: v_cvt_f64_f32_e32 v[8:9], v10
+; GFX9-NEXT: s_waitcnt vmcnt(43)
+; GFX9-NEXT: v_lshlrev_b32_e32 v12, 16, v27
+; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:132
+; GFX9-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:128
+; GFX9-NEXT: v_cvt_f64_f32_e32 v[8:9], v12
+; GFX9-NEXT: s_waitcnt vmcnt(38)
+; GFX9-NEXT: v_lshlrev_b32_e32 v14, 16, v30
+; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:124
+; GFX9-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:120
+; GFX9-NEXT: v_cvt_f64_f32_e32 v[8:9], v14
+; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v29
+; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:116
+; GFX9-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:112
+; GFX9-NEXT: v_cvt_f64_f32_e32 v[8:9], v16
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v25
+; GFX9-NEXT: v_cvt_f64_f32_e32 v[10:11], v2
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v26
+; GFX9-NEXT: v_cvt_f64_f32_e32 v[12:13], v2
+; GFX9-NEXT: s_waitcnt vmcnt(41)
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v31
+; GFX9-NEXT: v_lshlrev_b32_e32 v18, 16, v28
+; GFX9-NEXT: v_cvt_f64_f32_e32 v[14:15], v2
; GFX9-NEXT: s_waitcnt vmcnt(40)
-; GFX9-NEXT: v_lshlrev_b32_e32 v20, 16, v13
-; GFX9-NEXT: s_waitcnt vmcnt(39)
-; GFX9-NEXT: v_lshlrev_b32_e32 v23, 16, v14
-; GFX9-NEXT: v_cvt_f64_f32_e32 v[13:14], v11
-; GFX9-NEXT: buffer_store_dword v17, v0, s[0:3], 0 offen offset:140
-; GFX9-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen offset:136
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v32
+; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:108
+; GFX9-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:104
+; GFX9-NEXT: v_cvt_f64_f32_e32 v[8:9], v18
; GFX9-NEXT: v_cvt_f64_f32_e32 v[16:17], v2
+; GFX9-NEXT: s_waitcnt vmcnt(41)
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v33
+; GFX9-NEXT: v_cvt_f64_f32_e32 v[18:19], v2
; GFX9-NEXT: s_waitcnt vmcnt(40)
-; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v15
-; GFX9-NEXT: buffer_store_dword v14, v0, s[0:3], 0 offen offset:132
-; GFX9-NEXT: v_cvt_f64_f32_e32 v[14:15], v2
-; GFX9-NEXT: s_waitcnt vmcnt(34)
-; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v12
-; GFX9-NEXT: v_cvt_f64_f32_e32 v[11:12], v2
-; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v10
-; GFX9-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:128
-; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:124
-; GFX9-NEXT: buffer_store_dword v14, v0, s[0:3], 0 offen offset:120
-; GFX9-NEXT: buffer_store_dword v12, v0, s[0:3], 0 offen offset:116
-; GFX9-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:112
-; GFX9-NEXT: v_cvt_f64_f32_e32 v[10:11], v2
-; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v8
-; GFX9-NEXT: s_waitcnt vmcnt(38)
-; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v9
-; GFX9-NEXT: v_cvt_f64_f32_e32 v[8:9], v8
-; GFX9-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:108
-; GFX9-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:104
-; GFX9-NEXT: v_cvt_f64_f32_e32 v[10:11], v2
-; GFX9-NEXT: s_waitcnt vmcnt(39)
-; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v7
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v34
; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v6
-; GFX9-NEXT: s_waitcnt vmcnt(38)
-; GFX9-NEXT: v_lshlrev_b32_e32 v12, 16, v1
-; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v5
+; GFX9-NEXT: v_cvt_f64_f32_e32 v[20:21], v2
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v5
; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:100
; GFX9-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:96
-; GFX9-NEXT: v_cvt_f64_f32_e32 v[6:7], v6
-; GFX9-NEXT: v_cvt_f64_f32_e32 v[8:9], v2
+; GFX9-NEXT: v_cvt_f64_f32_e32 v[8:9], v6
+; GFX9-NEXT: v_cvt_f64_f32_e32 v[5:6], v2
+; GFX9-NEXT: s_waitcnt vmcnt(41)
+; GFX9-NEXT: v_lshlrev_b32_e32 v22, 16, v7
+; GFX9-NEXT: s_waitcnt vmcnt(40)
+; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v1
+; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v4
+; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:92
+; GFX9-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:88
; GFX9-NEXT: v_cvt_f64_f32_e32 v[1:2], v1
-; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v3
-; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v4
-; GFX9-NEXT: v_cvt_f64_f32_e32 v[3:4], v3
-; GFX9-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:92
-; GFX9-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:88
-; GFX9-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:84
-; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:80
-; GFX9-NEXT: v_cvt_f64_f32_e32 v[1:2], v5
-; GFX9-NEXT: v_lshlrev_b32_e32 v22, 16, v18
-; GFX9-NEXT: v_cvt_f64_f32_e32 v[18:19], v21
-; GFX9-NEXT: v_cvt_f64_f32_e32 v[13:14], v22
-; GFX9-NEXT: v_cvt_f64_f32_e32 v[20:21], v20
-; GFX9-NEXT: v_cvt_f64_f32_e32 v[22:23], v23
-; GFX9-NEXT: v_cvt_f64_f32_e32 v[5:6], v12
-; GFX9-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:76
-; GFX9-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:72
-; GFX9-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:68
-; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:64
-; GFX9-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:60
-; GFX9-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:56
-; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:52
-; GFX9-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:48
-; GFX9-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:44
-; GFX9-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:40
-; GFX9-NEXT: buffer_store_dword v23, v0, s[0:3], 0 offen offset:36
-; GFX9-NEXT: buffer_store_dword v22, v0, s[0:3], 0 offen offset:32
-; GFX9-NEXT: buffer_store_dword v21, v0, s[0:3], 0 offen offset:28
-; GFX9-NEXT: buffer_store_dword v20, v0, s[0:3], 0 offen offset:24
-; GFX9-NEXT: buffer_store_dword v14, v0, s[0:3], 0 offen offset:20
-; GFX9-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:16
-; GFX9-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen offset:12
-; GFX9-NEXT: buffer_store_dword v18, v0, s[0:3], 0 offen offset:8
-; GFX9-NEXT: buffer_store_dword v17, v0, s[0:3], 0 offen offset:4
-; GFX9-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen
+; GFX9-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:84
+; GFX9-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:80
+; GFX9-NEXT: v_cvt_f64_f32_e32 v[4:5], v7
+; GFX9-NEXT: v_cvt_f64_f32_e32 v[6:7], v3
+; GFX9-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:76
+; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:72
+; GFX9-NEXT: v_cvt_f64_f32_e32 v[1:2], v22
+; GFX9-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:68
+; GFX9-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:64
+; GFX9-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:60
+; GFX9-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:56
+; GFX9-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:52
+; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:48
+; GFX9-NEXT: buffer_store_dword v21, v0, s[0:3], 0 offen offset:44
+; GFX9-NEXT: buffer_store_dword v20, v0, s[0:3], 0 offen offset:40
+; GFX9-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen offset:36
+; GFX9-NEXT: buffer_store_dword v18, v0, s[0:3], 0 offen offset:32
+; GFX9-NEXT: buffer_store_dword v17, v0, s[0:3], 0 offen offset:28
+; GFX9-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen offset:24
+; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:20
+; GFX9-NEXT: buffer_store_dword v14, v0, s[0:3], 0 offen offset:16
+; GFX9-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:12
+; GFX9-NEXT: buffer_store_dword v12, v0, s[0:3], 0 offen offset:8
+; GFX9-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:4
+; GFX9-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
@@ -8612,179 +8612,177 @@ define <32 x double> @global_extload_v32bf16_to_v32f64(ptr addrspace(1) %ptr) {
; GFX10-NEXT: global_load_ushort v16, v[1:2], off offset:26
; GFX10-NEXT: global_load_ushort v17, v[1:2], off offset:28
; GFX10-NEXT: global_load_ushort v18, v[1:2], off offset:30
-; GFX10-NEXT: global_load_ushort v19, v[1:2], off offset:32
-; GFX10-NEXT: global_load_ushort v20, v[1:2], off offset:34
-; GFX10-NEXT: global_load_ushort v21, v[1:2], off offset:36
-; GFX10-NEXT: global_load_ushort v22, v[1:2], off offset:38
-; GFX10-NEXT: global_load_ushort v23, v[1:2], off offset:40
-; GFX10-NEXT: global_load_ushort v24, v[1:2], off offset:42
-; GFX10-NEXT: global_load_ushort v25, v[1:2], off offset:44
-; GFX10-NEXT: global_load_ushort v26, v[1:2], off offset:46
-; GFX10-NEXT: global_load_ushort v27, v[1:2], off offset:48
-; GFX10-NEXT: global_load_ushort v28, v[1:2], off offset:62
-; GFX10-NEXT: global_load_ushort v29, v[1:2], off offset:50
-; GFX10-NEXT: global_load_ushort v30, v[1:2], off offset:52
-; GFX10-NEXT: global_load_ushort v31, v[1:2], off offset:54
-; GFX10-NEXT: global_load_ushort v32, v[1:2], off offset:60
-; GFX10-NEXT: global_load_ushort v33, v[1:2], off offset:56
-; GFX10-NEXT: global_load_ushort v34, v[1:2], off offset:58
+; GFX10-NEXT: global_load_ushort v19, v[1:2], off offset:62
+; GFX10-NEXT: global_load_ushort v20, v[1:2], off offset:32
+; GFX10-NEXT: global_load_ushort v21, v[1:2], off offset:34
+; GFX10-NEXT: global_load_ushort v22, v[1:2], off offset:36
+; GFX10-NEXT: global_load_ushort v23, v[1:2], off offset:60
+; GFX10-NEXT: global_load_ushort v24, v[1:2], off offset:38
+; GFX10-NEXT: global_load_ushort v25, v[1:2], off offset:40
+; GFX10-NEXT: global_load_ushort v26, v[1:2], off offset:58
+; GFX10-NEXT: global_load_ushort v27, v[1:2], off offset:42
+; GFX10-NEXT: global_load_ushort v28, v[1:2], off offset:44
+; GFX10-NEXT: global_load_ushort v29, v[1:2], off offset:56
+; GFX10-NEXT: global_load_ushort v30, v[1:2], off offset:46
+; GFX10-NEXT: global_load_ushort v31, v[1:2], off offset:48
+; GFX10-NEXT: global_load_ushort v32, v[1:2], off offset:54
+; GFX10-NEXT: global_load_ushort v33, v[1:2], off offset:50
+; GFX10-NEXT: global_load_ushort v34, v[1:2], off offset:52
; GFX10-NEXT: s_waitcnt vmcnt(31)
-; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX10-NEXT: v_lshlrev_b32_e32 v35, 16, v3
; GFX10-NEXT: s_waitcnt vmcnt(30)
-; GFX10-NEXT: v_lshlrev_b32_e32 v35, 16, v4
+; GFX10-NEXT: v_lshlrev_b32_e32 v36, 16, v4
; GFX10-NEXT: s_waitcnt vmcnt(29)
-; GFX10-NEXT: v_lshlrev_b32_e32 v36, 16, v5
+; GFX10-NEXT: v_lshlrev_b32_e32 v37, 16, v5
; GFX10-NEXT: s_waitcnt vmcnt(28)
-; GFX10-NEXT: v_lshlrev_b32_e32 v37, 16, v6
+; GFX10-NEXT: v_lshlrev_b32_e32 v38, 16, v6
; GFX10-NEXT: s_waitcnt vmcnt(27)
-; GFX10-NEXT: v_lshlrev_b32_e32 v38, 16, v7
+; GFX10-NEXT: v_lshlrev_b32_e32 v39, 16, v7
; GFX10-NEXT: s_waitcnt vmcnt(26)
-; GFX10-NEXT: v_lshlrev_b32_e32 v39, 16, v8
+; GFX10-NEXT: v_lshlrev_b32_e32 v48, 16, v8
; GFX10-NEXT: s_waitcnt vmcnt(25)
-; GFX10-NEXT: v_lshlrev_b32_e32 v48, 16, v9
+; GFX10-NEXT: v_lshlrev_b32_e32 v49, 16, v9
; GFX10-NEXT: s_waitcnt vmcnt(24)
-; GFX10-NEXT: v_lshlrev_b32_e32 v49, 16, v10
+; GFX10-NEXT: v_lshlrev_b32_e32 v50, 16, v10
; GFX10-NEXT: s_waitcnt vmcnt(23)
-; GFX10-NEXT: v_lshlrev_b32_e32 v50, 16, v11
+; GFX10-NEXT: v_lshlrev_b32_e32 v51, 16, v11
; GFX10-NEXT: s_waitcnt vmcnt(22)
-; GFX10-NEXT: v_lshlrev_b32_e32 v51, 16, v12
+; GFX10-NEXT: v_lshlrev_b32_e32 v52, 16, v12
; GFX10-NEXT: s_waitcnt vmcnt(21)
-; GFX10-NEXT: v_lshlrev_b32_e32 v52, 16, v13
+; GFX10-NEXT: v_lshlrev_b32_e32 v53, 16, v13
; GFX10-NEXT: s_waitcnt vmcnt(20)
-; GFX10-NEXT: v_lshlrev_b32_e32 v53, 16, v14
-; GFX10-NEXT: s_waitcnt vmcnt(19)
-; GFX10-NEXT: v_lshlrev_b32_e32 v54, 16, v15
-; GFX10-NEXT: s_waitcnt vmcnt(18)
-; GFX10-NEXT: v_lshlrev_b32_e32 v55, 16, v16
-; GFX10-NEXT: v_cvt_f64_f32_e32 v[11:12], v37
-; GFX10-NEXT: v_cvt_f64_f32_e32 v[15:16], v38
+; GFX10-NEXT: v_lshlrev_b32_e32 v54, 16, v14
+; GFX10-NEXT: v_cvt_f64_f32_e32 v[9:10], v35
+; GFX10-NEXT: v_cvt_f64_f32_e32 v[13:14], v36
+; GFX10-NEXT: s_waitcnt vmcnt(17)
+; GFX10-NEXT: v_lshlrev_b32_e32 v65, 16, v17
+; GFX10-NEXT: s_waitcnt vmcnt(16)
+; GFX10-NEXT: v_lshlrev_b32_e32 v66, 16, v18
; GFX10-NEXT: s_waitcnt vmcnt(15)
-; GFX10-NEXT: v_lshlrev_b32_e32 v66, 16, v19
+; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v19
; GFX10-NEXT: s_waitcnt vmcnt(14)
; GFX10-NEXT: v_lshlrev_b32_e32 v67, 16, v20
; GFX10-NEXT: s_waitcnt vmcnt(13)
-; GFX10-NEXT: v_lshlrev_b32_e32 v64, 16, v21
+; GFX10-NEXT: v_lshlrev_b32_e32 v68, 16, v21
; GFX10-NEXT: s_waitcnt vmcnt(12)
-; GFX10-NEXT: v_lshlrev_b32_e32 v65, 16, v22
+; GFX10-NEXT: v_lshlrev_b32_e32 v69, 16, v22
; GFX10-NEXT: s_waitcnt vmcnt(11)
-; GFX10-NEXT: v_lshlrev_b32_e32 v70, 16, v23
-; GFX10-NEXT: s_waitcnt vmcnt(10)
-; GFX10-NEXT: v_lshlrev_b32_e32 v71, 16, v24
+; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v23
+; GFX10-NEXT: v_cvt_f64_f32_e32 v[1:2], v1
; GFX10-NEXT: s_waitcnt vmcnt(9)
-; GFX10-NEXT: v_lshlrev_b32_e32 v80, 16, v25
+; GFX10-NEXT: v_lshlrev_b32_e32 v71, 16, v25
; GFX10-NEXT: s_waitcnt vmcnt(8)
-; GFX10-NEXT: v_lshlrev_b32_e32 v81, 16, v26
+; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v26
; GFX10-NEXT: s_waitcnt vmcnt(7)
-; GFX10-NEXT: v_lshlrev_b32_e32 v82, 16, v27
-; GFX10-NEXT: s_waitcnt vmcnt(6)
-; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v28
+; GFX10-NEXT: v_lshlrev_b32_e32 v80, 16, v27
+; GFX10-NEXT: v_cvt_f64_f32_e32 v[3:4], v3
; GFX10-NEXT: s_waitcnt vmcnt(5)
-; GFX10-NEXT: v_lshlrev_b32_e32 v83, 16, v29
+; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v29
; GFX10-NEXT: s_waitcnt vmcnt(4)
-; GFX10-NEXT: v_lshlrev_b32_e32 v84, 16, v30
-; GFX10-NEXT: s_waitcnt vmcnt(3)
-; GFX10-NEXT: v_lshlrev_b32_e32 v29, 16, v31
+; GFX10-NEXT: v_lshlrev_b32_e32 v27, 16, v30
+; GFX10-NEXT: v_cvt_f64_f32_e32 v[5:6], v5
; GFX10-NEXT: s_waitcnt vmcnt(2)
-; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v32
-; GFX10-NEXT: v_cvt_f64_f32_e32 v[1:2], v1
+; GFX10-NEXT: v_lshlrev_b32_e32 v11, 16, v32
+; GFX10-NEXT: s_waitcnt vmcnt(1)
+; GFX10-NEXT: v_lshlrev_b32_e32 v23, 16, v33
+; GFX10-NEXT: v_cvt_f64_f32_e32 v[7:8], v7
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_lshlrev_b32_e32 v13, 16, v34
-; GFX10-NEXT: v_lshlrev_b32_e32 v21, 16, v33
-; GFX10-NEXT: v_cvt_f64_f32_e32 v[29:30], v29
-; GFX10-NEXT: v_cvt_f64_f32_e32 v[5:6], v5
-; GFX10-NEXT: v_cvt_f64_f32_e32 v[37:38], v84
-; GFX10-NEXT: v_cvt_f64_f32_e32 v[13:14], v13
-; GFX10-NEXT: v_cvt_f64_f32_e32 v[21:22], v21
-; GFX10-NEXT: v_cvt_f64_f32_e32 v[25:26], v50
-; GFX10-NEXT: v_cvt_f64_f32_e32 v[27:28], v51
-; GFX10-NEXT: v_cvt_f64_f32_e32 v[50:51], v82
-; GFX10-NEXT: v_cvt_f64_f32_e32 v[31:32], v52
-; GFX10-NEXT: v_cvt_f64_f32_e32 v[33:34], v53
-; GFX10-NEXT: v_cvt_f64_f32_e32 v[52:53], v80
-; GFX10-NEXT: v_cvt_f64_f32_e32 v[7:8], v35
-; GFX10-NEXT: v_cvt_f64_f32_e32 v[9:10], v36
-; GFX10-NEXT: v_cvt_f64_f32_e32 v[19:20], v48
-; GFX10-NEXT: v_cvt_f64_f32_e32 v[23:24], v49
-; GFX10-NEXT: v_cvt_f64_f32_e32 v[35:36], v54
-; GFX10-NEXT: v_cvt_f64_f32_e32 v[48:49], v55
-; GFX10-NEXT: v_cvt_f64_f32_e32 v[54:55], v70
-; GFX10-NEXT: v_lshlrev_b32_e32 v69, 16, v18
+; GFX10-NEXT: v_lshlrev_b32_e32 v19, 16, v34
+; GFX10-NEXT: v_lshlrev_b32_e32 v25, 16, v31
+; GFX10-NEXT: v_cvt_f64_f32_e32 v[11:12], v11
+; GFX10-NEXT: v_lshlrev_b32_e32 v81, 16, v28
+; GFX10-NEXT: v_lshlrev_b32_e32 v70, 16, v24
+; GFX10-NEXT: v_cvt_f64_f32_e32 v[19:20], v19
+; GFX10-NEXT: v_cvt_f64_f32_e32 v[31:32], v71
+; GFX10-NEXT: v_cvt_f64_f32_e32 v[35:36], v68
+; GFX10-NEXT: v_lshlrev_b32_e32 v64, 16, v16
+; GFX10-NEXT: v_cvt_f64_f32_e32 v[33:34], v70
+; GFX10-NEXT: v_lshlrev_b32_e32 v55, 16, v15
; GFX10-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:252
; GFX10-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:248
-; GFX10-NEXT: v_cvt_f64_f32_e32 v[1:2], v83
-; GFX10-NEXT: v_lshlrev_b32_e32 v68, 16, v17
-; GFX10-NEXT: v_cvt_f64_f32_e32 v[3:4], v3
-; GFX10-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:244
-; GFX10-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:240
-; GFX10-NEXT: v_cvt_f64_f32_e32 v[5:6], v81
-; GFX10-NEXT: buffer_store_dword v14, v0, s[0:3], 0 offen offset:236
-; GFX10-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:232
-; GFX10-NEXT: v_cvt_f64_f32_e32 v[13:14], v71
-; GFX10-NEXT: buffer_store_dword v22, v0, s[0:3], 0 offen offset:228
-; GFX10-NEXT: buffer_store_dword v21, v0, s[0:3], 0 offen offset:224
-; GFX10-NEXT: v_cvt_f64_f32_e32 v[21:22], v65
-; GFX10-NEXT: v_cvt_f64_f32_e32 v[64:65], v64
-; GFX10-NEXT: buffer_store_dword v30, v0, s[0:3], 0 offen offset:220
-; GFX10-NEXT: buffer_store_dword v29, v0, s[0:3], 0 offen offset:216
-; GFX10-NEXT: v_cvt_f64_f32_e32 v[29:30], v67
-; GFX10-NEXT: v_cvt_f64_f32_e32 v[66:67], v66
-; GFX10-NEXT: buffer_store_dword v38, v0, s[0:3], 0 offen offset:212
-; GFX10-NEXT: buffer_store_dword v37, v0, s[0:3], 0 offen offset:208
-; GFX10-NEXT: v_cvt_f64_f32_e32 v[37:38], v69
-; GFX10-NEXT: v_cvt_f64_f32_e32 v[17:18], v39
-; GFX10-NEXT: v_cvt_f64_f32_e32 v[68:69], v68
+; GFX10-NEXT: v_cvt_f64_f32_e32 v[1:2], v23
+; GFX10-NEXT: v_cvt_f64_f32_e32 v[15:16], v37
+; GFX10-NEXT: v_cvt_f64_f32_e32 v[17:18], v38
+; GFX10-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:244
+; GFX10-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:240
+; GFX10-NEXT: v_cvt_f64_f32_e32 v[3:4], v25
+; GFX10-NEXT: v_cvt_f64_f32_e32 v[37:38], v66
+; GFX10-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:236
+; GFX10-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:232
+; GFX10-NEXT: v_cvt_f64_f32_e32 v[5:6], v27
+; GFX10-NEXT: v_cvt_f64_f32_e32 v[23:24], v48
+; GFX10-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:228
+; GFX10-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:224
+; GFX10-NEXT: v_cvt_f64_f32_e32 v[7:8], v81
+; GFX10-NEXT: v_cvt_f64_f32_e32 v[25:26], v49
+; GFX10-NEXT: buffer_store_dword v12, v0, s[0:3], 0 offen offset:220
+; GFX10-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:216
+; GFX10-NEXT: v_cvt_f64_f32_e32 v[11:12], v80
+; GFX10-NEXT: buffer_store_dword v20, v0, s[0:3], 0 offen offset:212
+; GFX10-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen offset:208
+; GFX10-NEXT: v_cvt_f64_f32_e32 v[19:20], v69
+; GFX10-NEXT: v_cvt_f64_f32_e32 v[48:49], v64
+; GFX10-NEXT: v_cvt_f64_f32_e32 v[27:28], v50
+; GFX10-NEXT: v_cvt_f64_f32_e32 v[29:30], v51
+; GFX10-NEXT: v_cvt_f64_f32_e32 v[50:51], v54
; GFX10-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:204
; GFX10-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:200
-; GFX10-NEXT: buffer_store_dword v51, v0, s[0:3], 0 offen offset:196
-; GFX10-NEXT: buffer_store_dword v50, v0, s[0:3], 0 offen offset:192
+; GFX10-NEXT: v_cvt_f64_f32_e32 v[1:2], v67
+; GFX10-NEXT: v_cvt_f64_f32_e32 v[21:22], v39
+; GFX10-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:196
+; GFX10-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:192
+; GFX10-NEXT: v_cvt_f64_f32_e32 v[3:4], v65
; GFX10-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:188
; GFX10-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:184
-; GFX10-NEXT: buffer_store_dword v53, v0, s[0:3], 0 offen offset:180
-; GFX10-NEXT: buffer_store_dword v52, v0, s[0:3], 0 offen offset:176
-; GFX10-NEXT: buffer_store_dword v14, v0, s[0:3], 0 offen offset:172
-; GFX10-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:168
-; GFX10-NEXT: buffer_store_dword v55, v0, s[0:3], 0 offen offset:164
-; GFX10-NEXT: buffer_store_dword v54, v0, s[0:3], 0 offen offset:160
-; GFX10-NEXT: buffer_store_dword v22, v0, s[0:3], 0 offen offset:156
-; GFX10-NEXT: buffer_store_dword v21, v0, s[0:3], 0 offen offset:152
-; GFX10-NEXT: buffer_store_dword v65, v0, s[0:3], 0 offen offset:148
-; GFX10-NEXT: buffer_store_dword v64, v0, s[0:3], 0 offen offset:144
-; GFX10-NEXT: buffer_store_dword v30, v0, s[0:3], 0 offen offset:140
-; GFX10-NEXT: buffer_store_dword v29, v0, s[0:3], 0 offen offset:136
-; GFX10-NEXT: buffer_store_dword v67, v0, s[0:3], 0 offen offset:132
-; GFX10-NEXT: buffer_store_dword v66, v0, s[0:3], 0 offen offset:128
+; GFX10-NEXT: v_cvt_f64_f32_e32 v[5:6], v55
+; GFX10-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:180
+; GFX10-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:176
+; GFX10-NEXT: v_cvt_f64_f32_e32 v[7:8], v53
+; GFX10-NEXT: buffer_store_dword v12, v0, s[0:3], 0 offen offset:172
+; GFX10-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:168
+; GFX10-NEXT: v_cvt_f64_f32_e32 v[11:12], v52
+; GFX10-NEXT: buffer_store_dword v32, v0, s[0:3], 0 offen offset:164
+; GFX10-NEXT: buffer_store_dword v31, v0, s[0:3], 0 offen offset:160
+; GFX10-NEXT: buffer_store_dword v34, v0, s[0:3], 0 offen offset:156
+; GFX10-NEXT: buffer_store_dword v33, v0, s[0:3], 0 offen offset:152
+; GFX10-NEXT: buffer_store_dword v20, v0, s[0:3], 0 offen offset:148
+; GFX10-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen offset:144
+; GFX10-NEXT: buffer_store_dword v36, v0, s[0:3], 0 offen offset:140
+; GFX10-NEXT: buffer_store_dword v35, v0, s[0:3], 0 offen offset:136
+; GFX10-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:132
+; GFX10-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:128
; GFX10-NEXT: buffer_store_dword v38, v0, s[0:3], 0 offen offset:124
; GFX10-NEXT: buffer_store_dword v37, v0, s[0:3], 0 offen offset:120
-; GFX10-NEXT: buffer_store_dword v69, v0, s[0:3], 0 offen offset:116
-; GFX10-NEXT: buffer_store_dword v68, v0, s[0:3], 0 offen offset:112
+; GFX10-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:116
+; GFX10-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:112
; GFX10-NEXT: buffer_store_dword v49, v0, s[0:3], 0 offen offset:108
; GFX10-NEXT: buffer_store_dword v48, v0, s[0:3], 0 offen offset:104
-; GFX10-NEXT: buffer_store_dword v36, v0, s[0:3], 0 offen offset:100
-; GFX10-NEXT: buffer_store_dword v35, v0, s[0:3], 0 offen offset:96
-; GFX10-NEXT: buffer_store_dword v34, v0, s[0:3], 0 offen offset:92
-; GFX10-NEXT: buffer_store_dword v33, v0, s[0:3], 0 offen offset:88
-; GFX10-NEXT: buffer_store_dword v32, v0, s[0:3], 0 offen offset:84
-; GFX10-NEXT: buffer_store_dword v31, v0, s[0:3], 0 offen offset:80
-; GFX10-NEXT: buffer_store_dword v28, v0, s[0:3], 0 offen offset:76
-; GFX10-NEXT: buffer_store_dword v27, v0, s[0:3], 0 offen offset:72
-; GFX10-NEXT: buffer_store_dword v26, v0, s[0:3], 0 offen offset:68
-; GFX10-NEXT: buffer_store_dword v25, v0, s[0:3], 0 offen offset:64
-; GFX10-NEXT: buffer_store_dword v24, v0, s[0:3], 0 offen offset:60
-; GFX10-NEXT: buffer_store_dword v23, v0, s[0:3], 0 offen offset:56
-; GFX10-NEXT: buffer_store_dword v20, v0, s[0:3], 0 offen offset:52
-; GFX10-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen offset:48
-; GFX10-NEXT: buffer_store_dword v18, v0, s[0:3], 0 offen offset:44
-; GFX10-NEXT: buffer_store_dword v17, v0, s[0:3], 0 offen offset:40
-; GFX10-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen offset:36
-; GFX10-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:32
-; GFX10-NEXT: buffer_store_dword v12, v0, s[0:3], 0 offen offset:28
-; GFX10-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:24
-; GFX10-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:20
-; GFX10-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:16
-; GFX10-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:12
-; GFX10-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:8
-; GFX10-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:4
-; GFX10-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen
+; GFX10-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:100
+; GFX10-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:96
+; GFX10-NEXT: buffer_store_dword v51, v0, s[0:3], 0 offen offset:92
+; GFX10-NEXT: buffer_store_dword v50, v0, s[0:3], 0 offen offset:88
+; GFX10-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:84
+; GFX10-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:80
+; GFX10-NEXT: buffer_store_dword v12, v0, s[0:3], 0 offen offset:76
+; GFX10-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:72
+; GFX10-NEXT: buffer_store_dword v30, v0, s[0:3], 0 offen offset:68
+; GFX10-NEXT: buffer_store_dword v29, v0, s[0:3], 0 offen offset:64
+; GFX10-NEXT: buffer_store_dword v28, v0, s[0:3], 0 offen offset:60
+; GFX10-NEXT: buffer_store_dword v27, v0, s[0:3], 0 offen offset:56
+; GFX10-NEXT: buffer_store_dword v26, v0, s[0:3], 0 offen offset:52
+; GFX10-NEXT: buffer_store_dword v25, v0, s[0:3], 0 offen offset:48
+; GFX10-NEXT: buffer_store_dword v24, v0, s[0:3], 0 offen offset:44
+; GFX10-NEXT: buffer_store_dword v23, v0, s[0:3], 0 offen offset:40
+; GFX10-NEXT: buffer_store_dword v22, v0, s[0:3], 0 offen offset:36
+; GFX10-NEXT: buffer_store_dword v21, v0, s[0:3], 0 offen offset:32
+; GFX10-NEXT: buffer_store_dword v18, v0, s[0:3], 0 offen offset:28
+; GFX10-NEXT: buffer_store_dword v17, v0, s[0:3], 0 offen offset:24
+; GFX10-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen offset:20
+; GFX10-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:16
+; GFX10-NEXT: buffer_store_dword v14, v0, s[0:3], 0 offen offset:12
+; GFX10-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:8
+; GFX10-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:4
+; GFX10-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: global_extload_v32bf16_to_v32f64:
@@ -10059,55 +10057,47 @@ define <16 x bfloat> @v_fadd_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GCN-NEXT: v_add_f32_e32 v12, v12, v28
; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11
; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v27
-; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
-; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
-; GCN-NEXT: v_add_f32_e32 v11, v11, v27
; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10
; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v26
-; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
-; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
-; GCN-NEXT: v_add_f32_e32 v10, v10, v26
; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25
-; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
-; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
-; GCN-NEXT: v_add_f32_e32 v9, v9, v25
; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8
; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v24
-; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
-; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
-; GCN-NEXT: v_add_f32_e32 v8, v8, v24
; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23
-; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
-; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
-; GCN-NEXT: v_add_f32_e32 v7, v7, v23
; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22
-; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
-; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
-; GCN-NEXT: v_add_f32_e32 v6, v6, v22
; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21
-; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
-; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; GCN-NEXT: v_add_f32_e32 v5, v5, v21
-; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16
-; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17
-; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18
-; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19
; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20
+; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19
+; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18
+; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16
; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15
+; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
+; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; GCN-NEXT: v_add_f32_e32 v11, v11, v27
+; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32
+; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
+; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
+; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
+; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
+; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
+; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
+; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GCN-NEXT: v_add_f32_e32 v4, v4, v20
-; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32
-; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
@@ -10116,6 +10106,14 @@ define <16 x bfloat> @v_fadd_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; GCN-NEXT: v_add_f32_e32 v10, v10, v26
+; GCN-NEXT: v_add_f32_e32 v9, v9, v25
+; GCN-NEXT: v_add_f32_e32 v8, v8, v24
+; GCN-NEXT: v_add_f32_e32 v7, v7, v23
+; GCN-NEXT: v_add_f32_e32 v6, v6, v22
+; GCN-NEXT: v_add_f32_e32 v5, v5, v21
+; GCN-NEXT: v_add_f32_e32 v4, v4, v20
; GCN-NEXT: v_add_f32_e32 v3, v3, v19
; GCN-NEXT: v_add_f32_e32 v2, v2, v18
; GCN-NEXT: v_add_f32_e32 v1, v1, v17
@@ -10135,7 +10133,7 @@ define <16 x bfloat> @v_fadd_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v20
+; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v27
; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
; GCN-NEXT: v_add_f32_e32 v15, v15, v16
; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
@@ -10145,20 +10143,22 @@ define <16 x bfloat> @v_fadd_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX7-LABEL: v_fadd_v16bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11
+; GFX7-NEXT: v_mul_f32_e32 v27, 1.0, v27
+; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
+; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; GFX7-NEXT: v_add_f32_e32 v11, v11, v27
+; GFX7-NEXT: buffer_load_dword v27, off, s[0:3], s32
; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22
; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
-; GFX7-NEXT: v_add_f32_e32 v6, v6, v22
-; GFX7-NEXT: buffer_load_dword v22, off, s[0:3], s32
; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14
; GFX7-NEXT: v_mul_f32_e32 v30, 1.0, v30
; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13
; GFX7-NEXT: v_mul_f32_e32 v29, 1.0, v29
; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12
; GFX7-NEXT: v_mul_f32_e32 v28, 1.0, v28
-; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11
-; GFX7-NEXT: v_mul_f32_e32 v27, 1.0, v27
; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10
; GFX7-NEXT: v_mul_f32_e32 v26, 1.0, v26
; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9
@@ -10169,25 +10169,24 @@ define <16 x bfloat> @v_fadd_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX7-NEXT: v_mul_f32_e32 v23, 1.0, v23
; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15
; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GFX7-NEXT: v_add_f32_e32 v6, v6, v22
; GFX7-NEXT: v_mul_f32_e32 v21, 1.0, v21
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16
-; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18
-; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19
; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v20
+; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19
+; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17
+; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16
; GFX7-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
; GFX7-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
-; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
-; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
; GFX7-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
@@ -10212,7 +10211,6 @@ define <16 x bfloat> @v_fadd_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX7-NEXT: v_add_f32_e32 v14, v14, v30
; GFX7-NEXT: v_add_f32_e32 v13, v13, v29
; GFX7-NEXT: v_add_f32_e32 v12, v12, v28
-; GFX7-NEXT: v_add_f32_e32 v11, v11, v27
; GFX7-NEXT: v_add_f32_e32 v10, v10, v26
; GFX7-NEXT: v_add_f32_e32 v9, v9, v25
; GFX7-NEXT: v_add_f32_e32 v8, v8, v24
@@ -10231,7 +10229,7 @@ define <16 x bfloat> @v_fadd_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22
+; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v27
; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
; GFX7-NEXT: v_add_f32_e32 v15, v15, v22
; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
@@ -11689,10 +11687,10 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX8-NEXT: v_lshrrev_b32_e32 v8, 16, v8
; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v9
; GFX8-NEXT: v_lshrrev_b32_e32 v10, 16, v10
+; GFX8-NEXT: v_lshrrev_b32_e32 v11, 16, v11
; GFX8-NEXT: v_lshrrev_b32_e32 v16, 16, v30
; GFX8-NEXT: v_lshrrev_b32_e32 v13, 16, v13
; GFX8-NEXT: v_lshrrev_b32_e32 v12, 16, v12
-; GFX8-NEXT: v_lshrrev_b32_e32 v11, 16, v11
; GFX8-NEXT: v_alignbit_b32 v0, v0, v17, 16
; GFX8-NEXT: v_alignbit_b32 v1, v1, v18, 16
; GFX8-NEXT: v_alignbit_b32 v2, v2, v19, 16
@@ -11995,278 +11993,278 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: buffer_load_dword v32, off, s[0:3], s32
+; GFX10-NEXT: v_lshlrev_b32_e32 v37, 16, v28
+; GFX10-NEXT: v_lshlrev_b32_e32 v38, 16, v12
+; GFX10-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
+; GFX10-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
; GFX10-NEXT: v_lshlrev_b32_e32 v39, 16, v27
; GFX10-NEXT: v_lshlrev_b32_e32 v48, 16, v11
; GFX10-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
; GFX10-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
; GFX10-NEXT: v_lshlrev_b32_e32 v49, 16, v26
; GFX10-NEXT: v_lshlrev_b32_e32 v50, 16, v10
-; GFX10-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
-; GFX10-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
-; GFX10-NEXT: v_lshlrev_b32_e32 v37, 16, v28
-; GFX10-NEXT: v_lshlrev_b32_e32 v38, 16, v12
-; GFX10-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
-; GFX10-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
-; GFX10-NEXT: v_lshlrev_b32_e32 v51, 16, v25
-; GFX10-NEXT: v_lshlrev_b32_e32 v52, 16, v9
-; GFX10-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
-; GFX10-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
-; GFX10-NEXT: v_lshlrev_b32_e32 v53, 16, v24
-; GFX10-NEXT: v_lshlrev_b32_e32 v54, 16, v8
-; GFX10-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
-; GFX10-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
-; GFX10-NEXT: v_lshlrev_b32_e32 v55, 16, v23
-; GFX10-NEXT: v_lshlrev_b32_e32 v64, 16, v7
-; GFX10-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
-; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
-; GFX10-NEXT: v_lshlrev_b32_e32 v65, 16, v22
-; GFX10-NEXT: v_lshlrev_b32_e32 v66, 16, v6
-; GFX10-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
-; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
-; GFX10-NEXT: v_lshlrev_b32_e32 v67, 16, v21
-; GFX10-NEXT: v_lshlrev_b32_e32 v68, 16, v5
-; GFX10-NEXT: v_add_f32_e32 v39, v48, v39
-; GFX10-NEXT: v_add_f32_e32 v11, v11, v27
-; GFX10-NEXT: v_add_f32_e32 v49, v50, v49
-; GFX10-NEXT: v_add_f32_e32 v10, v10, v26
+; GFX10-NEXT: v_lshlrev_b32_e32 v33, 16, v30
+; GFX10-NEXT: v_lshlrev_b32_e32 v34, 16, v14
+; GFX10-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
+; GFX10-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
; GFX10-NEXT: v_lshlrev_b32_e32 v35, 16, v29
; GFX10-NEXT: v_lshlrev_b32_e32 v36, 16, v13
; GFX10-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
; GFX10-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
-; GFX10-NEXT: v_add_f32_e32 v37, v38, v37
-; GFX10-NEXT: v_lshlrev_b32_e32 v38, 16, v18
; GFX10-NEXT: v_add_f32_e32 v12, v12, v28
-; GFX10-NEXT: v_lshlrev_b32_e32 v28, 16, v2
+; GFX10-NEXT: v_lshlrev_b32_e32 v28, 16, v22
+; GFX10-NEXT: v_add_f32_e32 v39, v48, v39
+; GFX10-NEXT: v_lshlrev_b32_e32 v48, 16, v6
+; GFX10-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
+; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GFX10-NEXT: v_add_f32_e32 v11, v11, v27
+; GFX10-NEXT: v_lshlrev_b32_e32 v27, 16, v21
+; GFX10-NEXT: v_add_f32_e32 v49, v50, v49
+; GFX10-NEXT: v_lshlrev_b32_e32 v50, 16, v5
+; GFX10-NEXT: v_add_f32_e32 v33, v34, v33
+; GFX10-NEXT: v_add_f32_e32 v14, v14, v30
+; GFX10-NEXT: v_lshlrev_b32_e32 v30, 16, v24
+; GFX10-NEXT: v_add_f32_e32 v35, v36, v35
+; GFX10-NEXT: v_lshlrev_b32_e32 v36, 16, v8
+; GFX10-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
+; GFX10-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; GFX10-NEXT: v_add_f32_e32 v13, v13, v29
+; GFX10-NEXT: v_lshlrev_b32_e32 v29, 16, v23
+; GFX10-NEXT: v_add_f32_e32 v37, v38, v37
+; GFX10-NEXT: v_lshlrev_b32_e32 v38, 16, v7
+; GFX10-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
+; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GFX10-NEXT: v_add_f32_e32 v6, v6, v22
+; GFX10-NEXT: v_lshlrev_b32_e32 v22, 16, v16
+; GFX10-NEXT: v_add_f32_e32 v27, v50, v27
+; GFX10-NEXT: v_lshlrev_b32_e32 v50, 16, v0
+; GFX10-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
+; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
+; GFX10-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; GFX10-NEXT: v_lshlrev_b32_e32 v51, 16, v25
+; GFX10-NEXT: v_lshlrev_b32_e32 v34, 16, v9
+; GFX10-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
+; GFX10-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; GFX10-NEXT: v_add_f32_e32 v8, v8, v24
+; GFX10-NEXT: v_lshlrev_b32_e32 v24, 16, v18
+; GFX10-NEXT: v_add_f32_e32 v29, v38, v29
+; GFX10-NEXT: v_lshlrev_b32_e32 v38, 16, v2
; GFX10-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX10-NEXT: v_lshlrev_b32_e32 v48, 16, v17
-; GFX10-NEXT: v_lshlrev_b32_e32 v27, 16, v1
+; GFX10-NEXT: v_add_f32_e32 v7, v7, v23
+; GFX10-NEXT: v_lshlrev_b32_e32 v23, 16, v17
+; GFX10-NEXT: v_add_f32_e32 v28, v48, v28
+; GFX10-NEXT: v_lshlrev_b32_e32 v48, 16, v1
; GFX10-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX10-NEXT: v_lshlrev_b32_e32 v50, 16, v16
-; GFX10-NEXT: v_lshlrev_b32_e32 v26, 16, v0
-; GFX10-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
-; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT: v_add_f32_e32 v0, v0, v16
+; GFX10-NEXT: v_bfe_u32 v16, v33, 16, 1
+; GFX10-NEXT: v_add_f32_e32 v10, v10, v26
+; GFX10-NEXT: v_lshlrev_b32_e32 v26, 16, v20
+; GFX10-NEXT: v_add_f32_e32 v34, v34, v51
+; GFX10-NEXT: v_lshlrev_b32_e32 v51, 16, v4
+; GFX10-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
+; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GFX10-NEXT: v_add_f32_e32 v9, v9, v25
-; GFX10-NEXT: v_add_f32_e32 v25, v54, v53
-; GFX10-NEXT: v_add_f32_e32 v8, v8, v24
-; GFX10-NEXT: v_add_f32_e32 v24, v64, v55
-; GFX10-NEXT: v_add_f32_e32 v7, v7, v23
-; GFX10-NEXT: v_add_f32_e32 v23, v66, v65
-; GFX10-NEXT: v_add_f32_e32 v6, v6, v22
-; GFX10-NEXT: v_add_f32_e32 v22, v68, v67
-; GFX10-NEXT: v_bfe_u32 v53, v39, 16, 1
-; GFX10-NEXT: v_bfe_u32 v55, v11, 16, 1
-; GFX10-NEXT: v_bfe_u32 v65, v49, 16, 1
-; GFX10-NEXT: v_bfe_u32 v67, v10, 16, 1
-; GFX10-NEXT: v_lshlrev_b32_e32 v33, 16, v30
-; GFX10-NEXT: v_lshlrev_b32_e32 v34, 16, v14
-; GFX10-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
-; GFX10-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
-; GFX10-NEXT: v_add_f32_e32 v35, v36, v35
-; GFX10-NEXT: v_lshlrev_b32_e32 v36, 16, v19
-; GFX10-NEXT: v_add_f32_e32 v13, v13, v29
-; GFX10-NEXT: v_lshlrev_b32_e32 v29, 16, v3
+; GFX10-NEXT: v_lshlrev_b32_e32 v25, 16, v19
+; GFX10-NEXT: v_add_f32_e32 v30, v36, v30
+; GFX10-NEXT: v_lshlrev_b32_e32 v36, 16, v3
; GFX10-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX10-NEXT: v_add_f32_e32 v2, v2, v18
-; GFX10-NEXT: v_add_f32_e32 v18, v27, v48
+; GFX10-NEXT: v_add_f32_e32 v18, v48, v23
; GFX10-NEXT: v_add_f32_e32 v1, v1, v17
-; GFX10-NEXT: v_add_f32_e32 v17, v26, v50
-; GFX10-NEXT: v_add_f32_e32 v0, v0, v16
-; GFX10-NEXT: v_or_b32_e32 v54, 0x400000, v39
-; GFX10-NEXT: v_or_b32_e32 v64, 0x400000, v11
-; GFX10-NEXT: v_or_b32_e32 v66, 0x400000, v49
-; GFX10-NEXT: v_or_b32_e32 v68, 0x400000, v10
-; GFX10-NEXT: v_cmp_u_f32_e64 s9, v39, v39
-; GFX10-NEXT: v_add3_u32 v39, v53, v39, 0x7fff
-; GFX10-NEXT: v_cmp_u_f32_e64 s10, v11, v11
-; GFX10-NEXT: v_add3_u32 v11, v55, v11, 0x7fff
-; GFX10-NEXT: v_cmp_u_f32_e64 s11, v49, v49
-; GFX10-NEXT: v_add3_u32 v49, v65, v49, 0x7fff
-; GFX10-NEXT: v_cmp_u_f32_e64 s12, v10, v10
-; GFX10-NEXT: v_add3_u32 v10, v67, v10, 0x7fff
+; GFX10-NEXT: v_add_f32_e32 v17, v50, v22
+; GFX10-NEXT: v_or_b32_e32 v22, 0x400000, v33
+; GFX10-NEXT: v_bfe_u32 v23, v14, 16, 1
+; GFX10-NEXT: v_add3_u32 v16, v16, v33, 0x7fff
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33
; GFX10-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; GFX10-NEXT: v_add_f32_e32 v33, v34, v33
-; GFX10-NEXT: v_lshlrev_b32_e32 v34, 16, v20
-; GFX10-NEXT: v_add_f32_e32 v14, v14, v30
-; GFX10-NEXT: v_lshlrev_b32_e32 v30, 16, v4
-; GFX10-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
-; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX10-NEXT: v_add_f32_e32 v4, v4, v20
+; GFX10-NEXT: v_add_f32_e32 v20, v36, v25
; GFX10-NEXT: v_add_f32_e32 v3, v3, v19
-; GFX10-NEXT: v_add_f32_e32 v19, v28, v38
-; GFX10-NEXT: v_bfe_u32 v38, v37, 16, 1
-; GFX10-NEXT: v_bfe_u32 v50, v12, 16, 1
-; GFX10-NEXT: v_cndmask_b32_e64 v39, v39, v54, s9
-; GFX10-NEXT: v_bfe_u32 v54, v18, 16, 1
-; GFX10-NEXT: v_cndmask_b32_e64 v11, v11, v64, s10
-; GFX10-NEXT: v_bfe_u32 v64, v1, 16, 1
-; GFX10-NEXT: v_cndmask_b32_e64 v49, v49, v66, s11
-; GFX10-NEXT: v_bfe_u32 v66, v17, 16, 1
-; GFX10-NEXT: v_cndmask_b32_e64 v10, v10, v68, s12
-; GFX10-NEXT: v_bfe_u32 v68, v0, 16, 1
-; GFX10-NEXT: v_add_f32_e32 v51, v52, v51
+; GFX10-NEXT: v_add_f32_e32 v19, v38, v24
+; GFX10-NEXT: v_or_b32_e32 v24, 0x400000, v14
+; GFX10-NEXT: v_bfe_u32 v25, v35, 16, 1
+; GFX10-NEXT: v_add3_u32 v23, v23, v14, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v16, v16, v22, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14
; GFX10-NEXT: v_add_f32_e32 v5, v5, v21
-; GFX10-NEXT: v_add_f32_e32 v21, v30, v34
-; GFX10-NEXT: v_add_f32_e32 v4, v4, v20
-; GFX10-NEXT: v_add_f32_e32 v20, v29, v36
-; GFX10-NEXT: v_bfe_u32 v16, v33, 16, 1
-; GFX10-NEXT: v_bfe_u32 v27, v14, 16, 1
-; GFX10-NEXT: v_bfe_u32 v29, v35, 16, 1
-; GFX10-NEXT: v_bfe_u32 v34, v13, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v48, 0x400000, v37
-; GFX10-NEXT: v_or_b32_e32 v52, 0x400000, v12
-; GFX10-NEXT: v_cmp_u_f32_e64 s7, v37, v37
-; GFX10-NEXT: v_add3_u32 v37, v38, v37, 0x7fff
-; GFX10-NEXT: v_cmp_u_f32_e64 s8, v12, v12
-; GFX10-NEXT: v_add3_u32 v12, v50, v12, 0x7fff
-; GFX10-NEXT: v_cmp_u_f32_e64 s10, v18, v18
-; GFX10-NEXT: v_add3_u32 v54, v54, v18, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v18, 0x400000, v18
-; GFX10-NEXT: v_cmp_u_f32_e64 s11, v1, v1
-; GFX10-NEXT: v_add3_u32 v64, v64, v1, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v1, 0x400000, v1
-; GFX10-NEXT: v_cmp_u_f32_e64 s12, v17, v17
-; GFX10-NEXT: v_add3_u32 v66, v66, v17, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v17, 0x400000, v17
-; GFX10-NEXT: v_cmp_u_f32_e64 s22, v0, v0
-; GFX10-NEXT: v_add3_u32 v68, v68, v0, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v0, 0x400000, v0
-; GFX10-NEXT: v_or_b32_e32 v26, 0x400000, v33
-; GFX10-NEXT: v_or_b32_e32 v28, 0x400000, v14
-; GFX10-NEXT: v_or_b32_e32 v30, 0x400000, v35
-; GFX10-NEXT: v_or_b32_e32 v36, 0x400000, v13
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33
-; GFX10-NEXT: v_add3_u32 v16, v16, v33, 0x7fff
-; GFX10-NEXT: v_bfe_u32 v33, v51, 16, 1
-; GFX10-NEXT: v_cmp_u_f32_e64 s4, v14, v14
-; GFX10-NEXT: v_add3_u32 v14, v27, v14, 0x7fff
-; GFX10-NEXT: v_cmp_u_f32_e64 s5, v35, v35
-; GFX10-NEXT: v_add3_u32 v29, v29, v35, 0x7fff
-; GFX10-NEXT: v_cmp_u_f32_e64 s6, v13, v13
-; GFX10-NEXT: v_add3_u32 v13, v34, v13, 0x7fff
-; GFX10-NEXT: v_bfe_u32 v65, v24, 16, 1
-; GFX10-NEXT: v_cndmask_b32_e64 v37, v37, v48, s7
-; GFX10-NEXT: v_bfe_u32 v48, v19, 16, 1
-; GFX10-NEXT: v_cndmask_b32_e64 v12, v12, v52, s8
-; GFX10-NEXT: v_bfe_u32 v52, v2, 16, 1
-; GFX10-NEXT: v_cndmask_b32_e64 v18, v54, v18, s10
-; GFX10-NEXT: v_cndmask_b32_e64 v17, v66, v17, s12
-; GFX10-NEXT: v_cndmask_b32_e64 v0, v68, v0, s22
-; GFX10-NEXT: v_cndmask_b32_e64 v1, v64, v1, s11
+; GFX10-NEXT: v_add_f32_e32 v21, v51, v26
+; GFX10-NEXT: v_or_b32_e32 v26, 0x400000, v35
+; GFX10-NEXT: v_bfe_u32 v36, v13, 16, 1
+; GFX10-NEXT: v_add3_u32 v25, v25, v35, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v23, v23, v24, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35
+; GFX10-NEXT: v_or_b32_e32 v38, 0x400000, v13
+; GFX10-NEXT: v_bfe_u32 v48, v37, 16, 1
+; GFX10-NEXT: v_add3_u32 v36, v36, v13, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v50, 0x400000, v37
+; GFX10-NEXT: v_cndmask_b32_e32 v25, v25, v26, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
+; GFX10-NEXT: v_bfe_u32 v51, v12, 16, 1
+; GFX10-NEXT: v_add3_u32 v48, v48, v37, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v33, 0x400000, v12
+; GFX10-NEXT: v_bfe_u32 v22, v39, 16, 1
+; GFX10-NEXT: v_cndmask_b32_e32 v36, v36, v38, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37
+; GFX10-NEXT: v_add3_u32 v51, v51, v12, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v14, 0x400000, v39
+; GFX10-NEXT: v_bfe_u32 v24, v11, 16, 1
+; GFX10-NEXT: v_add3_u32 v22, v22, v39, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v48, v48, v50, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX10-NEXT: v_or_b32_e32 v35, 0x400000, v11
+; GFX10-NEXT: v_bfe_u32 v26, v49, 16, 1
+; GFX10-NEXT: v_add3_u32 v24, v24, v11, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v13, 0x400000, v49
+; GFX10-NEXT: v_cndmask_b32_e32 v33, v51, v33, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v39, v39
+; GFX10-NEXT: v_bfe_u32 v38, v10, 16, 1
+; GFX10-NEXT: v_add3_u32 v26, v26, v49, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v37, 0x400000, v10
+; GFX10-NEXT: v_bfe_u32 v50, v34, 16, 1
+; GFX10-NEXT: v_cndmask_b32_e32 v14, v22, v14, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX10-NEXT: v_add3_u32 v38, v38, v10, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v12, 0x400000, v34
+; GFX10-NEXT: v_bfe_u32 v51, v9, 16, 1
+; GFX10-NEXT: v_add3_u32 v50, v50, v34, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v24, v24, v35, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v49, v49
+; GFX10-NEXT: v_or_b32_e32 v39, 0x400000, v9
+; GFX10-NEXT: v_bfe_u32 v22, v30, 16, 1
+; GFX10-NEXT: v_add3_u32 v51, v51, v9, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v11, 0x400000, v30
+; GFX10-NEXT: v_cndmask_b32_e32 v13, v26, v13, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX10-NEXT: v_bfe_u32 v35, v8, 16, 1
+; GFX10-NEXT: v_add3_u32 v22, v22, v30, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v49, 0x400000, v8
+; GFX10-NEXT: v_bfe_u32 v26, v29, 16, 1
+; GFX10-NEXT: v_cndmask_b32_e32 v37, v38, v37, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34
+; GFX10-NEXT: v_add3_u32 v35, v35, v8, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v29
+; GFX10-NEXT: v_bfe_u32 v38, v7, 16, 1
+; GFX10-NEXT: v_add3_u32 v26, v26, v29, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v12, v50, v12, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX10-NEXT: v_or_b32_e32 v34, 0x400000, v7
+; GFX10-NEXT: v_bfe_u32 v50, v28, 16, 1
+; GFX10-NEXT: v_add3_u32 v38, v38, v7, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v28
+; GFX10-NEXT: v_cndmask_b32_e32 v39, v51, v39, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30
+; GFX10-NEXT: v_bfe_u32 v51, v6, 16, 1
+; GFX10-NEXT: v_add3_u32 v50, v50, v28, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v30, 0x400000, v6
; GFX10-NEXT: v_lshlrev_b32_e32 v31, 16, v15
+; GFX10-NEXT: v_cndmask_b32_e32 v11, v22, v11, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX10-NEXT: v_bfe_u32 v22, v27, 16, 1
+; GFX10-NEXT: v_add3_u32 v51, v51, v6, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v27
; GFX10-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
-; GFX10-NEXT: v_or_b32_e32 v27, 0x400000, v51
-; GFX10-NEXT: v_bfe_u32 v35, v9, 16, 1
-; GFX10-NEXT: v_bfe_u32 v38, v25, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v67, 0x400000, v24
-; GFX10-NEXT: v_cmp_u_f32_e64 s13, v51, v51
-; GFX10-NEXT: v_add3_u32 v33, v33, v51, 0x7fff
-; GFX10-NEXT: v_bfe_u32 v51, v7, 16, 1
-; GFX10-NEXT: v_cmp_u_f32_e64 s17, v24, v24
-; GFX10-NEXT: v_add3_u32 v24, v65, v24, 0x7fff
-; GFX10-NEXT: v_bfe_u32 v65, v6, 16, 1
-; GFX10-NEXT: v_cndmask_b32_e32 v16, v16, v26, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v35, v35, v49, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29
+; GFX10-NEXT: v_bfe_u32 v49, v5, 16, 1
+; GFX10-NEXT: v_add3_u32 v22, v22, v27, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v29, 0x400000, v5
+; GFX10-NEXT: v_cndmask_b32_e32 v10, v26, v10, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
; GFX10-NEXT: v_bfe_u32 v26, v21, 16, 1
-; GFX10-NEXT: v_cndmask_b32_e64 v14, v14, v28, s4
-; GFX10-NEXT: v_bfe_u32 v28, v4, 16, 1
-; GFX10-NEXT: v_cndmask_b32_e64 v29, v29, v30, s5
-; GFX10-NEXT: v_bfe_u32 v30, v20, 16, 1
-; GFX10-NEXT: v_cndmask_b32_e64 v13, v13, v36, s6
-; GFX10-NEXT: v_bfe_u32 v36, v3, 16, 1
-; GFX10-NEXT: v_cmp_u_f32_e64 s8, v19, v19
-; GFX10-NEXT: v_add3_u32 v48, v48, v19, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v19, 0x400000, v19
-; GFX10-NEXT: v_cmp_u_f32_e64 s9, v2, v2
-; GFX10-NEXT: v_add3_u32 v52, v52, v2, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v2
-; GFX10-NEXT: v_perm_b32 v0, v0, v17, 0x7060302
-; GFX10-NEXT: v_perm_b32 v1, v1, v18, 0x7060302
-; GFX10-NEXT: v_or_b32_e32 v34, 0x400000, v9
-; GFX10-NEXT: v_or_b32_e32 v50, 0x400000, v25
-; GFX10-NEXT: v_bfe_u32 v53, v8, 16, 1
-; GFX10-NEXT: v_cmp_u_f32_e64 s14, v9, v9
-; GFX10-NEXT: v_add3_u32 v9, v35, v9, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v35, 0x400000, v7
-; GFX10-NEXT: v_cmp_u_f32_e64 s15, v25, v25
-; GFX10-NEXT: v_add3_u32 v25, v38, v25, 0x7fff
-; GFX10-NEXT: v_bfe_u32 v38, v23, 16, 1
-; GFX10-NEXT: v_cmp_u_f32_e64 s18, v7, v7
-; GFX10-NEXT: v_add3_u32 v7, v51, v7, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v51, 0x400000, v6
-; GFX10-NEXT: v_cmp_u_f32_e64 s20, v6, v6
-; GFX10-NEXT: v_add3_u32 v6, v65, v6, 0x7fff
-; GFX10-NEXT: v_bfe_u32 v65, v5, 16, 1
-; GFX10-NEXT: v_cmp_u_f32_e64 s4, v21, v21
+; GFX10-NEXT: v_add3_u32 v49, v49, v5, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v21
+; GFX10-NEXT: v_cndmask_b32_e32 v34, v38, v34, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28
+; GFX10-NEXT: v_bfe_u32 v38, v4, 16, 1
; GFX10-NEXT: v_add3_u32 v26, v26, v21, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v21, 0x400000, v21
-; GFX10-NEXT: v_cmp_u_f32_e64 s5, v4, v4
-; GFX10-NEXT: v_add3_u32 v28, v28, v4, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v4
-; GFX10-NEXT: v_cmp_u_f32_e64 s6, v20, v20
-; GFX10-NEXT: v_add3_u32 v30, v30, v20, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v20, 0x400000, v20
-; GFX10-NEXT: v_cmp_u_f32_e64 s7, v3, v3
-; GFX10-NEXT: v_add3_u32 v36, v36, v3, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v3, 0x400000, v3
-; GFX10-NEXT: v_cndmask_b32_e64 v19, v48, v19, s8
-; GFX10-NEXT: v_cndmask_b32_e64 v2, v52, v2, s9
-; GFX10-NEXT: v_or_b32_e32 v55, 0x400000, v8
-; GFX10-NEXT: v_cmp_u_f32_e64 s16, v8, v8
-; GFX10-NEXT: v_add3_u32 v8, v53, v8, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v53, 0x400000, v23
-; GFX10-NEXT: v_cmp_u_f32_e64 s19, v23, v23
-; GFX10-NEXT: v_add3_u32 v23, v38, v23, 0x7fff
-; GFX10-NEXT: v_bfe_u32 v38, v22, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v28, 0x400000, v4
+; GFX10-NEXT: v_cndmask_b32_e32 v9, v50, v9, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX10-NEXT: v_bfe_u32 v50, v20, 16, 1
+; GFX10-NEXT: v_add3_u32 v38, v38, v4, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v20
+; GFX10-NEXT: v_cndmask_b32_e32 v30, v51, v30, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v27, v27
+; GFX10-NEXT: v_add3_u32 v50, v50, v20, 0x7fff
+; GFX10-NEXT: v_bfe_u32 v51, v3, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v27, 0x400000, v3
+; GFX10-NEXT: v_cndmask_b32_e32 v8, v22, v8, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX10-NEXT: v_add3_u32 v65, v65, v5, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v5
-; GFX10-NEXT: v_cndmask_b32_e64 v21, v26, v21, s4
-; GFX10-NEXT: v_cndmask_b32_e64 v4, v28, v4, s5
-; GFX10-NEXT: v_cndmask_b32_e64 v20, v30, v20, s6
-; GFX10-NEXT: v_cndmask_b32_e64 v3, v36, v3, s7
-; GFX10-NEXT: v_perm_b32 v2, v2, v19, 0x7060302
-; GFX10-NEXT: v_cmp_u_f32_e64 s21, v22, v22
-; GFX10-NEXT: v_add3_u32 v38, v38, v22, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v22, 0x400000, v22
-; GFX10-NEXT: v_cndmask_b32_e32 v5, v65, v5, vcc_lo
-; GFX10-NEXT: v_perm_b32 v3, v3, v20, 0x7060302
-; GFX10-NEXT: v_perm_b32 v4, v4, v21, 0x7060302
-; GFX10-NEXT: v_cndmask_b32_e64 v27, v33, v27, s13
-; GFX10-NEXT: v_cndmask_b32_e64 v9, v9, v34, s14
-; GFX10-NEXT: v_cndmask_b32_e64 v25, v25, v50, s15
-; GFX10-NEXT: v_cndmask_b32_e64 v8, v8, v55, s16
-; GFX10-NEXT: v_cndmask_b32_e64 v24, v24, v67, s17
-; GFX10-NEXT: v_cndmask_b32_e64 v7, v7, v35, s18
-; GFX10-NEXT: v_cndmask_b32_e64 v23, v23, v53, s19
-; GFX10-NEXT: v_cndmask_b32_e64 v6, v6, v51, s20
-; GFX10-NEXT: v_cndmask_b32_e64 v22, v38, v22, s21
-; GFX10-NEXT: v_perm_b32 v8, v8, v25, 0x7060302
-; GFX10-NEXT: v_perm_b32 v7, v7, v24, 0x7060302
-; GFX10-NEXT: v_perm_b32 v9, v9, v27, 0x7060302
-; GFX10-NEXT: v_perm_b32 v6, v6, v23, 0x7060302
-; GFX10-NEXT: v_perm_b32 v5, v5, v22, 0x7060302
-; GFX10-NEXT: v_perm_b32 v10, v10, v49, 0x7060302
-; GFX10-NEXT: v_perm_b32 v11, v11, v39, 0x7060302
-; GFX10-NEXT: v_perm_b32 v12, v12, v37, 0x7060302
-; GFX10-NEXT: v_perm_b32 v13, v13, v29, 0x7060302
-; GFX10-NEXT: v_perm_b32 v14, v14, v16, 0x7060302
+; GFX10-NEXT: v_bfe_u32 v22, v19, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v19
+; GFX10-NEXT: v_add3_u32 v51, v51, v3, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v29, v49, v29, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21
+; GFX10-NEXT: v_add3_u32 v22, v22, v19, 0x7fff
+; GFX10-NEXT: v_bfe_u32 v49, v2, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v21, 0x400000, v2
+; GFX10-NEXT: v_cndmask_b32_e32 v7, v26, v7, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX10-NEXT: v_bfe_u32 v26, v18, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v18
+; GFX10-NEXT: v_add3_u32 v49, v49, v2, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v28, v38, v28, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20
+; GFX10-NEXT: v_bfe_u32 v38, v1, 16, 1
+; GFX10-NEXT: v_add3_u32 v26, v26, v18, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v20, 0x400000, v1
+; GFX10-NEXT: v_cndmask_b32_e32 v6, v50, v6, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19
+; GFX10-NEXT: v_bfe_u32 v50, v17, 16, 1
+; GFX10-NEXT: v_add3_u32 v38, v38, v1, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v19, 0x400000, v17
+; GFX10-NEXT: v_cndmask_b32_e32 v5, v22, v5, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18
+; GFX10-NEXT: v_bfe_u32 v22, v0, 16, 1
+; GFX10-NEXT: v_add3_u32 v50, v50, v17, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v18, 0x400000, v0
+; GFX10-NEXT: v_cndmask_b32_e32 v4, v26, v4, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX10-NEXT: v_add3_u32 v22, v22, v0, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v1, v38, v20, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
+; GFX10-NEXT: v_perm_b32 v1, v1, v4, 0x7060302
+; GFX10-NEXT: v_cndmask_b32_e32 v17, v50, v19, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX10-NEXT: v_perm_b32 v4, v28, v7, 0x7060302
+; GFX10-NEXT: v_perm_b32 v7, v34, v10, 0x7060302
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v22, v18, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX10-NEXT: v_perm_b32 v0, v0, v17, 0x7060302
+; GFX10-NEXT: v_cndmask_b32_e32 v2, v49, v21, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX10-NEXT: v_perm_b32 v2, v2, v5, 0x7060302
+; GFX10-NEXT: v_cndmask_b32_e32 v3, v51, v27, vcc_lo
+; GFX10-NEXT: v_perm_b32 v5, v29, v8, 0x7060302
+; GFX10-NEXT: v_perm_b32 v8, v35, v11, 0x7060302
+; GFX10-NEXT: v_perm_b32 v3, v3, v6, 0x7060302
+; GFX10-NEXT: v_perm_b32 v6, v30, v9, 0x7060302
+; GFX10-NEXT: v_perm_b32 v9, v39, v12, 0x7060302
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v32
; GFX10-NEXT: v_and_b32_e32 v18, 0xffff0000, v32
; GFX10-NEXT: v_add_f32_e32 v17, v31, v17
; GFX10-NEXT: v_add_f32_e32 v15, v15, v18
-; GFX10-NEXT: v_bfe_u32 v18, v17, 16, 1
-; GFX10-NEXT: v_bfe_u32 v19, v15, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v20, 0x400000, v17
-; GFX10-NEXT: v_or_b32_e32 v21, 0x400000, v15
+; GFX10-NEXT: v_bfe_u32 v10, v17, 16, 1
+; GFX10-NEXT: v_bfe_u32 v11, v15, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v12, 0x400000, v17
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
-; GFX10-NEXT: v_cmp_u_f32_e64 s4, v15, v15
-; GFX10-NEXT: v_add3_u32 v17, v18, v17, 0x7fff
-; GFX10-NEXT: v_add3_u32 v15, v19, v15, 0x7fff
-; GFX10-NEXT: v_cndmask_b32_e32 v17, v17, v20, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e64 v15, v15, v21, s4
+; GFX10-NEXT: v_or_b32_e32 v19, 0x400000, v15
+; GFX10-NEXT: v_add3_u32 v18, v10, v17, 0x7fff
+; GFX10-NEXT: v_add3_u32 v11, v11, v15, 0x7fff
+; GFX10-NEXT: v_perm_b32 v10, v37, v13, 0x7060302
+; GFX10-NEXT: v_perm_b32 v13, v36, v25, 0x7060302
+; GFX10-NEXT: v_cndmask_b32_e32 v17, v18, v12, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15
+; GFX10-NEXT: v_perm_b32 v12, v33, v48, 0x7060302
+; GFX10-NEXT: v_cndmask_b32_e32 v15, v11, v19, vcc_lo
+; GFX10-NEXT: v_perm_b32 v11, v24, v14, 0x7060302
+; GFX10-NEXT: v_perm_b32 v14, v23, v16, 0x7060302
; GFX10-NEXT: v_perm_b32 v15, v15, v17, 0x7060302
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
@@ -14496,55 +14494,47 @@ define <16 x bfloat> @v_fmul_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GCN-NEXT: v_mul_f32_e32 v12, v12, v28
; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11
; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v27
-; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
-; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
-; GCN-NEXT: v_mul_f32_e32 v11, v11, v27
; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10
; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v26
-; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
-; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
-; GCN-NEXT: v_mul_f32_e32 v10, v10, v26
; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25
-; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
-; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
-; GCN-NEXT: v_mul_f32_e32 v9, v9, v25
; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8
; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v24
-; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
-; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
-; GCN-NEXT: v_mul_f32_e32 v8, v8, v24
; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23
-; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
-; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
-; GCN-NEXT: v_mul_f32_e32 v7, v7, v23
; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22
-; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
-; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
-; GCN-NEXT: v_mul_f32_e32 v6, v6, v22
; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21
-; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
-; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; GCN-NEXT: v_mul_f32_e32 v5, v5, v21
-; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16
-; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17
-; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18
-; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19
; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20
+; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19
+; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18
+; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16
; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15
+; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
+; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; GCN-NEXT: v_mul_f32_e32 v11, v11, v27
+; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32
+; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
+; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
+; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
+; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
+; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
+; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
+; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GCN-NEXT: v_mul_f32_e32 v4, v4, v20
-; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32
-; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
@@ -14553,6 +14543,14 @@ define <16 x bfloat> @v_fmul_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; GCN-NEXT: v_mul_f32_e32 v10, v10, v26
+; GCN-NEXT: v_mul_f32_e32 v9, v9, v25
+; GCN-NEXT: v_mul_f32_e32 v8, v8, v24
+; GCN-NEXT: v_mul_f32_e32 v7, v7, v23
+; GCN-NEXT: v_mul_f32_e32 v6, v6, v22
+; GCN-NEXT: v_mul_f32_e32 v5, v5, v21
+; GCN-NEXT: v_mul_f32_e32 v4, v4, v20
; GCN-NEXT: v_mul_f32_e32 v3, v3, v19
; GCN-NEXT: v_mul_f32_e32 v2, v2, v18
; GCN-NEXT: v_mul_f32_e32 v1, v1, v17
@@ -14572,7 +14570,7 @@ define <16 x bfloat> @v_fmul_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v20
+; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v27
; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
; GCN-NEXT: v_mul_f32_e32 v15, v15, v16
; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
@@ -14582,20 +14580,22 @@ define <16 x bfloat> @v_fmul_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX7-LABEL: v_fmul_v16bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11
+; GFX7-NEXT: v_mul_f32_e32 v27, 1.0, v27
+; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
+; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; GFX7-NEXT: v_mul_f32_e32 v11, v11, v27
+; GFX7-NEXT: buffer_load_dword v27, off, s[0:3], s32
; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22
; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
-; GFX7-NEXT: v_mul_f32_e32 v6, v6, v22
-; GFX7-NEXT: buffer_load_dword v22, off, s[0:3], s32
; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14
; GFX7-NEXT: v_mul_f32_e32 v30, 1.0, v30
; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13
; GFX7-NEXT: v_mul_f32_e32 v29, 1.0, v29
; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12
; GFX7-NEXT: v_mul_f32_e32 v28, 1.0, v28
-; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11
-; GFX7-NEXT: v_mul_f32_e32 v27, 1.0, v27
; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10
; GFX7-NEXT: v_mul_f32_e32 v26, 1.0, v26
; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9
@@ -14606,25 +14606,24 @@ define <16 x bfloat> @v_fmul_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX7-NEXT: v_mul_f32_e32 v23, 1.0, v23
; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15
; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GFX7-NEXT: v_mul_f32_e32 v6, v6, v22
; GFX7-NEXT: v_mul_f32_e32 v21, 1.0, v21
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16
-; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18
-; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19
; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v20
+; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19
+; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17
+; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16
; GFX7-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
; GFX7-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
-; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
-; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
; GFX7-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
@@ -14649,7 +14648,6 @@ define <16 x bfloat> @v_fmul_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX7-NEXT: v_mul_f32_e32 v14, v14, v30
; GFX7-NEXT: v_mul_f32_e32 v13, v13, v29
; GFX7-NEXT: v_mul_f32_e32 v12, v12, v28
-; GFX7-NEXT: v_mul_f32_e32 v11, v11, v27
; GFX7-NEXT: v_mul_f32_e32 v10, v10, v26
; GFX7-NEXT: v_mul_f32_e32 v9, v9, v25
; GFX7-NEXT: v_mul_f32_e32 v8, v8, v24
@@ -14668,7 +14666,7 @@ define <16 x bfloat> @v_fmul_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22
+; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v27
; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
; GFX7-NEXT: v_mul_f32_e32 v15, v15, v22
; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
@@ -16126,10 +16124,10 @@ define <32 x bfloat> @v_fmul_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX8-NEXT: v_lshrrev_b32_e32 v8, 16, v8
; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v9
; GFX8-NEXT: v_lshrrev_b32_e32 v10, 16, v10
+; GFX8-NEXT: v_lshrrev_b32_e32 v11, 16, v11
; GFX8-NEXT: v_lshrrev_b32_e32 v16, 16, v30
; GFX8-NEXT: v_lshrrev_b32_e32 v13, 16, v13
; GFX8-NEXT: v_lshrrev_b32_e32 v12, 16, v12
-; GFX8-NEXT: v_lshrrev_b32_e32 v11, 16, v11
; GFX8-NEXT: v_alignbit_b32 v0, v0, v17, 16
; GFX8-NEXT: v_alignbit_b32 v1, v1, v18, 16
; GFX8-NEXT: v_alignbit_b32 v2, v2, v19, 16
@@ -16432,278 +16430,278 @@ define <32 x bfloat> @v_fmul_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: buffer_load_dword v32, off, s[0:3], s32
+; GFX10-NEXT: v_lshlrev_b32_e32 v37, 16, v28
+; GFX10-NEXT: v_lshlrev_b32_e32 v38, 16, v12
+; GFX10-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
+; GFX10-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
; GFX10-NEXT: v_lshlrev_b32_e32 v39, 16, v27
; GFX10-NEXT: v_lshlrev_b32_e32 v48, 16, v11
; GFX10-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
; GFX10-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
; GFX10-NEXT: v_lshlrev_b32_e32 v49, 16, v26
; GFX10-NEXT: v_lshlrev_b32_e32 v50, 16, v10
-; GFX10-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
-; GFX10-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
-; GFX10-NEXT: v_lshlrev_b32_e32 v37, 16, v28
-; GFX10-NEXT: v_lshlrev_b32_e32 v38, 16, v12
-; GFX10-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
-; GFX10-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
-; GFX10-NEXT: v_lshlrev_b32_e32 v51, 16, v25
-; GFX10-NEXT: v_lshlrev_b32_e32 v52, 16, v9
-; GFX10-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
-; GFX10-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
-; GFX10-NEXT: v_lshlrev_b32_e32 v53, 16, v24
-; GFX10-NEXT: v_lshlrev_b32_e32 v54, 16, v8
-; GFX10-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
-; GFX10-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
-; GFX10-NEXT: v_lshlrev_b32_e32 v55, 16, v23
-; GFX10-NEXT: v_lshlrev_b32_e32 v64, 16, v7
-; GFX10-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
-; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
-; GFX10-NEXT: v_lshlrev_b32_e32 v65, 16, v22
-; GFX10-NEXT: v_lshlrev_b32_e32 v66, 16, v6
-; GFX10-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
-; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
-; GFX10-NEXT: v_lshlrev_b32_e32 v67, 16, v21
-; GFX10-NEXT: v_lshlrev_b32_e32 v68, 16, v5
-; GFX10-NEXT: v_mul_f32_e32 v39, v48, v39
-; GFX10-NEXT: v_mul_f32_e32 v11, v11, v27
-; GFX10-NEXT: v_mul_f32_e32 v49, v50, v49
-; GFX10-NEXT: v_mul_f32_e32 v10, v10, v26
+; GFX10-NEXT: v_lshlrev_b32_e32 v33, 16, v30
+; GFX10-NEXT: v_lshlrev_b32_e32 v34, 16, v14
+; GFX10-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
+; GFX10-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
; GFX10-NEXT: v_lshlrev_b32_e32 v35, 16, v29
; GFX10-NEXT: v_lshlrev_b32_e32 v36, 16, v13
; GFX10-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
; GFX10-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
-; GFX10-NEXT: v_mul_f32_e32 v37, v38, v37
-; GFX10-NEXT: v_lshlrev_b32_e32 v38, 16, v18
; GFX10-NEXT: v_mul_f32_e32 v12, v12, v28
-; GFX10-NEXT: v_lshlrev_b32_e32 v28, 16, v2
+; GFX10-NEXT: v_lshlrev_b32_e32 v28, 16, v22
+; GFX10-NEXT: v_mul_f32_e32 v39, v48, v39
+; GFX10-NEXT: v_lshlrev_b32_e32 v48, 16, v6
+; GFX10-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
+; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GFX10-NEXT: v_mul_f32_e32 v11, v11, v27
+; GFX10-NEXT: v_lshlrev_b32_e32 v27, 16, v21
+; GFX10-NEXT: v_mul_f32_e32 v49, v50, v49
+; GFX10-NEXT: v_lshlrev_b32_e32 v50, 16, v5
+; GFX10-NEXT: v_mul_f32_e32 v33, v34, v33
+; GFX10-NEXT: v_mul_f32_e32 v14, v14, v30
+; GFX10-NEXT: v_lshlrev_b32_e32 v30, 16, v24
+; GFX10-NEXT: v_mul_f32_e32 v35, v36, v35
+; GFX10-NEXT: v_lshlrev_b32_e32 v36, 16, v8
+; GFX10-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
+; GFX10-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; GFX10-NEXT: v_mul_f32_e32 v13, v13, v29
+; GFX10-NEXT: v_lshlrev_b32_e32 v29, 16, v23
+; GFX10-NEXT: v_mul_f32_e32 v37, v38, v37
+; GFX10-NEXT: v_lshlrev_b32_e32 v38, 16, v7
+; GFX10-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
+; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GFX10-NEXT: v_mul_f32_e32 v6, v6, v22
+; GFX10-NEXT: v_lshlrev_b32_e32 v22, 16, v16
+; GFX10-NEXT: v_mul_f32_e32 v27, v50, v27
+; GFX10-NEXT: v_lshlrev_b32_e32 v50, 16, v0
+; GFX10-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
+; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
+; GFX10-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; GFX10-NEXT: v_lshlrev_b32_e32 v51, 16, v25
+; GFX10-NEXT: v_lshlrev_b32_e32 v34, 16, v9
+; GFX10-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
+; GFX10-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; GFX10-NEXT: v_mul_f32_e32 v8, v8, v24
+; GFX10-NEXT: v_lshlrev_b32_e32 v24, 16, v18
+; GFX10-NEXT: v_mul_f32_e32 v29, v38, v29
+; GFX10-NEXT: v_lshlrev_b32_e32 v38, 16, v2
; GFX10-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX10-NEXT: v_lshlrev_b32_e32 v48, 16, v17
-; GFX10-NEXT: v_lshlrev_b32_e32 v27, 16, v1
+; GFX10-NEXT: v_mul_f32_e32 v7, v7, v23
+; GFX10-NEXT: v_lshlrev_b32_e32 v23, 16, v17
+; GFX10-NEXT: v_mul_f32_e32 v28, v48, v28
+; GFX10-NEXT: v_lshlrev_b32_e32 v48, 16, v1
; GFX10-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX10-NEXT: v_lshlrev_b32_e32 v50, 16, v16
-; GFX10-NEXT: v_lshlrev_b32_e32 v26, 16, v0
-; GFX10-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
-; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT: v_mul_f32_e32 v0, v0, v16
+; GFX10-NEXT: v_bfe_u32 v16, v33, 16, 1
+; GFX10-NEXT: v_mul_f32_e32 v10, v10, v26
+; GFX10-NEXT: v_lshlrev_b32_e32 v26, 16, v20
+; GFX10-NEXT: v_mul_f32_e32 v34, v34, v51
+; GFX10-NEXT: v_lshlrev_b32_e32 v51, 16, v4
+; GFX10-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
+; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GFX10-NEXT: v_mul_f32_e32 v9, v9, v25
-; GFX10-NEXT: v_mul_f32_e32 v25, v54, v53
-; GFX10-NEXT: v_mul_f32_e32 v8, v8, v24
-; GFX10-NEXT: v_mul_f32_e32 v24, v64, v55
-; GFX10-NEXT: v_mul_f32_e32 v7, v7, v23
-; GFX10-NEXT: v_mul_f32_e32 v23, v66, v65
-; GFX10-NEXT: v_mul_f32_e32 v6, v6, v22
-; GFX10-NEXT: v_mul_f32_e32 v22, v68, v67
-; GFX10-NEXT: v_bfe_u32 v53, v39, 16, 1
-; GFX10-NEXT: v_bfe_u32 v55, v11, 16, 1
-; GFX10-NEXT: v_bfe_u32 v65, v49, 16, 1
-; GFX10-NEXT: v_bfe_u32 v67, v10, 16, 1
-; GFX10-NEXT: v_lshlrev_b32_e32 v33, 16, v30
-; GFX10-NEXT: v_lshlrev_b32_e32 v34, 16, v14
-; GFX10-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
-; GFX10-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
-; GFX10-NEXT: v_mul_f32_e32 v35, v36, v35
-; GFX10-NEXT: v_lshlrev_b32_e32 v36, 16, v19
-; GFX10-NEXT: v_mul_f32_e32 v13, v13, v29
-; GFX10-NEXT: v_lshlrev_b32_e32 v29, 16, v3
+; GFX10-NEXT: v_lshlrev_b32_e32 v25, 16, v19
+; GFX10-NEXT: v_mul_f32_e32 v30, v36, v30
+; GFX10-NEXT: v_lshlrev_b32_e32 v36, 16, v3
; GFX10-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX10-NEXT: v_mul_f32_e32 v2, v2, v18
-; GFX10-NEXT: v_mul_f32_e32 v18, v27, v48
+; GFX10-NEXT: v_mul_f32_e32 v18, v48, v23
; GFX10-NEXT: v_mul_f32_e32 v1, v1, v17
-; GFX10-NEXT: v_mul_f32_e32 v17, v26, v50
-; GFX10-NEXT: v_mul_f32_e32 v0, v0, v16
-; GFX10-NEXT: v_or_b32_e32 v54, 0x400000, v39
-; GFX10-NEXT: v_or_b32_e32 v64, 0x400000, v11
-; GFX10-NEXT: v_or_b32_e32 v66, 0x400000, v49
-; GFX10-NEXT: v_or_b32_e32 v68, 0x400000, v10
-; GFX10-NEXT: v_cmp_u_f32_e64 s9, v39, v39
-; GFX10-NEXT: v_add3_u32 v39, v53, v39, 0x7fff
-; GFX10-NEXT: v_cmp_u_f32_e64 s10, v11, v11
-; GFX10-NEXT: v_add3_u32 v11, v55, v11, 0x7fff
-; GFX10-NEXT: v_cmp_u_f32_e64 s11, v49, v49
-; GFX10-NEXT: v_add3_u32 v49, v65, v49, 0x7fff
-; GFX10-NEXT: v_cmp_u_f32_e64 s12, v10, v10
-; GFX10-NEXT: v_add3_u32 v10, v67, v10, 0x7fff
+; GFX10-NEXT: v_mul_f32_e32 v17, v50, v22
+; GFX10-NEXT: v_or_b32_e32 v22, 0x400000, v33
+; GFX10-NEXT: v_bfe_u32 v23, v14, 16, 1
+; GFX10-NEXT: v_add3_u32 v16, v16, v33, 0x7fff
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33
; GFX10-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; GFX10-NEXT: v_mul_f32_e32 v33, v34, v33
-; GFX10-NEXT: v_lshlrev_b32_e32 v34, 16, v20
-; GFX10-NEXT: v_mul_f32_e32 v14, v14, v30
-; GFX10-NEXT: v_lshlrev_b32_e32 v30, 16, v4
-; GFX10-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
-; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX10-NEXT: v_mul_f32_e32 v4, v4, v20
+; GFX10-NEXT: v_mul_f32_e32 v20, v36, v25
; GFX10-NEXT: v_mul_f32_e32 v3, v3, v19
-; GFX10-NEXT: v_mul_f32_e32 v19, v28, v38
-; GFX10-NEXT: v_bfe_u32 v38, v37, 16, 1
-; GFX10-NEXT: v_bfe_u32 v50, v12, 16, 1
-; GFX10-NEXT: v_cndmask_b32_e64 v39, v39, v54, s9
-; GFX10-NEXT: v_bfe_u32 v54, v18, 16, 1
-; GFX10-NEXT: v_cndmask_b32_e64 v11, v11, v64, s10
-; GFX10-NEXT: v_bfe_u32 v64, v1, 16, 1
-; GFX10-NEXT: v_cndmask_b32_e64 v49, v49, v66, s11
-; GFX10-NEXT: v_bfe_u32 v66, v17, 16, 1
-; GFX10-NEXT: v_cndmask_b32_e64 v10, v10, v68, s12
-; GFX10-NEXT: v_bfe_u32 v68, v0, 16, 1
-; GFX10-NEXT: v_mul_f32_e32 v51, v52, v51
+; GFX10-NEXT: v_mul_f32_e32 v19, v38, v24
+; GFX10-NEXT: v_or_b32_e32 v24, 0x400000, v14
+; GFX10-NEXT: v_bfe_u32 v25, v35, 16, 1
+; GFX10-NEXT: v_add3_u32 v23, v23, v14, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v16, v16, v22, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14
; GFX10-NEXT: v_mul_f32_e32 v5, v5, v21
-; GFX10-NEXT: v_mul_f32_e32 v21, v30, v34
-; GFX10-NEXT: v_mul_f32_e32 v4, v4, v20
-; GFX10-NEXT: v_mul_f32_e32 v20, v29, v36
-; GFX10-NEXT: v_bfe_u32 v16, v33, 16, 1
-; GFX10-NEXT: v_bfe_u32 v27, v14, 16, 1
-; GFX10-NEXT: v_bfe_u32 v29, v35, 16, 1
-; GFX10-NEXT: v_bfe_u32 v34, v13, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v48, 0x400000, v37
-; GFX10-NEXT: v_or_b32_e32 v52, 0x400000, v12
-; GFX10-NEXT: v_cmp_u_f32_e64 s7, v37, v37
-; GFX10-NEXT: v_add3_u32 v37, v38, v37, 0x7fff
-; GFX10-NEXT: v_cmp_u_f32_e64 s8, v12, v12
-; GFX10-NEXT: v_add3_u32 v12, v50, v12, 0x7fff
-; GFX10-NEXT: v_cmp_u_f32_e64 s10, v18, v18
-; GFX10-NEXT: v_add3_u32 v54, v54, v18, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v18, 0x400000, v18
-; GFX10-NEXT: v_cmp_u_f32_e64 s11, v1, v1
-; GFX10-NEXT: v_add3_u32 v64, v64, v1, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v1, 0x400000, v1
-; GFX10-NEXT: v_cmp_u_f32_e64 s12, v17, v17
-; GFX10-NEXT: v_add3_u32 v66, v66, v17, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v17, 0x400000, v17
-; GFX10-NEXT: v_cmp_u_f32_e64 s22, v0, v0
-; GFX10-NEXT: v_add3_u32 v68, v68, v0, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v0, 0x400000, v0
-; GFX10-NEXT: v_or_b32_e32 v26, 0x400000, v33
-; GFX10-NEXT: v_or_b32_e32 v28, 0x400000, v14
-; GFX10-NEXT: v_or_b32_e32 v30, 0x400000, v35
-; GFX10-NEXT: v_or_b32_e32 v36, 0x400000, v13
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33
-; GFX10-NEXT: v_add3_u32 v16, v16, v33, 0x7fff
-; GFX10-NEXT: v_bfe_u32 v33, v51, 16, 1
-; GFX10-NEXT: v_cmp_u_f32_e64 s4, v14, v14
-; GFX10-NEXT: v_add3_u32 v14, v27, v14, 0x7fff
-; GFX10-NEXT: v_cmp_u_f32_e64 s5, v35, v35
-; GFX10-NEXT: v_add3_u32 v29, v29, v35, 0x7fff
-; GFX10-NEXT: v_cmp_u_f32_e64 s6, v13, v13
-; GFX10-NEXT: v_add3_u32 v13, v34, v13, 0x7fff
-; GFX10-NEXT: v_bfe_u32 v65, v24, 16, 1
-; GFX10-NEXT: v_cndmask_b32_e64 v37, v37, v48, s7
-; GFX10-NEXT: v_bfe_u32 v48, v19, 16, 1
-; GFX10-NEXT: v_cndmask_b32_e64 v12, v12, v52, s8
-; GFX10-NEXT: v_bfe_u32 v52, v2, 16, 1
-; GFX10-NEXT: v_cndmask_b32_e64 v18, v54, v18, s10
-; GFX10-NEXT: v_cndmask_b32_e64 v17, v66, v17, s12
-; GFX10-NEXT: v_cndmask_b32_e64 v0, v68, v0, s22
-; GFX10-NEXT: v_cndmask_b32_e64 v1, v64, v1, s11
+; GFX10-NEXT: v_mul_f32_e32 v21, v51, v26
+; GFX10-NEXT: v_or_b32_e32 v26, 0x400000, v35
+; GFX10-NEXT: v_bfe_u32 v36, v13, 16, 1
+; GFX10-NEXT: v_add3_u32 v25, v25, v35, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v23, v23, v24, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35
+; GFX10-NEXT: v_or_b32_e32 v38, 0x400000, v13
+; GFX10-NEXT: v_bfe_u32 v48, v37, 16, 1
+; GFX10-NEXT: v_add3_u32 v36, v36, v13, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v50, 0x400000, v37
+; GFX10-NEXT: v_cndmask_b32_e32 v25, v25, v26, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
+; GFX10-NEXT: v_bfe_u32 v51, v12, 16, 1
+; GFX10-NEXT: v_add3_u32 v48, v48, v37, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v33, 0x400000, v12
+; GFX10-NEXT: v_bfe_u32 v22, v39, 16, 1
+; GFX10-NEXT: v_cndmask_b32_e32 v36, v36, v38, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37
+; GFX10-NEXT: v_add3_u32 v51, v51, v12, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v14, 0x400000, v39
+; GFX10-NEXT: v_bfe_u32 v24, v11, 16, 1
+; GFX10-NEXT: v_add3_u32 v22, v22, v39, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v48, v48, v50, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX10-NEXT: v_or_b32_e32 v35, 0x400000, v11
+; GFX10-NEXT: v_bfe_u32 v26, v49, 16, 1
+; GFX10-NEXT: v_add3_u32 v24, v24, v11, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v13, 0x400000, v49
+; GFX10-NEXT: v_cndmask_b32_e32 v33, v51, v33, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v39, v39
+; GFX10-NEXT: v_bfe_u32 v38, v10, 16, 1
+; GFX10-NEXT: v_add3_u32 v26, v26, v49, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v37, 0x400000, v10
+; GFX10-NEXT: v_bfe_u32 v50, v34, 16, 1
+; GFX10-NEXT: v_cndmask_b32_e32 v14, v22, v14, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX10-NEXT: v_add3_u32 v38, v38, v10, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v12, 0x400000, v34
+; GFX10-NEXT: v_bfe_u32 v51, v9, 16, 1
+; GFX10-NEXT: v_add3_u32 v50, v50, v34, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v24, v24, v35, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v49, v49
+; GFX10-NEXT: v_or_b32_e32 v39, 0x400000, v9
+; GFX10-NEXT: v_bfe_u32 v22, v30, 16, 1
+; GFX10-NEXT: v_add3_u32 v51, v51, v9, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v11, 0x400000, v30
+; GFX10-NEXT: v_cndmask_b32_e32 v13, v26, v13, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX10-NEXT: v_bfe_u32 v35, v8, 16, 1
+; GFX10-NEXT: v_add3_u32 v22, v22, v30, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v49, 0x400000, v8
+; GFX10-NEXT: v_bfe_u32 v26, v29, 16, 1
+; GFX10-NEXT: v_cndmask_b32_e32 v37, v38, v37, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34
+; GFX10-NEXT: v_add3_u32 v35, v35, v8, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v29
+; GFX10-NEXT: v_bfe_u32 v38, v7, 16, 1
+; GFX10-NEXT: v_add3_u32 v26, v26, v29, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v12, v50, v12, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX10-NEXT: v_or_b32_e32 v34, 0x400000, v7
+; GFX10-NEXT: v_bfe_u32 v50, v28, 16, 1
+; GFX10-NEXT: v_add3_u32 v38, v38, v7, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v28
+; GFX10-NEXT: v_cndmask_b32_e32 v39, v51, v39, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30
+; GFX10-NEXT: v_bfe_u32 v51, v6, 16, 1
+; GFX10-NEXT: v_add3_u32 v50, v50, v28, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v30, 0x400000, v6
; GFX10-NEXT: v_lshlrev_b32_e32 v31, 16, v15
+; GFX10-NEXT: v_cndmask_b32_e32 v11, v22, v11, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX10-NEXT: v_bfe_u32 v22, v27, 16, 1
+; GFX10-NEXT: v_add3_u32 v51, v51, v6, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v27
; GFX10-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
-; GFX10-NEXT: v_or_b32_e32 v27, 0x400000, v51
-; GFX10-NEXT: v_bfe_u32 v35, v9, 16, 1
-; GFX10-NEXT: v_bfe_u32 v38, v25, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v67, 0x400000, v24
-; GFX10-NEXT: v_cmp_u_f32_e64 s13, v51, v51
-; GFX10-NEXT: v_add3_u32 v33, v33, v51, 0x7fff
-; GFX10-NEXT: v_bfe_u32 v51, v7, 16, 1
-; GFX10-NEXT: v_cmp_u_f32_e64 s17, v24, v24
-; GFX10-NEXT: v_add3_u32 v24, v65, v24, 0x7fff
-; GFX10-NEXT: v_bfe_u32 v65, v6, 16, 1
-; GFX10-NEXT: v_cndmask_b32_e32 v16, v16, v26, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v35, v35, v49, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29
+; GFX10-NEXT: v_bfe_u32 v49, v5, 16, 1
+; GFX10-NEXT: v_add3_u32 v22, v22, v27, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v29, 0x400000, v5
+; GFX10-NEXT: v_cndmask_b32_e32 v10, v26, v10, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
; GFX10-NEXT: v_bfe_u32 v26, v21, 16, 1
-; GFX10-NEXT: v_cndmask_b32_e64 v14, v14, v28, s4
-; GFX10-NEXT: v_bfe_u32 v28, v4, 16, 1
-; GFX10-NEXT: v_cndmask_b32_e64 v29, v29, v30, s5
-; GFX10-NEXT: v_bfe_u32 v30, v20, 16, 1
-; GFX10-NEXT: v_cndmask_b32_e64 v13, v13, v36, s6
-; GFX10-NEXT: v_bfe_u32 v36, v3, 16, 1
-; GFX10-NEXT: v_cmp_u_f32_e64 s8, v19, v19
-; GFX10-NEXT: v_add3_u32 v48, v48, v19, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v19, 0x400000, v19
-; GFX10-NEXT: v_cmp_u_f32_e64 s9, v2, v2
-; GFX10-NEXT: v_add3_u32 v52, v52, v2, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v2
-; GFX10-NEXT: v_perm_b32 v0, v0, v17, 0x7060302
-; GFX10-NEXT: v_perm_b32 v1, v1, v18, 0x7060302
-; GFX10-NEXT: v_or_b32_e32 v34, 0x400000, v9
-; GFX10-NEXT: v_or_b32_e32 v50, 0x400000, v25
-; GFX10-NEXT: v_bfe_u32 v53, v8, 16, 1
-; GFX10-NEXT: v_cmp_u_f32_e64 s14, v9, v9
-; GFX10-NEXT: v_add3_u32 v9, v35, v9, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v35, 0x400000, v7
-; GFX10-NEXT: v_cmp_u_f32_e64 s15, v25, v25
-; GFX10-NEXT: v_add3_u32 v25, v38, v25, 0x7fff
-; GFX10-NEXT: v_bfe_u32 v38, v23, 16, 1
-; GFX10-NEXT: v_cmp_u_f32_e64 s18, v7, v7
-; GFX10-NEXT: v_add3_u32 v7, v51, v7, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v51, 0x400000, v6
-; GFX10-NEXT: v_cmp_u_f32_e64 s20, v6, v6
-; GFX10-NEXT: v_add3_u32 v6, v65, v6, 0x7fff
-; GFX10-NEXT: v_bfe_u32 v65, v5, 16, 1
-; GFX10-NEXT: v_cmp_u_f32_e64 s4, v21, v21
+; GFX10-NEXT: v_add3_u32 v49, v49, v5, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v21
+; GFX10-NEXT: v_cndmask_b32_e32 v34, v38, v34, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28
+; GFX10-NEXT: v_bfe_u32 v38, v4, 16, 1
; GFX10-NEXT: v_add3_u32 v26, v26, v21, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v21, 0x400000, v21
-; GFX10-NEXT: v_cmp_u_f32_e64 s5, v4, v4
-; GFX10-NEXT: v_add3_u32 v28, v28, v4, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v4
-; GFX10-NEXT: v_cmp_u_f32_e64 s6, v20, v20
-; GFX10-NEXT: v_add3_u32 v30, v30, v20, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v20, 0x400000, v20
-; GFX10-NEXT: v_cmp_u_f32_e64 s7, v3, v3
-; GFX10-NEXT: v_add3_u32 v36, v36, v3, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v3, 0x400000, v3
-; GFX10-NEXT: v_cndmask_b32_e64 v19, v48, v19, s8
-; GFX10-NEXT: v_cndmask_b32_e64 v2, v52, v2, s9
-; GFX10-NEXT: v_or_b32_e32 v55, 0x400000, v8
-; GFX10-NEXT: v_cmp_u_f32_e64 s16, v8, v8
-; GFX10-NEXT: v_add3_u32 v8, v53, v8, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v53, 0x400000, v23
-; GFX10-NEXT: v_cmp_u_f32_e64 s19, v23, v23
-; GFX10-NEXT: v_add3_u32 v23, v38, v23, 0x7fff
-; GFX10-NEXT: v_bfe_u32 v38, v22, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v28, 0x400000, v4
+; GFX10-NEXT: v_cndmask_b32_e32 v9, v50, v9, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX10-NEXT: v_bfe_u32 v50, v20, 16, 1
+; GFX10-NEXT: v_add3_u32 v38, v38, v4, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v20
+; GFX10-NEXT: v_cndmask_b32_e32 v30, v51, v30, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v27, v27
+; GFX10-NEXT: v_add3_u32 v50, v50, v20, 0x7fff
+; GFX10-NEXT: v_bfe_u32 v51, v3, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v27, 0x400000, v3
+; GFX10-NEXT: v_cndmask_b32_e32 v8, v22, v8, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX10-NEXT: v_add3_u32 v65, v65, v5, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v5
-; GFX10-NEXT: v_cndmask_b32_e64 v21, v26, v21, s4
-; GFX10-NEXT: v_cndmask_b32_e64 v4, v28, v4, s5
-; GFX10-NEXT: v_cndmask_b32_e64 v20, v30, v20, s6
-; GFX10-NEXT: v_cndmask_b32_e64 v3, v36, v3, s7
-; GFX10-NEXT: v_perm_b32 v2, v2, v19, 0x7060302
-; GFX10-NEXT: v_cmp_u_f32_e64 s21, v22, v22
-; GFX10-NEXT: v_add3_u32 v38, v38, v22, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v22, 0x400000, v22
-; GFX10-NEXT: v_cndmask_b32_e32 v5, v65, v5, vcc_lo
-; GFX10-NEXT: v_perm_b32 v3, v3, v20, 0x7060302
-; GFX10-NEXT: v_perm_b32 v4, v4, v21, 0x7060302
-; GFX10-NEXT: v_cndmask_b32_e64 v27, v33, v27, s13
-; GFX10-NEXT: v_cndmask_b32_e64 v9, v9, v34, s14
-; GFX10-NEXT: v_cndmask_b32_e64 v25, v25, v50, s15
-; GFX10-NEXT: v_cndmask_b32_e64 v8, v8, v55, s16
-; GFX10-NEXT: v_cndmask_b32_e64 v24, v24, v67, s17
-; GFX10-NEXT: v_cndmask_b32_e64 v7, v7, v35, s18
-; GFX10-NEXT: v_cndmask_b32_e64 v23, v23, v53, s19
-; GFX10-NEXT: v_cndmask_b32_e64 v6, v6, v51, s20
-; GFX10-NEXT: v_cndmask_b32_e64 v22, v38, v22, s21
-; GFX10-NEXT: v_perm_b32 v8, v8, v25, 0x7060302
-; GFX10-NEXT: v_perm_b32 v7, v7, v24, 0x7060302
-; GFX10-NEXT: v_perm_b32 v9, v9, v27, 0x7060302
-; GFX10-NEXT: v_perm_b32 v6, v6, v23, 0x7060302
-; GFX10-NEXT: v_perm_b32 v5, v5, v22, 0x7060302
-; GFX10-NEXT: v_perm_b32 v10, v10, v49, 0x7060302
-; GFX10-NEXT: v_perm_b32 v11, v11, v39, 0x7060302
-; GFX10-NEXT: v_perm_b32 v12, v12, v37, 0x7060302
-; GFX10-NEXT: v_perm_b32 v13, v13, v29, 0x7060302
-; GFX10-NEXT: v_perm_b32 v14, v14, v16, 0x7060302
+; GFX10-NEXT: v_bfe_u32 v22, v19, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v19
+; GFX10-NEXT: v_add3_u32 v51, v51, v3, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v29, v49, v29, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21
+; GFX10-NEXT: v_add3_u32 v22, v22, v19, 0x7fff
+; GFX10-NEXT: v_bfe_u32 v49, v2, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v21, 0x400000, v2
+; GFX10-NEXT: v_cndmask_b32_e32 v7, v26, v7, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX10-NEXT: v_bfe_u32 v26, v18, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v18
+; GFX10-NEXT: v_add3_u32 v49, v49, v2, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v28, v38, v28, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20
+; GFX10-NEXT: v_bfe_u32 v38, v1, 16, 1
+; GFX10-NEXT: v_add3_u32 v26, v26, v18, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v20, 0x400000, v1
+; GFX10-NEXT: v_cndmask_b32_e32 v6, v50, v6, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19
+; GFX10-NEXT: v_bfe_u32 v50, v17, 16, 1
+; GFX10-NEXT: v_add3_u32 v38, v38, v1, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v19, 0x400000, v17
+; GFX10-NEXT: v_cndmask_b32_e32 v5, v22, v5, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18
+; GFX10-NEXT: v_bfe_u32 v22, v0, 16, 1
+; GFX10-NEXT: v_add3_u32 v50, v50, v17, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v18, 0x400000, v0
+; GFX10-NEXT: v_cndmask_b32_e32 v4, v26, v4, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX10-NEXT: v_add3_u32 v22, v22, v0, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v1, v38, v20, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
+; GFX10-NEXT: v_perm_b32 v1, v1, v4, 0x7060302
+; GFX10-NEXT: v_cndmask_b32_e32 v17, v50, v19, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX10-NEXT: v_perm_b32 v4, v28, v7, 0x7060302
+; GFX10-NEXT: v_perm_b32 v7, v34, v10, 0x7060302
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v22, v18, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX10-NEXT: v_perm_b32 v0, v0, v17, 0x7060302
+; GFX10-NEXT: v_cndmask_b32_e32 v2, v49, v21, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX10-NEXT: v_perm_b32 v2, v2, v5, 0x7060302
+; GFX10-NEXT: v_cndmask_b32_e32 v3, v51, v27, vcc_lo
+; GFX10-NEXT: v_perm_b32 v5, v29, v8, 0x7060302
+; GFX10-NEXT: v_perm_b32 v8, v35, v11, 0x7060302
+; GFX10-NEXT: v_perm_b32 v3, v3, v6, 0x7060302
+; GFX10-NEXT: v_perm_b32 v6, v30, v9, 0x7060302
+; GFX10-NEXT: v_perm_b32 v9, v39, v12, 0x7060302
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v32
; GFX10-NEXT: v_and_b32_e32 v18, 0xffff0000, v32
; GFX10-NEXT: v_mul_f32_e32 v17, v31, v17
; GFX10-NEXT: v_mul_f32_e32 v15, v15, v18
-; GFX10-NEXT: v_bfe_u32 v18, v17, 16, 1
-; GFX10-NEXT: v_bfe_u32 v19, v15, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v20, 0x400000, v17
-; GFX10-NEXT: v_or_b32_e32 v21, 0x400000, v15
+; GFX10-NEXT: v_bfe_u32 v10, v17, 16, 1
+; GFX10-NEXT: v_bfe_u32 v11, v15, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v12, 0x400000, v17
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
-; GFX10-NEXT: v_cmp_u_f32_e64 s4, v15, v15
-; GFX10-NEXT: v_add3_u32 v17, v18, v17, 0x7fff
-; GFX10-NEXT: v_add3_u32 v15, v19, v15, 0x7fff
-; GFX10-NEXT: v_cndmask_b32_e32 v17, v17, v20, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e64 v15, v15, v21, s4
+; GFX10-NEXT: v_or_b32_e32 v19, 0x400000, v15
+; GFX10-NEXT: v_add3_u32 v18, v10, v17, 0x7fff
+; GFX10-NEXT: v_add3_u32 v11, v11, v15, 0x7fff
+; GFX10-NEXT: v_perm_b32 v10, v37, v13, 0x7060302
+; GFX10-NEXT: v_perm_b32 v13, v36, v25, 0x7060302
+; GFX10-NEXT: v_cndmask_b32_e32 v17, v18, v12, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15
+; GFX10-NEXT: v_perm_b32 v12, v33, v48, 0x7060302
+; GFX10-NEXT: v_cndmask_b32_e32 v15, v11, v19, vcc_lo
+; GFX10-NEXT: v_perm_b32 v11, v24, v14, 0x7060302
+; GFX10-NEXT: v_perm_b32 v14, v23, v16, 0x7060302
; GFX10-NEXT: v_perm_b32 v15, v15, v17, 0x7060302
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
@@ -18574,55 +18572,47 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GCN-NEXT: v_min_f32_e32 v12, v12, v28
; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11
; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v27
-; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
-; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
-; GCN-NEXT: v_min_f32_e32 v11, v11, v27
; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10
; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v26
-; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
-; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
-; GCN-NEXT: v_min_f32_e32 v10, v10, v26
; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25
-; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
-; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
-; GCN-NEXT: v_min_f32_e32 v9, v9, v25
; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8
; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v24
-; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
-; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
-; GCN-NEXT: v_min_f32_e32 v8, v8, v24
; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23
-; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
-; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
-; GCN-NEXT: v_min_f32_e32 v7, v7, v23
; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22
-; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
-; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
-; GCN-NEXT: v_min_f32_e32 v6, v6, v22
; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21
-; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
-; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; GCN-NEXT: v_min_f32_e32 v5, v5, v21
-; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16
-; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17
-; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18
-; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19
; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20
+; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19
+; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18
+; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16
; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15
+; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
+; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; GCN-NEXT: v_min_f32_e32 v11, v11, v27
+; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32
+; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
+; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
+; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
+; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
+; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
+; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
+; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GCN-NEXT: v_min_f32_e32 v4, v4, v20
-; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32
-; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
@@ -18631,6 +18621,14 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; GCN-NEXT: v_min_f32_e32 v10, v10, v26
+; GCN-NEXT: v_min_f32_e32 v9, v9, v25
+; GCN-NEXT: v_min_f32_e32 v8, v8, v24
+; GCN-NEXT: v_min_f32_e32 v7, v7, v23
+; GCN-NEXT: v_min_f32_e32 v6, v6, v22
+; GCN-NEXT: v_min_f32_e32 v5, v5, v21
+; GCN-NEXT: v_min_f32_e32 v4, v4, v20
; GCN-NEXT: v_min_f32_e32 v3, v3, v19
; GCN-NEXT: v_min_f32_e32 v2, v2, v18
; GCN-NEXT: v_min_f32_e32 v1, v1, v17
@@ -18650,7 +18648,7 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v20
+; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v27
; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
; GCN-NEXT: v_min_f32_e32 v15, v15, v16
; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
@@ -18660,20 +18658,22 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX7-LABEL: v_minnum_v16bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11
+; GFX7-NEXT: v_mul_f32_e32 v27, 1.0, v27
+; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
+; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; GFX7-NEXT: v_min_f32_e32 v11, v11, v27
+; GFX7-NEXT: buffer_load_dword v27, off, s[0:3], s32
; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22
; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
-; GFX7-NEXT: v_min_f32_e32 v6, v6, v22
-; GFX7-NEXT: buffer_load_dword v22, off, s[0:3], s32
; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14
; GFX7-NEXT: v_mul_f32_e32 v30, 1.0, v30
; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13
; GFX7-NEXT: v_mul_f32_e32 v29, 1.0, v29
; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12
; GFX7-NEXT: v_mul_f32_e32 v28, 1.0, v28
-; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11
-; GFX7-NEXT: v_mul_f32_e32 v27, 1.0, v27
; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10
; GFX7-NEXT: v_mul_f32_e32 v26, 1.0, v26
; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9
@@ -18684,25 +18684,24 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX7-NEXT: v_mul_f32_e32 v23, 1.0, v23
; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15
; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GFX7-NEXT: v_min_f32_e32 v6, v6, v22
; GFX7-NEXT: v_mul_f32_e32 v21, 1.0, v21
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16
-; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18
-; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19
; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v20
+; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19
+; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17
+; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16
; GFX7-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
; GFX7-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
-; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
-; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
; GFX7-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
@@ -18727,7 +18726,6 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX7-NEXT: v_min_f32_e32 v14, v14, v30
; GFX7-NEXT: v_min_f32_e32 v13, v13, v29
; GFX7-NEXT: v_min_f32_e32 v12, v12, v28
-; GFX7-NEXT: v_min_f32_e32 v11, v11, v27
; GFX7-NEXT: v_min_f32_e32 v10, v10, v26
; GFX7-NEXT: v_min_f32_e32 v9, v9, v25
; GFX7-NEXT: v_min_f32_e32 v8, v8, v24
@@ -18746,7 +18744,7 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22
+; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v27
; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
; GFX7-NEXT: v_min_f32_e32 v15, v15, v22
; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
@@ -20204,10 +20202,10 @@ define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX8-NEXT: v_lshrrev_b32_e32 v8, 16, v8
; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v9
; GFX8-NEXT: v_lshrrev_b32_e32 v10, 16, v10
+; GFX8-NEXT: v_lshrrev_b32_e32 v11, 16, v11
; GFX8-NEXT: v_lshrrev_b32_e32 v16, 16, v30
; GFX8-NEXT: v_lshrrev_b32_e32 v13, 16, v13
; GFX8-NEXT: v_lshrrev_b32_e32 v12, 16, v12
-; GFX8-NEXT: v_lshrrev_b32_e32 v11, 16, v11
; GFX8-NEXT: v_alignbit_b32 v0, v0, v17, 16
; GFX8-NEXT: v_alignbit_b32 v1, v1, v18, 16
; GFX8-NEXT: v_alignbit_b32 v2, v2, v19, 16
@@ -20510,278 +20508,278 @@ define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: buffer_load_dword v32, off, s[0:3], s32
+; GFX10-NEXT: v_lshlrev_b32_e32 v37, 16, v28
+; GFX10-NEXT: v_lshlrev_b32_e32 v38, 16, v12
+; GFX10-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
+; GFX10-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
; GFX10-NEXT: v_lshlrev_b32_e32 v39, 16, v27
; GFX10-NEXT: v_lshlrev_b32_e32 v48, 16, v11
; GFX10-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
; GFX10-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
; GFX10-NEXT: v_lshlrev_b32_e32 v49, 16, v26
; GFX10-NEXT: v_lshlrev_b32_e32 v50, 16, v10
-; GFX10-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
-; GFX10-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
-; GFX10-NEXT: v_lshlrev_b32_e32 v37, 16, v28
-; GFX10-NEXT: v_lshlrev_b32_e32 v38, 16, v12
-; GFX10-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
-; GFX10-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
-; GFX10-NEXT: v_lshlrev_b32_e32 v51, 16, v25
-; GFX10-NEXT: v_lshlrev_b32_e32 v52, 16, v9
-; GFX10-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
-; GFX10-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
-; GFX10-NEXT: v_lshlrev_b32_e32 v53, 16, v24
-; GFX10-NEXT: v_lshlrev_b32_e32 v54, 16, v8
-; GFX10-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
-; GFX10-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
-; GFX10-NEXT: v_lshlrev_b32_e32 v55, 16, v23
-; GFX10-NEXT: v_lshlrev_b32_e32 v64, 16, v7
-; GFX10-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
-; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
-; GFX10-NEXT: v_lshlrev_b32_e32 v65, 16, v22
-; GFX10-NEXT: v_lshlrev_b32_e32 v66, 16, v6
-; GFX10-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
-; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
-; GFX10-NEXT: v_lshlrev_b32_e32 v67, 16, v21
-; GFX10-NEXT: v_lshlrev_b32_e32 v68, 16, v5
-; GFX10-NEXT: v_min_f32_e32 v39, v48, v39
-; GFX10-NEXT: v_min_f32_e32 v11, v11, v27
-; GFX10-NEXT: v_min_f32_e32 v49, v50, v49
-; GFX10-NEXT: v_min_f32_e32 v10, v10, v26
+; GFX10-NEXT: v_lshlrev_b32_e32 v33, 16, v30
+; GFX10-NEXT: v_lshlrev_b32_e32 v34, 16, v14
+; GFX10-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
+; GFX10-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
; GFX10-NEXT: v_lshlrev_b32_e32 v35, 16, v29
; GFX10-NEXT: v_lshlrev_b32_e32 v36, 16, v13
; GFX10-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
; GFX10-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
-; GFX10-NEXT: v_min_f32_e32 v37, v38, v37
-; GFX10-NEXT: v_lshlrev_b32_e32 v38, 16, v18
; GFX10-NEXT: v_min_f32_e32 v12, v12, v28
-; GFX10-NEXT: v_lshlrev_b32_e32 v28, 16, v2
+; GFX10-NEXT: v_lshlrev_b32_e32 v28, 16, v22
+; GFX10-NEXT: v_min_f32_e32 v39, v48, v39
+; GFX10-NEXT: v_lshlrev_b32_e32 v48, 16, v6
+; GFX10-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
+; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GFX10-NEXT: v_min_f32_e32 v11, v11, v27
+; GFX10-NEXT: v_lshlrev_b32_e32 v27, 16, v21
+; GFX10-NEXT: v_min_f32_e32 v49, v50, v49
+; GFX10-NEXT: v_lshlrev_b32_e32 v50, 16, v5
+; GFX10-NEXT: v_min_f32_e32 v33, v34, v33
+; GFX10-NEXT: v_min_f32_e32 v14, v14, v30
+; GFX10-NEXT: v_lshlrev_b32_e32 v30, 16, v24
+; GFX10-NEXT: v_min_f32_e32 v35, v36, v35
+; GFX10-NEXT: v_lshlrev_b32_e32 v36, 16, v8
+; GFX10-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
+; GFX10-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; GFX10-NEXT: v_min_f32_e32 v13, v13, v29
+; GFX10-NEXT: v_lshlrev_b32_e32 v29, 16, v23
+; GFX10-NEXT: v_min_f32_e32 v37, v38, v37
+; GFX10-NEXT: v_lshlrev_b32_e32 v38, 16, v7
+; GFX10-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
+; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GFX10-NEXT: v_min_f32_e32 v6, v6, v22
+; GFX10-NEXT: v_lshlrev_b32_e32 v22, 16, v16
+; GFX10-NEXT: v_min_f32_e32 v27, v50, v27
+; GFX10-NEXT: v_lshlrev_b32_e32 v50, 16, v0
+; GFX10-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
+; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
+; GFX10-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; GFX10-NEXT: v_lshlrev_b32_e32 v51, 16, v25
+; GFX10-NEXT: v_lshlrev_b32_e32 v34, 16, v9
+; GFX10-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
+; GFX10-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; GFX10-NEXT: v_min_f32_e32 v8, v8, v24
+; GFX10-NEXT: v_lshlrev_b32_e32 v24, 16, v18
+; GFX10-NEXT: v_min_f32_e32 v29, v38, v29
+; GFX10-NEXT: v_lshlrev_b32_e32 v38, 16, v2
; GFX10-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX10-NEXT: v_lshlrev_b32_e32 v48, 16, v17
-; GFX10-NEXT: v_lshlrev_b32_e32 v27, 16, v1
+; GFX10-NEXT: v_min_f32_e32 v7, v7, v23
+; GFX10-NEXT: v_lshlrev_b32_e32 v23, 16, v17
+; GFX10-NEXT: v_min_f32_e32 v28, v48, v28
+; GFX10-NEXT: v_lshlrev_b32_e32 v48, 16, v1
; GFX10-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX10-NEXT: v_lshlrev_b32_e32 v50, 16, v16
-; GFX10-NEXT: v_lshlrev_b32_e32 v26, 16, v0
-; GFX10-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
-; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT: v_min_f32_e32 v0, v0, v16
+; GFX10-NEXT: v_bfe_u32 v16, v33, 16, 1
+; GFX10-NEXT: v_min_f32_e32 v10, v10, v26
+; GFX10-NEXT: v_lshlrev_b32_e32 v26, 16, v20
+; GFX10-NEXT: v_min_f32_e32 v34, v34, v51
+; GFX10-NEXT: v_lshlrev_b32_e32 v51, 16, v4
+; GFX10-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
+; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GFX10-NEXT: v_min_f32_e32 v9, v9, v25
-; GFX10-NEXT: v_min_f32_e32 v25, v54, v53
-; GFX10-NEXT: v_min_f32_e32 v8, v8, v24
-; GFX10-NEXT: v_min_f32_e32 v24, v64, v55
-; GFX10-NEXT: v_min_f32_e32 v7, v7, v23
-; GFX10-NEXT: v_min_f32_e32 v23, v66, v65
-; GFX10-NEXT: v_min_f32_e32 v6, v6, v22
-; GFX10-NEXT: v_min_f32_e32 v22, v68, v67
-; GFX10-NEXT: v_bfe_u32 v53, v39, 16, 1
-; GFX10-NEXT: v_bfe_u32 v55, v11, 16, 1
-; GFX10-NEXT: v_bfe_u32 v65, v49, 16, 1
-; GFX10-NEXT: v_bfe_u32 v67, v10, 16, 1
-; GFX10-NEXT: v_lshlrev_b32_e32 v33, 16, v30
-; GFX10-NEXT: v_lshlrev_b32_e32 v34, 16, v14
-; GFX10-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
-; GFX10-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
-; GFX10-NEXT: v_min_f32_e32 v35, v36, v35
-; GFX10-NEXT: v_lshlrev_b32_e32 v36, 16, v19
-; GFX10-NEXT: v_min_f32_e32 v13, v13, v29
-; GFX10-NEXT: v_lshlrev_b32_e32 v29, 16, v3
+; GFX10-NEXT: v_lshlrev_b32_e32 v25, 16, v19
+; GFX10-NEXT: v_min_f32_e32 v30, v36, v30
+; GFX10-NEXT: v_lshlrev_b32_e32 v36, 16, v3
; GFX10-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX10-NEXT: v_min_f32_e32 v2, v2, v18
-; GFX10-NEXT: v_min_f32_e32 v18, v27, v48
+; GFX10-NEXT: v_min_f32_e32 v18, v48, v23
; GFX10-NEXT: v_min_f32_e32 v1, v1, v17
-; GFX10-NEXT: v_min_f32_e32 v17, v26, v50
-; GFX10-NEXT: v_min_f32_e32 v0, v0, v16
-; GFX10-NEXT: v_or_b32_e32 v54, 0x400000, v39
-; GFX10-NEXT: v_or_b32_e32 v64, 0x400000, v11
-; GFX10-NEXT: v_or_b32_e32 v66, 0x400000, v49
-; GFX10-NEXT: v_or_b32_e32 v68, 0x400000, v10
-; GFX10-NEXT: v_cmp_u_f32_e64 s9, v39, v39
-; GFX10-NEXT: v_add3_u32 v39, v53, v39, 0x7fff
-; GFX10-NEXT: v_cmp_u_f32_e64 s10, v11, v11
-; GFX10-NEXT: v_add3_u32 v11, v55, v11, 0x7fff
-; GFX10-NEXT: v_cmp_u_f32_e64 s11, v49, v49
-; GFX10-NEXT: v_add3_u32 v49, v65, v49, 0x7fff
-; GFX10-NEXT: v_cmp_u_f32_e64 s12, v10, v10
-; GFX10-NEXT: v_add3_u32 v10, v67, v10, 0x7fff
+; GFX10-NEXT: v_min_f32_e32 v17, v50, v22
+; GFX10-NEXT: v_or_b32_e32 v22, 0x400000, v33
+; GFX10-NEXT: v_bfe_u32 v23, v14, 16, 1
+; GFX10-NEXT: v_add3_u32 v16, v16, v33, 0x7fff
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33
; GFX10-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; GFX10-NEXT: v_min_f32_e32 v33, v34, v33
-; GFX10-NEXT: v_lshlrev_b32_e32 v34, 16, v20
-; GFX10-NEXT: v_min_f32_e32 v14, v14, v30
-; GFX10-NEXT: v_lshlrev_b32_e32 v30, 16, v4
-; GFX10-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
-; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX10-NEXT: v_min_f32_e32 v4, v4, v20
+; GFX10-NEXT: v_min_f32_e32 v20, v36, v25
; GFX10-NEXT: v_min_f32_e32 v3, v3, v19
-; GFX10-NEXT: v_min_f32_e32 v19, v28, v38
-; GFX10-NEXT: v_bfe_u32 v38, v37, 16, 1
-; GFX10-NEXT: v_bfe_u32 v50, v12, 16, 1
-; GFX10-NEXT: v_cndmask_b32_e64 v39, v39, v54, s9
-; GFX10-NEXT: v_bfe_u32 v54, v18, 16, 1
-; GFX10-NEXT: v_cndmask_b32_e64 v11, v11, v64, s10
-; GFX10-NEXT: v_bfe_u32 v64, v1, 16, 1
-; GFX10-NEXT: v_cndmask_b32_e64 v49, v49, v66, s11
-; GFX10-NEXT: v_bfe_u32 v66, v17, 16, 1
-; GFX10-NEXT: v_cndmask_b32_e64 v10, v10, v68, s12
-; GFX10-NEXT: v_bfe_u32 v68, v0, 16, 1
-; GFX10-NEXT: v_min_f32_e32 v51, v52, v51
+; GFX10-NEXT: v_min_f32_e32 v19, v38, v24
+; GFX10-NEXT: v_or_b32_e32 v24, 0x400000, v14
+; GFX10-NEXT: v_bfe_u32 v25, v35, 16, 1
+; GFX10-NEXT: v_add3_u32 v23, v23, v14, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v16, v16, v22, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14
; GFX10-NEXT: v_min_f32_e32 v5, v5, v21
-; GFX10-NEXT: v_min_f32_e32 v21, v30, v34
-; GFX10-NEXT: v_min_f32_e32 v4, v4, v20
-; GFX10-NEXT: v_min_f32_e32 v20, v29, v36
-; GFX10-NEXT: v_bfe_u32 v16, v33, 16, 1
-; GFX10-NEXT: v_bfe_u32 v27, v14, 16, 1
-; GFX10-NEXT: v_bfe_u32 v29, v35, 16, 1
-; GFX10-NEXT: v_bfe_u32 v34, v13, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v48, 0x400000, v37
-; GFX10-NEXT: v_or_b32_e32 v52, 0x400000, v12
-; GFX10-NEXT: v_cmp_u_f32_e64 s7, v37, v37
-; GFX10-NEXT: v_add3_u32 v37, v38, v37, 0x7fff
-; GFX10-NEXT: v_cmp_u_f32_e64 s8, v12, v12
-; GFX10-NEXT: v_add3_u32 v12, v50, v12, 0x7fff
-; GFX10-NEXT: v_cmp_u_f32_e64 s10, v18, v18
-; GFX10-NEXT: v_add3_u32 v54, v54, v18, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v18, 0x400000, v18
-; GFX10-NEXT: v_cmp_u_f32_e64 s11, v1, v1
-; GFX10-NEXT: v_add3_u32 v64, v64, v1, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v1, 0x400000, v1
-; GFX10-NEXT: v_cmp_u_f32_e64 s12, v17, v17
-; GFX10-NEXT: v_add3_u32 v66, v66, v17, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v17, 0x400000, v17
-; GFX10-NEXT: v_cmp_u_f32_e64 s22, v0, v0
-; GFX10-NEXT: v_add3_u32 v68, v68, v0, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v0, 0x400000, v0
-; GFX10-NEXT: v_or_b32_e32 v26, 0x400000, v33
-; GFX10-NEXT: v_or_b32_e32 v28, 0x400000, v14
-; GFX10-NEXT: v_or_b32_e32 v30, 0x400000, v35
-; GFX10-NEXT: v_or_b32_e32 v36, 0x400000, v13
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33
-; GFX10-NEXT: v_add3_u32 v16, v16, v33, 0x7fff
-; GFX10-NEXT: v_bfe_u32 v33, v51, 16, 1
-; GFX10-NEXT: v_cmp_u_f32_e64 s4, v14, v14
-; GFX10-NEXT: v_add3_u32 v14, v27, v14, 0x7fff
-; GFX10-NEXT: v_cmp_u_f32_e64 s5, v35, v35
-; GFX10-NEXT: v_add3_u32 v29, v29, v35, 0x7fff
-; GFX10-NEXT: v_cmp_u_f32_e64 s6, v13, v13
-; GFX10-NEXT: v_add3_u32 v13, v34, v13, 0x7fff
-; GFX10-NEXT: v_bfe_u32 v65, v24, 16, 1
-; GFX10-NEXT: v_cndmask_b32_e64 v37, v37, v48, s7
-; GFX10-NEXT: v_bfe_u32 v48, v19, 16, 1
-; GFX10-NEXT: v_cndmask_b32_e64 v12, v12, v52, s8
-; GFX10-NEXT: v_bfe_u32 v52, v2, 16, 1
-; GFX10-NEXT: v_cndmask_b32_e64 v18, v54, v18, s10
-; GFX10-NEXT: v_cndmask_b32_e64 v17, v66, v17, s12
-; GFX10-NEXT: v_cndmask_b32_e64 v0, v68, v0, s22
-; GFX10-NEXT: v_cndmask_b32_e64 v1, v64, v1, s11
+; GFX10-NEXT: v_min_f32_e32 v21, v51, v26
+; GFX10-NEXT: v_or_b32_e32 v26, 0x400000, v35
+; GFX10-NEXT: v_bfe_u32 v36, v13, 16, 1
+; GFX10-NEXT: v_add3_u32 v25, v25, v35, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v23, v23, v24, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35
+; GFX10-NEXT: v_or_b32_e32 v38, 0x400000, v13
+; GFX10-NEXT: v_bfe_u32 v48, v37, 16, 1
+; GFX10-NEXT: v_add3_u32 v36, v36, v13, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v50, 0x400000, v37
+; GFX10-NEXT: v_cndmask_b32_e32 v25, v25, v26, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
+; GFX10-NEXT: v_bfe_u32 v51, v12, 16, 1
+; GFX10-NEXT: v_add3_u32 v48, v48, v37, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v33, 0x400000, v12
+; GFX10-NEXT: v_bfe_u32 v22, v39, 16, 1
+; GFX10-NEXT: v_cndmask_b32_e32 v36, v36, v38, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37
+; GFX10-NEXT: v_add3_u32 v51, v51, v12, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v14, 0x400000, v39
+; GFX10-NEXT: v_bfe_u32 v24, v11, 16, 1
+; GFX10-NEXT: v_add3_u32 v22, v22, v39, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v48, v48, v50, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX10-NEXT: v_or_b32_e32 v35, 0x400000, v11
+; GFX10-NEXT: v_bfe_u32 v26, v49, 16, 1
+; GFX10-NEXT: v_add3_u32 v24, v24, v11, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v13, 0x400000, v49
+; GFX10-NEXT: v_cndmask_b32_e32 v33, v51, v33, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v39, v39
+; GFX10-NEXT: v_bfe_u32 v38, v10, 16, 1
+; GFX10-NEXT: v_add3_u32 v26, v26, v49, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v37, 0x400000, v10
+; GFX10-NEXT: v_bfe_u32 v50, v34, 16, 1
+; GFX10-NEXT: v_cndmask_b32_e32 v14, v22, v14, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX10-NEXT: v_add3_u32 v38, v38, v10, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v12, 0x400000, v34
+; GFX10-NEXT: v_bfe_u32 v51, v9, 16, 1
+; GFX10-NEXT: v_add3_u32 v50, v50, v34, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v24, v24, v35, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v49, v49
+; GFX10-NEXT: v_or_b32_e32 v39, 0x400000, v9
+; GFX10-NEXT: v_bfe_u32 v22, v30, 16, 1
+; GFX10-NEXT: v_add3_u32 v51, v51, v9, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v11, 0x400000, v30
+; GFX10-NEXT: v_cndmask_b32_e32 v13, v26, v13, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX10-NEXT: v_bfe_u32 v35, v8, 16, 1
+; GFX10-NEXT: v_add3_u32 v22, v22, v30, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v49, 0x400000, v8
+; GFX10-NEXT: v_bfe_u32 v26, v29, 16, 1
+; GFX10-NEXT: v_cndmask_b32_e32 v37, v38, v37, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34
+; GFX10-NEXT: v_add3_u32 v35, v35, v8, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v29
+; GFX10-NEXT: v_bfe_u32 v38, v7, 16, 1
+; GFX10-NEXT: v_add3_u32 v26, v26, v29, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v12, v50, v12, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX10-NEXT: v_or_b32_e32 v34, 0x400000, v7
+; GFX10-NEXT: v_bfe_u32 v50, v28, 16, 1
+; GFX10-NEXT: v_add3_u32 v38, v38, v7, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v28
+; GFX10-NEXT: v_cndmask_b32_e32 v39, v51, v39, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30
+; GFX10-NEXT: v_bfe_u32 v51, v6, 16, 1
+; GFX10-NEXT: v_add3_u32 v50, v50, v28, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v30, 0x400000, v6
; GFX10-NEXT: v_lshlrev_b32_e32 v31, 16, v15
+; GFX10-NEXT: v_cndmask_b32_e32 v11, v22, v11, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX10-NEXT: v_bfe_u32 v22, v27, 16, 1
+; GFX10-NEXT: v_add3_u32 v51, v51, v6, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v27
; GFX10-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
-; GFX10-NEXT: v_or_b32_e32 v27, 0x400000, v51
-; GFX10-NEXT: v_bfe_u32 v35, v9, 16, 1
-; GFX10-NEXT: v_bfe_u32 v38, v25, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v67, 0x400000, v24
-; GFX10-NEXT: v_cmp_u_f32_e64 s13, v51, v51
-; GFX10-NEXT: v_add3_u32 v33, v33, v51, 0x7fff
-; GFX10-NEXT: v_bfe_u32 v51, v7, 16, 1
-; GFX10-NEXT: v_cmp_u_f32_e64 s17, v24, v24
-; GFX10-NEXT: v_add3_u32 v24, v65, v24, 0x7fff
-; GFX10-NEXT: v_bfe_u32 v65, v6, 16, 1
-; GFX10-NEXT: v_cndmask_b32_e32 v16, v16, v26, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v35, v35, v49, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29
+; GFX10-NEXT: v_bfe_u32 v49, v5, 16, 1
+; GFX10-NEXT: v_add3_u32 v22, v22, v27, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v29, 0x400000, v5
+; GFX10-NEXT: v_cndmask_b32_e32 v10, v26, v10, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
; GFX10-NEXT: v_bfe_u32 v26, v21, 16, 1
-; GFX10-NEXT: v_cndmask_b32_e64 v14, v14, v28, s4
-; GFX10-NEXT: v_bfe_u32 v28, v4, 16, 1
-; GFX10-NEXT: v_cndmask_b32_e64 v29, v29, v30, s5
-; GFX10-NEXT: v_bfe_u32 v30, v20, 16, 1
-; GFX10-NEXT: v_cndmask_b32_e64 v13, v13, v36, s6
-; GFX10-NEXT: v_bfe_u32 v36, v3, 16, 1
-; GFX10-NEXT: v_cmp_u_f32_e64 s8, v19, v19
-; GFX10-NEXT: v_add3_u32 v48, v48, v19, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v19, 0x400000, v19
-; GFX10-NEXT: v_cmp_u_f32_e64 s9, v2, v2
-; GFX10-NEXT: v_add3_u32 v52, v52, v2, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v2
-; GFX10-NEXT: v_perm_b32 v0, v0, v17, 0x7060302
-; GFX10-NEXT: v_perm_b32 v1, v1, v18, 0x7060302
-; GFX10-NEXT: v_or_b32_e32 v34, 0x400000, v9
-; GFX10-NEXT: v_or_b32_e32 v50, 0x400000, v25
-; GFX10-NEXT: v_bfe_u32 v53, v8, 16, 1
-; GFX10-NEXT: v_cmp_u_f32_e64 s14, v9, v9
-; GFX10-NEXT: v_add3_u32 v9, v35, v9, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v35, 0x400000, v7
-; GFX10-NEXT: v_cmp_u_f32_e64 s15, v25, v25
-; GFX10-NEXT: v_add3_u32 v25, v38, v25, 0x7fff
-; GFX10-NEXT: v_bfe_u32 v38, v23, 16, 1
-; GFX10-NEXT: v_cmp_u_f32_e64 s18, v7, v7
-; GFX10-NEXT: v_add3_u32 v7, v51, v7, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v51, 0x400000, v6
-; GFX10-NEXT: v_cmp_u_f32_e64 s20, v6, v6
-; GFX10-NEXT: v_add3_u32 v6, v65, v6, 0x7fff
-; GFX10-NEXT: v_bfe_u32 v65, v5, 16, 1
-; GFX10-NEXT: v_cmp_u_f32_e64 s4, v21, v21
+; GFX10-NEXT: v_add3_u32 v49, v49, v5, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v21
+; GFX10-NEXT: v_cndmask_b32_e32 v34, v38, v34, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28
+; GFX10-NEXT: v_bfe_u32 v38, v4, 16, 1
; GFX10-NEXT: v_add3_u32 v26, v26, v21, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v21, 0x400000, v21
-; GFX10-NEXT: v_cmp_u_f32_e64 s5, v4, v4
-; GFX10-NEXT: v_add3_u32 v28, v28, v4, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v4
-; GFX10-NEXT: v_cmp_u_f32_e64 s6, v20, v20
-; GFX10-NEXT: v_add3_u32 v30, v30, v20, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v20, 0x400000, v20
-; GFX10-NEXT: v_cmp_u_f32_e64 s7, v3, v3
-; GFX10-NEXT: v_add3_u32 v36, v36, v3, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v3, 0x400000, v3
-; GFX10-NEXT: v_cndmask_b32_e64 v19, v48, v19, s8
-; GFX10-NEXT: v_cndmask_b32_e64 v2, v52, v2, s9
-; GFX10-NEXT: v_or_b32_e32 v55, 0x400000, v8
-; GFX10-NEXT: v_cmp_u_f32_e64 s16, v8, v8
-; GFX10-NEXT: v_add3_u32 v8, v53, v8, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v53, 0x400000, v23
-; GFX10-NEXT: v_cmp_u_f32_e64 s19, v23, v23
-; GFX10-NEXT: v_add3_u32 v23, v38, v23, 0x7fff
-; GFX10-NEXT: v_bfe_u32 v38, v22, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v28, 0x400000, v4
+; GFX10-NEXT: v_cndmask_b32_e32 v9, v50, v9, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX10-NEXT: v_bfe_u32 v50, v20, 16, 1
+; GFX10-NEXT: v_add3_u32 v38, v38, v4, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v20
+; GFX10-NEXT: v_cndmask_b32_e32 v30, v51, v30, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v27, v27
+; GFX10-NEXT: v_add3_u32 v50, v50, v20, 0x7fff
+; GFX10-NEXT: v_bfe_u32 v51, v3, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v27, 0x400000, v3
+; GFX10-NEXT: v_cndmask_b32_e32 v8, v22, v8, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX10-NEXT: v_add3_u32 v65, v65, v5, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v5
-; GFX10-NEXT: v_cndmask_b32_e64 v21, v26, v21, s4
-; GFX10-NEXT: v_cndmask_b32_e64 v4, v28, v4, s5
-; GFX10-NEXT: v_cndmask_b32_e64 v20, v30, v20, s6
-; GFX10-NEXT: v_cndmask_b32_e64 v3, v36, v3, s7
-; GFX10-NEXT: v_perm_b32 v2, v2, v19, 0x7060302
-; GFX10-NEXT: v_cmp_u_f32_e64 s21, v22, v22
-; GFX10-NEXT: v_add3_u32 v38, v38, v22, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v22, 0x400000, v22
-; GFX10-NEXT: v_cndmask_b32_e32 v5, v65, v5, vcc_lo
-; GFX10-NEXT: v_perm_b32 v3, v3, v20, 0x7060302
-; GFX10-NEXT: v_perm_b32 v4, v4, v21, 0x7060302
-; GFX10-NEXT: v_cndmask_b32_e64 v27, v33, v27, s13
-; GFX10-NEXT: v_cndmask_b32_e64 v9, v9, v34, s14
-; GFX10-NEXT: v_cndmask_b32_e64 v25, v25, v50, s15
-; GFX10-NEXT: v_cndmask_b32_e64 v8, v8, v55, s16
-; GFX10-NEXT: v_cndmask_b32_e64 v24, v24, v67, s17
-; GFX10-NEXT: v_cndmask_b32_e64 v7, v7, v35, s18
-; GFX10-NEXT: v_cndmask_b32_e64 v23, v23, v53, s19
-; GFX10-NEXT: v_cndmask_b32_e64 v6, v6, v51, s20
-; GFX10-NEXT: v_cndmask_b32_e64 v22, v38, v22, s21
-; GFX10-NEXT: v_perm_b32 v8, v8, v25, 0x7060302
-; GFX10-NEXT: v_perm_b32 v7, v7, v24, 0x7060302
-; GFX10-NEXT: v_perm_b32 v9, v9, v27, 0x7060302
-; GFX10-NEXT: v_perm_b32 v6, v6, v23, 0x7060302
-; GFX10-NEXT: v_perm_b32 v5, v5, v22, 0x7060302
-; GFX10-NEXT: v_perm_b32 v10, v10, v49, 0x7060302
-; GFX10-NEXT: v_perm_b32 v11, v11, v39, 0x7060302
-; GFX10-NEXT: v_perm_b32 v12, v12, v37, 0x7060302
-; GFX10-NEXT: v_perm_b32 v13, v13, v29, 0x7060302
-; GFX10-NEXT: v_perm_b32 v14, v14, v16, 0x7060302
+; GFX10-NEXT: v_bfe_u32 v22, v19, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v19
+; GFX10-NEXT: v_add3_u32 v51, v51, v3, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v29, v49, v29, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21
+; GFX10-NEXT: v_add3_u32 v22, v22, v19, 0x7fff
+; GFX10-NEXT: v_bfe_u32 v49, v2, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v21, 0x400000, v2
+; GFX10-NEXT: v_cndmask_b32_e32 v7, v26, v7, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX10-NEXT: v_bfe_u32 v26, v18, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v18
+; GFX10-NEXT: v_add3_u32 v49, v49, v2, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v28, v38, v28, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20
+; GFX10-NEXT: v_bfe_u32 v38, v1, 16, 1
+; GFX10-NEXT: v_add3_u32 v26, v26, v18, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v20, 0x400000, v1
+; GFX10-NEXT: v_cndmask_b32_e32 v6, v50, v6, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19
+; GFX10-NEXT: v_bfe_u32 v50, v17, 16, 1
+; GFX10-NEXT: v_add3_u32 v38, v38, v1, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v19, 0x400000, v17
+; GFX10-NEXT: v_cndmask_b32_e32 v5, v22, v5, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18
+; GFX10-NEXT: v_bfe_u32 v22, v0, 16, 1
+; GFX10-NEXT: v_add3_u32 v50, v50, v17, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v18, 0x400000, v0
+; GFX10-NEXT: v_cndmask_b32_e32 v4, v26, v4, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX10-NEXT: v_add3_u32 v22, v22, v0, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v1, v38, v20, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
+; GFX10-NEXT: v_perm_b32 v1, v1, v4, 0x7060302
+; GFX10-NEXT: v_cndmask_b32_e32 v17, v50, v19, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX10-NEXT: v_perm_b32 v4, v28, v7, 0x7060302
+; GFX10-NEXT: v_perm_b32 v7, v34, v10, 0x7060302
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v22, v18, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX10-NEXT: v_perm_b32 v0, v0, v17, 0x7060302
+; GFX10-NEXT: v_cndmask_b32_e32 v2, v49, v21, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX10-NEXT: v_perm_b32 v2, v2, v5, 0x7060302
+; GFX10-NEXT: v_cndmask_b32_e32 v3, v51, v27, vcc_lo
+; GFX10-NEXT: v_perm_b32 v5, v29, v8, 0x7060302
+; GFX10-NEXT: v_perm_b32 v8, v35, v11, 0x7060302
+; GFX10-NEXT: v_perm_b32 v3, v3, v6, 0x7060302
+; GFX10-NEXT: v_perm_b32 v6, v30, v9, 0x7060302
+; GFX10-NEXT: v_perm_b32 v9, v39, v12, 0x7060302
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v32
; GFX10-NEXT: v_and_b32_e32 v18, 0xffff0000, v32
; GFX10-NEXT: v_min_f32_e32 v17, v31, v17
; GFX10-NEXT: v_min_f32_e32 v15, v15, v18
-; GFX10-NEXT: v_bfe_u32 v18, v17, 16, 1
-; GFX10-NEXT: v_bfe_u32 v19, v15, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v20, 0x400000, v17
-; GFX10-NEXT: v_or_b32_e32 v21, 0x400000, v15
+; GFX10-NEXT: v_bfe_u32 v10, v17, 16, 1
+; GFX10-NEXT: v_bfe_u32 v11, v15, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v12, 0x400000, v17
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
-; GFX10-NEXT: v_cmp_u_f32_e64 s4, v15, v15
-; GFX10-NEXT: v_add3_u32 v17, v18, v17, 0x7fff
-; GFX10-NEXT: v_add3_u32 v15, v19, v15, 0x7fff
-; GFX10-NEXT: v_cndmask_b32_e32 v17, v17, v20, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e64 v15, v15, v21, s4
+; GFX10-NEXT: v_or_b32_e32 v19, 0x400000, v15
+; GFX10-NEXT: v_add3_u32 v18, v10, v17, 0x7fff
+; GFX10-NEXT: v_add3_u32 v11, v11, v15, 0x7fff
+; GFX10-NEXT: v_perm_b32 v10, v37, v13, 0x7060302
+; GFX10-NEXT: v_perm_b32 v13, v36, v25, 0x7060302
+; GFX10-NEXT: v_cndmask_b32_e32 v17, v18, v12, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15
+; GFX10-NEXT: v_perm_b32 v12, v33, v48, 0x7060302
+; GFX10-NEXT: v_cndmask_b32_e32 v15, v11, v19, vcc_lo
+; GFX10-NEXT: v_perm_b32 v11, v24, v14, 0x7060302
+; GFX10-NEXT: v_perm_b32 v14, v23, v16, 0x7060302
; GFX10-NEXT: v_perm_b32 v15, v15, v17, 0x7060302
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
@@ -22193,55 +22191,47 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GCN-NEXT: v_max_f32_e32 v12, v12, v28
; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11
; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v27
-; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
-; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
-; GCN-NEXT: v_max_f32_e32 v11, v11, v27
; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10
; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v26
-; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
-; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
-; GCN-NEXT: v_max_f32_e32 v10, v10, v26
; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25
-; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
-; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
-; GCN-NEXT: v_max_f32_e32 v9, v9, v25
; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8
; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v24
-; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
-; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
-; GCN-NEXT: v_max_f32_e32 v8, v8, v24
; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23
-; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
-; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
-; GCN-NEXT: v_max_f32_e32 v7, v7, v23
; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22
-; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
-; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
-; GCN-NEXT: v_max_f32_e32 v6, v6, v22
; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21
-; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
-; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; GCN-NEXT: v_max_f32_e32 v5, v5, v21
-; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16
-; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17
-; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18
-; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19
; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20
+; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19
+; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18
+; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16
; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15
+; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
+; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; GCN-NEXT: v_max_f32_e32 v11, v11, v27
+; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32
+; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
+; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
+; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
+; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
+; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
+; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
+; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GCN-NEXT: v_max_f32_e32 v4, v4, v20
-; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32
-; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
@@ -22250,6 +22240,14 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; GCN-NEXT: v_max_f32_e32 v10, v10, v26
+; GCN-NEXT: v_max_f32_e32 v9, v9, v25
+; GCN-NEXT: v_max_f32_e32 v8, v8, v24
+; GCN-NEXT: v_max_f32_e32 v7, v7, v23
+; GCN-NEXT: v_max_f32_e32 v6, v6, v22
+; GCN-NEXT: v_max_f32_e32 v5, v5, v21
+; GCN-NEXT: v_max_f32_e32 v4, v4, v20
; GCN-NEXT: v_max_f32_e32 v3, v3, v19
; GCN-NEXT: v_max_f32_e32 v2, v2, v18
; GCN-NEXT: v_max_f32_e32 v1, v1, v17
@@ -22269,7 +22267,7 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v20
+; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v27
; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
; GCN-NEXT: v_max_f32_e32 v15, v15, v16
; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
@@ -22279,20 +22277,22 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX7-LABEL: v_maxnum_v16bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11
+; GFX7-NEXT: v_mul_f32_e32 v27, 1.0, v27
+; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
+; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; GFX7-NEXT: v_max_f32_e32 v11, v11, v27
+; GFX7-NEXT: buffer_load_dword v27, off, s[0:3], s32
; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22
; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
-; GFX7-NEXT: v_max_f32_e32 v6, v6, v22
-; GFX7-NEXT: buffer_load_dword v22, off, s[0:3], s32
; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14
; GFX7-NEXT: v_mul_f32_e32 v30, 1.0, v30
; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13
; GFX7-NEXT: v_mul_f32_e32 v29, 1.0, v29
; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12
; GFX7-NEXT: v_mul_f32_e32 v28, 1.0, v28
-; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11
-; GFX7-NEXT: v_mul_f32_e32 v27, 1.0, v27
; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10
; GFX7-NEXT: v_mul_f32_e32 v26, 1.0, v26
; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9
@@ -22303,25 +22303,24 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX7-NEXT: v_mul_f32_e32 v23, 1.0, v23
; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15
; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GFX7-NEXT: v_max_f32_e32 v6, v6, v22
; GFX7-NEXT: v_mul_f32_e32 v21, 1.0, v21
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16
-; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18
-; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19
; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v20
+; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19
+; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17
+; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16
; GFX7-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
; GFX7-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
-; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
-; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
; GFX7-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
@@ -22346,7 +22345,6 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX7-NEXT: v_max_f32_e32 v14, v14, v30
; GFX7-NEXT: v_max_f32_e32 v13, v13, v29
; GFX7-NEXT: v_max_f32_e32 v12, v12, v28
-; GFX7-NEXT: v_max_f32_e32 v11, v11, v27
; GFX7-NEXT: v_max_f32_e32 v10, v10, v26
; GFX7-NEXT: v_max_f32_e32 v9, v9, v25
; GFX7-NEXT: v_max_f32_e32 v8, v8, v24
@@ -22365,7 +22363,7 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22
+; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v27
; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
; GFX7-NEXT: v_max_f32_e32 v15, v15, v22
; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
@@ -23823,10 +23821,10 @@ define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX8-NEXT: v_lshrrev_b32_e32 v8, 16, v8
; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v9
; GFX8-NEXT: v_lshrrev_b32_e32 v10, 16, v10
+; GFX8-NEXT: v_lshrrev_b32_e32 v11, 16, v11
; GFX8-NEXT: v_lshrrev_b32_e32 v16, 16, v30
; GFX8-NEXT: v_lshrrev_b32_e32 v13, 16, v13
; GFX8-NEXT: v_lshrrev_b32_e32 v12, 16, v12
-; GFX8-NEXT: v_lshrrev_b32_e32 v11, 16, v11
; GFX8-NEXT: v_alignbit_b32 v0, v0, v17, 16
; GFX8-NEXT: v_alignbit_b32 v1, v1, v18, 16
; GFX8-NEXT: v_alignbit_b32 v2, v2, v19, 16
@@ -24129,278 +24127,278 @@ define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: buffer_load_dword v32, off, s[0:3], s32
+; GFX10-NEXT: v_lshlrev_b32_e32 v37, 16, v28
+; GFX10-NEXT: v_lshlrev_b32_e32 v38, 16, v12
+; GFX10-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
+; GFX10-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
; GFX10-NEXT: v_lshlrev_b32_e32 v39, 16, v27
; GFX10-NEXT: v_lshlrev_b32_e32 v48, 16, v11
; GFX10-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
; GFX10-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
; GFX10-NEXT: v_lshlrev_b32_e32 v49, 16, v26
; GFX10-NEXT: v_lshlrev_b32_e32 v50, 16, v10
-; GFX10-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
-; GFX10-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
-; GFX10-NEXT: v_lshlrev_b32_e32 v37, 16, v28
-; GFX10-NEXT: v_lshlrev_b32_e32 v38, 16, v12
-; GFX10-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
-; GFX10-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
-; GFX10-NEXT: v_lshlrev_b32_e32 v51, 16, v25
-; GFX10-NEXT: v_lshlrev_b32_e32 v52, 16, v9
-; GFX10-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
-; GFX10-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
-; GFX10-NEXT: v_lshlrev_b32_e32 v53, 16, v24
-; GFX10-NEXT: v_lshlrev_b32_e32 v54, 16, v8
-; GFX10-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
-; GFX10-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
-; GFX10-NEXT: v_lshlrev_b32_e32 v55, 16, v23
-; GFX10-NEXT: v_lshlrev_b32_e32 v64, 16, v7
-; GFX10-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
-; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
-; GFX10-NEXT: v_lshlrev_b32_e32 v65, 16, v22
-; GFX10-NEXT: v_lshlrev_b32_e32 v66, 16, v6
-; GFX10-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
-; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
-; GFX10-NEXT: v_lshlrev_b32_e32 v67, 16, v21
-; GFX10-NEXT: v_lshlrev_b32_e32 v68, 16, v5
-; GFX10-NEXT: v_max_f32_e32 v39, v48, v39
-; GFX10-NEXT: v_max_f32_e32 v11, v11, v27
-; GFX10-NEXT: v_max_f32_e32 v49, v50, v49
-; GFX10-NEXT: v_max_f32_e32 v10, v10, v26
+; GFX10-NEXT: v_lshlrev_b32_e32 v33, 16, v30
+; GFX10-NEXT: v_lshlrev_b32_e32 v34, 16, v14
+; GFX10-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
+; GFX10-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
; GFX10-NEXT: v_lshlrev_b32_e32 v35, 16, v29
; GFX10-NEXT: v_lshlrev_b32_e32 v36, 16, v13
; GFX10-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
; GFX10-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
-; GFX10-NEXT: v_max_f32_e32 v37, v38, v37
-; GFX10-NEXT: v_lshlrev_b32_e32 v38, 16, v18
; GFX10-NEXT: v_max_f32_e32 v12, v12, v28
-; GFX10-NEXT: v_lshlrev_b32_e32 v28, 16, v2
+; GFX10-NEXT: v_lshlrev_b32_e32 v28, 16, v22
+; GFX10-NEXT: v_max_f32_e32 v39, v48, v39
+; GFX10-NEXT: v_lshlrev_b32_e32 v48, 16, v6
+; GFX10-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
+; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GFX10-NEXT: v_max_f32_e32 v11, v11, v27
+; GFX10-NEXT: v_lshlrev_b32_e32 v27, 16, v21
+; GFX10-NEXT: v_max_f32_e32 v49, v50, v49
+; GFX10-NEXT: v_lshlrev_b32_e32 v50, 16, v5
+; GFX10-NEXT: v_max_f32_e32 v33, v34, v33
+; GFX10-NEXT: v_max_f32_e32 v14, v14, v30
+; GFX10-NEXT: v_lshlrev_b32_e32 v30, 16, v24
+; GFX10-NEXT: v_max_f32_e32 v35, v36, v35
+; GFX10-NEXT: v_lshlrev_b32_e32 v36, 16, v8
+; GFX10-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
+; GFX10-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; GFX10-NEXT: v_max_f32_e32 v13, v13, v29
+; GFX10-NEXT: v_lshlrev_b32_e32 v29, 16, v23
+; GFX10-NEXT: v_max_f32_e32 v37, v38, v37
+; GFX10-NEXT: v_lshlrev_b32_e32 v38, 16, v7
+; GFX10-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
+; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GFX10-NEXT: v_max_f32_e32 v6, v6, v22
+; GFX10-NEXT: v_lshlrev_b32_e32 v22, 16, v16
+; GFX10-NEXT: v_max_f32_e32 v27, v50, v27
+; GFX10-NEXT: v_lshlrev_b32_e32 v50, 16, v0
+; GFX10-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
+; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
+; GFX10-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; GFX10-NEXT: v_lshlrev_b32_e32 v51, 16, v25
+; GFX10-NEXT: v_lshlrev_b32_e32 v34, 16, v9
+; GFX10-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
+; GFX10-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; GFX10-NEXT: v_max_f32_e32 v8, v8, v24
+; GFX10-NEXT: v_lshlrev_b32_e32 v24, 16, v18
+; GFX10-NEXT: v_max_f32_e32 v29, v38, v29
+; GFX10-NEXT: v_lshlrev_b32_e32 v38, 16, v2
; GFX10-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX10-NEXT: v_lshlrev_b32_e32 v48, 16, v17
-; GFX10-NEXT: v_lshlrev_b32_e32 v27, 16, v1
+; GFX10-NEXT: v_max_f32_e32 v7, v7, v23
+; GFX10-NEXT: v_lshlrev_b32_e32 v23, 16, v17
+; GFX10-NEXT: v_max_f32_e32 v28, v48, v28
+; GFX10-NEXT: v_lshlrev_b32_e32 v48, 16, v1
; GFX10-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX10-NEXT: v_lshlrev_b32_e32 v50, 16, v16
-; GFX10-NEXT: v_lshlrev_b32_e32 v26, 16, v0
-; GFX10-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
-; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT: v_max_f32_e32 v0, v0, v16
+; GFX10-NEXT: v_bfe_u32 v16, v33, 16, 1
+; GFX10-NEXT: v_max_f32_e32 v10, v10, v26
+; GFX10-NEXT: v_lshlrev_b32_e32 v26, 16, v20
+; GFX10-NEXT: v_max_f32_e32 v34, v34, v51
+; GFX10-NEXT: v_lshlrev_b32_e32 v51, 16, v4
+; GFX10-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
+; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GFX10-NEXT: v_max_f32_e32 v9, v9, v25
-; GFX10-NEXT: v_max_f32_e32 v25, v54, v53
-; GFX10-NEXT: v_max_f32_e32 v8, v8, v24
-; GFX10-NEXT: v_max_f32_e32 v24, v64, v55
-; GFX10-NEXT: v_max_f32_e32 v7, v7, v23
-; GFX10-NEXT: v_max_f32_e32 v23, v66, v65
-; GFX10-NEXT: v_max_f32_e32 v6, v6, v22
-; GFX10-NEXT: v_max_f32_e32 v22, v68, v67
-; GFX10-NEXT: v_bfe_u32 v53, v39, 16, 1
-; GFX10-NEXT: v_bfe_u32 v55, v11, 16, 1
-; GFX10-NEXT: v_bfe_u32 v65, v49, 16, 1
-; GFX10-NEXT: v_bfe_u32 v67, v10, 16, 1
-; GFX10-NEXT: v_lshlrev_b32_e32 v33, 16, v30
-; GFX10-NEXT: v_lshlrev_b32_e32 v34, 16, v14
-; GFX10-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
-; GFX10-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
-; GFX10-NEXT: v_max_f32_e32 v35, v36, v35
-; GFX10-NEXT: v_lshlrev_b32_e32 v36, 16, v19
-; GFX10-NEXT: v_max_f32_e32 v13, v13, v29
-; GFX10-NEXT: v_lshlrev_b32_e32 v29, 16, v3
+; GFX10-NEXT: v_lshlrev_b32_e32 v25, 16, v19
+; GFX10-NEXT: v_max_f32_e32 v30, v36, v30
+; GFX10-NEXT: v_lshlrev_b32_e32 v36, 16, v3
; GFX10-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX10-NEXT: v_max_f32_e32 v2, v2, v18
-; GFX10-NEXT: v_max_f32_e32 v18, v27, v48
+; GFX10-NEXT: v_max_f32_e32 v18, v48, v23
; GFX10-NEXT: v_max_f32_e32 v1, v1, v17
-; GFX10-NEXT: v_max_f32_e32 v17, v26, v50
-; GFX10-NEXT: v_max_f32_e32 v0, v0, v16
-; GFX10-NEXT: v_or_b32_e32 v54, 0x400000, v39
-; GFX10-NEXT: v_or_b32_e32 v64, 0x400000, v11
-; GFX10-NEXT: v_or_b32_e32 v66, 0x400000, v49
-; GFX10-NEXT: v_or_b32_e32 v68, 0x400000, v10
-; GFX10-NEXT: v_cmp_u_f32_e64 s9, v39, v39
-; GFX10-NEXT: v_add3_u32 v39, v53, v39, 0x7fff
-; GFX10-NEXT: v_cmp_u_f32_e64 s10, v11, v11
-; GFX10-NEXT: v_add3_u32 v11, v55, v11, 0x7fff
-; GFX10-NEXT: v_cmp_u_f32_e64 s11, v49, v49
-; GFX10-NEXT: v_add3_u32 v49, v65, v49, 0x7fff
-; GFX10-NEXT: v_cmp_u_f32_e64 s12, v10, v10
-; GFX10-NEXT: v_add3_u32 v10, v67, v10, 0x7fff
+; GFX10-NEXT: v_max_f32_e32 v17, v50, v22
+; GFX10-NEXT: v_or_b32_e32 v22, 0x400000, v33
+; GFX10-NEXT: v_bfe_u32 v23, v14, 16, 1
+; GFX10-NEXT: v_add3_u32 v16, v16, v33, 0x7fff
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33
; GFX10-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; GFX10-NEXT: v_max_f32_e32 v33, v34, v33
-; GFX10-NEXT: v_lshlrev_b32_e32 v34, 16, v20
-; GFX10-NEXT: v_max_f32_e32 v14, v14, v30
-; GFX10-NEXT: v_lshlrev_b32_e32 v30, 16, v4
-; GFX10-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
-; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX10-NEXT: v_max_f32_e32 v4, v4, v20
+; GFX10-NEXT: v_max_f32_e32 v20, v36, v25
; GFX10-NEXT: v_max_f32_e32 v3, v3, v19
-; GFX10-NEXT: v_max_f32_e32 v19, v28, v38
-; GFX10-NEXT: v_bfe_u32 v38, v37, 16, 1
-; GFX10-NEXT: v_bfe_u32 v50, v12, 16, 1
-; GFX10-NEXT: v_cndmask_b32_e64 v39, v39, v54, s9
-; GFX10-NEXT: v_bfe_u32 v54, v18, 16, 1
-; GFX10-NEXT: v_cndmask_b32_e64 v11, v11, v64, s10
-; GFX10-NEXT: v_bfe_u32 v64, v1, 16, 1
-; GFX10-NEXT: v_cndmask_b32_e64 v49, v49, v66, s11
-; GFX10-NEXT: v_bfe_u32 v66, v17, 16, 1
-; GFX10-NEXT: v_cndmask_b32_e64 v10, v10, v68, s12
-; GFX10-NEXT: v_bfe_u32 v68, v0, 16, 1
-; GFX10-NEXT: v_max_f32_e32 v51, v52, v51
+; GFX10-NEXT: v_max_f32_e32 v19, v38, v24
+; GFX10-NEXT: v_or_b32_e32 v24, 0x400000, v14
+; GFX10-NEXT: v_bfe_u32 v25, v35, 16, 1
+; GFX10-NEXT: v_add3_u32 v23, v23, v14, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v16, v16, v22, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14
; GFX10-NEXT: v_max_f32_e32 v5, v5, v21
-; GFX10-NEXT: v_max_f32_e32 v21, v30, v34
-; GFX10-NEXT: v_max_f32_e32 v4, v4, v20
-; GFX10-NEXT: v_max_f32_e32 v20, v29, v36
-; GFX10-NEXT: v_bfe_u32 v16, v33, 16, 1
-; GFX10-NEXT: v_bfe_u32 v27, v14, 16, 1
-; GFX10-NEXT: v_bfe_u32 v29, v35, 16, 1
-; GFX10-NEXT: v_bfe_u32 v34, v13, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v48, 0x400000, v37
-; GFX10-NEXT: v_or_b32_e32 v52, 0x400000, v12
-; GFX10-NEXT: v_cmp_u_f32_e64 s7, v37, v37
-; GFX10-NEXT: v_add3_u32 v37, v38, v37, 0x7fff
-; GFX10-NEXT: v_cmp_u_f32_e64 s8, v12, v12
-; GFX10-NEXT: v_add3_u32 v12, v50, v12, 0x7fff
-; GFX10-NEXT: v_cmp_u_f32_e64 s10, v18, v18
-; GFX10-NEXT: v_add3_u32 v54, v54, v18, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v18, 0x400000, v18
-; GFX10-NEXT: v_cmp_u_f32_e64 s11, v1, v1
-; GFX10-NEXT: v_add3_u32 v64, v64, v1, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v1, 0x400000, v1
-; GFX10-NEXT: v_cmp_u_f32_e64 s12, v17, v17
-; GFX10-NEXT: v_add3_u32 v66, v66, v17, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v17, 0x400000, v17
-; GFX10-NEXT: v_cmp_u_f32_e64 s22, v0, v0
-; GFX10-NEXT: v_add3_u32 v68, v68, v0, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v0, 0x400000, v0
-; GFX10-NEXT: v_or_b32_e32 v26, 0x400000, v33
-; GFX10-NEXT: v_or_b32_e32 v28, 0x400000, v14
-; GFX10-NEXT: v_or_b32_e32 v30, 0x400000, v35
-; GFX10-NEXT: v_or_b32_e32 v36, 0x400000, v13
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33
-; GFX10-NEXT: v_add3_u32 v16, v16, v33, 0x7fff
-; GFX10-NEXT: v_bfe_u32 v33, v51, 16, 1
-; GFX10-NEXT: v_cmp_u_f32_e64 s4, v14, v14
-; GFX10-NEXT: v_add3_u32 v14, v27, v14, 0x7fff
-; GFX10-NEXT: v_cmp_u_f32_e64 s5, v35, v35
-; GFX10-NEXT: v_add3_u32 v29, v29, v35, 0x7fff
-; GFX10-NEXT: v_cmp_u_f32_e64 s6, v13, v13
-; GFX10-NEXT: v_add3_u32 v13, v34, v13, 0x7fff
-; GFX10-NEXT: v_bfe_u32 v65, v24, 16, 1
-; GFX10-NEXT: v_cndmask_b32_e64 v37, v37, v48, s7
-; GFX10-NEXT: v_bfe_u32 v48, v19, 16, 1
-; GFX10-NEXT: v_cndmask_b32_e64 v12, v12, v52, s8
-; GFX10-NEXT: v_bfe_u32 v52, v2, 16, 1
-; GFX10-NEXT: v_cndmask_b32_e64 v18, v54, v18, s10
-; GFX10-NEXT: v_cndmask_b32_e64 v17, v66, v17, s12
-; GFX10-NEXT: v_cndmask_b32_e64 v0, v68, v0, s22
-; GFX10-NEXT: v_cndmask_b32_e64 v1, v64, v1, s11
+; GFX10-NEXT: v_max_f32_e32 v21, v51, v26
+; GFX10-NEXT: v_or_b32_e32 v26, 0x400000, v35
+; GFX10-NEXT: v_bfe_u32 v36, v13, 16, 1
+; GFX10-NEXT: v_add3_u32 v25, v25, v35, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v23, v23, v24, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35
+; GFX10-NEXT: v_or_b32_e32 v38, 0x400000, v13
+; GFX10-NEXT: v_bfe_u32 v48, v37, 16, 1
+; GFX10-NEXT: v_add3_u32 v36, v36, v13, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v50, 0x400000, v37
+; GFX10-NEXT: v_cndmask_b32_e32 v25, v25, v26, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
+; GFX10-NEXT: v_bfe_u32 v51, v12, 16, 1
+; GFX10-NEXT: v_add3_u32 v48, v48, v37, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v33, 0x400000, v12
+; GFX10-NEXT: v_bfe_u32 v22, v39, 16, 1
+; GFX10-NEXT: v_cndmask_b32_e32 v36, v36, v38, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37
+; GFX10-NEXT: v_add3_u32 v51, v51, v12, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v14, 0x400000, v39
+; GFX10-NEXT: v_bfe_u32 v24, v11, 16, 1
+; GFX10-NEXT: v_add3_u32 v22, v22, v39, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v48, v48, v50, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX10-NEXT: v_or_b32_e32 v35, 0x400000, v11
+; GFX10-NEXT: v_bfe_u32 v26, v49, 16, 1
+; GFX10-NEXT: v_add3_u32 v24, v24, v11, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v13, 0x400000, v49
+; GFX10-NEXT: v_cndmask_b32_e32 v33, v51, v33, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v39, v39
+; GFX10-NEXT: v_bfe_u32 v38, v10, 16, 1
+; GFX10-NEXT: v_add3_u32 v26, v26, v49, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v37, 0x400000, v10
+; GFX10-NEXT: v_bfe_u32 v50, v34, 16, 1
+; GFX10-NEXT: v_cndmask_b32_e32 v14, v22, v14, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX10-NEXT: v_add3_u32 v38, v38, v10, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v12, 0x400000, v34
+; GFX10-NEXT: v_bfe_u32 v51, v9, 16, 1
+; GFX10-NEXT: v_add3_u32 v50, v50, v34, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v24, v24, v35, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v49, v49
+; GFX10-NEXT: v_or_b32_e32 v39, 0x400000, v9
+; GFX10-NEXT: v_bfe_u32 v22, v30, 16, 1
+; GFX10-NEXT: v_add3_u32 v51, v51, v9, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v11, 0x400000, v30
+; GFX10-NEXT: v_cndmask_b32_e32 v13, v26, v13, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX10-NEXT: v_bfe_u32 v35, v8, 16, 1
+; GFX10-NEXT: v_add3_u32 v22, v22, v30, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v49, 0x400000, v8
+; GFX10-NEXT: v_bfe_u32 v26, v29, 16, 1
+; GFX10-NEXT: v_cndmask_b32_e32 v37, v38, v37, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34
+; GFX10-NEXT: v_add3_u32 v35, v35, v8, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v29
+; GFX10-NEXT: v_bfe_u32 v38, v7, 16, 1
+; GFX10-NEXT: v_add3_u32 v26, v26, v29, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v12, v50, v12, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX10-NEXT: v_or_b32_e32 v34, 0x400000, v7
+; GFX10-NEXT: v_bfe_u32 v50, v28, 16, 1
+; GFX10-NEXT: v_add3_u32 v38, v38, v7, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v28
+; GFX10-NEXT: v_cndmask_b32_e32 v39, v51, v39, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30
+; GFX10-NEXT: v_bfe_u32 v51, v6, 16, 1
+; GFX10-NEXT: v_add3_u32 v50, v50, v28, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v30, 0x400000, v6
; GFX10-NEXT: v_lshlrev_b32_e32 v31, 16, v15
+; GFX10-NEXT: v_cndmask_b32_e32 v11, v22, v11, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX10-NEXT: v_bfe_u32 v22, v27, 16, 1
+; GFX10-NEXT: v_add3_u32 v51, v51, v6, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v27
; GFX10-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
-; GFX10-NEXT: v_or_b32_e32 v27, 0x400000, v51
-; GFX10-NEXT: v_bfe_u32 v35, v9, 16, 1
-; GFX10-NEXT: v_bfe_u32 v38, v25, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v67, 0x400000, v24
-; GFX10-NEXT: v_cmp_u_f32_e64 s13, v51, v51
-; GFX10-NEXT: v_add3_u32 v33, v33, v51, 0x7fff
-; GFX10-NEXT: v_bfe_u32 v51, v7, 16, 1
-; GFX10-NEXT: v_cmp_u_f32_e64 s17, v24, v24
-; GFX10-NEXT: v_add3_u32 v24, v65, v24, 0x7fff
-; GFX10-NEXT: v_bfe_u32 v65, v6, 16, 1
-; GFX10-NEXT: v_cndmask_b32_e32 v16, v16, v26, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v35, v35, v49, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29
+; GFX10-NEXT: v_bfe_u32 v49, v5, 16, 1
+; GFX10-NEXT: v_add3_u32 v22, v22, v27, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v29, 0x400000, v5
+; GFX10-NEXT: v_cndmask_b32_e32 v10, v26, v10, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
; GFX10-NEXT: v_bfe_u32 v26, v21, 16, 1
-; GFX10-NEXT: v_cndmask_b32_e64 v14, v14, v28, s4
-; GFX10-NEXT: v_bfe_u32 v28, v4, 16, 1
-; GFX10-NEXT: v_cndmask_b32_e64 v29, v29, v30, s5
-; GFX10-NEXT: v_bfe_u32 v30, v20, 16, 1
-; GFX10-NEXT: v_cndmask_b32_e64 v13, v13, v36, s6
-; GFX10-NEXT: v_bfe_u32 v36, v3, 16, 1
-; GFX10-NEXT: v_cmp_u_f32_e64 s8, v19, v19
-; GFX10-NEXT: v_add3_u32 v48, v48, v19, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v19, 0x400000, v19
-; GFX10-NEXT: v_cmp_u_f32_e64 s9, v2, v2
-; GFX10-NEXT: v_add3_u32 v52, v52, v2, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v2
-; GFX10-NEXT: v_perm_b32 v0, v0, v17, 0x7060302
-; GFX10-NEXT: v_perm_b32 v1, v1, v18, 0x7060302
-; GFX10-NEXT: v_or_b32_e32 v34, 0x400000, v9
-; GFX10-NEXT: v_or_b32_e32 v50, 0x400000, v25
-; GFX10-NEXT: v_bfe_u32 v53, v8, 16, 1
-; GFX10-NEXT: v_cmp_u_f32_e64 s14, v9, v9
-; GFX10-NEXT: v_add3_u32 v9, v35, v9, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v35, 0x400000, v7
-; GFX10-NEXT: v_cmp_u_f32_e64 s15, v25, v25
-; GFX10-NEXT: v_add3_u32 v25, v38, v25, 0x7fff
-; GFX10-NEXT: v_bfe_u32 v38, v23, 16, 1
-; GFX10-NEXT: v_cmp_u_f32_e64 s18, v7, v7
-; GFX10-NEXT: v_add3_u32 v7, v51, v7, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v51, 0x400000, v6
-; GFX10-NEXT: v_cmp_u_f32_e64 s20, v6, v6
-; GFX10-NEXT: v_add3_u32 v6, v65, v6, 0x7fff
-; GFX10-NEXT: v_bfe_u32 v65, v5, 16, 1
-; GFX10-NEXT: v_cmp_u_f32_e64 s4, v21, v21
+; GFX10-NEXT: v_add3_u32 v49, v49, v5, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v21
+; GFX10-NEXT: v_cndmask_b32_e32 v34, v38, v34, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28
+; GFX10-NEXT: v_bfe_u32 v38, v4, 16, 1
; GFX10-NEXT: v_add3_u32 v26, v26, v21, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v21, 0x400000, v21
-; GFX10-NEXT: v_cmp_u_f32_e64 s5, v4, v4
-; GFX10-NEXT: v_add3_u32 v28, v28, v4, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v4
-; GFX10-NEXT: v_cmp_u_f32_e64 s6, v20, v20
-; GFX10-NEXT: v_add3_u32 v30, v30, v20, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v20, 0x400000, v20
-; GFX10-NEXT: v_cmp_u_f32_e64 s7, v3, v3
-; GFX10-NEXT: v_add3_u32 v36, v36, v3, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v3, 0x400000, v3
-; GFX10-NEXT: v_cndmask_b32_e64 v19, v48, v19, s8
-; GFX10-NEXT: v_cndmask_b32_e64 v2, v52, v2, s9
-; GFX10-NEXT: v_or_b32_e32 v55, 0x400000, v8
-; GFX10-NEXT: v_cmp_u_f32_e64 s16, v8, v8
-; GFX10-NEXT: v_add3_u32 v8, v53, v8, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v53, 0x400000, v23
-; GFX10-NEXT: v_cmp_u_f32_e64 s19, v23, v23
-; GFX10-NEXT: v_add3_u32 v23, v38, v23, 0x7fff
-; GFX10-NEXT: v_bfe_u32 v38, v22, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v28, 0x400000, v4
+; GFX10-NEXT: v_cndmask_b32_e32 v9, v50, v9, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX10-NEXT: v_bfe_u32 v50, v20, 16, 1
+; GFX10-NEXT: v_add3_u32 v38, v38, v4, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v20
+; GFX10-NEXT: v_cndmask_b32_e32 v30, v51, v30, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v27, v27
+; GFX10-NEXT: v_add3_u32 v50, v50, v20, 0x7fff
+; GFX10-NEXT: v_bfe_u32 v51, v3, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v27, 0x400000, v3
+; GFX10-NEXT: v_cndmask_b32_e32 v8, v22, v8, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX10-NEXT: v_add3_u32 v65, v65, v5, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v5
-; GFX10-NEXT: v_cndmask_b32_e64 v21, v26, v21, s4
-; GFX10-NEXT: v_cndmask_b32_e64 v4, v28, v4, s5
-; GFX10-NEXT: v_cndmask_b32_e64 v20, v30, v20, s6
-; GFX10-NEXT: v_cndmask_b32_e64 v3, v36, v3, s7
-; GFX10-NEXT: v_perm_b32 v2, v2, v19, 0x7060302
-; GFX10-NEXT: v_cmp_u_f32_e64 s21, v22, v22
-; GFX10-NEXT: v_add3_u32 v38, v38, v22, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v22, 0x400000, v22
-; GFX10-NEXT: v_cndmask_b32_e32 v5, v65, v5, vcc_lo
-; GFX10-NEXT: v_perm_b32 v3, v3, v20, 0x7060302
-; GFX10-NEXT: v_perm_b32 v4, v4, v21, 0x7060302
-; GFX10-NEXT: v_cndmask_b32_e64 v27, v33, v27, s13
-; GFX10-NEXT: v_cndmask_b32_e64 v9, v9, v34, s14
-; GFX10-NEXT: v_cndmask_b32_e64 v25, v25, v50, s15
-; GFX10-NEXT: v_cndmask_b32_e64 v8, v8, v55, s16
-; GFX10-NEXT: v_cndmask_b32_e64 v24, v24, v67, s17
-; GFX10-NEXT: v_cndmask_b32_e64 v7, v7, v35, s18
-; GFX10-NEXT: v_cndmask_b32_e64 v23, v23, v53, s19
-; GFX10-NEXT: v_cndmask_b32_e64 v6, v6, v51, s20
-; GFX10-NEXT: v_cndmask_b32_e64 v22, v38, v22, s21
-; GFX10-NEXT: v_perm_b32 v8, v8, v25, 0x7060302
-; GFX10-NEXT: v_perm_b32 v7, v7, v24, 0x7060302
-; GFX10-NEXT: v_perm_b32 v9, v9, v27, 0x7060302
-; GFX10-NEXT: v_perm_b32 v6, v6, v23, 0x7060302
-; GFX10-NEXT: v_perm_b32 v5, v5, v22, 0x7060302
-; GFX10-NEXT: v_perm_b32 v10, v10, v49, 0x7060302
-; GFX10-NEXT: v_perm_b32 v11, v11, v39, 0x7060302
-; GFX10-NEXT: v_perm_b32 v12, v12, v37, 0x7060302
-; GFX10-NEXT: v_perm_b32 v13, v13, v29, 0x7060302
-; GFX10-NEXT: v_perm_b32 v14, v14, v16, 0x7060302
+; GFX10-NEXT: v_bfe_u32 v22, v19, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v19
+; GFX10-NEXT: v_add3_u32 v51, v51, v3, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v29, v49, v29, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21
+; GFX10-NEXT: v_add3_u32 v22, v22, v19, 0x7fff
+; GFX10-NEXT: v_bfe_u32 v49, v2, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v21, 0x400000, v2
+; GFX10-NEXT: v_cndmask_b32_e32 v7, v26, v7, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX10-NEXT: v_bfe_u32 v26, v18, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v18
+; GFX10-NEXT: v_add3_u32 v49, v49, v2, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v28, v38, v28, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20
+; GFX10-NEXT: v_bfe_u32 v38, v1, 16, 1
+; GFX10-NEXT: v_add3_u32 v26, v26, v18, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v20, 0x400000, v1
+; GFX10-NEXT: v_cndmask_b32_e32 v6, v50, v6, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19
+; GFX10-NEXT: v_bfe_u32 v50, v17, 16, 1
+; GFX10-NEXT: v_add3_u32 v38, v38, v1, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v19, 0x400000, v17
+; GFX10-NEXT: v_cndmask_b32_e32 v5, v22, v5, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18
+; GFX10-NEXT: v_bfe_u32 v22, v0, 16, 1
+; GFX10-NEXT: v_add3_u32 v50, v50, v17, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v18, 0x400000, v0
+; GFX10-NEXT: v_cndmask_b32_e32 v4, v26, v4, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX10-NEXT: v_add3_u32 v22, v22, v0, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v1, v38, v20, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
+; GFX10-NEXT: v_perm_b32 v1, v1, v4, 0x7060302
+; GFX10-NEXT: v_cndmask_b32_e32 v17, v50, v19, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX10-NEXT: v_perm_b32 v4, v28, v7, 0x7060302
+; GFX10-NEXT: v_perm_b32 v7, v34, v10, 0x7060302
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v22, v18, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX10-NEXT: v_perm_b32 v0, v0, v17, 0x7060302
+; GFX10-NEXT: v_cndmask_b32_e32 v2, v49, v21, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX10-NEXT: v_perm_b32 v2, v2, v5, 0x7060302
+; GFX10-NEXT: v_cndmask_b32_e32 v3, v51, v27, vcc_lo
+; GFX10-NEXT: v_perm_b32 v5, v29, v8, 0x7060302
+; GFX10-NEXT: v_perm_b32 v8, v35, v11, 0x7060302
+; GFX10-NEXT: v_perm_b32 v3, v3, v6, 0x7060302
+; GFX10-NEXT: v_perm_b32 v6, v30, v9, 0x7060302
+; GFX10-NEXT: v_perm_b32 v9, v39, v12, 0x7060302
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v32
; GFX10-NEXT: v_and_b32_e32 v18, 0xffff0000, v32
; GFX10-NEXT: v_max_f32_e32 v17, v31, v17
; GFX10-NEXT: v_max_f32_e32 v15, v15, v18
-; GFX10-NEXT: v_bfe_u32 v18, v17, 16, 1
-; GFX10-NEXT: v_bfe_u32 v19, v15, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v20, 0x400000, v17
-; GFX10-NEXT: v_or_b32_e32 v21, 0x400000, v15
+; GFX10-NEXT: v_bfe_u32 v10, v17, 16, 1
+; GFX10-NEXT: v_bfe_u32 v11, v15, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v12, 0x400000, v17
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
-; GFX10-NEXT: v_cmp_u_f32_e64 s4, v15, v15
-; GFX10-NEXT: v_add3_u32 v17, v18, v17, 0x7fff
-; GFX10-NEXT: v_add3_u32 v15, v19, v15, 0x7fff
-; GFX10-NEXT: v_cndmask_b32_e32 v17, v17, v20, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e64 v15, v15, v21, s4
+; GFX10-NEXT: v_or_b32_e32 v19, 0x400000, v15
+; GFX10-NEXT: v_add3_u32 v18, v10, v17, 0x7fff
+; GFX10-NEXT: v_add3_u32 v11, v11, v15, 0x7fff
+; GFX10-NEXT: v_perm_b32 v10, v37, v13, 0x7060302
+; GFX10-NEXT: v_perm_b32 v13, v36, v25, 0x7060302
+; GFX10-NEXT: v_cndmask_b32_e32 v17, v18, v12, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15
+; GFX10-NEXT: v_perm_b32 v12, v33, v48, 0x7060302
+; GFX10-NEXT: v_cndmask_b32_e32 v15, v11, v19, vcc_lo
+; GFX10-NEXT: v_perm_b32 v11, v24, v14, 0x7060302
+; GFX10-NEXT: v_perm_b32 v14, v23, v16, 0x7060302
; GFX10-NEXT: v_perm_b32 v15, v15, v17, 0x7060302
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
@@ -25054,26 +25052,26 @@ define bfloat @v_log_bf16(bfloat %a) {
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: s_mov_b32 s4, 0x800000
-; GCN-NEXT: v_mov_b32_e32 v1, 0x4f800000
; GCN-NEXT: s_mov_b32 s5, 0x7f800000
-; GCN-NEXT: v_mov_b32_e32 v2, 0x41b17218
+; GCN-NEXT: v_mov_b32_e32 v1, 0x41b17218
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; GCN-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc
-; GCN-NEXT: v_mul_f32_e32 v0, v0, v1
+; GCN-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 5, v2
+; GCN-NEXT: v_ldexp_f32_e32 v0, v0, v2
; GCN-NEXT: v_log_f32_e32 v0, v0
-; GCN-NEXT: v_and_b32_e32 v1, 0xfffff000, v0
-; GCN-NEXT: v_sub_f32_e32 v3, v0, v1
-; GCN-NEXT: v_mul_f32_e32 v4, 0x3805fdf4, v1
-; GCN-NEXT: v_mul_f32_e32 v1, 0x3f317000, v1
+; GCN-NEXT: v_and_b32_e32 v2, 0xfffff000, v0
+; GCN-NEXT: v_sub_f32_e32 v3, v0, v2
+; GCN-NEXT: v_mul_f32_e32 v4, 0x3805fdf4, v2
+; GCN-NEXT: v_mul_f32_e32 v2, 0x3f317000, v2
; GCN-NEXT: v_mul_f32_e32 v5, 0x3f317000, v3
; GCN-NEXT: v_mul_f32_e32 v3, 0x3805fdf4, v3
; GCN-NEXT: v_add_f32_e32 v3, v4, v3
; GCN-NEXT: v_add_f32_e32 v3, v5, v3
-; GCN-NEXT: v_add_f32_e32 v1, v1, v3
+; GCN-NEXT: v_add_f32_e32 v2, v2, v3
; GCN-NEXT: v_cmp_lt_f32_e64 s[4:5], |v0|, s5
-; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[4:5]
-; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc
+; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[4:5]
+; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
; GCN-NEXT: v_sub_f32_e32 v0, v0, v1
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
@@ -25084,10 +25082,10 @@ define bfloat @v_log_bf16(bfloat %a) {
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: s_mov_b32 s4, 0x800000
-; GFX7-NEXT: v_mov_b32_e32 v1, 0x4f800000
; GFX7-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; GFX7-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc
-; GFX7-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX7-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; GFX7-NEXT: v_ldexp_f32_e32 v0, v0, v1
; GFX7-NEXT: v_log_f32_e32 v0, v0
; GFX7-NEXT: s_mov_b32 s4, 0x3f317217
; GFX7-NEXT: v_mul_f32_e32 v1, 0x3f317217, v0
@@ -25109,10 +25107,10 @@ define bfloat @v_log_bf16(bfloat %a) {
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX8-NEXT: s_mov_b32 s4, 0x800000
-; GFX8-NEXT: v_mov_b32_e32 v1, 0x4f800000
; GFX8-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; GFX8-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc
-; GFX8-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; GFX8-NEXT: v_ldexp_f32 v0, v0, v1
; GFX8-NEXT: v_log_f32_e32 v0, v0
; GFX8-NEXT: s_mov_b32 s4, 0x7f800000
; GFX8-NEXT: v_and_b32_e32 v1, 0xfffff000, v0
@@ -25143,10 +25141,10 @@ define bfloat @v_log_bf16(bfloat %a) {
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX9-NEXT: s_mov_b32 s4, 0x800000
-; GFX9-NEXT: v_mov_b32_e32 v1, 0x4f800000
; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; GFX9-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc
-; GFX9-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX9-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; GFX9-NEXT: v_ldexp_f32 v0, v0, v1
; GFX9-NEXT: v_log_f32_e32 v0, v0
; GFX9-NEXT: s_mov_b32 s4, 0x3f317217
; GFX9-NEXT: v_mul_f32_e32 v1, 0x3f317217, v0
@@ -25174,8 +25172,9 @@ define bfloat @v_log_bf16(bfloat %a) {
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
-; GFX10-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo
-; GFX10-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; GFX10-NEXT: v_ldexp_f32 v0, v0, v1
; GFX10-NEXT: v_log_f32_e32 v0, v0
; GFX10-NEXT: v_mul_f32_e32 v1, 0x3f317217, v0
; GFX10-NEXT: v_fma_f32 v2, 0x3f317217, v0, -v1
@@ -25199,28 +25198,30 @@ define bfloat @v_log_bf16(bfloat %a) {
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
-; GFX11-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo
-; GFX11-NEXT: v_mul_f32_e32 v0, v0, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_ldexp_f32 v0, v0, v1
; GFX11-NEXT: v_log_f32_e32 v0, v0
; GFX11-NEXT: s_waitcnt_depctr 0xfff
; GFX11-NEXT: v_mul_f32_e32 v1, 0x3f317217, v0
-; GFX11-NEXT: v_fma_f32 v2, 0x3f317217, v0, -v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_fma_f32 v2, 0x3f317217, v0, -v1
; GFX11-NEXT: v_fmamk_f32 v2, v0, 0x3377d1cf, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_add_f32_e32 v1, v1, v2
; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 0x41b17218, vcc_lo
; GFX11-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v0|
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_sub_f32_e32 v0, v0, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX11-NEXT: s_setpc_b64 s[30:31]
%op = call bfloat @llvm.log.bf16(bfloat %a)
@@ -25233,14 +25234,14 @@ define bfloat @v_log2_bf16(bfloat %a) {
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: s_mov_b32 s4, 0x800000
-; GCN-NEXT: v_mov_b32_e32 v1, 0x4f800000
-; GCN-NEXT: v_mov_b32_e32 v2, 0x42000000
+; GCN-NEXT: v_mov_b32_e32 v1, 0x42000000
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; GCN-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc
-; GCN-NEXT: v_mul_f32_e32 v0, v0, v1
+; GCN-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 5, v2
+; GCN-NEXT: v_ldexp_f32_e32 v0, v0, v2
; GCN-NEXT: v_log_f32_e32 v0, v0
-; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc
+; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
; GCN-NEXT: v_sub_f32_e32 v0, v0, v1
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
@@ -25251,10 +25252,10 @@ define bfloat @v_log2_bf16(bfloat %a) {
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: s_mov_b32 s4, 0x800000
-; GFX7-NEXT: v_mov_b32_e32 v1, 0x4f800000
; GFX7-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; GFX7-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc
-; GFX7-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX7-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; GFX7-NEXT: v_ldexp_f32_e32 v0, v0, v1
; GFX7-NEXT: v_log_f32_e32 v0, v0
; GFX7-NEXT: v_mov_b32_e32 v1, 0x42000000
; GFX7-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
@@ -25267,10 +25268,10 @@ define bfloat @v_log2_bf16(bfloat %a) {
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX8-NEXT: s_mov_b32 s4, 0x800000
-; GFX8-NEXT: v_mov_b32_e32 v1, 0x4f800000
; GFX8-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; GFX8-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc
-; GFX8-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; GFX8-NEXT: v_ldexp_f32 v0, v0, v1
; GFX8-NEXT: v_log_f32_e32 v0, v0
; GFX8-NEXT: v_mov_b32_e32 v1, 0x42000000
; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
@@ -25290,9 +25291,9 @@ define bfloat @v_log2_bf16(bfloat %a) {
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX9-NEXT: s_mov_b32 s4, 0x800000
; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; GFX9-NEXT: v_mov_b32_e32 v2, 0x4f800000
-; GFX9-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc
-; GFX9-NEXT: v_mul_f32_e32 v0, v0, v2
+; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 5, v2
+; GFX9-NEXT: v_ldexp_f32 v0, v0, v2
; GFX9-NEXT: v_log_f32_e32 v0, v0
; GFX9-NEXT: v_mov_b32_e32 v1, 0x42000000
; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
@@ -25311,9 +25312,10 @@ define bfloat @v_log2_bf16(bfloat %a) {
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
-; GFX10-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo
-; GFX10-NEXT: v_mul_f32_e32 v0, v0, v2
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 5, v2
+; GFX10-NEXT: v_ldexp_f32 v0, v0, v2
; GFX10-NEXT: v_log_f32_e32 v0, v0
; GFX10-NEXT: v_sub_f32_e32 v0, v0, v1
; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1
@@ -25330,20 +25332,21 @@ define bfloat @v_log2_bf16(bfloat %a) {
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
-; GFX11-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, vcc_lo
+; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo
; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo
-; GFX11-NEXT: v_mul_f32_e32 v0, v0, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_lshlrev_b32_e32 v2, 5, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_ldexp_f32 v0, v0, v2
; GFX11-NEXT: v_log_f32_e32 v0, v0
; GFX11-NEXT: s_waitcnt_depctr 0xfff
; GFX11-NEXT: v_sub_f32_e32 v0, v0, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX11-NEXT: s_setpc_b64 s[30:31]
%op = call bfloat @llvm.log2.bf16(bfloat %a)
@@ -25356,26 +25359,26 @@ define bfloat @v_log10_bf16(bfloat %a) {
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: s_mov_b32 s4, 0x800000
-; GCN-NEXT: v_mov_b32_e32 v1, 0x4f800000
; GCN-NEXT: s_mov_b32 s5, 0x7f800000
-; GCN-NEXT: v_mov_b32_e32 v2, 0x411a209b
+; GCN-NEXT: v_mov_b32_e32 v1, 0x411a209b
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; GCN-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc
-; GCN-NEXT: v_mul_f32_e32 v0, v0, v1
+; GCN-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 5, v2
+; GCN-NEXT: v_ldexp_f32_e32 v0, v0, v2
; GCN-NEXT: v_log_f32_e32 v0, v0
-; GCN-NEXT: v_and_b32_e32 v1, 0xfffff000, v0
-; GCN-NEXT: v_sub_f32_e32 v3, v0, v1
-; GCN-NEXT: v_mul_f32_e32 v4, 0x369a84fb, v1
-; GCN-NEXT: v_mul_f32_e32 v1, 0x3e9a2000, v1
+; GCN-NEXT: v_and_b32_e32 v2, 0xfffff000, v0
+; GCN-NEXT: v_sub_f32_e32 v3, v0, v2
+; GCN-NEXT: v_mul_f32_e32 v4, 0x369a84fb, v2
+; GCN-NEXT: v_mul_f32_e32 v2, 0x3e9a2000, v2
; GCN-NEXT: v_mul_f32_e32 v5, 0x3e9a2000, v3
; GCN-NEXT: v_mul_f32_e32 v3, 0x369a84fb, v3
; GCN-NEXT: v_add_f32_e32 v3, v4, v3
; GCN-NEXT: v_add_f32_e32 v3, v5, v3
-; GCN-NEXT: v_add_f32_e32 v1, v1, v3
+; GCN-NEXT: v_add_f32_e32 v2, v2, v3
; GCN-NEXT: v_cmp_lt_f32_e64 s[4:5], |v0|, s5
-; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[4:5]
-; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc
+; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[4:5]
+; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
; GCN-NEXT: v_sub_f32_e32 v0, v0, v1
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
@@ -25386,10 +25389,10 @@ define bfloat @v_log10_bf16(bfloat %a) {
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: s_mov_b32 s4, 0x800000
-; GFX7-NEXT: v_mov_b32_e32 v1, 0x4f800000
; GFX7-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; GFX7-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc
-; GFX7-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX7-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; GFX7-NEXT: v_ldexp_f32_e32 v0, v0, v1
; GFX7-NEXT: v_log_f32_e32 v0, v0
; GFX7-NEXT: s_mov_b32 s4, 0x3e9a209a
; GFX7-NEXT: v_mul_f32_e32 v1, 0x3e9a209a, v0
@@ -25411,10 +25414,10 @@ define bfloat @v_log10_bf16(bfloat %a) {
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX8-NEXT: s_mov_b32 s4, 0x800000
-; GFX8-NEXT: v_mov_b32_e32 v1, 0x4f800000
; GFX8-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; GFX8-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc
-; GFX8-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; GFX8-NEXT: v_ldexp_f32 v0, v0, v1
; GFX8-NEXT: v_log_f32_e32 v0, v0
; GFX8-NEXT: s_mov_b32 s4, 0x7f800000
; GFX8-NEXT: v_and_b32_e32 v1, 0xfffff000, v0
@@ -25445,10 +25448,10 @@ define bfloat @v_log10_bf16(bfloat %a) {
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX9-NEXT: s_mov_b32 s4, 0x800000
-; GFX9-NEXT: v_mov_b32_e32 v1, 0x4f800000
; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; GFX9-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc
-; GFX9-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX9-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; GFX9-NEXT: v_ldexp_f32 v0, v0, v1
; GFX9-NEXT: v_log_f32_e32 v0, v0
; GFX9-NEXT: s_mov_b32 s4, 0x3e9a209a
; GFX9-NEXT: v_mul_f32_e32 v1, 0x3e9a209a, v0
@@ -25476,8 +25479,9 @@ define bfloat @v_log10_bf16(bfloat %a) {
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
-; GFX10-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo
-; GFX10-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; GFX10-NEXT: v_ldexp_f32 v0, v0, v1
; GFX10-NEXT: v_log_f32_e32 v0, v0
; GFX10-NEXT: v_mul_f32_e32 v1, 0x3e9a209a, v0
; GFX10-NEXT: v_fma_f32 v2, 0x3e9a209a, v0, -v1
@@ -25501,28 +25505,30 @@ define bfloat @v_log10_bf16(bfloat %a) {
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
-; GFX11-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo
-; GFX11-NEXT: v_mul_f32_e32 v0, v0, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_ldexp_f32 v0, v0, v1
; GFX11-NEXT: v_log_f32_e32 v0, v0
; GFX11-NEXT: s_waitcnt_depctr 0xfff
; GFX11-NEXT: v_mul_f32_e32 v1, 0x3e9a209a, v0
-; GFX11-NEXT: v_fma_f32 v2, 0x3e9a209a, v0, -v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_fma_f32 v2, 0x3e9a209a, v0, -v1
; GFX11-NEXT: v_fmamk_f32 v2, v0, 0x3284fbcf, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_add_f32_e32 v1, v1, v2
; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 0x411a209b, vcc_lo
; GFX11-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v0|
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_sub_f32_e32 v0, v0, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX11-NEXT: s_setpc_b64 s[30:31]
%op = call bfloat @llvm.log10.bf16(bfloat %a)
@@ -25719,14 +25725,14 @@ define bfloat @v_exp2_bf16(bfloat %a) {
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: s_mov_b32 s4, 0xc2fc0000
; GCN-NEXT: v_mov_b32_e32 v1, 0x42800000
-; GCN-NEXT: v_mov_b32_e32 v2, 0x1f800000
+; GCN-NEXT: v_not_b32_e32 v2, 63
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
; GCN-NEXT: v_add_f32_e32 v0, v0, v1
; GCN-NEXT: v_exp_f32_e32 v0, v0
-; GCN-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; GCN-NEXT: v_mul_f32_e32 v0, v0, v1
+; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc
+; GCN-NEXT: v_ldexp_f32_e32 v0, v0, v1
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
;
@@ -25741,9 +25747,9 @@ define bfloat @v_exp2_bf16(bfloat %a) {
; GFX7-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
; GFX7-NEXT: v_add_f32_e32 v0, v0, v1
; GFX7-NEXT: v_exp_f32_e32 v0, v0
-; GFX7-NEXT: v_mov_b32_e32 v1, 0x1f800000
-; GFX7-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc
-; GFX7-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX7-NEXT: v_not_b32_e32 v1, 63
+; GFX7-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX7-NEXT: v_ldexp_f32_e32 v0, v0, v1
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
@@ -25757,9 +25763,9 @@ define bfloat @v_exp2_bf16(bfloat %a) {
; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
; GFX8-NEXT: v_add_f32_e32 v0, v0, v1
; GFX8-NEXT: v_exp_f32_e32 v0, v0
-; GFX8-NEXT: v_mov_b32_e32 v1, 0x1f800000
-; GFX8-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc
-; GFX8-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX8-NEXT: v_not_b32_e32 v1, 63
+; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX8-NEXT: v_ldexp_f32 v0, v0, v1
; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0
; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
@@ -25779,10 +25785,10 @@ define bfloat @v_exp2_bf16(bfloat %a) {
; GFX9-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
; GFX9-NEXT: v_add_f32_e32 v0, v0, v2
; GFX9-NEXT: v_exp_f32_e32 v0, v0
-; GFX9-NEXT: v_mov_b32_e32 v1, 0x1f800000
-; GFX9-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc
+; GFX9-NEXT: v_not_b32_e32 v1, 63
+; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
; GFX9-NEXT: s_movk_i32 s4, 0x7fff
-; GFX9-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX9-NEXT: v_ldexp_f32 v0, v0, v1
; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0
@@ -25797,10 +25803,10 @@ define bfloat @v_exp2_bf16(bfloat %a) {
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0
; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 0x42800000, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x1f800000, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 0xffffffc0, vcc_lo
; GFX10-NEXT: v_add_f32_e32 v0, v0, v2
; GFX10-NEXT: v_exp_f32_e32 v0, v0
-; GFX10-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX10-NEXT: v_ldexp_f32 v0, v0, v1
; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
@@ -25816,12 +25822,12 @@ define bfloat @v_exp2_bf16(bfloat %a) {
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0
; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 0x42800000, vcc_lo
-; GFX11-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x1f800000, vcc_lo
+; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 0xffffffc0, vcc_lo
; GFX11-NEXT: v_add_f32_e32 v0, v0, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_exp_f32_e32 v0, v0
; GFX11-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX11-NEXT: v_ldexp_f32 v0, v0, v1
; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
@@ -34500,14 +34506,25 @@ define bfloat @v_select_bf16(i1 %cond, bfloat %a, bfloat %b) {
; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: v_select_bf16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11TRUE16-LABEL: v_select_bf16:
+; GFX11TRUE16: ; %bb.0:
+; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11TRUE16-NEXT: v_and_b32_e32 v3, 1, v0
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3
+; GFX11TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v0.h, vcc_lo
+; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11FAKE16-LABEL: v_select_bf16:
+; GFX11FAKE16: ; %bb.0:
+; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11FAKE16-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
+; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo
+; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
%op = select i1 %cond, bfloat %a, bfloat %b
ret bfloat %op
}
@@ -34565,11 +34582,14 @@ define bfloat @v_select_fneg_lhs_bf16(i1 %cond, bfloat %a, bfloat %b) {
; GFX11TRUE16-LABEL: v_select_fneg_lhs_bf16:
; GFX11TRUE16: ; %bb.0:
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX11TRUE16-NEXT: v_xor_b16 v1.l, 0x8000, v1.l
-; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo
+; GFX11TRUE16-NEXT: v_and_b32_e32 v3, 1, v0
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.l
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.h, v2.l
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3
+; GFX11TRUE16-NEXT: v_xor_b16 v0.l, 0x8000, v0.l
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11TRUE16-NEXT: v_cndmask_b16 v0.l, v0.h, v0.l, vcc_lo
; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11FAKE16-LABEL: v_select_fneg_lhs_bf16:
@@ -34639,11 +34659,14 @@ define bfloat @v_select_fneg_rhs_bf16(i1 %cond, bfloat %a, bfloat %b) {
; GFX11TRUE16-LABEL: v_select_fneg_rhs_bf16:
; GFX11TRUE16: ; %bb.0:
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX11TRUE16-NEXT: v_xor_b16 v2.l, 0x8000, v2.l
-; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo
+; GFX11TRUE16-NEXT: v_and_b32_e32 v3, 1, v0
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3
+; GFX11TRUE16-NEXT: v_xor_b16 v0.l, 0x8000, v0.l
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v0.h, vcc_lo
; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11FAKE16-LABEL: v_select_fneg_rhs_bf16:
@@ -34741,13 +34764,12 @@ define <2 x bfloat> @v_select_v2bf16(i1 %cond, <2 x bfloat> %a, <2 x bfloat> %b)
; GFX11TRUE16: ; %bb.0:
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v2
-; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v1
+; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v1
+; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v2
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX11TRUE16-NEXT: v_dual_cndmask_b32 v3, v3, v4 :: v_dual_cndmask_b32 v0, v2, v1
-; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.h, v3.l
+; GFX11TRUE16-NEXT: v_cndmask_b16 v0.h, v4.l, v3.l, vcc_lo
+; GFX11TRUE16-NEXT: v_cndmask_b16 v0.l, v2.l, v1.l, vcc_lo
; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11FAKE16-LABEL: v_select_v2bf16:
@@ -34848,17 +34870,16 @@ define <2 x bfloat> @v_vselect_v2bf16(<2 x i1> %cond, <2 x bfloat> %a, <2 x bflo
; GFX11TRUE16-LABEL: v_vselect_v2bf16:
; GFX11TRUE16: ; %bb.0:
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v3
-; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v5, 16, v2
-; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
; GFX11TRUE16-NEXT: v_and_b32_e32 v1, 1, v1
-; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc_lo
+; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v2
+; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v5, 16, v3
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1
-; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v1, v4, v5, vcc_lo
-; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l
+; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s0, 1, v0
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11TRUE16-NEXT: v_cndmask_b16 v0.h, v5.l, v4.l, vcc_lo
+; GFX11TRUE16-NEXT: v_cndmask_b16 v0.l, v3.l, v2.l, s0
; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11FAKE16-LABEL: v_vselect_v2bf16:
@@ -34929,16 +34950,27 @@ define amdgpu_ps i32 @s_select_bf16(bfloat inreg %a, bfloat inreg %b, i32 %c) {
; GFX10-NEXT: v_readfirstlane_b32 s0, v0
; GFX10-NEXT: ; return to shader part epilog
;
-; GFX11-LABEL: s_select_bf16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: v_mov_b32_e32 v1, s0
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e32 v0, s1, v1, vcc_lo
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readfirstlane_b32 s0, v0
-; GFX11-NEXT: ; return to shader part epilog
+; GFX11TRUE16-LABEL: s_select_bf16:
+; GFX11TRUE16: ; %bb.0:
+; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, s0
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11TRUE16-NEXT: v_cndmask_b16 v0.l, s1, v0.l, vcc_lo
+; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11TRUE16-NEXT: v_readfirstlane_b32 s0, v0
+; GFX11TRUE16-NEXT: ; return to shader part epilog
+;
+; GFX11FAKE16-LABEL: s_select_bf16:
+; GFX11FAKE16: ; %bb.0:
+; GFX11FAKE16-NEXT: v_mov_b32_e32 v1, s0
+; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, s1, v1, vcc_lo
+; GFX11FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11FAKE16-NEXT: v_readfirstlane_b32 s0, v0
+; GFX11FAKE16-NEXT: ; return to shader part epilog
%cond = icmp eq i32 %c, 0
%op = select i1 %cond, bfloat %a, bfloat %b
%cast = bitcast bfloat %op to i16
@@ -35031,18 +35063,18 @@ define amdgpu_ps i32 @s_select_v2bf16(<2 x bfloat> inreg %a, <2 x bfloat> inreg
;
; GFX11TRUE16-LABEL: s_select_v2bf16:
; GFX11TRUE16: ; %bb.0:
-; GFX11TRUE16-NEXT: s_lshr_b32 s2, s1, 16
-; GFX11TRUE16-NEXT: s_lshr_b32 s3, s0, 16
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, s2
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.l, s3
+; GFX11TRUE16-NEXT: s_lshr_b32 s2, s0, 16
+; GFX11TRUE16-NEXT: s_lshr_b32 s3, s1, 16
; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v3.l, s1
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v4.l, s0
-; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11TRUE16-NEXT: v_dual_cndmask_b32 v0, v1, v2 :: v_dual_cndmask_b32 v1, v3, v4
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.h, v0.l
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, s3
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.h, s2
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, s1
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.h, s0
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11TRUE16-NEXT: v_cndmask_b16 v0.h, v0.l, v0.h, vcc_lo
+; GFX11TRUE16-NEXT: v_cndmask_b16 v0.l, v1.l, v1.h, vcc_lo
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11TRUE16-NEXT: v_readfirstlane_b32 s0, v1
+; GFX11TRUE16-NEXT: v_readfirstlane_b32 s0, v0
; GFX11TRUE16-NEXT: ; return to shader part epilog
;
; GFX11FAKE16-LABEL: s_select_v2bf16:
@@ -35149,18 +35181,17 @@ define amdgpu_ps i32 @s_vselect_v2bf16(<2 x bfloat> inreg %a, <2 x bfloat> inreg
;
; GFX11TRUE16-LABEL: s_vselect_v2bf16:
; GFX11TRUE16: ; %bb.0:
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.l, s1
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v3.l, s0
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11TRUE16-NEXT: s_lshr_b32 s2, s0, 16
; GFX11TRUE16-NEXT: s_lshr_b32 s3, s1, 16
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v5.l, s2
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v4.l, s3
-; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1
-; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v1, v4, v5, vcc_lo
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l
+; GFX11TRUE16-NEXT: s_lshr_b32 s4, s0, 16
+; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s2, 0, v1
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, s3
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.h, s4
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, s1
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.h, s0
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11TRUE16-NEXT: v_cndmask_b16 v0.h, v0.l, v0.h, s2
+; GFX11TRUE16-NEXT: v_cndmask_b16 v0.l, v1.l, v1.h, vcc_lo
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11TRUE16-NEXT: v_readfirstlane_b32 s0, v0
; GFX11TRUE16-NEXT: ; return to shader part epilog
@@ -35650,81 +35681,81 @@ define <16 x bfloat> @v_select_v16bf16(i1 %cond, <16 x bfloat> %a, <16 x bfloat>
; GCN-LABEL: v_select_v16bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_and_b32_e32 v0, 1, v0
-; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v2
+; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GCN-NEXT: v_alignbit_b32 v0, v0, v1, 16
-; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v18
-; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v17
-; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GCN-NEXT: v_alignbit_b32 v1, v1, v2, 16
-; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v4
-; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GCN-NEXT: v_alignbit_b32 v2, v2, v3, 16
-; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v20
-; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v19
-; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GCN-NEXT: v_alignbit_b32 v3, v3, v4, 16
-; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v6
-; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GCN-NEXT: v_alignbit_b32 v1, v2, v1, 16
+; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v18
+; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17
+; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GCN-NEXT: v_alignbit_b32 v2, v2, v17, 16
+; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GCN-NEXT: v_alignbit_b32 v4, v4, v5, 16
-; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v22
-; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v21
-; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GCN-NEXT: v_alignbit_b32 v5, v5, v6, 16
-; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v8
+; GCN-NEXT: v_alignbit_b32 v3, v4, v3, 16
+; GCN-NEXT: v_and_b32_e32 v0, 1, v0
+; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v20
+; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v19
+; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
+; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v22
+; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v21
+; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8
; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
-; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6
-; GCN-NEXT: v_alignbit_b32 v6, v6, v7, 16
-; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v24
-; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v23
-; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7
-; GCN-NEXT: v_alignbit_b32 v7, v7, v8, 16
-; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v10
+; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v24
+; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v23
+; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10
; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
-; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8
-; GCN-NEXT: v_alignbit_b32 v8, v8, v9, 16
-; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v26
-; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v25
+; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v26
+; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v25
; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12
; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11
-; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v28
-; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v27
+; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v28
+; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v27
; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14
; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13
-; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v30
-; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v29
+; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v30
+; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v29
; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16
; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15
-; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9
-; GCN-NEXT: v_alignbit_b32 v9, v9, v10, 16
-; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:4
+; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GCN-NEXT: v_alignbit_b32 v4, v4, v17, 16
+; GCN-NEXT: v_alignbit_b32 v5, v6, v5, 16
+; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:4
+; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32
+; GCN-NEXT: v_lshrrev_b32_e32 v18, 16, v18
+; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; GCN-NEXT: v_lshrrev_b32_e32 v20, 16, v20
+; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v10
+; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v22
; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v12
-; GCN-NEXT: v_alignbit_b32 v11, v12, v11, 16
-; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32
-; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GCN-NEXT: v_lshrrev_b32_e32 v24, 16, v24
; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v14
-; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v19
+; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v26
; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v16
-; GCN-NEXT: v_alignbit_b32 v17, v17, v18, 16
+; GCN-NEXT: v_alignbit_b32 v18, v18, v19, 16
+; GCN-NEXT: v_alignbit_b32 v7, v8, v7, 16
+; GCN-NEXT: v_alignbit_b32 v8, v20, v21, 16
+; GCN-NEXT: v_alignbit_b32 v9, v10, v9, 16
+; GCN-NEXT: v_alignbit_b32 v10, v22, v23, 16
+; GCN-NEXT: v_alignbit_b32 v11, v12, v11, 16
+; GCN-NEXT: v_alignbit_b32 v12, v24, v25, 16
; GCN-NEXT: v_alignbit_b32 v13, v14, v13, 16
-; GCN-NEXT: v_alignbit_b32 v14, v19, v20, 16
+; GCN-NEXT: v_alignbit_b32 v14, v26, v27, 16
; GCN-NEXT: v_alignbit_b32 v15, v16, v15, 16
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
; GCN-NEXT: v_cndmask_b32_e32 v13, v14, v13, vcc
-; GCN-NEXT: v_cndmask_b32_e32 v11, v17, v11, vcc
-; GCN-NEXT: v_cndmask_b32_e32 v9, v9, v8, vcc
-; GCN-NEXT: v_cndmask_b32_e32 v7, v7, v6, vcc
-; GCN-NEXT: v_cndmask_b32_e32 v5, v5, v4, vcc
-; GCN-NEXT: v_cndmask_b32_e32 v3, v3, v2, vcc
-; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc
+; GCN-NEXT: v_cndmask_b32_e32 v11, v12, v11, vcc
+; GCN-NEXT: v_cndmask_b32_e32 v9, v10, v9, vcc
+; GCN-NEXT: v_cndmask_b32_e32 v7, v8, v7, vcc
+; GCN-NEXT: v_cndmask_b32_e32 v5, v18, v5, vcc
+; GCN-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc
+; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
; GCN-NEXT: s_waitcnt vmcnt(1)
-; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v10
+; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v6
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v12
+; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v17
; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v1
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v3
@@ -35757,67 +35788,67 @@ define <16 x bfloat> @v_select_v16bf16(i1 %cond, <16 x bfloat> %a, <16 x bfloat>
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v18
; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4
; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17
; GFX7-NEXT: v_alignbit_b32 v3, v4, v3, 16
; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v20
-; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6
-; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
-; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8
; GFX7-NEXT: v_alignbit_b32 v2, v2, v17, 16
; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4
; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v19
+; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
+; GFX7-NEXT: v_alignbit_b32 v4, v4, v17, 16
+; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GFX7-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:4
; GFX7-NEXT: v_alignbit_b32 v5, v6, v5, 16
-; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v22
+; GFX7-NEXT: buffer_load_dword v6, off, s[0:3], s32
+; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8
+; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v22
; GFX7-NEXT: v_lshrrev_b32_e32 v8, 16, v8
; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10
-; GFX7-NEXT: v_alignbit_b32 v4, v4, v17, 16
-; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6
-; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v21
+; GFX7-NEXT: v_lshrrev_b32_e32 v18, 16, v18
+; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v21
; GFX7-NEXT: v_alignbit_b32 v7, v8, v7, 16
; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v24
; GFX7-NEXT: v_lshrrev_b32_e32 v10, 16, v10
; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9
-; GFX7-NEXT: v_alignbit_b32 v6, v6, v17, 16
+; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12
+; GFX7-NEXT: v_alignbit_b32 v18, v18, v19, 16
; GFX7-NEXT: v_lshrrev_b32_e32 v8, 16, v8
-; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v23
+; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v23
; GFX7-NEXT: v_alignbit_b32 v9, v10, v9, 16
; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v26
-; GFX7-NEXT: v_alignbit_b32 v8, v8, v17, 16
-; GFX7-NEXT: v_lshrrev_b32_e32 v10, 16, v10
-; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v25
-; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12
-; GFX7-NEXT: v_alignbit_b32 v10, v10, v17, 16
; GFX7-NEXT: v_lshrrev_b32_e32 v12, 16, v12
; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11
-; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v28
-; GFX7-NEXT: v_alignbit_b32 v11, v12, v11, 16
-; GFX7-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:4
-; GFX7-NEXT: v_lshrrev_b32_e32 v17, 16, v17
-; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v27
-; GFX7-NEXT: v_alignbit_b32 v17, v17, v18, 16
-; GFX7-NEXT: buffer_load_dword v18, off, s[0:3], s32
; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14
+; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16
+; GFX7-NEXT: v_alignbit_b32 v8, v8, v19, 16
+; GFX7-NEXT: v_lshrrev_b32_e32 v10, 16, v10
+; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v25
+; GFX7-NEXT: v_alignbit_b32 v11, v12, v11, 16
+; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v28
; GFX7-NEXT: v_lshrrev_b32_e32 v14, 16, v14
; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13
-; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16
-; GFX7-NEXT: v_alignbit_b32 v13, v14, v13, 16
-; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v30
; GFX7-NEXT: v_lshrrev_b32_e32 v16, 16, v16
; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15
+; GFX7-NEXT: v_alignbit_b32 v10, v10, v19, 16
+; GFX7-NEXT: v_lshrrev_b32_e32 v12, 16, v12
+; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v27
+; GFX7-NEXT: v_alignbit_b32 v13, v14, v13, 16
+; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v30
+; GFX7-NEXT: v_alignbit_b32 v15, v16, v15, 16
+; GFX7-NEXT: v_alignbit_b32 v12, v12, v19, 16
; GFX7-NEXT: v_lshrrev_b32_e32 v14, 16, v14
; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v29
-; GFX7-NEXT: v_alignbit_b32 v15, v16, v15, 16
; GFX7-NEXT: v_and_b32_e32 v0, 1, v0
; GFX7-NEXT: v_alignbit_b32 v14, v14, v19, 16
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
; GFX7-NEXT: v_cndmask_b32_e32 v13, v14, v13, vcc
-; GFX7-NEXT: v_cndmask_b32_e32 v11, v17, v11, vcc
+; GFX7-NEXT: v_cndmask_b32_e32 v11, v12, v11, vcc
; GFX7-NEXT: v_cndmask_b32_e32 v9, v10, v9, vcc
; GFX7-NEXT: v_cndmask_b32_e32 v7, v8, v7, vcc
-; GFX7-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc
+; GFX7-NEXT: v_cndmask_b32_e32 v5, v18, v5, vcc
; GFX7-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc
; GFX7-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v1
@@ -35826,21 +35857,21 @@ define <16 x bfloat> @v_select_v16bf16(i1 %cond, <16 x bfloat> %a, <16 x bfloat>
; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v5
; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v7
-; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
; GFX7-NEXT: v_lshlrev_b32_e32 v8, 16, v9
; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
; GFX7-NEXT: v_lshlrev_b32_e32 v10, 16, v11
; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
-; GFX7-NEXT: s_waitcnt vmcnt(1)
-; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12
-; GFX7-NEXT: v_lshrrev_b32_e32 v12, 16, v12
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v18
-; GFX7-NEXT: v_alignbit_b32 v12, v12, v16, 16
-; GFX7-NEXT: v_cndmask_b32_e32 v15, v12, v15, vcc
; GFX7-NEXT: v_lshlrev_b32_e32 v12, 16, v13
; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
+; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v17
+; GFX7-NEXT: v_lshrrev_b32_e32 v16, 16, v16
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
+; GFX7-NEXT: v_alignbit_b32 v6, v16, v6, 16
+; GFX7-NEXT: v_cndmask_b32_e32 v15, v6, v15, vcc
+; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v7
+; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
; GFX7-NEXT: v_lshlrev_b32_e32 v14, 16, v15
; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
; GFX7-NEXT: s_setpc_b64 s[30:31]
@@ -36869,32 +36900,30 @@ define amdgpu_ps <2 x i32> @s_vselect_v4bf16(<4 x bfloat> inreg %a, <4 x bfloat>
;
; GFX11TRUE16-LABEL: s_vselect_v4bf16:
; GFX11TRUE16: ; %bb.0:
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v6.l, s3
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v7.l, s1
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2
-; GFX11TRUE16-NEXT: s_lshr_b32 s4, s3, 16
-; GFX11TRUE16-NEXT: s_lshr_b32 s5, s1, 16
-; GFX11TRUE16-NEXT: s_lshr_b32 s1, s2, 16
-; GFX11TRUE16-NEXT: s_lshr_b32 s3, s0, 16
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v8.l, s1
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v9.l, s3
-; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v2, v6, v7, vcc_lo
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v10.l, s2
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v11.l, s0
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v4.l, s4
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v5.l, s5
-; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc_lo
+; GFX11TRUE16-NEXT: s_lshr_b32 s7, s3, 16
; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v10, v11, vcc_lo
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3
-; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l
-; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc_lo
-; GFX11TRUE16-NEXT: v_readfirstlane_b32 s0, v0
-; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.h, v3.l
-; GFX11TRUE16-NEXT: v_readfirstlane_b32 s1, v2
+; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s4, 0, v1
+; GFX11TRUE16-NEXT: s_lshr_b32 s8, s1, 16
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, s7
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, s3
+; GFX11TRUE16-NEXT: s_lshr_b32 s3, s2, 16
+; GFX11TRUE16-NEXT: s_lshr_b32 s7, s0, 16
+; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s5, 0, v2
+; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s6, 0, v3
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.h, s8
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.h, s3
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.l, s7
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.h, s2
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v3.l, s0
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v3.h, s1
+; GFX11TRUE16-NEXT: v_cndmask_b16 v0.h, v0.l, v0.h, s6
+; GFX11TRUE16-NEXT: v_cndmask_b16 v4.h, v1.h, v2.l, s4
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11TRUE16-NEXT: v_cndmask_b16 v4.l, v2.h, v3.l, vcc_lo
+; GFX11TRUE16-NEXT: v_cndmask_b16 v0.l, v1.l, v3.h, s5
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11TRUE16-NEXT: v_readfirstlane_b32 s0, v4
+; GFX11TRUE16-NEXT: v_readfirstlane_b32 s1, v0
; GFX11TRUE16-NEXT: ; return to shader part epilog
;
; GFX11FAKE16-LABEL: s_vselect_v4bf16:
@@ -37070,28 +37099,24 @@ define <4 x bfloat> @v_vselect_v4bf16(<4 x i1> %cond, <4 x bfloat> %a, <4 x bflo
; GFX11TRUE16-LABEL: v_vselect_v4bf16:
; GFX11TRUE16: ; %bb.0:
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v8, 16, v7
-; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v9, 16, v5
-; GFX11TRUE16-NEXT: v_and_b32_e32 v2, 1, v2
-; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v10, 16, v6
-; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v11, 16, v4
-; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2
-; GFX11TRUE16-NEXT: v_and_b32_e32 v3, 1, v3
-; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v5, vcc_lo
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
; GFX11TRUE16-NEXT: v_and_b32_e32 v1, 1, v1
-; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v6, v4, vcc_lo
-; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11TRUE16-NEXT: v_and_b32_e32 v3, 1, v3
+; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v8, 16, v4
+; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v9, 16, v6
; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1
-; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v1, v10, v11, vcc_lo
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l
-; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc_lo
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.l
-; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.h, v3.l
+; GFX11TRUE16-NEXT: v_and_b32_e32 v1, 1, v2
+; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s0, 1, v3
+; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v5
+; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v7
+; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s1, 1, v0
+; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s2, 1, v1
+; GFX11TRUE16-NEXT: v_cndmask_b16 v0.h, v9.l, v8.l, vcc_lo
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11TRUE16-NEXT: v_cndmask_b16 v1.h, v3.l, v2.l, s0
+; GFX11TRUE16-NEXT: v_cndmask_b16 v0.l, v6.l, v4.l, s1
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11TRUE16-NEXT: v_cndmask_b16 v1.l, v7.l, v5.l, s2
; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11FAKE16-LABEL: v_vselect_v4bf16:
@@ -37125,30 +37150,30 @@ define <8 x bfloat> @v_vselect_v8bf16(<8 x i1> %cond, <8 x bfloat> %a, <8 x bflo
; GCN-LABEL: v_vselect_v8bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v7, 1, v7
+; GCN-NEXT: v_and_b32_e32 v6, 1, v6
+; GCN-NEXT: v_and_b32_e32 v5, 1, v5
+; GCN-NEXT: v_and_b32_e32 v4, 1, v4
+; GCN-NEXT: v_and_b32_e32 v3, 1, v3
+; GCN-NEXT: v_and_b32_e32 v2, 1, v2
+; GCN-NEXT: v_and_b32_e32 v1, 1, v1
+; GCN-NEXT: v_and_b32_e32 v0, 1, v0
+; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15
+; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23
+; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14
+; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22
+; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13
+; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21
; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8
; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16
-; GCN-NEXT: v_and_b32_e32 v0, 1, v0
; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17
-; GCN-NEXT: v_and_b32_e32 v1, 1, v1
; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10
; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18
-; GCN-NEXT: v_and_b32_e32 v2, 1, v2
; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11
; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19
-; GCN-NEXT: v_and_b32_e32 v3, 1, v3
; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12
; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20
-; GCN-NEXT: v_and_b32_e32 v4, 1, v4
-; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13
-; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21
-; GCN-NEXT: v_and_b32_e32 v5, 1, v5
-; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14
-; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22
-; GCN-NEXT: v_and_b32_e32 v6, 1, v6
-; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15
-; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23
-; GCN-NEXT: v_and_b32_e32 v7, 1, v7
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7
; GCN-NEXT: v_cndmask_b32_e32 v7, v23, v15, vcc
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6
@@ -37179,45 +37204,45 @@ define <8 x bfloat> @v_vselect_v8bf16(<8 x i1> %cond, <8 x bfloat> %a, <8 x bflo
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_and_b32_e32 v7, 1, v7
-; GFX7-NEXT: v_and_b32_e32 v6, 1, v6
; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15
; GFX7-NEXT: v_mul_f32_e32 v23, 1.0, v23
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7
-; GFX7-NEXT: v_and_b32_e32 v5, 1, v5
-; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14
-; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22
+; GFX7-NEXT: v_and_b32_e32 v6, 1, v6
; GFX7-NEXT: v_cndmask_b32_e32 v7, v23, v15, vcc
+; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14
+; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v22
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6
-; GFX7-NEXT: v_and_b32_e32 v4, 1, v4
+; GFX7-NEXT: v_and_b32_e32 v5, 1, v5
+; GFX7-NEXT: v_cndmask_b32_e32 v6, v15, v14, vcc
; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13
-; GFX7-NEXT: v_mul_f32_e32 v21, 1.0, v21
-; GFX7-NEXT: v_cndmask_b32_e32 v6, v22, v14, vcc
+; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v21
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v5
-; GFX7-NEXT: v_and_b32_e32 v3, 1, v3
+; GFX7-NEXT: v_and_b32_e32 v4, 1, v4
+; GFX7-NEXT: v_cndmask_b32_e32 v5, v14, v13, vcc
; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12
-; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v20
-; GFX7-NEXT: v_cndmask_b32_e32 v5, v21, v13, vcc
+; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v20
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4
-; GFX7-NEXT: v_and_b32_e32 v2, 1, v2
+; GFX7-NEXT: v_and_b32_e32 v3, 1, v3
+; GFX7-NEXT: v_cndmask_b32_e32 v4, v13, v12, vcc
; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11
-; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19
-; GFX7-NEXT: v_cndmask_b32_e32 v4, v20, v12, vcc
+; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v19
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3
+; GFX7-NEXT: v_and_b32_e32 v2, 1, v2
+; GFX7-NEXT: v_cndmask_b32_e32 v3, v12, v11, vcc
; GFX7-NEXT: v_and_b32_e32 v1, 1, v1
; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10
-; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18
-; GFX7-NEXT: v_cndmask_b32_e32 v3, v19, v11, vcc
+; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v18
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2
; GFX7-NEXT: v_and_b32_e32 v0, 1, v0
; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9
-; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17
-; GFX7-NEXT: v_cndmask_b32_e32 v2, v18, v10, vcc
+; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v17
+; GFX7-NEXT: v_cndmask_b32_e32 v2, v13, v10, vcc
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8
-; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16
-; GFX7-NEXT: v_cndmask_b32_e32 v1, v17, v9, vcc
+; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v16
+; GFX7-NEXT: v_cndmask_b32_e32 v1, v12, v9, vcc
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX7-NEXT: v_cndmask_b32_e32 v0, v16, v8, vcc
+; GFX7-NEXT: v_cndmask_b32_e32 v0, v11, v8, vcc
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
@@ -37359,52 +37384,39 @@ define <8 x bfloat> @v_vselect_v8bf16(<8 x i1> %cond, <8 x bfloat> %a, <8 x bflo
; GFX11TRUE16-LABEL: v_vselect_v8bf16:
; GFX11TRUE16: ; %bb.0:
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11TRUE16-NEXT: v_and_b32_e32 v7, 1, v7
-; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v16, 16, v15
-; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v17, 16, v11
-; GFX11TRUE16-NEXT: v_and_b32_e32 v6, 1, v6
-; GFX11TRUE16-NEXT: v_and_b32_e32 v5, 1, v5
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v7
; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v18, 16, v14
-; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v19, 16, v10
-; GFX11TRUE16-NEXT: v_and_b32_e32 v3, 1, v3
-; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v7, v16, v17, vcc_lo
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v6
; GFX11TRUE16-NEXT: v_and_b32_e32 v1, 1, v1
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v16.l, v18.l
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v17.l, v19.l
-; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v6, v15, v11, vcc_lo
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v5
-; GFX11TRUE16-NEXT: v_and_b32_e32 v2, 1, v2
-; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v11, 16, v13
-; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v15, 16, v9
-; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v5, v16, v17, vcc_lo
; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v16, 16, v12
-; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v17, 16, v8
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3
-; GFX11TRUE16-NEXT: v_and_b32_e32 v4, 1, v4
-; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v3, v11, v15, vcc_lo
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v11.l, v13.l
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v13.l, v16.l
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v15.l, v17.l
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v3.h, v7.l
-; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v1, v13, v15, vcc_lo
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.h, v3.l
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v3.l, v6.l
-; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v12, v8, vcc_lo
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l
-; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v2, v11, v9, vcc_lo
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v4
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.h, v5.l
-; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.l
-; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v4, v14, v10, vcc_lo
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.l, v4.l
+; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 1, v2
+; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s0, 1, v1
+; GFX11TRUE16-NEXT: v_and_b32_e32 v1, 1, v3
+; GFX11TRUE16-NEXT: v_and_b32_e32 v2, 1, v4
+; GFX11TRUE16-NEXT: v_and_b32_e32 v3, 1, v5
+; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s1, 1, v0
+; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 1, v7
+; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s2, 1, v1
+; GFX11TRUE16-NEXT: v_and_b32_e32 v1, 1, v6
+; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s3, 1, v2
+; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s4, 1, v3
+; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s5, 1, v0
+; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v11
+; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v15
+; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v10
+; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v14
+; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v5, 16, v9
+; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v13
+; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v7, 16, v8
+; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s6, 1, v1
+; GFX11TRUE16-NEXT: v_cndmask_b16 v3.h, v2.l, v0.l, s5
+; GFX11TRUE16-NEXT: v_cndmask_b16 v2.h, v4.l, v3.l, s4
+; GFX11TRUE16-NEXT: v_cndmask_b16 v1.h, v6.l, v5.l, s2
+; GFX11TRUE16-NEXT: v_cndmask_b16 v0.h, v16.l, v7.l, s0
+; GFX11TRUE16-NEXT: v_cndmask_b16 v0.l, v12.l, v8.l, vcc_lo
+; GFX11TRUE16-NEXT: v_cndmask_b16 v1.l, v13.l, v9.l, s1
+; GFX11TRUE16-NEXT: v_cndmask_b16 v2.l, v14.l, v10.l, s3
+; GFX11TRUE16-NEXT: v_cndmask_b16 v3.l, v15.l, v11.l, s6
; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11FAKE16-LABEL: v_vselect_v8bf16:
@@ -37487,16 +37499,16 @@ define <16 x bfloat> @v_vselect_v16bf16(<16 x i1> %cond, <16 x bfloat> %a, <16 x
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v16
; GCN-NEXT: v_and_b32_e32 v1, 1, v10
; GCN-NEXT: v_cmp_eq_u32_e64 s[22:23], 1, v1
-; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:4
+; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:4
; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v17
-; GCN-NEXT: v_and_b32_e32 v3, 1, v11
-; GCN-NEXT: v_cmp_eq_u32_e64 s[24:25], 1, v3
-; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8
-; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v18
-; GCN-NEXT: v_and_b32_e32 v5, 1, v12
-; GCN-NEXT: v_cmp_eq_u32_e64 s[26:27], 1, v5
+; GCN-NEXT: v_and_b32_e32 v2, 1, v11
+; GCN-NEXT: v_cmp_eq_u32_e64 s[24:25], 1, v2
+; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:8
+; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v18
+; GCN-NEXT: v_and_b32_e32 v3, 1, v12
+; GCN-NEXT: v_cmp_eq_u32_e64 s[26:27], 1, v3
; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:12
-; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v19
+; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v19
; GCN-NEXT: v_and_b32_e32 v7, 1, v13
; GCN-NEXT: v_and_b32_e32 v8, 1, v14
; GCN-NEXT: v_cmp_eq_u32_e64 s[28:29], 1, v7
@@ -37563,22 +37575,22 @@ define <16 x bfloat> @v_vselect_v16bf16(<16 x i1> %cond, <16 x bfloat> %a, <16 x
; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17
; GCN-NEXT: v_cndmask_b32_e64 v17, v17, v20, s[12:13]
; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:20
-; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
; GCN-NEXT: s_waitcnt vmcnt(1)
; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20
; GCN-NEXT: v_cndmask_b32_e64 v19, v20, v19, s[10:11]
-; GCN-NEXT: v_cndmask_b32_e64 v5, v18, v5, s[8:9]
-; GCN-NEXT: v_cndmask_b32_e64 v3, v6, v3, s[6:7]
-; GCN-NEXT: v_cndmask_b32_e64 v1, v4, v1, s[4:5]
-; GCN-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; GCN-NEXT: v_cndmask_b32_e64 v3, v18, v3, s[8:9]
+; GCN-NEXT: v_cndmask_b32_e64 v2, v6, v2, s[6:7]
+; GCN-NEXT: v_cndmask_b32_e64 v1, v5, v1, s[4:5]
+; GCN-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
-; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v5
+; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v19
; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v17
; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v16
@@ -37604,151 +37616,136 @@ define <16 x bfloat> @v_vselect_v16bf16(<16 x i1> %cond, <16 x bfloat> %a, <16 x
; GFX7-LABEL: v_vselect_v16bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1
-; GFX7-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
-; GFX7-NEXT: s_mov_b64 exec, s[4:5]
+; GFX7-NEXT: v_and_b32_e32 v8, 1, v8
+; GFX7-NEXT: v_and_b32_e32 v7, 1, v7
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[16:17], 1, v8
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[14:15], 1, v7
+; GFX7-NEXT: buffer_load_dword v7, off, s[0:3], s32
+; GFX7-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:64
+; GFX7-NEXT: v_and_b32_e32 v15, 1, v15
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[12:13], 1, v15
+; GFX7-NEXT: v_and_b32_e32 v14, 1, v14
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[10:11], 1, v14
+; GFX7-NEXT: v_and_b32_e32 v13, 1, v13
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[8:9], 1, v13
+; GFX7-NEXT: v_and_b32_e32 v12, 1, v12
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], 1, v12
+; GFX7-NEXT: v_and_b32_e32 v11, 1, v11
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v11
+; GFX7-NEXT: v_and_b32_e32 v10, 1, v10
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v10
+; GFX7-NEXT: v_and_b32_e32 v6, 1, v6
+; GFX7-NEXT: v_and_b32_e32 v5, 1, v5
+; GFX7-NEXT: v_and_b32_e32 v9, 1, v9
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[18:19], 1, v9
+; GFX7-NEXT: v_and_b32_e32 v4, 1, v4
+; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v20
+; GFX7-NEXT: v_and_b32_e32 v3, 1, v3
+; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19
+; GFX7-NEXT: v_and_b32_e32 v2, 1, v2
+; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18
+; GFX7-NEXT: v_and_b32_e32 v1, 1, v1
; GFX7-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX7-NEXT: v_and_b32_e32 v0, 1, v1
-; GFX7-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v0
-; GFX7-NEXT: v_and_b32_e32 v0, 1, v2
-; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], 1, v0
-; GFX7-NEXT: v_and_b32_e32 v0, 1, v3
-; GFX7-NEXT: v_cmp_eq_u32_e64 s[8:9], 1, v0
-; GFX7-NEXT: v_and_b32_e32 v0, 1, v4
-; GFX7-NEXT: v_cmp_eq_u32_e64 s[10:11], 1, v0
-; GFX7-NEXT: v_and_b32_e32 v0, 1, v5
-; GFX7-NEXT: v_cmp_eq_u32_e64 s[12:13], 1, v0
-; GFX7-NEXT: v_and_b32_e32 v0, 1, v6
-; GFX7-NEXT: v_cmp_eq_u32_e64 s[14:15], 1, v0
-; GFX7-NEXT: v_and_b32_e32 v0, 1, v7
-; GFX7-NEXT: v_cmp_eq_u32_e64 s[16:17], 1, v0
-; GFX7-NEXT: v_and_b32_e32 v0, 1, v8
-; GFX7-NEXT: v_cmp_eq_u32_e64 s[18:19], 1, v0
-; GFX7-NEXT: v_and_b32_e32 v0, 1, v9
-; GFX7-NEXT: v_cmp_eq_u32_e64 s[20:21], 1, v0
-; GFX7-NEXT: v_and_b32_e32 v0, 1, v10
-; GFX7-NEXT: v_cmp_eq_u32_e64 s[22:23], 1, v0
-; GFX7-NEXT: v_and_b32_e32 v0, 1, v11
-; GFX7-NEXT: v_cmp_eq_u32_e64 s[24:25], 1, v0
-; GFX7-NEXT: buffer_load_dword v0, off, s[0:3], s32
-; GFX7-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:64
-; GFX7-NEXT: v_and_b32_e32 v2, 1, v12
-; GFX7-NEXT: v_writelane_b32 v31, s30, 0
-; GFX7-NEXT: v_cmp_eq_u32_e64 s[26:27], 1, v2
-; GFX7-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:60
-; GFX7-NEXT: v_and_b32_e32 v3, 1, v13
-; GFX7-NEXT: v_writelane_b32 v31, s31, 1
-; GFX7-NEXT: v_cmp_eq_u32_e64 s[28:29], 1, v3
-; GFX7-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:56
-; GFX7-NEXT: v_and_b32_e32 v4, 1, v14
-; GFX7-NEXT: v_writelane_b32 v31, s34, 2
-; GFX7-NEXT: v_cmp_eq_u32_e64 s[30:31], 1, v4
-; GFX7-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:52
-; GFX7-NEXT: v_and_b32_e32 v5, 1, v15
-; GFX7-NEXT: v_writelane_b32 v31, s35, 3
-; GFX7-NEXT: v_cmp_eq_u32_e64 s[34:35], 1, v5
-; GFX7-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:48
-; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16
; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17
-; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18
-; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19
-; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v20
-; GFX7-NEXT: s_waitcnt vmcnt(5)
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT: s_waitcnt vmcnt(4)
-; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GFX7-NEXT: v_cndmask_b32_e64 v15, v1, v0, s[34:35]
-; GFX7-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:44
-; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v30
-; GFX7-NEXT: s_waitcnt vmcnt(4)
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: v_cndmask_b32_e64 v14, v2, v1, s[30:31]
-; GFX7-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:40
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v29
-; GFX7-NEXT: s_waitcnt vmcnt(4)
-; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GFX7-NEXT: v_cndmask_b32_e64 v13, v3, v2, s[28:29]
-; GFX7-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:36
-; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v28
-; GFX7-NEXT: s_waitcnt vmcnt(4)
-; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
-; GFX7-NEXT: v_cndmask_b32_e64 v12, v4, v3, s[26:27]
-; GFX7-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:32
-; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v27
-; GFX7-NEXT: s_waitcnt vmcnt(4)
-; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
-; GFX7-NEXT: v_cndmask_b32_e64 v11, v5, v4, s[24:25]
-; GFX7-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:28
-; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v26
-; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
-; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
-; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
-; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
+; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16
+; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8
+; GFX7-NEXT: v_cndmask_b32_e64 v15, v8, v7, s[12:13]
+; GFX7-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:60
+; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v30
; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
-; GFX7-NEXT: v_readlane_b32 s35, v31, 3
-; GFX7-NEXT: v_readlane_b32 s34, v31, 2
-; GFX7-NEXT: v_readlane_b32 s31, v31, 1
-; GFX7-NEXT: v_readlane_b32 s30, v31, 0
-; GFX7-NEXT: s_waitcnt vmcnt(4)
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT: v_cndmask_b32_e64 v10, v0, v5, s[22:23]
-; GFX7-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:24
-; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v25
-; GFX7-NEXT: s_waitcnt vmcnt(4)
-; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GFX7-NEXT: v_cndmask_b32_e64 v9, v1, v5, s[20:21]
-; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v24
-; GFX7-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4
-; GFX7-NEXT: s_waitcnt vmcnt(4)
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: v_cndmask_b32_e64 v8, v2, v5, s[18:19]
-; GFX7-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8
-; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v23
-; GFX7-NEXT: s_waitcnt vmcnt(4)
-; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GFX7-NEXT: v_cndmask_b32_e64 v7, v3, v5, s[16:17]
-; GFX7-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:12
-; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v22
-; GFX7-NEXT: s_waitcnt vmcnt(4)
-; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
-; GFX7-NEXT: v_cndmask_b32_e64 v6, v4, v5, s[14:15]
-; GFX7-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8
+; GFX7-NEXT: v_cndmask_b32_e64 v14, v8, v7, s[10:11]
+; GFX7-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:56
+; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v29
+; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8
+; GFX7-NEXT: v_cndmask_b32_e64 v13, v8, v7, s[8:9]
+; GFX7-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:52
+; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v28
+; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8
+; GFX7-NEXT: v_cndmask_b32_e64 v12, v8, v7, s[6:7]
+; GFX7-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:48
+; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v27
+; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8
+; GFX7-NEXT: v_cndmask_b32_e64 v11, v8, v7, s[4:5]
+; GFX7-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:44
+; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v26
+; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8
+; GFX7-NEXT: v_cndmask_b32_e32 v10, v8, v7, vcc
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6
+; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v22
+; GFX7-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:28
+; GFX7-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:40
+; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v25
+; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22
+; GFX7-NEXT: v_cndmask_b32_e32 v6, v22, v6, vcc
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v5
; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v21
+; GFX7-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:24
+; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8
+; GFX7-NEXT: v_cndmask_b32_e64 v9, v8, v7, s[18:19]
+; GFX7-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:36
+; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v24
; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
-; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
-; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
-; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
-; GFX7-NEXT: s_waitcnt vmcnt(4)
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT: v_cndmask_b32_e64 v5, v0, v5, s[12:13]
-; GFX7-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:20
+; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: v_mul_f32_e32 v21, 1.0, v21
+; GFX7-NEXT: v_cndmask_b32_e32 v5, v21, v5, vcc
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4
+; GFX7-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:20
+; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8
+; GFX7-NEXT: v_cndmask_b32_e64 v8, v8, v7, s[16:17]
+; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v23
+; GFX7-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:32
; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; GFX7-NEXT: s_waitcnt vmcnt(4)
-; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GFX7-NEXT: s_waitcnt vmcnt(3)
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v17, s[4:5]
+; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GFX7-NEXT: v_cndmask_b32_e32 v4, v4, v20, vcc
+; GFX7-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:16
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3
+; GFX7-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:12
+; GFX7-NEXT: s_waitcnt vmcnt(2)
+; GFX7-NEXT: v_mul_f32_e32 v23, 1.0, v23
+; GFX7-NEXT: v_cndmask_b32_e64 v7, v23, v7, s[14:15]
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v20
+; GFX7-NEXT: v_cndmask_b32_e32 v19, v20, v19, vcc
+; GFX7-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:4
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2
+; GFX7-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8
; GFX7-NEXT: s_waitcnt vmcnt(2)
; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GFX7-NEXT: v_cndmask_b32_e64 v3, v3, v18, s[6:7]
+; GFX7-NEXT: v_cndmask_b32_e32 v3, v3, v18, vcc
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
; GFX7-NEXT: s_waitcnt vmcnt(1)
-; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
-; GFX7-NEXT: v_cndmask_b32_e64 v4, v4, v19, s[8:9]
+; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v20
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT: v_cndmask_b32_e64 v20, v0, v20, s[10:11]
-; GFX7-NEXT: v_cndmask_b32_e32 v0, v1, v16, vcc
+; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX7-NEXT: v_cndmask_b32_e32 v1, v2, v17, vcc
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
+; GFX7-NEXT: v_cndmask_b32_e32 v0, v18, v16, vcc
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v2
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
-; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v4
-; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v20
-; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1
-; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
-; GFX7-NEXT: s_mov_b64 exec, s[4:5]
-; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v19
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_vselect_v16bf16:
@@ -37779,53 +37776,51 @@ define <16 x bfloat> @v_vselect_v16bf16(<16 x i1> %cond, <16 x bfloat> %a, <16 x
; GFX8-NEXT: v_cmp_eq_u32_e64 s[20:21], 1, v0
; GFX8-NEXT: v_and_b32_e32 v0, 1, v10
; GFX8-NEXT: v_cmp_eq_u32_e64 s[22:23], 1, v0
-; GFX8-NEXT: buffer_load_dword v0, off, s[0:3], s32
+; GFX8-NEXT: v_and_b32_e32 v0, 1, v11
+; GFX8-NEXT: v_cmp_eq_u32_e64 s[24:25], 1, v0
+; GFX8-NEXT: v_and_b32_e32 v0, 1, v12
; GFX8-NEXT: v_writelane_b32 v31, s30, 0
-; GFX8-NEXT: v_and_b32_e32 v2, 1, v12
-; GFX8-NEXT: v_and_b32_e32 v3, 1, v13
+; GFX8-NEXT: v_cmp_eq_u32_e64 s[26:27], 1, v0
+; GFX8-NEXT: v_and_b32_e32 v0, 1, v13
; GFX8-NEXT: v_writelane_b32 v31, s31, 1
-; GFX8-NEXT: v_cmp_eq_u32_e64 s[26:27], 1, v2
-; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v22
-; GFX8-NEXT: v_cmp_eq_u32_e64 s[28:29], 1, v3
-; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v30
+; GFX8-NEXT: v_cmp_eq_u32_e64 s[28:29], 1, v0
+; GFX8-NEXT: v_and_b32_e32 v0, 1, v14
; GFX8-NEXT: v_writelane_b32 v31, s34, 2
-; GFX8-NEXT: v_and_b32_e32 v1, 1, v11
-; GFX8-NEXT: v_and_b32_e32 v4, 1, v14
-; GFX8-NEXT: v_and_b32_e32 v5, 1, v15
-; GFX8-NEXT: v_cndmask_b32_e64 v6, v3, v2, s[28:29]
-; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v20
-; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v28
+; GFX8-NEXT: v_cmp_eq_u32_e64 s[30:31], 1, v0
+; GFX8-NEXT: v_and_b32_e32 v0, 1, v15
; GFX8-NEXT: v_writelane_b32 v31, s35, 3
-; GFX8-NEXT: v_cmp_eq_u32_e64 s[24:25], 1, v1
+; GFX8-NEXT: v_cmp_eq_u32_e64 s[34:35], 1, v0
+; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v22
+; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v30
+; GFX8-NEXT: v_cndmask_b32_e64 v6, v1, v0, s[28:29]
+; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v21
+; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v29
+; GFX8-NEXT: v_cndmask_b32_e64 v5, v1, v0, s[24:25]
+; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v20
+; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v28
+; GFX8-NEXT: v_cndmask_b32_e64 v4, v1, v0, s[20:21]
+; GFX8-NEXT: buffer_load_dword v0, off, s[0:3], s32
; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v23
-; GFX8-NEXT: v_cmp_eq_u32_e64 s[30:31], 1, v4
-; GFX8-NEXT: v_cmp_eq_u32_e64 s[34:35], 1, v5
-; GFX8-NEXT: v_cndmask_b32_e64 v10, v3, v2, s[20:21]
-; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v21
-; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v29
-; GFX8-NEXT: v_cndmask_b32_e64 v5, v5, v4, s[24:25]
-; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v19
-; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v27
-; GFX8-NEXT: v_cndmask_b32_e64 v3, v9, v4, s[16:17]
; GFX8-NEXT: v_lshrrev_b32_e32 v15, 16, v24
; GFX8-NEXT: v_cndmask_b32_e64 v7, v30, v22, s[26:27]
-; GFX8-NEXT: v_cndmask_b32_e64 v4, v27, v19, s[14:15]
-; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v6
; GFX8-NEXT: v_cndmask_b32_e64 v8, v29, v21, s[22:23]
-; GFX8-NEXT: v_cndmask_b32_e64 v11, v28, v20, s[18:19]
-; GFX8-NEXT: v_cndmask_b32_e64 v9, v26, v18, s[10:11]
+; GFX8-NEXT: v_cndmask_b32_e64 v9, v28, v20, s[18:19]
+; GFX8-NEXT: v_cndmask_b32_e64 v12, v27, v19, s[14:15]
+; GFX8-NEXT: v_cndmask_b32_e64 v13, v26, v18, s[10:11]
; GFX8-NEXT: v_cndmask_b32_e64 v14, v25, v17, s[6:7]
-; GFX8-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v10
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; GFX8-NEXT: v_or_b32_sdwa v6, v7, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX8-NEXT: v_or_b32_sdwa v4, v11, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: v_or_b32_sdwa v4, v9, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX8-NEXT: v_or_b32_sdwa v5, v8, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v0
-; GFX8-NEXT: v_cndmask_b32_e64 v12, v0, v23, s[30:31]
-; GFX8-NEXT: v_cndmask_b32_e64 v13, v2, v1, s[34:35]
+; GFX8-NEXT: v_cndmask_b32_e64 v10, v0, v23, s[30:31]
+; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX8-NEXT: v_cndmask_b32_e64 v11, v0, v1, s[34:35]
+; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v19
+; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v27
+; GFX8-NEXT: v_cndmask_b32_e64 v3, v1, v0, s[16:17]
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v18
; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v26
; GFX8-NEXT: v_cndmask_b32_e64 v2, v1, v0, s[12:13]
@@ -37838,11 +37833,13 @@ define <16 x bfloat> @v_vselect_v16bf16(<16 x i1> %cond, <16 x bfloat> %a, <16 x
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v13
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v11
; GFX8-NEXT: v_or_b32_sdwa v0, v15, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX8-NEXT: v_or_b32_sdwa v1, v14, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX8-NEXT: v_or_b32_sdwa v2, v9, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX8-NEXT: v_or_b32_sdwa v7, v12, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: v_or_b32_sdwa v2, v13, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: v_or_b32_sdwa v3, v12, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: v_or_b32_sdwa v7, v10, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX8-NEXT: v_readlane_b32 s35, v31, 3
; GFX8-NEXT: v_readlane_b32 s34, v31, 2
; GFX8-NEXT: v_readlane_b32 s31, v31, 1
@@ -37856,81 +37853,81 @@ define <16 x bfloat> @v_vselect_v16bf16(<16 x i1> %cond, <16 x bfloat> %a, <16 x
; GFX9-LABEL: v_vselect_v16bf16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_and_b32_e32 v4, 1, v4
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[18:19], 1, v4
-; GFX9-NEXT: v_and_b32_e32 v4, 1, v14
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[20:21], 1, v4
-; GFX9-NEXT: v_and_b32_e32 v4, 1, v15
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[22:23], 1, v4
-; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32
; GFX9-NEXT: v_and_b32_e32 v12, 1, v12
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v12
-; GFX9-NEXT: v_and_b32_e32 v12, 1, v13
+; GFX9-NEXT: v_and_b32_e32 v13, 1, v13
+; GFX9-NEXT: v_cndmask_b32_e32 v12, v30, v22, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v22
+; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v30
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v13
; GFX9-NEXT: v_and_b32_e32 v10, 1, v10
-; GFX9-NEXT: v_and_b32_e32 v6, 1, v6
-; GFX9-NEXT: v_and_b32_e32 v5, 1, v5
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v12
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[6:7], 1, v10
+; GFX9-NEXT: v_cndmask_b32_e32 v13, v30, v22, vcc
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v10
; GFX9-NEXT: v_and_b32_e32 v10, 1, v11
+; GFX9-NEXT: v_cndmask_b32_e32 v11, v29, v21, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v21
+; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v29
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v10
+; GFX9-NEXT: v_cndmask_b32_e32 v10, v22, v21, vcc
+; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32
; GFX9-NEXT: v_and_b32_e32 v8, 1, v8
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[14:15], 1, v6
-; GFX9-NEXT: v_and_b32_e32 v6, 1, v7
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[24:25], 1, v5
-; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v22
-; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v30
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[8:9], 1, v10
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[10:11], 1, v8
-; GFX9-NEXT: v_and_b32_e32 v8, 1, v9
-; GFX9-NEXT: v_cndmask_b32_e64 v7, v7, v5, s[4:5]
-; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v21
-; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v29
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[12:13], 1, v8
-; GFX9-NEXT: v_cndmask_b32_e64 v5, v9, v5, s[8:9]
-; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v20
-; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v28
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[16:17], 1, v6
-; GFX9-NEXT: v_cndmask_b32_e64 v9, v11, v9, s[12:13]
-; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v19
-; GFX9-NEXT: v_lshrrev_b32_e32 v13, 16, v27
+; GFX9-NEXT: v_and_b32_e32 v9, 1, v9
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v8
+; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v20
+; GFX9-NEXT: v_cndmask_b32_e32 v20, v28, v20, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v28
+; GFX9-NEXT: v_and_b32_e32 v6, 1, v6
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v9
+; GFX9-NEXT: v_and_b32_e32 v7, 1, v7
+; GFX9-NEXT: v_cndmask_b32_e32 v8, v22, v8, vcc
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6
+; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v19
+; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v27
+; GFX9-NEXT: v_and_b32_e32 v4, 1, v4
+; GFX9-NEXT: v_cndmask_b32_e32 v19, v27, v19, vcc
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7
+; GFX9-NEXT: v_and_b32_e32 v5, 1, v5
+; GFX9-NEXT: v_cndmask_b32_e32 v9, v22, v9, vcc
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4
+; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v18
+; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v26
+; GFX9-NEXT: v_and_b32_e32 v14, 1, v14
+; GFX9-NEXT: v_cndmask_b32_e32 v4, v26, v18, vcc
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v5
+; GFX9-NEXT: v_and_b32_e32 v15, 1, v15
+; GFX9-NEXT: v_cndmask_b32_e32 v5, v27, v6, vcc
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v14
; GFX9-NEXT: v_and_b32_e32 v2, 1, v2
-; GFX9-NEXT: v_cndmask_b32_e64 v11, v13, v11, s[16:17]
-; GFX9-NEXT: v_lshrrev_b32_e32 v13, 16, v23
-; GFX9-NEXT: v_cndmask_b32_e32 v6, v30, v22, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v23
; GFX9-NEXT: v_and_b32_e32 v3, 1, v3
-; GFX9-NEXT: v_cndmask_b32_e64 v15, v26, v18, s[18:19]
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2
; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX9-NEXT: v_cndmask_b32_e32 v2, v25, v17, vcc
-; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3
; GFX9-NEXT: v_and_b32_e32 v1, 1, v1
-; GFX9-NEXT: v_cndmask_b32_e64 v8, v29, v21, s[6:7]
-; GFX9-NEXT: v_cndmask_b32_e64 v10, v28, v20, s[10:11]
-; GFX9-NEXT: v_cndmask_b32_e64 v12, v27, v19, s[14:15]
; GFX9-NEXT: s_mov_b32 s4, 0x5040100
-; GFX9-NEXT: v_perm_b32 v5, v5, v8, s4
-; GFX9-NEXT: v_perm_b32 v6, v7, v6, s4
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_cndmask_b32_e64 v14, v4, v23, s[20:21]
-; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GFX9-NEXT: v_cndmask_b32_e64 v13, v4, v13, s[22:23]
-; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v18
-; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v26
-; GFX9-NEXT: v_cndmask_b32_e64 v4, v18, v4, s[24:25]
-; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v25
-; GFX9-NEXT: v_cndmask_b32_e32 v3, v18, v17, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v14, v21, v23, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v21
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v15
+; GFX9-NEXT: v_cndmask_b32_e32 v7, v6, v7, vcc
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v25, v17, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v17
+; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v25
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v15, v6, vcc
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v24, v16, vcc
-; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v16
-; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v24
+; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v16
+; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v24
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v17, v16, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v15, v6, vcc
; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4
; GFX9-NEXT: v_perm_b32 v1, v3, v2, s4
-; GFX9-NEXT: v_perm_b32 v2, v4, v15, s4
-; GFX9-NEXT: v_perm_b32 v3, v11, v12, s4
-; GFX9-NEXT: v_perm_b32 v4, v9, v10, s4
-; GFX9-NEXT: v_perm_b32 v7, v13, v14, s4
+; GFX9-NEXT: v_perm_b32 v2, v5, v4, s4
+; GFX9-NEXT: v_perm_b32 v3, v9, v19, s4
+; GFX9-NEXT: v_perm_b32 v4, v8, v20, s4
+; GFX9-NEXT: v_perm_b32 v5, v10, v11, s4
+; GFX9-NEXT: v_perm_b32 v6, v13, v12, s4
+; GFX9-NEXT: v_perm_b32 v7, v7, v14, s4
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_vselect_v16bf16:
@@ -37947,13 +37944,13 @@ define <16 x bfloat> @v_vselect_v16bf16(<16 x i1> %cond, <16 x bfloat> %a, <16 x
; GFX10-NEXT: v_and_b32_e32 v8, 1, v8
; GFX10-NEXT: v_lshrrev_b32_e32 v35, 16, v21
; GFX10-NEXT: v_lshrrev_b32_e32 v36, 16, v29
-; GFX10-NEXT: v_cndmask_b32_e32 v12, v30, v22, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v22, v30, v22, vcc_lo
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v13
; GFX10-NEXT: v_and_b32_e32 v9, 1, v9
; GFX10-NEXT: v_and_b32_e32 v6, 1, v6
; GFX10-NEXT: v_lshrrev_b32_e32 v37, 16, v20
; GFX10-NEXT: v_lshrrev_b32_e32 v38, 16, v28
-; GFX10-NEXT: v_cndmask_b32_e32 v13, v34, v33, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v33, v34, v33, vcc_lo
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v10
; GFX10-NEXT: v_and_b32_e32 v4, 1, v4
; GFX10-NEXT: v_and_b32_e32 v2, 1, v2
@@ -37962,13 +37959,13 @@ define <16 x bfloat> @v_vselect_v16bf16(<16 x i1> %cond, <16 x bfloat> %a, <16 x
; GFX10-NEXT: v_cndmask_b32_e32 v10, v29, v21, vcc_lo
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v11
; GFX10-NEXT: v_lshrrev_b32_e32 v51, 16, v17
-; GFX10-NEXT: v_lshrrev_b32_e32 v52, 16, v25
+; GFX10-NEXT: v_lshrrev_b32_e32 v12, 16, v25
; GFX10-NEXT: v_and_b32_e32 v1, 1, v1
; GFX10-NEXT: v_and_b32_e32 v5, 1, v5
; GFX10-NEXT: v_cndmask_b32_e32 v11, v36, v35, vcc_lo
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v8
-; GFX10-NEXT: v_lshrrev_b32_e32 v53, 16, v16
-; GFX10-NEXT: v_lshrrev_b32_e32 v54, 16, v24
+; GFX10-NEXT: v_lshrrev_b32_e32 v30, 16, v16
+; GFX10-NEXT: v_lshrrev_b32_e32 v13, 16, v24
; GFX10-NEXT: v_and_b32_e32 v7, 1, v7
; GFX10-NEXT: v_lshrrev_b32_e32 v49, 16, v18
; GFX10-NEXT: v_cndmask_b32_e32 v8, v28, v20, vcc_lo
@@ -37987,11 +37984,11 @@ define <16 x bfloat> @v_vselect_v16bf16(<16 x i1> %cond, <16 x bfloat> %a, <16 x
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2
; GFX10-NEXT: v_cndmask_b32_e32 v2, v25, v17, vcc_lo
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3
-; GFX10-NEXT: v_cndmask_b32_e32 v3, v52, v51, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v3, v12, v51, vcc_lo
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
; GFX10-NEXT: v_cndmask_b32_e32 v0, v24, v16, vcc_lo
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1
-; GFX10-NEXT: v_cndmask_b32_e32 v1, v54, v53, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v1, v13, v30, vcc_lo
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v5
; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
; GFX10-NEXT: v_cndmask_b32_e32 v5, v50, v49, vcc_lo
@@ -38004,113 +38001,84 @@ define <16 x bfloat> @v_vselect_v16bf16(<16 x i1> %cond, <16 x bfloat> %a, <16 x
; GFX10-NEXT: v_perm_b32 v5, v11, v10, 0x5040100
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v31
-; GFX10-NEXT: v_cndmask_b32_e32 v14, v31, v23, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v12, v31, v23, vcc_lo
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v15
-; GFX10-NEXT: v_cndmask_b32_e32 v15, v3, v32, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v13, v3, v32, vcc_lo
; GFX10-NEXT: v_perm_b32 v3, v7, v6, 0x5040100
-; GFX10-NEXT: v_perm_b32 v6, v13, v12, 0x5040100
-; GFX10-NEXT: v_perm_b32 v7, v15, v14, 0x5040100
+; GFX10-NEXT: v_perm_b32 v6, v33, v22, 0x5040100
+; GFX10-NEXT: v_perm_b32 v7, v13, v12, 0x5040100
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11TRUE16-LABEL: v_vselect_v16bf16:
; GFX11TRUE16: ; %bb.0:
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11TRUE16-NEXT: scratch_load_b32 v31, off, s32
-; GFX11TRUE16-NEXT: v_and_b32_e32 v12, 1, v12
-; GFX11TRUE16-NEXT: v_and_b32_e32 v10, 1, v10
-; GFX11TRUE16-NEXT: v_and_b32_e32 v8, 1, v8
-; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v19
-; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v27
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v12
+; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 1, v0
; GFX11TRUE16-NEXT: v_and_b32_e32 v1, 1, v1
-; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v21
-; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v29
-; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v20
-; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v12, v30, v22, vcc_lo
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v10
+; GFX11TRUE16-NEXT: v_and_b32_e32 v2, 1, v2
; GFX11TRUE16-NEXT: v_and_b32_e32 v3, 1, v3
-; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v28
-; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v22
-; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v30
-; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v10, v29, v21, vcc_lo
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v8
-; GFX11TRUE16-NEXT: v_and_b32_e32 v5, 1, v5
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v22.l, v36.l
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v30.l, v35.l
; GFX11TRUE16-NEXT: v_and_b32_e32 v4, 1, v4
-; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v8, v28, v20, vcc_lo
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v20.l, v48.l
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v28.l, v39.l
+; GFX11TRUE16-NEXT: v_and_b32_e32 v5, 1, v5
; GFX11TRUE16-NEXT: v_and_b32_e32 v6, 1, v6
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v21.l, v38.l
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v29.l, v37.l
-; GFX11TRUE16-NEXT: v_and_b32_e32 v2, 1, v2
-; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v6
+; GFX11TRUE16-NEXT: v_and_b32_e32 v7, 1, v7
+; GFX11TRUE16-NEXT: v_and_b32_e32 v8, 1, v8
; GFX11TRUE16-NEXT: v_and_b32_e32 v9, 1, v9
+; GFX11TRUE16-NEXT: v_and_b32_e32 v10, 1, v10
+; GFX11TRUE16-NEXT: v_and_b32_e32 v11, 1, v11
+; GFX11TRUE16-NEXT: v_and_b32_e32 v12, 1, v12
+; GFX11TRUE16-NEXT: v_and_b32_e32 v13, 1, v13
+; GFX11TRUE16-NEXT: v_and_b32_e32 v14, 1, v14
+; GFX11TRUE16-NEXT: v_and_b32_e32 v15, 1, v15
+; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v23
+; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v22
+; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v30
+; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v21
+; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v29
+; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v20
+; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v28
+; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v19
+; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v27
; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v18
; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v26
; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v17
-; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v6, v27, v19, vcc_lo
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v4
-; GFX11TRUE16-NEXT: v_and_b32_e32 v7, 1, v7
; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v52, 16, v25
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v19.l, v50.l
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v27.l, v49.l
-; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v4, v26, v18, vcc_lo
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2
-; GFX11TRUE16-NEXT: v_and_b32_e32 v11, 1, v11
; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v53, 16, v16
; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v54, 16, v24
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v18.l, v52.l
-; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v2, v25, v17, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX11TRUE16-NEXT: v_and_b32_e32 v13, 1, v13
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v26.l, v51.l
-; GFX11TRUE16-NEXT: v_and_b32_e32 v14, 1, v14
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v17.l, v54.l
-; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v24, v16, vcc_lo
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v13
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v25.l, v53.l
-; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v23
-; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v13, v34, v33, vcc_lo
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v11
-; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2)
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v6.h, v13.l
-; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v11, v22, v30, vcc_lo
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v9
-; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v9, v21, v29, vcc_lo
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v7
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v4.h, v9.l
-; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v7, v20, v28, vcc_lo
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v5
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v5.l, v10.l
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v5.h, v11.l
-; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v16, v19, v27, vcc_lo
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3
-; GFX11TRUE16-NEXT: v_and_b32_e32 v15, 1, v15
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v3.h, v7.l
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v3.l, v6.l
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v6.l, v12.l
-; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v18, v18, v26, vcc_lo
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.l
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.l, v4.l
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v4.l, v8.l
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.h, v18.l
-; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v17, v17, v25, vcc_lo
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v14
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.h, v16.l
-; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.h, v17.l
+; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s0, 1, v1
+; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s1, 1, v2
+; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s2, 1, v3
+; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s3, 1, v4
+; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s4, 1, v5
+; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s5, 1, v6
+; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s6, 1, v7
+; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s7, 1, v8
+; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s8, 1, v9
+; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s9, 1, v10
+; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s10, 1, v13
+; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s11, 1, v12
+; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s12, 1, v11
+; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s13, 1, v14
+; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s14, 1, v15
+; GFX11TRUE16-NEXT: v_cndmask_b16 v6.h, v34.l, v33.l, s10
+; GFX11TRUE16-NEXT: v_cndmask_b16 v6.l, v30.l, v22.l, s11
+; GFX11TRUE16-NEXT: v_cndmask_b16 v5.h, v36.l, v35.l, s12
+; GFX11TRUE16-NEXT: v_cndmask_b16 v5.l, v29.l, v21.l, s9
+; GFX11TRUE16-NEXT: v_cndmask_b16 v4.h, v38.l, v37.l, s8
+; GFX11TRUE16-NEXT: v_cndmask_b16 v4.l, v28.l, v20.l, s7
+; GFX11TRUE16-NEXT: v_cndmask_b16 v3.h, v48.l, v39.l, s6
+; GFX11TRUE16-NEXT: v_cndmask_b16 v3.l, v27.l, v19.l, s5
+; GFX11TRUE16-NEXT: v_cndmask_b16 v2.h, v50.l, v49.l, s4
+; GFX11TRUE16-NEXT: v_cndmask_b16 v1.h, v52.l, v51.l, s2
+; GFX11TRUE16-NEXT: v_cndmask_b16 v0.h, v54.l, v53.l, s0
+; GFX11TRUE16-NEXT: v_cndmask_b16 v0.l, v24.l, v16.l, vcc_lo
+; GFX11TRUE16-NEXT: v_cndmask_b16 v1.l, v25.l, v17.l, s1
+; GFX11TRUE16-NEXT: v_cndmask_b16 v2.l, v26.l, v18.l, s3
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v10.l, v31.l
; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v8, 16, v31
-; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v7, v10, v23, vcc_lo
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v15
-; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v8, v8, v32, vcc_lo
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v7.h, v8.l
+; GFX11TRUE16-NEXT: v_cndmask_b16 v7.l, v31.l, v23.l, s13
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11TRUE16-NEXT: v_cndmask_b16 v7.h, v8.l, v32.l, s14
; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11FAKE16-LABEL: v_vselect_v16bf16:
@@ -39400,219 +39368,206 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x
; GFX10-LABEL: v_vselect_v32bf16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: s_or_saveexec_b32 s4, -1
-; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
-; GFX10-NEXT: s_waitcnt_depctr 0xffe3
-; GFX10-NEXT: s_mov_b32 exec_lo, s4
-; GFX10-NEXT: v_and_b32_e32 v29, 1, v29
+; GFX10-NEXT: s_clause 0xa
+; GFX10-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:28
+; GFX10-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:92
+; GFX10-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:40
+; GFX10-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:104
+; GFX10-NEXT: buffer_load_ushort v35, off, s[0:3], s32
+; GFX10-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:128
+; GFX10-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:64
+; GFX10-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:96
+; GFX10-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:108
+; GFX10-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:44
+; GFX10-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:112
; GFX10-NEXT: v_and_b32_e32 v30, 1, v30
+; GFX10-NEXT: v_and_b32_e32 v18, 1, v18
+; GFX10-NEXT: v_and_b32_e32 v12, 1, v12
+; GFX10-NEXT: v_and_b32_e32 v13, 1, v13
+; GFX10-NEXT: v_and_b32_e32 v19, 1, v19
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v30
+; GFX10-NEXT: v_cmp_eq_u32_e64 s6, 1, v18
; GFX10-NEXT: v_and_b32_e32 v28, 1, v28
+; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 1, v13
+; GFX10-NEXT: v_cmp_eq_u32_e64 s5, 1, v19
; GFX10-NEXT: v_and_b32_e32 v26, 1, v26
; GFX10-NEXT: v_and_b32_e32 v24, 1, v24
; GFX10-NEXT: v_and_b32_e32 v22, 1, v22
; GFX10-NEXT: v_and_b32_e32 v20, 1, v20
-; GFX10-NEXT: v_and_b32_e32 v18, 1, v18
+; GFX10-NEXT: v_and_b32_e32 v21, 1, v21
; GFX10-NEXT: v_and_b32_e32 v16, 1, v16
; GFX10-NEXT: v_and_b32_e32 v14, 1, v14
-; GFX10-NEXT: v_and_b32_e32 v12, 1, v12
-; GFX10-NEXT: s_clause 0x14
-; GFX10-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:60
-; GFX10-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:124
-; GFX10-NEXT: buffer_load_ushort v33, off, s[0:3], s32
-; GFX10-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:128
-; GFX10-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:64
-; GFX10-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:48
-; GFX10-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:116
-; GFX10-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:52
-; GFX10-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:120
-; GFX10-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:56
-; GFX10-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:32
-; GFX10-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:100
-; GFX10-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:36
-; GFX10-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:104
-; GFX10-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:40
-; GFX10-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:108
-; GFX10-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:44
-; GFX10-NEXT: buffer_load_dword v64, off, s[0:3], s32 offset:112
-; GFX10-NEXT: buffer_load_dword v65, off, s[0:3], s32 offset:72
-; GFX10-NEXT: buffer_load_dword v66, off, s[0:3], s32 offset:76
-; GFX10-NEXT: buffer_load_dword v67, off, s[0:3], s32 offset:80
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v29
-; GFX10-NEXT: s_clause 0x1
-; GFX10-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:92
-; GFX10-NEXT: buffer_load_dword v68, off, s[0:3], s32 offset:28
-; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 1, v30
-; GFX10-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:96
-; GFX10-NEXT: v_cmp_eq_u32_e64 s5, 1, v28
-; GFX10-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:88
-; GFX10-NEXT: v_cmp_eq_u32_e64 s6, 1, v26
-; GFX10-NEXT: v_cmp_eq_u32_e64 s7, 1, v24
-; GFX10-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84
-; GFX10-NEXT: v_cmp_eq_u32_e64 s8, 1, v22
-; GFX10-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:20
-; GFX10-NEXT: v_cmp_eq_u32_e64 s9, 1, v20
-; GFX10-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:16
-; GFX10-NEXT: v_cmp_eq_u32_e64 s10, 1, v18
-; GFX10-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:12
-; GFX10-NEXT: v_cmp_eq_u32_e64 s11, 1, v16
-; GFX10-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:8
-; GFX10-NEXT: v_cmp_eq_u32_e64 s12, 1, v14
-; GFX10-NEXT: s_clause 0x1
-; GFX10-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:68
-; GFX10-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:24
-; GFX10-NEXT: v_cmp_eq_u32_e64 s13, 1, v12
-; GFX10-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:4
-; GFX10-NEXT: v_writelane_b32 v40, s30, 0
-; GFX10-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX10-NEXT: v_and_b32_e32 v2, 1, v2
-; GFX10-NEXT: v_and_b32_e32 v4, 1, v4
-; GFX10-NEXT: v_and_b32_e32 v6, 1, v6
-; GFX10-NEXT: v_and_b32_e32 v8, 1, v8
+; GFX10-NEXT: v_and_b32_e32 v17, 1, v17
+; GFX10-NEXT: v_and_b32_e32 v15, 1, v15
; GFX10-NEXT: v_and_b32_e32 v10, 1, v10
-; GFX10-NEXT: v_writelane_b32 v40, s31, 1
-; GFX10-NEXT: v_and_b32_e32 v1, 1, v1
+; GFX10-NEXT: v_and_b32_e32 v8, 1, v8
+; GFX10-NEXT: v_and_b32_e32 v6, 1, v6
+; GFX10-NEXT: v_and_b32_e32 v4, 1, v4
+; GFX10-NEXT: v_and_b32_e32 v2, 1, v2
+; GFX10-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX10-NEXT: v_and_b32_e32 v11, 1, v11
+; GFX10-NEXT: v_and_b32_e32 v7, 1, v7
; GFX10-NEXT: v_and_b32_e32 v3, 1, v3
+; GFX10-NEXT: v_and_b32_e32 v1, 1, v1
; GFX10-NEXT: v_and_b32_e32 v5, 1, v5
-; GFX10-NEXT: v_and_b32_e32 v7, 1, v7
; GFX10-NEXT: v_and_b32_e32 v9, 1, v9
-; GFX10-NEXT: v_and_b32_e32 v11, 1, v11
-; GFX10-NEXT: v_and_b32_e32 v13, 1, v13
-; GFX10-NEXT: v_and_b32_e32 v15, 1, v15
-; GFX10-NEXT: v_and_b32_e32 v17, 1, v17
-; GFX10-NEXT: v_and_b32_e32 v19, 1, v19
-; GFX10-NEXT: v_and_b32_e32 v21, 1, v21
-; GFX10-NEXT: v_and_b32_e32 v23, 1, v23
-; GFX10-NEXT: v_and_b32_e32 v25, 1, v25
-; GFX10-NEXT: v_and_b32_e32 v27, 1, v27
-; GFX10-NEXT: v_cmp_eq_u32_e64 s14, 1, v10
-; GFX10-NEXT: v_cmp_eq_u32_e64 s15, 1, v8
-; GFX10-NEXT: v_cmp_eq_u32_e64 s16, 1, v6
-; GFX10-NEXT: v_cmp_eq_u32_e64 s17, 1, v4
-; GFX10-NEXT: v_cmp_eq_u32_e64 s18, 1, v2
-; GFX10-NEXT: v_cmp_eq_u32_e64 s19, 1, v0
-; GFX10-NEXT: v_writelane_b32 v40, s34, 2
-; GFX10-NEXT: v_cmp_eq_u32_e64 s20, 1, v27
-; GFX10-NEXT: v_cmp_eq_u32_e64 s21, 1, v25
-; GFX10-NEXT: v_cmp_eq_u32_e64 s22, 1, v23
-; GFX10-NEXT: v_cmp_eq_u32_e64 s23, 1, v21
-; GFX10-NEXT: v_cmp_eq_u32_e64 s24, 1, v19
-; GFX10-NEXT: v_cmp_eq_u32_e64 s25, 1, v17
-; GFX10-NEXT: v_cmp_eq_u32_e64 s26, 1, v15
-; GFX10-NEXT: v_cmp_eq_u32_e64 s27, 1, v13
-; GFX10-NEXT: v_cmp_eq_u32_e64 s28, 1, v11
-; GFX10-NEXT: v_cmp_eq_u32_e64 s29, 1, v7
-; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_hi, 1, v3
-; GFX10-NEXT: v_cmp_eq_u32_e64 s30, 1, v1
-; GFX10-NEXT: v_cmp_eq_u32_e64 s31, 1, v5
-; GFX10-NEXT: v_cmp_eq_u32_e64 s34, 1, v9
-; GFX10-NEXT: s_waitcnt vmcnt(32)
-; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v31
-; GFX10-NEXT: s_waitcnt vmcnt(31)
-; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v32
-; GFX10-NEXT: s_waitcnt vmcnt(30)
-; GFX10-NEXT: v_and_b32_e32 v2, 1, v33
-; GFX10-NEXT: s_waitcnt vmcnt(29)
-; GFX10-NEXT: v_lshrrev_b32_e32 v4, 16, v34
-; GFX10-NEXT: s_waitcnt vmcnt(28)
-; GFX10-NEXT: v_cndmask_b32_e64 v15, v34, v35, s4
-; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v35
-; GFX10-NEXT: v_cndmask_b32_e64 v17, v32, v31, s5
-; GFX10-NEXT: s_waitcnt vmcnt(25)
-; GFX10-NEXT: v_cndmask_b32_e64 v19, v37, v38, s7
-; GFX10-NEXT: s_waitcnt vmcnt(24)
-; GFX10-NEXT: v_lshrrev_b32_e32 v6, 16, v39
-; GFX10-NEXT: s_waitcnt vmcnt(23)
-; GFX10-NEXT: v_cndmask_b32_e64 v13, v39, v48, s6
-; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v48
-; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v38
-; GFX10-NEXT: v_lshrrev_b32_e32 v8, 16, v37
-; GFX10-NEXT: v_lshrrev_b32_e32 v9, 16, v36
-; GFX10-NEXT: s_waitcnt vmcnt(18)
-; GFX10-NEXT: v_cndmask_b32_e64 v27, v52, v53, s10
-; GFX10-NEXT: s_waitcnt vmcnt(17)
-; GFX10-NEXT: v_lshrrev_b32_e32 v25, 16, v54
-; GFX10-NEXT: s_waitcnt vmcnt(16)
-; GFX10-NEXT: v_cndmask_b32_e64 v21, v54, v55, s9
-; GFX10-NEXT: s_waitcnt vmcnt(15)
-; GFX10-NEXT: v_cndmask_b32_e64 v11, v64, v36, s8
-; GFX10-NEXT: v_lshrrev_b32_e32 v10, 16, v64
-; GFX10-NEXT: v_lshrrev_b32_e32 v23, 16, v55
-; GFX10-NEXT: v_lshrrev_b32_e32 v31, 16, v53
-; GFX10-NEXT: v_lshrrev_b32_e32 v32, 16, v52
-; GFX10-NEXT: v_cndmask_b32_e64 v33, v50, v51, s11
-; GFX10-NEXT: v_lshrrev_b32_e32 v34, 16, v51
-; GFX10-NEXT: v_lshrrev_b32_e32 v35, 16, v50
+; GFX10-NEXT: s_waitcnt vmcnt(10)
+; GFX10-NEXT: v_lshrrev_b32_e32 v30, 16, v31
; GFX10-NEXT: s_waitcnt vmcnt(9)
-; GFX10-NEXT: v_cndmask_b32_e64 v36, v30, v49, s12
-; GFX10-NEXT: v_lshrrev_b32_e32 v37, 16, v49
-; GFX10-NEXT: v_lshrrev_b32_e32 v30, 16, v30
-; GFX10-NEXT: v_cndmask_b32_e64 v38, v29, v68, s13
-; GFX10-NEXT: v_lshrrev_b32_e32 v39, 16, v68
-; GFX10-NEXT: v_lshrrev_b32_e32 v29, 16, v29
+; GFX10-NEXT: v_lshrrev_b32_e32 v50, 16, v32
+; GFX10-NEXT: s_waitcnt vmcnt(8)
+; GFX10-NEXT: v_lshrrev_b32_e32 v13, 16, v33
+; GFX10-NEXT: s_waitcnt vmcnt(7)
+; GFX10-NEXT: v_cndmask_b32_e64 v18, v34, v33, s6
; GFX10-NEXT: s_waitcnt vmcnt(6)
-; GFX10-NEXT: v_cndmask_b32_e64 v49, v24, v22, s15
-; GFX10-NEXT: v_lshrrev_b32_e32 v22, 16, v22
-; GFX10-NEXT: v_lshrrev_b32_e32 v24, 16, v24
-; GFX10-NEXT: s_waitcnt vmcnt(5)
-; GFX10-NEXT: v_cndmask_b32_e64 v50, v67, v20, s16
-; GFX10-NEXT: v_lshrrev_b32_e32 v20, 16, v20
-; GFX10-NEXT: v_lshrrev_b32_e32 v51, 16, v67
+; GFX10-NEXT: v_and_b32_e32 v35, 1, v35
+; GFX10-NEXT: v_cmp_eq_u32_e64 s6, 1, v12
; GFX10-NEXT: s_waitcnt vmcnt(4)
-; GFX10-NEXT: v_cndmask_b32_e64 v52, v66, v18, s17
-; GFX10-NEXT: v_lshrrev_b32_e32 v18, 16, v18
-; GFX10-NEXT: s_waitcnt vmcnt(1)
-; GFX10-NEXT: v_cndmask_b32_e64 v48, v28, v26, s14
-; GFX10-NEXT: v_lshrrev_b32_e32 v26, 16, v26
-; GFX10-NEXT: v_lshrrev_b32_e32 v28, 16, v28
-; GFX10-NEXT: v_lshrrev_b32_e32 v53, 16, v66
-; GFX10-NEXT: v_cndmask_b32_e64 v54, v65, v16, s18
-; GFX10-NEXT: v_lshrrev_b32_e32 v16, 16, v16
-; GFX10-NEXT: v_lshrrev_b32_e32 v55, 16, v65
+; GFX10-NEXT: v_cndmask_b32_e32 v54, v36, v37, vcc_lo
+; GFX10-NEXT: v_lshrrev_b32_e32 v37, 16, v37
+; GFX10-NEXT: v_lshrrev_b32_e32 v36, 16, v36
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v35
+; GFX10-NEXT: v_lshrrev_b32_e32 v51, 16, v34
+; GFX10-NEXT: v_cndmask_b32_e64 v12, v32, v31, s6
+; GFX10-NEXT: s_clause 0x6
+; GFX10-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:68
+; GFX10-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4
+; GFX10-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:72
+; GFX10-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
+; GFX10-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:76
+; GFX10-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:12
+; GFX10-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:80
+; GFX10-NEXT: v_cndmask_b32_e64 v30, v50, v30, s4
+; GFX10-NEXT: v_cndmask_b32_e32 v35, v36, v37, vcc_lo
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:124
+; GFX10-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:60
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v28
+; GFX10-NEXT: v_and_b32_e32 v28, 1, v29
+; GFX10-NEXT: v_cndmask_b32_e64 v13, v51, v13, s5
+; GFX10-NEXT: s_waitcnt vmcnt(3)
+; GFX10-NEXT: v_lshrrev_b32_e32 v50, 16, v52
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_cndmask_b32_e64 v64, v14, v12, s19
-; GFX10-NEXT: v_lshrrev_b32_e32 v12, 16, v12
-; GFX10-NEXT: v_lshrrev_b32_e32 v14, 16, v14
-; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 1, v2
-; GFX10-NEXT: v_cndmask_b32_e32 v65, v1, v0, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e64 v66, v6, v5, s20
-; GFX10-NEXT: v_cndmask_b32_e64 v67, v8, v7, s21
-; GFX10-NEXT: v_cndmask_b32_e64 v68, v10, v9, s22
-; GFX10-NEXT: v_cndmask_b32_e64 v10, v25, v23, s23
-; GFX10-NEXT: v_cndmask_b32_e64 v9, v32, v31, s24
-; GFX10-NEXT: v_cndmask_b32_e64 v8, v35, v34, s25
-; GFX10-NEXT: v_cndmask_b32_e64 v7, v30, v37, s26
-; GFX10-NEXT: v_cndmask_b32_e64 v6, v29, v39, s27
-; GFX10-NEXT: v_cndmask_b32_e64 v5, v28, v26, s28
-; GFX10-NEXT: v_cndmask_b32_e64 v20, v51, v20, s29
-; GFX10-NEXT: v_cndmask_b32_e64 v0, v14, v12, s30
-; GFX10-NEXT: v_cndmask_b32_e64 v1, v55, v16, vcc_hi
-; GFX10-NEXT: v_cndmask_b32_e64 v2, v53, v18, s31
-; GFX10-NEXT: v_cndmask_b32_e64 v12, v24, v22, s34
-; GFX10-NEXT: v_cndmask_b32_e64 v16, v4, v3, s4
-; GFX10-NEXT: v_perm_b32 v0, v0, v64, 0x5040100
-; GFX10-NEXT: v_perm_b32 v1, v1, v54, 0x5040100
-; GFX10-NEXT: v_perm_b32 v2, v2, v52, 0x5040100
-; GFX10-NEXT: v_perm_b32 v3, v20, v50, 0x5040100
-; GFX10-NEXT: v_perm_b32 v4, v12, v49, 0x5040100
-; GFX10-NEXT: v_perm_b32 v5, v5, v48, 0x5040100
-; GFX10-NEXT: v_perm_b32 v6, v6, v38, 0x5040100
-; GFX10-NEXT: v_perm_b32 v7, v7, v36, 0x5040100
-; GFX10-NEXT: v_perm_b32 v8, v8, v33, 0x5040100
-; GFX10-NEXT: v_perm_b32 v9, v9, v27, 0x5040100
-; GFX10-NEXT: v_perm_b32 v10, v10, v21, 0x5040100
-; GFX10-NEXT: v_perm_b32 v11, v68, v11, 0x5040100
-; GFX10-NEXT: v_perm_b32 v12, v67, v19, 0x5040100
-; GFX10-NEXT: v_perm_b32 v13, v66, v13, 0x5040100
-; GFX10-NEXT: v_perm_b32 v14, v65, v17, 0x5040100
-; GFX10-NEXT: v_perm_b32 v15, v16, v15, 0x5040100
-; GFX10-NEXT: v_readlane_b32 s34, v40, 2
-; GFX10-NEXT: v_readlane_b32 s31, v40, 1
-; GFX10-NEXT: v_readlane_b32 s30, v40, 0
-; GFX10-NEXT: s_or_saveexec_b32 s4, -1
-; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
-; GFX10-NEXT: s_waitcnt_depctr 0xffe3
-; GFX10-NEXT: s_mov_b32 exec_lo, s4
+; GFX10-NEXT: v_cndmask_b32_e32 v29, v36, v37, vcc_lo
+; GFX10-NEXT: v_lshrrev_b32_e32 v37, 16, v37
+; GFX10-NEXT: v_lshrrev_b32_e32 v36, 16, v36
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v28
+; GFX10-NEXT: v_cndmask_b32_e32 v28, v36, v37, vcc_lo
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:120
+; GFX10-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:56
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v26
+; GFX10-NEXT: v_and_b32_e32 v26, 1, v27
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_cndmask_b32_e32 v27, v36, v37, vcc_lo
+; GFX10-NEXT: v_lshrrev_b32_e32 v37, 16, v37
+; GFX10-NEXT: v_lshrrev_b32_e32 v36, 16, v36
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v26
+; GFX10-NEXT: v_cndmask_b32_e32 v26, v36, v37, vcc_lo
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:116
+; GFX10-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:52
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v24
+; GFX10-NEXT: v_and_b32_e32 v24, 1, v25
; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_cndmask_b32_e32 v25, v36, v37, vcc_lo
+; GFX10-NEXT: v_lshrrev_b32_e32 v37, 16, v37
+; GFX10-NEXT: v_lshrrev_b32_e32 v36, 16, v36
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v24
+; GFX10-NEXT: v_cndmask_b32_e32 v24, v36, v37, vcc_lo
+; GFX10-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:48
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v22
+; GFX10-NEXT: v_and_b32_e32 v22, 1, v23
+; GFX10-NEXT: v_lshrrev_b32_e32 v37, 16, v49
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_cndmask_b32_e32 v23, v49, v36, vcc_lo
+; GFX10-NEXT: v_lshrrev_b32_e32 v36, 16, v36
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v22
+; GFX10-NEXT: v_lshrrev_b32_e32 v49, 16, v53
+; GFX10-NEXT: v_cndmask_b32_e32 v22, v37, v36, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v20
+; GFX10-NEXT: v_lshrrev_b32_e32 v36, 16, v48
+; GFX10-NEXT: v_lshrrev_b32_e32 v37, 16, v39
+; GFX10-NEXT: v_cndmask_b32_e32 v20, v39, v48, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v21
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:32
+; GFX10-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:16
+; GFX10-NEXT: v_cndmask_b32_e32 v21, v37, v36, vcc_lo
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:100
+; GFX10-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:36
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v16
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_cndmask_b32_e32 v16, v36, v37, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v14
+; GFX10-NEXT: v_lshrrev_b32_e32 v37, 16, v37
+; GFX10-NEXT: v_lshrrev_b32_e32 v36, 16, v36
+; GFX10-NEXT: v_cndmask_b32_e32 v14, v38, v39, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v17
+; GFX10-NEXT: v_lshrrev_b32_e32 v39, 16, v39
+; GFX10-NEXT: v_lshrrev_b32_e32 v38, 16, v38
+; GFX10-NEXT: v_cndmask_b32_e32 v17, v36, v37, vcc_lo
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:88
+; GFX10-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:24
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v15
+; GFX10-NEXT: v_cndmask_b32_e32 v15, v38, v39, vcc_lo
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:84
+; GFX10-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:20
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v10
+; GFX10-NEXT: s_waitcnt vmcnt(2)
+; GFX10-NEXT: v_cndmask_b32_e32 v10, v36, v37, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v8
+; GFX10-NEXT: v_lshrrev_b32_e32 v37, 16, v37
+; GFX10-NEXT: v_lshrrev_b32_e32 v36, 16, v36
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_cndmask_b32_e32 v8, v38, v39, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v6
+; GFX10-NEXT: v_lshrrev_b32_e32 v39, 16, v39
+; GFX10-NEXT: v_lshrrev_b32_e32 v38, 16, v38
+; GFX10-NEXT: v_cndmask_b32_e32 v6, v53, v48, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v4
+; GFX10-NEXT: v_lshrrev_b32_e32 v48, 16, v48
+; GFX10-NEXT: v_cndmask_b32_e32 v4, v34, v52, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2
+; GFX10-NEXT: v_lshrrev_b32_e32 v34, 16, v34
+; GFX10-NEXT: v_cndmask_b32_e32 v2, v32, v33, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
+; GFX10-NEXT: v_lshrrev_b32_e32 v33, 16, v33
+; GFX10-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v19, v31, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v11
+; GFX10-NEXT: v_lshrrev_b32_e32 v31, 16, v31
+; GFX10-NEXT: v_lshrrev_b32_e32 v19, 16, v19
+; GFX10-NEXT: v_cndmask_b32_e32 v11, v36, v37, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v7
+; GFX10-NEXT: v_cndmask_b32_e32 v7, v49, v48, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3
+; GFX10-NEXT: v_cndmask_b32_e32 v3, v32, v33, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1
+; GFX10-NEXT: v_cndmask_b32_e32 v1, v19, v31, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v5
+; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
+; GFX10-NEXT: v_cndmask_b32_e32 v5, v34, v50, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v9
+; GFX10-NEXT: v_perm_b32 v1, v3, v2, 0x5040100
+; GFX10-NEXT: v_perm_b32 v3, v7, v6, 0x5040100
+; GFX10-NEXT: v_perm_b32 v6, v30, v12, 0x5040100
+; GFX10-NEXT: v_perm_b32 v2, v5, v4, 0x5040100
+; GFX10-NEXT: v_cndmask_b32_e32 v9, v38, v39, vcc_lo
+; GFX10-NEXT: v_perm_b32 v5, v11, v10, 0x5040100
+; GFX10-NEXT: v_perm_b32 v7, v15, v14, 0x5040100
+; GFX10-NEXT: v_perm_b32 v10, v21, v20, 0x5040100
+; GFX10-NEXT: v_perm_b32 v11, v22, v23, 0x5040100
+; GFX10-NEXT: v_perm_b32 v4, v9, v8, 0x5040100
+; GFX10-NEXT: v_perm_b32 v8, v17, v16, 0x5040100
+; GFX10-NEXT: v_perm_b32 v9, v13, v18, 0x5040100
+; GFX10-NEXT: v_perm_b32 v12, v24, v25, 0x5040100
+; GFX10-NEXT: v_perm_b32 v13, v26, v27, 0x5040100
+; GFX10-NEXT: v_perm_b32 v14, v28, v29, 0x5040100
+; GFX10-NEXT: v_perm_b32 v15, v35, v54, 0x5040100
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11TRUE16-LABEL: v_vselect_v32bf16:
@@ -39652,209 +39607,167 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x
; GFX11TRUE16-NEXT: scratch_load_b32 v85, off, s32 offset:72
; GFX11TRUE16-NEXT: scratch_load_b32 v86, off, s32 offset:4
; GFX11TRUE16-NEXT: scratch_load_b32 v87, off, s32 offset:68
-; GFX11TRUE16-NEXT: v_and_b32_e32 v30, 1, v30
-; GFX11TRUE16-NEXT: v_and_b32_e32 v28, 1, v28
-; GFX11TRUE16-NEXT: v_and_b32_e32 v26, 1, v26
-; GFX11TRUE16-NEXT: v_and_b32_e32 v24, 1, v24
+; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX11TRUE16-NEXT: v_and_b32_e32 v14, 1, v14
+; GFX11TRUE16-NEXT: v_and_b32_e32 v16, 1, v16
+; GFX11TRUE16-NEXT: v_and_b32_e32 v18, 1, v18
+; GFX11TRUE16-NEXT: v_and_b32_e32 v20, 1, v20
; GFX11TRUE16-NEXT: v_and_b32_e32 v22, 1, v22
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v30
+; GFX11TRUE16-NEXT: v_and_b32_e32 v24, 1, v24
+; GFX11TRUE16-NEXT: v_and_b32_e32 v26, 1, v26
+; GFX11TRUE16-NEXT: v_and_b32_e32 v28, 1, v28
+; GFX11TRUE16-NEXT: v_and_b32_e32 v30, 1, v30
; GFX11TRUE16-NEXT: v_and_b32_e32 v1, 1, v1
-; GFX11TRUE16-NEXT: v_and_b32_e32 v20, 1, v20
-; GFX11TRUE16-NEXT: v_and_b32_e32 v18, 1, v18
-; GFX11TRUE16-NEXT: v_and_b32_e32 v16, 1, v16
+; GFX11TRUE16-NEXT: v_and_b32_e32 v2, 1, v2
+; GFX11TRUE16-NEXT: v_and_b32_e32 v3, 1, v3
+; GFX11TRUE16-NEXT: v_and_b32_e32 v4, 1, v4
+; GFX11TRUE16-NEXT: v_and_b32_e32 v5, 1, v5
+; GFX11TRUE16-NEXT: v_and_b32_e32 v6, 1, v6
+; GFX11TRUE16-NEXT: v_and_b32_e32 v7, 1, v7
+; GFX11TRUE16-NEXT: v_and_b32_e32 v8, 1, v8
+; GFX11TRUE16-NEXT: v_and_b32_e32 v9, 1, v9
+; GFX11TRUE16-NEXT: v_and_b32_e32 v10, 1, v10
+; GFX11TRUE16-NEXT: v_and_b32_e32 v11, 1, v11
+; GFX11TRUE16-NEXT: v_and_b32_e32 v12, 1, v12
+; GFX11TRUE16-NEXT: v_and_b32_e32 v13, 1, v13
+; GFX11TRUE16-NEXT: v_and_b32_e32 v15, 1, v15
+; GFX11TRUE16-NEXT: v_and_b32_e32 v17, 1, v17
+; GFX11TRUE16-NEXT: v_and_b32_e32 v19, 1, v19
+; GFX11TRUE16-NEXT: v_and_b32_e32 v21, 1, v21
+; GFX11TRUE16-NEXT: v_and_b32_e32 v23, 1, v23
+; GFX11TRUE16-NEXT: v_and_b32_e32 v25, 1, v25
+; GFX11TRUE16-NEXT: v_and_b32_e32 v27, 1, v27
+; GFX11TRUE16-NEXT: v_and_b32_e32 v29, 1, v29
+; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
+; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s13, 1, v14
+; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s15, 1, v16
+; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s17, 1, v18
+; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s19, 1, v20
+; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s21, 1, v22
+; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s23, 1, v24
+; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s25, 1, v26
+; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s27, 1, v28
+; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s28, 1, v30
+; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s0, 1, v1
+; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s1, 1, v2
+; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s2, 1, v3
+; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s3, 1, v4
+; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s4, 1, v5
+; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s5, 1, v6
+; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s6, 1, v7
+; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s7, 1, v8
+; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s8, 1, v9
+; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s9, 1, v10
+; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s10, 1, v11
+; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s11, 1, v12
+; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s12, 1, v13
+; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s14, 1, v15
+; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s16, 1, v17
+; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s18, 1, v19
+; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s20, 1, v21
+; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s22, 1, v23
+; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s24, 1, v25
+; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s26, 1, v27
+; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s29, 1, v29
+; GFX11TRUE16-NEXT: s_waitcnt vmcnt(32)
+; GFX11TRUE16-NEXT: v_and_b32_e32 v16, 1, v31
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(31)
-; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v96, 16, v32
+; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v17, 16, v32
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(30)
-; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v32, v33, v32, vcc_lo
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v28
-; GFX11TRUE16-NEXT: v_and_b32_e32 v3, 1, v3
+; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v18, 16, v33
+; GFX11TRUE16-NEXT: v_cndmask_b16 v15.l, v33.l, v32.l, s28
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(29)
-; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v98, 16, v34
+; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v19, 16, v34
+; GFX11TRUE16-NEXT: s_waitcnt vmcnt(28)
+; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v20, 16, v35
+; GFX11TRUE16-NEXT: v_cndmask_b16 v14.l, v35.l, v34.l, s27
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(27)
-; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v100, 16, v36
+; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v21, 16, v36
+; GFX11TRUE16-NEXT: s_waitcnt vmcnt(26)
+; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v22, 16, v37
+; GFX11TRUE16-NEXT: v_cndmask_b16 v13.l, v37.l, v36.l, s25
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(25)
-; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v102, 16, v38
-; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v34, v35, v34, vcc_lo
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v26
-; GFX11TRUE16-NEXT: v_and_b32_e32 v5, 1, v5
+; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v23, 16, v38
+; GFX11TRUE16-NEXT: s_waitcnt vmcnt(24)
+; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v24, 16, v39
+; GFX11TRUE16-NEXT: v_cndmask_b16 v12.l, v39.l, v38.l, s23
+; GFX11TRUE16-NEXT: s_waitcnt vmcnt(23)
+; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v25, 16, v48
+; GFX11TRUE16-NEXT: s_waitcnt vmcnt(22)
+; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v49
+; GFX11TRUE16-NEXT: v_cndmask_b16 v11.l, v49.l, v48.l, s21
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(21)
-; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v114, 16, v50
+; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v27, 16, v50
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(20)
-; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v115, 16, v51
+; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v51
+; GFX11TRUE16-NEXT: v_cndmask_b16 v10.l, v51.l, v50.l, s19
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(19)
-; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v116, 16, v52
-; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v26, v37, v36, vcc_lo
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v24
-; GFX11TRUE16-NEXT: v_and_b32_e32 v7, 1, v7
+; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v52
+; GFX11TRUE16-NEXT: s_waitcnt vmcnt(18)
+; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v53
+; GFX11TRUE16-NEXT: v_cndmask_b16 v9.l, v53.l, v52.l, s17
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(17)
-; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v118, 16, v54
+; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v54
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(16)
-; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v119, 16, v55
-; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v117, 16, v53
-; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v24, v39, v38, vcc_lo
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v22
-; GFX11TRUE16-NEXT: v_and_b32_e32 v9, 1, v9
-; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v112, 16, v48
-; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v113, 16, v49
-; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v103, 16, v39
-; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v22, v49, v48, vcc_lo
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v20
-; GFX11TRUE16-NEXT: v_and_b32_e32 v11, 1, v11
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v48.l, v115.l
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v49.l, v114.l
-; GFX11TRUE16-NEXT: v_and_b32_e32 v10, 1, v10
-; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v20, v51, v50, vcc_lo
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v18
-; GFX11TRUE16-NEXT: v_and_b32_e32 v13, 1, v13
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v50.l, v117.l
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v51.l, v116.l
-; GFX11TRUE16-NEXT: v_and_b32_e32 v12, 1, v12
-; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v18, v53, v52, vcc_lo
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v52.l, v119.l
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v53.l, v118.l
-; GFX11TRUE16-NEXT: v_and_b32_e32 v14, 1, v14
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v16
-; GFX11TRUE16-NEXT: v_and_b32_e32 v15, 1, v15
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v38.l, v113.l
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v39.l, v112.l
-; GFX11TRUE16-NEXT: v_and_b32_e32 v8, 1, v8
-; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v16, v55, v54, vcc_lo
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v14
-; GFX11TRUE16-NEXT: v_and_b32_e32 v17, 1, v17
-; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v101, 16, v37
-; GFX11TRUE16-NEXT: v_and_b32_e32 v6, 1, v6
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v36.l, v103.l
+; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v55
+; GFX11TRUE16-NEXT: v_cndmask_b16 v8.l, v55.l, v54.l, s15
+; GFX11TRUE16-NEXT: s_waitcnt vmcnt(15)
+; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v64
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(14)
-; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v14, v65, v64, vcc_lo
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v12
-; GFX11TRUE16-NEXT: v_and_b32_e32 v19, 1, v19
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v37.l, v102.l
-; GFX11TRUE16-NEXT: v_and_b32_e32 v4, 1, v4
-; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v97, 16, v33
+; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v65
+; GFX11TRUE16-NEXT: v_cndmask_b16 v7.l, v65.l, v64.l, s13
+; GFX11TRUE16-NEXT: s_waitcnt vmcnt(13)
+; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v66
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(12)
-; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v12, v67, v66, vcc_lo
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v10
-; GFX11TRUE16-NEXT: v_and_b32_e32 v21, 1, v21
-; GFX11TRUE16-NEXT: v_and_b32_e32 v2, 1, v2
-; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v99, 16, v35
-; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v67
+; GFX11TRUE16-NEXT: s_waitcnt vmcnt(11)
+; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v68
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(10)
-; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v10, v69, v68, vcc_lo
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v8
-; GFX11TRUE16-NEXT: v_and_b32_e32 v23, 1, v23
-; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v128, 16, v64
-; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v129, 16, v65
-; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v130, 16, v66
+; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v69
+; GFX11TRUE16-NEXT: s_waitcnt vmcnt(9)
+; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v70
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(8)
-; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v8, v71, v70, vcc_lo
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v6
-; GFX11TRUE16-NEXT: v_and_b32_e32 v25, 1, v25
-; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v131, 16, v67
-; GFX11TRUE16-NEXT: v_mov_b16_e64 v54.l, v129.l
-; GFX11TRUE16-NEXT: v_mov_b16_e64 v55.l, v128.l
+; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v71
+; GFX11TRUE16-NEXT: s_waitcnt vmcnt(7)
+; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v49, 16, v80
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(6)
-; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v6, v81, v80, vcc_lo
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v4
-; GFX11TRUE16-NEXT: v_and_b32_e32 v27, 1, v27
-; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v132, 16, v68
-; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v133, 16, v69
-; GFX11TRUE16-NEXT: v_mov_b16_e64 v64.l, v131.l
+; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v81
+; GFX11TRUE16-NEXT: s_waitcnt vmcnt(5)
+; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v82
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(4)
-; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v4, v83, v82, vcc_lo
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2
-; GFX11TRUE16-NEXT: v_and_b32_e32 v29, 1, v29
-; GFX11TRUE16-NEXT: v_mov_b16_e64 v65.l, v130.l
-; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v134, 16, v70
-; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v135, 16, v71
+; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v52, 16, v83
+; GFX11TRUE16-NEXT: s_waitcnt vmcnt(3)
+; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v53, 16, v84
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(2)
-; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v2, v85, v84, vcc_lo
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX11TRUE16-NEXT: v_and_b32_e32 v31, 1, v31
-; GFX11TRUE16-NEXT: v_mov_b16_e64 v66.l, v133.l
-; GFX11TRUE16-NEXT: v_mov_b16_e64 v67.l, v132.l
-; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v144, 16, v80
+; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v54, 16, v85
+; GFX11TRUE16-NEXT: s_waitcnt vmcnt(1)
+; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v55, 16, v86
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v87, v86, vcc_lo
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v31
-; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v145, 16, v81
-; GFX11TRUE16-NEXT: v_mov_b16_e64 v68.l, v135.l
-; GFX11TRUE16-NEXT: v_mov_b16_e64 v69.l, v134.l
-; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v146, 16, v82
-; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v31, v97, v96, vcc_lo
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v29
-; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v147, 16, v83
-; GFX11TRUE16-NEXT: v_mov_b16_e64 v70.l, v145.l
-; GFX11TRUE16-NEXT: v_mov_b16_e64 v71.l, v144.l
-; GFX11TRUE16-NEXT: v_mov_b16_e64 v81.l, v146.l
-; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v29, v99, v98, vcc_lo
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v27
-; GFX11TRUE16-NEXT: v_mov_b16_e64 v80.l, v147.l
-; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v84
-; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v85
-; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v86
-; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v27, v101, v100, vcc_lo
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v25
-; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v35, 16, v87
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v14.h, v29.l
-; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v25, v36, v37, vcc_lo
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v23
-; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2)
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v12.h, v25.l
-; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v23, v38, v39, vcc_lo
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v21
-; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v21, v48, v49, vcc_lo
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v19
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v10.h, v21.l
-; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v19, v50, v51, vcc_lo
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v17
-; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v17, v52, v53, vcc_lo
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v15
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v15.l, v32.l
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v15.h, v31.l
-; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v8.h, v17.l
-; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v36, v54, v55, vcc_lo
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v13
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v13.l, v26.l
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v13.h, v27.l
-; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v37, v64, v65, vcc_lo
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v11
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v11.l, v22.l
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v11.h, v23.l
-; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v6.h, v37.l
-; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v38, v66, v67, vcc_lo
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v9
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v9.l, v18.l
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v9.h, v19.l
-; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v39, v68, v69, vcc_lo
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v7
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v7.l, v14.l
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v14.l, v34.l
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v7.h, v36.l
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v4.h, v39.l
-; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v48, v70, v71, vcc_lo
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v5
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v5.l, v10.l
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v10.l, v20.l
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v5.h, v38.l
-; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v49, v80, v81, vcc_lo
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v3.l, v6.l
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v6.l, v12.l
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v12.l, v24.l
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.h, v49.l
-; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v30, v33, v30, vcc_lo
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.l
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.l, v4.l
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v4.l, v8.l
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v8.l, v16.l
-; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v28, v35, v28, vcc_lo
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.h, v30.l
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v3.h, v48.l
-; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.h, v28.l
+; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v64, 16, v87
+; GFX11TRUE16-NEXT: v_cndmask_b16 v0.l, v87.l, v86.l, vcc_lo
+; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v16
+; GFX11TRUE16-NEXT: v_cndmask_b16 v6.l, v67.l, v66.l, s11
+; GFX11TRUE16-NEXT: v_cndmask_b16 v5.l, v69.l, v68.l, s9
+; GFX11TRUE16-NEXT: v_cndmask_b16 v4.l, v71.l, v70.l, s7
+; GFX11TRUE16-NEXT: v_cndmask_b16 v3.l, v81.l, v80.l, s5
+; GFX11TRUE16-NEXT: v_cndmask_b16 v2.l, v83.l, v82.l, s3
+; GFX11TRUE16-NEXT: v_cndmask_b16 v1.l, v85.l, v84.l, s1
+; GFX11TRUE16-NEXT: v_cndmask_b16 v14.h, v20.l, v19.l, s29
+; GFX11TRUE16-NEXT: v_cndmask_b16 v13.h, v22.l, v21.l, s26
+; GFX11TRUE16-NEXT: v_cndmask_b16 v12.h, v24.l, v23.l, s24
+; GFX11TRUE16-NEXT: v_cndmask_b16 v11.h, v26.l, v25.l, s22
+; GFX11TRUE16-NEXT: v_cndmask_b16 v10.h, v28.l, v27.l, s20
+; GFX11TRUE16-NEXT: v_cndmask_b16 v9.h, v30.l, v29.l, s18
+; GFX11TRUE16-NEXT: v_cndmask_b16 v8.h, v32.l, v31.l, s16
+; GFX11TRUE16-NEXT: v_cndmask_b16 v7.h, v34.l, v33.l, s14
+; GFX11TRUE16-NEXT: v_cndmask_b16 v6.h, v36.l, v35.l, s12
+; GFX11TRUE16-NEXT: v_cndmask_b16 v5.h, v38.l, v37.l, s10
+; GFX11TRUE16-NEXT: v_cndmask_b16 v4.h, v48.l, v39.l, s8
+; GFX11TRUE16-NEXT: v_cndmask_b16 v0.h, v64.l, v55.l, s0
+; GFX11TRUE16-NEXT: v_cndmask_b16 v1.h, v54.l, v53.l, s2
+; GFX11TRUE16-NEXT: v_cndmask_b16 v2.h, v52.l, v51.l, s4
+; GFX11TRUE16-NEXT: v_cndmask_b16 v3.h, v50.l, v49.l, s6
+; GFX11TRUE16-NEXT: v_cndmask_b16 v15.h, v18.l, v17.l, vcc_lo
; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11FAKE16-LABEL: v_vselect_v32bf16:
diff --git a/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll b/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll
index b4dbe0e7be9245..768c6c0ac7a299 100644
--- a/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll
@@ -214,13 +214,21 @@ define <2 x half> @v_test_canonicalize_build_vector_v2f16(half %lo, half %hi) #1
; CI-NEXT: v_cvt_f32_f16_e32 v1, v1
; CI-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: v_test_canonicalize_build_vector_v2f16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_pk_max_f16 v0, v0, v0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: v_test_canonicalize_build_vector_v2f16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_pk_max_f16 v0, v0, v0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: v_test_canonicalize_build_vector_v2f16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_pk_max_f16 v0, v0, v0
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%ins0 = insertelement <2 x half> undef, half %lo, i32 0
%ins1 = insertelement <2 x half> %ins0, half %hi, i32 1
%canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %ins1)
@@ -2799,14 +2807,23 @@ define <4 x half> @v_test_canonicalize_reg_reg_undef_undef_v4f16(half %val0, hal
; CI-NEXT: v_cvt_f32_f16_e32 v1, v1
; CI-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: v_test_canonicalize_reg_reg_undef_undef_v4f16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
-; GFX11-NEXT: v_mov_b32_e32 v1, 0x7e007e00
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT: v_pk_max_f16 v0, v0, v0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: v_test_canonicalize_reg_reg_undef_undef_v4f16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0x7e007e00
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_pk_max_f16 v0, v0, v0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: v_test_canonicalize_reg_reg_undef_undef_v4f16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, 0x7e007e00
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_pk_max_f16 v0, v0, v0
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%vec0 = insertelement <4 x half> undef, half %val0, i32 0
%vec1 = insertelement <4 x half> %vec0, half %val1, i32 1
%canonicalized = call <4 x half> @llvm.canonicalize.v4f16(<4 x half> %vec1)
@@ -2850,7 +2867,7 @@ define <4 x half> @v_test_canonicalize_reg_undef_reg_reg_v4f16(half %val0, half
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l
-; GFX11-TRUE16-NEXT: v_perm_b32 v1, v2, v1, 0x5040100
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v2.l
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_pack_b32_f16 v0, v0.l, 0
; GFX11-TRUE16-NEXT: v_pk_max_f16 v1, v1, v1
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.ldexp.ll b/llvm/test/CodeGen/AMDGPU/llvm.ldexp.ll
index e5df5d3e77a239..9540aa322f6ef7 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.ldexp.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.ldexp.ll
@@ -480,11 +480,8 @@ define <2 x half> @test_ldexp_v2f16_v2i32(<2 x half> %a, <2 x i32> %b) {
; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v0.h, v3.l, v2.l
; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v0.l, v0.l, v1.l
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l
-; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT: v_pack_b32_f16 v0, v1, v0
+; GFX11-SDAG-TRUE16-NEXT: v_pack_b32_f16 v0, v0.l, v0.h
; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-SDAG-FAKE16-LABEL: test_ldexp_v2f16_v2i32:
@@ -604,12 +601,9 @@ define <2 x half> @test_ldexp_v2f16_v2i16(<2 x half> %a, <2 x i16> %b) {
; GFX11-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v1
; GFX11-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v0
; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v0.l, v0.l, v1.l
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v0.h, v3.l, v2.l
-; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l
; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
-; GFX11-SDAG-TRUE16-NEXT: v_pack_b32_f16 v0, v1, v0
+; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v0.h, v3.l, v2.l
+; GFX11-SDAG-TRUE16-NEXT: v_pack_b32_f16 v0, v0.l, v0.h
; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-SDAG-FAKE16-LABEL: test_ldexp_v2f16_v2i16:
@@ -731,13 +725,10 @@ define <3 x half> @test_ldexp_v3f16_v3i32(<3 x half> %a, <3 x i32> %b) {
; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v0.h, v5.l, v3.l
; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v0.l, v0.l, v2.l
-; GFX11-SDAG-TRUE16-NEXT: v_med3_i32 v3, v4, s0, 0x7fff
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l
-; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v1.l, v1.l, v3.l
-; GFX11-SDAG-TRUE16-NEXT: v_pack_b32_f16 v0, v2, v0
+; GFX11-SDAG-TRUE16-NEXT: v_med3_i32 v2, v4, s0, 0x7fff
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SDAG-TRUE16-NEXT: v_pack_b32_f16 v0, v0.l, v0.h
+; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v1.l, v1.l, v2.l
; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-SDAG-FAKE16-LABEL: test_ldexp_v3f16_v3i32:
@@ -804,13 +795,13 @@ define <3 x half> @test_ldexp_v3f16_v3i32(<3 x half> %a, <3 x i32> %b) {
; GFX11-GISEL-TRUE16-NEXT: v_mov_b32_e32 v5, 0x7fff
; GFX11-GISEL-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v0
; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-GISEL-TRUE16-NEXT: v_med3_i32 v4, 0xffff8000, v4, v5
; GFX11-GISEL-TRUE16-NEXT: v_med3_i32 v2, 0xffff8000, v2, v5
; GFX11-GISEL-TRUE16-NEXT: v_med3_i32 v3, 0xffff8000, v3, v5
-; GFX11-GISEL-TRUE16-NEXT: v_med3_i32 v4, 0xffff8000, v4, v5
-; GFX11-GISEL-TRUE16-NEXT: v_ldexp_f16_e32 v0.l, v0.l, v2.l
+; GFX11-GISEL-TRUE16-NEXT: v_ldexp_f16_e32 v1.l, v1.l, v4.l
; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-GISEL-TRUE16-NEXT: v_ldexp_f16_e32 v0.l, v0.l, v2.l
; GFX11-GISEL-TRUE16-NEXT: v_ldexp_f16_e32 v0.h, v6.l, v3.l
-; GFX11-GISEL-TRUE16-NEXT: v_ldexp_f16_e32 v1.l, v1.l, v4.l
; GFX11-GISEL-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-GISEL-FAKE16-LABEL: test_ldexp_v3f16_v3i32:
@@ -877,12 +868,9 @@ define <3 x half> @test_ldexp_v3f16_v3i16(<3 x half> %a, <3 x i16> %b) {
; GFX11-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 16, v0
; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v0.l, v0.l, v2.l
; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v1.l, v1.l, v3.l
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v0.h, v5.l, v4.l
-; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
-; GFX11-SDAG-TRUE16-NEXT: v_pack_b32_f16 v0, v2, v0
+; GFX11-SDAG-TRUE16-NEXT: v_pack_b32_f16 v0, v0.l, v0.h
; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-SDAG-FAKE16-LABEL: test_ldexp_v3f16_v3i16:
@@ -937,8 +925,8 @@ define <3 x half> @test_ldexp_v3f16_v3i16(<3 x half> %a, <3 x i16> %b) {
; GFX11-GISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-GISEL-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v0
; GFX11-GISEL-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 16, v2
-; GFX11-GISEL-TRUE16-NEXT: v_ldexp_f16_e32 v0.l, v0.l, v2.l
; GFX11-GISEL-TRUE16-NEXT: v_ldexp_f16_e32 v1.l, v1.l, v3.l
+; GFX11-GISEL-TRUE16-NEXT: v_ldexp_f16_e32 v0.l, v0.l, v2.l
; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-GISEL-TRUE16-NEXT: v_ldexp_f16_e32 v0.h, v4.l, v5.l
; GFX11-GISEL-TRUE16-NEXT: s_setpc_b64 s[30:31]
@@ -1016,27 +1004,21 @@ define <4 x half> @test_ldexp_v4f16_v4i32(<4 x half> %a, <4 x i32> %b) {
; GFX11-SDAG-TRUE16: ; %bb.0:
; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-SDAG-TRUE16-NEXT: s_movk_i32 s0, 0x8000
-; GFX11-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v0
+; GFX11-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v1
; GFX11-SDAG-TRUE16-NEXT: v_med3_i32 v5, v5, s0, 0x7fff
; GFX11-SDAG-TRUE16-NEXT: v_med3_i32 v3, v3, s0, 0x7fff
+; GFX11-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 16, v0
; GFX11-SDAG-TRUE16-NEXT: v_med3_i32 v2, v2, s0, 0x7fff
; GFX11-SDAG-TRUE16-NEXT: v_med3_i32 v4, v4, s0, 0x7fff
-; GFX11-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 16, v1
+; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v1.h, v6.l, v5.l
; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v0.h, v6.l, v3.l
+; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v0.h, v7.l, v3.l
; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v0.l, v0.l, v2.l
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v1.l, v1.l, v4.l
-; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v1.h, v7.l, v5.l
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l
-; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
-; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.l, v1.l
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h
-; GFX11-SDAG-TRUE16-NEXT: v_pack_b32_f16 v0, v2, v0
+; GFX11-SDAG-TRUE16-NEXT: v_pack_b32_f16 v0, v0.l, v0.h
; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-SDAG-TRUE16-NEXT: v_pack_b32_f16 v1, v3, v1
+; GFX11-SDAG-TRUE16-NEXT: v_pack_b32_f16 v1, v1.l, v1.h
; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-SDAG-FAKE16-LABEL: test_ldexp_v4f16_v4i32:
@@ -1209,20 +1191,14 @@ define <4 x half> @test_ldexp_v4f16_v4i16(<4 x half> %a, <4 x i16> %b) {
; GFX11-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 16, v2
; GFX11-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v0
; GFX11-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 16, v1
+; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v1.l, v1.l, v3.l
; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v0.l, v0.l, v2.l
-; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v0.h, v1.l, v3.l
; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v1.l, v6.l, v5.l
+; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v0.h, v6.l, v5.l
; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v1.h, v7.l, v4.l
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l
-; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.h
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.l
-; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h
; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-SDAG-TRUE16-NEXT: v_pack_b32_f16 v0, v2, v0
-; GFX11-SDAG-TRUE16-NEXT: v_pack_b32_f16 v1, v3, v1
+; GFX11-SDAG-TRUE16-NEXT: v_pack_b32_f16 v0, v0.l, v0.h
+; GFX11-SDAG-TRUE16-NEXT: v_pack_b32_f16 v1, v1.l, v1.h
; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-SDAG-FAKE16-LABEL: test_ldexp_v4f16_v4i16:
diff --git a/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll b/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll
index 01528cdf7c1254..53f1c476e49ee1 100644
--- a/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll
+++ b/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll
@@ -4210,18 +4210,45 @@ define amdgpu_kernel void @v_test_v2i16_x_add_undef_neg32(ptr addrspace(1) %out,
; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
; GFX10-NEXT: s_endpgm
;
-; GFX11-LABEL: v_test_v2i16_x_add_undef_neg32:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_pk_sub_u16 v1, v1, 32 op_sel:[0,1] op_sel_hi:[1,0]
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT: s_endpgm
+; GFX11-SDAG-LABEL: v_test_v2i16_x_add_undef_neg32:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX11-SDAG-NEXT: v_pk_sub_u16 v1, v1, 32 op_sel:[0,1] op_sel_hi:[1,0]
+; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-SDAG-NEXT: s_endpgm
+;
+; GFX11-GISEL-TRUE16-LABEL: v_test_v2i16_x_add_undef_neg32:
+; GFX11-GISEL-TRUE16: ; %bb.0:
+; GFX11-GISEL-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-GISEL-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-GISEL-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-TRUE16-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-GISEL-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s0, 0xffffffe0
+; GFX11-GISEL-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-GISEL-TRUE16-NEXT: v_pk_add_u16 v1, v1, s2
+; GFX11-GISEL-TRUE16-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-GISEL-TRUE16-NEXT: s_endpgm
+;
+; GFX11-GISEL-FAKE16-LABEL: v_test_v2i16_x_add_undef_neg32:
+; GFX11-GISEL-FAKE16: ; %bb.0:
+; GFX11-GISEL-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-GISEL-FAKE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-GISEL-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-FAKE16-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-GISEL-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-GISEL-FAKE16-NEXT: v_pk_sub_u16 v1, v1, 32 op_sel:[0,1] op_sel_hi:[1,0]
+; GFX11-GISEL-FAKE16-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-GISEL-FAKE16-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in, i64 %tid.ext
diff --git a/llvm/test/CodeGen/AMDGPU/strict_fptrunc.ll b/llvm/test/CodeGen/AMDGPU/strict_fptrunc.ll
index 52d882590cbced..3e889c0a0670ad 100644
--- a/llvm/test/CodeGen/AMDGPU/strict_fptrunc.ll
+++ b/llvm/test/CodeGen/AMDGPU/strict_fptrunc.ll
@@ -82,9 +82,9 @@ define <2 x half> @v_constrained_fptrunc_v2f32_to_v2f16_fpexcept_strict(<2 x flo
; GFX11-TRUE16-LABEL: v_constrained_fptrunc_v2f32_to_v2f16_fpexcept_strict:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v1.l, v1
-; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0
-; GFX11-TRUE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
+; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v1.h, v1
+; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v1.l, v0
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v1
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: v_constrained_fptrunc_v2f32_to_v2f16_fpexcept_strict:
@@ -144,11 +144,10 @@ define <3 x half> @v_constrained_fptrunc_v3f32_to_v3f16_fpexcept_strict(<3 x flo
; GFX11-TRUE16-LABEL: v_constrained_fptrunc_v3f32_to_v3f16_fpexcept_strict:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v1.l, v1
-; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v1.l
+; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v3.h, v1
+; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v3.l, v0
; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v1.l, v2
-; GFX11-TRUE16-NEXT: v_perm_b32 v0, v3, v0, 0x5040100
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v3
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: v_constrained_fptrunc_v3f32_to_v3f16_fpexcept_strict:
@@ -405,10 +404,9 @@ define void @v_constrained_fptrunc_v2f32_to_v2f16_fpexcept_strict_noabi(<2 x flo
; GFX11-TRUE16-LABEL: v_constrained_fptrunc_v2f32_to_v2f16_fpexcept_strict_noabi:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v1.l, v1
-; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0
-; GFX11-TRUE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
-; GFX11-TRUE16-NEXT: global_store_b32 v[2:3], v0, off
+; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v1.h, v1
+; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v1.l, v0
+; GFX11-TRUE16-NEXT: global_store_b32 v[2:3], v1, off
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: v_constrained_fptrunc_v2f32_to_v2f16_fpexcept_strict_noabi:
More information about the llvm-commits
mailing list