[flang] [libc] [libcxx] [lldb] [llvm] [clang] [compiler-rt] [clang-tools-extra] AMDGPU: Make v4bf16 a legal type (PR #76217)

Matt Arsenault via cfe-commits cfe-commits at lists.llvm.org
Thu Jan 4 08:25:38 PST 2024


https://github.com/arsenm updated https://github.com/llvm/llvm-project/pull/76217

>From d9413af34d48f2e4db10d895cd90e2adcc421460 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault at amd.com>
Date: Tue, 19 Dec 2023 14:01:28 +0700
Subject: [PATCH 1/4] AMDGPU: Add bf16 vectors to register class definitions

Assorted intrinsics are currently using i16 in place of a proper
bfloat type, but they should really switch to bfloat.

Note this only changes the type lists in tablegen, these are still
not registered to be truly treated as a legal type yet.
---
 llvm/lib/Target/AMDGPU/SIInstructions.td    | 10 ++++
 llvm/lib/Target/AMDGPU/SIRegisterInfo.td    | 62 ++++++++++-----------
 llvm/lib/Target/AMDGPU/VOP3PInstructions.td | 22 ++++----
 3 files changed, 52 insertions(+), 42 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index f9bc623abcd04b..8310c6b57dad58 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -1487,8 +1487,18 @@ foreach Index = 0-31 in {
 // 16-bit bitcast
 def : BitConvert <i16, f16, VGPR_32>;
 def : BitConvert <f16, i16, VGPR_32>;
+def : BitConvert <f16, bf16, VGPR_32>;
+def : BitConvert <bf16, f16, VGPR_32>;
+
 def : BitConvert <i16, f16, SReg_32>;
 def : BitConvert <f16, i16, SReg_32>;
+def : BitConvert <f16, bf16, SReg_32>;
+def : BitConvert <bf16, f16, SReg_32>;
+
+def : BitConvert <i16, bf16, VGPR_32>;
+def : BitConvert <bf16, i16, VGPR_32>;
+def : BitConvert <i16, bf16, SReg_32>;
+def : BitConvert <bf16, i16, SReg_32>;
 
 // 32-bit bitcast
 def : BitConvert <i32, f32, VGPR_32>;
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
index 36480057b81949..c94b894c5841f8 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
@@ -376,7 +376,7 @@ def M0_CLASS : SIRegisterClass<"AMDGPU", [i32], 32, (add M0)> {
   let HasSGPR = 1;
 }
 
-def M0_CLASS_LO16 : SIRegisterClass<"AMDGPU", [i16, f16], 16, (add M0_LO16)> {
+def M0_CLASS_LO16 : SIRegisterClass<"AMDGPU", [i16, f16, bf16], 16, (add M0_LO16)> {
   let CopyCost = 1;
   let Size = 16;
   let isAllocatable = 0;
@@ -385,7 +385,7 @@ def M0_CLASS_LO16 : SIRegisterClass<"AMDGPU", [i16, f16], 16, (add M0_LO16)> {
 
 // TODO: Do we need to set DwarfRegAlias on register tuples?
 
-def SGPR_LO16 : SIRegisterClass<"AMDGPU", [i16, f16], 16,
+def SGPR_LO16 : SIRegisterClass<"AMDGPU", [i16, f16, bf16], 16,
                               (add (sequence "SGPR%u_LO16", 0, 105))> {
   let AllocationPriority = 0;
   let Size = 16;
@@ -393,7 +393,7 @@ def SGPR_LO16 : SIRegisterClass<"AMDGPU", [i16, f16], 16,
   let HasSGPR = 1;
 }
 
-def SGPR_HI16 : SIRegisterClass<"AMDGPU", [i16, f16], 16,
+def SGPR_HI16 : SIRegisterClass<"AMDGPU", [i16, f16, bf16], 16,
                               (add (sequence "SGPR%u_HI16", 0, 105))> {
   let isAllocatable = 0;
   let Size = 16;
@@ -402,7 +402,7 @@ def SGPR_HI16 : SIRegisterClass<"AMDGPU", [i16, f16], 16,
 }
 
 // SGPR 32-bit registers
-def SGPR_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32,
+def SGPR_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, bf16, v2i16, v2f16, v2bf16], 32,
                             (add (sequence "SGPR%u", 0, 105))> {
   // Give all SGPR classes higher priority than VGPR classes, because
   // we want to spill SGPRs to VGPRs.
@@ -451,14 +451,14 @@ def SGPR_512Regs : SIRegisterTuples<getSubRegs<16>.ret, SGPR_32, 105, 4, 16, "s"
 def SGPR_1024Regs : SIRegisterTuples<getSubRegs<32>.ret, SGPR_32, 105, 4, 32, "s">;
 
 // Trap handler TMP 32-bit registers
-def TTMP_32 : SIRegisterClass<"AMDGPU", [i32, f32, v2i16, v2f16], 32,
+def TTMP_32 : SIRegisterClass<"AMDGPU", [i32, f32, v2i16, v2f16, v2bf16], 32,
                             (add (sequence "TTMP%u", 0, 15))> {
   let isAllocatable = 0;
   let HasSGPR = 1;
 }
 
 // Trap handler TMP 16-bit registers
-def TTMP_LO16 : SIRegisterClass<"AMDGPU", [i16, f16], 16,
+def TTMP_LO16 : SIRegisterClass<"AMDGPU", [i16, f16, bf16], 16,
                               (add (sequence "TTMP%u_LO16", 0, 15))> {
   let Size = 16;
   let isAllocatable = 0;
@@ -584,8 +584,8 @@ class RegisterTypes<list<ValueType> reg_types> {
   list<ValueType> types = reg_types;
 }
 
-def Reg16Types : RegisterTypes<[i16, f16]>;
-def Reg32Types : RegisterTypes<[i32, f32, v2i16, v2f16, p2, p3, p5, p6]>;
+def Reg16Types : RegisterTypes<[i16, f16, bf16]>;
+def Reg32Types : RegisterTypes<[i32, f32, v2i16, v2f16, v2bf16, p2, p3, p5, p6]>;
 
 let HasVGPR = 1 in {
 // VOP3 and VINTERP can access 256 lo and 256 hi registers.
@@ -683,7 +683,7 @@ def AGPR_LO16 : SIRegisterClass<"AMDGPU", Reg16Types.types, 16,
 }
 
 // AccVGPR 32-bit registers
-def AGPR_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32,
+def AGPR_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, bf16, v2i16, v2f16, v2bf16], 32,
                             (add (sequence "AGPR%u", 0, 255))> {
   let AllocationPriority = 0;
   let Size = 32;
@@ -735,7 +735,7 @@ def AGPR_1024 : SIRegisterTuples<getSubRegs<32>.ret, AGPR_32, 255, 1, 32, "a">;
 //  Register classes used as source and destination
 //===----------------------------------------------------------------------===//
 
-def Pseudo_SReg_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32,
+def Pseudo_SReg_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, bf16, v2i16, v2f16, v2bf16], 32,
   (add FP_REG, SP_REG)> {
   let isAllocatable = 0;
   let CopyCost = -1;
@@ -743,7 +743,7 @@ def Pseudo_SReg_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16
   let BaseClassOrder = 10000;
 }
 
-def Pseudo_SReg_128 : SIRegisterClass<"AMDGPU", [v4i32, v2i64, v2f64, v8i16, v8f16], 32,
+def Pseudo_SReg_128 : SIRegisterClass<"AMDGPU", [v4i32, v2i64, v2f64, v8i16, v8f16, v8bf16], 32,
   (add PRIVATE_RSRC_REG)> {
   let isAllocatable = 0;
   let CopyCost = -1;
@@ -760,7 +760,7 @@ def LDS_DIRECT_CLASS : RegisterClass<"AMDGPU", [i32], 32,
 let GeneratePressureSet = 0, HasSGPR = 1 in {
 // Subset of SReg_32 without M0 for SMRD instructions and alike.
 // See comments in SIInstructions.td for more info.
-def SReg_32_XM0_XEXEC : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16, i1], 32,
+def SReg_32_XM0_XEXEC : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, bf16, v2i16, v2f16, v2bf16, i1], 32,
   (add SGPR_32, VCC_LO, VCC_HI, FLAT_SCR_LO, FLAT_SCR_HI, XNACK_MASK_LO, XNACK_MASK_HI,
    SGPR_NULL, SGPR_NULL_HI, TTMP_32, TMA_LO, TMA_HI, TBA_LO, TBA_HI, SRC_SHARED_BASE_LO,
    SRC_SHARED_LIMIT_LO, SRC_PRIVATE_BASE_LO, SRC_PRIVATE_LIMIT_LO, SRC_SHARED_BASE_HI,
@@ -769,7 +769,7 @@ def SReg_32_XM0_XEXEC : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2
   let AllocationPriority = 0;
 }
 
-def SReg_LO16 : SIRegisterClass<"AMDGPU", [i16, f16], 16,
+def SReg_LO16 : SIRegisterClass<"AMDGPU", [i16, f16, bf16], 16,
   (add SGPR_LO16, VCC_LO_LO16, VCC_HI_LO16, FLAT_SCR_LO_LO16, FLAT_SCR_HI_LO16,
    XNACK_MASK_LO_LO16, XNACK_MASK_HI_LO16, SGPR_NULL_LO16, SGPR_NULL_HI_LO16, TTMP_LO16,
    TMA_LO_LO16, TMA_HI_LO16, TBA_LO_LO16, TBA_HI_LO16, SRC_SHARED_BASE_LO_LO16,
@@ -782,17 +782,17 @@ def SReg_LO16 : SIRegisterClass<"AMDGPU", [i16, f16], 16,
   let BaseClassOrder = 16;
 }
 
-def SReg_32_XEXEC : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16, i1], 32,
+def SReg_32_XEXEC : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, bf16, v2i16, v2f16, v2bf16, i1], 32,
   (add SReg_32_XM0_XEXEC, M0_CLASS)> {
   let AllocationPriority = 0;
 }
 
-def SReg_32_XEXEC_HI : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16, i1], 32,
+def SReg_32_XEXEC_HI : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, bf16, v2i16, v2f16, v2bf16, i1], 32,
   (add SReg_32_XEXEC, EXEC_LO)> {
   let AllocationPriority = 0;
 }
 
-def SReg_32_XM0 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16, i1], 32,
+def SReg_32_XM0 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, bf16, v2i16, v2f16, v2bf16, i1], 32,
   (add SReg_32_XM0_XEXEC, EXEC_LO, EXEC_HI)> {
   let AllocationPriority = 0;
 }
@@ -800,7 +800,7 @@ def SReg_32_XM0 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16, i
 } // End GeneratePressureSet = 0
 
 // Register class for all scalar registers (SGPRs + Special Registers)
-def SReg_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16, i1], 32,
+def SReg_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, bf16, v2i16, v2f16, v2bf16, i1], 32,
   (add SReg_32_XM0, M0_CLASS)> {
   let AllocationPriority = 0;
   let HasSGPR = 1;
@@ -808,13 +808,13 @@ def SReg_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16, i1],
 }
 
 let GeneratePressureSet = 0 in {
-def SRegOrLds_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32,
+def SRegOrLds_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, bf16, v2i16, v2f16, v2bf16], 32,
   (add SReg_32, LDS_DIRECT_CLASS)> {
   let isAllocatable = 0;
   let HasSGPR = 1;
 }
 
-def SGPR_64 : SIRegisterClass<"AMDGPU", [v2i32, i64, v2f32, f64, v4i16, v4f16], 32,
+def SGPR_64 : SIRegisterClass<"AMDGPU", [v2i32, i64, v2f32, f64, v4i16, v4f16, v4bf16], 32,
                             (add SGPR_64Regs)> {
   let CopyCost = 1;
   let AllocationPriority = 1;
@@ -836,13 +836,13 @@ def Gfx_CCR_SGPR_64 : SIRegisterClass<"AMDGPU", SGPR_64.RegTypes, 32,
   let HasSGPR = 1;
 }
 
-def TTMP_64 : SIRegisterClass<"AMDGPU", [v2i32, i64, f64, v4i16, v4f16], 32,
+def TTMP_64 : SIRegisterClass<"AMDGPU", [v2i32, i64, f64, v4i16, v4f16, v4bf16], 32,
                             (add TTMP_64Regs)> {
   let isAllocatable = 0;
   let HasSGPR = 1;
 }
 
-def SReg_64_XEXEC : SIRegisterClass<"AMDGPU", [v2i32, i64, v2f32, f64, i1, v4i16, v4f16], 32,
+def SReg_64_XEXEC : SIRegisterClass<"AMDGPU", [v2i32, i64, v2f32, f64, i1, v4i16, v4f16, v4bf16], 32,
   (add SGPR_64, VCC, FLAT_SCR, XNACK_MASK, SGPR_NULL64, SRC_SHARED_BASE,
        SRC_SHARED_LIMIT, SRC_PRIVATE_BASE, SRC_PRIVATE_LIMIT, TTMP_64, TBA, TMA)> {
   let CopyCost = 1;
@@ -850,7 +850,7 @@ def SReg_64_XEXEC : SIRegisterClass<"AMDGPU", [v2i32, i64, v2f32, f64, i1, v4i16
   let HasSGPR = 1;
 }
 
-def SReg_64 : SIRegisterClass<"AMDGPU", [v2i32, i64, v2f32, f64, i1, v4i16, v4f16], 32,
+def SReg_64 : SIRegisterClass<"AMDGPU", [v2i32, i64, v2f32, f64, i1, v4i16, v4f16, v4bf16], 32,
   (add SReg_64_XEXEC, EXEC)> {
   let CopyCost = 1;
   let AllocationPriority = 1;
@@ -905,11 +905,11 @@ multiclass SRegClass<int numRegs,
 }
 
 defm "" : SRegClass<3, [v3i32, v3f32], SGPR_96Regs, TTMP_96Regs>;
-defm "" : SRegClass<4, [v4i32, v4f32, v2i64, v2f64, v8i16, v8f16], SGPR_128Regs, TTMP_128Regs>;
+defm "" : SRegClass<4, [v4i32, v4f32, v2i64, v2f64, v8i16, v8f16, v8bf16], SGPR_128Regs, TTMP_128Regs>;
 defm "" : SRegClass<5, [v5i32, v5f32], SGPR_160Regs, TTMP_160Regs>;
 defm "" : SRegClass<6, [v6i32, v6f32, v3i64, v3f64], SGPR_192Regs, TTMP_192Regs>;
 defm "" : SRegClass<7, [v7i32, v7f32], SGPR_224Regs, TTMP_224Regs>;
-defm "" : SRegClass<8, [v8i32, v8f32, v4i64, v4f64, v16i16, v16f16], SGPR_256Regs, TTMP_256Regs>;
+defm "" : SRegClass<8, [v8i32, v8f32, v4i64, v4f64, v16i16, v16f16, v16bf16], SGPR_256Regs, TTMP_256Regs>;
 defm "" : SRegClass<9, [v9i32, v9f32], SGPR_288Regs, TTMP_288Regs>;
 defm "" : SRegClass<10, [v10i32, v10f32], SGPR_320Regs, TTMP_320Regs>;
 defm "" : SRegClass<11, [v11i32, v11f32], SGPR_352Regs, TTMP_352Regs>;
@@ -920,7 +920,7 @@ defm "" : SRegClass<16, [v16i32, v16f32, v8i64, v8f64, v32i16, v32f16], SGPR_512
 defm "" : SRegClass<32, [v32i32, v32f32, v16i64, v16f64], SGPR_1024Regs>;
 }
 
-def VRegOrLds_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32,
+def VRegOrLds_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, bf16, v2i16, v2f16, v2bf16], 32,
                                  (add VGPR_32, LDS_DIRECT_CLASS)> {
   let isAllocatable = 0;
   let HasVGPR = 1;
@@ -955,15 +955,15 @@ multiclass VRegClass<int numRegs, list<ValueType> regTypes, dag regList> {
   }
 }
 
-defm VReg_64 : VRegClass<2, [i64, f64, v2i32, v2f32, v4f16, v4i16, p0, p1, p4],
+defm VReg_64 : VRegClass<2, [i64, f64, v2i32, v2f32, v4f16, v4bf16, v4i16, p0, p1, p4],
                                 (add VGPR_64)>;
 defm VReg_96 : VRegClass<3, [v3i32, v3f32], (add VGPR_96)>;
-defm VReg_128 : VRegClass<4, [v4i32, v4f32, v2i64, v2f64, v8i16, v8f16], (add VGPR_128)>;
+defm VReg_128 : VRegClass<4, [v4i32, v4f32, v2i64, v2f64, v8i16, v8f16, v8bf16], (add VGPR_128)>;
 defm VReg_160 : VRegClass<5, [v5i32, v5f32], (add VGPR_160)>;
 
 defm VReg_192 : VRegClass<6, [v6i32, v6f32, v3i64, v3f64], (add VGPR_192)>;
 defm VReg_224 : VRegClass<7, [v7i32, v7f32], (add VGPR_224)>;
-defm VReg_256 : VRegClass<8, [v8i32, v8f32, v4i64, v4f64, v16i16, v16f16], (add VGPR_256)>;
+defm VReg_256 : VRegClass<8, [v8i32, v8f32, v4i64, v4f64, v16i16, v16f16, v16bf16], (add VGPR_256)>;
 defm VReg_288 : VRegClass<9, [v9i32, v9f32], (add VGPR_288)>;
 defm VReg_320 : VRegClass<10, [v10i32, v10f32], (add VGPR_320)>;
 defm VReg_352 : VRegClass<11, [v11i32, v11f32], (add VGPR_352)>;
@@ -993,7 +993,7 @@ multiclass ARegClass<int numRegs, list<ValueType> regTypes, dag regList> {
 defm AReg_64 : ARegClass<2, [i64, f64, v2i32, v2f32, v4f16, v4i16],
                         (add AGPR_64)>;
 defm AReg_96 : ARegClass<3, [v3i32, v3f32], (add AGPR_96)>;
-defm AReg_128 : ARegClass<4, [v4i32, v4f32, v2i64, v2f64, v8i16, v8f16], (add AGPR_128)>;
+defm AReg_128 : ARegClass<4, [v4i32, v4f32, v2i64, v2f64, v8i16, v8f16, v8bf16], (add AGPR_128)>;
 defm AReg_160 : ARegClass<5, [v5i32, v5f32], (add AGPR_160)>;
 defm AReg_192 : ARegClass<6, [v6i32, v6f32, v3i64, v3f64], (add AGPR_192)>;
 defm AReg_224 : ARegClass<7, [v7i32, v7f32], (add AGPR_224)>;
@@ -1032,14 +1032,14 @@ def VS_16_Lo128 : SIRegisterClass<"AMDGPU", Reg16Types.types, 16,
   let HasVGPR = 1;
 }
 
-def VS_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32,
+def VS_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, bf16, v2i16, v2f16, v2bf16], 32,
                           (add VGPR_32, SReg_32, LDS_DIRECT_CLASS)> {
   let isAllocatable = 0;
   let HasVGPR = 1;
   let HasSGPR = 1;
 }
 
-def VS_32_Lo128 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32,
+def VS_32_Lo128 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, bf16, v2i16, v2f16, v2bf16], 32,
                           (add VGPR_32_Lo128, SReg_32, LDS_DIRECT_CLASS)> {
   let isAllocatable = 0;
   let HasVGPR = 1;
diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
index d3cefb339d9e70..7f52501b5d9031 100644
--- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
@@ -190,9 +190,9 @@ multiclass MadFmaMixPats<SDPatternOperator fma_like,
   // because dealing with the write to high half of the register is
   // difficult.
   def : GCNPat <
-    (build_vector f16:$elt0, (fpround (fma_like (f32 (VOP3PMadMixMods f16:$src0, i32:$src0_modifiers)),
-                                                (f32 (VOP3PMadMixMods f16:$src1, i32:$src1_modifiers)),
-                                                (f32 (VOP3PMadMixMods f16:$src2, i32:$src2_modifiers))))),
+    (build_vector f16:$elt0, (f16 (fpround (fma_like (f32 (VOP3PMadMixMods f16:$src0, i32:$src0_modifiers)),
+                                                     (f32 (VOP3PMadMixMods f16:$src1, i32:$src1_modifiers)),
+                                                     (f32 (VOP3PMadMixMods f16:$src2, i32:$src2_modifiers)))))),
     (v2f16 (mixhi_inst $src0_modifiers, $src0,
                        $src1_modifiers, $src1,
                        $src2_modifiers, $src2,
@@ -203,9 +203,9 @@ multiclass MadFmaMixPats<SDPatternOperator fma_like,
   def : GCNPat <
     (build_vector
       f16:$elt0,
-      (AMDGPUclamp (fpround (fma_like (f32 (VOP3PMadMixMods f16:$src0, i32:$src0_modifiers)),
+      (AMDGPUclamp (f16 (fpround (fma_like (f32 (VOP3PMadMixMods f16:$src0, i32:$src0_modifiers)),
                                       (f32 (VOP3PMadMixMods f16:$src1, i32:$src1_modifiers)),
-                                      (f32 (VOP3PMadMixMods f16:$src2, i32:$src2_modifiers)))))),
+                                      (f32 (VOP3PMadMixMods f16:$src2, i32:$src2_modifiers))))))),
     (v2f16 (mixhi_inst $src0_modifiers, $src0,
                        $src1_modifiers, $src1,
                        $src2_modifiers, $src2,
@@ -215,12 +215,12 @@ multiclass MadFmaMixPats<SDPatternOperator fma_like,
 
   def : GCNPat <
     (AMDGPUclamp (build_vector
-      (fpround (fma_like (f32 (VOP3PMadMixMods f16:$lo_src0, i32:$lo_src0_modifiers)),
+      (f16 (fpround (fma_like (f32 (VOP3PMadMixMods f16:$lo_src0, i32:$lo_src0_modifiers)),
                          (f32 (VOP3PMadMixMods f16:$lo_src1, i32:$lo_src1_modifiers)),
-                         (f32 (VOP3PMadMixMods f16:$lo_src2, i32:$lo_src2_modifiers)))),
-      (fpround (fma_like (f32 (VOP3PMadMixMods f16:$hi_src0, i32:$hi_src0_modifiers)),
+                         (f32 (VOP3PMadMixMods f16:$lo_src2, i32:$lo_src2_modifiers))))),
+      (f16 (fpround (fma_like (f32 (VOP3PMadMixMods f16:$hi_src0, i32:$hi_src0_modifiers)),
                          (f32 (VOP3PMadMixMods f16:$hi_src1, i32:$hi_src1_modifiers)),
-                         (f32 (VOP3PMadMixMods f16:$hi_src2, i32:$hi_src2_modifiers)))))),
+                         (f32 (VOP3PMadMixMods f16:$hi_src2, i32:$hi_src2_modifiers))))))),
     (v2f16 (mixhi_inst $hi_src0_modifiers, $hi_src0,
                        $hi_src1_modifiers, $hi_src1,
                        $hi_src2_modifiers, $hi_src2,
@@ -243,8 +243,8 @@ multiclass MadFmaMixPats<SDPatternOperator fma_like,
   >;
 
   def : GCNPat <
-    (build_vector f16:$elt0, (fpround (fmul (f32 (VOP3PMadMixMods f32:$src0, i32:$src0_modifiers)),
-                                            (f32 (VOP3PMadMixMods f32:$src1, i32:$src1_modifiers))))),
+    (build_vector f16:$elt0, (f16 (fpround (fmul (f32 (VOP3PMadMixMods f32:$src0, i32:$src0_modifiers)),
+                                            (f32 (VOP3PMadMixMods f32:$src1, i32:$src1_modifiers)))))),
     (v2f16 (mixhi_inst $src0_modifiers, $src0,
                        $src1_modifiers, $src1,
                        (i32 0), (i32 0),

>From da874220064ba6ba8fd02a50aaccd67ca726b23e Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault at amd.com>
Date: Tue, 19 Dec 2023 17:20:01 +0700
Subject: [PATCH 2/4] AMDGPU: Make bf16/v2bf16 legal types

There are some intrinsics are using i16 vectors in place of bfloat vectors.
Move towards making bf16 vectors legal so these can migrate. Leave the
larger vectors for a later change.
---
 llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp |    26 +-
 llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td   |    26 +-
 llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp |     1 +
 llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp |    37 +-
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp     |    98 +-
 llvm/lib/Target/AMDGPU/SIISelLowering.h       |     1 +
 llvm/lib/Target/AMDGPU/SIInstructions.td      |    42 +-
 llvm/test/CodeGen/AMDGPU/bf16.ll              | 17608 +++++++---------
 llvm/test/CodeGen/AMDGPU/fcopysign.f32.ll     |    12 +-
 .../test/CodeGen/AMDGPU/fmed3-cast-combine.ll |    16 +-
 .../CodeGen/AMDGPU/fneg-modifier-casting.ll   |    10 +-
 .../CodeGen/AMDGPU/function-args-inreg.ll     |     4 +-
 llvm/test/CodeGen/AMDGPU/function-args.ll     |   167 +-
 llvm/test/CodeGen/AMDGPU/function-returns.ll  |    65 +-
 .../AMDGPU/gfx-callable-argument-types.ll     |   128 +-
 .../isel-amdgpu-cs-chain-preserve-cc.ll       |    16 +-
 llvm/test/CodeGen/AMDGPU/llvm.exp.ll          |    25 +-
 llvm/test/CodeGen/AMDGPU/llvm.exp10.ll        |    25 +-
 llvm/test/CodeGen/AMDGPU/llvm.exp2.ll         |    58 +-
 .../CodeGen/AMDGPU/llvm.is.fpclass.bf16.ll    |   402 +-
 llvm/test/CodeGen/AMDGPU/llvm.log.ll          |    11 +-
 llvm/test/CodeGen/AMDGPU/llvm.log10.ll        |    11 +-
 llvm/test/CodeGen/AMDGPU/llvm.log2.ll         |    62 +-
 llvm/test/CodeGen/AMDGPU/local-atomics-fp.ll  |     4 +-
 llvm/test/CodeGen/AMDGPU/select-undef.ll      |     3 +-
 .../CodeGen/AMDGPU/vector_shuffle.packed.ll   |   850 +-
 26 files changed, 8957 insertions(+), 10751 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
index 4e317062cec497..296ed3a3c3dc11 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@@ -3199,7 +3199,16 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) {
       return true;
     }
     break;
-  case ISD::FP_ROUND:
+  case ISD::FP_ROUND: {
+    EVT VT = Node->getValueType(0);
+    if (VT.getScalarType() == MVT::bf16) {
+      Results.push_back(
+          DAG.getNode(ISD::FP_TO_BF16, SDLoc(Node), VT, Node->getOperand(0)));
+      break;
+    }
+
+    LLVM_FALLTHROUGH;
+  }
   case ISD::BITCAST:
     if ((Tmp1 = EmitStackConvert(Node->getOperand(0), Node->getValueType(0),
                                  Node->getValueType(0), dl)))
@@ -3226,12 +3235,19 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) {
       return true;
     }
     break;
-  case ISD::FP_EXTEND:
-    if ((Tmp1 = EmitStackConvert(Node->getOperand(0),
-                                 Node->getOperand(0).getValueType(),
-                                 Node->getValueType(0), dl)))
+  case ISD::FP_EXTEND: {
+    SDValue Op = Node->getOperand(0);
+    EVT SrcVT = Op.getValueType();
+    EVT DstVT = Node->getValueType(0);
+    if (SrcVT.getScalarType() == MVT::bf16) {
+      Results.push_back(DAG.getNode(ISD::BF16_TO_FP, SDLoc(Node), DstVT, Op));
+      break;
+    }
+
+    if ((Tmp1 = EmitStackConvert(Op, SrcVT, DstVT, dl)))
       Results.push_back(Tmp1);
     break;
+  }
   case ISD::BF16_TO_FP: {
     // Always expand bf16 to f32 casts, they lower to ext + shift.
     //
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td b/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td
index 9036b26a6f6bcb..c5207228dc913f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td
@@ -22,28 +22,28 @@ def CC_SI_Gfx : CallingConv<[
   // 32 is reserved for the stack pointer
   // 33 is reserved for the frame pointer
   // 34 is reserved for the base pointer
-  CCIfInReg<CCIfType<[f32, i32, f16, i16, v2i16, v2f16] , CCAssignToReg<[
+  CCIfInReg<CCIfType<[f32, i32, f16, i16, v2i16, v2f16, bf16, v2bf16] , CCAssignToReg<[
     SGPR4, SGPR5, SGPR6, SGPR7,
     SGPR8, SGPR9, SGPR10, SGPR11, SGPR12, SGPR13, SGPR14, SGPR15,
     SGPR16, SGPR17, SGPR18, SGPR19, SGPR20, SGPR21, SGPR22, SGPR23,
     SGPR24, SGPR25, SGPR26, SGPR27, SGPR28, SGPR29
   ]>>>,
 
-  CCIfNotInReg<CCIfType<[f32, i32, f16, i16, v2i16, v2f16] , CCAssignToReg<[
+  CCIfNotInReg<CCIfType<[f32, i32, f16, i16, v2i16, v2f16, bf16, v2bf16] , CCAssignToReg<[
     VGPR0, VGPR1, VGPR2, VGPR3, VGPR4, VGPR5, VGPR6, VGPR7,
     VGPR8, VGPR9, VGPR10, VGPR11, VGPR12, VGPR13, VGPR14, VGPR15,
     VGPR16, VGPR17, VGPR18, VGPR19, VGPR20, VGPR21, VGPR22, VGPR23,
     VGPR24, VGPR25, VGPR26, VGPR27, VGPR28, VGPR29, VGPR30, VGPR31
   ]>>>,
 
-  CCIfType<[i32, f32, v2i16, v2f16, i16, f16, i1], CCAssignToStack<4, 4>>
+  CCIfType<[i32, f32, v2i16, v2f16, i16, f16, i1, bf16, v2bf16], CCAssignToStack<4, 4>>
 ]>;
 
 def RetCC_SI_Gfx : CallingConv<[
   CCIfType<[i1], CCPromoteToType<i32>>,
   CCIfType<[i1, i16], CCIfExtend<CCPromoteToType<i32>>>,
 
-  CCIfNotInReg<CCIfType<[f32, i32, f16, i16, v2i16, v2f16] , CCAssignToReg<[
+  CCIfNotInReg<CCIfType<[f32, i32, f16, i16, v2i16, v2f16, bf16, v2bf16] , CCAssignToReg<[
     VGPR0, VGPR1, VGPR2, VGPR3, VGPR4, VGPR5, VGPR6, VGPR7,
     VGPR8, VGPR9, VGPR10, VGPR11, VGPR12, VGPR13, VGPR14, VGPR15,
     VGPR16, VGPR17, VGPR18, VGPR19, VGPR20, VGPR21, VGPR22, VGPR23,
@@ -66,7 +66,7 @@ def RetCC_SI_Gfx : CallingConv<[
 
 def CC_SI_SHADER : CallingConv<[
 
-  CCIfInReg<CCIfType<[f32, i32, f16, i16, v2i16, v2f16] , CCAssignToReg<[
+  CCIfInReg<CCIfType<[f32, i32, f16, i16, v2i16, v2f16, bf16, v2bf16] , CCAssignToReg<[
     SGPR0, SGPR1, SGPR2, SGPR3, SGPR4, SGPR5, SGPR6, SGPR7,
     SGPR8, SGPR9, SGPR10, SGPR11, SGPR12, SGPR13, SGPR14, SGPR15,
     SGPR16, SGPR17, SGPR18, SGPR19, SGPR20, SGPR21, SGPR22, SGPR23,
@@ -76,7 +76,7 @@ def CC_SI_SHADER : CallingConv<[
   ]>>>,
 
   // 32*4 + 4 is the minimum for a fetch shader consumer with 32 inputs.
-  CCIfNotInReg<CCIfType<[f32, i32, f16, i16, v2i16, v2f16] , CCAssignToReg<[
+  CCIfNotInReg<CCIfType<[f32, i32, f16, i16, v2i16, v2f16, bf16, v2bf16] , CCAssignToReg<[
     VGPR0, VGPR1, VGPR2, VGPR3, VGPR4, VGPR5, VGPR6, VGPR7,
     VGPR8, VGPR9, VGPR10, VGPR11, VGPR12, VGPR13, VGPR14, VGPR15,
     VGPR16, VGPR17, VGPR18, VGPR19, VGPR20, VGPR21, VGPR22, VGPR23,
@@ -109,7 +109,7 @@ def RetCC_SI_Shader : CallingConv<[
   ]>>,
 
   // 32*4 + 4 is the minimum for a fetch shader with 32 outputs.
-  CCIfType<[f32, f16, v2f16] , CCAssignToReg<[
+  CCIfType<[f32, f16, v2f16, bf16, v2bf16] , CCAssignToReg<[
     VGPR0, VGPR1, VGPR2, VGPR3, VGPR4, VGPR5, VGPR6, VGPR7,
     VGPR8, VGPR9, VGPR10, VGPR11, VGPR12, VGPR13, VGPR14, VGPR15,
     VGPR16, VGPR17, VGPR18, VGPR19, VGPR20, VGPR21, VGPR22, VGPR23,
@@ -188,23 +188,23 @@ def CC_AMDGPU_Func : CallingConv<[
   CCIfType<[i1], CCPromoteToType<i32>>,
   CCIfType<[i8, i16], CCIfExtend<CCPromoteToType<i32>>>,
 
-  CCIfInReg<CCIfType<[f32, i32, f16, i16, v2i16, v2f16] , CCAssignToReg<
+  CCIfInReg<CCIfType<[f32, i32, f16, i16, v2i16, v2f16, bf16, v2bf16] , CCAssignToReg<
     !foreach(i, !range(0, 30), !cast<Register>("SGPR"#i))  // SGPR0-29
   >>>,
 
-  CCIfType<[i32, f32, i16, f16, v2i16, v2f16, i1], CCAssignToReg<[
+  CCIfType<[i32, f32, i16, f16, v2i16, v2f16, i1, bf16, v2bf16], CCAssignToReg<[
     VGPR0, VGPR1, VGPR2, VGPR3, VGPR4, VGPR5, VGPR6, VGPR7,
     VGPR8, VGPR9, VGPR10, VGPR11, VGPR12, VGPR13, VGPR14, VGPR15,
     VGPR16, VGPR17, VGPR18, VGPR19, VGPR20, VGPR21, VGPR22, VGPR23,
     VGPR24, VGPR25, VGPR26, VGPR27, VGPR28, VGPR29, VGPR30, VGPR31]>>,
-  CCIfType<[i32, f32, v2i16, v2f16, i16, f16, i1], CCAssignToStack<4, 4>>
+  CCIfType<[i32, f32, v2i16, v2f16, i16, f16, i1, bf16, v2bf16], CCAssignToStack<4, 4>>
 ]>;
 
 // Calling convention for leaf functions
 def RetCC_AMDGPU_Func : CallingConv<[
   CCIfType<[i1], CCPromoteToType<i32>>,
   CCIfType<[i1, i16], CCIfExtend<CCPromoteToType<i32>>>,
-  CCIfType<[i32, f32, i16, f16, v2i16, v2f16], CCAssignToReg<[
+  CCIfType<[i32, f32, i16, f16, v2i16, v2f16, bf16, v2bf16], CCAssignToReg<[
     VGPR0, VGPR1, VGPR2, VGPR3, VGPR4, VGPR5, VGPR6, VGPR7,
     VGPR8, VGPR9, VGPR10, VGPR11, VGPR12, VGPR13, VGPR14, VGPR15,
     VGPR16, VGPR17, VGPR18, VGPR19, VGPR20, VGPR21, VGPR22, VGPR23,
@@ -223,11 +223,11 @@ def CC_AMDGPU : CallingConv<[
 ]>;
 
 def CC_AMDGPU_CS_CHAIN : CallingConv<[
-  CCIfInReg<CCIfType<[f32, i32, f16, i16, v2i16, v2f16] , CCAssignToReg<
+  CCIfInReg<CCIfType<[f32, i32, f16, i16, v2i16, v2f16, bf16, v2bf16] , CCAssignToReg<
     !foreach(i, !range(105), !cast<Register>("SGPR"#i))
   >>>,
 
-  CCIfNotInReg<CCIfType<[f32, i32, f16, i16, v2i16, v2f16] , CCAssignToReg<
+  CCIfNotInReg<CCIfType<[f32, i32, f16, i16, v2i16, v2f16, bf16, v2bf16] , CCAssignToReg<
     !foreach(i, !range(8, 255), !cast<Register>("VGPR"#i))
   >>>
 ]>;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index b0eac567ec9f18..40a49cbe3f518f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -303,6 +303,7 @@ void AMDGPUDAGToDAGISel::PreprocessISelDAG() {
 
     switch (N->getOpcode()) {
     case ISD::BUILD_VECTOR:
+      // TODO: Match load d16 from shl (extload:i16), 16
       MadeChange |= matchLoadD16FromBuildVector(N);
       break;
     default:
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 541a5b62450ddf..2225bce25edf5c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -3282,7 +3282,15 @@ SDValue AMDGPUTargetLowering::LowerUINT_TO_FP(SDValue Op,
     return DAG.getNode(ISD::UINT_TO_FP, DL, DestVT, Ext);
   }
 
-  assert(SrcVT == MVT::i64 && "operation should be legal");
+  if (DestVT == MVT::bf16) {
+    SDLoc SL(Op);
+    SDValue ToF32 = DAG.getNode(ISD::UINT_TO_FP, SL, MVT::f32, Src);
+    SDValue FPRoundFlag = DAG.getIntPtrConstant(0, SL, /*isTarget=*/true);
+    return DAG.getNode(ISD::FP_ROUND, SL, MVT::bf16, ToF32, FPRoundFlag);
+  }
+
+  if (SrcVT != MVT::i64)
+    return Op;
 
   if (Subtarget->has16BitInsts() && DestVT == MVT::f16) {
     SDLoc DL(Op);
@@ -3320,7 +3328,15 @@ SDValue AMDGPUTargetLowering::LowerSINT_TO_FP(SDValue Op,
     return DAG.getNode(ISD::SINT_TO_FP, DL, DestVT, Ext);
   }
 
-  assert(SrcVT == MVT::i64 && "operation should be legal");
+  if (DestVT == MVT::bf16) {
+    SDLoc SL(Op);
+    SDValue ToF32 = DAG.getNode(ISD::SINT_TO_FP, SL, MVT::f32, Src);
+    SDValue FPRoundFlag = DAG.getIntPtrConstant(0, SL, /*isTarget=*/true);
+    return DAG.getNode(ISD::FP_ROUND, SL, MVT::bf16, ToF32, FPRoundFlag);
+  }
+
+  if (SrcVT != MVT::i64)
+    return Op;
 
   // TODO: Factor out code common with LowerUINT_TO_FP.
 
@@ -3518,7 +3534,7 @@ SDValue AMDGPUTargetLowering::LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG) con
   return DAG.getZExtOrTrunc(V, DL, Op.getValueType());
 }
 
-SDValue AMDGPUTargetLowering::LowerFP_TO_INT(SDValue Op,
+SDValue AMDGPUTargetLowering::LowerFP_TO_INT(const SDValue Op,
                                              SelectionDAG &DAG) const {
   SDValue Src = Op.getOperand(0);
   unsigned OpOpcode = Op.getOpcode();
@@ -3529,6 +3545,12 @@ SDValue AMDGPUTargetLowering::LowerFP_TO_INT(SDValue Op,
   if (SrcVT == MVT::f16 && DestVT == MVT::i16)
     return Op;
 
+  if (SrcVT == MVT::bf16) {
+    SDLoc DL(Op);
+    SDValue PromotedSrc = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, Src);
+    return DAG.getNode(Op.getOpcode(), DL, DestVT, PromotedSrc);
+  }
+
   // Promote i16 to i32
   if (DestVT == MVT::i16 && (SrcVT == MVT::f32 || SrcVT == MVT::f64)) {
     SDLoc DL(Op);
@@ -3537,6 +3559,9 @@ SDValue AMDGPUTargetLowering::LowerFP_TO_INT(SDValue Op,
     return DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, FpToInt32);
   }
 
+  if (DestVT != MVT::i64)
+    return Op;
+
   if (SrcVT == MVT::f16 ||
       (SrcVT == MVT::f32 && Src.getOpcode() == ISD::FP16_TO_FP)) {
     SDLoc DL(Op);
@@ -3547,7 +3572,7 @@ SDValue AMDGPUTargetLowering::LowerFP_TO_INT(SDValue Op,
     return DAG.getNode(Ext, DL, MVT::i64, FpToInt32);
   }
 
-  if (DestVT == MVT::i64 && (SrcVT == MVT::f32 || SrcVT == MVT::f64))
+  if (SrcVT == MVT::f32 || SrcVT == MVT::f64)
     return LowerFP_TO_INT64(Op, DAG, OpOpcode == ISD::FP_TO_SINT);
 
   return SDValue();
@@ -4948,7 +4973,9 @@ SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
     //   vnt1 = build_vector (t1 (bitcast t0:x)), (t1 (bitcast t0:y))
     if (DestVT.isVector()) {
       SDValue Src = N->getOperand(0);
-      if (Src.getOpcode() == ISD::BUILD_VECTOR) {
+      if (Src.getOpcode() == ISD::BUILD_VECTOR &&
+          (DCI.getDAGCombineLevel() < AfterLegalizeDAG ||
+           isOperationLegal(ISD::BUILD_VECTOR, DestVT))) {
         EVT SrcVT = Src.getValueType();
         unsigned NElts = DestVT.getVectorNumElements();
 
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index f3547db9e9bd94..91f347be68fa96 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -151,14 +151,17 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
     if (Subtarget->useRealTrue16Insts()) {
       addRegisterClass(MVT::i16, &AMDGPU::VGPR_16RegClass);
       addRegisterClass(MVT::f16, &AMDGPU::VGPR_16RegClass);
+      addRegisterClass(MVT::bf16, &AMDGPU::VGPR_16RegClass);
     } else {
       addRegisterClass(MVT::i16, &AMDGPU::SReg_32RegClass);
       addRegisterClass(MVT::f16, &AMDGPU::SReg_32RegClass);
+      addRegisterClass(MVT::bf16, &AMDGPU::SReg_32RegClass);
     }
 
     // Unless there are also VOP3P operations, not operations are really legal.
     addRegisterClass(MVT::v2i16, &AMDGPU::SReg_32RegClass);
     addRegisterClass(MVT::v2f16, &AMDGPU::SReg_32RegClass);
+    addRegisterClass(MVT::v2bf16, &AMDGPU::SReg_32RegClass);
     addRegisterClass(MVT::v4i16, &AMDGPU::SReg_64RegClass);
     addRegisterClass(MVT::v4f16, &AMDGPU::SReg_64RegClass);
     addRegisterClass(MVT::v8i16, &AMDGPU::SGPR_128RegClass);
@@ -196,6 +199,41 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
                       MVT::i1,     MVT::v32i32},
                      Custom);
 
+  if (isTypeLegal(MVT::bf16)) {
+    for (unsigned Opc :
+         {ISD::FADD,     ISD::FSUB,       ISD::FMUL,    ISD::FDIV,
+          ISD::FREM,     ISD::FMA,        ISD::FMINNUM, ISD::FMAXNUM,
+          ISD::FMINIMUM, ISD::FMAXIMUM,   ISD::FSQRT,   ISD::FCBRT,
+          ISD::FSIN,     ISD::FCOS,       ISD::FPOW,    ISD::FPOWI,
+          ISD::FLDEXP,   ISD::FFREXP,     ISD::FLOG,    ISD::FLOG2,
+          ISD::FLOG10,   ISD::FEXP,       ISD::FEXP2,   ISD::FEXP10,
+          ISD::FCEIL,    ISD::FTRUNC,     ISD::FRINT,   ISD::FNEARBYINT,
+          ISD::FROUND,   ISD::FROUNDEVEN, ISD::FFLOOR,  ISD::FCANONICALIZE,
+          ISD::SETCC}) {
+      // FIXME: The promoted to type shouldn't need to be explicit
+      setOperationAction(Opc, MVT::bf16, Promote);
+      AddPromotedToType(Opc, MVT::bf16, MVT::f32);
+    }
+
+    setOperationAction(ISD::FP_ROUND, MVT::bf16, Expand);
+
+    setOperationAction(ISD::SELECT, MVT::bf16, Promote);
+    AddPromotedToType(ISD::SELECT, MVT::bf16, MVT::i16);
+
+    // TODO: Could make these legal
+    setOperationAction(ISD::FABS, MVT::bf16, Expand);
+    setOperationAction(ISD::FNEG, MVT::bf16, Expand);
+    setOperationAction(ISD::FCOPYSIGN, MVT::bf16, Expand);
+
+    // We only need to custom lower because we can't specify an action for bf16
+    // sources.
+    setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
+    setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
+
+    setOperationAction(ISD::BUILD_VECTOR, MVT::v2bf16, Promote);
+    AddPromotedToType(ISD::BUILD_VECTOR, MVT::v2bf16, MVT::v2i16);
+  }
+
   setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand);
   setTruncStoreAction(MVT::v3i32, MVT::v3i16, Expand);
   setTruncStoreAction(MVT::v4i32, MVT::v4i16, Expand);
@@ -388,8 +426,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
   // Avoid stack access for these.
   // TODO: Generalize to more vector types.
   setOperationAction({ISD::EXTRACT_VECTOR_ELT, ISD::INSERT_VECTOR_ELT},
-                     {MVT::v2i16, MVT::v2f16, MVT::v2i8, MVT::v4i8, MVT::v8i8,
-                      MVT::v4i16, MVT::v4f16},
+                     {MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v2i8, MVT::v4i8,
+                      MVT::v8i8, MVT::v4i16, MVT::v4f16},
                      Custom);
 
   // Deal with vec3 vector operations when widened to vec4.
@@ -498,6 +536,11 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
   setOperationAction(ISD::BF16_TO_FP, {MVT::i16, MVT::f32, MVT::f64}, Expand);
   setOperationAction(ISD::FP_TO_BF16, {MVT::i16, MVT::f32, MVT::f64}, Expand);
 
+  // Custom lower these because we can't specify a rule based on an illegal
+  // source bf16.
+  setOperationAction({ISD::FP_EXTEND, ISD::STRICT_FP_EXTEND}, MVT::f32, Custom);
+  setOperationAction({ISD::FP_EXTEND, ISD::STRICT_FP_EXTEND}, MVT::f64, Custom);
+
   if (Subtarget->has16BitInsts()) {
     setOperationAction({ISD::Constant, ISD::SMIN, ISD::SMAX, ISD::UMIN,
                         ISD::UMAX, ISD::UADDSAT, ISD::USUBSAT},
@@ -524,9 +567,14 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
     AddPromotedToType(ISD::FP_TO_FP16, MVT::i16, MVT::i32);
 
     setOperationAction({ISD::FP_TO_SINT, ISD::FP_TO_UINT}, MVT::i16, Custom);
+    setOperationAction({ISD::SINT_TO_FP, ISD::UINT_TO_FP}, MVT::i16, Custom);
+    setOperationAction({ISD::SINT_TO_FP, ISD::UINT_TO_FP}, MVT::i16, Custom);
+
+    setOperationAction({ISD::SINT_TO_FP, ISD::UINT_TO_FP}, MVT::i32, Custom);
 
     // F16 - Constant Actions.
     setOperationAction(ISD::ConstantFP, MVT::f16, Legal);
+    setOperationAction(ISD::ConstantFP, MVT::bf16, Legal);
 
     // F16 - Load/Store Actions.
     setOperationAction(ISD::LOAD, MVT::f16, Promote);
@@ -534,16 +582,23 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
     setOperationAction(ISD::STORE, MVT::f16, Promote);
     AddPromotedToType(ISD::STORE, MVT::f16, MVT::i16);
 
+    // BF16 - Load/Store Actions.
+    setOperationAction(ISD::LOAD, MVT::bf16, Promote);
+    AddPromotedToType(ISD::LOAD, MVT::bf16, MVT::i16);
+    setOperationAction(ISD::STORE, MVT::bf16, Promote);
+    AddPromotedToType(ISD::STORE, MVT::bf16, MVT::i16);
+
     // F16 - VOP1 Actions.
     setOperationAction({ISD::FP_ROUND, ISD::STRICT_FP_ROUND, ISD::FCOS,
                         ISD::FSIN, ISD::FROUND, ISD::FPTRUNC_ROUND},
                        MVT::f16, Custom);
 
-    setOperationAction({ISD::SINT_TO_FP, ISD::UINT_TO_FP}, MVT::i16, Custom);
     setOperationAction({ISD::FP_TO_SINT, ISD::FP_TO_UINT}, MVT::f16, Promote);
+    setOperationAction({ISD::FP_TO_SINT, ISD::FP_TO_UINT}, MVT::bf16, Promote);
 
     // F16 - VOP2 Actions.
-    setOperationAction({ISD::BR_CC, ISD::SELECT_CC}, MVT::f16, Expand);
+    setOperationAction({ISD::BR_CC, ISD::SELECT_CC}, {MVT::f16, MVT::bf16},
+                       Expand);
     setOperationAction({ISD::FLDEXP, ISD::STRICT_FLDEXP}, MVT::f16, Custom);
     setOperationAction(ISD::FFREXP, MVT::f16, Custom);
     setOperationAction(ISD::FDIV, MVT::f16, Custom);
@@ -554,8 +609,9 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
       setOperationAction(ISD::FMAD, MVT::f16, Legal);
 
     for (MVT VT :
-         {MVT::v2i16, MVT::v2f16, MVT::v4i16, MVT::v4f16, MVT::v8i16,
-          MVT::v8f16, MVT::v16i16, MVT::v16f16, MVT::v32i16, MVT::v32f16}) {
+         {MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v4i16, MVT::v4f16,
+          MVT::v4bf16, MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16,
+          MVT::v16f16, MVT::v16bf16, MVT::v32i16, MVT::v32f16}) {
       for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
         switch (Op) {
         case ISD::LOAD:
@@ -587,7 +643,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
     // XXX - Do these do anything? Vector constants turn into build_vector.
     setOperationAction(ISD::Constant, {MVT::v2i16, MVT::v2f16}, Legal);
 
-    setOperationAction(ISD::UNDEF, {MVT::v2i16, MVT::v2f16}, Legal);
+    setOperationAction(ISD::UNDEF, {MVT::v2i16, MVT::v2f16, MVT::v2bf16},
+                       Legal);
 
     setOperationAction(ISD::STORE, MVT::v2i16, Promote);
     AddPromotedToType(ISD::STORE, MVT::v2i16, MVT::i32);
@@ -699,7 +756,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
                         ISD::FMAXNUM_IEEE, ISD::FCANONICALIZE},
                        MVT::v2f16, Legal);
 
-    setOperationAction(ISD::EXTRACT_VECTOR_ELT, {MVT::v2i16, MVT::v2f16},
+    setOperationAction(ISD::EXTRACT_VECTOR_ELT, {MVT::v2i16, MVT::v2f16, MVT::v2bf16},
                        Custom);
 
     setOperationAction(ISD::VECTOR_SHUFFLE,
@@ -3901,6 +3958,26 @@ SDValue SITargetLowering::lowerPREFETCH(SDValue Op, SelectionDAG &DAG) const {
   return Op;
 }
 
+// Work around DAG legality rules only based on the result type.
+SDValue SITargetLowering::lowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const {
+  bool IsStrict = Op.getOpcode() == ISD::STRICT_FP_EXTEND;
+  SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
+  EVT SrcVT = Src.getValueType();
+
+  if (SrcVT.getScalarType() != MVT::bf16)
+    return Op;
+
+  SDLoc SL(Op);
+  SDValue BitCast =
+      DAG.getNode(ISD::BITCAST, SL, SrcVT.changeTypeToInteger(), Src);
+
+  EVT DstVT = Op.getValueType();
+  if (IsStrict)
+    llvm_unreachable("Need STRICT_BF16_TO_FP");
+
+  return DAG.getNode(ISD::BF16_TO_FP, SL, DstVT, BitCast);
+}
+
 Register SITargetLowering::getRegisterByName(const char* RegName, LLT VT,
                                              const MachineFunction &MF) const {
   Register Reg = StringSwitch<Register>(RegName)
@@ -5451,6 +5528,9 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
     return lowerGET_ROUNDING(Op, DAG);
   case ISD::PREFETCH:
     return lowerPREFETCH(Op, DAG);
+  case ISD::FP_EXTEND:
+  case ISD::STRICT_FP_EXTEND:
+    return lowerFP_EXTEND(Op, DAG);
   }
   return SDValue();
 }
@@ -6638,7 +6718,7 @@ SDValue SITargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op,
   SDValue BC = DAG.getNode(ISD::BITCAST, SL, IntVT, Vec);
   SDValue Elt = DAG.getNode(ISD::SRL, SL, IntVT, BC, ScaledIdx);
 
-  if (ResultVT == MVT::f16) {
+  if (ResultVT == MVT::f16 || ResultVT == MVT::bf16) {
     SDValue Result = DAG.getNode(ISD::TRUNCATE, SL, MVT::i16, Elt);
     return DAG.getNode(ISD::BITCAST, SL, ResultVT, Result);
   }
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h
index 5bc091d6e84de3..00f9ddf11ea7a5 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -417,6 +417,7 @@ class SITargetLowering final : public AMDGPUTargetLowering {
   SDValue lowerGET_ROUNDING(SDValue Op, SelectionDAG &DAG) const;
 
   SDValue lowerPREFETCH(SDValue Op, SelectionDAG &DAG) const;
+  SDValue lowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const;
 
   Register getRegisterByName(const char* RegName, LLT VT,
                              const MachineFunction &MF) const override;
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 8310c6b57dad58..5d84320c0de0e3 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -1122,7 +1122,7 @@ multiclass f16_fp_Pats<Instruction cvt_f16_f32_inst_e64, Instruction cvt_f32_f16
   >;
 
   def : GCNPat <
-    (f64 (fpextend f16:$src)),
+    (f64 (any_fpextend f16:$src)),
     (V_CVT_F64_F32_e32 (cvt_f32_f16_inst_e64 SRCMODS.NONE, $src))
   >;
 
@@ -1515,6 +1515,23 @@ def : BitConvert <v2f16, f32, SReg_32>;
 def : BitConvert <f32, v2f16, SReg_32>;
 def : BitConvert <v2i16, f32, SReg_32>;
 def : BitConvert <f32, v2i16, SReg_32>;
+def : BitConvert <v2bf16, i32, SReg_32>;
+def : BitConvert <i32, v2bf16, SReg_32>;
+def : BitConvert <v2bf16, i32, VGPR_32>;
+def : BitConvert <i32, v2bf16, VGPR_32>;
+def : BitConvert <v2bf16, v2i16, SReg_32>;
+def : BitConvert <v2i16, v2bf16, SReg_32>;
+def : BitConvert <v2bf16, v2i16, VGPR_32>;
+def : BitConvert <v2i16, v2bf16, VGPR_32>;
+def : BitConvert <v2bf16, v2f16, SReg_32>;
+def : BitConvert <v2f16, v2bf16, SReg_32>;
+def : BitConvert <v2bf16, v2f16, VGPR_32>;
+def : BitConvert <v2f16, v2bf16, VGPR_32>;
+def : BitConvert <f32, v2bf16, VGPR_32>;
+def : BitConvert <v2bf16, f32, VGPR_32>;
+def : BitConvert <f32, v2bf16, SReg_32>;
+def : BitConvert <v2bf16, f32, SReg_32>;
+
 
 // 64-bit bitcast
 def : BitConvert <i64, f64, VReg_64>;
@@ -1958,19 +1975,21 @@ def : GCNPat <
   let SubtargetPredicate = HasPackedFP32Ops;
 }
 
+foreach fp16vt = [f16, bf16] in {
+
 def : GCNPat <
-  (fcopysign f16:$src0, f16:$src1),
+  (fcopysign fp16vt:$src0, fp16vt:$src1),
   (V_BFI_B32_e64 (S_MOV_B32 (i32 0x00007fff)), $src0, $src1)
 >;
 
 def : GCNPat <
-  (fcopysign f32:$src0, f16:$src1),
+  (fcopysign f32:$src0, fp16vt:$src1),
   (V_BFI_B32_e64 (S_MOV_B32 (i32 0x7fffffff)), $src0,
              (V_LSHLREV_B32_e64 (i32 16), $src1))
 >;
 
 def : GCNPat <
-  (fcopysign f64:$src0, f16:$src1),
+  (fcopysign f64:$src0, fp16vt:$src1),
   (REG_SEQUENCE SReg_64,
     (i32 (EXTRACT_SUBREG $src0, sub0)), sub0,
     (V_BFI_B32_e64 (S_MOV_B32 (i32 0x7fffffff)), (i32 (EXTRACT_SUBREG $src0, sub1)),
@@ -1978,16 +1997,17 @@ def : GCNPat <
 >;
 
 def : GCNPat <
-  (fcopysign f16:$src0, f32:$src1),
+  (fcopysign fp16vt:$src0, f32:$src1),
   (V_BFI_B32_e64 (S_MOV_B32 (i32 0x00007fff)), $src0,
              (V_LSHRREV_B32_e64 (i32 16), $src1))
 >;
 
 def : GCNPat <
-  (fcopysign f16:$src0, f64:$src1),
+  (fcopysign fp16vt:$src0, f64:$src1),
   (V_BFI_B32_e64 (S_MOV_B32 (i32 0x00007fff)), $src0,
              (V_LSHRREV_B32_e64 (i32 16), (EXTRACT_SUBREG $src1, sub1)))
 >;
+} // End foreach fp16vt = [f16, bf16]
 
 /********** ================== **********/
 /********** Immediate Patterns **********/
@@ -2026,6 +2046,11 @@ def : GCNPat <
   (V_MOV_B32_e32 (f16 (bitcast_fpimm_to_i32 $imm)))
 >;
 
+def : GCNPat <
+  (VGPRImm<(bf16 fpimm)>:$imm),
+  (V_MOV_B32_e32 (bf16 (bitcast_fpimm_to_i32 $imm)))
+>;
+
 // V_MOV_B64_PSEUDO and S_MOV_B64_IMM_PSEUDO can be used with any 64-bit
 // immediate and wil be expanded as needed, but we will only use these patterns
 // for values which can be encoded.
@@ -2059,6 +2084,11 @@ def : GCNPat <
   (S_MOV_B32 (i32 (bitcast_fpimm_to_i32 $imm)))
 >;
 
+def : GCNPat <
+  (bf16 fpimm:$imm),
+  (S_MOV_B32 (i32 (bitcast_fpimm_to_i32 $imm)))
+>;
+
 def : GCNPat <
   (p5 frameindex:$fi),
   (V_MOV_B32_e32 (p5 (frameindex_to_targetframeindex $fi)))
diff --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll
index adc23860e8965d..5243262117e728 100644
--- a/llvm/test/CodeGen/AMDGPU/bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/bf16.ll
@@ -164,18 +164,13 @@ define <3 x bfloat> @v_load_global_v3bf16(ptr addrspace(1) %ptr) {
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_load_global_v3bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
-; GFX9-NEXT:    s_mov_b32 s4, 0xffff
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff0000, v0
-; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; GFX9-NEXT:    v_and_or_b32 v0, v0, s4, v2
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_load_global_v3bf16:
@@ -183,9 +178,6 @@ define <3 x bfloat> @v_load_global_v3bf16(ptr addrspace(1) %ptr) {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v0
-; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; GFX10-NEXT:    v_and_or_b32 v0, 0xffff, v0, v2
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_load_global_v3bf16:
@@ -193,10 +185,6 @@ define <3 x bfloat> @v_load_global_v3bf16(ptr addrspace(1) %ptr) {
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b64 v[0:1], v[0:1], off
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v0
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT:    v_and_or_b32 v0, 0xffff, v0, v2
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %load = load <3 x bfloat>, ptr addrspace(1) %ptr
   ret <3 x bfloat> %load
@@ -2083,29 +2071,29 @@ define void @test_load_store_bf16_to_f32(ptr addrspace(1) %in, ptr addrspace(1)
 ; GFX9-LABEL: test_load_store_bf16_to_f32:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v4, 0
-; GFX9-NEXT:    global_load_short_d16_hi v4, v[0:1], off
+; GFX9-NEXT:    global_load_ushort v0, v[0:1], off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    global_store_dword v[2:3], v4, off
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX9-NEXT:    global_store_dword v[2:3], v0, off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: test_load_store_bf16_to_f32:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v4, 0
-; GFX10-NEXT:    global_load_short_d16_hi v4, v[0:1], off
+; GFX10-NEXT:    global_load_ushort v0, v[0:1], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    global_store_dword v[2:3], v4, off
+; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX10-NEXT:    global_store_dword v[2:3], v0, off
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: test_load_store_bf16_to_f32:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_mov_b32_e32 v4, 0
-; GFX11-NEXT:    global_load_d16_hi_b16 v4, v[0:1], off
+; GFX11-NEXT:    global_load_u16 v0, v[0:1], off
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    global_store_b32 v[2:3], v4, off
+; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-NEXT:    global_store_b32 v[2:3], v0, off
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %val = load bfloat, ptr addrspace(1) %in
   %val.f32 = fpext bfloat %val to float
@@ -2158,10 +2146,10 @@ define void @test_load_store_bf16_to_f64(ptr addrspace(1) %in, ptr addrspace(1)
 ; GFX9-LABEL: test_load_store_bf16_to_f64:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v4, 0
-; GFX9-NEXT:    global_load_short_d16_hi v4, v[0:1], off
+; GFX9-NEXT:    global_load_ushort v0, v[0:1], off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_cvt_f64_f32_e32 v[0:1], v4
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX9-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
 ; GFX9-NEXT:    global_store_dwordx2 v[2:3], v[0:1], off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
@@ -2169,20 +2157,21 @@ define void @test_load_store_bf16_to_f64(ptr addrspace(1) %in, ptr addrspace(1)
 ; GFX10-LABEL: test_load_store_bf16_to_f64:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v4, 0
-; GFX10-NEXT:    global_load_short_d16_hi v4, v[0:1], off
+; GFX10-NEXT:    global_load_ushort v0, v[0:1], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_cvt_f64_f32_e32 v[0:1], v4
+; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX10-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
 ; GFX10-NEXT:    global_store_dwordx2 v[2:3], v[0:1], off
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: test_load_store_bf16_to_f64:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_mov_b32_e32 v4, 0
-; GFX11-NEXT:    global_load_d16_hi_b16 v4, v[0:1], off
+; GFX11-NEXT:    global_load_u16 v0, v[0:1], off
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_cvt_f64_f32_e32 v[0:1], v4
+; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
 ; GFX11-NEXT:    global_store_b64 v[2:3], v[0:1], off
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %val = load bfloat, ptr addrspace(1) %in
@@ -2503,7 +2492,6 @@ define void @test_arg_store(bfloat %in, ptr addrspace(1) %out) {
 ; GFX8-LABEL: test_arg_store:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX8-NEXT:    flat_store_short v[1:2], v0
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
@@ -2511,20 +2499,20 @@ define void @test_arg_store(bfloat %in, ptr addrspace(1) %out) {
 ; GFX9-LABEL: test_arg_store:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    global_store_short_d16_hi v[1:2], v0, off
+; GFX9-NEXT:    global_store_short v[1:2], v0, off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: test_arg_store:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    global_store_short_d16_hi v[1:2], v0, off
+; GFX10-NEXT:    global_store_short v[1:2], v0, off
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: test_arg_store:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    global_store_d16_hi_b16 v[1:2], v0, off
+; GFX11-NEXT:    global_store_b16 v[1:2], v0, off
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   store bfloat %in, ptr addrspace(1) %out
   ret void
@@ -2905,8 +2893,7 @@ define amdgpu_gfx void @test_inreg_arg_store(bfloat inreg %in, ptr addrspace(1)
 ; GFX8-LABEL: test_inreg_arg_store:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    s_lshr_b32 s34, s4, 16
-; GFX8-NEXT:    v_mov_b32_e32 v2, s34
+; GFX8-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX8-NEXT:    flat_store_short v[0:1], v2
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
@@ -2915,7 +2902,7 @@ define amdgpu_gfx void @test_inreg_arg_store(bfloat inreg %in, ptr addrspace(1)
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s4
-; GFX9-NEXT:    global_store_short_d16_hi v[0:1], v2, off
+; GFX9-NEXT:    global_store_short v[0:1], v2, off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -2923,14 +2910,14 @@ define amdgpu_gfx void @test_inreg_arg_store(bfloat inreg %in, ptr addrspace(1)
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v2, s4
-; GFX10-NEXT:    global_store_short_d16_hi v[0:1], v2, off
+; GFX10-NEXT:    global_store_short v[0:1], v2, off
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: test_inreg_arg_store:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_e32 v2, s4
-; GFX11-NEXT:    global_store_d16_hi_b16 v[0:1], v2, off
+; GFX11-NEXT:    global_store_b16 v[0:1], v2, off
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   store bfloat %in, ptr addrspace(1) %out
   ret void
@@ -2956,28 +2943,27 @@ define bfloat @test_byval(ptr addrspace(5) byval(bfloat) %bv, bfloat %val) {
 ; GFX8-LABEL: test_byval:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX8-NEXT:    buffer_store_short v1, off, s[0:3], s32
+; GFX8-NEXT:    buffer_store_short v0, off, s[0:3], s32
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: test_byval:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    buffer_store_short_d16_hi v0, off, s[0:3], s32
+; GFX9-NEXT:    buffer_store_short v0, off, s[0:3], s32
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: test_byval:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    buffer_store_short_d16_hi v0, off, s[0:3], s32
+; GFX10-NEXT:    buffer_store_short v0, off, s[0:3], s32
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: test_byval:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    scratch_store_d16_hi_b16 off, v0, s32
+; GFX11-NEXT:    scratch_store_b16 off, v0, s32
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   store bfloat %val, ptr addrspace(5) %bv
   %retval = load bfloat, ptr addrspace(5) %bv
@@ -3004,7 +2990,6 @@ define void @test_sret(ptr addrspace(5) sret(bfloat) %sret, bfloat %val) {
 ; GFX8-LABEL: test_sret:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
 ; GFX8-NEXT:    buffer_store_short v1, v0, s[0:3], 0 offen
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
@@ -3012,20 +2997,20 @@ define void @test_sret(ptr addrspace(5) sret(bfloat) %sret, bfloat %val) {
 ; GFX9-LABEL: test_sret:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    buffer_store_short_d16_hi v1, v0, s[0:3], 0 offen
+; GFX9-NEXT:    buffer_store_short v1, v0, s[0:3], 0 offen
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: test_sret:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    buffer_store_short_d16_hi v1, v0, s[0:3], 0 offen
+; GFX10-NEXT:    buffer_store_short v1, v0, s[0:3], 0 offen
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: test_sret:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    scratch_store_d16_hi_b16 v0, v1, off
+; GFX11-NEXT:    scratch_store_b16 v0, v1, off
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   store bfloat %val, ptr addrspace(5) %sret
   ret void
@@ -3245,25 +3230,21 @@ define <3 x bfloat> @test_ret_v3bf16(<3 x bfloat> %in) {
 ; GFX8-LABEL: test_ret_v3bf16:
 ; GFX8:       ; %bb.0: ; %entry
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: test_ret_v3bf16:
 ; GFX9:       ; %bb.0: ; %entry
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: test_ret_v3bf16:
 ; GFX10:       ; %bb.0: ; %entry
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: test_ret_v3bf16:
 ; GFX11:       ; %bb.0: ; %entry
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 entry:
   ret <3 x bfloat> %in
@@ -3450,7 +3431,6 @@ define void @test_call(bfloat %in, ptr addrspace(5) %out) {
 ; GFX8-NEXT:    v_writelane_b32 v2, s31, 1
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX8-NEXT:    buffer_store_short v0, v1, s[0:3], 0 offen
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_readlane_b32 s31, v2, 1
@@ -3480,7 +3460,7 @@ define void @test_call(bfloat %in, ptr addrspace(5) %out) {
 ; GFX9-NEXT:    v_writelane_b32 v2, s31, 1
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX9-NEXT:    buffer_store_short_d16_hi v0, v1, s[0:3], 0 offen
+; GFX9-NEXT:    buffer_store_short v0, v1, s[0:3], 0 offen
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_readlane_b32 s31, v2, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v2, 0
@@ -3510,7 +3490,7 @@ define void @test_call(bfloat %in, ptr addrspace(5) %out) {
 ; GFX10-NEXT:    v_writelane_b32 v2, s31, 1
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX10-NEXT:    buffer_store_short_d16_hi v0, v1, s[0:3], 0 offen
+; GFX10-NEXT:    buffer_store_short v0, v1, s[0:3], 0 offen
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    v_readlane_b32 s31, v2, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v2, 0
@@ -3540,7 +3520,7 @@ define void @test_call(bfloat %in, ptr addrspace(5) %out) {
 ; GFX11-NEXT:    v_writelane_b32 v2, s31, 1
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT:    scratch_store_d16_hi_b16 v1, v0, off dlc
+; GFX11-NEXT:    scratch_store_b16 v1, v0, off dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    v_readlane_b32 s31, v2, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v2, 0
@@ -3836,7 +3816,6 @@ define void @test_call_v3bf16(<3 x bfloat> %in, ptr addrspace(5) %out) {
 ; GFX8-NEXT:    s_addc_u32 s5, s5, test_arg_store_v2bf16 at gotpcrel32@hi+12
 ; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
 ; GFX8-NEXT:    v_writelane_b32 v3, s30, 0
-; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX8-NEXT:    v_writelane_b32 v3, s31, 1
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    s_swappc_b64 s[30:31], s[4:5]
@@ -3869,7 +3848,6 @@ define void @test_call_v3bf16(<3 x bfloat> %in, ptr addrspace(5) %out) {
 ; GFX9-NEXT:    s_addc_u32 s5, s5, test_arg_store_v2bf16 at gotpcrel32@hi+12
 ; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
 ; GFX9-NEXT:    v_writelane_b32 v3, s30, 0
-; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX9-NEXT:    v_writelane_b32 v3, s31, 1
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
@@ -3902,7 +3880,6 @@ define void @test_call_v3bf16(<3 x bfloat> %in, ptr addrspace(5) %out) {
 ; GFX10-NEXT:    s_addc_u32 s5, s5, test_arg_store_v2bf16 at gotpcrel32@hi+12
 ; GFX10-NEXT:    v_writelane_b32 v3, s30, 0
 ; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
-; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX10-NEXT:    v_writelane_b32 v3, s31, 1
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[4:5]
@@ -3935,7 +3912,6 @@ define void @test_call_v3bf16(<3 x bfloat> %in, ptr addrspace(5) %out) {
 ; GFX11-NEXT:    s_addc_u32 s1, s1, test_arg_store_v2bf16 at gotpcrel32@hi+12
 ; GFX11-NEXT:    v_writelane_b32 v3, s30, 0
 ; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX11-NEXT:    v_writelane_b32 v3, s31, 1
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
@@ -4061,18 +4037,10 @@ define void @test_call_v4bf16(<4 x bfloat> %in, ptr addrspace(5) %out) {
 ; GFX8-NEXT:    v_writelane_b32 v3, s31, 1
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX8-NEXT:    v_add_u32_e32 v6, vcc, 4, v2
-; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v0
-; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
-; GFX8-NEXT:    buffer_store_short v1, v6, s[0:3], 0 offen
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    buffer_store_short v0, v2, s[0:3], 0 offen
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 6, v2
-; GFX8-NEXT:    buffer_store_short v5, v0, s[0:3], 0 offen
+; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 4, v2
+; GFX8-NEXT:    buffer_store_dword v1, v4, s[0:3], 0 offen
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 2, v2
-; GFX8-NEXT:    buffer_store_short v4, v0, s[0:3], 0 offen
+; GFX8-NEXT:    buffer_store_dword v0, v2, s[0:3], 0 offen
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_readlane_b32 s31, v3, 1
 ; GFX8-NEXT:    v_readlane_b32 s30, v3, 0
@@ -4101,13 +4069,9 @@ define void @test_call_v4bf16(<4 x bfloat> %in, ptr addrspace(5) %out) {
 ; GFX9-NEXT:    v_writelane_b32 v3, s31, 1
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX9-NEXT:    buffer_store_short_d16_hi v1, v2, s[0:3], 0 offen offset:6
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    buffer_store_short v1, v2, s[0:3], 0 offen offset:4
+; GFX9-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen offset:4
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    buffer_store_short_d16_hi v0, v2, s[0:3], 0 offen offset:2
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    buffer_store_short v0, v2, s[0:3], 0 offen
+; GFX9-NEXT:    buffer_store_dword v0, v2, s[0:3], 0 offen
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_readlane_b32 s31, v3, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v3, 0
@@ -4137,13 +4101,9 @@ define void @test_call_v4bf16(<4 x bfloat> %in, ptr addrspace(5) %out) {
 ; GFX10-NEXT:    v_writelane_b32 v3, s31, 1
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX10-NEXT:    buffer_store_short_d16_hi v1, v2, s[0:3], 0 offen offset:6
-; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    buffer_store_short v1, v2, s[0:3], 0 offen offset:4
+; GFX10-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen offset:4
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    buffer_store_short_d16_hi v0, v2, s[0:3], 0 offen offset:2
-; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    buffer_store_short v0, v2, s[0:3], 0 offen
+; GFX10-NEXT:    buffer_store_dword v0, v2, s[0:3], 0 offen
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    v_readlane_b32 s31, v3, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v3, 0
@@ -4173,18 +4133,12 @@ define void @test_call_v4bf16(<4 x bfloat> %in, ptr addrspace(5) %out) {
 ; GFX11-NEXT:    v_writelane_b32 v3, s31, 1
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT:    v_add_nc_u32_e32 v4, 6, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT:    v_readlane_b32 s31, v3, 1
-; GFX11-NEXT:    v_readlane_b32 s30, v3, 0
-; GFX11-NEXT:    scratch_store_d16_hi_b16 v4, v1, off dlc
-; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    scratch_store_b16 v2, v1, off offset:4 dlc
-; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    scratch_store_d16_hi_b16 v2, v0, off offset:2 dlc
+; GFX11-NEXT:    scratch_store_b32 v2, v1, off offset:4 dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    scratch_store_b16 v2, v0, off dlc
+; GFX11-NEXT:    scratch_store_b32 v2, v0, off dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    v_readlane_b32 s31, v3, 1
+; GFX11-NEXT:    v_readlane_b32 s30, v3, 0
 ; GFX11-NEXT:    s_xor_saveexec_b32 s0, -1
 ; GFX11-NEXT:    scratch_load_b32 v3, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s0
@@ -4333,32 +4287,16 @@ define void @test_call_v8bf16(<8 x bfloat> %in, ptr addrspace(5) %out) {
 ; GFX8-NEXT:    v_writelane_b32 v5, s31, 1
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX8-NEXT:    v_add_u32_e32 v10, vcc, 12, v4
-; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 16, v3
-; GFX8-NEXT:    buffer_store_short v3, v10, s[0:3], 0 offen
+; GFX8-NEXT:    v_add_u32_e32 v6, vcc, 12, v4
+; GFX8-NEXT:    buffer_store_dword v3, v6, s[0:3], 0 offen
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 8, v4
-; GFX8-NEXT:    v_lshrrev_b32_e32 v8, 16, v2
-; GFX8-NEXT:    buffer_store_short v2, v3, s[0:3], 0 offen
+; GFX8-NEXT:    buffer_store_dword v2, v3, s[0:3], 0 offen
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 4, v4
-; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 16, v0
-; GFX8-NEXT:    buffer_store_short v1, v2, s[0:3], 0 offen
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    buffer_store_short v0, v4, s[0:3], 0 offen
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 14, v4
-; GFX8-NEXT:    buffer_store_short v9, v0, s[0:3], 0 offen
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 10, v4
-; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 16, v1
-; GFX8-NEXT:    buffer_store_short v8, v0, s[0:3], 0 offen
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 6, v4
-; GFX8-NEXT:    buffer_store_short v7, v0, s[0:3], 0 offen
+; GFX8-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 2, v4
-; GFX8-NEXT:    buffer_store_short v6, v0, s[0:3], 0 offen
+; GFX8-NEXT:    buffer_store_dword v0, v4, s[0:3], 0 offen
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_readlane_b32 s31, v5, 1
 ; GFX8-NEXT:    v_readlane_b32 s30, v5, 0
@@ -4387,21 +4325,13 @@ define void @test_call_v8bf16(<8 x bfloat> %in, ptr addrspace(5) %out) {
 ; GFX9-NEXT:    v_writelane_b32 v5, s31, 1
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX9-NEXT:    buffer_store_short_d16_hi v3, v4, s[0:3], 0 offen offset:14
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    buffer_store_short v3, v4, s[0:3], 0 offen offset:12
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    buffer_store_short_d16_hi v2, v4, s[0:3], 0 offen offset:10
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    buffer_store_short v2, v4, s[0:3], 0 offen offset:8
+; GFX9-NEXT:    buffer_store_dword v3, v4, s[0:3], 0 offen offset:12
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    buffer_store_short_d16_hi v1, v4, s[0:3], 0 offen offset:6
+; GFX9-NEXT:    buffer_store_dword v2, v4, s[0:3], 0 offen offset:8
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    buffer_store_short v1, v4, s[0:3], 0 offen offset:4
+; GFX9-NEXT:    buffer_store_dword v1, v4, s[0:3], 0 offen offset:4
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    buffer_store_short_d16_hi v0, v4, s[0:3], 0 offen offset:2
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    buffer_store_short v0, v4, s[0:3], 0 offen
+; GFX9-NEXT:    buffer_store_dword v0, v4, s[0:3], 0 offen
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_readlane_b32 s31, v5, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v5, 0
@@ -4431,21 +4361,13 @@ define void @test_call_v8bf16(<8 x bfloat> %in, ptr addrspace(5) %out) {
 ; GFX10-NEXT:    v_writelane_b32 v5, s31, 1
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX10-NEXT:    buffer_store_short_d16_hi v3, v4, s[0:3], 0 offen offset:14
-; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    buffer_store_short v3, v4, s[0:3], 0 offen offset:12
-; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    buffer_store_short_d16_hi v2, v4, s[0:3], 0 offen offset:10
-; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    buffer_store_short v2, v4, s[0:3], 0 offen offset:8
+; GFX10-NEXT:    buffer_store_dword v3, v4, s[0:3], 0 offen offset:12
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    buffer_store_short_d16_hi v1, v4, s[0:3], 0 offen offset:6
+; GFX10-NEXT:    buffer_store_dword v2, v4, s[0:3], 0 offen offset:8
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    buffer_store_short v1, v4, s[0:3], 0 offen offset:4
+; GFX10-NEXT:    buffer_store_dword v1, v4, s[0:3], 0 offen offset:4
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    buffer_store_short_d16_hi v0, v4, s[0:3], 0 offen offset:2
-; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    buffer_store_short v0, v4, s[0:3], 0 offen
+; GFX10-NEXT:    buffer_store_dword v0, v4, s[0:3], 0 offen
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    v_readlane_b32 s31, v5, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v5, 0
@@ -4475,28 +4397,18 @@ define void @test_call_v8bf16(<8 x bfloat> %in, ptr addrspace(5) %out) {
 ; GFX11-NEXT:    v_writelane_b32 v5, s31, 1
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT:    v_add_nc_u32_e32 v6, 14, v4
-; GFX11-NEXT:    v_add_nc_u32_e32 v7, 12, v4
-; GFX11-NEXT:    v_add_nc_u32_e32 v8, 10, v4
-; GFX11-NEXT:    v_add_nc_u32_e32 v9, 6, v4
+; GFX11-NEXT:    v_add_nc_u32_e32 v6, 12, v4
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-NEXT:    v_readlane_b32 s31, v5, 1
-; GFX11-NEXT:    scratch_store_d16_hi_b16 v6, v3, off dlc
-; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    scratch_store_b16 v7, v3, off dlc
-; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    scratch_store_d16_hi_b16 v8, v2, off dlc
-; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    scratch_store_b16 v4, v2, off offset:8 dlc
-; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    scratch_store_d16_hi_b16 v9, v1, off dlc
+; GFX11-NEXT:    v_readlane_b32 s30, v5, 0
+; GFX11-NEXT:    scratch_store_b32 v6, v3, off dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    scratch_store_b16 v4, v1, off offset:4 dlc
+; GFX11-NEXT:    scratch_store_b32 v4, v2, off offset:8 dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    scratch_store_d16_hi_b16 v4, v0, off offset:2 dlc
+; GFX11-NEXT:    scratch_store_b32 v4, v1, off offset:4 dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    scratch_store_b16 v4, v0, off dlc
+; GFX11-NEXT:    scratch_store_b32 v4, v0, off dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    v_readlane_b32 s30, v5, 0
 ; GFX11-NEXT:    s_xor_saveexec_b32 s0, -1
 ; GFX11-NEXT:    scratch_load_b32 v5, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s0
@@ -4709,60 +4621,28 @@ define void @test_call_v16bf16(<16 x bfloat> %in, ptr addrspace(5) %out) {
 ; GFX8-NEXT:    v_writelane_b32 v9, s31, 1
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX8-NEXT:    v_add_u32_e32 v18, vcc, 28, v8
-; GFX8-NEXT:    v_lshrrev_b32_e32 v17, 16, v7
-; GFX8-NEXT:    buffer_store_short v7, v18, s[0:3], 0 offen
+; GFX8-NEXT:    v_add_u32_e32 v10, vcc, 28, v8
+; GFX8-NEXT:    buffer_store_dword v7, v10, s[0:3], 0 offen
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_add_u32_e32 v7, vcc, 24, v8
-; GFX8-NEXT:    v_lshrrev_b32_e32 v16, 16, v6
-; GFX8-NEXT:    buffer_store_short v6, v7, s[0:3], 0 offen
+; GFX8-NEXT:    buffer_store_dword v6, v7, s[0:3], 0 offen
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_add_u32_e32 v6, vcc, 20, v8
-; GFX8-NEXT:    v_lshrrev_b32_e32 v15, 16, v5
-; GFX8-NEXT:    buffer_store_short v5, v6, s[0:3], 0 offen
+; GFX8-NEXT:    buffer_store_dword v5, v6, s[0:3], 0 offen
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_add_u32_e32 v5, vcc, 16, v8
-; GFX8-NEXT:    v_lshrrev_b32_e32 v14, 16, v4
-; GFX8-NEXT:    buffer_store_short v4, v5, s[0:3], 0 offen
+; GFX8-NEXT:    buffer_store_dword v4, v5, s[0:3], 0 offen
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 12, v8
-; GFX8-NEXT:    v_lshrrev_b32_e32 v13, 16, v3
-; GFX8-NEXT:    buffer_store_short v3, v4, s[0:3], 0 offen
+; GFX8-NEXT:    buffer_store_dword v3, v4, s[0:3], 0 offen
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 8, v8
-; GFX8-NEXT:    v_lshrrev_b32_e32 v12, 16, v2
-; GFX8-NEXT:    buffer_store_short v2, v3, s[0:3], 0 offen
+; GFX8-NEXT:    buffer_store_dword v2, v3, s[0:3], 0 offen
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 4, v8
-; GFX8-NEXT:    v_lshrrev_b32_e32 v10, 16, v0
-; GFX8-NEXT:    buffer_store_short v1, v2, s[0:3], 0 offen
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    buffer_store_short v0, v8, s[0:3], 0 offen
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 30, v8
-; GFX8-NEXT:    buffer_store_short v17, v0, s[0:3], 0 offen
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 26, v8
-; GFX8-NEXT:    buffer_store_short v16, v0, s[0:3], 0 offen
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 22, v8
-; GFX8-NEXT:    buffer_store_short v15, v0, s[0:3], 0 offen
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 18, v8
-; GFX8-NEXT:    buffer_store_short v14, v0, s[0:3], 0 offen
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 14, v8
-; GFX8-NEXT:    buffer_store_short v13, v0, s[0:3], 0 offen
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 10, v8
-; GFX8-NEXT:    v_lshrrev_b32_e32 v11, 16, v1
-; GFX8-NEXT:    buffer_store_short v12, v0, s[0:3], 0 offen
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 6, v8
-; GFX8-NEXT:    buffer_store_short v11, v0, s[0:3], 0 offen
+; GFX8-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 2, v8
-; GFX8-NEXT:    buffer_store_short v10, v0, s[0:3], 0 offen
+; GFX8-NEXT:    buffer_store_dword v0, v8, s[0:3], 0 offen
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_readlane_b32 s31, v9, 1
 ; GFX8-NEXT:    v_readlane_b32 s30, v9, 0
@@ -4791,37 +4671,21 @@ define void @test_call_v16bf16(<16 x bfloat> %in, ptr addrspace(5) %out) {
 ; GFX9-NEXT:    v_writelane_b32 v9, s31, 1
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX9-NEXT:    buffer_store_short_d16_hi v7, v8, s[0:3], 0 offen offset:30
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    buffer_store_short v7, v8, s[0:3], 0 offen offset:28
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    buffer_store_short_d16_hi v6, v8, s[0:3], 0 offen offset:26
+; GFX9-NEXT:    buffer_store_dword v7, v8, s[0:3], 0 offen offset:28
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    buffer_store_short v6, v8, s[0:3], 0 offen offset:24
+; GFX9-NEXT:    buffer_store_dword v6, v8, s[0:3], 0 offen offset:24
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    buffer_store_short_d16_hi v5, v8, s[0:3], 0 offen offset:22
+; GFX9-NEXT:    buffer_store_dword v5, v8, s[0:3], 0 offen offset:20
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    buffer_store_short v5, v8, s[0:3], 0 offen offset:20
+; GFX9-NEXT:    buffer_store_dword v4, v8, s[0:3], 0 offen offset:16
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    buffer_store_short_d16_hi v4, v8, s[0:3], 0 offen offset:18
+; GFX9-NEXT:    buffer_store_dword v3, v8, s[0:3], 0 offen offset:12
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    buffer_store_short v4, v8, s[0:3], 0 offen offset:16
+; GFX9-NEXT:    buffer_store_dword v2, v8, s[0:3], 0 offen offset:8
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    buffer_store_short_d16_hi v3, v8, s[0:3], 0 offen offset:14
+; GFX9-NEXT:    buffer_store_dword v1, v8, s[0:3], 0 offen offset:4
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    buffer_store_short v3, v8, s[0:3], 0 offen offset:12
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    buffer_store_short_d16_hi v2, v8, s[0:3], 0 offen offset:10
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    buffer_store_short v2, v8, s[0:3], 0 offen offset:8
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    buffer_store_short_d16_hi v1, v8, s[0:3], 0 offen offset:6
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    buffer_store_short v1, v8, s[0:3], 0 offen offset:4
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    buffer_store_short_d16_hi v0, v8, s[0:3], 0 offen offset:2
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    buffer_store_short v0, v8, s[0:3], 0 offen
+; GFX9-NEXT:    buffer_store_dword v0, v8, s[0:3], 0 offen
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_readlane_b32 s31, v9, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v9, 0
@@ -4851,37 +4715,21 @@ define void @test_call_v16bf16(<16 x bfloat> %in, ptr addrspace(5) %out) {
 ; GFX10-NEXT:    v_writelane_b32 v9, s31, 1
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX10-NEXT:    buffer_store_short_d16_hi v7, v8, s[0:3], 0 offen offset:30
-; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    buffer_store_short v7, v8, s[0:3], 0 offen offset:28
-; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    buffer_store_short_d16_hi v6, v8, s[0:3], 0 offen offset:26
+; GFX10-NEXT:    buffer_store_dword v7, v8, s[0:3], 0 offen offset:28
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    buffer_store_short v6, v8, s[0:3], 0 offen offset:24
+; GFX10-NEXT:    buffer_store_dword v6, v8, s[0:3], 0 offen offset:24
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    buffer_store_short_d16_hi v5, v8, s[0:3], 0 offen offset:22
+; GFX10-NEXT:    buffer_store_dword v5, v8, s[0:3], 0 offen offset:20
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    buffer_store_short v5, v8, s[0:3], 0 offen offset:20
+; GFX10-NEXT:    buffer_store_dword v4, v8, s[0:3], 0 offen offset:16
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    buffer_store_short_d16_hi v4, v8, s[0:3], 0 offen offset:18
+; GFX10-NEXT:    buffer_store_dword v3, v8, s[0:3], 0 offen offset:12
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    buffer_store_short v4, v8, s[0:3], 0 offen offset:16
+; GFX10-NEXT:    buffer_store_dword v2, v8, s[0:3], 0 offen offset:8
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    buffer_store_short_d16_hi v3, v8, s[0:3], 0 offen offset:14
+; GFX10-NEXT:    buffer_store_dword v1, v8, s[0:3], 0 offen offset:4
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    buffer_store_short v3, v8, s[0:3], 0 offen offset:12
-; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    buffer_store_short_d16_hi v2, v8, s[0:3], 0 offen offset:10
-; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    buffer_store_short v2, v8, s[0:3], 0 offen offset:8
-; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    buffer_store_short_d16_hi v1, v8, s[0:3], 0 offen offset:6
-; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    buffer_store_short v1, v8, s[0:3], 0 offen offset:4
-; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    buffer_store_short_d16_hi v0, v8, s[0:3], 0 offen offset:2
-; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    buffer_store_short v0, v8, s[0:3], 0 offen
+; GFX10-NEXT:    buffer_store_dword v0, v8, s[0:3], 0 offen
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    v_readlane_b32 s31, v9, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v9, 0
@@ -4911,51 +4759,28 @@ define void @test_call_v16bf16(<16 x bfloat> %in, ptr addrspace(5) %out) {
 ; GFX11-NEXT:    v_writelane_b32 v9, s31, 1
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT:    v_add_nc_u32_e32 v10, 30, v8
-; GFX11-NEXT:    v_add_nc_u32_e32 v11, 28, v8
+; GFX11-NEXT:    v_add_nc_u32_e32 v10, 28, v8
+; GFX11-NEXT:    v_add_nc_u32_e32 v11, 24, v8
 ; GFX11-NEXT:    v_add_nc_u32_e32 v12, 20, v8
-; GFX11-NEXT:    v_add_nc_u32_e32 v13, 18, v8
+; GFX11-NEXT:    v_add_nc_u32_e32 v13, 12, v8
 ; GFX11-NEXT:    v_readlane_b32 s31, v9, 1
-; GFX11-NEXT:    scratch_store_d16_hi_b16 v10, v7, off dlc
-; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    scratch_store_b16 v11, v7, off dlc
-; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    v_add_nc_u32_e32 v7, 26, v8
-; GFX11-NEXT:    v_add_nc_u32_e32 v10, 24, v8
-; GFX11-NEXT:    v_add_nc_u32_e32 v11, 22, v8
-; GFX11-NEXT:    v_readlane_b32 s30, v9, 0
-; GFX11-NEXT:    scratch_store_d16_hi_b16 v7, v6, off dlc
-; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    scratch_store_b16 v10, v6, off dlc
-; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    scratch_store_d16_hi_b16 v11, v5, off dlc
-; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    scratch_store_b16 v12, v5, off dlc
+; GFX11-NEXT:    scratch_store_b32 v10, v7, off dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    scratch_store_d16_hi_b16 v13, v4, off dlc
+; GFX11-NEXT:    scratch_store_b32 v11, v6, off dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    v_add_nc_u32_e32 v5, 14, v8
-; GFX11-NEXT:    v_add_nc_u32_e32 v6, 12, v8
-; GFX11-NEXT:    v_add_nc_u32_e32 v7, 10, v8
-; GFX11-NEXT:    scratch_store_b16 v8, v4, off offset:16 dlc
+; GFX11-NEXT:    scratch_store_b32 v12, v5, off dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    v_add_nc_u32_e32 v4, 6, v8
-; GFX11-NEXT:    scratch_store_d16_hi_b16 v5, v3, off dlc
+; GFX11-NEXT:    scratch_store_b32 v8, v4, off offset:16 dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    scratch_store_b16 v6, v3, off dlc
+; GFX11-NEXT:    scratch_store_b32 v13, v3, off dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    scratch_store_d16_hi_b16 v7, v2, off dlc
+; GFX11-NEXT:    scratch_store_b32 v8, v2, off offset:8 dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    scratch_store_b16 v8, v2, off offset:8 dlc
+; GFX11-NEXT:    scratch_store_b32 v8, v1, off offset:4 dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    scratch_store_d16_hi_b16 v4, v1, off dlc
-; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    scratch_store_b16 v8, v1, off offset:4 dlc
-; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    scratch_store_d16_hi_b16 v8, v0, off offset:2 dlc
-; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    scratch_store_b16 v8, v0, off dlc
+; GFX11-NEXT:    scratch_store_b32 v8, v0, off dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    v_readlane_b32 s30, v9, 0
 ; GFX11-NEXT:    s_xor_saveexec_b32 s0, -1
 ; GFX11-NEXT:    scratch_load_b32 v9, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s0
@@ -4995,44 +4820,37 @@ define bfloat @test_alloca_load_store_ret(bfloat %in) {
 ; GFX8-LABEL: test_alloca_load_store_ret:
 ; GFX8:       ; %bb.0: ; %entry
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX8-NEXT:    buffer_store_short v0, off, s[0:3], s32
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 glc
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: test_alloca_load_store_ret:
 ; GFX9:       ; %bb.0: ; %entry
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    buffer_store_short_d16_hi v0, off, s[0:3], s32
+; GFX9-NEXT:    buffer_store_short v0, off, s[0:3], s32
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, 0
-; GFX9-NEXT:    buffer_load_short_d16_hi v0, off, s[0:3], s32 glc
+; GFX9-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: test_alloca_load_store_ret:
 ; GFX10:       ; %bb.0: ; %entry
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v1, 0
-; GFX10-NEXT:    buffer_store_short_d16_hi v0, off, s[0:3], s32
+; GFX10-NEXT:    buffer_store_short v0, off, s[0:3], s32
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    buffer_load_short_d16_hi v1, off, s[0:3], s32 glc dlc
+; GFX10-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 glc dlc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v0, v1
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: test_alloca_load_store_ret:
 ; GFX11:       ; %bb.0: ; %entry
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_mov_b32_e32 v1, 0
-; GFX11-NEXT:    scratch_store_d16_hi_b16 off, v0, s32 dlc
+; GFX11-NEXT:    scratch_store_b16 off, v0, s32 dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    scratch_load_d16_hi_b16 v1, off, s32 glc dlc
+; GFX11-NEXT:    scratch_load_u16 v0, off, s32 glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_mov_b32_e32 v0, v1
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %in.addr = alloca bfloat, align 2, addrspace(5)
@@ -5219,7 +5037,6 @@ define { <32 x i32>, bfloat } @test_overflow_stack(bfloat %a, <32 x i32> %b) {
 ; GFX8-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen
 ; GFX8-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:8
 ; GFX8-NEXT:    v_add_u32_e32 v31, vcc, 0x7c, v0
-; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    buffer_store_dword v2, v31, s[0:3], 0 offen
 ; GFX8-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:4
@@ -5334,7 +5151,7 @@ define { <32 x i32>, bfloat } @test_overflow_stack(bfloat %a, <32 x i32> %b) {
 ; GFX9-NEXT:    buffer_store_dword v20, v0, s[0:3], 0 offen offset:120
 ; GFX9-NEXT:    s_waitcnt vmcnt(18)
 ; GFX9-NEXT:    buffer_store_dword v18, v0, s[0:3], 0 offen offset:116
-; GFX9-NEXT:    buffer_store_short_d16_hi v1, v0, s[0:3], 0 offen offset:128
+; GFX9-NEXT:    buffer_store_short v1, v0, s[0:3], 0 offen offset:128
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -5380,7 +5197,7 @@ define { <32 x i32>, bfloat } @test_overflow_stack(bfloat %a, <32 x i32> %b) {
 ; GFX10-NEXT:    buffer_store_dword v32, v0, s[0:3], 0 offen offset:120
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    buffer_store_dword v33, v0, s[0:3], 0 offen offset:116
-; GFX10-NEXT:    buffer_store_short_d16_hi v1, v0, s[0:3], 0 offen offset:128
+; GFX10-NEXT:    buffer_store_short v1, v0, s[0:3], 0 offen offset:128
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: test_overflow_stack:
@@ -5396,7 +5213,7 @@ define { <32 x i32>, bfloat } @test_overflow_stack(bfloat %a, <32 x i32> %b) {
 ; GFX11-NEXT:    scratch_store_b128 off, v[10:13], s0 offset:32
 ; GFX11-NEXT:    scratch_store_b128 off, v[6:9], s0 offset:16
 ; GFX11-NEXT:    scratch_store_b128 off, v[2:5], s0
-; GFX11-NEXT:    scratch_store_d16_hi_b16 off, v1, s0 offset:128
+; GFX11-NEXT:    scratch_store_b16 off, v1, s0 offset:128
 ; GFX11-NEXT:    s_add_i32 s1, s0, 0x70
 ; GFX11-NEXT:    s_add_i32 s2, s0, 0x60
 ; GFX11-NEXT:    s_add_i32 s3, s0, 0x50
@@ -5521,11 +5338,11 @@ define <3 x float> @global_extload_v3bf16_to_v3f32(ptr addrspace(1) %ptr) {
 ; GFX9-LABEL: global_extload_v3bf16_to_v3f32:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    global_load_dwordx2 v[2:3], v[0:1], off
+; GFX9-NEXT:    global_load_dwordx2 v[1:2], v[0:1], off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v2
-; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v2
-; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
+; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: global_extload_v3bf16_to_v3f32:
@@ -5597,12 +5414,12 @@ define <4 x float> @global_extload_v4bf16_to_v4f32(ptr addrspace(1) %ptr) {
 ; GFX9-LABEL: global_extload_v4bf16_to_v4f32:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    global_load_dwordx2 v[4:5], v[0:1], off
+; GFX9-NEXT:    global_load_dwordx2 v[2:3], v[0:1], off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v4
-; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v5
-; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v4
-; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v5
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v2
+; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v2
+; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
+; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: global_extload_v4bf16_to_v4f32:
@@ -5687,43 +5504,45 @@ define <5 x float> @global_extload_v5bf16_to_v5f32(ptr addrspace(1) %ptr) {
 ; GFX9-LABEL: global_extload_v5bf16_to_v5f32:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    global_load_dwordx2 v[5:6], v[0:1], off
-; GFX9-NEXT:    v_mov_b32_e32 v4, 0
-; GFX9-NEXT:    global_load_short_d16_hi v4, v[0:1], off offset:8
+; GFX9-NEXT:    global_load_ushort v4, v[0:1], off offset:8
+; GFX9-NEXT:    global_load_dwordx2 v[2:3], v[0:1], off
 ; GFX9-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v5
-; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v6
-; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v5
-; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v6
+; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v2
+; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v2
+; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
+; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: global_extload_v5bf16_to_v5f32:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_clause 0x1
 ; GFX10-NEXT:    global_load_dwordx2 v[2:3], v[0:1], off
-; GFX10-NEXT:    v_mov_b32_e32 v4, 0
-; GFX10-NEXT:    global_load_short_d16_hi v4, v[0:1], off offset:8
+; GFX10-NEXT:    global_load_ushort v4, v[0:1], off offset:8
 ; GFX10-NEXT:    s_waitcnt vmcnt(1)
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v2
 ; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v2
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
 ; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: global_extload_v5bf16_to_v5f32:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_clause 0x1
 ; GFX11-NEXT:    global_load_b64 v[2:3], v[0:1], off
-; GFX11-NEXT:    v_mov_b32_e32 v4, 0
-; GFX11-NEXT:    global_load_d16_hi_b16 v4, v[0:1], off offset:8
+; GFX11-NEXT:    global_load_u16 v4, v[0:1], off offset:8
 ; GFX11-NEXT:    s_waitcnt vmcnt(1)
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v2
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v2
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v2
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
 ; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %load = load <5 x bfloat>, ptr addrspace(1) %ptr
   %fpext = fpext <5 x bfloat> %load to <5 x float>
@@ -5781,40 +5600,40 @@ define <6 x float> @global_extload_v6bf16_to_v6f32(ptr addrspace(1) %ptr) {
 ; GFX9-LABEL: global_extload_v6bf16_to_v6f32:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    global_load_dwordx3 v[6:8], v[0:1], off
+; GFX9-NEXT:    global_load_dwordx3 v[3:5], v[0:1], off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v6
-; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v7
-; GFX9-NEXT:    v_and_b32_e32 v5, 0xffff0000, v8
-; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v6
-; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v7
-; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 16, v8
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v3
+; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v3
+; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v4
+; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v4
+; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 16, v5
+; GFX9-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: global_extload_v6bf16_to_v6f32:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    global_load_dwordx3 v[4:6], v[0:1], off
+; GFX10-NEXT:    global_load_dwordx3 v[3:5], v[0:1], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v4
-; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v4
-; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v5
-; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v5
-; GFX10-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
-; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v6
+; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v3
+; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v3
+; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v4
+; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v4
+; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v5
+; GFX10-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: global_extload_v6bf16_to_v6f32:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    global_load_b96 v[4:6], v[0:1], off
+; GFX11-NEXT:    global_load_b96 v[3:5], v[0:1], off
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v4
-; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v4
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v5
-; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v5
-; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
-; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 16, v6
+; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v3
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v3
+; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v4
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v4
+; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 16, v5
+; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %load = load <6 x bfloat>, ptr addrspace(1) %ptr
   %fpext = fpext <6 x bfloat> %load to <6 x float>
@@ -5878,46 +5697,46 @@ define <8 x float> @global_extload_v8bf16_to_v8f32(ptr addrspace(1) %ptr) {
 ; GFX9-LABEL: global_extload_v8bf16_to_v8f32:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    global_load_dwordx4 v[8:11], v[0:1], off
+; GFX9-NEXT:    global_load_dwordx4 v[4:7], v[0:1], off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v8
-; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v9
-; GFX9-NEXT:    v_and_b32_e32 v5, 0xffff0000, v10
-; GFX9-NEXT:    v_and_b32_e32 v7, 0xffff0000, v11
-; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v8
-; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v9
-; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 16, v10
-; GFX9-NEXT:    v_lshlrev_b32_e32 v6, 16, v11
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v4
+; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v4
+; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v5
+; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v5
+; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 16, v6
+; GFX9-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
+; GFX9-NEXT:    v_lshlrev_b32_e32 v6, 16, v7
+; GFX9-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: global_extload_v8bf16_to_v8f32:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    global_load_dwordx4 v[7:10], v[0:1], off
+; GFX10-NEXT:    global_load_dwordx4 v[4:7], v[0:1], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v7
-; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v8
-; GFX10-NEXT:    v_and_b32_e32 v5, 0xffff0000, v9
-; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v7
-; GFX10-NEXT:    v_and_b32_e32 v7, 0xffff0000, v10
-; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v8
-; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v9
-; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 16, v10
+; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v4
+; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v4
+; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v5
+; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v5
+; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v6
+; GFX10-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
+; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 16, v7
+; GFX10-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: global_extload_v8bf16_to_v8f32:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    global_load_b128 v[7:10], v[0:1], off
+; GFX11-NEXT:    global_load_b128 v[4:7], v[0:1], off
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v7
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v8
-; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff0000, v9
-; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v7
-; GFX11-NEXT:    v_and_b32_e32 v7, 0xffff0000, v10
-; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v8
-; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 16, v9
-; GFX11-NEXT:    v_lshlrev_b32_e32 v6, 16, v10
+; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v4
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v4
+; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v5
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v5
+; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 16, v6
+; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
+; GFX11-NEXT:    v_lshlrev_b32_e32 v6, 16, v7
+; GFX11-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %load = load <8 x bfloat>, ptr addrspace(1) %ptr
   %fpext = fpext <8 x bfloat> %load to <8 x float>
@@ -6013,78 +5832,78 @@ define <16 x float> @global_extload_v16bf16_to_v16f32(ptr addrspace(1) %ptr) {
 ; GFX9-LABEL: global_extload_v16bf16_to_v16f32:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    global_load_dwordx4 v[16:19], v[0:1], off
-; GFX9-NEXT:    global_load_dwordx4 v[20:23], v[0:1], off offset:16
+; GFX9-NEXT:    global_load_dwordx4 v[4:7], v[0:1], off
+; GFX9-NEXT:    global_load_dwordx4 v[12:15], v[0:1], off offset:16
 ; GFX9-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v16
-; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v17
-; GFX9-NEXT:    v_and_b32_e32 v5, 0xffff0000, v18
-; GFX9-NEXT:    v_and_b32_e32 v7, 0xffff0000, v19
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v4
+; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v4
+; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v5
+; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v5
+; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 16, v6
+; GFX9-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
+; GFX9-NEXT:    v_lshlrev_b32_e32 v6, 16, v7
+; GFX9-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v9, 0xffff0000, v20
-; GFX9-NEXT:    v_and_b32_e32 v11, 0xffff0000, v21
-; GFX9-NEXT:    v_and_b32_e32 v13, 0xffff0000, v22
-; GFX9-NEXT:    v_and_b32_e32 v15, 0xffff0000, v23
-; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v16
-; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v17
-; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 16, v18
-; GFX9-NEXT:    v_lshlrev_b32_e32 v6, 16, v19
-; GFX9-NEXT:    v_lshlrev_b32_e32 v8, 16, v20
-; GFX9-NEXT:    v_lshlrev_b32_e32 v10, 16, v21
-; GFX9-NEXT:    v_lshlrev_b32_e32 v12, 16, v22
-; GFX9-NEXT:    v_lshlrev_b32_e32 v14, 16, v23
+; GFX9-NEXT:    v_lshlrev_b32_e32 v8, 16, v12
+; GFX9-NEXT:    v_and_b32_e32 v9, 0xffff0000, v12
+; GFX9-NEXT:    v_lshlrev_b32_e32 v10, 16, v13
+; GFX9-NEXT:    v_and_b32_e32 v11, 0xffff0000, v13
+; GFX9-NEXT:    v_lshlrev_b32_e32 v12, 16, v14
+; GFX9-NEXT:    v_and_b32_e32 v13, 0xffff0000, v14
+; GFX9-NEXT:    v_lshlrev_b32_e32 v14, 16, v15
+; GFX9-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: global_extload_v16bf16_to_v16f32:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_clause 0x1
-; GFX10-NEXT:    global_load_dwordx4 v[16:19], v[0:1], off
-; GFX10-NEXT:    global_load_dwordx4 v[20:23], v[0:1], off offset:16
+; GFX10-NEXT:    global_load_dwordx4 v[4:7], v[0:1], off
+; GFX10-NEXT:    global_load_dwordx4 v[12:15], v[0:1], off offset:16
 ; GFX10-NEXT:    s_waitcnt vmcnt(1)
-; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v16
-; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v17
-; GFX10-NEXT:    v_and_b32_e32 v5, 0xffff0000, v18
-; GFX10-NEXT:    v_and_b32_e32 v7, 0xffff0000, v19
+; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v4
+; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v4
+; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v5
+; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v5
+; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v6
+; GFX10-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
+; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 16, v7
+; GFX10-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_and_b32_e32 v9, 0xffff0000, v20
-; GFX10-NEXT:    v_and_b32_e32 v11, 0xffff0000, v21
-; GFX10-NEXT:    v_and_b32_e32 v13, 0xffff0000, v22
-; GFX10-NEXT:    v_and_b32_e32 v15, 0xffff0000, v23
-; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v16
-; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v17
-; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v18
-; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 16, v19
-; GFX10-NEXT:    v_lshlrev_b32_e32 v8, 16, v20
-; GFX10-NEXT:    v_lshlrev_b32_e32 v10, 16, v21
-; GFX10-NEXT:    v_lshlrev_b32_e32 v12, 16, v22
-; GFX10-NEXT:    v_lshlrev_b32_e32 v14, 16, v23
+; GFX10-NEXT:    v_lshlrev_b32_e32 v8, 16, v12
+; GFX10-NEXT:    v_and_b32_e32 v9, 0xffff0000, v12
+; GFX10-NEXT:    v_lshlrev_b32_e32 v10, 16, v13
+; GFX10-NEXT:    v_and_b32_e32 v11, 0xffff0000, v13
+; GFX10-NEXT:    v_lshlrev_b32_e32 v12, 16, v14
+; GFX10-NEXT:    v_and_b32_e32 v13, 0xffff0000, v14
+; GFX10-NEXT:    v_lshlrev_b32_e32 v14, 16, v15
+; GFX10-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: global_extload_v16bf16_to_v16f32:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    global_load_b128 v[16:19], v[0:1], off
-; GFX11-NEXT:    global_load_b128 v[20:23], v[0:1], off offset:16
+; GFX11-NEXT:    global_load_b128 v[4:7], v[0:1], off
+; GFX11-NEXT:    global_load_b128 v[12:15], v[0:1], off offset:16
 ; GFX11-NEXT:    s_waitcnt vmcnt(1)
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v16
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v17
-; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff0000, v18
-; GFX11-NEXT:    v_and_b32_e32 v7, 0xffff0000, v19
+; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v4
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v4
+; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v5
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v5
+; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 16, v6
+; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
+; GFX11-NEXT:    v_lshlrev_b32_e32 v6, 16, v7
+; GFX11-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_and_b32_e32 v9, 0xffff0000, v20
-; GFX11-NEXT:    v_and_b32_e32 v11, 0xffff0000, v21
-; GFX11-NEXT:    v_and_b32_e32 v13, 0xffff0000, v22
-; GFX11-NEXT:    v_and_b32_e32 v15, 0xffff0000, v23
-; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v16
-; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v17
-; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 16, v18
-; GFX11-NEXT:    v_lshlrev_b32_e32 v6, 16, v19
-; GFX11-NEXT:    v_lshlrev_b32_e32 v8, 16, v20
-; GFX11-NEXT:    v_lshlrev_b32_e32 v10, 16, v21
-; GFX11-NEXT:    v_lshlrev_b32_e32 v12, 16, v22
-; GFX11-NEXT:    v_lshlrev_b32_e32 v14, 16, v23
+; GFX11-NEXT:    v_lshlrev_b32_e32 v8, 16, v12
+; GFX11-NEXT:    v_and_b32_e32 v9, 0xffff0000, v12
+; GFX11-NEXT:    v_lshlrev_b32_e32 v10, 16, v13
+; GFX11-NEXT:    v_and_b32_e32 v11, 0xffff0000, v13
+; GFX11-NEXT:    v_lshlrev_b32_e32 v12, 16, v14
+; GFX11-NEXT:    v_and_b32_e32 v13, 0xffff0000, v14
+; GFX11-NEXT:    v_lshlrev_b32_e32 v14, 16, v15
+; GFX11-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %load = load <16 x bfloat>, ptr addrspace(1) %ptr
   %fpext = fpext <16 x bfloat> %load to <16 x float>
@@ -6244,138 +6063,138 @@ define <32 x float> @global_extload_v32bf16_to_v32f32(ptr addrspace(1) %ptr) {
 ; GFX9-LABEL: global_extload_v32bf16_to_v32f32:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    global_load_dwordx4 v[16:19], v[0:1], off
-; GFX9-NEXT:    global_load_dwordx4 v[20:23], v[0:1], off offset:16
-; GFX9-NEXT:    global_load_dwordx4 v[24:27], v[0:1], off offset:32
-; GFX9-NEXT:    global_load_dwordx4 v[32:35], v[0:1], off offset:48
+; GFX9-NEXT:    global_load_dwordx4 v[4:7], v[0:1], off
+; GFX9-NEXT:    global_load_dwordx4 v[12:15], v[0:1], off offset:16
+; GFX9-NEXT:    global_load_dwordx4 v[20:23], v[0:1], off offset:32
+; GFX9-NEXT:    global_load_dwordx4 v[28:31], v[0:1], off offset:48
 ; GFX9-NEXT:    s_waitcnt vmcnt(3)
-; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v16
-; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v17
-; GFX9-NEXT:    v_and_b32_e32 v5, 0xffff0000, v18
-; GFX9-NEXT:    v_and_b32_e32 v7, 0xffff0000, v19
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v4
+; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v4
+; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v5
+; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v5
+; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 16, v6
+; GFX9-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
+; GFX9-NEXT:    v_lshlrev_b32_e32 v6, 16, v7
+; GFX9-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
 ; GFX9-NEXT:    s_waitcnt vmcnt(2)
-; GFX9-NEXT:    v_and_b32_e32 v9, 0xffff0000, v20
-; GFX9-NEXT:    v_and_b32_e32 v11, 0xffff0000, v21
-; GFX9-NEXT:    v_and_b32_e32 v13, 0xffff0000, v22
-; GFX9-NEXT:    v_and_b32_e32 v15, 0xffff0000, v23
-; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v16
-; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v17
-; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 16, v18
-; GFX9-NEXT:    v_lshlrev_b32_e32 v6, 16, v19
-; GFX9-NEXT:    v_lshlrev_b32_e32 v8, 16, v20
-; GFX9-NEXT:    v_lshlrev_b32_e32 v10, 16, v21
-; GFX9-NEXT:    v_lshlrev_b32_e32 v12, 16, v22
-; GFX9-NEXT:    v_lshlrev_b32_e32 v14, 16, v23
+; GFX9-NEXT:    v_lshlrev_b32_e32 v8, 16, v12
+; GFX9-NEXT:    v_and_b32_e32 v9, 0xffff0000, v12
+; GFX9-NEXT:    v_lshlrev_b32_e32 v10, 16, v13
+; GFX9-NEXT:    v_and_b32_e32 v11, 0xffff0000, v13
+; GFX9-NEXT:    v_lshlrev_b32_e32 v12, 16, v14
+; GFX9-NEXT:    v_and_b32_e32 v13, 0xffff0000, v14
+; GFX9-NEXT:    v_lshlrev_b32_e32 v14, 16, v15
+; GFX9-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
 ; GFX9-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-NEXT:    v_and_b32_e32 v17, 0xffff0000, v24
-; GFX9-NEXT:    v_and_b32_e32 v19, 0xffff0000, v25
-; GFX9-NEXT:    v_and_b32_e32 v21, 0xffff0000, v26
-; GFX9-NEXT:    v_and_b32_e32 v23, 0xffff0000, v27
-; GFX9-NEXT:    v_lshlrev_b32_e32 v16, 16, v24
-; GFX9-NEXT:    v_lshlrev_b32_e32 v18, 16, v25
-; GFX9-NEXT:    v_lshlrev_b32_e32 v20, 16, v26
-; GFX9-NEXT:    v_lshlrev_b32_e32 v22, 16, v27
+; GFX9-NEXT:    v_lshlrev_b32_e32 v16, 16, v20
+; GFX9-NEXT:    v_and_b32_e32 v17, 0xffff0000, v20
+; GFX9-NEXT:    v_lshlrev_b32_e32 v18, 16, v21
+; GFX9-NEXT:    v_and_b32_e32 v19, 0xffff0000, v21
+; GFX9-NEXT:    v_lshlrev_b32_e32 v20, 16, v22
+; GFX9-NEXT:    v_and_b32_e32 v21, 0xffff0000, v22
+; GFX9-NEXT:    v_lshlrev_b32_e32 v22, 16, v23
+; GFX9-NEXT:    v_and_b32_e32 v23, 0xffff0000, v23
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v25, 0xffff0000, v32
-; GFX9-NEXT:    v_and_b32_e32 v27, 0xffff0000, v33
-; GFX9-NEXT:    v_and_b32_e32 v29, 0xffff0000, v34
-; GFX9-NEXT:    v_and_b32_e32 v31, 0xffff0000, v35
-; GFX9-NEXT:    v_lshlrev_b32_e32 v24, 16, v32
-; GFX9-NEXT:    v_lshlrev_b32_e32 v26, 16, v33
-; GFX9-NEXT:    v_lshlrev_b32_e32 v28, 16, v34
-; GFX9-NEXT:    v_lshlrev_b32_e32 v30, 16, v35
+; GFX9-NEXT:    v_lshlrev_b32_e32 v24, 16, v28
+; GFX9-NEXT:    v_and_b32_e32 v25, 0xffff0000, v28
+; GFX9-NEXT:    v_lshlrev_b32_e32 v26, 16, v29
+; GFX9-NEXT:    v_and_b32_e32 v27, 0xffff0000, v29
+; GFX9-NEXT:    v_lshlrev_b32_e32 v28, 16, v30
+; GFX9-NEXT:    v_and_b32_e32 v29, 0xffff0000, v30
+; GFX9-NEXT:    v_lshlrev_b32_e32 v30, 16, v31
+; GFX9-NEXT:    v_and_b32_e32 v31, 0xffff0000, v31
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: global_extload_v32bf16_to_v32f32:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_clause 0x3
-; GFX10-NEXT:    global_load_dwordx4 v[32:35], v[0:1], off
-; GFX10-NEXT:    global_load_dwordx4 v[36:39], v[0:1], off offset:16
-; GFX10-NEXT:    global_load_dwordx4 v[48:51], v[0:1], off offset:32
-; GFX10-NEXT:    global_load_dwordx4 v[52:55], v[0:1], off offset:48
+; GFX10-NEXT:    global_load_dwordx4 v[4:7], v[0:1], off
+; GFX10-NEXT:    global_load_dwordx4 v[12:15], v[0:1], off offset:16
+; GFX10-NEXT:    global_load_dwordx4 v[20:23], v[0:1], off offset:32
+; GFX10-NEXT:    global_load_dwordx4 v[28:31], v[0:1], off offset:48
 ; GFX10-NEXT:    s_waitcnt vmcnt(3)
-; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v32
-; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v33
-; GFX10-NEXT:    v_and_b32_e32 v5, 0xffff0000, v34
-; GFX10-NEXT:    v_and_b32_e32 v7, 0xffff0000, v35
+; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v4
+; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v4
+; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v5
+; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v5
+; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v6
+; GFX10-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
+; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 16, v7
+; GFX10-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
 ; GFX10-NEXT:    s_waitcnt vmcnt(2)
-; GFX10-NEXT:    v_and_b32_e32 v9, 0xffff0000, v36
-; GFX10-NEXT:    v_and_b32_e32 v11, 0xffff0000, v37
-; GFX10-NEXT:    v_and_b32_e32 v13, 0xffff0000, v38
-; GFX10-NEXT:    v_and_b32_e32 v15, 0xffff0000, v39
+; GFX10-NEXT:    v_lshlrev_b32_e32 v8, 16, v12
+; GFX10-NEXT:    v_and_b32_e32 v9, 0xffff0000, v12
+; GFX10-NEXT:    v_lshlrev_b32_e32 v10, 16, v13
+; GFX10-NEXT:    v_and_b32_e32 v11, 0xffff0000, v13
+; GFX10-NEXT:    v_lshlrev_b32_e32 v12, 16, v14
+; GFX10-NEXT:    v_and_b32_e32 v13, 0xffff0000, v14
+; GFX10-NEXT:    v_lshlrev_b32_e32 v14, 16, v15
+; GFX10-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
 ; GFX10-NEXT:    s_waitcnt vmcnt(1)
-; GFX10-NEXT:    v_and_b32_e32 v17, 0xffff0000, v48
-; GFX10-NEXT:    v_and_b32_e32 v19, 0xffff0000, v49
-; GFX10-NEXT:    v_and_b32_e32 v21, 0xffff0000, v50
-; GFX10-NEXT:    v_and_b32_e32 v23, 0xffff0000, v51
+; GFX10-NEXT:    v_lshlrev_b32_e32 v16, 16, v20
+; GFX10-NEXT:    v_and_b32_e32 v17, 0xffff0000, v20
+; GFX10-NEXT:    v_lshlrev_b32_e32 v18, 16, v21
+; GFX10-NEXT:    v_and_b32_e32 v19, 0xffff0000, v21
+; GFX10-NEXT:    v_lshlrev_b32_e32 v20, 16, v22
+; GFX10-NEXT:    v_and_b32_e32 v21, 0xffff0000, v22
+; GFX10-NEXT:    v_lshlrev_b32_e32 v22, 16, v23
+; GFX10-NEXT:    v_and_b32_e32 v23, 0xffff0000, v23
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_and_b32_e32 v25, 0xffff0000, v52
-; GFX10-NEXT:    v_and_b32_e32 v27, 0xffff0000, v53
-; GFX10-NEXT:    v_and_b32_e32 v29, 0xffff0000, v54
-; GFX10-NEXT:    v_and_b32_e32 v31, 0xffff0000, v55
-; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v32
-; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v33
-; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v34
-; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 16, v35
-; GFX10-NEXT:    v_lshlrev_b32_e32 v8, 16, v36
-; GFX10-NEXT:    v_lshlrev_b32_e32 v10, 16, v37
-; GFX10-NEXT:    v_lshlrev_b32_e32 v12, 16, v38
-; GFX10-NEXT:    v_lshlrev_b32_e32 v14, 16, v39
-; GFX10-NEXT:    v_lshlrev_b32_e32 v16, 16, v48
-; GFX10-NEXT:    v_lshlrev_b32_e32 v18, 16, v49
-; GFX10-NEXT:    v_lshlrev_b32_e32 v20, 16, v50
-; GFX10-NEXT:    v_lshlrev_b32_e32 v22, 16, v51
-; GFX10-NEXT:    v_lshlrev_b32_e32 v24, 16, v52
-; GFX10-NEXT:    v_lshlrev_b32_e32 v26, 16, v53
-; GFX10-NEXT:    v_lshlrev_b32_e32 v28, 16, v54
-; GFX10-NEXT:    v_lshlrev_b32_e32 v30, 16, v55
+; GFX10-NEXT:    v_lshlrev_b32_e32 v24, 16, v28
+; GFX10-NEXT:    v_and_b32_e32 v25, 0xffff0000, v28
+; GFX10-NEXT:    v_lshlrev_b32_e32 v26, 16, v29
+; GFX10-NEXT:    v_and_b32_e32 v27, 0xffff0000, v29
+; GFX10-NEXT:    v_lshlrev_b32_e32 v28, 16, v30
+; GFX10-NEXT:    v_and_b32_e32 v29, 0xffff0000, v30
+; GFX10-NEXT:    v_lshlrev_b32_e32 v30, 16, v31
+; GFX10-NEXT:    v_and_b32_e32 v31, 0xffff0000, v31
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: global_extload_v32bf16_to_v32f32:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_clause 0x3
-; GFX11-NEXT:    global_load_b128 v[32:35], v[0:1], off
-; GFX11-NEXT:    global_load_b128 v[36:39], v[0:1], off offset:16
-; GFX11-NEXT:    global_load_b128 v[48:51], v[0:1], off offset:32
-; GFX11-NEXT:    global_load_b128 v[52:55], v[0:1], off offset:48
+; GFX11-NEXT:    global_load_b128 v[4:7], v[0:1], off
+; GFX11-NEXT:    global_load_b128 v[12:15], v[0:1], off offset:16
+; GFX11-NEXT:    global_load_b128 v[20:23], v[0:1], off offset:32
+; GFX11-NEXT:    global_load_b128 v[28:31], v[0:1], off offset:48
 ; GFX11-NEXT:    s_waitcnt vmcnt(3)
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v32
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v33
-; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff0000, v34
-; GFX11-NEXT:    v_and_b32_e32 v7, 0xffff0000, v35
+; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v4
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v4
+; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v5
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v5
+; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 16, v6
+; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
+; GFX11-NEXT:    v_lshlrev_b32_e32 v6, 16, v7
+; GFX11-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
 ; GFX11-NEXT:    s_waitcnt vmcnt(2)
-; GFX11-NEXT:    v_and_b32_e32 v9, 0xffff0000, v36
-; GFX11-NEXT:    v_and_b32_e32 v11, 0xffff0000, v37
-; GFX11-NEXT:    v_and_b32_e32 v13, 0xffff0000, v38
-; GFX11-NEXT:    v_and_b32_e32 v15, 0xffff0000, v39
+; GFX11-NEXT:    v_lshlrev_b32_e32 v8, 16, v12
+; GFX11-NEXT:    v_and_b32_e32 v9, 0xffff0000, v12
+; GFX11-NEXT:    v_lshlrev_b32_e32 v10, 16, v13
+; GFX11-NEXT:    v_and_b32_e32 v11, 0xffff0000, v13
+; GFX11-NEXT:    v_lshlrev_b32_e32 v12, 16, v14
+; GFX11-NEXT:    v_and_b32_e32 v13, 0xffff0000, v14
+; GFX11-NEXT:    v_lshlrev_b32_e32 v14, 16, v15
+; GFX11-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
 ; GFX11-NEXT:    s_waitcnt vmcnt(1)
-; GFX11-NEXT:    v_and_b32_e32 v17, 0xffff0000, v48
-; GFX11-NEXT:    v_and_b32_e32 v19, 0xffff0000, v49
-; GFX11-NEXT:    v_and_b32_e32 v21, 0xffff0000, v50
-; GFX11-NEXT:    v_and_b32_e32 v23, 0xffff0000, v51
+; GFX11-NEXT:    v_lshlrev_b32_e32 v16, 16, v20
+; GFX11-NEXT:    v_and_b32_e32 v17, 0xffff0000, v20
+; GFX11-NEXT:    v_lshlrev_b32_e32 v18, 16, v21
+; GFX11-NEXT:    v_and_b32_e32 v19, 0xffff0000, v21
+; GFX11-NEXT:    v_lshlrev_b32_e32 v20, 16, v22
+; GFX11-NEXT:    v_and_b32_e32 v21, 0xffff0000, v22
+; GFX11-NEXT:    v_lshlrev_b32_e32 v22, 16, v23
+; GFX11-NEXT:    v_and_b32_e32 v23, 0xffff0000, v23
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_and_b32_e32 v25, 0xffff0000, v52
-; GFX11-NEXT:    v_and_b32_e32 v27, 0xffff0000, v53
-; GFX11-NEXT:    v_and_b32_e32 v29, 0xffff0000, v54
-; GFX11-NEXT:    v_and_b32_e32 v31, 0xffff0000, v55
-; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v32
-; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v33
-; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 16, v34
-; GFX11-NEXT:    v_lshlrev_b32_e32 v6, 16, v35
-; GFX11-NEXT:    v_lshlrev_b32_e32 v8, 16, v36
-; GFX11-NEXT:    v_lshlrev_b32_e32 v10, 16, v37
-; GFX11-NEXT:    v_lshlrev_b32_e32 v12, 16, v38
-; GFX11-NEXT:    v_lshlrev_b32_e32 v14, 16, v39
-; GFX11-NEXT:    v_lshlrev_b32_e32 v16, 16, v48
-; GFX11-NEXT:    v_lshlrev_b32_e32 v18, 16, v49
-; GFX11-NEXT:    v_lshlrev_b32_e32 v20, 16, v50
-; GFX11-NEXT:    v_lshlrev_b32_e32 v22, 16, v51
-; GFX11-NEXT:    v_lshlrev_b32_e32 v24, 16, v52
-; GFX11-NEXT:    v_lshlrev_b32_e32 v26, 16, v53
-; GFX11-NEXT:    v_lshlrev_b32_e32 v28, 16, v54
-; GFX11-NEXT:    v_lshlrev_b32_e32 v30, 16, v55
+; GFX11-NEXT:    v_lshlrev_b32_e32 v24, 16, v28
+; GFX11-NEXT:    v_and_b32_e32 v25, 0xffff0000, v28
+; GFX11-NEXT:    v_lshlrev_b32_e32 v26, 16, v29
+; GFX11-NEXT:    v_and_b32_e32 v27, 0xffff0000, v29
+; GFX11-NEXT:    v_lshlrev_b32_e32 v28, 16, v30
+; GFX11-NEXT:    v_and_b32_e32 v29, 0xffff0000, v30
+; GFX11-NEXT:    v_lshlrev_b32_e32 v30, 16, v31
+; GFX11-NEXT:    v_and_b32_e32 v31, 0xffff0000, v31
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %load = load <32 x bfloat>, ptr addrspace(1) %ptr
   %fpext = fpext <32 x bfloat> %load to <32 x float>
@@ -6616,12 +6435,12 @@ define <4 x double> @global_extload_v4bf16_to_v4f64(ptr addrspace(1) %ptr) {
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
 ; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v0
-; GFX9-NEXT:    v_and_b32_e32 v4, 0xffff0000, v1
-; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
+; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 16, v1
+; GFX9-NEXT:    v_and_b32_e32 v6, 0xffff0000, v1
 ; GFX9-NEXT:    v_cvt_f64_f32_e32 v[0:1], v2
 ; GFX9-NEXT:    v_cvt_f64_f32_e32 v[2:3], v3
-; GFX9-NEXT:    v_cvt_f64_f32_e32 v[6:7], v4
-; GFX9-NEXT:    v_cvt_f64_f32_e32 v[4:5], v5
+; GFX9-NEXT:    v_cvt_f64_f32_e32 v[4:5], v4
+; GFX9-NEXT:    v_cvt_f64_f32_e32 v[6:7], v6
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: global_extload_v4bf16_to_v4f64:
@@ -6731,62 +6550,60 @@ define <5 x double> @global_extload_v5bf16_to_v5f64(ptr addrspace(1) %ptr) {
 ; GFX9-LABEL: global_extload_v5bf16_to_v5f64:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v2, 0
-; GFX9-NEXT:    global_load_short_d16_hi v2, v[0:1], off offset:8
-; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
+; GFX9-NEXT:    global_load_ushort v4, v[0:1], off offset:8
+; GFX9-NEXT:    global_load_dwordx2 v[2:3], v[0:1], off
 ; GFX9-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-NEXT:    v_cvt_f64_f32_e32 v[8:9], v2
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v4
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
-; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v0
-; GFX9-NEXT:    v_and_b32_e32 v4, 0xffff0000, v1
-; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
-; GFX9-NEXT:    v_cvt_f64_f32_e32 v[0:1], v2
-; GFX9-NEXT:    v_cvt_f64_f32_e32 v[2:3], v3
-; GFX9-NEXT:    v_cvt_f64_f32_e32 v[6:7], v4
-; GFX9-NEXT:    v_cvt_f64_f32_e32 v[4:5], v5
+; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
+; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
+; GFX9-NEXT:    v_and_b32_e32 v6, 0xffff0000, v3
+; GFX9-NEXT:    v_cvt_f64_f32_e32 v[8:9], v0
+; GFX9-NEXT:    v_cvt_f64_f32_e32 v[0:1], v1
+; GFX9-NEXT:    v_cvt_f64_f32_e32 v[2:3], v2
+; GFX9-NEXT:    v_cvt_f64_f32_e32 v[4:5], v4
+; GFX9-NEXT:    v_cvt_f64_f32_e32 v[6:7], v6
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: global_extload_v5bf16_to_v5f64:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX10-NEXT:    s_clause 0x1
 ; GFX10-NEXT:    global_load_dwordx2 v[2:3], v[0:1], off
-; GFX10-NEXT:    global_load_short_d16_hi v4, v[0:1], off offset:8
+; GFX10-NEXT:    global_load_ushort v4, v[0:1], off offset:8
 ; GFX10-NEXT:    s_waitcnt vmcnt(1)
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v2
 ; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_cvt_f64_f32_e32 v[8:9], v4
-; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
+; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 16, v3
 ; GFX10-NEXT:    v_and_b32_e32 v6, 0xffff0000, v3
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_lshlrev_b32_e32 v8, 16, v4
 ; GFX10-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
 ; GFX10-NEXT:    v_cvt_f64_f32_e32 v[2:3], v2
-; GFX10-NEXT:    v_cvt_f64_f32_e32 v[4:5], v4
+; GFX10-NEXT:    v_cvt_f64_f32_e32 v[4:5], v5
 ; GFX10-NEXT:    v_cvt_f64_f32_e32 v[6:7], v6
+; GFX10-NEXT:    v_cvt_f64_f32_e32 v[8:9], v8
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: global_extload_v5bf16_to_v5f64:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX11-NEXT:    s_clause 0x1
 ; GFX11-NEXT:    global_load_b64 v[2:3], v[0:1], off
-; GFX11-NEXT:    global_load_d16_hi_b16 v4, v[0:1], off offset:8
+; GFX11-NEXT:    global_load_u16 v4, v[0:1], off offset:8
 ; GFX11-NEXT:    s_waitcnt vmcnt(1)
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v2
 ; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_cvt_f64_f32_e32 v[8:9], v4
-; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
+; GFX11-NEXT:    v_lshlrev_b32_e32 v5, 16, v3
 ; GFX11-NEXT:    v_and_b32_e32 v6, 0xffff0000, v3
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v8, 16, v4
 ; GFX11-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
 ; GFX11-NEXT:    v_cvt_f64_f32_e32 v[2:3], v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_cvt_f64_f32_e32 v[4:5], v4
+; GFX11-NEXT:    v_cvt_f64_f32_e32 v[4:5], v5
 ; GFX11-NEXT:    v_cvt_f64_f32_e32 v[6:7], v6
+; GFX11-NEXT:    v_cvt_f64_f32_e32 v[8:9], v8
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %load = load <5 x bfloat>, ptr addrspace(1) %ptr
   %fpext = fpext <5 x bfloat> %load to <5 x double>
@@ -6864,18 +6681,18 @@ define <6 x double> @global_extload_v6bf16_to_v6f64(ptr addrspace(1) %ptr) {
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dwordx3 v[0:2], v[0:1], off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v0
-; GFX9-NEXT:    v_and_b32_e32 v4, 0xffff0000, v1
-; GFX9-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
-; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX9-NEXT:    v_lshlrev_b32_e32 v8, 16, v1
-; GFX9-NEXT:    v_lshlrev_b32_e32 v9, 16, v2
-; GFX9-NEXT:    v_cvt_f64_f32_e32 v[2:3], v3
-; GFX9-NEXT:    v_cvt_f64_f32_e32 v[6:7], v4
-; GFX9-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
-; GFX9-NEXT:    v_cvt_f64_f32_e32 v[10:11], v5
-; GFX9-NEXT:    v_cvt_f64_f32_e32 v[4:5], v8
-; GFX9-NEXT:    v_cvt_f64_f32_e32 v[8:9], v9
+; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
+; GFX9-NEXT:    v_and_b32_e32 v4, 0xffff0000, v0
+; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
+; GFX9-NEXT:    v_and_b32_e32 v6, 0xffff0000, v1
+; GFX9-NEXT:    v_lshlrev_b32_e32 v8, 16, v2
+; GFX9-NEXT:    v_and_b32_e32 v10, 0xffff0000, v2
+; GFX9-NEXT:    v_cvt_f64_f32_e32 v[0:1], v3
+; GFX9-NEXT:    v_cvt_f64_f32_e32 v[2:3], v4
+; GFX9-NEXT:    v_cvt_f64_f32_e32 v[4:5], v5
+; GFX9-NEXT:    v_cvt_f64_f32_e32 v[6:7], v6
+; GFX9-NEXT:    v_cvt_f64_f32_e32 v[8:9], v8
+; GFX9-NEXT:    v_cvt_f64_f32_e32 v[10:11], v10
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: global_extload_v6bf16_to_v6f64:
@@ -7003,22 +6820,22 @@ define <8 x double> @global_extload_v8bf16_to_v8f64(ptr addrspace(1) %ptr) {
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v4, 0xffff0000, v0
-; GFX9-NEXT:    v_and_b32_e32 v5, 0xffff0000, v1
-; GFX9-NEXT:    v_and_b32_e32 v8, 0xffff0000, v2
-; GFX9-NEXT:    v_and_b32_e32 v9, 0xffff0000, v3
-; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX9-NEXT:    v_lshlrev_b32_e32 v12, 16, v1
-; GFX9-NEXT:    v_lshlrev_b32_e32 v13, 16, v2
-; GFX9-NEXT:    v_lshlrev_b32_e32 v16, 16, v3
-; GFX9-NEXT:    v_cvt_f64_f32_e32 v[2:3], v4
-; GFX9-NEXT:    v_cvt_f64_f32_e32 v[6:7], v5
-; GFX9-NEXT:    v_cvt_f64_f32_e32 v[10:11], v8
-; GFX9-NEXT:    v_cvt_f64_f32_e32 v[14:15], v9
-; GFX9-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
-; GFX9-NEXT:    v_cvt_f64_f32_e32 v[4:5], v12
-; GFX9-NEXT:    v_cvt_f64_f32_e32 v[8:9], v13
-; GFX9-NEXT:    v_cvt_f64_f32_e32 v[12:13], v16
+; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 16, v0
+; GFX9-NEXT:    v_and_b32_e32 v5, 0xffff0000, v0
+; GFX9-NEXT:    v_lshlrev_b32_e32 v6, 16, v1
+; GFX9-NEXT:    v_and_b32_e32 v7, 0xffff0000, v1
+; GFX9-NEXT:    v_lshlrev_b32_e32 v8, 16, v2
+; GFX9-NEXT:    v_and_b32_e32 v10, 0xffff0000, v2
+; GFX9-NEXT:    v_lshlrev_b32_e32 v12, 16, v3
+; GFX9-NEXT:    v_and_b32_e32 v14, 0xffff0000, v3
+; GFX9-NEXT:    v_cvt_f64_f32_e32 v[0:1], v4
+; GFX9-NEXT:    v_cvt_f64_f32_e32 v[2:3], v5
+; GFX9-NEXT:    v_cvt_f64_f32_e32 v[4:5], v6
+; GFX9-NEXT:    v_cvt_f64_f32_e32 v[6:7], v7
+; GFX9-NEXT:    v_cvt_f64_f32_e32 v[8:9], v8
+; GFX9-NEXT:    v_cvt_f64_f32_e32 v[10:11], v10
+; GFX9-NEXT:    v_cvt_f64_f32_e32 v[12:13], v12
+; GFX9-NEXT:    v_cvt_f64_f32_e32 v[14:15], v14
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: global_extload_v8bf16_to_v8f64:
@@ -7211,39 +7028,39 @@ define <16 x double> @global_extload_v16bf16_to_v16f64(ptr addrspace(1) %ptr) {
 ; GFX9-NEXT:    global_load_dwordx4 v[2:5], v[0:1], off
 ; GFX9-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:16
 ; GFX9-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v2
-; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v3
-; GFX9-NEXT:    v_and_b32_e32 v10, 0xffff0000, v4
-; GFX9-NEXT:    v_and_b32_e32 v12, 0xffff0000, v5
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v2
+; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX9-NEXT:    v_lshlrev_b32_e32 v10, 16, v3
+; GFX9-NEXT:    v_and_b32_e32 v11, 0xffff0000, v3
+; GFX9-NEXT:    v_lshlrev_b32_e32 v12, 16, v4
+; GFX9-NEXT:    v_and_b32_e32 v13, 0xffff0000, v4
+; GFX9-NEXT:    v_lshlrev_b32_e32 v14, 16, v5
+; GFX9-NEXT:    v_and_b32_e32 v15, 0xffff0000, v5
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v13, 0xffff0000, v6
-; GFX9-NEXT:    v_and_b32_e32 v16, 0xffff0000, v7
-; GFX9-NEXT:    v_and_b32_e32 v17, 0xffff0000, v8
-; GFX9-NEXT:    v_and_b32_e32 v20, 0xffff0000, v9
-; GFX9-NEXT:    v_lshlrev_b32_e32 v21, 16, v2
-; GFX9-NEXT:    v_lshlrev_b32_e32 v24, 16, v3
-; GFX9-NEXT:    v_lshlrev_b32_e32 v25, 16, v4
-; GFX9-NEXT:    v_lshlrev_b32_e32 v28, 16, v5
-; GFX9-NEXT:    v_lshlrev_b32_e32 v29, 16, v6
-; GFX9-NEXT:    v_lshlrev_b32_e32 v32, 16, v7
-; GFX9-NEXT:    v_lshlrev_b32_e32 v33, 16, v8
-; GFX9-NEXT:    v_lshlrev_b32_e32 v34, 16, v9
-; GFX9-NEXT:    v_cvt_f64_f32_e32 v[2:3], v0
-; GFX9-NEXT:    v_cvt_f64_f32_e32 v[6:7], v1
-; GFX9-NEXT:    v_cvt_f64_f32_e32 v[10:11], v10
-; GFX9-NEXT:    v_cvt_f64_f32_e32 v[14:15], v12
-; GFX9-NEXT:    v_cvt_f64_f32_e32 v[18:19], v13
-; GFX9-NEXT:    v_cvt_f64_f32_e32 v[22:23], v16
-; GFX9-NEXT:    v_cvt_f64_f32_e32 v[26:27], v17
-; GFX9-NEXT:    v_cvt_f64_f32_e32 v[30:31], v20
-; GFX9-NEXT:    v_cvt_f64_f32_e32 v[0:1], v21
-; GFX9-NEXT:    v_cvt_f64_f32_e32 v[4:5], v24
-; GFX9-NEXT:    v_cvt_f64_f32_e32 v[8:9], v25
-; GFX9-NEXT:    v_cvt_f64_f32_e32 v[12:13], v28
-; GFX9-NEXT:    v_cvt_f64_f32_e32 v[16:17], v29
-; GFX9-NEXT:    v_cvt_f64_f32_e32 v[20:21], v32
-; GFX9-NEXT:    v_cvt_f64_f32_e32 v[24:25], v33
-; GFX9-NEXT:    v_cvt_f64_f32_e32 v[28:29], v34
+; GFX9-NEXT:    v_lshlrev_b32_e32 v16, 16, v6
+; GFX9-NEXT:    v_and_b32_e32 v18, 0xffff0000, v6
+; GFX9-NEXT:    v_lshlrev_b32_e32 v20, 16, v7
+; GFX9-NEXT:    v_and_b32_e32 v22, 0xffff0000, v7
+; GFX9-NEXT:    v_lshlrev_b32_e32 v24, 16, v8
+; GFX9-NEXT:    v_and_b32_e32 v26, 0xffff0000, v8
+; GFX9-NEXT:    v_lshlrev_b32_e32 v28, 16, v9
+; GFX9-NEXT:    v_and_b32_e32 v30, 0xffff0000, v9
+; GFX9-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
+; GFX9-NEXT:    v_cvt_f64_f32_e32 v[2:3], v2
+; GFX9-NEXT:    v_cvt_f64_f32_e32 v[4:5], v10
+; GFX9-NEXT:    v_cvt_f64_f32_e32 v[6:7], v11
+; GFX9-NEXT:    v_cvt_f64_f32_e32 v[8:9], v12
+; GFX9-NEXT:    v_cvt_f64_f32_e32 v[10:11], v13
+; GFX9-NEXT:    v_cvt_f64_f32_e32 v[12:13], v14
+; GFX9-NEXT:    v_cvt_f64_f32_e32 v[14:15], v15
+; GFX9-NEXT:    v_cvt_f64_f32_e32 v[16:17], v16
+; GFX9-NEXT:    v_cvt_f64_f32_e32 v[18:19], v18
+; GFX9-NEXT:    v_cvt_f64_f32_e32 v[20:21], v20
+; GFX9-NEXT:    v_cvt_f64_f32_e32 v[22:23], v22
+; GFX9-NEXT:    v_cvt_f64_f32_e32 v[24:25], v24
+; GFX9-NEXT:    v_cvt_f64_f32_e32 v[26:27], v26
+; GFX9-NEXT:    v_cvt_f64_f32_e32 v[28:29], v28
+; GFX9-NEXT:    v_cvt_f64_f32_e32 v[30:31], v30
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: global_extload_v16bf16_to_v16f64:
@@ -7253,39 +7070,39 @@ define <16 x double> @global_extload_v16bf16_to_v16f64(ptr addrspace(1) %ptr) {
 ; GFX10-NEXT:    global_load_dwordx4 v[2:5], v[0:1], off
 ; GFX10-NEXT:    global_load_dwordx4 v[9:12], v[0:1], off offset:16
 ; GFX10-NEXT:    s_waitcnt vmcnt(1)
-; GFX10-NEXT:    v_and_b32_e32 v6, 0xffff0000, v2
+; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v2
+; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 16, v3
 ; GFX10-NEXT:    v_and_b32_e32 v7, 0xffff0000, v3
+; GFX10-NEXT:    v_lshlrev_b32_e32 v8, 16, v4
 ; GFX10-NEXT:    v_and_b32_e32 v13, 0xffff0000, v4
-; GFX10-NEXT:    v_and_b32_e32 v14, 0xffff0000, v5
+; GFX10-NEXT:    v_lshlrev_b32_e32 v14, 16, v5
+; GFX10-NEXT:    v_and_b32_e32 v15, 0xffff0000, v5
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_and_b32_e32 v16, 0xffff0000, v9
-; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v2
-; GFX10-NEXT:    v_and_b32_e32 v17, 0xffff0000, v10
-; GFX10-NEXT:    v_and_b32_e32 v20, 0xffff0000, v11
-; GFX10-NEXT:    v_and_b32_e32 v21, 0xffff0000, v12
-; GFX10-NEXT:    v_lshlrev_b32_e32 v8, 16, v3
-; GFX10-NEXT:    v_lshlrev_b32_e32 v15, 16, v4
-; GFX10-NEXT:    v_lshlrev_b32_e32 v18, 16, v5
-; GFX10-NEXT:    v_lshlrev_b32_e32 v24, 16, v9
-; GFX10-NEXT:    v_lshlrev_b32_e32 v25, 16, v10
-; GFX10-NEXT:    v_lshlrev_b32_e32 v28, 16, v11
-; GFX10-NEXT:    v_lshlrev_b32_e32 v29, 16, v12
+; GFX10-NEXT:    v_lshlrev_b32_e32 v16, 16, v9
+; GFX10-NEXT:    v_and_b32_e32 v18, 0xffff0000, v9
+; GFX10-NEXT:    v_lshlrev_b32_e32 v20, 16, v10
+; GFX10-NEXT:    v_and_b32_e32 v22, 0xffff0000, v10
+; GFX10-NEXT:    v_lshlrev_b32_e32 v24, 16, v11
+; GFX10-NEXT:    v_and_b32_e32 v26, 0xffff0000, v11
+; GFX10-NEXT:    v_lshlrev_b32_e32 v28, 16, v12
+; GFX10-NEXT:    v_and_b32_e32 v30, 0xffff0000, v12
 ; GFX10-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
-; GFX10-NEXT:    v_cvt_f64_f32_e32 v[2:3], v6
-; GFX10-NEXT:    v_cvt_f64_f32_e32 v[4:5], v8
+; GFX10-NEXT:    v_cvt_f64_f32_e32 v[2:3], v2
+; GFX10-NEXT:    v_cvt_f64_f32_e32 v[4:5], v6
 ; GFX10-NEXT:    v_cvt_f64_f32_e32 v[6:7], v7
-; GFX10-NEXT:    v_cvt_f64_f32_e32 v[8:9], v15
+; GFX10-NEXT:    v_cvt_f64_f32_e32 v[8:9], v8
 ; GFX10-NEXT:    v_cvt_f64_f32_e32 v[10:11], v13
-; GFX10-NEXT:    v_cvt_f64_f32_e32 v[12:13], v18
-; GFX10-NEXT:    v_cvt_f64_f32_e32 v[14:15], v14
-; GFX10-NEXT:    v_cvt_f64_f32_e32 v[18:19], v16
-; GFX10-NEXT:    v_cvt_f64_f32_e32 v[22:23], v17
-; GFX10-NEXT:    v_cvt_f64_f32_e32 v[16:17], v24
-; GFX10-NEXT:    v_cvt_f64_f32_e32 v[26:27], v20
-; GFX10-NEXT:    v_cvt_f64_f32_e32 v[30:31], v21
-; GFX10-NEXT:    v_cvt_f64_f32_e32 v[20:21], v25
-; GFX10-NEXT:    v_cvt_f64_f32_e32 v[24:25], v28
-; GFX10-NEXT:    v_cvt_f64_f32_e32 v[28:29], v29
+; GFX10-NEXT:    v_cvt_f64_f32_e32 v[12:13], v14
+; GFX10-NEXT:    v_cvt_f64_f32_e32 v[14:15], v15
+; GFX10-NEXT:    v_cvt_f64_f32_e32 v[16:17], v16
+; GFX10-NEXT:    v_cvt_f64_f32_e32 v[18:19], v18
+; GFX10-NEXT:    v_cvt_f64_f32_e32 v[20:21], v20
+; GFX10-NEXT:    v_cvt_f64_f32_e32 v[22:23], v22
+; GFX10-NEXT:    v_cvt_f64_f32_e32 v[24:25], v24
+; GFX10-NEXT:    v_cvt_f64_f32_e32 v[26:27], v26
+; GFX10-NEXT:    v_cvt_f64_f32_e32 v[28:29], v28
+; GFX10-NEXT:    v_cvt_f64_f32_e32 v[30:31], v30
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: global_extload_v16bf16_to_v16f64:
@@ -8717,38 +8534,38 @@ define bfloat @v_fadd_bf16(bfloat %a, bfloat %b) {
 ; GFX8-LABEL: v_fadd_bf16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX8-NEXT:    v_add_f32_e32 v0, v0, v1
-; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_fadd_bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX9-NEXT:    v_add_f32_e32 v0, v0, v1
-; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fadd_bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX10-NEXT:    v_add_f32_e32 v0, v0, v1
-; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fadd_bf16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_add_f32_e32 v0, v0, v1
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = fadd bfloat %a, %b
   ret bfloat %op
@@ -8784,53 +8601,53 @@ define <2 x bfloat> @v_fadd_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) {
 ; GFX8-LABEL: v_fadd_v2bf16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v1
-; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v0
-; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX8-NEXT:    v_add_f32_e32 v2, v3, v2
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
+; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
+; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GFX8-NEXT:    v_add_f32_e32 v0, v0, v1
-; GFX8-NEXT:    s_mov_b32 s4, 0x3020706
-; GFX8-NEXT:    v_perm_b32 v0, v0, v2, s4
+; GFX8-NEXT:    v_add_f32_e32 v2, v3, v2
+; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX8-NEXT:    v_alignbit_b32 v0, v0, v2, 16
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_fadd_v2bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff0000, v1
-; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v0
-; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
+; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
+; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GFX9-NEXT:    v_add_f32_e32 v2, v3, v2
 ; GFX9-NEXT:    v_add_f32_e32 v0, v0, v1
-; GFX9-NEXT:    s_mov_b32 s4, 0x3020706
+; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
 ; GFX9-NEXT:    v_perm_b32 v0, v0, v2, s4
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fadd_v2bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v1
-; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v0
-; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
+; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
+; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GFX10-NEXT:    v_add_f32_e32 v2, v3, v2
 ; GFX10-NEXT:    v_add_f32_e32 v0, v0, v1
-; GFX10-NEXT:    v_perm_b32 v0, v0, v2, 0x3020706
+; GFX10-NEXT:    v_perm_b32 v0, v0, v2, 0x7060302
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fadd_v2bf16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v1
-; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v0
-; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX11-NEXT:    v_add_f32_e32 v0, v0, v1
 ; GFX11-NEXT:    v_add_f32_e32 v2, v3, v2
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_perm_b32 v0, v0, v2, 0x3020706
+; GFX11-NEXT:    v_perm_b32 v0, v0, v2, 0x7060302
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = fadd <2 x bfloat> %a, %b
   ret <2 x bfloat> %op
@@ -8874,69 +8691,82 @@ define <3 x bfloat> @v_fadd_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) {
 ; GFX8-LABEL: v_fadd_v3bf16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX8-NEXT:    v_add_f32_e32 v1, v1, v3
-; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v2
-; GFX8-NEXT:    v_and_b32_e32 v4, 0xffff0000, v0
-; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX8-NEXT:    v_add_f32_e32 v3, v4, v3
+; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
+; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GFX8-NEXT:    v_add_f32_e32 v0, v0, v2
-; GFX8-NEXT:    s_mov_b32 s4, 0x3020706
+; GFX8-NEXT:    v_add_f32_e32 v4, v5, v4
+; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX8-NEXT:    v_alignbit_b32 v0, v0, v4, 16
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
+; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v1
+; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX8-NEXT:    v_add_f32_e32 v1, v1, v3
+; GFX8-NEXT:    v_add_f32_e32 v2, v4, v2
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX8-NEXT:    v_perm_b32 v0, v0, v3, s4
+; GFX8-NEXT:    v_alignbit_b32 v1, v1, v2, 16
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_fadd_v3bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v4, 0xffff0000, v2
-; GFX9-NEXT:    v_and_b32_e32 v5, 0xffff0000, v0
-; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
+; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GFX9-NEXT:    v_add_f32_e32 v4, v5, v4
-; GFX9-NEXT:    v_add_f32_e32 v1, v1, v3
 ; GFX9-NEXT:    v_add_f32_e32 v0, v0, v2
-; GFX9-NEXT:    s_mov_b32 s4, 0x3020706
-; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
 ; GFX9-NEXT:    v_perm_b32 v0, v0, v4, s4
+; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
+; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 16, v1
+; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX9-NEXT:    v_add_f32_e32 v2, v4, v2
+; GFX9-NEXT:    v_add_f32_e32 v1, v1, v3
+; GFX9-NEXT:    v_perm_b32 v1, v1, v2, s4
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fadd_v3bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_and_b32_e32 v4, 0xffff0000, v2
-; GFX10-NEXT:    v_and_b32_e32 v5, 0xffff0000, v0
-; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
+; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 16, v3
+; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 16, v1
+; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX10-NEXT:    v_add_f32_e32 v4, v5, v4
 ; GFX10-NEXT:    v_add_f32_e32 v0, v0, v2
+; GFX10-NEXT:    v_add_f32_e32 v2, v7, v6
 ; GFX10-NEXT:    v_add_f32_e32 v1, v1, v3
-; GFX10-NEXT:    v_perm_b32 v0, v0, v4, 0x3020706
-; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX10-NEXT:    v_perm_b32 v0, v0, v4, 0x7060302
+; GFX10-NEXT:    v_perm_b32 v1, v1, v2, 0x7060302
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fadd_v3bf16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xffff0000, v2
-; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff0000, v0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v6, 16, v3
+; GFX11-NEXT:    v_lshlrev_b32_e32 v7, 16, v1
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX11-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_dual_add_f32 v1, v1, v3 :: v_dual_lshlrev_b32 v0, 16, v0
+; GFX11-NEXT:    v_dual_add_f32 v1, v1, v3 :: v_dual_and_b32 v0, 0xffff0000, v0
 ; GFX11-NEXT:    v_add_f32_e32 v0, v0, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_add_f32_e32 v4, v5, v4
-; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    v_add_f32_e32 v2, v7, v6
+; GFX11-NEXT:    v_perm_b32 v0, v0, v4, 0x7060302
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT:    v_perm_b32 v0, v0, v4, 0x3020706
+; GFX11-NEXT:    v_perm_b32 v1, v1, v2, 0x7060302
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = fadd <3 x bfloat> %a, %b
   ret <3 x bfloat> %op
@@ -8988,85 +8818,82 @@ define <4 x bfloat> @v_fadd_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
 ; GFX8-LABEL: v_fadd_v4bf16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_and_b32_e32 v4, 0xffff0000, v3
-; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v1
-; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
+; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    v_add_f32_e32 v0, v0, v2
 ; GFX8-NEXT:    v_add_f32_e32 v4, v5, v4
+; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX8-NEXT:    v_alignbit_b32 v0, v0, v4, 16
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
+; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v1
+; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX8-NEXT:    v_add_f32_e32 v1, v1, v3
-; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v2
-; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v0
-; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX8-NEXT:    v_add_f32_e32 v3, v5, v3
-; GFX8-NEXT:    v_add_f32_e32 v0, v0, v2
-; GFX8-NEXT:    s_mov_b32 s4, 0x3020706
-; GFX8-NEXT:    v_perm_b32 v0, v0, v3, s4
-; GFX8-NEXT:    v_perm_b32 v1, v1, v4, s4
+; GFX8-NEXT:    v_add_f32_e32 v2, v4, v2
+; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX8-NEXT:    v_alignbit_b32 v1, v1, v2, 16
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_fadd_v4bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
-; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
-; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
-; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
+; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GFX9-NEXT:    v_add_f32_e32 v4, v5, v4
-; GFX9-NEXT:    v_add_f32_e32 v1, v1, v3
-; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v2
-; GFX9-NEXT:    v_and_b32_e32 v5, 0xffff0000, v0
-; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX9-NEXT:    v_add_f32_e32 v3, v5, v3
 ; GFX9-NEXT:    v_add_f32_e32 v0, v0, v2
-; GFX9-NEXT:    s_mov_b32 s4, 0x3020706
+; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
+; GFX9-NEXT:    v_perm_b32 v0, v0, v4, s4
+; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
+; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 16, v1
+; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
 ; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX9-NEXT:    v_perm_b32 v0, v0, v3, s4
-; GFX9-NEXT:    v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_add_f32_e32 v2, v4, v2
+; GFX9-NEXT:    v_add_f32_e32 v1, v1, v3
+; GFX9-NEXT:    v_perm_b32 v1, v1, v2, s4
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fadd_v4bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
-; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
+; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
+; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 16, v3
+; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 16, v1
 ; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
 ; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX10-NEXT:    v_and_b32_e32 v6, 0xffff0000, v2
-; GFX10-NEXT:    v_and_b32_e32 v7, 0xffff0000, v0
-; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX10-NEXT:    v_add_f32_e32 v1, v1, v3
-; GFX10-NEXT:    v_add_f32_e32 v3, v5, v4
-; GFX10-NEXT:    v_add_f32_e32 v4, v7, v6
+; GFX10-NEXT:    v_add_f32_e32 v4, v5, v4
 ; GFX10-NEXT:    v_add_f32_e32 v0, v0, v2
-; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX10-NEXT:    v_perm_b32 v0, v0, v4, 0x3020706
-; GFX10-NEXT:    v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT:    v_add_f32_e32 v2, v7, v6
+; GFX10-NEXT:    v_add_f32_e32 v1, v1, v3
+; GFX10-NEXT:    v_perm_b32 v0, v0, v4, 0x7060302
+; GFX10-NEXT:    v_perm_b32 v1, v1, v2, 0x7060302
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fadd_v4bf16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xffff0000, v2
-; GFX11-NEXT:    v_and_b32_e32 v7, 0xffff0000, v0
-; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
-; GFX11-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_dual_add_f32 v0, v0, v2 :: v_dual_and_b32 v1, 0xffff0000, v1
+; GFX11-NEXT:    v_lshlrev_b32_e32 v6, 16, v3
+; GFX11-NEXT:    v_lshlrev_b32_e32 v7, 16, v1
 ; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
-; GFX11-NEXT:    v_add_f32_e32 v1, v1, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_dual_add_f32 v3, v7, v6 :: v_dual_add_f32 v4, v5, v4
 ; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_perm_b32 v0, v0, v3, 0x3020706
-; GFX11-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_or_b32_e32 v1, v4, v1
+; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX11-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_dual_add_f32 v1, v1, v3 :: v_dual_and_b32 v0, 0xffff0000, v0
+; GFX11-NEXT:    v_add_f32_e32 v0, v0, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_add_f32_e32 v4, v5, v4
+; GFX11-NEXT:    v_add_f32_e32 v2, v7, v6
+; GFX11-NEXT:    v_perm_b32 v0, v0, v4, 0x7060302
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT:    v_perm_b32 v1, v1, v2, 0x7060302
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = fadd <4 x bfloat> %a, %b
   ret <4 x bfloat> %op
@@ -9150,143 +8977,138 @@ define <8 x bfloat> @v_fadd_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
 ; GFX8-LABEL: v_fadd_v8bf16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_and_b32_e32 v8, 0xffff0000, v7
-; GFX8-NEXT:    v_and_b32_e32 v9, 0xffff0000, v3
-; GFX8-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
-; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX8-NEXT:    v_lshlrev_b32_e32 v8, 16, v4
+; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 16, v0
+; GFX8-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    v_add_f32_e32 v0, v0, v4
 ; GFX8-NEXT:    v_add_f32_e32 v8, v9, v8
-; GFX8-NEXT:    v_add_f32_e32 v3, v3, v7
-; GFX8-NEXT:    v_and_b32_e32 v7, 0xffff0000, v6
-; GFX8-NEXT:    v_and_b32_e32 v9, 0xffff0000, v2
-; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
-; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX8-NEXT:    v_add_f32_e32 v7, v9, v7
-; GFX8-NEXT:    v_add_f32_e32 v2, v2, v6
-; GFX8-NEXT:    v_and_b32_e32 v6, 0xffff0000, v5
-; GFX8-NEXT:    v_and_b32_e32 v9, 0xffff0000, v1
-; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX8-NEXT:    v_add_f32_e32 v6, v9, v6
+; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX8-NEXT:    v_alignbit_b32 v0, v0, v8, 16
+; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v5
+; GFX8-NEXT:    v_lshlrev_b32_e32 v8, 16, v1
+; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
+; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX8-NEXT:    v_add_f32_e32 v1, v1, v5
-; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v4
-; GFX8-NEXT:    v_and_b32_e32 v9, 0xffff0000, v0
-; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX8-NEXT:    v_add_f32_e32 v5, v9, v5
-; GFX8-NEXT:    v_add_f32_e32 v0, v0, v4
-; GFX8-NEXT:    s_mov_b32 s4, 0x3020706
-; GFX8-NEXT:    v_perm_b32 v0, v0, v5, s4
-; GFX8-NEXT:    v_perm_b32 v1, v1, v6, s4
-; GFX8-NEXT:    v_perm_b32 v2, v2, v7, s4
-; GFX8-NEXT:    v_perm_b32 v3, v3, v8, s4
+; GFX8-NEXT:    v_add_f32_e32 v4, v8, v4
+; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX8-NEXT:    v_alignbit_b32 v1, v1, v4, 16
+; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v6
+; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v2
+; GFX8-NEXT:    v_add_f32_e32 v4, v5, v4
+; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
+; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX8-NEXT:    v_add_f32_e32 v2, v2, v5
+; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX8-NEXT:    v_alignbit_b32 v2, v2, v4, 16
+; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v7
+; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v3
+; GFX8-NEXT:    v_add_f32_e32 v4, v5, v4
+; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v7
+; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX8-NEXT:    v_add_f32_e32 v3, v3, v5
+; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX8-NEXT:    v_alignbit_b32 v3, v3, v4, 16
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_fadd_v8bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v8, 0xffff0000, v7
-; GFX9-NEXT:    v_and_b32_e32 v9, 0xffff0000, v3
+; GFX9-NEXT:    v_lshlrev_b32_e32 v8, 16, v4
+; GFX9-NEXT:    v_lshlrev_b32_e32 v9, 16, v0
+; GFX9-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GFX9-NEXT:    v_add_f32_e32 v8, v9, v8
-; GFX9-NEXT:    v_and_b32_e32 v9, 0xffff0000, v5
-; GFX9-NEXT:    v_and_b32_e32 v10, 0xffff0000, v1
-; GFX9-NEXT:    v_add_f32_e32 v9, v10, v9
-; GFX9-NEXT:    v_and_b32_e32 v10, 0xffff0000, v6
-; GFX9-NEXT:    v_and_b32_e32 v11, 0xffff0000, v2
-; GFX9-NEXT:    v_add_f32_e32 v10, v11, v10
-; GFX9-NEXT:    v_and_b32_e32 v11, 0xffff0000, v4
-; GFX9-NEXT:    v_and_b32_e32 v12, 0xffff0000, v0
-; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX9-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
-; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX9-NEXT:    v_add_f32_e32 v1, v1, v5
-; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 16, v6
-; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX9-NEXT:    v_add_f32_e32 v0, v0, v4
-; GFX9-NEXT:    v_and_b32_e32 v4, 0xffff0000, v9
-; GFX9-NEXT:    v_add_f32_e32 v11, v12, v11
-; GFX9-NEXT:    v_add_f32_e32 v3, v3, v7
+; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
+; GFX9-NEXT:    v_perm_b32 v0, v0, v8, s4
+; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 16, v5
+; GFX9-NEXT:    v_lshlrev_b32_e32 v8, 16, v1
+; GFX9-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
+; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX9-NEXT:    v_add_f32_e32 v4, v8, v4
+; GFX9-NEXT:    v_add_f32_e32 v1, v1, v5
+; GFX9-NEXT:    v_perm_b32 v1, v1, v4, s4
+; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 16, v6
+; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 16, v2
+; GFX9-NEXT:    v_add_f32_e32 v4, v5, v4
+; GFX9-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
+; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX9-NEXT:    v_add_f32_e32 v2, v2, v5
-; GFX9-NEXT:    s_mov_b32 s4, 0x3020706
-; GFX9-NEXT:    v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-NEXT:    v_and_b32_e32 v4, 0xffff0000, v8
-; GFX9-NEXT:    v_perm_b32 v0, v0, v11, s4
-; GFX9-NEXT:    v_perm_b32 v2, v2, v10, s4
-; GFX9-NEXT:    v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_perm_b32 v2, v2, v4, s4
+; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 16, v7
+; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 16, v3
+; GFX9-NEXT:    v_add_f32_e32 v4, v5, v4
+; GFX9-NEXT:    v_and_b32_e32 v5, 0xffff0000, v7
+; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX9-NEXT:    v_add_f32_e32 v3, v3, v5
+; GFX9-NEXT:    v_perm_b32 v3, v3, v4, s4
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fadd_v8bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_and_b32_e32 v8, 0xffff0000, v7
-; GFX10-NEXT:    v_and_b32_e32 v9, 0xffff0000, v3
-; GFX10-NEXT:    v_and_b32_e32 v10, 0xffff0000, v5
-; GFX10-NEXT:    v_and_b32_e32 v11, 0xffff0000, v1
-; GFX10-NEXT:    v_and_b32_e32 v12, 0xffff0000, v6
-; GFX10-NEXT:    v_and_b32_e32 v13, 0xffff0000, v2
-; GFX10-NEXT:    v_and_b32_e32 v14, 0xffff0000, v4
-; GFX10-NEXT:    v_and_b32_e32 v15, 0xffff0000, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v8, 16, v4
+; GFX10-NEXT:    v_lshlrev_b32_e32 v9, 16, v0
+; GFX10-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v10, 16, v5
+; GFX10-NEXT:    v_lshlrev_b32_e32 v11, 16, v1
 ; GFX10-NEXT:    v_add_f32_e32 v8, v9, v8
-; GFX10-NEXT:    v_add_f32_e32 v9, v11, v10
-; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
-; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
-; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX10-NEXT:    v_add_f32_e32 v10, v13, v12
-; GFX10-NEXT:    v_add_f32_e32 v11, v15, v14
-; GFX10-NEXT:    v_add_f32_e32 v3, v3, v7
-; GFX10-NEXT:    v_add_f32_e32 v1, v1, v5
+; GFX10-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
 ; GFX10-NEXT:    v_add_f32_e32 v0, v0, v4
-; GFX10-NEXT:    v_and_b32_e32 v4, 0xffff0000, v9
+; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX10-NEXT:    v_add_f32_e32 v4, v11, v10
+; GFX10-NEXT:    v_lshlrev_b32_e32 v9, 16, v6
+; GFX10-NEXT:    v_lshlrev_b32_e32 v10, 16, v2
+; GFX10-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
+; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX10-NEXT:    v_lshlrev_b32_e32 v11, 16, v7
+; GFX10-NEXT:    v_lshlrev_b32_e32 v12, 16, v3
+; GFX10-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
+; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX10-NEXT:    v_add_f32_e32 v1, v1, v5
+; GFX10-NEXT:    v_add_f32_e32 v5, v10, v9
 ; GFX10-NEXT:    v_add_f32_e32 v2, v2, v6
-; GFX10-NEXT:    v_and_b32_e32 v5, 0xffff0000, v8
-; GFX10-NEXT:    v_perm_b32 v0, v0, v11, 0x3020706
-; GFX10-NEXT:    v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX10-NEXT:    v_perm_b32 v2, v2, v10, 0x3020706
-; GFX10-NEXT:    v_or_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT:    v_add_f32_e32 v6, v12, v11
+; GFX10-NEXT:    v_add_f32_e32 v3, v3, v7
+; GFX10-NEXT:    v_perm_b32 v0, v0, v8, 0x7060302
+; GFX10-NEXT:    v_perm_b32 v1, v1, v4, 0x7060302
+; GFX10-NEXT:    v_perm_b32 v2, v2, v5, 0x7060302
+; GFX10-NEXT:    v_perm_b32 v3, v3, v6, 0x7060302
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fadd_v8bf16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_and_b32_e32 v13, 0xffff0000, v4
-; GFX11-NEXT:    v_and_b32_e32 v14, 0xffff0000, v0
-; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX11-NEXT:    v_and_b32_e32 v9, 0xffff0000, v3
-; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-NEXT:    v_and_b32_e32 v10, 0xffff0000, v1
-; GFX11-NEXT:    v_and_b32_e32 v8, 0xffff0000, v7
-; GFX11-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
-; GFX11-NEXT:    v_dual_add_f32 v0, v0, v4 :: v_dual_lshlrev_b32 v1, 16, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_dual_add_f32 v8, v9, v8 :: v_dual_and_b32 v9, 0xffff0000, v5
-; GFX11-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX11-NEXT:    v_lshlrev_b32_e32 v10, 16, v5
+; GFX11-NEXT:    v_lshlrev_b32_e32 v11, 16, v1
+; GFX11-NEXT:    v_lshlrev_b32_e32 v9, 16, v0
+; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
+; GFX11-NEXT:    v_lshlrev_b32_e32 v12, 16, v3
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX11-NEXT:    v_lshlrev_b32_e32 v8, 16, v4
+; GFX11-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_dual_add_f32 v1, v1, v5 :: v_dual_and_b32 v0, 0xffff0000, v0
+; GFX11-NEXT:    v_add_f32_e32 v0, v0, v4
+; GFX11-NEXT:    v_add_f32_e32 v4, v11, v10
+; GFX11-NEXT:    v_dual_add_f32 v8, v9, v8 :: v_dual_lshlrev_b32 v9, 16, v6
+; GFX11-NEXT:    v_lshlrev_b32_e32 v10, 16, v2
+; GFX11-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX11-NEXT:    v_lshlrev_b32_e32 v11, 16, v7
+; GFX11-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
+; GFX11-NEXT:    v_add_f32_e32 v5, v10, v9
+; GFX11-NEXT:    v_perm_b32 v0, v0, v8, 0x7060302
+; GFX11-NEXT:    v_add_f32_e32 v2, v2, v6
+; GFX11-NEXT:    v_add_f32_e32 v6, v12, v11
 ; GFX11-NEXT:    v_add_f32_e32 v3, v3, v7
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_add_f32_e32 v9, v10, v9
-; GFX11-NEXT:    v_dual_add_f32 v1, v1, v5 :: v_dual_and_b32 v12, 0xffff0000, v2
-; GFX11-NEXT:    v_lshlrev_b32_e32 v5, 16, v6
-; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-NEXT:    v_and_b32_e32 v11, 0xffff0000, v6
-; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
-; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xffff0000, v9
-; GFX11-NEXT:    v_add_f32_e32 v2, v2, v5
-; GFX11-NEXT:    v_dual_add_f32 v10, v12, v11 :: v_dual_add_f32 v11, v14, v13
-; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff0000, v8
+; GFX11-NEXT:    v_perm_b32 v1, v1, v4, 0x7060302
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_or_b32_e32 v1, v1, v4
-; GFX11-NEXT:    v_perm_b32 v2, v2, v10, 0x3020706
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_perm_b32 v0, v0, v11, 0x3020706
-; GFX11-NEXT:    v_or_b32_e32 v3, v3, v5
+; GFX11-NEXT:    v_perm_b32 v2, v2, v5, 0x7060302
+; GFX11-NEXT:    v_perm_b32 v3, v3, v6, 0x7060302
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = fadd <8 x bfloat> %a, %b
   ret <8 x bfloat> %op
@@ -9438,244 +9260,252 @@ define <16 x bfloat> @v_fadd_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
 ; GFX8-LABEL: v_fadd_v16bf16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_and_b32_e32 v16, 0xffff0000, v15
-; GFX8-NEXT:    v_and_b32_e32 v17, 0xffff0000, v7
-; GFX8-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
-; GFX8-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
+; GFX8-NEXT:    v_lshlrev_b32_e32 v16, 16, v8
+; GFX8-NEXT:    v_lshlrev_b32_e32 v17, 16, v0
+; GFX8-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
+; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    v_add_f32_e32 v0, v0, v8
 ; GFX8-NEXT:    v_add_f32_e32 v16, v17, v16
-; GFX8-NEXT:    v_add_f32_e32 v7, v7, v15
-; GFX8-NEXT:    v_and_b32_e32 v15, 0xffff0000, v14
-; GFX8-NEXT:    v_and_b32_e32 v17, 0xffff0000, v6
-; GFX8-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
-; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
-; GFX8-NEXT:    v_add_f32_e32 v15, v17, v15
-; GFX8-NEXT:    v_add_f32_e32 v6, v6, v14
-; GFX8-NEXT:    v_and_b32_e32 v14, 0xffff0000, v13
-; GFX8-NEXT:    v_and_b32_e32 v17, 0xffff0000, v5
-; GFX8-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
-; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX8-NEXT:    v_add_f32_e32 v14, v17, v14
-; GFX8-NEXT:    v_add_f32_e32 v5, v5, v13
-; GFX8-NEXT:    v_and_b32_e32 v13, 0xffff0000, v12
-; GFX8-NEXT:    v_and_b32_e32 v17, 0xffff0000, v4
-; GFX8-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
-; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX8-NEXT:    v_add_f32_e32 v13, v17, v13
-; GFX8-NEXT:    v_add_f32_e32 v4, v4, v12
-; GFX8-NEXT:    v_and_b32_e32 v12, 0xffff0000, v11
-; GFX8-NEXT:    v_and_b32_e32 v17, 0xffff0000, v3
-; GFX8-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
-; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX8-NEXT:    v_add_f32_e32 v12, v17, v12
-; GFX8-NEXT:    v_add_f32_e32 v3, v3, v11
-; GFX8-NEXT:    v_and_b32_e32 v11, 0xffff0000, v10
-; GFX8-NEXT:    v_and_b32_e32 v17, 0xffff0000, v2
-; GFX8-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
-; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX8-NEXT:    v_add_f32_e32 v11, v17, v11
-; GFX8-NEXT:    v_add_f32_e32 v2, v2, v10
-; GFX8-NEXT:    v_and_b32_e32 v10, 0xffff0000, v9
-; GFX8-NEXT:    v_and_b32_e32 v17, 0xffff0000, v1
-; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
-; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX8-NEXT:    v_add_f32_e32 v10, v17, v10
+; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX8-NEXT:    v_alignbit_b32 v0, v0, v16, 16
+; GFX8-NEXT:    v_lshlrev_b32_e32 v8, 16, v9
+; GFX8-NEXT:    v_lshlrev_b32_e32 v16, 16, v1
+; GFX8-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
+; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX8-NEXT:    v_add_f32_e32 v1, v1, v9
-; GFX8-NEXT:    v_and_b32_e32 v9, 0xffff0000, v8
-; GFX8-NEXT:    v_and_b32_e32 v17, 0xffff0000, v0
-; GFX8-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
-; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX8-NEXT:    v_add_f32_e32 v9, v17, v9
-; GFX8-NEXT:    v_add_f32_e32 v0, v0, v8
-; GFX8-NEXT:    s_mov_b32 s4, 0x3020706
-; GFX8-NEXT:    v_perm_b32 v0, v0, v9, s4
-; GFX8-NEXT:    v_perm_b32 v1, v1, v10, s4
-; GFX8-NEXT:    v_perm_b32 v2, v2, v11, s4
-; GFX8-NEXT:    v_perm_b32 v3, v3, v12, s4
-; GFX8-NEXT:    v_perm_b32 v4, v4, v13, s4
-; GFX8-NEXT:    v_perm_b32 v5, v5, v14, s4
-; GFX8-NEXT:    v_perm_b32 v6, v6, v15, s4
-; GFX8-NEXT:    v_perm_b32 v7, v7, v16, s4
+; GFX8-NEXT:    v_add_f32_e32 v8, v16, v8
+; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX8-NEXT:    v_alignbit_b32 v1, v1, v8, 16
+; GFX8-NEXT:    v_lshlrev_b32_e32 v8, 16, v10
+; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 16, v2
+; GFX8-NEXT:    v_add_f32_e32 v8, v9, v8
+; GFX8-NEXT:    v_and_b32_e32 v9, 0xffff0000, v10
+; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX8-NEXT:    v_add_f32_e32 v2, v2, v9
+; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX8-NEXT:    v_alignbit_b32 v2, v2, v8, 16
+; GFX8-NEXT:    v_lshlrev_b32_e32 v8, 16, v11
+; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 16, v3
+; GFX8-NEXT:    v_add_f32_e32 v8, v9, v8
+; GFX8-NEXT:    v_and_b32_e32 v9, 0xffff0000, v11
+; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX8-NEXT:    v_add_f32_e32 v3, v3, v9
+; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX8-NEXT:    v_alignbit_b32 v3, v3, v8, 16
+; GFX8-NEXT:    v_lshlrev_b32_e32 v8, 16, v12
+; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 16, v4
+; GFX8-NEXT:    v_add_f32_e32 v8, v9, v8
+; GFX8-NEXT:    v_and_b32_e32 v9, 0xffff0000, v12
+; GFX8-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GFX8-NEXT:    v_add_f32_e32 v4, v4, v9
+; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
+; GFX8-NEXT:    v_alignbit_b32 v4, v4, v8, 16
+; GFX8-NEXT:    v_lshlrev_b32_e32 v8, 16, v13
+; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 16, v5
+; GFX8-NEXT:    v_add_f32_e32 v8, v9, v8
+; GFX8-NEXT:    v_and_b32_e32 v9, 0xffff0000, v13
+; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
+; GFX8-NEXT:    v_add_f32_e32 v5, v5, v9
+; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
+; GFX8-NEXT:    v_alignbit_b32 v5, v5, v8, 16
+; GFX8-NEXT:    v_lshlrev_b32_e32 v8, 16, v14
+; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 16, v6
+; GFX8-NEXT:    v_add_f32_e32 v8, v9, v8
+; GFX8-NEXT:    v_and_b32_e32 v9, 0xffff0000, v14
+; GFX8-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
+; GFX8-NEXT:    v_add_f32_e32 v6, v6, v9
+; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
+; GFX8-NEXT:    v_alignbit_b32 v6, v6, v8, 16
+; GFX8-NEXT:    v_lshlrev_b32_e32 v8, 16, v15
+; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 16, v7
+; GFX8-NEXT:    v_add_f32_e32 v8, v9, v8
+; GFX8-NEXT:    v_and_b32_e32 v9, 0xffff0000, v15
+; GFX8-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
+; GFX8-NEXT:    v_add_f32_e32 v7, v7, v9
+; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
+; GFX8-NEXT:    v_alignbit_b32 v7, v7, v8, 16
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_fadd_v16bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v16, 0xffff0000, v15
-; GFX9-NEXT:    v_and_b32_e32 v17, 0xffff0000, v7
+; GFX9-NEXT:    v_lshlrev_b32_e32 v16, 16, v8
+; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v0
+; GFX9-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GFX9-NEXT:    v_add_f32_e32 v16, v17, v16
-; GFX9-NEXT:    v_and_b32_e32 v17, 0xffff0000, v14
-; GFX9-NEXT:    v_and_b32_e32 v18, 0xffff0000, v6
-; GFX9-NEXT:    v_add_f32_e32 v17, v18, v17
-; GFX9-NEXT:    v_and_b32_e32 v18, 0xffff0000, v13
-; GFX9-NEXT:    v_and_b32_e32 v19, 0xffff0000, v5
-; GFX9-NEXT:    v_add_f32_e32 v18, v19, v18
-; GFX9-NEXT:    v_and_b32_e32 v19, 0xffff0000, v12
-; GFX9-NEXT:    v_and_b32_e32 v20, 0xffff0000, v4
-; GFX9-NEXT:    v_add_f32_e32 v19, v20, v19
-; GFX9-NEXT:    v_and_b32_e32 v20, 0xffff0000, v11
-; GFX9-NEXT:    v_and_b32_e32 v21, 0xffff0000, v3
-; GFX9-NEXT:    v_add_f32_e32 v20, v21, v20
-; GFX9-NEXT:    v_and_b32_e32 v21, 0xffff0000, v10
-; GFX9-NEXT:    v_and_b32_e32 v22, 0xffff0000, v2
-; GFX9-NEXT:    v_add_f32_e32 v21, v22, v21
-; GFX9-NEXT:    v_and_b32_e32 v22, 0xffff0000, v9
-; GFX9-NEXT:    v_and_b32_e32 v23, 0xffff0000, v1
-; GFX9-NEXT:    v_add_f32_e32 v22, v23, v22
-; GFX9-NEXT:    v_and_b32_e32 v23, 0xffff0000, v8
-; GFX9-NEXT:    v_and_b32_e32 v24, 0xffff0000, v0
-; GFX9-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
-; GFX9-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
-; GFX9-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
-; GFX9-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
-; GFX9-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
-; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX9-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
-; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX9-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
-; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX9-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
-; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX9-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
-; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX9-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
-; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX9-NEXT:    v_add_f32_e32 v23, v24, v23
-; GFX9-NEXT:    v_add_f32_e32 v7, v7, v15
-; GFX9-NEXT:    v_add_f32_e32 v6, v6, v14
-; GFX9-NEXT:    v_add_f32_e32 v5, v5, v13
-; GFX9-NEXT:    v_add_f32_e32 v4, v4, v12
-; GFX9-NEXT:    v_add_f32_e32 v3, v3, v11
-; GFX9-NEXT:    v_add_f32_e32 v2, v2, v10
-; GFX9-NEXT:    v_add_f32_e32 v1, v1, v9
 ; GFX9-NEXT:    v_add_f32_e32 v0, v0, v8
-; GFX9-NEXT:    s_mov_b32 s4, 0x3020706
-; GFX9-NEXT:    v_perm_b32 v0, v0, v23, s4
-; GFX9-NEXT:    v_perm_b32 v1, v1, v22, s4
-; GFX9-NEXT:    v_perm_b32 v2, v2, v21, s4
-; GFX9-NEXT:    v_perm_b32 v3, v3, v20, s4
-; GFX9-NEXT:    v_perm_b32 v4, v4, v19, s4
-; GFX9-NEXT:    v_perm_b32 v5, v5, v18, s4
-; GFX9-NEXT:    v_perm_b32 v6, v6, v17, s4
-; GFX9-NEXT:    v_perm_b32 v7, v7, v16, s4
+; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
+; GFX9-NEXT:    v_perm_b32 v0, v0, v16, s4
+; GFX9-NEXT:    v_lshlrev_b32_e32 v8, 16, v9
+; GFX9-NEXT:    v_lshlrev_b32_e32 v16, 16, v1
+; GFX9-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
+; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX9-NEXT:    v_add_f32_e32 v8, v16, v8
+; GFX9-NEXT:    v_add_f32_e32 v1, v1, v9
+; GFX9-NEXT:    v_perm_b32 v1, v1, v8, s4
+; GFX9-NEXT:    v_lshlrev_b32_e32 v8, 16, v10
+; GFX9-NEXT:    v_lshlrev_b32_e32 v9, 16, v2
+; GFX9-NEXT:    v_add_f32_e32 v8, v9, v8
+; GFX9-NEXT:    v_and_b32_e32 v9, 0xffff0000, v10
+; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX9-NEXT:    v_add_f32_e32 v2, v2, v9
+; GFX9-NEXT:    v_perm_b32 v2, v2, v8, s4
+; GFX9-NEXT:    v_lshlrev_b32_e32 v8, 16, v11
+; GFX9-NEXT:    v_lshlrev_b32_e32 v9, 16, v3
+; GFX9-NEXT:    v_add_f32_e32 v8, v9, v8
+; GFX9-NEXT:    v_and_b32_e32 v9, 0xffff0000, v11
+; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX9-NEXT:    v_add_f32_e32 v3, v3, v9
+; GFX9-NEXT:    v_perm_b32 v3, v3, v8, s4
+; GFX9-NEXT:    v_lshlrev_b32_e32 v8, 16, v12
+; GFX9-NEXT:    v_lshlrev_b32_e32 v9, 16, v4
+; GFX9-NEXT:    v_add_f32_e32 v8, v9, v8
+; GFX9-NEXT:    v_and_b32_e32 v9, 0xffff0000, v12
+; GFX9-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GFX9-NEXT:    v_add_f32_e32 v4, v4, v9
+; GFX9-NEXT:    v_perm_b32 v4, v4, v8, s4
+; GFX9-NEXT:    v_lshlrev_b32_e32 v8, 16, v13
+; GFX9-NEXT:    v_lshlrev_b32_e32 v9, 16, v5
+; GFX9-NEXT:    v_add_f32_e32 v8, v9, v8
+; GFX9-NEXT:    v_and_b32_e32 v9, 0xffff0000, v13
+; GFX9-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
+; GFX9-NEXT:    v_add_f32_e32 v5, v5, v9
+; GFX9-NEXT:    v_perm_b32 v5, v5, v8, s4
+; GFX9-NEXT:    v_lshlrev_b32_e32 v8, 16, v14
+; GFX9-NEXT:    v_lshlrev_b32_e32 v9, 16, v6
+; GFX9-NEXT:    v_add_f32_e32 v8, v9, v8
+; GFX9-NEXT:    v_and_b32_e32 v9, 0xffff0000, v14
+; GFX9-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
+; GFX9-NEXT:    v_add_f32_e32 v6, v6, v9
+; GFX9-NEXT:    v_perm_b32 v6, v6, v8, s4
+; GFX9-NEXT:    v_lshlrev_b32_e32 v8, 16, v15
+; GFX9-NEXT:    v_lshlrev_b32_e32 v9, 16, v7
+; GFX9-NEXT:    v_add_f32_e32 v8, v9, v8
+; GFX9-NEXT:    v_and_b32_e32 v9, 0xffff0000, v15
+; GFX9-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
+; GFX9-NEXT:    v_add_f32_e32 v7, v7, v9
+; GFX9-NEXT:    v_perm_b32 v7, v7, v8, s4
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fadd_v16bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_and_b32_e32 v16, 0xffff0000, v15
-; GFX10-NEXT:    v_and_b32_e32 v17, 0xffff0000, v7
-; GFX10-NEXT:    v_and_b32_e32 v18, 0xffff0000, v6
-; GFX10-NEXT:    v_and_b32_e32 v19, 0xffff0000, v13
-; GFX10-NEXT:    v_and_b32_e32 v20, 0xffff0000, v5
-; GFX10-NEXT:    v_and_b32_e32 v21, 0xffff0000, v12
+; GFX10-NEXT:    v_lshlrev_b32_e32 v16, 16, v8
+; GFX10-NEXT:    v_lshlrev_b32_e32 v17, 16, v0
+; GFX10-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v18, 16, v1
+; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX10-NEXT:    v_add_f32_e32 v16, v17, v16
-; GFX10-NEXT:    v_and_b32_e32 v17, 0xffff0000, v14
-; GFX10-NEXT:    v_and_b32_e32 v22, 0xffff0000, v4
-; GFX10-NEXT:    v_and_b32_e32 v23, 0xffff0000, v11
-; GFX10-NEXT:    v_and_b32_e32 v24, 0xffff0000, v3
-; GFX10-NEXT:    v_and_b32_e32 v25, 0xffff0000, v10
-; GFX10-NEXT:    v_and_b32_e32 v26, 0xffff0000, v2
-; GFX10-NEXT:    v_add_f32_e32 v17, v18, v17
-; GFX10-NEXT:    v_add_f32_e32 v18, v20, v19
-; GFX10-NEXT:    v_add_f32_e32 v19, v22, v21
-; GFX10-NEXT:    v_add_f32_e32 v20, v24, v23
-; GFX10-NEXT:    v_add_f32_e32 v21, v26, v25
-; GFX10-NEXT:    v_and_b32_e32 v22, 0xffff0000, v9
-; GFX10-NEXT:    v_and_b32_e32 v23, 0xffff0000, v1
-; GFX10-NEXT:    v_and_b32_e32 v24, 0xffff0000, v8
-; GFX10-NEXT:    v_and_b32_e32 v25, 0xffff0000, v0
-; GFX10-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
-; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
-; GFX10-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
-; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
-; GFX10-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
-; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX10-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
-; GFX10-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
-; GFX10-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
-; GFX10-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
-; GFX10-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
-; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX10-NEXT:    v_add_f32_e32 v22, v23, v22
-; GFX10-NEXT:    v_add_f32_e32 v23, v25, v24
-; GFX10-NEXT:    v_add_f32_e32 v7, v7, v15
-; GFX10-NEXT:    v_add_f32_e32 v6, v6, v14
-; GFX10-NEXT:    v_add_f32_e32 v5, v5, v13
+; GFX10-NEXT:    v_lshlrev_b32_e32 v17, 16, v9
+; GFX10-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
+; GFX10-NEXT:    v_lshlrev_b32_e32 v19, 16, v10
+; GFX10-NEXT:    v_lshlrev_b32_e32 v20, 16, v2
+; GFX10-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
+; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX10-NEXT:    v_add_f32_e32 v0, v0, v8
+; GFX10-NEXT:    v_add_f32_e32 v8, v18, v17
 ; GFX10-NEXT:    v_add_f32_e32 v1, v1, v9
+; GFX10-NEXT:    v_add_f32_e32 v9, v20, v19
 ; GFX10-NEXT:    v_add_f32_e32 v2, v2, v10
-; GFX10-NEXT:    v_add_f32_e32 v3, v3, v11
+; GFX10-NEXT:    v_perm_b32 v0, v0, v16, 0x7060302
+; GFX10-NEXT:    v_and_b32_e32 v10, 0xffff0000, v11
+; GFX10-NEXT:    v_perm_b32 v1, v1, v8, 0x7060302
+; GFX10-NEXT:    v_lshlrev_b32_e32 v8, 16, v11
+; GFX10-NEXT:    v_perm_b32 v2, v2, v9, 0x7060302
+; GFX10-NEXT:    v_lshlrev_b32_e32 v9, 16, v3
+; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX10-NEXT:    v_lshlrev_b32_e32 v11, 16, v12
+; GFX10-NEXT:    v_lshlrev_b32_e32 v16, 16, v4
+; GFX10-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
+; GFX10-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GFX10-NEXT:    v_lshlrev_b32_e32 v17, 16, v13
+; GFX10-NEXT:    v_lshlrev_b32_e32 v18, 16, v5
+; GFX10-NEXT:    v_add_f32_e32 v8, v9, v8
+; GFX10-NEXT:    v_add_f32_e32 v3, v3, v10
+; GFX10-NEXT:    v_add_f32_e32 v9, v16, v11
 ; GFX10-NEXT:    v_add_f32_e32 v4, v4, v12
-; GFX10-NEXT:    v_perm_b32 v0, v0, v23, 0x3020706
-; GFX10-NEXT:    v_perm_b32 v1, v1, v22, 0x3020706
-; GFX10-NEXT:    v_perm_b32 v2, v2, v21, 0x3020706
-; GFX10-NEXT:    v_perm_b32 v3, v3, v20, 0x3020706
-; GFX10-NEXT:    v_perm_b32 v4, v4, v19, 0x3020706
-; GFX10-NEXT:    v_perm_b32 v5, v5, v18, 0x3020706
-; GFX10-NEXT:    v_perm_b32 v6, v6, v17, 0x3020706
-; GFX10-NEXT:    v_perm_b32 v7, v7, v16, 0x3020706
+; GFX10-NEXT:    v_add_f32_e32 v10, v18, v17
+; GFX10-NEXT:    v_and_b32_e32 v11, 0xffff0000, v13
+; GFX10-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
+; GFX10-NEXT:    v_lshlrev_b32_e32 v12, 16, v14
+; GFX10-NEXT:    v_lshlrev_b32_e32 v13, 16, v6
+; GFX10-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
+; GFX10-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
+; GFX10-NEXT:    v_lshlrev_b32_e32 v16, 16, v15
+; GFX10-NEXT:    v_lshlrev_b32_e32 v17, 16, v7
+; GFX10-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
+; GFX10-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
+; GFX10-NEXT:    v_add_f32_e32 v5, v5, v11
+; GFX10-NEXT:    v_add_f32_e32 v11, v13, v12
+; GFX10-NEXT:    v_add_f32_e32 v6, v6, v14
+; GFX10-NEXT:    v_add_f32_e32 v12, v17, v16
+; GFX10-NEXT:    v_add_f32_e32 v7, v7, v15
+; GFX10-NEXT:    v_perm_b32 v3, v3, v8, 0x7060302
+; GFX10-NEXT:    v_perm_b32 v4, v4, v9, 0x7060302
+; GFX10-NEXT:    v_perm_b32 v5, v5, v10, 0x7060302
+; GFX10-NEXT:    v_perm_b32 v6, v6, v11, 0x7060302
+; GFX10-NEXT:    v_perm_b32 v7, v7, v12, 0x7060302
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fadd_v16bf16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_and_b32_e32 v17, 0xffff0000, v7
-; GFX11-NEXT:    v_and_b32_e32 v20, 0xffff0000, v5
-; GFX11-NEXT:    v_and_b32_e32 v23, 0xffff0000, v11
-; GFX11-NEXT:    v_and_b32_e32 v25, 0xffff0000, v10
-; GFX11-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
-; GFX11-NEXT:    v_and_b32_e32 v26, 0xffff0000, v2
-; GFX11-NEXT:    v_and_b32_e32 v22, 0xffff0000, v4
-; GFX11-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX11-NEXT:    v_and_b32_e32 v24, 0xffff0000, v3
-; GFX11-NEXT:    v_and_b32_e32 v19, 0xffff0000, v13
-; GFX11-NEXT:    v_and_b32_e32 v21, 0xffff0000, v12
-; GFX11-NEXT:    v_and_b32_e32 v18, 0xffff0000, v6
-; GFX11-NEXT:    v_and_b32_e32 v16, 0xffff0000, v15
-; GFX11-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
-; GFX11-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
-; GFX11-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
-; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX11-NEXT:    v_dual_add_f32 v16, v17, v16 :: v_dual_and_b32 v17, 0xffff0000, v14
-; GFX11-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
-; GFX11-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_dual_add_f32 v4, v4, v12 :: v_dual_add_f32 v5, v5, v13
-; GFX11-NEXT:    v_dual_add_f32 v17, v18, v17 :: v_dual_add_f32 v18, v20, v19
-; GFX11-NEXT:    v_add_f32_e32 v19, v22, v21
-; GFX11-NEXT:    v_add_f32_e32 v7, v7, v15
-; GFX11-NEXT:    v_add_f32_e32 v21, v26, v25
-; GFX11-NEXT:    v_dual_add_f32 v6, v6, v14 :: v_dual_and_b32 v25, 0xffff0000, v0
-; GFX11-NEXT:    v_add_f32_e32 v20, v24, v23
-; GFX11-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
-; GFX11-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
-; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-NEXT:    v_and_b32_e32 v23, 0xffff0000, v1
-; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-NEXT:    v_and_b32_e32 v22, 0xffff0000, v9
-; GFX11-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
-; GFX11-NEXT:    v_and_b32_e32 v24, 0xffff0000, v8
-; GFX11-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
-; GFX11-NEXT:    v_dual_add_f32 v2, v2, v10 :: v_dual_add_f32 v3, v3, v11
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_dual_add_f32 v1, v1, v9 :: v_dual_add_f32 v22, v23, v22
-; GFX11-NEXT:    v_add_f32_e32 v23, v25, v24
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v18, 16, v1
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX11-NEXT:    v_lshlrev_b32_e32 v16, 16, v8
+; GFX11-NEXT:    v_lshlrev_b32_e32 v19, 16, v10
+; GFX11-NEXT:    v_lshlrev_b32_e32 v20, 16, v2
+; GFX11-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX11-NEXT:    v_lshlrev_b32_e32 v17, 16, v0
+; GFX11-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_add_f32_e32 v2, v2, v10
+; GFX11-NEXT:    v_dual_add_f32 v16, v17, v16 :: v_dual_lshlrev_b32 v17, 16, v9
+; GFX11-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_dual_add_f32 v1, v1, v9 :: v_dual_and_b32 v10, 0xffff0000, v11
+; GFX11-NEXT:    v_add_f32_e32 v9, v20, v19
+; GFX11-NEXT:    v_perm_b32 v2, v2, v9, 0x7060302
+; GFX11-NEXT:    v_lshlrev_b32_e32 v9, 16, v3
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_dual_add_f32 v3, v3, v10 :: v_dual_and_b32 v0, 0xffff0000, v0
 ; GFX11-NEXT:    v_add_f32_e32 v0, v0, v8
-; GFX11-NEXT:    v_perm_b32 v2, v2, v21, 0x3020706
-; GFX11-NEXT:    v_perm_b32 v3, v3, v20, 0x3020706
-; GFX11-NEXT:    v_perm_b32 v1, v1, v22, 0x3020706
-; GFX11-NEXT:    v_perm_b32 v4, v4, v19, 0x3020706
-; GFX11-NEXT:    v_perm_b32 v0, v0, v23, 0x3020706
-; GFX11-NEXT:    v_perm_b32 v5, v5, v18, 0x3020706
-; GFX11-NEXT:    v_perm_b32 v6, v6, v17, 0x3020706
-; GFX11-NEXT:    v_perm_b32 v7, v7, v16, 0x3020706
+; GFX11-NEXT:    v_add_f32_e32 v8, v18, v17
+; GFX11-NEXT:    v_lshlrev_b32_e32 v17, 16, v13
+; GFX11-NEXT:    v_lshlrev_b32_e32 v18, 16, v5
+; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
+; GFX11-NEXT:    v_perm_b32 v0, v0, v16, 0x7060302
+; GFX11-NEXT:    v_perm_b32 v1, v1, v8, 0x7060302
+; GFX11-NEXT:    v_lshlrev_b32_e32 v8, 16, v11
+; GFX11-NEXT:    v_lshlrev_b32_e32 v11, 16, v12
+; GFX11-NEXT:    v_lshlrev_b32_e32 v16, 16, v4
+; GFX11-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
+; GFX11-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GFX11-NEXT:    v_dual_add_f32 v10, v18, v17 :: v_dual_lshlrev_b32 v17, 16, v7
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT:    v_dual_add_f32 v4, v4, v12 :: v_dual_and_b32 v7, 0xffff0000, v7
+; GFX11-NEXT:    v_dual_add_f32 v8, v9, v8 :: v_dual_add_f32 v9, v16, v11
+; GFX11-NEXT:    v_lshlrev_b32_e32 v12, 16, v14
+; GFX11-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
+; GFX11-NEXT:    v_lshlrev_b32_e32 v16, 16, v15
+; GFX11-NEXT:    v_and_b32_e32 v11, 0xffff0000, v13
+; GFX11-NEXT:    v_lshlrev_b32_e32 v13, 16, v6
+; GFX11-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
+; GFX11-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
+; GFX11-NEXT:    v_perm_b32 v3, v3, v8, 0x7060302
+; GFX11-NEXT:    v_add_f32_e32 v5, v5, v11
+; GFX11-NEXT:    v_add_f32_e32 v11, v13, v12
+; GFX11-NEXT:    v_add_f32_e32 v12, v17, v16
+; GFX11-NEXT:    v_dual_add_f32 v6, v6, v14 :: v_dual_add_f32 v7, v7, v15
+; GFX11-NEXT:    v_perm_b32 v4, v4, v9, 0x7060302
+; GFX11-NEXT:    v_perm_b32 v5, v5, v10, 0x7060302
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_perm_b32 v6, v6, v11, 0x7060302
+; GFX11-NEXT:    v_perm_b32 v7, v7, v12, 0x7060302
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = fadd <16 x bfloat> %a, %b
   ret <16 x bfloat> %op
@@ -10083,493 +9913,484 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
 ; GFX8-LABEL: v_fadd_v32bf16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_and_b32_e32 v31, 0xffff0000, v30
-; GFX8-NEXT:    v_and_b32_e32 v32, 0xffff0000, v14
-; GFX8-NEXT:    v_lshlrev_b32_e32 v30, 16, v30
-; GFX8-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
+; GFX8-NEXT:    v_lshlrev_b32_e32 v31, 16, v16
+; GFX8-NEXT:    v_lshlrev_b32_e32 v32, 16, v0
+; GFX8-NEXT:    v_and_b32_e32 v16, 0xffff0000, v16
+; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    v_add_f32_e32 v0, v0, v16
 ; GFX8-NEXT:    v_add_f32_e32 v31, v32, v31
-; GFX8-NEXT:    v_add_f32_e32 v14, v14, v30
-; GFX8-NEXT:    v_and_b32_e32 v30, 0xffff0000, v29
-; GFX8-NEXT:    v_and_b32_e32 v32, 0xffff0000, v13
-; GFX8-NEXT:    v_lshlrev_b32_e32 v29, 16, v29
-; GFX8-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
-; GFX8-NEXT:    v_add_f32_e32 v30, v32, v30
-; GFX8-NEXT:    v_add_f32_e32 v13, v13, v29
-; GFX8-NEXT:    v_and_b32_e32 v29, 0xffff0000, v28
-; GFX8-NEXT:    v_and_b32_e32 v32, 0xffff0000, v12
-; GFX8-NEXT:    v_lshlrev_b32_e32 v28, 16, v28
-; GFX8-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
-; GFX8-NEXT:    v_add_f32_e32 v29, v32, v29
-; GFX8-NEXT:    v_add_f32_e32 v12, v12, v28
-; GFX8-NEXT:    v_and_b32_e32 v28, 0xffff0000, v27
-; GFX8-NEXT:    v_and_b32_e32 v32, 0xffff0000, v11
-; GFX8-NEXT:    v_lshlrev_b32_e32 v27, 16, v27
-; GFX8-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
-; GFX8-NEXT:    v_add_f32_e32 v28, v32, v28
-; GFX8-NEXT:    v_add_f32_e32 v11, v11, v27
-; GFX8-NEXT:    v_and_b32_e32 v27, 0xffff0000, v26
-; GFX8-NEXT:    v_and_b32_e32 v32, 0xffff0000, v10
-; GFX8-NEXT:    v_lshlrev_b32_e32 v26, 16, v26
-; GFX8-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
-; GFX8-NEXT:    v_add_f32_e32 v27, v32, v27
-; GFX8-NEXT:    v_add_f32_e32 v10, v10, v26
-; GFX8-NEXT:    v_and_b32_e32 v26, 0xffff0000, v25
-; GFX8-NEXT:    v_and_b32_e32 v32, 0xffff0000, v9
-; GFX8-NEXT:    v_lshlrev_b32_e32 v25, 16, v25
-; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
-; GFX8-NEXT:    v_add_f32_e32 v26, v32, v26
-; GFX8-NEXT:    v_add_f32_e32 v9, v9, v25
-; GFX8-NEXT:    v_and_b32_e32 v25, 0xffff0000, v24
-; GFX8-NEXT:    v_and_b32_e32 v32, 0xffff0000, v8
-; GFX8-NEXT:    v_lshlrev_b32_e32 v24, 16, v24
-; GFX8-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
-; GFX8-NEXT:    v_add_f32_e32 v8, v8, v24
-; GFX8-NEXT:    buffer_load_dword v24, off, s[0:3], s32
-; GFX8-NEXT:    v_add_f32_e32 v25, v32, v25
-; GFX8-NEXT:    v_and_b32_e32 v32, 0xffff0000, v15
-; GFX8-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
-; GFX8-NEXT:    s_mov_b32 s4, 0x3020706
-; GFX8-NEXT:    v_perm_b32 v8, v8, v25, s4
-; GFX8-NEXT:    v_perm_b32 v9, v9, v26, s4
-; GFX8-NEXT:    v_perm_b32 v10, v10, v27, s4
-; GFX8-NEXT:    v_perm_b32 v11, v11, v28, s4
-; GFX8-NEXT:    v_perm_b32 v12, v12, v29, s4
-; GFX8-NEXT:    v_perm_b32 v13, v13, v30, s4
-; GFX8-NEXT:    v_perm_b32 v14, v14, v31, s4
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_and_b32_e32 v33, 0xffff0000, v24
-; GFX8-NEXT:    v_lshlrev_b32_e32 v24, 16, v24
-; GFX8-NEXT:    v_add_f32_e32 v32, v32, v33
-; GFX8-NEXT:    v_add_f32_e32 v15, v15, v24
-; GFX8-NEXT:    v_and_b32_e32 v24, 0xffff0000, v23
-; GFX8-NEXT:    v_and_b32_e32 v33, 0xffff0000, v7
-; GFX8-NEXT:    v_lshlrev_b32_e32 v23, 16, v23
-; GFX8-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
-; GFX8-NEXT:    v_add_f32_e32 v24, v33, v24
-; GFX8-NEXT:    v_add_f32_e32 v7, v7, v23
-; GFX8-NEXT:    v_and_b32_e32 v23, 0xffff0000, v22
-; GFX8-NEXT:    v_and_b32_e32 v33, 0xffff0000, v6
-; GFX8-NEXT:    v_lshlrev_b32_e32 v22, 16, v22
-; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
-; GFX8-NEXT:    v_add_f32_e32 v23, v33, v23
-; GFX8-NEXT:    v_add_f32_e32 v6, v6, v22
-; GFX8-NEXT:    v_and_b32_e32 v22, 0xffff0000, v21
-; GFX8-NEXT:    v_and_b32_e32 v33, 0xffff0000, v5
-; GFX8-NEXT:    v_lshlrev_b32_e32 v21, 16, v21
-; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX8-NEXT:    v_add_f32_e32 v22, v33, v22
-; GFX8-NEXT:    v_add_f32_e32 v5, v5, v21
-; GFX8-NEXT:    v_and_b32_e32 v21, 0xffff0000, v20
-; GFX8-NEXT:    v_and_b32_e32 v33, 0xffff0000, v4
-; GFX8-NEXT:    v_lshlrev_b32_e32 v20, 16, v20
-; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX8-NEXT:    v_add_f32_e32 v21, v33, v21
-; GFX8-NEXT:    v_add_f32_e32 v4, v4, v20
-; GFX8-NEXT:    v_and_b32_e32 v20, 0xffff0000, v19
-; GFX8-NEXT:    v_and_b32_e32 v33, 0xffff0000, v3
-; GFX8-NEXT:    v_lshlrev_b32_e32 v19, 16, v19
-; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX8-NEXT:    v_add_f32_e32 v20, v33, v20
-; GFX8-NEXT:    v_add_f32_e32 v3, v3, v19
-; GFX8-NEXT:    v_and_b32_e32 v19, 0xffff0000, v18
-; GFX8-NEXT:    v_and_b32_e32 v33, 0xffff0000, v2
-; GFX8-NEXT:    v_lshlrev_b32_e32 v18, 16, v18
-; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX8-NEXT:    v_add_f32_e32 v19, v33, v19
-; GFX8-NEXT:    v_add_f32_e32 v2, v2, v18
-; GFX8-NEXT:    v_and_b32_e32 v18, 0xffff0000, v17
-; GFX8-NEXT:    v_and_b32_e32 v33, 0xffff0000, v1
-; GFX8-NEXT:    v_lshlrev_b32_e32 v17, 16, v17
-; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX8-NEXT:    v_add_f32_e32 v18, v33, v18
+; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX8-NEXT:    v_alignbit_b32 v0, v0, v31, 16
+; GFX8-NEXT:    v_lshlrev_b32_e32 v16, 16, v17
+; GFX8-NEXT:    v_lshlrev_b32_e32 v31, 16, v1
+; GFX8-NEXT:    v_and_b32_e32 v17, 0xffff0000, v17
+; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX8-NEXT:    v_add_f32_e32 v1, v1, v17
-; GFX8-NEXT:    v_and_b32_e32 v17, 0xffff0000, v16
-; GFX8-NEXT:    v_and_b32_e32 v33, 0xffff0000, v0
-; GFX8-NEXT:    v_lshlrev_b32_e32 v16, 16, v16
-; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX8-NEXT:    v_add_f32_e32 v17, v33, v17
-; GFX8-NEXT:    v_add_f32_e32 v0, v0, v16
-; GFX8-NEXT:    v_perm_b32 v0, v0, v17, s4
-; GFX8-NEXT:    v_perm_b32 v1, v1, v18, s4
-; GFX8-NEXT:    v_perm_b32 v2, v2, v19, s4
-; GFX8-NEXT:    v_perm_b32 v3, v3, v20, s4
-; GFX8-NEXT:    v_perm_b32 v4, v4, v21, s4
-; GFX8-NEXT:    v_perm_b32 v5, v5, v22, s4
-; GFX8-NEXT:    v_perm_b32 v6, v6, v23, s4
-; GFX8-NEXT:    v_perm_b32 v7, v7, v24, s4
-; GFX8-NEXT:    v_perm_b32 v15, v15, v32, s4
+; GFX8-NEXT:    v_add_f32_e32 v16, v31, v16
+; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX8-NEXT:    v_alignbit_b32 v1, v1, v16, 16
+; GFX8-NEXT:    v_lshlrev_b32_e32 v16, 16, v18
+; GFX8-NEXT:    v_lshlrev_b32_e32 v17, 16, v2
+; GFX8-NEXT:    v_add_f32_e32 v16, v17, v16
+; GFX8-NEXT:    v_and_b32_e32 v17, 0xffff0000, v18
+; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX8-NEXT:    v_add_f32_e32 v2, v2, v17
+; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX8-NEXT:    v_alignbit_b32 v2, v2, v16, 16
+; GFX8-NEXT:    v_lshlrev_b32_e32 v16, 16, v19
+; GFX8-NEXT:    v_lshlrev_b32_e32 v17, 16, v3
+; GFX8-NEXT:    v_add_f32_e32 v16, v17, v16
+; GFX8-NEXT:    v_and_b32_e32 v17, 0xffff0000, v19
+; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX8-NEXT:    v_add_f32_e32 v3, v3, v17
+; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX8-NEXT:    v_alignbit_b32 v3, v3, v16, 16
+; GFX8-NEXT:    v_lshlrev_b32_e32 v16, 16, v20
+; GFX8-NEXT:    v_lshlrev_b32_e32 v17, 16, v4
+; GFX8-NEXT:    v_add_f32_e32 v16, v17, v16
+; GFX8-NEXT:    v_and_b32_e32 v17, 0xffff0000, v20
+; GFX8-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GFX8-NEXT:    v_add_f32_e32 v4, v4, v17
+; GFX8-NEXT:    buffer_load_dword v17, off, s[0:3], s32
+; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
+; GFX8-NEXT:    v_alignbit_b32 v4, v4, v16, 16
+; GFX8-NEXT:    v_lshlrev_b32_e32 v16, 16, v21
+; GFX8-NEXT:    v_lshlrev_b32_e32 v18, 16, v5
+; GFX8-NEXT:    v_add_f32_e32 v16, v18, v16
+; GFX8-NEXT:    v_and_b32_e32 v18, 0xffff0000, v21
+; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
+; GFX8-NEXT:    v_add_f32_e32 v5, v5, v18
+; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
+; GFX8-NEXT:    v_alignbit_b32 v5, v5, v16, 16
+; GFX8-NEXT:    v_lshlrev_b32_e32 v16, 16, v22
+; GFX8-NEXT:    v_lshlrev_b32_e32 v18, 16, v6
+; GFX8-NEXT:    v_add_f32_e32 v16, v18, v16
+; GFX8-NEXT:    v_and_b32_e32 v18, 0xffff0000, v22
+; GFX8-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
+; GFX8-NEXT:    v_add_f32_e32 v6, v6, v18
+; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
+; GFX8-NEXT:    v_alignbit_b32 v6, v6, v16, 16
+; GFX8-NEXT:    v_lshlrev_b32_e32 v16, 16, v23
+; GFX8-NEXT:    v_lshlrev_b32_e32 v18, 16, v7
+; GFX8-NEXT:    v_add_f32_e32 v16, v18, v16
+; GFX8-NEXT:    v_and_b32_e32 v18, 0xffff0000, v23
+; GFX8-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
+; GFX8-NEXT:    v_add_f32_e32 v7, v7, v18
+; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
+; GFX8-NEXT:    v_alignbit_b32 v7, v7, v16, 16
+; GFX8-NEXT:    v_lshlrev_b32_e32 v16, 16, v24
+; GFX8-NEXT:    v_lshlrev_b32_e32 v18, 16, v8
+; GFX8-NEXT:    v_add_f32_e32 v16, v18, v16
+; GFX8-NEXT:    v_and_b32_e32 v18, 0xffff0000, v24
+; GFX8-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
+; GFX8-NEXT:    v_add_f32_e32 v8, v8, v18
+; GFX8-NEXT:    v_lshrrev_b32_e32 v8, 16, v8
+; GFX8-NEXT:    v_alignbit_b32 v8, v8, v16, 16
+; GFX8-NEXT:    v_lshlrev_b32_e32 v16, 16, v25
+; GFX8-NEXT:    v_lshlrev_b32_e32 v18, 16, v9
+; GFX8-NEXT:    v_add_f32_e32 v16, v18, v16
+; GFX8-NEXT:    v_and_b32_e32 v18, 0xffff0000, v25
+; GFX8-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
+; GFX8-NEXT:    v_add_f32_e32 v9, v9, v18
+; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 16, v9
+; GFX8-NEXT:    v_alignbit_b32 v9, v9, v16, 16
+; GFX8-NEXT:    v_lshlrev_b32_e32 v16, 16, v26
+; GFX8-NEXT:    v_lshlrev_b32_e32 v18, 16, v10
+; GFX8-NEXT:    v_add_f32_e32 v16, v18, v16
+; GFX8-NEXT:    v_and_b32_e32 v18, 0xffff0000, v26
+; GFX8-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
+; GFX8-NEXT:    v_add_f32_e32 v10, v10, v18
+; GFX8-NEXT:    v_lshrrev_b32_e32 v10, 16, v10
+; GFX8-NEXT:    v_alignbit_b32 v10, v10, v16, 16
+; GFX8-NEXT:    v_lshlrev_b32_e32 v16, 16, v27
+; GFX8-NEXT:    v_lshlrev_b32_e32 v18, 16, v11
+; GFX8-NEXT:    v_add_f32_e32 v16, v18, v16
+; GFX8-NEXT:    v_and_b32_e32 v18, 0xffff0000, v27
+; GFX8-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
+; GFX8-NEXT:    v_add_f32_e32 v11, v11, v18
+; GFX8-NEXT:    v_lshrrev_b32_e32 v11, 16, v11
+; GFX8-NEXT:    v_alignbit_b32 v11, v11, v16, 16
+; GFX8-NEXT:    v_lshlrev_b32_e32 v16, 16, v28
+; GFX8-NEXT:    v_lshlrev_b32_e32 v18, 16, v12
+; GFX8-NEXT:    v_add_f32_e32 v16, v18, v16
+; GFX8-NEXT:    v_and_b32_e32 v18, 0xffff0000, v28
+; GFX8-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
+; GFX8-NEXT:    v_add_f32_e32 v12, v12, v18
+; GFX8-NEXT:    v_lshrrev_b32_e32 v12, 16, v12
+; GFX8-NEXT:    v_alignbit_b32 v12, v12, v16, 16
+; GFX8-NEXT:    v_lshlrev_b32_e32 v16, 16, v29
+; GFX8-NEXT:    v_lshlrev_b32_e32 v18, 16, v13
+; GFX8-NEXT:    v_add_f32_e32 v16, v18, v16
+; GFX8-NEXT:    v_and_b32_e32 v18, 0xffff0000, v29
+; GFX8-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
+; GFX8-NEXT:    v_add_f32_e32 v13, v13, v18
+; GFX8-NEXT:    v_lshrrev_b32_e32 v13, 16, v13
+; GFX8-NEXT:    v_alignbit_b32 v13, v13, v16, 16
+; GFX8-NEXT:    v_lshlrev_b32_e32 v16, 16, v30
+; GFX8-NEXT:    v_lshlrev_b32_e32 v18, 16, v14
+; GFX8-NEXT:    v_add_f32_e32 v16, v18, v16
+; GFX8-NEXT:    v_and_b32_e32 v18, 0xffff0000, v30
+; GFX8-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
+; GFX8-NEXT:    v_add_f32_e32 v14, v14, v18
+; GFX8-NEXT:    v_lshrrev_b32_e32 v14, 16, v14
+; GFX8-NEXT:    v_alignbit_b32 v14, v14, v16, 16
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_lshlrev_b32_e32 v16, 16, v17
+; GFX8-NEXT:    v_lshlrev_b32_e32 v18, 16, v15
+; GFX8-NEXT:    v_and_b32_e32 v17, 0xffff0000, v17
+; GFX8-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
+; GFX8-NEXT:    v_add_f32_e32 v15, v15, v17
+; GFX8-NEXT:    v_add_f32_e32 v16, v18, v16
+; GFX8-NEXT:    v_lshrrev_b32_e32 v15, 16, v15
+; GFX8-NEXT:    v_alignbit_b32 v15, v15, v16, 16
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_fadd_v32bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
-; GFX9-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
-; GFX9-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
-; GFX9-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
-; GFX9-NEXT:    buffer_store_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; GFX9-NEXT:    buffer_store_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; GFX9-NEXT:    buffer_store_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; GFX9-NEXT:    buffer_store_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
-; GFX9-NEXT:    buffer_store_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; GFX9-NEXT:    buffer_store_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; GFX9-NEXT:    buffer_store_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; GFX9-NEXT:    buffer_store_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
-; GFX9-NEXT:    v_and_b32_e32 v38, 0xffff0000, v27
-; GFX9-NEXT:    v_and_b32_e32 v39, 0xffff0000, v11
-; GFX9-NEXT:    v_and_b32_e32 v48, 0xffff0000, v26
-; GFX9-NEXT:    v_and_b32_e32 v49, 0xffff0000, v10
-; GFX9-NEXT:    v_and_b32_e32 v50, 0xffff0000, v25
-; GFX9-NEXT:    v_and_b32_e32 v51, 0xffff0000, v9
-; GFX9-NEXT:    v_and_b32_e32 v40, 0xffff0000, v22
-; GFX9-NEXT:    v_and_b32_e32 v41, 0xffff0000, v6
-; GFX9-NEXT:    v_and_b32_e32 v58, 0xffff0000, v17
-; GFX9-NEXT:    v_and_b32_e32 v59, 0xffff0000, v1
-; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v17
-; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX9-NEXT:    v_and_b32_e32 v52, 0xffff0000, v24
-; GFX9-NEXT:    v_and_b32_e32 v53, 0xffff0000, v8
-; GFX9-NEXT:    v_and_b32_e32 v54, 0xffff0000, v23
-; GFX9-NEXT:    v_and_b32_e32 v55, 0xffff0000, v7
-; GFX9-NEXT:    v_and_b32_e32 v42, 0xffff0000, v21
-; GFX9-NEXT:    v_and_b32_e32 v43, 0xffff0000, v5
-; GFX9-NEXT:    v_and_b32_e32 v44, 0xffff0000, v20
-; GFX9-NEXT:    v_and_b32_e32 v45, 0xffff0000, v4
-; GFX9-NEXT:    v_and_b32_e32 v46, 0xffff0000, v19
-; GFX9-NEXT:    v_and_b32_e32 v47, 0xffff0000, v3
-; GFX9-NEXT:    v_and_b32_e32 v56, 0xffff0000, v18
-; GFX9-NEXT:    v_and_b32_e32 v57, 0xffff0000, v2
-; GFX9-NEXT:    v_add_f32_e32 v38, v39, v38
-; GFX9-NEXT:    v_add_f32_e32 v39, v49, v48
-; GFX9-NEXT:    v_add_f32_e32 v48, v51, v50
-; GFX9-NEXT:    v_add_f32_e32 v51, v41, v40
-; GFX9-NEXT:    v_add_f32_e32 v40, v59, v58
-; GFX9-NEXT:    v_add_f32_e32 v1, v1, v17
-; GFX9-NEXT:    s_mov_b32 s4, 0x3020706
-; GFX9-NEXT:    v_add_f32_e32 v49, v53, v52
-; GFX9-NEXT:    v_add_f32_e32 v50, v55, v54
-; GFX9-NEXT:    v_add_f32_e32 v52, v43, v42
-; GFX9-NEXT:    v_add_f32_e32 v53, v45, v44
-; GFX9-NEXT:    v_add_f32_e32 v54, v47, v46
-; GFX9-NEXT:    v_add_f32_e32 v55, v57, v56
-; GFX9-NEXT:    v_perm_b32 v1, v1, v40, s4
-; GFX9-NEXT:    buffer_load_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; GFX9-NEXT:    buffer_load_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
-; GFX9-NEXT:    buffer_load_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; GFX9-NEXT:    buffer_load_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
-; GFX9-NEXT:    buffer_load_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; GFX9-NEXT:    buffer_load_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
-; GFX9-NEXT:    buffer_load_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; GFX9-NEXT:    buffer_load_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
-; GFX9-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
-; GFX9-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
-; GFX9-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
-; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
-; GFX9-NEXT:    buffer_load_dword v31, off, s[0:3], s32
-; GFX9-NEXT:    v_and_b32_e32 v32, 0xffff0000, v30
-; GFX9-NEXT:    v_and_b32_e32 v33, 0xffff0000, v14
-; GFX9-NEXT:    v_and_b32_e32 v34, 0xffff0000, v29
-; GFX9-NEXT:    v_and_b32_e32 v35, 0xffff0000, v13
-; GFX9-NEXT:    v_and_b32_e32 v36, 0xffff0000, v28
-; GFX9-NEXT:    v_and_b32_e32 v37, 0xffff0000, v12
-; GFX9-NEXT:    v_add_f32_e32 v32, v33, v32
-; GFX9-NEXT:    v_and_b32_e32 v33, 0xffff0000, v16
-; GFX9-NEXT:    v_add_f32_e32 v34, v35, v34
-; GFX9-NEXT:    v_and_b32_e32 v35, 0xffff0000, v0
-; GFX9-NEXT:    v_add_f32_e32 v36, v37, v36
-; GFX9-NEXT:    v_and_b32_e32 v37, 0xffff0000, v15
-; GFX9-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
-; GFX9-NEXT:    v_add_f32_e32 v33, v35, v33
-; GFX9-NEXT:    v_lshlrev_b32_e32 v30, 16, v30
-; GFX9-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
-; GFX9-NEXT:    v_lshlrev_b32_e32 v29, 16, v29
-; GFX9-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
-; GFX9-NEXT:    v_lshlrev_b32_e32 v28, 16, v28
-; GFX9-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
-; GFX9-NEXT:    v_lshlrev_b32_e32 v27, 16, v27
-; GFX9-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
-; GFX9-NEXT:    v_lshlrev_b32_e32 v26, 16, v26
-; GFX9-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
-; GFX9-NEXT:    v_lshlrev_b32_e32 v25, 16, v25
-; GFX9-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
-; GFX9-NEXT:    v_lshlrev_b32_e32 v24, 16, v24
-; GFX9-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
-; GFX9-NEXT:    v_lshlrev_b32_e32 v23, 16, v23
-; GFX9-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
-; GFX9-NEXT:    v_lshlrev_b32_e32 v22, 16, v22
-; GFX9-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
-; GFX9-NEXT:    v_lshlrev_b32_e32 v21, 16, v21
-; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX9-NEXT:    v_lshlrev_b32_e32 v20, 16, v20
-; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX9-NEXT:    v_lshlrev_b32_e32 v19, 16, v19
-; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX9-NEXT:    v_lshlrev_b32_e32 v18, 16, v18
-; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX9-NEXT:    v_lshlrev_b32_e32 v16, 16, v16
-; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX9-NEXT:    v_add_f32_e32 v14, v14, v30
-; GFX9-NEXT:    v_add_f32_e32 v13, v13, v29
-; GFX9-NEXT:    v_add_f32_e32 v12, v12, v28
-; GFX9-NEXT:    v_add_f32_e32 v11, v11, v27
-; GFX9-NEXT:    v_add_f32_e32 v10, v10, v26
-; GFX9-NEXT:    v_add_f32_e32 v9, v9, v25
-; GFX9-NEXT:    v_add_f32_e32 v8, v8, v24
-; GFX9-NEXT:    v_add_f32_e32 v7, v7, v23
-; GFX9-NEXT:    v_add_f32_e32 v6, v6, v22
-; GFX9-NEXT:    v_add_f32_e32 v5, v5, v21
-; GFX9-NEXT:    v_add_f32_e32 v4, v4, v20
-; GFX9-NEXT:    v_add_f32_e32 v3, v3, v19
-; GFX9-NEXT:    v_add_f32_e32 v2, v2, v18
+; GFX9-NEXT:    v_lshlrev_b32_e32 v31, 16, v16
+; GFX9-NEXT:    v_lshlrev_b32_e32 v32, 16, v0
+; GFX9-NEXT:    v_and_b32_e32 v16, 0xffff0000, v16
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    v_add_f32_e32 v31, v32, v31
 ; GFX9-NEXT:    v_add_f32_e32 v0, v0, v16
-; GFX9-NEXT:    v_perm_b32 v0, v0, v33, s4
-; GFX9-NEXT:    v_perm_b32 v2, v2, v55, s4
-; GFX9-NEXT:    v_perm_b32 v3, v3, v54, s4
-; GFX9-NEXT:    v_perm_b32 v4, v4, v53, s4
-; GFX9-NEXT:    v_perm_b32 v5, v5, v52, s4
-; GFX9-NEXT:    v_perm_b32 v6, v6, v51, s4
-; GFX9-NEXT:    v_perm_b32 v7, v7, v50, s4
-; GFX9-NEXT:    v_perm_b32 v8, v8, v49, s4
-; GFX9-NEXT:    v_perm_b32 v9, v9, v48, s4
-; GFX9-NEXT:    v_perm_b32 v10, v10, v39, s4
-; GFX9-NEXT:    v_perm_b32 v11, v11, v38, s4
-; GFX9-NEXT:    v_perm_b32 v12, v12, v36, s4
-; GFX9-NEXT:    v_perm_b32 v13, v13, v34, s4
-; GFX9-NEXT:    v_perm_b32 v14, v14, v32, s4
+; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
+; GFX9-NEXT:    v_perm_b32 v0, v0, v31, s4
+; GFX9-NEXT:    v_lshlrev_b32_e32 v16, 16, v17
+; GFX9-NEXT:    v_lshlrev_b32_e32 v31, 16, v1
+; GFX9-NEXT:    v_and_b32_e32 v17, 0xffff0000, v17
+; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX9-NEXT:    v_add_f32_e32 v16, v31, v16
+; GFX9-NEXT:    v_add_f32_e32 v1, v1, v17
+; GFX9-NEXT:    v_perm_b32 v1, v1, v16, s4
+; GFX9-NEXT:    v_lshlrev_b32_e32 v16, 16, v18
+; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v2
+; GFX9-NEXT:    v_add_f32_e32 v16, v17, v16
+; GFX9-NEXT:    v_and_b32_e32 v17, 0xffff0000, v18
+; GFX9-NEXT:    buffer_load_dword v18, off, s[0:3], s32
+; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX9-NEXT:    v_add_f32_e32 v2, v2, v17
+; GFX9-NEXT:    v_perm_b32 v2, v2, v16, s4
+; GFX9-NEXT:    v_lshlrev_b32_e32 v16, 16, v19
+; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v3
+; GFX9-NEXT:    v_add_f32_e32 v16, v17, v16
+; GFX9-NEXT:    v_and_b32_e32 v17, 0xffff0000, v19
+; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX9-NEXT:    v_add_f32_e32 v3, v3, v17
+; GFX9-NEXT:    v_perm_b32 v3, v3, v16, s4
+; GFX9-NEXT:    v_lshlrev_b32_e32 v16, 16, v20
+; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v4
+; GFX9-NEXT:    v_add_f32_e32 v16, v17, v16
+; GFX9-NEXT:    v_and_b32_e32 v17, 0xffff0000, v20
+; GFX9-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GFX9-NEXT:    v_add_f32_e32 v4, v4, v17
+; GFX9-NEXT:    v_perm_b32 v4, v4, v16, s4
+; GFX9-NEXT:    v_lshlrev_b32_e32 v16, 16, v21
+; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v5
+; GFX9-NEXT:    v_add_f32_e32 v16, v17, v16
+; GFX9-NEXT:    v_and_b32_e32 v17, 0xffff0000, v21
+; GFX9-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
+; GFX9-NEXT:    v_add_f32_e32 v5, v5, v17
+; GFX9-NEXT:    v_perm_b32 v5, v5, v16, s4
+; GFX9-NEXT:    v_lshlrev_b32_e32 v16, 16, v22
+; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v6
+; GFX9-NEXT:    v_add_f32_e32 v16, v17, v16
+; GFX9-NEXT:    v_and_b32_e32 v17, 0xffff0000, v22
+; GFX9-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
+; GFX9-NEXT:    v_add_f32_e32 v6, v6, v17
+; GFX9-NEXT:    v_perm_b32 v6, v6, v16, s4
+; GFX9-NEXT:    v_lshlrev_b32_e32 v16, 16, v23
+; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v7
+; GFX9-NEXT:    v_add_f32_e32 v16, v17, v16
+; GFX9-NEXT:    v_and_b32_e32 v17, 0xffff0000, v23
+; GFX9-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
+; GFX9-NEXT:    v_add_f32_e32 v7, v7, v17
+; GFX9-NEXT:    v_perm_b32 v7, v7, v16, s4
+; GFX9-NEXT:    v_lshlrev_b32_e32 v16, 16, v24
+; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v8
+; GFX9-NEXT:    v_add_f32_e32 v16, v17, v16
+; GFX9-NEXT:    v_and_b32_e32 v17, 0xffff0000, v24
+; GFX9-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
+; GFX9-NEXT:    v_add_f32_e32 v8, v8, v17
+; GFX9-NEXT:    v_perm_b32 v8, v8, v16, s4
+; GFX9-NEXT:    v_lshlrev_b32_e32 v16, 16, v25
+; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v9
+; GFX9-NEXT:    v_add_f32_e32 v16, v17, v16
+; GFX9-NEXT:    v_and_b32_e32 v17, 0xffff0000, v25
+; GFX9-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
+; GFX9-NEXT:    v_add_f32_e32 v9, v9, v17
+; GFX9-NEXT:    v_perm_b32 v9, v9, v16, s4
+; GFX9-NEXT:    v_lshlrev_b32_e32 v16, 16, v26
+; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v10
+; GFX9-NEXT:    v_add_f32_e32 v16, v17, v16
+; GFX9-NEXT:    v_and_b32_e32 v17, 0xffff0000, v26
+; GFX9-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
+; GFX9-NEXT:    v_add_f32_e32 v10, v10, v17
+; GFX9-NEXT:    v_perm_b32 v10, v10, v16, s4
+; GFX9-NEXT:    v_lshlrev_b32_e32 v16, 16, v27
+; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v11
+; GFX9-NEXT:    v_add_f32_e32 v16, v17, v16
+; GFX9-NEXT:    v_and_b32_e32 v17, 0xffff0000, v27
+; GFX9-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
+; GFX9-NEXT:    v_add_f32_e32 v11, v11, v17
+; GFX9-NEXT:    v_perm_b32 v11, v11, v16, s4
+; GFX9-NEXT:    v_lshlrev_b32_e32 v16, 16, v28
+; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v12
+; GFX9-NEXT:    v_add_f32_e32 v16, v17, v16
+; GFX9-NEXT:    v_and_b32_e32 v17, 0xffff0000, v28
+; GFX9-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
+; GFX9-NEXT:    v_add_f32_e32 v12, v12, v17
+; GFX9-NEXT:    v_perm_b32 v12, v12, v16, s4
+; GFX9-NEXT:    v_lshlrev_b32_e32 v16, 16, v29
+; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v13
+; GFX9-NEXT:    v_add_f32_e32 v16, v17, v16
+; GFX9-NEXT:    v_and_b32_e32 v17, 0xffff0000, v29
+; GFX9-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
+; GFX9-NEXT:    v_add_f32_e32 v13, v13, v17
+; GFX9-NEXT:    v_perm_b32 v13, v13, v16, s4
+; GFX9-NEXT:    v_lshlrev_b32_e32 v16, 16, v30
+; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v14
+; GFX9-NEXT:    v_add_f32_e32 v16, v17, v16
+; GFX9-NEXT:    v_and_b32_e32 v17, 0xffff0000, v30
+; GFX9-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
+; GFX9-NEXT:    v_add_f32_e32 v14, v14, v17
+; GFX9-NEXT:    v_perm_b32 v14, v14, v16, s4
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v35, 0xffff0000, v31
-; GFX9-NEXT:    v_lshlrev_b32_e32 v31, 16, v31
-; GFX9-NEXT:    v_add_f32_e32 v35, v37, v35
-; GFX9-NEXT:    v_add_f32_e32 v15, v15, v31
-; GFX9-NEXT:    v_perm_b32 v15, v15, v35, s4
+; GFX9-NEXT:    v_lshlrev_b32_e32 v16, 16, v18
+; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v15
+; GFX9-NEXT:    v_add_f32_e32 v16, v17, v16
+; GFX9-NEXT:    v_and_b32_e32 v17, 0xffff0000, v18
+; GFX9-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
+; GFX9-NEXT:    v_add_f32_e32 v15, v15, v17
+; GFX9-NEXT:    v_perm_b32 v15, v15, v16, s4
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fadd_v32bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    buffer_load_dword v31, off, s[0:3], s32
-; GFX10-NEXT:    v_and_b32_e32 v53, 0xffff0000, v24
-; GFX10-NEXT:    v_and_b32_e32 v54, 0xffff0000, v8
-; GFX10-NEXT:    v_and_b32_e32 v55, 0xffff0000, v23
-; GFX10-NEXT:    v_and_b32_e32 v64, 0xffff0000, v7
-; GFX10-NEXT:    v_and_b32_e32 v65, 0xffff0000, v22
-; GFX10-NEXT:    v_and_b32_e32 v66, 0xffff0000, v6
-; GFX10-NEXT:    v_and_b32_e32 v67, 0xffff0000, v21
-; GFX10-NEXT:    v_and_b32_e32 v68, 0xffff0000, v5
-; GFX10-NEXT:    v_and_b32_e32 v33, 0xffff0000, v30
-; GFX10-NEXT:    v_and_b32_e32 v34, 0xffff0000, v14
-; GFX10-NEXT:    v_and_b32_e32 v35, 0xffff0000, v29
-; GFX10-NEXT:    v_and_b32_e32 v36, 0xffff0000, v13
-; GFX10-NEXT:    v_and_b32_e32 v37, 0xffff0000, v28
-; GFX10-NEXT:    v_and_b32_e32 v38, 0xffff0000, v12
-; GFX10-NEXT:    v_and_b32_e32 v39, 0xffff0000, v27
-; GFX10-NEXT:    v_and_b32_e32 v48, 0xffff0000, v11
-; GFX10-NEXT:    v_and_b32_e32 v49, 0xffff0000, v26
-; GFX10-NEXT:    v_and_b32_e32 v50, 0xffff0000, v10
-; GFX10-NEXT:    v_and_b32_e32 v51, 0xffff0000, v25
-; GFX10-NEXT:    v_and_b32_e32 v52, 0xffff0000, v9
-; GFX10-NEXT:    v_add_f32_e32 v53, v54, v53
-; GFX10-NEXT:    v_and_b32_e32 v54, 0xffff0000, v17
-; GFX10-NEXT:    v_add_f32_e32 v55, v64, v55
-; GFX10-NEXT:    v_and_b32_e32 v64, 0xffff0000, v1
-; GFX10-NEXT:    v_add_f32_e32 v65, v66, v65
-; GFX10-NEXT:    v_and_b32_e32 v66, 0xffff0000, v16
-; GFX10-NEXT:    v_add_f32_e32 v67, v68, v67
-; GFX10-NEXT:    v_and_b32_e32 v68, 0xffff0000, v0
-; GFX10-NEXT:    v_lshlrev_b32_e32 v17, 16, v17
-; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX10-NEXT:    v_lshlrev_b32_e32 v16, 16, v16
-; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX10-NEXT:    v_and_b32_e32 v32, 0xffff0000, v15
-; GFX10-NEXT:    v_add_f32_e32 v33, v34, v33
-; GFX10-NEXT:    v_and_b32_e32 v34, 0xffff0000, v20
-; GFX10-NEXT:    v_add_f32_e32 v35, v36, v35
-; GFX10-NEXT:    v_and_b32_e32 v36, 0xffff0000, v4
-; GFX10-NEXT:    v_add_f32_e32 v37, v38, v37
-; GFX10-NEXT:    v_and_b32_e32 v38, 0xffff0000, v19
-; GFX10-NEXT:    v_add_f32_e32 v39, v48, v39
-; GFX10-NEXT:    v_and_b32_e32 v48, 0xffff0000, v3
-; GFX10-NEXT:    v_add_f32_e32 v49, v50, v49
-; GFX10-NEXT:    v_and_b32_e32 v50, 0xffff0000, v18
-; GFX10-NEXT:    v_add_f32_e32 v51, v52, v51
-; GFX10-NEXT:    v_and_b32_e32 v52, 0xffff0000, v2
-; GFX10-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
-; GFX10-NEXT:    v_lshlrev_b32_e32 v30, 16, v30
-; GFX10-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
-; GFX10-NEXT:    v_lshlrev_b32_e32 v29, 16, v29
-; GFX10-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
-; GFX10-NEXT:    v_lshlrev_b32_e32 v28, 16, v28
-; GFX10-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
-; GFX10-NEXT:    v_lshlrev_b32_e32 v27, 16, v27
-; GFX10-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
-; GFX10-NEXT:    v_lshlrev_b32_e32 v26, 16, v26
-; GFX10-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
-; GFX10-NEXT:    v_lshlrev_b32_e32 v25, 16, v25
-; GFX10-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
-; GFX10-NEXT:    v_lshlrev_b32_e32 v24, 16, v24
-; GFX10-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
-; GFX10-NEXT:    v_lshlrev_b32_e32 v23, 16, v23
-; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
-; GFX10-NEXT:    v_lshlrev_b32_e32 v22, 16, v22
-; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
-; GFX10-NEXT:    v_lshlrev_b32_e32 v21, 16, v21
-; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX10-NEXT:    v_lshlrev_b32_e32 v20, 16, v20
-; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX10-NEXT:    v_lshlrev_b32_e32 v19, 16, v19
-; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX10-NEXT:    v_lshlrev_b32_e32 v18, 16, v18
-; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX10-NEXT:    v_lshlrev_b32_e32 v50, 16, v21
+; GFX10-NEXT:    v_lshlrev_b32_e32 v51, 16, v5
+; GFX10-NEXT:    v_and_b32_e32 v21, 0xffff0000, v21
+; GFX10-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
+; GFX10-NEXT:    v_lshlrev_b32_e32 v52, 16, v22
+; GFX10-NEXT:    v_lshlrev_b32_e32 v53, 16, v6
+; GFX10-NEXT:    v_and_b32_e32 v22, 0xffff0000, v22
+; GFX10-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
+; GFX10-NEXT:    v_lshlrev_b32_e32 v54, 16, v23
+; GFX10-NEXT:    v_lshlrev_b32_e32 v55, 16, v7
+; GFX10-NEXT:    v_and_b32_e32 v23, 0xffff0000, v23
+; GFX10-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
+; GFX10-NEXT:    v_lshlrev_b32_e32 v32, 16, v16
+; GFX10-NEXT:    v_lshlrev_b32_e32 v33, 16, v0
+; GFX10-NEXT:    v_and_b32_e32 v16, 0xffff0000, v16
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v34, 16, v17
+; GFX10-NEXT:    v_lshlrev_b32_e32 v35, 16, v1
+; GFX10-NEXT:    v_and_b32_e32 v17, 0xffff0000, v17
+; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX10-NEXT:    v_lshlrev_b32_e32 v36, 16, v18
+; GFX10-NEXT:    v_lshlrev_b32_e32 v37, 16, v2
+; GFX10-NEXT:    v_and_b32_e32 v18, 0xffff0000, v18
+; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX10-NEXT:    v_lshlrev_b32_e32 v38, 16, v19
+; GFX10-NEXT:    v_lshlrev_b32_e32 v39, 16, v3
+; GFX10-NEXT:    v_and_b32_e32 v19, 0xffff0000, v19
+; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX10-NEXT:    v_lshlrev_b32_e32 v48, 16, v20
+; GFX10-NEXT:    v_lshlrev_b32_e32 v49, 16, v4
+; GFX10-NEXT:    v_and_b32_e32 v20, 0xffff0000, v20
+; GFX10-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GFX10-NEXT:    v_add_f32_e32 v5, v5, v21
+; GFX10-NEXT:    v_add_f32_e32 v21, v53, v52
+; GFX10-NEXT:    v_add_f32_e32 v6, v6, v22
+; GFX10-NEXT:    v_add_f32_e32 v22, v55, v54
+; GFX10-NEXT:    v_add_f32_e32 v7, v7, v23
+; GFX10-NEXT:    v_lshlrev_b32_e32 v64, 16, v24
+; GFX10-NEXT:    v_lshlrev_b32_e32 v65, 16, v8
+; GFX10-NEXT:    v_and_b32_e32 v24, 0xffff0000, v24
+; GFX10-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
+; GFX10-NEXT:    v_lshlrev_b32_e32 v66, 16, v25
+; GFX10-NEXT:    v_lshlrev_b32_e32 v67, 16, v9
+; GFX10-NEXT:    v_and_b32_e32 v25, 0xffff0000, v25
+; GFX10-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
+; GFX10-NEXT:    v_lshlrev_b32_e32 v68, 16, v26
+; GFX10-NEXT:    v_add_f32_e32 v32, v33, v32
+; GFX10-NEXT:    v_lshlrev_b32_e32 v33, 16, v10
+; GFX10-NEXT:    v_and_b32_e32 v26, 0xffff0000, v26
+; GFX10-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
 ; GFX10-NEXT:    v_add_f32_e32 v0, v0, v16
+; GFX10-NEXT:    v_lshlrev_b32_e32 v16, 16, v27
+; GFX10-NEXT:    v_add_f32_e32 v34, v35, v34
+; GFX10-NEXT:    v_lshlrev_b32_e32 v35, 16, v11
+; GFX10-NEXT:    v_and_b32_e32 v27, 0xffff0000, v27
+; GFX10-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
 ; GFX10-NEXT:    v_add_f32_e32 v1, v1, v17
-; GFX10-NEXT:    v_add_f32_e32 v34, v36, v34
-; GFX10-NEXT:    v_add_f32_e32 v36, v48, v38
-; GFX10-NEXT:    v_add_f32_e32 v38, v52, v50
-; GFX10-NEXT:    v_add_f32_e32 v48, v64, v54
-; GFX10-NEXT:    v_add_f32_e32 v50, v68, v66
-; GFX10-NEXT:    v_add_f32_e32 v14, v14, v30
-; GFX10-NEXT:    v_add_f32_e32 v13, v13, v29
-; GFX10-NEXT:    v_add_f32_e32 v12, v12, v28
-; GFX10-NEXT:    v_add_f32_e32 v11, v11, v27
-; GFX10-NEXT:    v_add_f32_e32 v10, v10, v26
-; GFX10-NEXT:    v_add_f32_e32 v9, v9, v25
-; GFX10-NEXT:    v_add_f32_e32 v8, v8, v24
-; GFX10-NEXT:    v_add_f32_e32 v7, v7, v23
-; GFX10-NEXT:    v_add_f32_e32 v6, v6, v22
-; GFX10-NEXT:    v_add_f32_e32 v5, v5, v21
+; GFX10-NEXT:    v_lshlrev_b32_e32 v17, 16, v28
+; GFX10-NEXT:    v_add_f32_e32 v36, v37, v36
+; GFX10-NEXT:    v_lshlrev_b32_e32 v37, 16, v12
+; GFX10-NEXT:    v_and_b32_e32 v28, 0xffff0000, v28
+; GFX10-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
 ; GFX10-NEXT:    v_add_f32_e32 v2, v2, v18
+; GFX10-NEXT:    v_lshlrev_b32_e32 v18, 16, v29
+; GFX10-NEXT:    v_add_f32_e32 v38, v39, v38
+; GFX10-NEXT:    v_lshlrev_b32_e32 v39, 16, v13
+; GFX10-NEXT:    v_and_b32_e32 v29, 0xffff0000, v29
+; GFX10-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
 ; GFX10-NEXT:    v_add_f32_e32 v3, v3, v19
+; GFX10-NEXT:    v_lshlrev_b32_e32 v19, 16, v30
+; GFX10-NEXT:    v_add_f32_e32 v48, v49, v48
+; GFX10-NEXT:    v_lshlrev_b32_e32 v49, 16, v14
+; GFX10-NEXT:    v_and_b32_e32 v30, 0xffff0000, v30
+; GFX10-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
 ; GFX10-NEXT:    v_add_f32_e32 v4, v4, v20
-; GFX10-NEXT:    v_perm_b32 v0, v0, v50, 0x3020706
-; GFX10-NEXT:    v_perm_b32 v1, v1, v48, 0x3020706
-; GFX10-NEXT:    v_perm_b32 v2, v2, v38, 0x3020706
-; GFX10-NEXT:    v_perm_b32 v3, v3, v36, 0x3020706
-; GFX10-NEXT:    v_perm_b32 v4, v4, v34, 0x3020706
-; GFX10-NEXT:    v_perm_b32 v5, v5, v67, 0x3020706
-; GFX10-NEXT:    v_perm_b32 v6, v6, v65, 0x3020706
-; GFX10-NEXT:    v_perm_b32 v7, v7, v55, 0x3020706
-; GFX10-NEXT:    v_perm_b32 v8, v8, v53, 0x3020706
-; GFX10-NEXT:    v_perm_b32 v9, v9, v51, 0x3020706
-; GFX10-NEXT:    v_perm_b32 v10, v10, v49, 0x3020706
-; GFX10-NEXT:    v_perm_b32 v11, v11, v39, 0x3020706
-; GFX10-NEXT:    v_perm_b32 v12, v12, v37, 0x3020706
-; GFX10-NEXT:    v_perm_b32 v13, v13, v35, 0x3020706
-; GFX10-NEXT:    v_perm_b32 v14, v14, v33, 0x3020706
+; GFX10-NEXT:    v_lshlrev_b32_e32 v20, 16, v15
+; GFX10-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
+; GFX10-NEXT:    v_perm_b32 v6, v6, v21, 0x7060302
+; GFX10-NEXT:    v_perm_b32 v7, v7, v22, 0x7060302
+; GFX10-NEXT:    v_add_f32_e32 v50, v51, v50
+; GFX10-NEXT:    v_add_f32_e32 v23, v65, v64
+; GFX10-NEXT:    v_add_f32_e32 v8, v8, v24
+; GFX10-NEXT:    v_add_f32_e32 v24, v67, v66
+; GFX10-NEXT:    v_add_f32_e32 v9, v9, v25
+; GFX10-NEXT:    v_add_f32_e32 v25, v33, v68
+; GFX10-NEXT:    v_add_f32_e32 v10, v10, v26
+; GFX10-NEXT:    v_add_f32_e32 v16, v35, v16
+; GFX10-NEXT:    v_add_f32_e32 v11, v11, v27
+; GFX10-NEXT:    v_add_f32_e32 v17, v37, v17
+; GFX10-NEXT:    v_add_f32_e32 v12, v12, v28
+; GFX10-NEXT:    v_add_f32_e32 v18, v39, v18
+; GFX10-NEXT:    v_add_f32_e32 v13, v13, v29
+; GFX10-NEXT:    v_add_f32_e32 v19, v49, v19
+; GFX10-NEXT:    v_add_f32_e32 v14, v14, v30
+; GFX10-NEXT:    v_perm_b32 v0, v0, v32, 0x7060302
+; GFX10-NEXT:    v_perm_b32 v1, v1, v34, 0x7060302
+; GFX10-NEXT:    v_perm_b32 v2, v2, v36, 0x7060302
+; GFX10-NEXT:    v_perm_b32 v3, v3, v38, 0x7060302
+; GFX10-NEXT:    v_perm_b32 v4, v4, v48, 0x7060302
+; GFX10-NEXT:    v_perm_b32 v5, v5, v50, 0x7060302
+; GFX10-NEXT:    v_perm_b32 v8, v8, v23, 0x7060302
+; GFX10-NEXT:    v_perm_b32 v9, v9, v24, 0x7060302
+; GFX10-NEXT:    v_perm_b32 v10, v10, v25, 0x7060302
+; GFX10-NEXT:    v_perm_b32 v11, v11, v16, 0x7060302
+; GFX10-NEXT:    v_perm_b32 v12, v12, v17, 0x7060302
+; GFX10-NEXT:    v_perm_b32 v13, v13, v18, 0x7060302
+; GFX10-NEXT:    v_perm_b32 v14, v14, v19, 0x7060302
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_and_b32_e32 v16, 0xffff0000, v31
-; GFX10-NEXT:    v_lshlrev_b32_e32 v17, 16, v31
-; GFX10-NEXT:    v_add_f32_e32 v16, v32, v16
-; GFX10-NEXT:    v_add_f32_e32 v15, v15, v17
-; GFX10-NEXT:    v_perm_b32 v15, v15, v16, 0x3020706
+; GFX10-NEXT:    v_lshlrev_b32_e32 v21, 16, v31
+; GFX10-NEXT:    v_and_b32_e32 v22, 0xffff0000, v31
+; GFX10-NEXT:    v_add_f32_e32 v20, v20, v21
+; GFX10-NEXT:    v_add_f32_e32 v15, v15, v22
+; GFX10-NEXT:    v_perm_b32 v15, v15, v20, 0x7060302
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fadd_v32bf16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    scratch_load_b32 v31, off, s32
-; GFX11-NEXT:    v_and_b32_e32 v82, 0xffff0000, v2
-; GFX11-NEXT:    v_and_b32_e32 v84, 0xffff0000, v1
-; GFX11-NEXT:    v_and_b32_e32 v85, 0xffff0000, v16
-; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-NEXT:    v_and_b32_e32 v86, 0xffff0000, v0
-; GFX11-NEXT:    v_lshlrev_b32_e32 v16, 16, v16
-; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX11-NEXT:    v_and_b32_e32 v81, 0xffff0000, v18
-; GFX11-NEXT:    v_lshlrev_b32_e32 v18, 16, v18
-; GFX11-NEXT:    v_and_b32_e32 v83, 0xffff0000, v17
-; GFX11-NEXT:    v_lshlrev_b32_e32 v17, 16, v17
-; GFX11-NEXT:    v_and_b32_e32 v55, 0xffff0000, v23
-; GFX11-NEXT:    v_and_b32_e32 v70, 0xffff0000, v4
-; GFX11-NEXT:    v_add_f32_e32 v2, v2, v18
-; GFX11-NEXT:    v_and_b32_e32 v65, 0xffff0000, v22
-; GFX11-NEXT:    v_dual_add_f32 v1, v1, v17 :: v_dual_lshlrev_b32 v22, 16, v22
-; GFX11-NEXT:    v_and_b32_e32 v66, 0xffff0000, v6
-; GFX11-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
-; GFX11-NEXT:    v_and_b32_e32 v50, 0xffff0000, v10
-; GFX11-NEXT:    v_and_b32_e32 v69, 0xffff0000, v20
-; GFX11-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
+; GFX11-NEXT:    v_lshlrev_b32_e32 v68, 16, v26
+; GFX11-NEXT:    v_lshlrev_b32_e32 v69, 16, v10
+; GFX11-NEXT:    v_and_b32_e32 v26, 0xffff0000, v26
+; GFX11-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
+; GFX11-NEXT:    v_lshlrev_b32_e32 v70, 16, v27
+; GFX11-NEXT:    v_lshlrev_b32_e32 v71, 16, v11
+; GFX11-NEXT:    v_lshlrev_b32_e32 v50, 16, v21
+; GFX11-NEXT:    v_lshlrev_b32_e32 v54, 16, v23
+; GFX11-NEXT:    v_lshlrev_b32_e32 v55, 16, v7
+; GFX11-NEXT:    v_lshlrev_b32_e32 v64, 16, v24
+; GFX11-NEXT:    v_lshlrev_b32_e32 v65, 16, v8
+; GFX11-NEXT:    v_and_b32_e32 v24, 0xffff0000, v24
+; GFX11-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
+; GFX11-NEXT:    v_lshlrev_b32_e32 v51, 16, v5
+; GFX11-NEXT:    v_dual_add_f32 v10, v10, v26 :: v_dual_and_b32 v5, 0xffff0000, v5
+; GFX11-NEXT:    v_lshlrev_b32_e32 v66, 16, v25
+; GFX11-NEXT:    v_and_b32_e32 v25, 0xffff0000, v25
+; GFX11-NEXT:    v_and_b32_e32 v27, 0xffff0000, v27
+; GFX11-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
+; GFX11-NEXT:    v_lshlrev_b32_e32 v80, 16, v28
+; GFX11-NEXT:    v_lshlrev_b32_e32 v81, 16, v12
+; GFX11-NEXT:    v_lshlrev_b32_e32 v52, 16, v22
+; GFX11-NEXT:    v_and_b32_e32 v28, 0xffff0000, v28
+; GFX11-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
+; GFX11-NEXT:    v_lshlrev_b32_e32 v53, 16, v6
+; GFX11-NEXT:    v_lshlrev_b32_e32 v82, 16, v29
+; GFX11-NEXT:    v_lshlrev_b32_e32 v83, 16, v13
+; GFX11-NEXT:    v_and_b32_e32 v23, 0xffff0000, v23
+; GFX11-NEXT:    v_and_b32_e32 v29, 0xffff0000, v29
+; GFX11-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
+; GFX11-NEXT:    v_lshlrev_b32_e32 v84, 16, v30
+; GFX11-NEXT:    v_lshlrev_b32_e32 v85, 16, v14
+; GFX11-NEXT:    v_and_b32_e32 v22, 0xffff0000, v22
+; GFX11-NEXT:    v_and_b32_e32 v30, 0xffff0000, v30
+; GFX11-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
+; GFX11-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
+; GFX11-NEXT:    v_lshlrev_b32_e32 v86, 16, v15
+; GFX11-NEXT:    v_lshlrev_b32_e32 v67, 16, v9
+; GFX11-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
+; GFX11-NEXT:    v_lshlrev_b32_e32 v48, 16, v20
+; GFX11-NEXT:    v_dual_add_f32 v11, v11, v27 :: v_dual_and_b32 v20, 0xffff0000, v20
+; GFX11-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
+; GFX11-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
+; GFX11-NEXT:    v_dual_add_f32 v26, v71, v70 :: v_dual_lshlrev_b32 v49, 16, v4
+; GFX11-NEXT:    v_dual_add_f32 v13, v13, v29 :: v_dual_and_b32 v4, 0xffff0000, v4
+; GFX11-NEXT:    v_lshlrev_b32_e32 v35, 16, v1
+; GFX11-NEXT:    v_lshlrev_b32_e32 v37, 16, v2
+; GFX11-NEXT:    v_lshlrev_b32_e32 v38, 16, v19
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT:    v_dual_add_f32 v6, v6, v22 :: v_dual_lshlrev_b32 v23, 16, v23
-; GFX11-NEXT:    v_and_b32_e32 v54, 0xffff0000, v8
-; GFX11-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
-; GFX11-NEXT:    v_and_b32_e32 v71, 0xffff0000, v19
-; GFX11-NEXT:    v_dual_add_f32 v0, v0, v16 :: v_dual_lshlrev_b32 v19, 16, v19
-; GFX11-NEXT:    v_and_b32_e32 v64, 0xffff0000, v7
-; GFX11-NEXT:    v_and_b32_e32 v49, 0xffff0000, v26
-; GFX11-NEXT:    v_lshlrev_b32_e32 v26, 16, v26
-; GFX11-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
-; GFX11-NEXT:    v_and_b32_e32 v52, 0xffff0000, v9
-; GFX11-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
-; GFX11-NEXT:    v_and_b32_e32 v80, 0xffff0000, v3
-; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-NEXT:    v_dual_add_f32 v10, v10, v26 :: v_dual_and_b32 v67, 0xffff0000, v21
-; GFX11-NEXT:    v_and_b32_e32 v68, 0xffff0000, v5
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX11-NEXT:    v_dual_add_f32 v3, v3, v19 :: v_dual_and_b32 v38, 0xffff0000, v12
-; GFX11-NEXT:    v_and_b32_e32 v51, 0xffff0000, v25
-; GFX11-NEXT:    v_lshlrev_b32_e32 v25, 16, v25
-; GFX11-NEXT:    v_add_f32_e32 v7, v7, v23
-; GFX11-NEXT:    v_and_b32_e32 v37, 0xffff0000, v28
-; GFX11-NEXT:    v_and_b32_e32 v34, 0xffff0000, v14
-; GFX11-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
-; GFX11-NEXT:    v_and_b32_e32 v39, 0xffff0000, v27
-; GFX11-NEXT:    v_lshlrev_b32_e32 v27, 16, v27
-; GFX11-NEXT:    v_add_f32_e32 v9, v9, v25
-; GFX11-NEXT:    v_and_b32_e32 v53, 0xffff0000, v24
-; GFX11-NEXT:    v_and_b32_e32 v48, 0xffff0000, v11
-; GFX11-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
-; GFX11-NEXT:    v_and_b32_e32 v35, 0xffff0000, v29
-; GFX11-NEXT:    v_and_b32_e32 v36, 0xffff0000, v13
-; GFX11-NEXT:    v_lshlrev_b32_e32 v20, 16, v20
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_dual_add_f32 v11, v11, v27 :: v_dual_lshlrev_b32 v4, 16, v4
-; GFX11-NEXT:    v_and_b32_e32 v33, 0xffff0000, v30
-; GFX11-NEXT:    v_and_b32_e32 v32, 0xffff0000, v15
-; GFX11-NEXT:    v_dual_add_f32 v4, v4, v20 :: v_dual_lshlrev_b32 v15, 16, v15
-; GFX11-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
-; GFX11-NEXT:    v_lshlrev_b32_e32 v28, 16, v28
-; GFX11-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
-; GFX11-NEXT:    v_lshlrev_b32_e32 v24, 16, v24
-; GFX11-NEXT:    v_lshlrev_b32_e32 v21, 16, v21
-; GFX11-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX11-NEXT:    v_dual_add_f32 v33, v34, v33 :: v_dual_add_f32 v34, v36, v35
-; GFX11-NEXT:    v_dual_add_f32 v35, v38, v37 :: v_dual_add_f32 v12, v12, v28
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_dual_add_f32 v8, v8, v24 :: v_dual_add_f32 v5, v5, v21
-; GFX11-NEXT:    v_perm_b32 v12, v12, v35, 0x3020706
+; GFX11-NEXT:    v_add_f32_e32 v4, v4, v20
+; GFX11-NEXT:    v_dual_add_f32 v8, v8, v24 :: v_dual_add_f32 v9, v9, v25
+; GFX11-NEXT:    v_add_f32_e32 v25, v69, v68
+; GFX11-NEXT:    v_dual_add_f32 v20, v51, v50 :: v_dual_lshlrev_b32 v39, 16, v3
+; GFX11-NEXT:    v_add_f32_e32 v27, v81, v80
+; GFX11-NEXT:    v_add_f32_e32 v12, v12, v28
+; GFX11-NEXT:    v_dual_add_f32 v28, v83, v82 :: v_dual_add_f32 v29, v85, v84
+; GFX11-NEXT:    v_dual_add_f32 v6, v6, v22 :: v_dual_and_b32 v3, 0xffff0000, v3
+; GFX11-NEXT:    v_add_f32_e32 v22, v55, v54
+; GFX11-NEXT:    v_lshlrev_b32_e32 v36, 16, v18
+; GFX11-NEXT:    v_lshlrev_b32_e32 v34, 16, v17
+; GFX11-NEXT:    v_and_b32_e32 v17, 0xffff0000, v17
+; GFX11-NEXT:    v_and_b32_e32 v18, 0xffff0000, v18
+; GFX11-NEXT:    v_add_f32_e32 v14, v14, v30
+; GFX11-NEXT:    v_dual_add_f32 v7, v7, v23 :: v_dual_and_b32 v2, 0xffff0000, v2
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX11-NEXT:    v_add_f32_e32 v23, v65, v64
+; GFX11-NEXT:    v_and_b32_e32 v19, 0xffff0000, v19
+; GFX11-NEXT:    v_dual_add_f32 v24, v67, v66 :: v_dual_and_b32 v21, 0xffff0000, v21
+; GFX11-NEXT:    v_add_f32_e32 v2, v2, v18
+; GFX11-NEXT:    v_dual_add_f32 v1, v1, v17 :: v_dual_lshlrev_b32 v32, 16, v16
+; GFX11-NEXT:    v_add_f32_e32 v18, v39, v38
+; GFX11-NEXT:    v_dual_add_f32 v3, v3, v19 :: v_dual_and_b32 v16, 0xffff0000, v16
+; GFX11-NEXT:    v_add_f32_e32 v19, v49, v48
+; GFX11-NEXT:    v_add_f32_e32 v17, v37, v36
+; GFX11-NEXT:    v_lshlrev_b32_e32 v33, 16, v0
+; GFX11-NEXT:    v_dual_add_f32 v5, v5, v21 :: v_dual_and_b32 v0, 0xffff0000, v0
+; GFX11-NEXT:    v_add_f32_e32 v21, v53, v52
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_perm_b32 v2, v2, v17, 0x7060302
+; GFX11-NEXT:    v_perm_b32 v3, v3, v18, 0x7060302
+; GFX11-NEXT:    v_add_f32_e32 v0, v0, v16
+; GFX11-NEXT:    v_add_f32_e32 v16, v35, v34
+; GFX11-NEXT:    v_add_f32_e32 v32, v33, v32
+; GFX11-NEXT:    v_perm_b32 v4, v4, v19, 0x7060302
+; GFX11-NEXT:    v_perm_b32 v5, v5, v20, 0x7060302
+; GFX11-NEXT:    v_perm_b32 v6, v6, v21, 0x7060302
+; GFX11-NEXT:    v_perm_b32 v1, v1, v16, 0x7060302
+; GFX11-NEXT:    v_perm_b32 v0, v0, v32, 0x7060302
+; GFX11-NEXT:    v_perm_b32 v7, v7, v22, 0x7060302
+; GFX11-NEXT:    v_perm_b32 v8, v8, v23, 0x7060302
+; GFX11-NEXT:    v_perm_b32 v9, v9, v24, 0x7060302
+; GFX11-NEXT:    v_perm_b32 v10, v10, v25, 0x7060302
+; GFX11-NEXT:    v_perm_b32 v11, v11, v26, 0x7060302
+; GFX11-NEXT:    v_perm_b32 v12, v12, v27, 0x7060302
+; GFX11-NEXT:    v_perm_b32 v13, v13, v28, 0x7060302
+; GFX11-NEXT:    v_perm_b32 v14, v14, v29, 0x7060302
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_and_b32_e32 v16, 0xffff0000, v31
-; GFX11-NEXT:    v_lshlrev_b32_e32 v29, 16, v29
-; GFX11-NEXT:    v_lshlrev_b32_e32 v17, 16, v31
-; GFX11-NEXT:    v_lshlrev_b32_e32 v30, 16, v30
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_dual_add_f32 v16, v32, v16 :: v_dual_add_f32 v13, v13, v29
-; GFX11-NEXT:    v_dual_add_f32 v15, v15, v17 :: v_dual_add_f32 v14, v14, v30
-; GFX11-NEXT:    v_add_f32_e32 v36, v48, v39
-; GFX11-NEXT:    v_dual_add_f32 v48, v64, v55 :: v_dual_add_f32 v37, v50, v49
-; GFX11-NEXT:    v_add_f32_e32 v50, v68, v67
-; GFX11-NEXT:    v_dual_add_f32 v38, v52, v51 :: v_dual_add_f32 v51, v70, v69
-; GFX11-NEXT:    v_dual_add_f32 v52, v80, v71 :: v_dual_add_f32 v39, v54, v53
-; GFX11-NEXT:    v_dual_add_f32 v53, v82, v81 :: v_dual_add_f32 v54, v84, v83
-; GFX11-NEXT:    v_add_f32_e32 v55, v86, v85
-; GFX11-NEXT:    v_add_f32_e32 v49, v66, v65
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_perm_b32 v3, v3, v52, 0x3020706
-; GFX11-NEXT:    v_perm_b32 v2, v2, v53, 0x3020706
-; GFX11-NEXT:    v_perm_b32 v1, v1, v54, 0x3020706
-; GFX11-NEXT:    v_perm_b32 v0, v0, v55, 0x3020706
-; GFX11-NEXT:    v_perm_b32 v4, v4, v51, 0x3020706
-; GFX11-NEXT:    v_perm_b32 v5, v5, v50, 0x3020706
-; GFX11-NEXT:    v_perm_b32 v6, v6, v49, 0x3020706
-; GFX11-NEXT:    v_perm_b32 v7, v7, v48, 0x3020706
-; GFX11-NEXT:    v_perm_b32 v8, v8, v39, 0x3020706
-; GFX11-NEXT:    v_perm_b32 v9, v9, v38, 0x3020706
-; GFX11-NEXT:    v_perm_b32 v10, v10, v37, 0x3020706
-; GFX11-NEXT:    v_perm_b32 v11, v11, v36, 0x3020706
-; GFX11-NEXT:    v_perm_b32 v13, v13, v34, 0x3020706
-; GFX11-NEXT:    v_perm_b32 v14, v14, v33, 0x3020706
-; GFX11-NEXT:    v_perm_b32 v15, v15, v16, 0x3020706
+; GFX11-NEXT:    v_lshlrev_b32_e32 v16, 16, v31
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_dual_add_f32 v16, v86, v16 :: v_dual_and_b32 v17, 0xffff0000, v31
+; GFX11-NEXT:    v_add_f32_e32 v15, v15, v17
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_perm_b32 v15, v15, v16, 0x7060302
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = fadd <32 x bfloat> %a, %b
   ret <32 x bfloat> %op
@@ -10595,34 +10416,34 @@ define bfloat @v_fadd_bf16_fpimm_0(bfloat %arg0) {
 ; GFX8-LABEL: v_fadd_bf16_fpimm_0:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX8-NEXT:    v_add_f32_e32 v0, 1.0, v0
-; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_fadd_bf16_fpimm_0:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX9-NEXT:    v_add_f32_e32 v0, 1.0, v0
-; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fadd_bf16_fpimm_0:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX10-NEXT:    v_add_f32_e32 v0, 1.0, v0
-; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fadd_bf16_fpimm_0:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_add_f32_e32 v0, 1.0, v0
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %add = fadd bfloat %arg0, 1.0
   ret bfloat %add
@@ -10648,34 +10469,34 @@ define bfloat @v_fadd_bf16_fpimm_1(bfloat %arg0) {
 ; GFX8-LABEL: v_fadd_bf16_fpimm_1:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX8-NEXT:    v_add_f32_e32 v0, 0x42280000, v0
-; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_fadd_bf16_fpimm_1:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX9-NEXT:    v_add_f32_e32 v0, 0x42280000, v0
-; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fadd_bf16_fpimm_1:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX10-NEXT:    v_add_f32_e32 v0, 0x42280000, v0
-; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fadd_bf16_fpimm_1:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_add_f32_e32 v0, 0x42280000, v0
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %add = fadd bfloat %arg0, 42.0
   ret bfloat %add
@@ -10703,38 +10524,38 @@ define bfloat @v_fsub_bf16(bfloat %a, bfloat %b) {
 ; GFX8-LABEL: v_fsub_bf16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX8-NEXT:    v_sub_f32_e32 v0, v0, v1
-; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_fsub_bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX9-NEXT:    v_sub_f32_e32 v0, v0, v1
-; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fsub_bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX10-NEXT:    v_sub_f32_e32 v0, v0, v1
-; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fsub_bf16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_sub_f32_e32 v0, v0, v1
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = fsub bfloat %a, %b
   ret bfloat %op
@@ -10770,53 +10591,53 @@ define <2 x bfloat> @v_fsub_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) {
 ; GFX8-LABEL: v_fsub_v2bf16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v1
-; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v0
-; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX8-NEXT:    v_sub_f32_e32 v2, v3, v2
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
+; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
+; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GFX8-NEXT:    v_sub_f32_e32 v0, v0, v1
-; GFX8-NEXT:    s_mov_b32 s4, 0x3020706
-; GFX8-NEXT:    v_perm_b32 v0, v0, v2, s4
+; GFX8-NEXT:    v_sub_f32_e32 v2, v3, v2
+; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX8-NEXT:    v_alignbit_b32 v0, v0, v2, 16
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_fsub_v2bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff0000, v1
-; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v0
-; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
+; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
+; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GFX9-NEXT:    v_sub_f32_e32 v2, v3, v2
 ; GFX9-NEXT:    v_sub_f32_e32 v0, v0, v1
-; GFX9-NEXT:    s_mov_b32 s4, 0x3020706
+; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
 ; GFX9-NEXT:    v_perm_b32 v0, v0, v2, s4
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fsub_v2bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v1
-; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v0
-; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
+; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
+; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GFX10-NEXT:    v_sub_f32_e32 v2, v3, v2
 ; GFX10-NEXT:    v_sub_f32_e32 v0, v0, v1
-; GFX10-NEXT:    v_perm_b32 v0, v0, v2, 0x3020706
+; GFX10-NEXT:    v_perm_b32 v0, v0, v2, 0x7060302
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fsub_v2bf16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v1
-; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v0
-; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX11-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; GFX11-NEXT:    v_sub_f32_e32 v2, v3, v2
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_perm_b32 v0, v0, v2, 0x3020706
+; GFX11-NEXT:    v_perm_b32 v0, v0, v2, 0x7060302
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = fsub <2 x bfloat> %a, %b
   ret <2 x bfloat> %op
@@ -10860,69 +10681,82 @@ define <3 x bfloat> @v_fsub_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) {
 ; GFX8-LABEL: v_fsub_v3bf16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX8-NEXT:    v_sub_f32_e32 v1, v1, v3
-; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v2
-; GFX8-NEXT:    v_and_b32_e32 v4, 0xffff0000, v0
-; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX8-NEXT:    v_sub_f32_e32 v3, v4, v3
+; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
+; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GFX8-NEXT:    v_sub_f32_e32 v0, v0, v2
-; GFX8-NEXT:    s_mov_b32 s4, 0x3020706
+; GFX8-NEXT:    v_sub_f32_e32 v4, v5, v4
+; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX8-NEXT:    v_alignbit_b32 v0, v0, v4, 16
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
+; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v1
+; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX8-NEXT:    v_sub_f32_e32 v1, v1, v3
+; GFX8-NEXT:    v_sub_f32_e32 v2, v4, v2
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX8-NEXT:    v_perm_b32 v0, v0, v3, s4
+; GFX8-NEXT:    v_alignbit_b32 v1, v1, v2, 16
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_fsub_v3bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v4, 0xffff0000, v2
-; GFX9-NEXT:    v_and_b32_e32 v5, 0xffff0000, v0
-; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
+; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GFX9-NEXT:    v_sub_f32_e32 v4, v5, v4
-; GFX9-NEXT:    v_sub_f32_e32 v1, v1, v3
 ; GFX9-NEXT:    v_sub_f32_e32 v0, v0, v2
-; GFX9-NEXT:    s_mov_b32 s4, 0x3020706
-; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
 ; GFX9-NEXT:    v_perm_b32 v0, v0, v4, s4
+; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
+; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 16, v1
+; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX9-NEXT:    v_sub_f32_e32 v2, v4, v2
+; GFX9-NEXT:    v_sub_f32_e32 v1, v1, v3
+; GFX9-NEXT:    v_perm_b32 v1, v1, v2, s4
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fsub_v3bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_and_b32_e32 v4, 0xffff0000, v2
-; GFX10-NEXT:    v_and_b32_e32 v5, 0xffff0000, v0
-; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
+; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 16, v3
+; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 16, v1
+; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX10-NEXT:    v_sub_f32_e32 v4, v5, v4
 ; GFX10-NEXT:    v_sub_f32_e32 v0, v0, v2
+; GFX10-NEXT:    v_sub_f32_e32 v2, v7, v6
 ; GFX10-NEXT:    v_sub_f32_e32 v1, v1, v3
-; GFX10-NEXT:    v_perm_b32 v0, v0, v4, 0x3020706
-; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX10-NEXT:    v_perm_b32 v0, v0, v4, 0x7060302
+; GFX10-NEXT:    v_perm_b32 v1, v1, v2, 0x7060302
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fsub_v3bf16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xffff0000, v2
-; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff0000, v0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v6, 16, v3
+; GFX11-NEXT:    v_lshlrev_b32_e32 v7, 16, v1
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX11-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_dual_sub_f32 v1, v1, v3 :: v_dual_lshlrev_b32 v0, 16, v0
+; GFX11-NEXT:    v_dual_sub_f32 v1, v1, v3 :: v_dual_and_b32 v0, 0xffff0000, v0
 ; GFX11-NEXT:    v_sub_f32_e32 v0, v0, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_sub_f32_e32 v4, v5, v4
-; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    v_sub_f32_e32 v2, v7, v6
+; GFX11-NEXT:    v_perm_b32 v0, v0, v4, 0x7060302
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT:    v_perm_b32 v0, v0, v4, 0x3020706
+; GFX11-NEXT:    v_perm_b32 v1, v1, v2, 0x7060302
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = fsub <3 x bfloat> %a, %b
   ret <3 x bfloat> %op
@@ -10974,85 +10808,82 @@ define <4 x bfloat> @v_fsub_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
 ; GFX8-LABEL: v_fsub_v4bf16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_and_b32_e32 v4, 0xffff0000, v3
-; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v1
-; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
+; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    v_sub_f32_e32 v0, v0, v2
 ; GFX8-NEXT:    v_sub_f32_e32 v4, v5, v4
+; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX8-NEXT:    v_alignbit_b32 v0, v0, v4, 16
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
+; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v1
+; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX8-NEXT:    v_sub_f32_e32 v1, v1, v3
-; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v2
-; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v0
-; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX8-NEXT:    v_sub_f32_e32 v3, v5, v3
-; GFX8-NEXT:    v_sub_f32_e32 v0, v0, v2
-; GFX8-NEXT:    s_mov_b32 s4, 0x3020706
-; GFX8-NEXT:    v_perm_b32 v0, v0, v3, s4
-; GFX8-NEXT:    v_perm_b32 v1, v1, v4, s4
+; GFX8-NEXT:    v_sub_f32_e32 v2, v4, v2
+; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX8-NEXT:    v_alignbit_b32 v1, v1, v2, 16
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_fsub_v4bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
-; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
-; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
-; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
+; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GFX9-NEXT:    v_sub_f32_e32 v4, v5, v4
-; GFX9-NEXT:    v_sub_f32_e32 v1, v1, v3
-; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v2
-; GFX9-NEXT:    v_and_b32_e32 v5, 0xffff0000, v0
-; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX9-NEXT:    v_sub_f32_e32 v3, v5, v3
 ; GFX9-NEXT:    v_sub_f32_e32 v0, v0, v2
-; GFX9-NEXT:    s_mov_b32 s4, 0x3020706
+; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
+; GFX9-NEXT:    v_perm_b32 v0, v0, v4, s4
+; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
+; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 16, v1
+; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
 ; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX9-NEXT:    v_perm_b32 v0, v0, v3, s4
-; GFX9-NEXT:    v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_sub_f32_e32 v2, v4, v2
+; GFX9-NEXT:    v_sub_f32_e32 v1, v1, v3
+; GFX9-NEXT:    v_perm_b32 v1, v1, v2, s4
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fsub_v4bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
-; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
+; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
+; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 16, v3
+; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 16, v1
 ; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
 ; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX10-NEXT:    v_and_b32_e32 v6, 0xffff0000, v2
-; GFX10-NEXT:    v_and_b32_e32 v7, 0xffff0000, v0
-; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX10-NEXT:    v_sub_f32_e32 v1, v1, v3
-; GFX10-NEXT:    v_sub_f32_e32 v3, v5, v4
-; GFX10-NEXT:    v_sub_f32_e32 v4, v7, v6
+; GFX10-NEXT:    v_sub_f32_e32 v4, v5, v4
 ; GFX10-NEXT:    v_sub_f32_e32 v0, v0, v2
-; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX10-NEXT:    v_perm_b32 v0, v0, v4, 0x3020706
-; GFX10-NEXT:    v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT:    v_sub_f32_e32 v2, v7, v6
+; GFX10-NEXT:    v_sub_f32_e32 v1, v1, v3
+; GFX10-NEXT:    v_perm_b32 v0, v0, v4, 0x7060302
+; GFX10-NEXT:    v_perm_b32 v1, v1, v2, 0x7060302
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fsub_v4bf16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xffff0000, v2
-; GFX11-NEXT:    v_and_b32_e32 v7, 0xffff0000, v0
-; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
-; GFX11-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_dual_sub_f32 v0, v0, v2 :: v_dual_and_b32 v1, 0xffff0000, v1
+; GFX11-NEXT:    v_lshlrev_b32_e32 v6, 16, v3
+; GFX11-NEXT:    v_lshlrev_b32_e32 v7, 16, v1
 ; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
-; GFX11-NEXT:    v_sub_f32_e32 v1, v1, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_dual_sub_f32 v3, v7, v6 :: v_dual_sub_f32 v4, v5, v4
 ; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_perm_b32 v0, v0, v3, 0x3020706
-; GFX11-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_or_b32_e32 v1, v4, v1
+; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX11-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_dual_sub_f32 v1, v1, v3 :: v_dual_and_b32 v0, 0xffff0000, v0
+; GFX11-NEXT:    v_sub_f32_e32 v0, v0, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_sub_f32_e32 v4, v5, v4
+; GFX11-NEXT:    v_sub_f32_e32 v2, v7, v6
+; GFX11-NEXT:    v_perm_b32 v0, v0, v4, 0x7060302
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT:    v_perm_b32 v1, v1, v2, 0x7060302
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = fsub <4 x bfloat> %a, %b
   ret <4 x bfloat> %op
@@ -11080,38 +10911,38 @@ define bfloat @v_fmul_bf16(bfloat %a, bfloat %b) {
 ; GFX8-LABEL: v_fmul_bf16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX8-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_fmul_bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fmul_bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX10-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fmul_bf16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = fmul bfloat %a, %b
   ret bfloat %op
@@ -11147,53 +10978,53 @@ define <2 x bfloat> @v_fmul_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) {
 ; GFX8-LABEL: v_fmul_v2bf16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v1
-; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v0
-; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX8-NEXT:    v_mul_f32_e32 v2, v3, v2
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
+; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
+; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GFX8-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX8-NEXT:    s_mov_b32 s4, 0x3020706
-; GFX8-NEXT:    v_perm_b32 v0, v0, v2, s4
+; GFX8-NEXT:    v_mul_f32_e32 v2, v3, v2
+; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX8-NEXT:    v_alignbit_b32 v0, v0, v2, 16
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_fmul_v2bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff0000, v1
-; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v0
-; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
+; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
+; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GFX9-NEXT:    v_mul_f32_e32 v2, v3, v2
 ; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX9-NEXT:    s_mov_b32 s4, 0x3020706
+; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
 ; GFX9-NEXT:    v_perm_b32 v0, v0, v2, s4
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fmul_v2bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v1
-; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v0
-; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
+; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
+; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GFX10-NEXT:    v_mul_f32_e32 v2, v3, v2
 ; GFX10-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX10-NEXT:    v_perm_b32 v0, v0, v2, 0x3020706
+; GFX10-NEXT:    v_perm_b32 v0, v0, v2, 0x7060302
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fmul_v2bf16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v1
-; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v0
-; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX11-NEXT:    v_mul_f32_e32 v0, v0, v1
 ; GFX11-NEXT:    v_mul_f32_e32 v2, v3, v2
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_perm_b32 v0, v0, v2, 0x3020706
+; GFX11-NEXT:    v_perm_b32 v0, v0, v2, 0x7060302
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = fmul <2 x bfloat> %a, %b
   ret <2 x bfloat> %op
@@ -11237,69 +11068,82 @@ define <3 x bfloat> @v_fmul_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) {
 ; GFX8-LABEL: v_fmul_v3bf16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX8-NEXT:    v_mul_f32_e32 v1, v1, v3
-; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v2
-; GFX8-NEXT:    v_and_b32_e32 v4, 0xffff0000, v0
-; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX8-NEXT:    v_mul_f32_e32 v3, v4, v3
+; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
+; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GFX8-NEXT:    v_mul_f32_e32 v0, v0, v2
-; GFX8-NEXT:    s_mov_b32 s4, 0x3020706
+; GFX8-NEXT:    v_mul_f32_e32 v4, v5, v4
+; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX8-NEXT:    v_alignbit_b32 v0, v0, v4, 16
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
+; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v1
+; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX8-NEXT:    v_mul_f32_e32 v1, v1, v3
+; GFX8-NEXT:    v_mul_f32_e32 v2, v4, v2
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX8-NEXT:    v_perm_b32 v0, v0, v3, s4
+; GFX8-NEXT:    v_alignbit_b32 v1, v1, v2, 16
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_fmul_v3bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v4, 0xffff0000, v2
-; GFX9-NEXT:    v_and_b32_e32 v5, 0xffff0000, v0
-; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
+; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GFX9-NEXT:    v_mul_f32_e32 v4, v5, v4
-; GFX9-NEXT:    v_mul_f32_e32 v1, v1, v3
 ; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v2
-; GFX9-NEXT:    s_mov_b32 s4, 0x3020706
-; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
 ; GFX9-NEXT:    v_perm_b32 v0, v0, v4, s4
+; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
+; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 16, v1
+; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX9-NEXT:    v_mul_f32_e32 v2, v4, v2
+; GFX9-NEXT:    v_mul_f32_e32 v1, v1, v3
+; GFX9-NEXT:    v_perm_b32 v1, v1, v2, s4
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fmul_v3bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_and_b32_e32 v4, 0xffff0000, v2
-; GFX10-NEXT:    v_and_b32_e32 v5, 0xffff0000, v0
-; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
+; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 16, v3
+; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 16, v1
+; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX10-NEXT:    v_mul_f32_e32 v4, v5, v4
 ; GFX10-NEXT:    v_mul_f32_e32 v0, v0, v2
+; GFX10-NEXT:    v_mul_f32_e32 v2, v7, v6
 ; GFX10-NEXT:    v_mul_f32_e32 v1, v1, v3
-; GFX10-NEXT:    v_perm_b32 v0, v0, v4, 0x3020706
-; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX10-NEXT:    v_perm_b32 v0, v0, v4, 0x7060302
+; GFX10-NEXT:    v_perm_b32 v1, v1, v2, 0x7060302
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fmul_v3bf16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xffff0000, v2
-; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff0000, v0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v6, 16, v3
+; GFX11-NEXT:    v_lshlrev_b32_e32 v7, 16, v1
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX11-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_dual_mul_f32 v1, v1, v3 :: v_dual_lshlrev_b32 v0, 16, v0
+; GFX11-NEXT:    v_dual_mul_f32 v1, v1, v3 :: v_dual_and_b32 v0, 0xffff0000, v0
 ; GFX11-NEXT:    v_mul_f32_e32 v0, v0, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_mul_f32_e32 v4, v5, v4
-; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    v_mul_f32_e32 v2, v7, v6
+; GFX11-NEXT:    v_perm_b32 v0, v0, v4, 0x7060302
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT:    v_perm_b32 v0, v0, v4, 0x3020706
+; GFX11-NEXT:    v_perm_b32 v1, v1, v2, 0x7060302
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = fmul <3 x bfloat> %a, %b
   ret <3 x bfloat> %op
@@ -11351,85 +11195,82 @@ define <4 x bfloat> @v_fmul_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
 ; GFX8-LABEL: v_fmul_v4bf16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_and_b32_e32 v4, 0xffff0000, v3
-; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v1
-; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
+; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    v_mul_f32_e32 v0, v0, v2
 ; GFX8-NEXT:    v_mul_f32_e32 v4, v5, v4
+; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX8-NEXT:    v_alignbit_b32 v0, v0, v4, 16
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
+; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v1
+; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX8-NEXT:    v_mul_f32_e32 v1, v1, v3
-; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v2
-; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v0
-; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX8-NEXT:    v_mul_f32_e32 v3, v5, v3
-; GFX8-NEXT:    v_mul_f32_e32 v0, v0, v2
-; GFX8-NEXT:    s_mov_b32 s4, 0x3020706
-; GFX8-NEXT:    v_perm_b32 v0, v0, v3, s4
-; GFX8-NEXT:    v_perm_b32 v1, v1, v4, s4
+; GFX8-NEXT:    v_mul_f32_e32 v2, v4, v2
+; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX8-NEXT:    v_alignbit_b32 v1, v1, v2, 16
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_fmul_v4bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
-; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
-; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
-; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
+; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GFX9-NEXT:    v_mul_f32_e32 v4, v5, v4
-; GFX9-NEXT:    v_mul_f32_e32 v1, v1, v3
-; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v2
-; GFX9-NEXT:    v_and_b32_e32 v5, 0xffff0000, v0
-; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX9-NEXT:    v_mul_f32_e32 v3, v5, v3
 ; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v2
-; GFX9-NEXT:    s_mov_b32 s4, 0x3020706
+; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
+; GFX9-NEXT:    v_perm_b32 v0, v0, v4, s4
+; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
+; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 16, v1
+; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
 ; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX9-NEXT:    v_perm_b32 v0, v0, v3, s4
-; GFX9-NEXT:    v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_mul_f32_e32 v2, v4, v2
+; GFX9-NEXT:    v_mul_f32_e32 v1, v1, v3
+; GFX9-NEXT:    v_perm_b32 v1, v1, v2, s4
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fmul_v4bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
-; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
+; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
+; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 16, v3
+; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 16, v1
 ; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
 ; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX10-NEXT:    v_and_b32_e32 v6, 0xffff0000, v2
-; GFX10-NEXT:    v_and_b32_e32 v7, 0xffff0000, v0
-; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX10-NEXT:    v_mul_f32_e32 v1, v1, v3
-; GFX10-NEXT:    v_mul_f32_e32 v3, v5, v4
-; GFX10-NEXT:    v_mul_f32_e32 v4, v7, v6
+; GFX10-NEXT:    v_mul_f32_e32 v4, v5, v4
 ; GFX10-NEXT:    v_mul_f32_e32 v0, v0, v2
-; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX10-NEXT:    v_perm_b32 v0, v0, v4, 0x3020706
-; GFX10-NEXT:    v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT:    v_mul_f32_e32 v2, v7, v6
+; GFX10-NEXT:    v_mul_f32_e32 v1, v1, v3
+; GFX10-NEXT:    v_perm_b32 v0, v0, v4, 0x7060302
+; GFX10-NEXT:    v_perm_b32 v1, v1, v2, 0x7060302
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fmul_v4bf16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xffff0000, v2
-; GFX11-NEXT:    v_and_b32_e32 v7, 0xffff0000, v0
-; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
-; GFX11-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_dual_mul_f32 v0, v0, v2 :: v_dual_and_b32 v1, 0xffff0000, v1
+; GFX11-NEXT:    v_lshlrev_b32_e32 v6, 16, v3
+; GFX11-NEXT:    v_lshlrev_b32_e32 v7, 16, v1
 ; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
-; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_dual_mul_f32 v3, v7, v6 :: v_dual_mul_f32 v4, v5, v4
 ; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_perm_b32 v0, v0, v3, 0x3020706
-; GFX11-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_or_b32_e32 v1, v4, v1
+; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX11-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_dual_mul_f32 v1, v1, v3 :: v_dual_and_b32 v0, 0xffff0000, v0
+; GFX11-NEXT:    v_mul_f32_e32 v0, v0, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_mul_f32_e32 v4, v5, v4
+; GFX11-NEXT:    v_mul_f32_e32 v2, v7, v6
+; GFX11-NEXT:    v_perm_b32 v0, v0, v4, 0x7060302
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT:    v_perm_b32 v1, v1, v2, 0x7060302
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = fmul <4 x bfloat> %a, %b
   ret <4 x bfloat> %op
@@ -11513,143 +11354,138 @@ define <8 x bfloat> @v_fmul_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
 ; GFX8-LABEL: v_fmul_v8bf16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_and_b32_e32 v8, 0xffff0000, v7
-; GFX8-NEXT:    v_and_b32_e32 v9, 0xffff0000, v3
-; GFX8-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
-; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX8-NEXT:    v_lshlrev_b32_e32 v8, 16, v4
+; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 16, v0
+; GFX8-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    v_mul_f32_e32 v0, v0, v4
 ; GFX8-NEXT:    v_mul_f32_e32 v8, v9, v8
-; GFX8-NEXT:    v_mul_f32_e32 v3, v3, v7
-; GFX8-NEXT:    v_and_b32_e32 v7, 0xffff0000, v6
-; GFX8-NEXT:    v_and_b32_e32 v9, 0xffff0000, v2
-; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
-; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX8-NEXT:    v_mul_f32_e32 v7, v9, v7
-; GFX8-NEXT:    v_mul_f32_e32 v2, v2, v6
-; GFX8-NEXT:    v_and_b32_e32 v6, 0xffff0000, v5
-; GFX8-NEXT:    v_and_b32_e32 v9, 0xffff0000, v1
-; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX8-NEXT:    v_mul_f32_e32 v6, v9, v6
+; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX8-NEXT:    v_alignbit_b32 v0, v0, v8, 16
+; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v5
+; GFX8-NEXT:    v_lshlrev_b32_e32 v8, 16, v1
+; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
+; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX8-NEXT:    v_mul_f32_e32 v1, v1, v5
-; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v4
-; GFX8-NEXT:    v_and_b32_e32 v9, 0xffff0000, v0
-; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX8-NEXT:    v_mul_f32_e32 v5, v9, v5
-; GFX8-NEXT:    v_mul_f32_e32 v0, v0, v4
-; GFX8-NEXT:    s_mov_b32 s4, 0x3020706
-; GFX8-NEXT:    v_perm_b32 v0, v0, v5, s4
-; GFX8-NEXT:    v_perm_b32 v1, v1, v6, s4
-; GFX8-NEXT:    v_perm_b32 v2, v2, v7, s4
-; GFX8-NEXT:    v_perm_b32 v3, v3, v8, s4
+; GFX8-NEXT:    v_mul_f32_e32 v4, v8, v4
+; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX8-NEXT:    v_alignbit_b32 v1, v1, v4, 16
+; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v6
+; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v2
+; GFX8-NEXT:    v_mul_f32_e32 v4, v5, v4
+; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
+; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX8-NEXT:    v_mul_f32_e32 v2, v2, v5
+; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX8-NEXT:    v_alignbit_b32 v2, v2, v4, 16
+; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v7
+; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v3
+; GFX8-NEXT:    v_mul_f32_e32 v4, v5, v4
+; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v7
+; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX8-NEXT:    v_mul_f32_e32 v3, v3, v5
+; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX8-NEXT:    v_alignbit_b32 v3, v3, v4, 16
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_fmul_v8bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v8, 0xffff0000, v7
-; GFX9-NEXT:    v_and_b32_e32 v9, 0xffff0000, v3
+; GFX9-NEXT:    v_lshlrev_b32_e32 v8, 16, v4
+; GFX9-NEXT:    v_lshlrev_b32_e32 v9, 16, v0
+; GFX9-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GFX9-NEXT:    v_mul_f32_e32 v8, v9, v8
-; GFX9-NEXT:    v_and_b32_e32 v9, 0xffff0000, v5
-; GFX9-NEXT:    v_and_b32_e32 v10, 0xffff0000, v1
-; GFX9-NEXT:    v_mul_f32_e32 v9, v10, v9
-; GFX9-NEXT:    v_and_b32_e32 v10, 0xffff0000, v6
-; GFX9-NEXT:    v_and_b32_e32 v11, 0xffff0000, v2
-; GFX9-NEXT:    v_mul_f32_e32 v10, v11, v10
-; GFX9-NEXT:    v_and_b32_e32 v11, 0xffff0000, v4
-; GFX9-NEXT:    v_and_b32_e32 v12, 0xffff0000, v0
-; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX9-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
-; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX9-NEXT:    v_mul_f32_e32 v1, v1, v5
-; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 16, v6
-; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v4
-; GFX9-NEXT:    v_and_b32_e32 v4, 0xffff0000, v9
-; GFX9-NEXT:    v_mul_f32_e32 v11, v12, v11
-; GFX9-NEXT:    v_mul_f32_e32 v3, v3, v7
+; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
+; GFX9-NEXT:    v_perm_b32 v0, v0, v8, s4
+; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 16, v5
+; GFX9-NEXT:    v_lshlrev_b32_e32 v8, 16, v1
+; GFX9-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
+; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX9-NEXT:    v_mul_f32_e32 v4, v8, v4
+; GFX9-NEXT:    v_mul_f32_e32 v1, v1, v5
+; GFX9-NEXT:    v_perm_b32 v1, v1, v4, s4
+; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 16, v6
+; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 16, v2
+; GFX9-NEXT:    v_mul_f32_e32 v4, v5, v4
+; GFX9-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
+; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX9-NEXT:    v_mul_f32_e32 v2, v2, v5
-; GFX9-NEXT:    s_mov_b32 s4, 0x3020706
-; GFX9-NEXT:    v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-NEXT:    v_and_b32_e32 v4, 0xffff0000, v8
-; GFX9-NEXT:    v_perm_b32 v0, v0, v11, s4
-; GFX9-NEXT:    v_perm_b32 v2, v2, v10, s4
-; GFX9-NEXT:    v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_perm_b32 v2, v2, v4, s4
+; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 16, v7
+; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 16, v3
+; GFX9-NEXT:    v_mul_f32_e32 v4, v5, v4
+; GFX9-NEXT:    v_and_b32_e32 v5, 0xffff0000, v7
+; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX9-NEXT:    v_mul_f32_e32 v3, v3, v5
+; GFX9-NEXT:    v_perm_b32 v3, v3, v4, s4
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fmul_v8bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_and_b32_e32 v8, 0xffff0000, v7
-; GFX10-NEXT:    v_and_b32_e32 v9, 0xffff0000, v3
-; GFX10-NEXT:    v_and_b32_e32 v10, 0xffff0000, v5
-; GFX10-NEXT:    v_and_b32_e32 v11, 0xffff0000, v1
-; GFX10-NEXT:    v_and_b32_e32 v12, 0xffff0000, v6
-; GFX10-NEXT:    v_and_b32_e32 v13, 0xffff0000, v2
-; GFX10-NEXT:    v_and_b32_e32 v14, 0xffff0000, v4
-; GFX10-NEXT:    v_and_b32_e32 v15, 0xffff0000, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v8, 16, v4
+; GFX10-NEXT:    v_lshlrev_b32_e32 v9, 16, v0
+; GFX10-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v10, 16, v5
+; GFX10-NEXT:    v_lshlrev_b32_e32 v11, 16, v1
 ; GFX10-NEXT:    v_mul_f32_e32 v8, v9, v8
-; GFX10-NEXT:    v_mul_f32_e32 v9, v11, v10
-; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
-; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
-; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX10-NEXT:    v_mul_f32_e32 v10, v13, v12
-; GFX10-NEXT:    v_mul_f32_e32 v11, v15, v14
-; GFX10-NEXT:    v_mul_f32_e32 v3, v3, v7
-; GFX10-NEXT:    v_mul_f32_e32 v1, v1, v5
+; GFX10-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
 ; GFX10-NEXT:    v_mul_f32_e32 v0, v0, v4
-; GFX10-NEXT:    v_and_b32_e32 v4, 0xffff0000, v9
+; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX10-NEXT:    v_mul_f32_e32 v4, v11, v10
+; GFX10-NEXT:    v_lshlrev_b32_e32 v9, 16, v6
+; GFX10-NEXT:    v_lshlrev_b32_e32 v10, 16, v2
+; GFX10-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
+; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX10-NEXT:    v_lshlrev_b32_e32 v11, 16, v7
+; GFX10-NEXT:    v_lshlrev_b32_e32 v12, 16, v3
+; GFX10-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
+; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX10-NEXT:    v_mul_f32_e32 v1, v1, v5
+; GFX10-NEXT:    v_mul_f32_e32 v5, v10, v9
 ; GFX10-NEXT:    v_mul_f32_e32 v2, v2, v6
-; GFX10-NEXT:    v_and_b32_e32 v5, 0xffff0000, v8
-; GFX10-NEXT:    v_perm_b32 v0, v0, v11, 0x3020706
-; GFX10-NEXT:    v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX10-NEXT:    v_perm_b32 v2, v2, v10, 0x3020706
-; GFX10-NEXT:    v_or_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT:    v_mul_f32_e32 v6, v12, v11
+; GFX10-NEXT:    v_mul_f32_e32 v3, v3, v7
+; GFX10-NEXT:    v_perm_b32 v0, v0, v8, 0x7060302
+; GFX10-NEXT:    v_perm_b32 v1, v1, v4, 0x7060302
+; GFX10-NEXT:    v_perm_b32 v2, v2, v5, 0x7060302
+; GFX10-NEXT:    v_perm_b32 v3, v3, v6, 0x7060302
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fmul_v8bf16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_and_b32_e32 v13, 0xffff0000, v4
-; GFX11-NEXT:    v_and_b32_e32 v14, 0xffff0000, v0
-; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX11-NEXT:    v_and_b32_e32 v9, 0xffff0000, v3
-; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-NEXT:    v_and_b32_e32 v10, 0xffff0000, v1
-; GFX11-NEXT:    v_and_b32_e32 v8, 0xffff0000, v7
-; GFX11-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
-; GFX11-NEXT:    v_dual_mul_f32 v0, v0, v4 :: v_dual_lshlrev_b32 v1, 16, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_dual_mul_f32 v8, v9, v8 :: v_dual_and_b32 v9, 0xffff0000, v5
-; GFX11-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX11-NEXT:    v_lshlrev_b32_e32 v10, 16, v5
+; GFX11-NEXT:    v_lshlrev_b32_e32 v11, 16, v1
+; GFX11-NEXT:    v_lshlrev_b32_e32 v9, 16, v0
+; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
+; GFX11-NEXT:    v_lshlrev_b32_e32 v12, 16, v3
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX11-NEXT:    v_lshlrev_b32_e32 v8, 16, v4
+; GFX11-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_dual_mul_f32 v1, v1, v5 :: v_dual_and_b32 v0, 0xffff0000, v0
+; GFX11-NEXT:    v_mul_f32_e32 v0, v0, v4
+; GFX11-NEXT:    v_mul_f32_e32 v4, v11, v10
+; GFX11-NEXT:    v_dual_mul_f32 v8, v9, v8 :: v_dual_lshlrev_b32 v9, 16, v6
+; GFX11-NEXT:    v_lshlrev_b32_e32 v10, 16, v2
+; GFX11-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX11-NEXT:    v_lshlrev_b32_e32 v11, 16, v7
+; GFX11-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
+; GFX11-NEXT:    v_mul_f32_e32 v5, v10, v9
+; GFX11-NEXT:    v_perm_b32 v0, v0, v8, 0x7060302
+; GFX11-NEXT:    v_mul_f32_e32 v2, v2, v6
+; GFX11-NEXT:    v_mul_f32_e32 v6, v12, v11
 ; GFX11-NEXT:    v_mul_f32_e32 v3, v3, v7
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_mul_f32_e32 v9, v10, v9
-; GFX11-NEXT:    v_dual_mul_f32 v1, v1, v5 :: v_dual_and_b32 v12, 0xffff0000, v2
-; GFX11-NEXT:    v_lshlrev_b32_e32 v5, 16, v6
-; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-NEXT:    v_and_b32_e32 v11, 0xffff0000, v6
-; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
-; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xffff0000, v9
-; GFX11-NEXT:    v_mul_f32_e32 v2, v2, v5
-; GFX11-NEXT:    v_dual_mul_f32 v10, v12, v11 :: v_dual_mul_f32 v11, v14, v13
-; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff0000, v8
+; GFX11-NEXT:    v_perm_b32 v1, v1, v4, 0x7060302
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_or_b32_e32 v1, v1, v4
-; GFX11-NEXT:    v_perm_b32 v2, v2, v10, 0x3020706
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_perm_b32 v0, v0, v11, 0x3020706
-; GFX11-NEXT:    v_or_b32_e32 v3, v3, v5
+; GFX11-NEXT:    v_perm_b32 v2, v2, v5, 0x7060302
+; GFX11-NEXT:    v_perm_b32 v3, v3, v6, 0x7060302
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = fmul <8 x bfloat> %a, %b
   ret <8 x bfloat> %op
@@ -11801,244 +11637,252 @@ define <16 x bfloat> @v_fmul_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
 ; GFX8-LABEL: v_fmul_v16bf16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_and_b32_e32 v16, 0xffff0000, v15
-; GFX8-NEXT:    v_and_b32_e32 v17, 0xffff0000, v7
-; GFX8-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
-; GFX8-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
+; GFX8-NEXT:    v_lshlrev_b32_e32 v16, 16, v8
+; GFX8-NEXT:    v_lshlrev_b32_e32 v17, 16, v0
+; GFX8-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
+; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    v_mul_f32_e32 v0, v0, v8
 ; GFX8-NEXT:    v_mul_f32_e32 v16, v17, v16
-; GFX8-NEXT:    v_mul_f32_e32 v7, v7, v15
-; GFX8-NEXT:    v_and_b32_e32 v15, 0xffff0000, v14
-; GFX8-NEXT:    v_and_b32_e32 v17, 0xffff0000, v6
-; GFX8-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
-; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
-; GFX8-NEXT:    v_mul_f32_e32 v15, v17, v15
-; GFX8-NEXT:    v_mul_f32_e32 v6, v6, v14
-; GFX8-NEXT:    v_and_b32_e32 v14, 0xffff0000, v13
-; GFX8-NEXT:    v_and_b32_e32 v17, 0xffff0000, v5
-; GFX8-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
-; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX8-NEXT:    v_mul_f32_e32 v14, v17, v14
-; GFX8-NEXT:    v_mul_f32_e32 v5, v5, v13
-; GFX8-NEXT:    v_and_b32_e32 v13, 0xffff0000, v12
-; GFX8-NEXT:    v_and_b32_e32 v17, 0xffff0000, v4
-; GFX8-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
-; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX8-NEXT:    v_mul_f32_e32 v13, v17, v13
-; GFX8-NEXT:    v_mul_f32_e32 v4, v4, v12
-; GFX8-NEXT:    v_and_b32_e32 v12, 0xffff0000, v11
-; GFX8-NEXT:    v_and_b32_e32 v17, 0xffff0000, v3
-; GFX8-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
-; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX8-NEXT:    v_mul_f32_e32 v12, v17, v12
-; GFX8-NEXT:    v_mul_f32_e32 v3, v3, v11
-; GFX8-NEXT:    v_and_b32_e32 v11, 0xffff0000, v10
-; GFX8-NEXT:    v_and_b32_e32 v17, 0xffff0000, v2
-; GFX8-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
-; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX8-NEXT:    v_mul_f32_e32 v11, v17, v11
-; GFX8-NEXT:    v_mul_f32_e32 v2, v2, v10
-; GFX8-NEXT:    v_and_b32_e32 v10, 0xffff0000, v9
-; GFX8-NEXT:    v_and_b32_e32 v17, 0xffff0000, v1
-; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
-; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX8-NEXT:    v_mul_f32_e32 v10, v17, v10
+; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX8-NEXT:    v_alignbit_b32 v0, v0, v16, 16
+; GFX8-NEXT:    v_lshlrev_b32_e32 v8, 16, v9
+; GFX8-NEXT:    v_lshlrev_b32_e32 v16, 16, v1
+; GFX8-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
+; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX8-NEXT:    v_mul_f32_e32 v1, v1, v9
-; GFX8-NEXT:    v_and_b32_e32 v9, 0xffff0000, v8
-; GFX8-NEXT:    v_and_b32_e32 v17, 0xffff0000, v0
-; GFX8-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
-; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX8-NEXT:    v_mul_f32_e32 v9, v17, v9
-; GFX8-NEXT:    v_mul_f32_e32 v0, v0, v8
-; GFX8-NEXT:    s_mov_b32 s4, 0x3020706
-; GFX8-NEXT:    v_perm_b32 v0, v0, v9, s4
-; GFX8-NEXT:    v_perm_b32 v1, v1, v10, s4
-; GFX8-NEXT:    v_perm_b32 v2, v2, v11, s4
-; GFX8-NEXT:    v_perm_b32 v3, v3, v12, s4
-; GFX8-NEXT:    v_perm_b32 v4, v4, v13, s4
-; GFX8-NEXT:    v_perm_b32 v5, v5, v14, s4
-; GFX8-NEXT:    v_perm_b32 v6, v6, v15, s4
-; GFX8-NEXT:    v_perm_b32 v7, v7, v16, s4
+; GFX8-NEXT:    v_mul_f32_e32 v8, v16, v8
+; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX8-NEXT:    v_alignbit_b32 v1, v1, v8, 16
+; GFX8-NEXT:    v_lshlrev_b32_e32 v8, 16, v10
+; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 16, v2
+; GFX8-NEXT:    v_mul_f32_e32 v8, v9, v8
+; GFX8-NEXT:    v_and_b32_e32 v9, 0xffff0000, v10
+; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX8-NEXT:    v_mul_f32_e32 v2, v2, v9
+; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX8-NEXT:    v_alignbit_b32 v2, v2, v8, 16
+; GFX8-NEXT:    v_lshlrev_b32_e32 v8, 16, v11
+; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 16, v3
+; GFX8-NEXT:    v_mul_f32_e32 v8, v9, v8
+; GFX8-NEXT:    v_and_b32_e32 v9, 0xffff0000, v11
+; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX8-NEXT:    v_mul_f32_e32 v3, v3, v9
+; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX8-NEXT:    v_alignbit_b32 v3, v3, v8, 16
+; GFX8-NEXT:    v_lshlrev_b32_e32 v8, 16, v12
+; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 16, v4
+; GFX8-NEXT:    v_mul_f32_e32 v8, v9, v8
+; GFX8-NEXT:    v_and_b32_e32 v9, 0xffff0000, v12
+; GFX8-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GFX8-NEXT:    v_mul_f32_e32 v4, v4, v9
+; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
+; GFX8-NEXT:    v_alignbit_b32 v4, v4, v8, 16
+; GFX8-NEXT:    v_lshlrev_b32_e32 v8, 16, v13
+; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 16, v5
+; GFX8-NEXT:    v_mul_f32_e32 v8, v9, v8
+; GFX8-NEXT:    v_and_b32_e32 v9, 0xffff0000, v13
+; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
+; GFX8-NEXT:    v_mul_f32_e32 v5, v5, v9
+; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
+; GFX8-NEXT:    v_alignbit_b32 v5, v5, v8, 16
+; GFX8-NEXT:    v_lshlrev_b32_e32 v8, 16, v14
+; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 16, v6
+; GFX8-NEXT:    v_mul_f32_e32 v8, v9, v8
+; GFX8-NEXT:    v_and_b32_e32 v9, 0xffff0000, v14
+; GFX8-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
+; GFX8-NEXT:    v_mul_f32_e32 v6, v6, v9
+; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
+; GFX8-NEXT:    v_alignbit_b32 v6, v6, v8, 16
+; GFX8-NEXT:    v_lshlrev_b32_e32 v8, 16, v15
+; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 16, v7
+; GFX8-NEXT:    v_mul_f32_e32 v8, v9, v8
+; GFX8-NEXT:    v_and_b32_e32 v9, 0xffff0000, v15
+; GFX8-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
+; GFX8-NEXT:    v_mul_f32_e32 v7, v7, v9
+; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
+; GFX8-NEXT:    v_alignbit_b32 v7, v7, v8, 16
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_fmul_v16bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v16, 0xffff0000, v15
-; GFX9-NEXT:    v_and_b32_e32 v17, 0xffff0000, v7
+; GFX9-NEXT:    v_lshlrev_b32_e32 v16, 16, v8
+; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v0
+; GFX9-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GFX9-NEXT:    v_mul_f32_e32 v16, v17, v16
-; GFX9-NEXT:    v_and_b32_e32 v17, 0xffff0000, v14
-; GFX9-NEXT:    v_and_b32_e32 v18, 0xffff0000, v6
-; GFX9-NEXT:    v_mul_f32_e32 v17, v18, v17
-; GFX9-NEXT:    v_and_b32_e32 v18, 0xffff0000, v13
-; GFX9-NEXT:    v_and_b32_e32 v19, 0xffff0000, v5
-; GFX9-NEXT:    v_mul_f32_e32 v18, v19, v18
-; GFX9-NEXT:    v_and_b32_e32 v19, 0xffff0000, v12
-; GFX9-NEXT:    v_and_b32_e32 v20, 0xffff0000, v4
-; GFX9-NEXT:    v_mul_f32_e32 v19, v20, v19
-; GFX9-NEXT:    v_and_b32_e32 v20, 0xffff0000, v11
-; GFX9-NEXT:    v_and_b32_e32 v21, 0xffff0000, v3
-; GFX9-NEXT:    v_mul_f32_e32 v20, v21, v20
-; GFX9-NEXT:    v_and_b32_e32 v21, 0xffff0000, v10
-; GFX9-NEXT:    v_and_b32_e32 v22, 0xffff0000, v2
-; GFX9-NEXT:    v_mul_f32_e32 v21, v22, v21
-; GFX9-NEXT:    v_and_b32_e32 v22, 0xffff0000, v9
-; GFX9-NEXT:    v_and_b32_e32 v23, 0xffff0000, v1
-; GFX9-NEXT:    v_mul_f32_e32 v22, v23, v22
-; GFX9-NEXT:    v_and_b32_e32 v23, 0xffff0000, v8
-; GFX9-NEXT:    v_and_b32_e32 v24, 0xffff0000, v0
-; GFX9-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
-; GFX9-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
-; GFX9-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
-; GFX9-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
-; GFX9-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
-; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX9-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
-; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX9-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
-; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX9-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
-; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX9-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
-; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX9-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
-; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX9-NEXT:    v_mul_f32_e32 v23, v24, v23
-; GFX9-NEXT:    v_mul_f32_e32 v7, v7, v15
-; GFX9-NEXT:    v_mul_f32_e32 v6, v6, v14
-; GFX9-NEXT:    v_mul_f32_e32 v5, v5, v13
-; GFX9-NEXT:    v_mul_f32_e32 v4, v4, v12
-; GFX9-NEXT:    v_mul_f32_e32 v3, v3, v11
-; GFX9-NEXT:    v_mul_f32_e32 v2, v2, v10
-; GFX9-NEXT:    v_mul_f32_e32 v1, v1, v9
 ; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v8
-; GFX9-NEXT:    s_mov_b32 s4, 0x3020706
-; GFX9-NEXT:    v_perm_b32 v0, v0, v23, s4
-; GFX9-NEXT:    v_perm_b32 v1, v1, v22, s4
-; GFX9-NEXT:    v_perm_b32 v2, v2, v21, s4
-; GFX9-NEXT:    v_perm_b32 v3, v3, v20, s4
-; GFX9-NEXT:    v_perm_b32 v4, v4, v19, s4
-; GFX9-NEXT:    v_perm_b32 v5, v5, v18, s4
-; GFX9-NEXT:    v_perm_b32 v6, v6, v17, s4
-; GFX9-NEXT:    v_perm_b32 v7, v7, v16, s4
+; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
+; GFX9-NEXT:    v_perm_b32 v0, v0, v16, s4
+; GFX9-NEXT:    v_lshlrev_b32_e32 v8, 16, v9
+; GFX9-NEXT:    v_lshlrev_b32_e32 v16, 16, v1
+; GFX9-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
+; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX9-NEXT:    v_mul_f32_e32 v8, v16, v8
+; GFX9-NEXT:    v_mul_f32_e32 v1, v1, v9
+; GFX9-NEXT:    v_perm_b32 v1, v1, v8, s4
+; GFX9-NEXT:    v_lshlrev_b32_e32 v8, 16, v10
+; GFX9-NEXT:    v_lshlrev_b32_e32 v9, 16, v2
+; GFX9-NEXT:    v_mul_f32_e32 v8, v9, v8
+; GFX9-NEXT:    v_and_b32_e32 v9, 0xffff0000, v10
+; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX9-NEXT:    v_mul_f32_e32 v2, v2, v9
+; GFX9-NEXT:    v_perm_b32 v2, v2, v8, s4
+; GFX9-NEXT:    v_lshlrev_b32_e32 v8, 16, v11
+; GFX9-NEXT:    v_lshlrev_b32_e32 v9, 16, v3
+; GFX9-NEXT:    v_mul_f32_e32 v8, v9, v8
+; GFX9-NEXT:    v_and_b32_e32 v9, 0xffff0000, v11
+; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX9-NEXT:    v_mul_f32_e32 v3, v3, v9
+; GFX9-NEXT:    v_perm_b32 v3, v3, v8, s4
+; GFX9-NEXT:    v_lshlrev_b32_e32 v8, 16, v12
+; GFX9-NEXT:    v_lshlrev_b32_e32 v9, 16, v4
+; GFX9-NEXT:    v_mul_f32_e32 v8, v9, v8
+; GFX9-NEXT:    v_and_b32_e32 v9, 0xffff0000, v12
+; GFX9-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GFX9-NEXT:    v_mul_f32_e32 v4, v4, v9
+; GFX9-NEXT:    v_perm_b32 v4, v4, v8, s4
+; GFX9-NEXT:    v_lshlrev_b32_e32 v8, 16, v13
+; GFX9-NEXT:    v_lshlrev_b32_e32 v9, 16, v5
+; GFX9-NEXT:    v_mul_f32_e32 v8, v9, v8
+; GFX9-NEXT:    v_and_b32_e32 v9, 0xffff0000, v13
+; GFX9-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
+; GFX9-NEXT:    v_mul_f32_e32 v5, v5, v9
+; GFX9-NEXT:    v_perm_b32 v5, v5, v8, s4
+; GFX9-NEXT:    v_lshlrev_b32_e32 v8, 16, v14
+; GFX9-NEXT:    v_lshlrev_b32_e32 v9, 16, v6
+; GFX9-NEXT:    v_mul_f32_e32 v8, v9, v8
+; GFX9-NEXT:    v_and_b32_e32 v9, 0xffff0000, v14
+; GFX9-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
+; GFX9-NEXT:    v_mul_f32_e32 v6, v6, v9
+; GFX9-NEXT:    v_perm_b32 v6, v6, v8, s4
+; GFX9-NEXT:    v_lshlrev_b32_e32 v8, 16, v15
+; GFX9-NEXT:    v_lshlrev_b32_e32 v9, 16, v7
+; GFX9-NEXT:    v_mul_f32_e32 v8, v9, v8
+; GFX9-NEXT:    v_and_b32_e32 v9, 0xffff0000, v15
+; GFX9-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
+; GFX9-NEXT:    v_mul_f32_e32 v7, v7, v9
+; GFX9-NEXT:    v_perm_b32 v7, v7, v8, s4
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fmul_v16bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_and_b32_e32 v16, 0xffff0000, v15
-; GFX10-NEXT:    v_and_b32_e32 v17, 0xffff0000, v7
-; GFX10-NEXT:    v_and_b32_e32 v18, 0xffff0000, v6
-; GFX10-NEXT:    v_and_b32_e32 v19, 0xffff0000, v13
-; GFX10-NEXT:    v_and_b32_e32 v20, 0xffff0000, v5
-; GFX10-NEXT:    v_and_b32_e32 v21, 0xffff0000, v12
+; GFX10-NEXT:    v_lshlrev_b32_e32 v16, 16, v8
+; GFX10-NEXT:    v_lshlrev_b32_e32 v17, 16, v0
+; GFX10-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v18, 16, v1
+; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX10-NEXT:    v_mul_f32_e32 v16, v17, v16
-; GFX10-NEXT:    v_and_b32_e32 v17, 0xffff0000, v14
-; GFX10-NEXT:    v_and_b32_e32 v22, 0xffff0000, v4
-; GFX10-NEXT:    v_and_b32_e32 v23, 0xffff0000, v11
-; GFX10-NEXT:    v_and_b32_e32 v24, 0xffff0000, v3
-; GFX10-NEXT:    v_and_b32_e32 v25, 0xffff0000, v10
-; GFX10-NEXT:    v_and_b32_e32 v26, 0xffff0000, v2
-; GFX10-NEXT:    v_mul_f32_e32 v17, v18, v17
-; GFX10-NEXT:    v_mul_f32_e32 v18, v20, v19
-; GFX10-NEXT:    v_mul_f32_e32 v19, v22, v21
-; GFX10-NEXT:    v_mul_f32_e32 v20, v24, v23
-; GFX10-NEXT:    v_mul_f32_e32 v21, v26, v25
-; GFX10-NEXT:    v_and_b32_e32 v22, 0xffff0000, v9
-; GFX10-NEXT:    v_and_b32_e32 v23, 0xffff0000, v1
-; GFX10-NEXT:    v_and_b32_e32 v24, 0xffff0000, v8
-; GFX10-NEXT:    v_and_b32_e32 v25, 0xffff0000, v0
-; GFX10-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
-; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
-; GFX10-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
-; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
-; GFX10-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
-; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX10-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
-; GFX10-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
-; GFX10-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
-; GFX10-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
-; GFX10-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
-; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX10-NEXT:    v_mul_f32_e32 v22, v23, v22
-; GFX10-NEXT:    v_mul_f32_e32 v23, v25, v24
-; GFX10-NEXT:    v_mul_f32_e32 v7, v7, v15
-; GFX10-NEXT:    v_mul_f32_e32 v6, v6, v14
-; GFX10-NEXT:    v_mul_f32_e32 v5, v5, v13
+; GFX10-NEXT:    v_lshlrev_b32_e32 v17, 16, v9
+; GFX10-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
+; GFX10-NEXT:    v_lshlrev_b32_e32 v19, 16, v10
+; GFX10-NEXT:    v_lshlrev_b32_e32 v20, 16, v2
+; GFX10-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
+; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX10-NEXT:    v_mul_f32_e32 v0, v0, v8
+; GFX10-NEXT:    v_mul_f32_e32 v8, v18, v17
 ; GFX10-NEXT:    v_mul_f32_e32 v1, v1, v9
+; GFX10-NEXT:    v_mul_f32_e32 v9, v20, v19
 ; GFX10-NEXT:    v_mul_f32_e32 v2, v2, v10
-; GFX10-NEXT:    v_mul_f32_e32 v3, v3, v11
+; GFX10-NEXT:    v_perm_b32 v0, v0, v16, 0x7060302
+; GFX10-NEXT:    v_and_b32_e32 v10, 0xffff0000, v11
+; GFX10-NEXT:    v_perm_b32 v1, v1, v8, 0x7060302
+; GFX10-NEXT:    v_lshlrev_b32_e32 v8, 16, v11
+; GFX10-NEXT:    v_perm_b32 v2, v2, v9, 0x7060302
+; GFX10-NEXT:    v_lshlrev_b32_e32 v9, 16, v3
+; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX10-NEXT:    v_lshlrev_b32_e32 v11, 16, v12
+; GFX10-NEXT:    v_lshlrev_b32_e32 v16, 16, v4
+; GFX10-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
+; GFX10-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GFX10-NEXT:    v_lshlrev_b32_e32 v17, 16, v13
+; GFX10-NEXT:    v_lshlrev_b32_e32 v18, 16, v5
+; GFX10-NEXT:    v_mul_f32_e32 v8, v9, v8
+; GFX10-NEXT:    v_mul_f32_e32 v3, v3, v10
+; GFX10-NEXT:    v_mul_f32_e32 v9, v16, v11
 ; GFX10-NEXT:    v_mul_f32_e32 v4, v4, v12
-; GFX10-NEXT:    v_perm_b32 v0, v0, v23, 0x3020706
-; GFX10-NEXT:    v_perm_b32 v1, v1, v22, 0x3020706
-; GFX10-NEXT:    v_perm_b32 v2, v2, v21, 0x3020706
-; GFX10-NEXT:    v_perm_b32 v3, v3, v20, 0x3020706
-; GFX10-NEXT:    v_perm_b32 v4, v4, v19, 0x3020706
-; GFX10-NEXT:    v_perm_b32 v5, v5, v18, 0x3020706
-; GFX10-NEXT:    v_perm_b32 v6, v6, v17, 0x3020706
-; GFX10-NEXT:    v_perm_b32 v7, v7, v16, 0x3020706
+; GFX10-NEXT:    v_mul_f32_e32 v10, v18, v17
+; GFX10-NEXT:    v_and_b32_e32 v11, 0xffff0000, v13
+; GFX10-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
+; GFX10-NEXT:    v_lshlrev_b32_e32 v12, 16, v14
+; GFX10-NEXT:    v_lshlrev_b32_e32 v13, 16, v6
+; GFX10-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
+; GFX10-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
+; GFX10-NEXT:    v_lshlrev_b32_e32 v16, 16, v15
+; GFX10-NEXT:    v_lshlrev_b32_e32 v17, 16, v7
+; GFX10-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
+; GFX10-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
+; GFX10-NEXT:    v_mul_f32_e32 v5, v5, v11
+; GFX10-NEXT:    v_mul_f32_e32 v11, v13, v12
+; GFX10-NEXT:    v_mul_f32_e32 v6, v6, v14
+; GFX10-NEXT:    v_mul_f32_e32 v12, v17, v16
+; GFX10-NEXT:    v_mul_f32_e32 v7, v7, v15
+; GFX10-NEXT:    v_perm_b32 v3, v3, v8, 0x7060302
+; GFX10-NEXT:    v_perm_b32 v4, v4, v9, 0x7060302
+; GFX10-NEXT:    v_perm_b32 v5, v5, v10, 0x7060302
+; GFX10-NEXT:    v_perm_b32 v6, v6, v11, 0x7060302
+; GFX10-NEXT:    v_perm_b32 v7, v7, v12, 0x7060302
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fmul_v16bf16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_and_b32_e32 v17, 0xffff0000, v7
-; GFX11-NEXT:    v_and_b32_e32 v20, 0xffff0000, v5
-; GFX11-NEXT:    v_and_b32_e32 v23, 0xffff0000, v11
-; GFX11-NEXT:    v_and_b32_e32 v25, 0xffff0000, v10
-; GFX11-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
-; GFX11-NEXT:    v_and_b32_e32 v26, 0xffff0000, v2
-; GFX11-NEXT:    v_and_b32_e32 v22, 0xffff0000, v4
-; GFX11-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX11-NEXT:    v_and_b32_e32 v24, 0xffff0000, v3
-; GFX11-NEXT:    v_and_b32_e32 v19, 0xffff0000, v13
-; GFX11-NEXT:    v_and_b32_e32 v21, 0xffff0000, v12
-; GFX11-NEXT:    v_and_b32_e32 v18, 0xffff0000, v6
-; GFX11-NEXT:    v_and_b32_e32 v16, 0xffff0000, v15
-; GFX11-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
-; GFX11-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
-; GFX11-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
-; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX11-NEXT:    v_dual_mul_f32 v16, v17, v16 :: v_dual_and_b32 v17, 0xffff0000, v14
-; GFX11-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
-; GFX11-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_dual_mul_f32 v4, v4, v12 :: v_dual_mul_f32 v5, v5, v13
-; GFX11-NEXT:    v_dual_mul_f32 v17, v18, v17 :: v_dual_mul_f32 v18, v20, v19
-; GFX11-NEXT:    v_mul_f32_e32 v19, v22, v21
-; GFX11-NEXT:    v_mul_f32_e32 v7, v7, v15
-; GFX11-NEXT:    v_mul_f32_e32 v21, v26, v25
-; GFX11-NEXT:    v_dual_mul_f32 v6, v6, v14 :: v_dual_and_b32 v25, 0xffff0000, v0
-; GFX11-NEXT:    v_mul_f32_e32 v20, v24, v23
-; GFX11-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
-; GFX11-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
-; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-NEXT:    v_and_b32_e32 v23, 0xffff0000, v1
-; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-NEXT:    v_and_b32_e32 v22, 0xffff0000, v9
-; GFX11-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
-; GFX11-NEXT:    v_and_b32_e32 v24, 0xffff0000, v8
-; GFX11-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
-; GFX11-NEXT:    v_dual_mul_f32 v2, v2, v10 :: v_dual_mul_f32 v3, v3, v11
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_dual_mul_f32 v1, v1, v9 :: v_dual_mul_f32 v22, v23, v22
-; GFX11-NEXT:    v_mul_f32_e32 v23, v25, v24
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v18, 16, v1
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX11-NEXT:    v_lshlrev_b32_e32 v16, 16, v8
+; GFX11-NEXT:    v_lshlrev_b32_e32 v19, 16, v10
+; GFX11-NEXT:    v_lshlrev_b32_e32 v20, 16, v2
+; GFX11-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX11-NEXT:    v_lshlrev_b32_e32 v17, 16, v0
+; GFX11-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_mul_f32_e32 v2, v2, v10
+; GFX11-NEXT:    v_dual_mul_f32 v16, v17, v16 :: v_dual_lshlrev_b32 v17, 16, v9
+; GFX11-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_dual_mul_f32 v1, v1, v9 :: v_dual_and_b32 v10, 0xffff0000, v11
+; GFX11-NEXT:    v_mul_f32_e32 v9, v20, v19
+; GFX11-NEXT:    v_perm_b32 v2, v2, v9, 0x7060302
+; GFX11-NEXT:    v_lshlrev_b32_e32 v9, 16, v3
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_dual_mul_f32 v3, v3, v10 :: v_dual_and_b32 v0, 0xffff0000, v0
 ; GFX11-NEXT:    v_mul_f32_e32 v0, v0, v8
-; GFX11-NEXT:    v_perm_b32 v2, v2, v21, 0x3020706
-; GFX11-NEXT:    v_perm_b32 v3, v3, v20, 0x3020706
-; GFX11-NEXT:    v_perm_b32 v1, v1, v22, 0x3020706
-; GFX11-NEXT:    v_perm_b32 v4, v4, v19, 0x3020706
-; GFX11-NEXT:    v_perm_b32 v0, v0, v23, 0x3020706
-; GFX11-NEXT:    v_perm_b32 v5, v5, v18, 0x3020706
-; GFX11-NEXT:    v_perm_b32 v6, v6, v17, 0x3020706
-; GFX11-NEXT:    v_perm_b32 v7, v7, v16, 0x3020706
+; GFX11-NEXT:    v_mul_f32_e32 v8, v18, v17
+; GFX11-NEXT:    v_lshlrev_b32_e32 v17, 16, v13
+; GFX11-NEXT:    v_lshlrev_b32_e32 v18, 16, v5
+; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
+; GFX11-NEXT:    v_perm_b32 v0, v0, v16, 0x7060302
+; GFX11-NEXT:    v_perm_b32 v1, v1, v8, 0x7060302
+; GFX11-NEXT:    v_lshlrev_b32_e32 v8, 16, v11
+; GFX11-NEXT:    v_lshlrev_b32_e32 v11, 16, v12
+; GFX11-NEXT:    v_lshlrev_b32_e32 v16, 16, v4
+; GFX11-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
+; GFX11-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GFX11-NEXT:    v_dual_mul_f32 v10, v18, v17 :: v_dual_lshlrev_b32 v17, 16, v7
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT:    v_dual_mul_f32 v4, v4, v12 :: v_dual_and_b32 v7, 0xffff0000, v7
+; GFX11-NEXT:    v_dual_mul_f32 v8, v9, v8 :: v_dual_mul_f32 v9, v16, v11
+; GFX11-NEXT:    v_lshlrev_b32_e32 v12, 16, v14
+; GFX11-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
+; GFX11-NEXT:    v_lshlrev_b32_e32 v16, 16, v15
+; GFX11-NEXT:    v_and_b32_e32 v11, 0xffff0000, v13
+; GFX11-NEXT:    v_lshlrev_b32_e32 v13, 16, v6
+; GFX11-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
+; GFX11-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
+; GFX11-NEXT:    v_perm_b32 v3, v3, v8, 0x7060302
+; GFX11-NEXT:    v_mul_f32_e32 v5, v5, v11
+; GFX11-NEXT:    v_mul_f32_e32 v11, v13, v12
+; GFX11-NEXT:    v_mul_f32_e32 v12, v17, v16
+; GFX11-NEXT:    v_dual_mul_f32 v6, v6, v14 :: v_dual_mul_f32 v7, v7, v15
+; GFX11-NEXT:    v_perm_b32 v4, v4, v9, 0x7060302
+; GFX11-NEXT:    v_perm_b32 v5, v5, v10, 0x7060302
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_perm_b32 v6, v6, v11, 0x7060302
+; GFX11-NEXT:    v_perm_b32 v7, v7, v12, 0x7060302
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = fmul <16 x bfloat> %a, %b
   ret <16 x bfloat> %op
@@ -12446,493 +12290,484 @@ define <32 x bfloat> @v_fmul_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
 ; GFX8-LABEL: v_fmul_v32bf16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_and_b32_e32 v31, 0xffff0000, v30
-; GFX8-NEXT:    v_and_b32_e32 v32, 0xffff0000, v14
-; GFX8-NEXT:    v_lshlrev_b32_e32 v30, 16, v30
-; GFX8-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
+; GFX8-NEXT:    v_lshlrev_b32_e32 v31, 16, v16
+; GFX8-NEXT:    v_lshlrev_b32_e32 v32, 16, v0
+; GFX8-NEXT:    v_and_b32_e32 v16, 0xffff0000, v16
+; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    v_mul_f32_e32 v0, v0, v16
 ; GFX8-NEXT:    v_mul_f32_e32 v31, v32, v31
-; GFX8-NEXT:    v_mul_f32_e32 v14, v14, v30
-; GFX8-NEXT:    v_and_b32_e32 v30, 0xffff0000, v29
-; GFX8-NEXT:    v_and_b32_e32 v32, 0xffff0000, v13
-; GFX8-NEXT:    v_lshlrev_b32_e32 v29, 16, v29
-; GFX8-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
-; GFX8-NEXT:    v_mul_f32_e32 v30, v32, v30
-; GFX8-NEXT:    v_mul_f32_e32 v13, v13, v29
-; GFX8-NEXT:    v_and_b32_e32 v29, 0xffff0000, v28
-; GFX8-NEXT:    v_and_b32_e32 v32, 0xffff0000, v12
-; GFX8-NEXT:    v_lshlrev_b32_e32 v28, 16, v28
-; GFX8-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
-; GFX8-NEXT:    v_mul_f32_e32 v29, v32, v29
-; GFX8-NEXT:    v_mul_f32_e32 v12, v12, v28
-; GFX8-NEXT:    v_and_b32_e32 v28, 0xffff0000, v27
-; GFX8-NEXT:    v_and_b32_e32 v32, 0xffff0000, v11
-; GFX8-NEXT:    v_lshlrev_b32_e32 v27, 16, v27
-; GFX8-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
-; GFX8-NEXT:    v_mul_f32_e32 v28, v32, v28
-; GFX8-NEXT:    v_mul_f32_e32 v11, v11, v27
-; GFX8-NEXT:    v_and_b32_e32 v27, 0xffff0000, v26
-; GFX8-NEXT:    v_and_b32_e32 v32, 0xffff0000, v10
-; GFX8-NEXT:    v_lshlrev_b32_e32 v26, 16, v26
-; GFX8-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
-; GFX8-NEXT:    v_mul_f32_e32 v27, v32, v27
-; GFX8-NEXT:    v_mul_f32_e32 v10, v10, v26
-; GFX8-NEXT:    v_and_b32_e32 v26, 0xffff0000, v25
-; GFX8-NEXT:    v_and_b32_e32 v32, 0xffff0000, v9
-; GFX8-NEXT:    v_lshlrev_b32_e32 v25, 16, v25
-; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
-; GFX8-NEXT:    v_mul_f32_e32 v26, v32, v26
-; GFX8-NEXT:    v_mul_f32_e32 v9, v9, v25
-; GFX8-NEXT:    v_and_b32_e32 v25, 0xffff0000, v24
-; GFX8-NEXT:    v_and_b32_e32 v32, 0xffff0000, v8
-; GFX8-NEXT:    v_lshlrev_b32_e32 v24, 16, v24
-; GFX8-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
-; GFX8-NEXT:    v_mul_f32_e32 v8, v8, v24
-; GFX8-NEXT:    buffer_load_dword v24, off, s[0:3], s32
-; GFX8-NEXT:    v_mul_f32_e32 v25, v32, v25
-; GFX8-NEXT:    v_and_b32_e32 v32, 0xffff0000, v15
-; GFX8-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
-; GFX8-NEXT:    s_mov_b32 s4, 0x3020706
-; GFX8-NEXT:    v_perm_b32 v8, v8, v25, s4
-; GFX8-NEXT:    v_perm_b32 v9, v9, v26, s4
-; GFX8-NEXT:    v_perm_b32 v10, v10, v27, s4
-; GFX8-NEXT:    v_perm_b32 v11, v11, v28, s4
-; GFX8-NEXT:    v_perm_b32 v12, v12, v29, s4
-; GFX8-NEXT:    v_perm_b32 v13, v13, v30, s4
-; GFX8-NEXT:    v_perm_b32 v14, v14, v31, s4
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_and_b32_e32 v33, 0xffff0000, v24
-; GFX8-NEXT:    v_lshlrev_b32_e32 v24, 16, v24
-; GFX8-NEXT:    v_mul_f32_e32 v32, v32, v33
-; GFX8-NEXT:    v_mul_f32_e32 v15, v15, v24
-; GFX8-NEXT:    v_and_b32_e32 v24, 0xffff0000, v23
-; GFX8-NEXT:    v_and_b32_e32 v33, 0xffff0000, v7
-; GFX8-NEXT:    v_lshlrev_b32_e32 v23, 16, v23
-; GFX8-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
-; GFX8-NEXT:    v_mul_f32_e32 v24, v33, v24
-; GFX8-NEXT:    v_mul_f32_e32 v7, v7, v23
-; GFX8-NEXT:    v_and_b32_e32 v23, 0xffff0000, v22
-; GFX8-NEXT:    v_and_b32_e32 v33, 0xffff0000, v6
-; GFX8-NEXT:    v_lshlrev_b32_e32 v22, 16, v22
-; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
-; GFX8-NEXT:    v_mul_f32_e32 v23, v33, v23
-; GFX8-NEXT:    v_mul_f32_e32 v6, v6, v22
-; GFX8-NEXT:    v_and_b32_e32 v22, 0xffff0000, v21
-; GFX8-NEXT:    v_and_b32_e32 v33, 0xffff0000, v5
-; GFX8-NEXT:    v_lshlrev_b32_e32 v21, 16, v21
-; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX8-NEXT:    v_mul_f32_e32 v22, v33, v22
-; GFX8-NEXT:    v_mul_f32_e32 v5, v5, v21
-; GFX8-NEXT:    v_and_b32_e32 v21, 0xffff0000, v20
-; GFX8-NEXT:    v_and_b32_e32 v33, 0xffff0000, v4
-; GFX8-NEXT:    v_lshlrev_b32_e32 v20, 16, v20
-; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX8-NEXT:    v_mul_f32_e32 v21, v33, v21
-; GFX8-NEXT:    v_mul_f32_e32 v4, v4, v20
-; GFX8-NEXT:    v_and_b32_e32 v20, 0xffff0000, v19
-; GFX8-NEXT:    v_and_b32_e32 v33, 0xffff0000, v3
-; GFX8-NEXT:    v_lshlrev_b32_e32 v19, 16, v19
-; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX8-NEXT:    v_mul_f32_e32 v20, v33, v20
-; GFX8-NEXT:    v_mul_f32_e32 v3, v3, v19
-; GFX8-NEXT:    v_and_b32_e32 v19, 0xffff0000, v18
-; GFX8-NEXT:    v_and_b32_e32 v33, 0xffff0000, v2
-; GFX8-NEXT:    v_lshlrev_b32_e32 v18, 16, v18
-; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX8-NEXT:    v_mul_f32_e32 v19, v33, v19
-; GFX8-NEXT:    v_mul_f32_e32 v2, v2, v18
-; GFX8-NEXT:    v_and_b32_e32 v18, 0xffff0000, v17
-; GFX8-NEXT:    v_and_b32_e32 v33, 0xffff0000, v1
-; GFX8-NEXT:    v_lshlrev_b32_e32 v17, 16, v17
-; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX8-NEXT:    v_mul_f32_e32 v18, v33, v18
+; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX8-NEXT:    v_alignbit_b32 v0, v0, v31, 16
+; GFX8-NEXT:    v_lshlrev_b32_e32 v16, 16, v17
+; GFX8-NEXT:    v_lshlrev_b32_e32 v31, 16, v1
+; GFX8-NEXT:    v_and_b32_e32 v17, 0xffff0000, v17
+; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX8-NEXT:    v_mul_f32_e32 v1, v1, v17
-; GFX8-NEXT:    v_and_b32_e32 v17, 0xffff0000, v16
-; GFX8-NEXT:    v_and_b32_e32 v33, 0xffff0000, v0
-; GFX8-NEXT:    v_lshlrev_b32_e32 v16, 16, v16
-; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX8-NEXT:    v_mul_f32_e32 v17, v33, v17
-; GFX8-NEXT:    v_mul_f32_e32 v0, v0, v16
-; GFX8-NEXT:    v_perm_b32 v0, v0, v17, s4
-; GFX8-NEXT:    v_perm_b32 v1, v1, v18, s4
-; GFX8-NEXT:    v_perm_b32 v2, v2, v19, s4
-; GFX8-NEXT:    v_perm_b32 v3, v3, v20, s4
-; GFX8-NEXT:    v_perm_b32 v4, v4, v21, s4
-; GFX8-NEXT:    v_perm_b32 v5, v5, v22, s4
-; GFX8-NEXT:    v_perm_b32 v6, v6, v23, s4
-; GFX8-NEXT:    v_perm_b32 v7, v7, v24, s4
-; GFX8-NEXT:    v_perm_b32 v15, v15, v32, s4
+; GFX8-NEXT:    v_mul_f32_e32 v16, v31, v16
+; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX8-NEXT:    v_alignbit_b32 v1, v1, v16, 16
+; GFX8-NEXT:    v_lshlrev_b32_e32 v16, 16, v18
+; GFX8-NEXT:    v_lshlrev_b32_e32 v17, 16, v2
+; GFX8-NEXT:    v_mul_f32_e32 v16, v17, v16
+; GFX8-NEXT:    v_and_b32_e32 v17, 0xffff0000, v18
+; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX8-NEXT:    v_mul_f32_e32 v2, v2, v17
+; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX8-NEXT:    v_alignbit_b32 v2, v2, v16, 16
+; GFX8-NEXT:    v_lshlrev_b32_e32 v16, 16, v19
+; GFX8-NEXT:    v_lshlrev_b32_e32 v17, 16, v3
+; GFX8-NEXT:    v_mul_f32_e32 v16, v17, v16
+; GFX8-NEXT:    v_and_b32_e32 v17, 0xffff0000, v19
+; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX8-NEXT:    v_mul_f32_e32 v3, v3, v17
+; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX8-NEXT:    v_alignbit_b32 v3, v3, v16, 16
+; GFX8-NEXT:    v_lshlrev_b32_e32 v16, 16, v20
+; GFX8-NEXT:    v_lshlrev_b32_e32 v17, 16, v4
+; GFX8-NEXT:    v_mul_f32_e32 v16, v17, v16
+; GFX8-NEXT:    v_and_b32_e32 v17, 0xffff0000, v20
+; GFX8-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GFX8-NEXT:    v_mul_f32_e32 v4, v4, v17
+; GFX8-NEXT:    buffer_load_dword v17, off, s[0:3], s32
+; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
+; GFX8-NEXT:    v_alignbit_b32 v4, v4, v16, 16
+; GFX8-NEXT:    v_lshlrev_b32_e32 v16, 16, v21
+; GFX8-NEXT:    v_lshlrev_b32_e32 v18, 16, v5
+; GFX8-NEXT:    v_mul_f32_e32 v16, v18, v16
+; GFX8-NEXT:    v_and_b32_e32 v18, 0xffff0000, v21
+; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
+; GFX8-NEXT:    v_mul_f32_e32 v5, v5, v18
+; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
+; GFX8-NEXT:    v_alignbit_b32 v5, v5, v16, 16
+; GFX8-NEXT:    v_lshlrev_b32_e32 v16, 16, v22
+; GFX8-NEXT:    v_lshlrev_b32_e32 v18, 16, v6
+; GFX8-NEXT:    v_mul_f32_e32 v16, v18, v16
+; GFX8-NEXT:    v_and_b32_e32 v18, 0xffff0000, v22
+; GFX8-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
+; GFX8-NEXT:    v_mul_f32_e32 v6, v6, v18
+; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
+; GFX8-NEXT:    v_alignbit_b32 v6, v6, v16, 16
+; GFX8-NEXT:    v_lshlrev_b32_e32 v16, 16, v23
+; GFX8-NEXT:    v_lshlrev_b32_e32 v18, 16, v7
+; GFX8-NEXT:    v_mul_f32_e32 v16, v18, v16
+; GFX8-NEXT:    v_and_b32_e32 v18, 0xffff0000, v23
+; GFX8-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
+; GFX8-NEXT:    v_mul_f32_e32 v7, v7, v18
+; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
+; GFX8-NEXT:    v_alignbit_b32 v7, v7, v16, 16
+; GFX8-NEXT:    v_lshlrev_b32_e32 v16, 16, v24
+; GFX8-NEXT:    v_lshlrev_b32_e32 v18, 16, v8
+; GFX8-NEXT:    v_mul_f32_e32 v16, v18, v16
+; GFX8-NEXT:    v_and_b32_e32 v18, 0xffff0000, v24
+; GFX8-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
+; GFX8-NEXT:    v_mul_f32_e32 v8, v8, v18
+; GFX8-NEXT:    v_lshrrev_b32_e32 v8, 16, v8
+; GFX8-NEXT:    v_alignbit_b32 v8, v8, v16, 16
+; GFX8-NEXT:    v_lshlrev_b32_e32 v16, 16, v25
+; GFX8-NEXT:    v_lshlrev_b32_e32 v18, 16, v9
+; GFX8-NEXT:    v_mul_f32_e32 v16, v18, v16
+; GFX8-NEXT:    v_and_b32_e32 v18, 0xffff0000, v25
+; GFX8-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
+; GFX8-NEXT:    v_mul_f32_e32 v9, v9, v18
+; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 16, v9
+; GFX8-NEXT:    v_alignbit_b32 v9, v9, v16, 16
+; GFX8-NEXT:    v_lshlrev_b32_e32 v16, 16, v26
+; GFX8-NEXT:    v_lshlrev_b32_e32 v18, 16, v10
+; GFX8-NEXT:    v_mul_f32_e32 v16, v18, v16
+; GFX8-NEXT:    v_and_b32_e32 v18, 0xffff0000, v26
+; GFX8-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
+; GFX8-NEXT:    v_mul_f32_e32 v10, v10, v18
+; GFX8-NEXT:    v_lshrrev_b32_e32 v10, 16, v10
+; GFX8-NEXT:    v_alignbit_b32 v10, v10, v16, 16
+; GFX8-NEXT:    v_lshlrev_b32_e32 v16, 16, v27
+; GFX8-NEXT:    v_lshlrev_b32_e32 v18, 16, v11
+; GFX8-NEXT:    v_mul_f32_e32 v16, v18, v16
+; GFX8-NEXT:    v_and_b32_e32 v18, 0xffff0000, v27
+; GFX8-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
+; GFX8-NEXT:    v_mul_f32_e32 v11, v11, v18
+; GFX8-NEXT:    v_lshrrev_b32_e32 v11, 16, v11
+; GFX8-NEXT:    v_alignbit_b32 v11, v11, v16, 16
+; GFX8-NEXT:    v_lshlrev_b32_e32 v16, 16, v28
+; GFX8-NEXT:    v_lshlrev_b32_e32 v18, 16, v12
+; GFX8-NEXT:    v_mul_f32_e32 v16, v18, v16
+; GFX8-NEXT:    v_and_b32_e32 v18, 0xffff0000, v28
+; GFX8-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
+; GFX8-NEXT:    v_mul_f32_e32 v12, v12, v18
+; GFX8-NEXT:    v_lshrrev_b32_e32 v12, 16, v12
+; GFX8-NEXT:    v_alignbit_b32 v12, v12, v16, 16
+; GFX8-NEXT:    v_lshlrev_b32_e32 v16, 16, v29
+; GFX8-NEXT:    v_lshlrev_b32_e32 v18, 16, v13
+; GFX8-NEXT:    v_mul_f32_e32 v16, v18, v16
+; GFX8-NEXT:    v_and_b32_e32 v18, 0xffff0000, v29
+; GFX8-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
+; GFX8-NEXT:    v_mul_f32_e32 v13, v13, v18
+; GFX8-NEXT:    v_lshrrev_b32_e32 v13, 16, v13
+; GFX8-NEXT:    v_alignbit_b32 v13, v13, v16, 16
+; GFX8-NEXT:    v_lshlrev_b32_e32 v16, 16, v30
+; GFX8-NEXT:    v_lshlrev_b32_e32 v18, 16, v14
+; GFX8-NEXT:    v_mul_f32_e32 v16, v18, v16
+; GFX8-NEXT:    v_and_b32_e32 v18, 0xffff0000, v30
+; GFX8-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
+; GFX8-NEXT:    v_mul_f32_e32 v14, v14, v18
+; GFX8-NEXT:    v_lshrrev_b32_e32 v14, 16, v14
+; GFX8-NEXT:    v_alignbit_b32 v14, v14, v16, 16
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_lshlrev_b32_e32 v16, 16, v17
+; GFX8-NEXT:    v_lshlrev_b32_e32 v18, 16, v15
+; GFX8-NEXT:    v_and_b32_e32 v17, 0xffff0000, v17
+; GFX8-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
+; GFX8-NEXT:    v_mul_f32_e32 v15, v15, v17
+; GFX8-NEXT:    v_mul_f32_e32 v16, v18, v16
+; GFX8-NEXT:    v_lshrrev_b32_e32 v15, 16, v15
+; GFX8-NEXT:    v_alignbit_b32 v15, v15, v16, 16
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_fmul_v32bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
-; GFX9-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
-; GFX9-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
-; GFX9-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
-; GFX9-NEXT:    buffer_store_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; GFX9-NEXT:    buffer_store_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; GFX9-NEXT:    buffer_store_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; GFX9-NEXT:    buffer_store_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
-; GFX9-NEXT:    buffer_store_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; GFX9-NEXT:    buffer_store_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; GFX9-NEXT:    buffer_store_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; GFX9-NEXT:    buffer_store_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
-; GFX9-NEXT:    v_and_b32_e32 v38, 0xffff0000, v27
-; GFX9-NEXT:    v_and_b32_e32 v39, 0xffff0000, v11
-; GFX9-NEXT:    v_and_b32_e32 v48, 0xffff0000, v26
-; GFX9-NEXT:    v_and_b32_e32 v49, 0xffff0000, v10
-; GFX9-NEXT:    v_and_b32_e32 v50, 0xffff0000, v25
-; GFX9-NEXT:    v_and_b32_e32 v51, 0xffff0000, v9
-; GFX9-NEXT:    v_and_b32_e32 v40, 0xffff0000, v22
-; GFX9-NEXT:    v_and_b32_e32 v41, 0xffff0000, v6
-; GFX9-NEXT:    v_and_b32_e32 v58, 0xffff0000, v17
-; GFX9-NEXT:    v_and_b32_e32 v59, 0xffff0000, v1
-; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v17
-; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX9-NEXT:    v_and_b32_e32 v52, 0xffff0000, v24
-; GFX9-NEXT:    v_and_b32_e32 v53, 0xffff0000, v8
-; GFX9-NEXT:    v_and_b32_e32 v54, 0xffff0000, v23
-; GFX9-NEXT:    v_and_b32_e32 v55, 0xffff0000, v7
-; GFX9-NEXT:    v_and_b32_e32 v42, 0xffff0000, v21
-; GFX9-NEXT:    v_and_b32_e32 v43, 0xffff0000, v5
-; GFX9-NEXT:    v_and_b32_e32 v44, 0xffff0000, v20
-; GFX9-NEXT:    v_and_b32_e32 v45, 0xffff0000, v4
-; GFX9-NEXT:    v_and_b32_e32 v46, 0xffff0000, v19
-; GFX9-NEXT:    v_and_b32_e32 v47, 0xffff0000, v3
-; GFX9-NEXT:    v_and_b32_e32 v56, 0xffff0000, v18
-; GFX9-NEXT:    v_and_b32_e32 v57, 0xffff0000, v2
-; GFX9-NEXT:    v_mul_f32_e32 v38, v39, v38
-; GFX9-NEXT:    v_mul_f32_e32 v39, v49, v48
-; GFX9-NEXT:    v_mul_f32_e32 v48, v51, v50
-; GFX9-NEXT:    v_mul_f32_e32 v51, v41, v40
-; GFX9-NEXT:    v_mul_f32_e32 v40, v59, v58
-; GFX9-NEXT:    v_mul_f32_e32 v1, v1, v17
-; GFX9-NEXT:    s_mov_b32 s4, 0x3020706
-; GFX9-NEXT:    v_mul_f32_e32 v49, v53, v52
-; GFX9-NEXT:    v_mul_f32_e32 v50, v55, v54
-; GFX9-NEXT:    v_mul_f32_e32 v52, v43, v42
-; GFX9-NEXT:    v_mul_f32_e32 v53, v45, v44
-; GFX9-NEXT:    v_mul_f32_e32 v54, v47, v46
-; GFX9-NEXT:    v_mul_f32_e32 v55, v57, v56
-; GFX9-NEXT:    v_perm_b32 v1, v1, v40, s4
-; GFX9-NEXT:    buffer_load_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; GFX9-NEXT:    buffer_load_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
-; GFX9-NEXT:    buffer_load_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; GFX9-NEXT:    buffer_load_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
-; GFX9-NEXT:    buffer_load_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; GFX9-NEXT:    buffer_load_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
-; GFX9-NEXT:    buffer_load_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; GFX9-NEXT:    buffer_load_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
-; GFX9-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
-; GFX9-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
-; GFX9-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
-; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
-; GFX9-NEXT:    buffer_load_dword v31, off, s[0:3], s32
-; GFX9-NEXT:    v_and_b32_e32 v32, 0xffff0000, v30
-; GFX9-NEXT:    v_and_b32_e32 v33, 0xffff0000, v14
-; GFX9-NEXT:    v_and_b32_e32 v34, 0xffff0000, v29
-; GFX9-NEXT:    v_and_b32_e32 v35, 0xffff0000, v13
-; GFX9-NEXT:    v_and_b32_e32 v36, 0xffff0000, v28
-; GFX9-NEXT:    v_and_b32_e32 v37, 0xffff0000, v12
-; GFX9-NEXT:    v_mul_f32_e32 v32, v33, v32
-; GFX9-NEXT:    v_and_b32_e32 v33, 0xffff0000, v16
-; GFX9-NEXT:    v_mul_f32_e32 v34, v35, v34
-; GFX9-NEXT:    v_and_b32_e32 v35, 0xffff0000, v0
-; GFX9-NEXT:    v_mul_f32_e32 v36, v37, v36
-; GFX9-NEXT:    v_and_b32_e32 v37, 0xffff0000, v15
-; GFX9-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
-; GFX9-NEXT:    v_mul_f32_e32 v33, v35, v33
-; GFX9-NEXT:    v_lshlrev_b32_e32 v30, 16, v30
-; GFX9-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
-; GFX9-NEXT:    v_lshlrev_b32_e32 v29, 16, v29
-; GFX9-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
-; GFX9-NEXT:    v_lshlrev_b32_e32 v28, 16, v28
-; GFX9-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
-; GFX9-NEXT:    v_lshlrev_b32_e32 v27, 16, v27
-; GFX9-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
-; GFX9-NEXT:    v_lshlrev_b32_e32 v26, 16, v26
-; GFX9-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
-; GFX9-NEXT:    v_lshlrev_b32_e32 v25, 16, v25
-; GFX9-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
-; GFX9-NEXT:    v_lshlrev_b32_e32 v24, 16, v24
-; GFX9-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
-; GFX9-NEXT:    v_lshlrev_b32_e32 v23, 16, v23
-; GFX9-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
-; GFX9-NEXT:    v_lshlrev_b32_e32 v22, 16, v22
-; GFX9-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
-; GFX9-NEXT:    v_lshlrev_b32_e32 v21, 16, v21
-; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX9-NEXT:    v_lshlrev_b32_e32 v20, 16, v20
-; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX9-NEXT:    v_lshlrev_b32_e32 v19, 16, v19
-; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX9-NEXT:    v_lshlrev_b32_e32 v18, 16, v18
-; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX9-NEXT:    v_lshlrev_b32_e32 v16, 16, v16
-; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX9-NEXT:    v_mul_f32_e32 v14, v14, v30
-; GFX9-NEXT:    v_mul_f32_e32 v13, v13, v29
-; GFX9-NEXT:    v_mul_f32_e32 v12, v12, v28
-; GFX9-NEXT:    v_mul_f32_e32 v11, v11, v27
-; GFX9-NEXT:    v_mul_f32_e32 v10, v10, v26
-; GFX9-NEXT:    v_mul_f32_e32 v9, v9, v25
-; GFX9-NEXT:    v_mul_f32_e32 v8, v8, v24
-; GFX9-NEXT:    v_mul_f32_e32 v7, v7, v23
-; GFX9-NEXT:    v_mul_f32_e32 v6, v6, v22
-; GFX9-NEXT:    v_mul_f32_e32 v5, v5, v21
-; GFX9-NEXT:    v_mul_f32_e32 v4, v4, v20
-; GFX9-NEXT:    v_mul_f32_e32 v3, v3, v19
-; GFX9-NEXT:    v_mul_f32_e32 v2, v2, v18
+; GFX9-NEXT:    v_lshlrev_b32_e32 v31, 16, v16
+; GFX9-NEXT:    v_lshlrev_b32_e32 v32, 16, v0
+; GFX9-NEXT:    v_and_b32_e32 v16, 0xffff0000, v16
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    v_mul_f32_e32 v31, v32, v31
 ; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v16
-; GFX9-NEXT:    v_perm_b32 v0, v0, v33, s4
-; GFX9-NEXT:    v_perm_b32 v2, v2, v55, s4
-; GFX9-NEXT:    v_perm_b32 v3, v3, v54, s4
-; GFX9-NEXT:    v_perm_b32 v4, v4, v53, s4
-; GFX9-NEXT:    v_perm_b32 v5, v5, v52, s4
-; GFX9-NEXT:    v_perm_b32 v6, v6, v51, s4
-; GFX9-NEXT:    v_perm_b32 v7, v7, v50, s4
-; GFX9-NEXT:    v_perm_b32 v8, v8, v49, s4
-; GFX9-NEXT:    v_perm_b32 v9, v9, v48, s4
-; GFX9-NEXT:    v_perm_b32 v10, v10, v39, s4
-; GFX9-NEXT:    v_perm_b32 v11, v11, v38, s4
-; GFX9-NEXT:    v_perm_b32 v12, v12, v36, s4
-; GFX9-NEXT:    v_perm_b32 v13, v13, v34, s4
-; GFX9-NEXT:    v_perm_b32 v14, v14, v32, s4
+; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
+; GFX9-NEXT:    v_perm_b32 v0, v0, v31, s4
+; GFX9-NEXT:    v_lshlrev_b32_e32 v16, 16, v17
+; GFX9-NEXT:    v_lshlrev_b32_e32 v31, 16, v1
+; GFX9-NEXT:    v_and_b32_e32 v17, 0xffff0000, v17
+; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX9-NEXT:    v_mul_f32_e32 v16, v31, v16
+; GFX9-NEXT:    v_mul_f32_e32 v1, v1, v17
+; GFX9-NEXT:    v_perm_b32 v1, v1, v16, s4
+; GFX9-NEXT:    v_lshlrev_b32_e32 v16, 16, v18
+; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v2
+; GFX9-NEXT:    v_mul_f32_e32 v16, v17, v16
+; GFX9-NEXT:    v_and_b32_e32 v17, 0xffff0000, v18
+; GFX9-NEXT:    buffer_load_dword v18, off, s[0:3], s32
+; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX9-NEXT:    v_mul_f32_e32 v2, v2, v17
+; GFX9-NEXT:    v_perm_b32 v2, v2, v16, s4
+; GFX9-NEXT:    v_lshlrev_b32_e32 v16, 16, v19
+; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v3
+; GFX9-NEXT:    v_mul_f32_e32 v16, v17, v16
+; GFX9-NEXT:    v_and_b32_e32 v17, 0xffff0000, v19
+; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX9-NEXT:    v_mul_f32_e32 v3, v3, v17
+; GFX9-NEXT:    v_perm_b32 v3, v3, v16, s4
+; GFX9-NEXT:    v_lshlrev_b32_e32 v16, 16, v20
+; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v4
+; GFX9-NEXT:    v_mul_f32_e32 v16, v17, v16
+; GFX9-NEXT:    v_and_b32_e32 v17, 0xffff0000, v20
+; GFX9-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GFX9-NEXT:    v_mul_f32_e32 v4, v4, v17
+; GFX9-NEXT:    v_perm_b32 v4, v4, v16, s4
+; GFX9-NEXT:    v_lshlrev_b32_e32 v16, 16, v21
+; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v5
+; GFX9-NEXT:    v_mul_f32_e32 v16, v17, v16
+; GFX9-NEXT:    v_and_b32_e32 v17, 0xffff0000, v21
+; GFX9-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
+; GFX9-NEXT:    v_mul_f32_e32 v5, v5, v17
+; GFX9-NEXT:    v_perm_b32 v5, v5, v16, s4
+; GFX9-NEXT:    v_lshlrev_b32_e32 v16, 16, v22
+; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v6
+; GFX9-NEXT:    v_mul_f32_e32 v16, v17, v16
+; GFX9-NEXT:    v_and_b32_e32 v17, 0xffff0000, v22
+; GFX9-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
+; GFX9-NEXT:    v_mul_f32_e32 v6, v6, v17
+; GFX9-NEXT:    v_perm_b32 v6, v6, v16, s4
+; GFX9-NEXT:    v_lshlrev_b32_e32 v16, 16, v23
+; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v7
+; GFX9-NEXT:    v_mul_f32_e32 v16, v17, v16
+; GFX9-NEXT:    v_and_b32_e32 v17, 0xffff0000, v23
+; GFX9-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
+; GFX9-NEXT:    v_mul_f32_e32 v7, v7, v17
+; GFX9-NEXT:    v_perm_b32 v7, v7, v16, s4
+; GFX9-NEXT:    v_lshlrev_b32_e32 v16, 16, v24
+; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v8
+; GFX9-NEXT:    v_mul_f32_e32 v16, v17, v16
+; GFX9-NEXT:    v_and_b32_e32 v17, 0xffff0000, v24
+; GFX9-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
+; GFX9-NEXT:    v_mul_f32_e32 v8, v8, v17
+; GFX9-NEXT:    v_perm_b32 v8, v8, v16, s4
+; GFX9-NEXT:    v_lshlrev_b32_e32 v16, 16, v25
+; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v9
+; GFX9-NEXT:    v_mul_f32_e32 v16, v17, v16
+; GFX9-NEXT:    v_and_b32_e32 v17, 0xffff0000, v25
+; GFX9-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
+; GFX9-NEXT:    v_mul_f32_e32 v9, v9, v17
+; GFX9-NEXT:    v_perm_b32 v9, v9, v16, s4
+; GFX9-NEXT:    v_lshlrev_b32_e32 v16, 16, v26
+; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v10
+; GFX9-NEXT:    v_mul_f32_e32 v16, v17, v16
+; GFX9-NEXT:    v_and_b32_e32 v17, 0xffff0000, v26
+; GFX9-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
+; GFX9-NEXT:    v_mul_f32_e32 v10, v10, v17
+; GFX9-NEXT:    v_perm_b32 v10, v10, v16, s4
+; GFX9-NEXT:    v_lshlrev_b32_e32 v16, 16, v27
+; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v11
+; GFX9-NEXT:    v_mul_f32_e32 v16, v17, v16
+; GFX9-NEXT:    v_and_b32_e32 v17, 0xffff0000, v27
+; GFX9-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
+; GFX9-NEXT:    v_mul_f32_e32 v11, v11, v17
+; GFX9-NEXT:    v_perm_b32 v11, v11, v16, s4
+; GFX9-NEXT:    v_lshlrev_b32_e32 v16, 16, v28
+; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v12
+; GFX9-NEXT:    v_mul_f32_e32 v16, v17, v16
+; GFX9-NEXT:    v_and_b32_e32 v17, 0xffff0000, v28
+; GFX9-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
+; GFX9-NEXT:    v_mul_f32_e32 v12, v12, v17
+; GFX9-NEXT:    v_perm_b32 v12, v12, v16, s4
+; GFX9-NEXT:    v_lshlrev_b32_e32 v16, 16, v29
+; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v13
+; GFX9-NEXT:    v_mul_f32_e32 v16, v17, v16
+; GFX9-NEXT:    v_and_b32_e32 v17, 0xffff0000, v29
+; GFX9-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
+; GFX9-NEXT:    v_mul_f32_e32 v13, v13, v17
+; GFX9-NEXT:    v_perm_b32 v13, v13, v16, s4
+; GFX9-NEXT:    v_lshlrev_b32_e32 v16, 16, v30
+; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v14
+; GFX9-NEXT:    v_mul_f32_e32 v16, v17, v16
+; GFX9-NEXT:    v_and_b32_e32 v17, 0xffff0000, v30
+; GFX9-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
+; GFX9-NEXT:    v_mul_f32_e32 v14, v14, v17
+; GFX9-NEXT:    v_perm_b32 v14, v14, v16, s4
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v35, 0xffff0000, v31
-; GFX9-NEXT:    v_lshlrev_b32_e32 v31, 16, v31
-; GFX9-NEXT:    v_mul_f32_e32 v35, v37, v35
-; GFX9-NEXT:    v_mul_f32_e32 v15, v15, v31
-; GFX9-NEXT:    v_perm_b32 v15, v15, v35, s4
+; GFX9-NEXT:    v_lshlrev_b32_e32 v16, 16, v18
+; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v15
+; GFX9-NEXT:    v_mul_f32_e32 v16, v17, v16
+; GFX9-NEXT:    v_and_b32_e32 v17, 0xffff0000, v18
+; GFX9-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
+; GFX9-NEXT:    v_mul_f32_e32 v15, v15, v17
+; GFX9-NEXT:    v_perm_b32 v15, v15, v16, s4
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fmul_v32bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    buffer_load_dword v31, off, s[0:3], s32
-; GFX10-NEXT:    v_and_b32_e32 v53, 0xffff0000, v24
-; GFX10-NEXT:    v_and_b32_e32 v54, 0xffff0000, v8
-; GFX10-NEXT:    v_and_b32_e32 v55, 0xffff0000, v23
-; GFX10-NEXT:    v_and_b32_e32 v64, 0xffff0000, v7
-; GFX10-NEXT:    v_and_b32_e32 v65, 0xffff0000, v22
-; GFX10-NEXT:    v_and_b32_e32 v66, 0xffff0000, v6
-; GFX10-NEXT:    v_and_b32_e32 v67, 0xffff0000, v21
-; GFX10-NEXT:    v_and_b32_e32 v68, 0xffff0000, v5
-; GFX10-NEXT:    v_and_b32_e32 v33, 0xffff0000, v30
-; GFX10-NEXT:    v_and_b32_e32 v34, 0xffff0000, v14
-; GFX10-NEXT:    v_and_b32_e32 v35, 0xffff0000, v29
-; GFX10-NEXT:    v_and_b32_e32 v36, 0xffff0000, v13
-; GFX10-NEXT:    v_and_b32_e32 v37, 0xffff0000, v28
-; GFX10-NEXT:    v_and_b32_e32 v38, 0xffff0000, v12
-; GFX10-NEXT:    v_and_b32_e32 v39, 0xffff0000, v27
-; GFX10-NEXT:    v_and_b32_e32 v48, 0xffff0000, v11
-; GFX10-NEXT:    v_and_b32_e32 v49, 0xffff0000, v26
-; GFX10-NEXT:    v_and_b32_e32 v50, 0xffff0000, v10
-; GFX10-NEXT:    v_and_b32_e32 v51, 0xffff0000, v25
-; GFX10-NEXT:    v_and_b32_e32 v52, 0xffff0000, v9
-; GFX10-NEXT:    v_mul_f32_e32 v53, v54, v53
-; GFX10-NEXT:    v_and_b32_e32 v54, 0xffff0000, v17
-; GFX10-NEXT:    v_mul_f32_e32 v55, v64, v55
-; GFX10-NEXT:    v_and_b32_e32 v64, 0xffff0000, v1
-; GFX10-NEXT:    v_mul_f32_e32 v65, v66, v65
-; GFX10-NEXT:    v_and_b32_e32 v66, 0xffff0000, v16
-; GFX10-NEXT:    v_mul_f32_e32 v67, v68, v67
-; GFX10-NEXT:    v_and_b32_e32 v68, 0xffff0000, v0
-; GFX10-NEXT:    v_lshlrev_b32_e32 v17, 16, v17
-; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX10-NEXT:    v_lshlrev_b32_e32 v16, 16, v16
-; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX10-NEXT:    v_and_b32_e32 v32, 0xffff0000, v15
-; GFX10-NEXT:    v_mul_f32_e32 v33, v34, v33
-; GFX10-NEXT:    v_and_b32_e32 v34, 0xffff0000, v20
-; GFX10-NEXT:    v_mul_f32_e32 v35, v36, v35
-; GFX10-NEXT:    v_and_b32_e32 v36, 0xffff0000, v4
-; GFX10-NEXT:    v_mul_f32_e32 v37, v38, v37
-; GFX10-NEXT:    v_and_b32_e32 v38, 0xffff0000, v19
-; GFX10-NEXT:    v_mul_f32_e32 v39, v48, v39
-; GFX10-NEXT:    v_and_b32_e32 v48, 0xffff0000, v3
-; GFX10-NEXT:    v_mul_f32_e32 v49, v50, v49
-; GFX10-NEXT:    v_and_b32_e32 v50, 0xffff0000, v18
-; GFX10-NEXT:    v_mul_f32_e32 v51, v52, v51
-; GFX10-NEXT:    v_and_b32_e32 v52, 0xffff0000, v2
-; GFX10-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
-; GFX10-NEXT:    v_lshlrev_b32_e32 v30, 16, v30
-; GFX10-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
-; GFX10-NEXT:    v_lshlrev_b32_e32 v29, 16, v29
-; GFX10-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
-; GFX10-NEXT:    v_lshlrev_b32_e32 v28, 16, v28
-; GFX10-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
-; GFX10-NEXT:    v_lshlrev_b32_e32 v27, 16, v27
-; GFX10-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
-; GFX10-NEXT:    v_lshlrev_b32_e32 v26, 16, v26
-; GFX10-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
-; GFX10-NEXT:    v_lshlrev_b32_e32 v25, 16, v25
-; GFX10-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
-; GFX10-NEXT:    v_lshlrev_b32_e32 v24, 16, v24
-; GFX10-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
-; GFX10-NEXT:    v_lshlrev_b32_e32 v23, 16, v23
-; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
-; GFX10-NEXT:    v_lshlrev_b32_e32 v22, 16, v22
-; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
-; GFX10-NEXT:    v_lshlrev_b32_e32 v21, 16, v21
-; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX10-NEXT:    v_lshlrev_b32_e32 v20, 16, v20
-; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX10-NEXT:    v_lshlrev_b32_e32 v19, 16, v19
-; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX10-NEXT:    v_lshlrev_b32_e32 v18, 16, v18
-; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX10-NEXT:    v_lshlrev_b32_e32 v50, 16, v21
+; GFX10-NEXT:    v_lshlrev_b32_e32 v51, 16, v5
+; GFX10-NEXT:    v_and_b32_e32 v21, 0xffff0000, v21
+; GFX10-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
+; GFX10-NEXT:    v_lshlrev_b32_e32 v52, 16, v22
+; GFX10-NEXT:    v_lshlrev_b32_e32 v53, 16, v6
+; GFX10-NEXT:    v_and_b32_e32 v22, 0xffff0000, v22
+; GFX10-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
+; GFX10-NEXT:    v_lshlrev_b32_e32 v54, 16, v23
+; GFX10-NEXT:    v_lshlrev_b32_e32 v55, 16, v7
+; GFX10-NEXT:    v_and_b32_e32 v23, 0xffff0000, v23
+; GFX10-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
+; GFX10-NEXT:    v_lshlrev_b32_e32 v32, 16, v16
+; GFX10-NEXT:    v_lshlrev_b32_e32 v33, 16, v0
+; GFX10-NEXT:    v_and_b32_e32 v16, 0xffff0000, v16
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v34, 16, v17
+; GFX10-NEXT:    v_lshlrev_b32_e32 v35, 16, v1
+; GFX10-NEXT:    v_and_b32_e32 v17, 0xffff0000, v17
+; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX10-NEXT:    v_lshlrev_b32_e32 v36, 16, v18
+; GFX10-NEXT:    v_lshlrev_b32_e32 v37, 16, v2
+; GFX10-NEXT:    v_and_b32_e32 v18, 0xffff0000, v18
+; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX10-NEXT:    v_lshlrev_b32_e32 v38, 16, v19
+; GFX10-NEXT:    v_lshlrev_b32_e32 v39, 16, v3
+; GFX10-NEXT:    v_and_b32_e32 v19, 0xffff0000, v19
+; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX10-NEXT:    v_lshlrev_b32_e32 v48, 16, v20
+; GFX10-NEXT:    v_lshlrev_b32_e32 v49, 16, v4
+; GFX10-NEXT:    v_and_b32_e32 v20, 0xffff0000, v20
+; GFX10-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GFX10-NEXT:    v_mul_f32_e32 v5, v5, v21
+; GFX10-NEXT:    v_mul_f32_e32 v21, v53, v52
+; GFX10-NEXT:    v_mul_f32_e32 v6, v6, v22
+; GFX10-NEXT:    v_mul_f32_e32 v22, v55, v54
+; GFX10-NEXT:    v_mul_f32_e32 v7, v7, v23
+; GFX10-NEXT:    v_lshlrev_b32_e32 v64, 16, v24
+; GFX10-NEXT:    v_lshlrev_b32_e32 v65, 16, v8
+; GFX10-NEXT:    v_and_b32_e32 v24, 0xffff0000, v24
+; GFX10-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
+; GFX10-NEXT:    v_lshlrev_b32_e32 v66, 16, v25
+; GFX10-NEXT:    v_lshlrev_b32_e32 v67, 16, v9
+; GFX10-NEXT:    v_and_b32_e32 v25, 0xffff0000, v25
+; GFX10-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
+; GFX10-NEXT:    v_lshlrev_b32_e32 v68, 16, v26
+; GFX10-NEXT:    v_mul_f32_e32 v32, v33, v32
+; GFX10-NEXT:    v_lshlrev_b32_e32 v33, 16, v10
+; GFX10-NEXT:    v_and_b32_e32 v26, 0xffff0000, v26
+; GFX10-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
 ; GFX10-NEXT:    v_mul_f32_e32 v0, v0, v16
+; GFX10-NEXT:    v_lshlrev_b32_e32 v16, 16, v27
+; GFX10-NEXT:    v_mul_f32_e32 v34, v35, v34
+; GFX10-NEXT:    v_lshlrev_b32_e32 v35, 16, v11
+; GFX10-NEXT:    v_and_b32_e32 v27, 0xffff0000, v27
+; GFX10-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
 ; GFX10-NEXT:    v_mul_f32_e32 v1, v1, v17
-; GFX10-NEXT:    v_mul_f32_e32 v34, v36, v34
-; GFX10-NEXT:    v_mul_f32_e32 v36, v48, v38
-; GFX10-NEXT:    v_mul_f32_e32 v38, v52, v50
-; GFX10-NEXT:    v_mul_f32_e32 v48, v64, v54
-; GFX10-NEXT:    v_mul_f32_e32 v50, v68, v66
-; GFX10-NEXT:    v_mul_f32_e32 v14, v14, v30
-; GFX10-NEXT:    v_mul_f32_e32 v13, v13, v29
-; GFX10-NEXT:    v_mul_f32_e32 v12, v12, v28
-; GFX10-NEXT:    v_mul_f32_e32 v11, v11, v27
-; GFX10-NEXT:    v_mul_f32_e32 v10, v10, v26
-; GFX10-NEXT:    v_mul_f32_e32 v9, v9, v25
-; GFX10-NEXT:    v_mul_f32_e32 v8, v8, v24
-; GFX10-NEXT:    v_mul_f32_e32 v7, v7, v23
-; GFX10-NEXT:    v_mul_f32_e32 v6, v6, v22
-; GFX10-NEXT:    v_mul_f32_e32 v5, v5, v21
+; GFX10-NEXT:    v_lshlrev_b32_e32 v17, 16, v28
+; GFX10-NEXT:    v_mul_f32_e32 v36, v37, v36
+; GFX10-NEXT:    v_lshlrev_b32_e32 v37, 16, v12
+; GFX10-NEXT:    v_and_b32_e32 v28, 0xffff0000, v28
+; GFX10-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
 ; GFX10-NEXT:    v_mul_f32_e32 v2, v2, v18
+; GFX10-NEXT:    v_lshlrev_b32_e32 v18, 16, v29
+; GFX10-NEXT:    v_mul_f32_e32 v38, v39, v38
+; GFX10-NEXT:    v_lshlrev_b32_e32 v39, 16, v13
+; GFX10-NEXT:    v_and_b32_e32 v29, 0xffff0000, v29
+; GFX10-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
 ; GFX10-NEXT:    v_mul_f32_e32 v3, v3, v19
+; GFX10-NEXT:    v_lshlrev_b32_e32 v19, 16, v30
+; GFX10-NEXT:    v_mul_f32_e32 v48, v49, v48
+; GFX10-NEXT:    v_lshlrev_b32_e32 v49, 16, v14
+; GFX10-NEXT:    v_and_b32_e32 v30, 0xffff0000, v30
+; GFX10-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
 ; GFX10-NEXT:    v_mul_f32_e32 v4, v4, v20
-; GFX10-NEXT:    v_perm_b32 v0, v0, v50, 0x3020706
-; GFX10-NEXT:    v_perm_b32 v1, v1, v48, 0x3020706
-; GFX10-NEXT:    v_perm_b32 v2, v2, v38, 0x3020706
-; GFX10-NEXT:    v_perm_b32 v3, v3, v36, 0x3020706
-; GFX10-NEXT:    v_perm_b32 v4, v4, v34, 0x3020706
-; GFX10-NEXT:    v_perm_b32 v5, v5, v67, 0x3020706
-; GFX10-NEXT:    v_perm_b32 v6, v6, v65, 0x3020706
-; GFX10-NEXT:    v_perm_b32 v7, v7, v55, 0x3020706
-; GFX10-NEXT:    v_perm_b32 v8, v8, v53, 0x3020706
-; GFX10-NEXT:    v_perm_b32 v9, v9, v51, 0x3020706
-; GFX10-NEXT:    v_perm_b32 v10, v10, v49, 0x3020706
-; GFX10-NEXT:    v_perm_b32 v11, v11, v39, 0x3020706
-; GFX10-NEXT:    v_perm_b32 v12, v12, v37, 0x3020706
-; GFX10-NEXT:    v_perm_b32 v13, v13, v35, 0x3020706
-; GFX10-NEXT:    v_perm_b32 v14, v14, v33, 0x3020706
+; GFX10-NEXT:    v_lshlrev_b32_e32 v20, 16, v15
+; GFX10-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
+; GFX10-NEXT:    v_perm_b32 v6, v6, v21, 0x7060302
+; GFX10-NEXT:    v_perm_b32 v7, v7, v22, 0x7060302
+; GFX10-NEXT:    v_mul_f32_e32 v50, v51, v50
+; GFX10-NEXT:    v_mul_f32_e32 v23, v65, v64
+; GFX10-NEXT:    v_mul_f32_e32 v8, v8, v24
+; GFX10-NEXT:    v_mul_f32_e32 v24, v67, v66
+; GFX10-NEXT:    v_mul_f32_e32 v9, v9, v25
+; GFX10-NEXT:    v_mul_f32_e32 v25, v33, v68
+; GFX10-NEXT:    v_mul_f32_e32 v10, v10, v26
+; GFX10-NEXT:    v_mul_f32_e32 v16, v35, v16
+; GFX10-NEXT:    v_mul_f32_e32 v11, v11, v27
+; GFX10-NEXT:    v_mul_f32_e32 v17, v37, v17
+; GFX10-NEXT:    v_mul_f32_e32 v12, v12, v28
+; GFX10-NEXT:    v_mul_f32_e32 v18, v39, v18
+; GFX10-NEXT:    v_mul_f32_e32 v13, v13, v29
+; GFX10-NEXT:    v_mul_f32_e32 v19, v49, v19
+; GFX10-NEXT:    v_mul_f32_e32 v14, v14, v30
+; GFX10-NEXT:    v_perm_b32 v0, v0, v32, 0x7060302
+; GFX10-NEXT:    v_perm_b32 v1, v1, v34, 0x7060302
+; GFX10-NEXT:    v_perm_b32 v2, v2, v36, 0x7060302
+; GFX10-NEXT:    v_perm_b32 v3, v3, v38, 0x7060302
+; GFX10-NEXT:    v_perm_b32 v4, v4, v48, 0x7060302
+; GFX10-NEXT:    v_perm_b32 v5, v5, v50, 0x7060302
+; GFX10-NEXT:    v_perm_b32 v8, v8, v23, 0x7060302
+; GFX10-NEXT:    v_perm_b32 v9, v9, v24, 0x7060302
+; GFX10-NEXT:    v_perm_b32 v10, v10, v25, 0x7060302
+; GFX10-NEXT:    v_perm_b32 v11, v11, v16, 0x7060302
+; GFX10-NEXT:    v_perm_b32 v12, v12, v17, 0x7060302
+; GFX10-NEXT:    v_perm_b32 v13, v13, v18, 0x7060302
+; GFX10-NEXT:    v_perm_b32 v14, v14, v19, 0x7060302
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_and_b32_e32 v16, 0xffff0000, v31
-; GFX10-NEXT:    v_lshlrev_b32_e32 v17, 16, v31
-; GFX10-NEXT:    v_mul_f32_e32 v16, v32, v16
-; GFX10-NEXT:    v_mul_f32_e32 v15, v15, v17
-; GFX10-NEXT:    v_perm_b32 v15, v15, v16, 0x3020706
+; GFX10-NEXT:    v_lshlrev_b32_e32 v21, 16, v31
+; GFX10-NEXT:    v_and_b32_e32 v22, 0xffff0000, v31
+; GFX10-NEXT:    v_mul_f32_e32 v20, v20, v21
+; GFX10-NEXT:    v_mul_f32_e32 v15, v15, v22
+; GFX10-NEXT:    v_perm_b32 v15, v15, v20, 0x7060302
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fmul_v32bf16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    scratch_load_b32 v31, off, s32
-; GFX11-NEXT:    v_and_b32_e32 v82, 0xffff0000, v2
-; GFX11-NEXT:    v_and_b32_e32 v84, 0xffff0000, v1
-; GFX11-NEXT:    v_and_b32_e32 v85, 0xffff0000, v16
-; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-NEXT:    v_and_b32_e32 v86, 0xffff0000, v0
-; GFX11-NEXT:    v_lshlrev_b32_e32 v16, 16, v16
-; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX11-NEXT:    v_and_b32_e32 v81, 0xffff0000, v18
-; GFX11-NEXT:    v_lshlrev_b32_e32 v18, 16, v18
-; GFX11-NEXT:    v_and_b32_e32 v83, 0xffff0000, v17
-; GFX11-NEXT:    v_lshlrev_b32_e32 v17, 16, v17
-; GFX11-NEXT:    v_and_b32_e32 v55, 0xffff0000, v23
-; GFX11-NEXT:    v_and_b32_e32 v70, 0xffff0000, v4
-; GFX11-NEXT:    v_mul_f32_e32 v2, v2, v18
-; GFX11-NEXT:    v_and_b32_e32 v65, 0xffff0000, v22
-; GFX11-NEXT:    v_dual_mul_f32 v1, v1, v17 :: v_dual_lshlrev_b32 v22, 16, v22
-; GFX11-NEXT:    v_and_b32_e32 v66, 0xffff0000, v6
-; GFX11-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
-; GFX11-NEXT:    v_and_b32_e32 v50, 0xffff0000, v10
-; GFX11-NEXT:    v_and_b32_e32 v69, 0xffff0000, v20
-; GFX11-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
+; GFX11-NEXT:    v_lshlrev_b32_e32 v68, 16, v26
+; GFX11-NEXT:    v_lshlrev_b32_e32 v69, 16, v10
+; GFX11-NEXT:    v_and_b32_e32 v26, 0xffff0000, v26
+; GFX11-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
+; GFX11-NEXT:    v_lshlrev_b32_e32 v70, 16, v27
+; GFX11-NEXT:    v_lshlrev_b32_e32 v71, 16, v11
+; GFX11-NEXT:    v_lshlrev_b32_e32 v50, 16, v21
+; GFX11-NEXT:    v_lshlrev_b32_e32 v54, 16, v23
+; GFX11-NEXT:    v_lshlrev_b32_e32 v55, 16, v7
+; GFX11-NEXT:    v_lshlrev_b32_e32 v64, 16, v24
+; GFX11-NEXT:    v_lshlrev_b32_e32 v65, 16, v8
+; GFX11-NEXT:    v_and_b32_e32 v24, 0xffff0000, v24
+; GFX11-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
+; GFX11-NEXT:    v_lshlrev_b32_e32 v51, 16, v5
+; GFX11-NEXT:    v_dual_mul_f32 v10, v10, v26 :: v_dual_and_b32 v5, 0xffff0000, v5
+; GFX11-NEXT:    v_lshlrev_b32_e32 v66, 16, v25
+; GFX11-NEXT:    v_and_b32_e32 v25, 0xffff0000, v25
+; GFX11-NEXT:    v_and_b32_e32 v27, 0xffff0000, v27
+; GFX11-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
+; GFX11-NEXT:    v_lshlrev_b32_e32 v80, 16, v28
+; GFX11-NEXT:    v_lshlrev_b32_e32 v81, 16, v12
+; GFX11-NEXT:    v_lshlrev_b32_e32 v52, 16, v22
+; GFX11-NEXT:    v_and_b32_e32 v28, 0xffff0000, v28
+; GFX11-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
+; GFX11-NEXT:    v_lshlrev_b32_e32 v53, 16, v6
+; GFX11-NEXT:    v_lshlrev_b32_e32 v82, 16, v29
+; GFX11-NEXT:    v_lshlrev_b32_e32 v83, 16, v13
+; GFX11-NEXT:    v_and_b32_e32 v23, 0xffff0000, v23
+; GFX11-NEXT:    v_and_b32_e32 v29, 0xffff0000, v29
+; GFX11-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
+; GFX11-NEXT:    v_lshlrev_b32_e32 v84, 16, v30
+; GFX11-NEXT:    v_lshlrev_b32_e32 v85, 16, v14
+; GFX11-NEXT:    v_and_b32_e32 v22, 0xffff0000, v22
+; GFX11-NEXT:    v_and_b32_e32 v30, 0xffff0000, v30
+; GFX11-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
+; GFX11-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
+; GFX11-NEXT:    v_lshlrev_b32_e32 v86, 16, v15
+; GFX11-NEXT:    v_lshlrev_b32_e32 v67, 16, v9
+; GFX11-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
+; GFX11-NEXT:    v_lshlrev_b32_e32 v48, 16, v20
+; GFX11-NEXT:    v_dual_mul_f32 v11, v11, v27 :: v_dual_and_b32 v20, 0xffff0000, v20
+; GFX11-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
+; GFX11-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
+; GFX11-NEXT:    v_dual_mul_f32 v26, v71, v70 :: v_dual_lshlrev_b32 v49, 16, v4
+; GFX11-NEXT:    v_dual_mul_f32 v13, v13, v29 :: v_dual_and_b32 v4, 0xffff0000, v4
+; GFX11-NEXT:    v_lshlrev_b32_e32 v35, 16, v1
+; GFX11-NEXT:    v_lshlrev_b32_e32 v37, 16, v2
+; GFX11-NEXT:    v_lshlrev_b32_e32 v38, 16, v19
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT:    v_dual_mul_f32 v6, v6, v22 :: v_dual_lshlrev_b32 v23, 16, v23
-; GFX11-NEXT:    v_and_b32_e32 v54, 0xffff0000, v8
-; GFX11-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
-; GFX11-NEXT:    v_and_b32_e32 v71, 0xffff0000, v19
-; GFX11-NEXT:    v_dual_mul_f32 v0, v0, v16 :: v_dual_lshlrev_b32 v19, 16, v19
-; GFX11-NEXT:    v_and_b32_e32 v64, 0xffff0000, v7
-; GFX11-NEXT:    v_and_b32_e32 v49, 0xffff0000, v26
-; GFX11-NEXT:    v_lshlrev_b32_e32 v26, 16, v26
-; GFX11-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
-; GFX11-NEXT:    v_and_b32_e32 v52, 0xffff0000, v9
-; GFX11-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
-; GFX11-NEXT:    v_and_b32_e32 v80, 0xffff0000, v3
-; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-NEXT:    v_dual_mul_f32 v10, v10, v26 :: v_dual_and_b32 v67, 0xffff0000, v21
-; GFX11-NEXT:    v_and_b32_e32 v68, 0xffff0000, v5
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX11-NEXT:    v_dual_mul_f32 v3, v3, v19 :: v_dual_and_b32 v38, 0xffff0000, v12
-; GFX11-NEXT:    v_and_b32_e32 v51, 0xffff0000, v25
-; GFX11-NEXT:    v_lshlrev_b32_e32 v25, 16, v25
-; GFX11-NEXT:    v_mul_f32_e32 v7, v7, v23
-; GFX11-NEXT:    v_and_b32_e32 v37, 0xffff0000, v28
-; GFX11-NEXT:    v_and_b32_e32 v34, 0xffff0000, v14
-; GFX11-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
-; GFX11-NEXT:    v_and_b32_e32 v39, 0xffff0000, v27
-; GFX11-NEXT:    v_lshlrev_b32_e32 v27, 16, v27
-; GFX11-NEXT:    v_mul_f32_e32 v9, v9, v25
-; GFX11-NEXT:    v_and_b32_e32 v53, 0xffff0000, v24
-; GFX11-NEXT:    v_and_b32_e32 v48, 0xffff0000, v11
-; GFX11-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
-; GFX11-NEXT:    v_and_b32_e32 v35, 0xffff0000, v29
-; GFX11-NEXT:    v_and_b32_e32 v36, 0xffff0000, v13
-; GFX11-NEXT:    v_lshlrev_b32_e32 v20, 16, v20
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_dual_mul_f32 v11, v11, v27 :: v_dual_lshlrev_b32 v4, 16, v4
-; GFX11-NEXT:    v_and_b32_e32 v33, 0xffff0000, v30
-; GFX11-NEXT:    v_and_b32_e32 v32, 0xffff0000, v15
-; GFX11-NEXT:    v_dual_mul_f32 v4, v4, v20 :: v_dual_lshlrev_b32 v15, 16, v15
-; GFX11-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
-; GFX11-NEXT:    v_lshlrev_b32_e32 v28, 16, v28
-; GFX11-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
-; GFX11-NEXT:    v_lshlrev_b32_e32 v24, 16, v24
-; GFX11-NEXT:    v_lshlrev_b32_e32 v21, 16, v21
-; GFX11-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX11-NEXT:    v_dual_mul_f32 v33, v34, v33 :: v_dual_mul_f32 v34, v36, v35
-; GFX11-NEXT:    v_dual_mul_f32 v35, v38, v37 :: v_dual_mul_f32 v12, v12, v28
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_dual_mul_f32 v8, v8, v24 :: v_dual_mul_f32 v5, v5, v21
-; GFX11-NEXT:    v_perm_b32 v12, v12, v35, 0x3020706
+; GFX11-NEXT:    v_mul_f32_e32 v4, v4, v20
+; GFX11-NEXT:    v_dual_mul_f32 v8, v8, v24 :: v_dual_mul_f32 v9, v9, v25
+; GFX11-NEXT:    v_mul_f32_e32 v25, v69, v68
+; GFX11-NEXT:    v_dual_mul_f32 v20, v51, v50 :: v_dual_lshlrev_b32 v39, 16, v3
+; GFX11-NEXT:    v_mul_f32_e32 v27, v81, v80
+; GFX11-NEXT:    v_mul_f32_e32 v12, v12, v28
+; GFX11-NEXT:    v_dual_mul_f32 v28, v83, v82 :: v_dual_mul_f32 v29, v85, v84
+; GFX11-NEXT:    v_dual_mul_f32 v6, v6, v22 :: v_dual_and_b32 v3, 0xffff0000, v3
+; GFX11-NEXT:    v_mul_f32_e32 v22, v55, v54
+; GFX11-NEXT:    v_lshlrev_b32_e32 v36, 16, v18
+; GFX11-NEXT:    v_lshlrev_b32_e32 v34, 16, v17
+; GFX11-NEXT:    v_and_b32_e32 v17, 0xffff0000, v17
+; GFX11-NEXT:    v_and_b32_e32 v18, 0xffff0000, v18
+; GFX11-NEXT:    v_mul_f32_e32 v14, v14, v30
+; GFX11-NEXT:    v_dual_mul_f32 v7, v7, v23 :: v_dual_and_b32 v2, 0xffff0000, v2
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX11-NEXT:    v_mul_f32_e32 v23, v65, v64
+; GFX11-NEXT:    v_and_b32_e32 v19, 0xffff0000, v19
+; GFX11-NEXT:    v_dual_mul_f32 v24, v67, v66 :: v_dual_and_b32 v21, 0xffff0000, v21
+; GFX11-NEXT:    v_mul_f32_e32 v2, v2, v18
+; GFX11-NEXT:    v_dual_mul_f32 v1, v1, v17 :: v_dual_lshlrev_b32 v32, 16, v16
+; GFX11-NEXT:    v_mul_f32_e32 v18, v39, v38
+; GFX11-NEXT:    v_dual_mul_f32 v3, v3, v19 :: v_dual_and_b32 v16, 0xffff0000, v16
+; GFX11-NEXT:    v_mul_f32_e32 v19, v49, v48
+; GFX11-NEXT:    v_mul_f32_e32 v17, v37, v36
+; GFX11-NEXT:    v_lshlrev_b32_e32 v33, 16, v0
+; GFX11-NEXT:    v_dual_mul_f32 v5, v5, v21 :: v_dual_and_b32 v0, 0xffff0000, v0
+; GFX11-NEXT:    v_mul_f32_e32 v21, v53, v52
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_perm_b32 v2, v2, v17, 0x7060302
+; GFX11-NEXT:    v_perm_b32 v3, v3, v18, 0x7060302
+; GFX11-NEXT:    v_mul_f32_e32 v0, v0, v16
+; GFX11-NEXT:    v_mul_f32_e32 v16, v35, v34
+; GFX11-NEXT:    v_mul_f32_e32 v32, v33, v32
+; GFX11-NEXT:    v_perm_b32 v4, v4, v19, 0x7060302
+; GFX11-NEXT:    v_perm_b32 v5, v5, v20, 0x7060302
+; GFX11-NEXT:    v_perm_b32 v6, v6, v21, 0x7060302
+; GFX11-NEXT:    v_perm_b32 v1, v1, v16, 0x7060302
+; GFX11-NEXT:    v_perm_b32 v0, v0, v32, 0x7060302
+; GFX11-NEXT:    v_perm_b32 v7, v7, v22, 0x7060302
+; GFX11-NEXT:    v_perm_b32 v8, v8, v23, 0x7060302
+; GFX11-NEXT:    v_perm_b32 v9, v9, v24, 0x7060302
+; GFX11-NEXT:    v_perm_b32 v10, v10, v25, 0x7060302
+; GFX11-NEXT:    v_perm_b32 v11, v11, v26, 0x7060302
+; GFX11-NEXT:    v_perm_b32 v12, v12, v27, 0x7060302
+; GFX11-NEXT:    v_perm_b32 v13, v13, v28, 0x7060302
+; GFX11-NEXT:    v_perm_b32 v14, v14, v29, 0x7060302
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_and_b32_e32 v16, 0xffff0000, v31
-; GFX11-NEXT:    v_lshlrev_b32_e32 v29, 16, v29
-; GFX11-NEXT:    v_lshlrev_b32_e32 v17, 16, v31
-; GFX11-NEXT:    v_lshlrev_b32_e32 v30, 16, v30
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_dual_mul_f32 v16, v32, v16 :: v_dual_mul_f32 v13, v13, v29
-; GFX11-NEXT:    v_dual_mul_f32 v15, v15, v17 :: v_dual_mul_f32 v14, v14, v30
-; GFX11-NEXT:    v_mul_f32_e32 v36, v48, v39
-; GFX11-NEXT:    v_dual_mul_f32 v48, v64, v55 :: v_dual_mul_f32 v37, v50, v49
-; GFX11-NEXT:    v_mul_f32_e32 v50, v68, v67
-; GFX11-NEXT:    v_dual_mul_f32 v38, v52, v51 :: v_dual_mul_f32 v51, v70, v69
-; GFX11-NEXT:    v_dual_mul_f32 v52, v80, v71 :: v_dual_mul_f32 v39, v54, v53
-; GFX11-NEXT:    v_dual_mul_f32 v53, v82, v81 :: v_dual_mul_f32 v54, v84, v83
-; GFX11-NEXT:    v_mul_f32_e32 v55, v86, v85
-; GFX11-NEXT:    v_mul_f32_e32 v49, v66, v65
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_perm_b32 v3, v3, v52, 0x3020706
-; GFX11-NEXT:    v_perm_b32 v2, v2, v53, 0x3020706
-; GFX11-NEXT:    v_perm_b32 v1, v1, v54, 0x3020706
-; GFX11-NEXT:    v_perm_b32 v0, v0, v55, 0x3020706
-; GFX11-NEXT:    v_perm_b32 v4, v4, v51, 0x3020706
-; GFX11-NEXT:    v_perm_b32 v5, v5, v50, 0x3020706
-; GFX11-NEXT:    v_perm_b32 v6, v6, v49, 0x3020706
-; GFX11-NEXT:    v_perm_b32 v7, v7, v48, 0x3020706
-; GFX11-NEXT:    v_perm_b32 v8, v8, v39, 0x3020706
-; GFX11-NEXT:    v_perm_b32 v9, v9, v38, 0x3020706
-; GFX11-NEXT:    v_perm_b32 v10, v10, v37, 0x3020706
-; GFX11-NEXT:    v_perm_b32 v11, v11, v36, 0x3020706
-; GFX11-NEXT:    v_perm_b32 v13, v13, v34, 0x3020706
-; GFX11-NEXT:    v_perm_b32 v14, v14, v33, 0x3020706
-; GFX11-NEXT:    v_perm_b32 v15, v15, v16, 0x3020706
+; GFX11-NEXT:    v_lshlrev_b32_e32 v16, 16, v31
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_dual_mul_f32 v16, v86, v16 :: v_dual_and_b32 v17, 0xffff0000, v31
+; GFX11-NEXT:    v_mul_f32_e32 v15, v15, v17
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_perm_b32 v15, v15, v16, 0x7060302
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = fmul <32 x bfloat> %a, %b
   ret <32 x bfloat> %op
@@ -12980,8 +12815,8 @@ define bfloat @v_fdiv_bf16(bfloat %a, bfloat %b) {
 ; GFX8-LABEL: v_fdiv_bf16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX8-NEXT:    v_div_scale_f32 v2, s[4:5], v1, v1, v0
 ; GFX8-NEXT:    v_div_scale_f32 v3, vcc, v0, v1, v0
 ; GFX8-NEXT:    v_rcp_f32_e32 v4, v2
@@ -12993,14 +12828,14 @@ define bfloat @v_fdiv_bf16(bfloat %a, bfloat %b) {
 ; GFX8-NEXT:    v_fma_f32 v2, -v2, v5, v3
 ; GFX8-NEXT:    v_div_fmas_f32 v2, v2, v4, v5
 ; GFX8-NEXT:    v_div_fixup_f32 v0, v2, v1, v0
-; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_fdiv_bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX9-NEXT:    v_div_scale_f32 v2, s[4:5], v1, v1, v0
 ; GFX9-NEXT:    v_div_scale_f32 v3, vcc, v0, v1, v0
 ; GFX9-NEXT:    v_rcp_f32_e32 v4, v2
@@ -13012,14 +12847,14 @@ define bfloat @v_fdiv_bf16(bfloat %a, bfloat %b) {
 ; GFX9-NEXT:    v_fma_f32 v2, -v2, v5, v3
 ; GFX9-NEXT:    v_div_fmas_f32 v2, v2, v4, v5
 ; GFX9-NEXT:    v_div_fixup_f32 v0, v2, v1, v0
-; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fdiv_bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX10-NEXT:    v_div_scale_f32 v2, s4, v1, v1, v0
 ; GFX10-NEXT:    v_rcp_f32_e32 v3, v2
 ; GFX10-NEXT:    v_fma_f32 v4, -v2, v3, 1.0
@@ -13031,14 +12866,14 @@ define bfloat @v_fdiv_bf16(bfloat %a, bfloat %b) {
 ; GFX10-NEXT:    v_fma_f32 v2, -v2, v5, v4
 ; GFX10-NEXT:    v_div_fmas_f32 v2, v2, v3, v5
 ; GFX10-NEXT:    v_div_fixup_f32 v0, v2, v1, v0
-; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fdiv_bf16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_div_scale_f32 v2, null, v1, v1, v0
 ; GFX11-NEXT:    v_rcp_f32_e32 v3, v2
@@ -13056,7 +12891,7 @@ define bfloat @v_fdiv_bf16(bfloat %a, bfloat %b) {
 ; GFX11-NEXT:    v_div_fmas_f32 v2, v2, v3, v5
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_div_fixup_f32 v0, v2, v1, v0
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = fdiv bfloat %a, %b
   ret bfloat %op
@@ -13084,34 +12919,25 @@ define bfloat @v_fabs_bf16(bfloat %a) {
 ; GFX8-LABEL: v_fabs_bf16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX8-NEXT:    v_and_b32_e32 v0, 0x7fffffff, v0
-; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_fabs_bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX9-NEXT:    v_and_b32_e32 v0, 0x7fffffff, v0
-; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fabs_bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX10-NEXT:    v_and_b32_e32 v0, 0x7fffffff, v0
-; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fabs_bf16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_and_b32_e32 v0, 0x7fffffff, v0
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = call bfloat @llvm.fabs.bf16(bfloat %a)
   ret bfloat %op
@@ -13130,22 +12956,33 @@ define amdgpu_ps i32 @s_fabs_bf16(bfloat inreg %a) {
 ;
 ; GFX8-LABEL: s_fabs_bf16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_bfe_u32 s0, s0, 0xf0010
+; GFX8-NEXT:    v_mov_b32_e32 v0, 0x7fff
+; GFX8-NEXT:    v_and_b32_e32 v0, s0, v0
+; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX8-NEXT:    ; return to shader part epilog
 ;
 ; GFX9-LABEL: s_fabs_bf16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_bfe_u32 s0, s0, 0xf0010
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0x7fff
+; GFX9-NEXT:    v_and_b32_e32 v0, s0, v0
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX9-NEXT:    ; return to shader part epilog
 ;
 ; GFX10-LABEL: s_fabs_bf16:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_bfe_u32 s0, s0, 0xf0010
+; GFX10-NEXT:    v_and_b32_e64 v0, 0x7fff, s0
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX10-NEXT:    ; return to shader part epilog
 ;
 ; GFX11-LABEL: s_fabs_bf16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_bfe_u32 s0, s0, 0xf0010
+; GFX11-NEXT:    v_and_b32_e64 v0, 0x7fff, s0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX11-NEXT:    ; return to shader part epilog
   %op = call bfloat @llvm.fabs.bf16(bfloat %a)
   %cast = bitcast bfloat %op to i16
@@ -13170,25 +13007,25 @@ define bfloat @v_fneg_bf16(bfloat %a) {
 ; GFX8-LABEL: v_fneg_bf16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
+; GFX8-NEXT:    v_xor_b32_e32 v0, 0xffff8000, v0
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_fneg_bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
+; GFX9-NEXT:    v_xor_b32_e32 v0, 0xffff8000, v0
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fneg_bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
+; GFX10-NEXT:    v_xor_b32_e32 v0, 0xffff8000, v0
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fneg_bf16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
+; GFX11-NEXT:    v_xor_b32_e32 v0, 0xffff8000, v0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = fneg bfloat %a
   ret bfloat %op
@@ -13212,7 +13049,6 @@ define amdgpu_ps i32 @s_fneg_bf16(bfloat inreg %a) {
 ;
 ; GFX8-LABEL: s_fneg_bf16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_lshr_b32 s0, s0, 16
 ; GFX8-NEXT:    v_mov_b32_e32 v0, 0xffff8000
 ; GFX8-NEXT:    v_xor_b32_e32 v0, s0, v0
 ; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff, v0
@@ -13221,7 +13057,6 @@ define amdgpu_ps i32 @s_fneg_bf16(bfloat inreg %a) {
 ;
 ; GFX9-LABEL: s_fneg_bf16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_lshr_b32 s0, s0, 16
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0xffff8000
 ; GFX9-NEXT:    v_xor_b32_e32 v0, s0, v0
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
@@ -13230,7 +13065,6 @@ define amdgpu_ps i32 @s_fneg_bf16(bfloat inreg %a) {
 ;
 ; GFX10-LABEL: s_fneg_bf16:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_lshr_b32 s0, s0, 16
 ; GFX10-NEXT:    v_xor_b32_e64 v0, 0xffff8000, s0
 ; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
@@ -13238,11 +13072,9 @@ define amdgpu_ps i32 @s_fneg_bf16(bfloat inreg %a) {
 ;
 ; GFX11-LABEL: s_fneg_bf16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_lshr_b32 s0, s0, 16
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_xor_b32_e64 v0, 0xffff8000, s0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX11-NEXT:    ; return to shader part epilog
   %op = fneg bfloat %a
@@ -13276,43 +13108,25 @@ define bfloat @v_fneg_fabs_bf16(bfloat %a) {
 ; GFX8-LABEL: v_fneg_fabs_bf16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX8-NEXT:    v_and_b32_e32 v0, 0x7fffffff, v0
-; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX8-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
-; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    v_or_b32_e32 v0, 0xffff8000, v0
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_fneg_fabs_bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX9-NEXT:    v_and_b32_e32 v0, 0x7fffffff, v0
-; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX9-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
-; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    v_or_b32_e32 v0, 0xffff8000, v0
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fneg_fabs_bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX10-NEXT:    v_and_b32_e32 v0, 0x7fffffff, v0
-; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX10-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
-; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    v_or_b32_e32 v0, 0xffff8000, v0
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fneg_fabs_bf16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_and_b32_e32 v0, 0x7fffffff, v0
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT:    v_or_b32_e32 v0, 0xffff8000, v0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %fabs = call bfloat @llvm.fabs.bf16(bfloat %a)
   %op = fneg bfloat %fabs
@@ -13335,7 +13149,6 @@ define amdgpu_ps i32 @s_fneg_fabs_bf16(bfloat inreg %a) {
 ;
 ; GFX8-LABEL: s_fneg_fabs_bf16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_lshr_b32 s0, s0, 16
 ; GFX8-NEXT:    v_mov_b32_e32 v0, 0xffff8000
 ; GFX8-NEXT:    v_or_b32_e32 v0, s0, v0
 ; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff, v0
@@ -13344,7 +13157,6 @@ define amdgpu_ps i32 @s_fneg_fabs_bf16(bfloat inreg %a) {
 ;
 ; GFX9-LABEL: s_fneg_fabs_bf16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_lshr_b32 s0, s0, 16
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0xffff8000
 ; GFX9-NEXT:    v_or_b32_e32 v0, s0, v0
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
@@ -13353,7 +13165,6 @@ define amdgpu_ps i32 @s_fneg_fabs_bf16(bfloat inreg %a) {
 ;
 ; GFX10-LABEL: s_fneg_fabs_bf16:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_lshr_b32 s0, s0, 16
 ; GFX10-NEXT:    v_or_b32_e64 v0, 0xffff8000, s0
 ; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
@@ -13361,11 +13172,9 @@ define amdgpu_ps i32 @s_fneg_fabs_bf16(bfloat inreg %a) {
 ;
 ; GFX11-LABEL: s_fneg_fabs_bf16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_lshr_b32 s0, s0, 16
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_or_b32_e64 v0, 0xffff8000, s0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX11-NEXT:    ; return to shader part epilog
   %fabs = call bfloat @llvm.fabs.bf16(bfloat %a)
@@ -13410,46 +13219,38 @@ define bfloat @v_minnum_bf16(bfloat %a, bfloat %b) {
 ; GFX8-LABEL: v_minnum_bf16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX8-NEXT:    v_mul_f32_e32 v1, 1.0, v1
-; GFX8-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX8-NEXT:    v_min_f32_e32 v0, v0, v1
-; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_minnum_bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX9-NEXT:    v_max_f32_e32 v1, v1, v1
-; GFX9-NEXT:    v_max_f32_e32 v0, v0, v0
+; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX9-NEXT:    v_min_f32_e32 v0, v0, v1
-; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_minnum_bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX10-NEXT:    v_max_f32_e32 v1, v1, v1
-; GFX10-NEXT:    v_max_f32_e32 v0, v0, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX10-NEXT:    v_min_f32_e32 v0, v0, v1
-; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_minnum_bf16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_dual_max_f32 v0, v0, v0 :: v_dual_and_b32 v1, 0xffff0000, v1
-; GFX11-NEXT:    v_max_f32_e32 v1, v1, v1
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_min_f32_e32 v0, v0, v1
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = call bfloat @llvm.minnum.bf16(bfloat %a, bfloat %b)
   ret bfloat %op
@@ -13493,68 +13294,53 @@ define <2 x bfloat> @v_minnum_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) {
 ; GFX8-LABEL: v_minnum_v2bf16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v1
-; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v0
-; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX8-NEXT:    v_mul_f32_e32 v2, 1.0, v2
-; GFX8-NEXT:    v_mul_f32_e32 v3, 1.0, v3
-; GFX8-NEXT:    v_mul_f32_e32 v1, 1.0, v1
-; GFX8-NEXT:    v_mul_f32_e32 v0, 1.0, v0
-; GFX8-NEXT:    v_min_f32_e32 v2, v3, v2
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
+; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
+; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GFX8-NEXT:    v_min_f32_e32 v0, v0, v1
-; GFX8-NEXT:    s_mov_b32 s4, 0x3020706
-; GFX8-NEXT:    v_perm_b32 v0, v0, v2, s4
+; GFX8-NEXT:    v_min_f32_e32 v2, v3, v2
+; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX8-NEXT:    v_alignbit_b32 v0, v0, v2, 16
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_minnum_v2bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff0000, v1
-; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v0
-; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX9-NEXT:    v_max_f32_e32 v2, v2, v2
-; GFX9-NEXT:    v_max_f32_e32 v3, v3, v3
-; GFX9-NEXT:    v_max_f32_e32 v1, v1, v1
-; GFX9-NEXT:    v_max_f32_e32 v0, v0, v0
+; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
+; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
+; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GFX9-NEXT:    v_min_f32_e32 v2, v3, v2
 ; GFX9-NEXT:    v_min_f32_e32 v0, v0, v1
-; GFX9-NEXT:    s_mov_b32 s4, 0x3020706
+; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
 ; GFX9-NEXT:    v_perm_b32 v0, v0, v2, s4
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_minnum_v2bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v1
-; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v0
-; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX10-NEXT:    v_max_f32_e32 v2, v2, v2
-; GFX10-NEXT:    v_max_f32_e32 v3, v3, v3
-; GFX10-NEXT:    v_max_f32_e32 v1, v1, v1
-; GFX10-NEXT:    v_max_f32_e32 v0, v0, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
+; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
+; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GFX10-NEXT:    v_min_f32_e32 v2, v3, v2
 ; GFX10-NEXT:    v_min_f32_e32 v0, v0, v1
-; GFX10-NEXT:    v_perm_b32 v0, v0, v2, 0x3020706
+; GFX10-NEXT:    v_perm_b32 v0, v0, v2, 0x7060302
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_minnum_v2bf16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v0
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v1
-; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_dual_max_f32 v3, v3, v3 :: v_dual_max_f32 v2, v2, v2
-; GFX11-NEXT:    v_dual_max_f32 v0, v0, v0 :: v_dual_max_f32 v1, v1, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_min_f32_e32 v2, v3, v2
+; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX11-NEXT:    v_min_f32_e32 v0, v0, v1
+; GFX11-NEXT:    v_min_f32_e32 v2, v3, v2
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_perm_b32 v0, v0, v2, 0x3020706
+; GFX11-NEXT:    v_perm_b32 v0, v0, v2, 0x7060302
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = call <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat> %a, <2 x bfloat> %b)
   ret <2 x bfloat> %op
@@ -13610,91 +13396,82 @@ define <3 x bfloat> @v_minnum_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) {
 ; GFX8-LABEL: v_minnum_v3bf16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX8-NEXT:    v_mul_f32_e32 v3, 1.0, v3
-; GFX8-NEXT:    v_mul_f32_e32 v1, 1.0, v1
-; GFX8-NEXT:    v_min_f32_e32 v1, v1, v3
-; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v2
-; GFX8-NEXT:    v_and_b32_e32 v4, 0xffff0000, v0
-; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX8-NEXT:    v_mul_f32_e32 v3, 1.0, v3
-; GFX8-NEXT:    v_mul_f32_e32 v4, 1.0, v4
-; GFX8-NEXT:    v_mul_f32_e32 v2, 1.0, v2
-; GFX8-NEXT:    v_mul_f32_e32 v0, 1.0, v0
-; GFX8-NEXT:    v_min_f32_e32 v3, v4, v3
+; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
+; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GFX8-NEXT:    v_min_f32_e32 v0, v0, v2
-; GFX8-NEXT:    s_mov_b32 s4, 0x3020706
+; GFX8-NEXT:    v_min_f32_e32 v4, v5, v4
+; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX8-NEXT:    v_alignbit_b32 v0, v0, v4, 16
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
+; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v1
+; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX8-NEXT:    v_min_f32_e32 v1, v1, v3
+; GFX8-NEXT:    v_min_f32_e32 v2, v4, v2
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX8-NEXT:    v_perm_b32 v0, v0, v3, s4
+; GFX8-NEXT:    v_alignbit_b32 v1, v1, v2, 16
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_minnum_v3bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX9-NEXT:    v_max_f32_e32 v3, v3, v3
-; GFX9-NEXT:    v_max_f32_e32 v1, v1, v1
-; GFX9-NEXT:    v_min_f32_e32 v1, v1, v3
-; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v2
-; GFX9-NEXT:    v_and_b32_e32 v4, 0xffff0000, v0
-; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX9-NEXT:    v_max_f32_e32 v3, v3, v3
-; GFX9-NEXT:    v_max_f32_e32 v4, v4, v4
-; GFX9-NEXT:    v_max_f32_e32 v2, v2, v2
-; GFX9-NEXT:    v_max_f32_e32 v0, v0, v0
-; GFX9-NEXT:    v_min_f32_e32 v3, v4, v3
+; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
+; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    v_min_f32_e32 v4, v5, v4
 ; GFX9-NEXT:    v_min_f32_e32 v0, v0, v2
-; GFX9-NEXT:    s_mov_b32 s4, 0x3020706
-; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX9-NEXT:    v_perm_b32 v0, v0, v3, s4
+; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
+; GFX9-NEXT:    v_perm_b32 v0, v0, v4, s4
+; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
+; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 16, v1
+; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX9-NEXT:    v_min_f32_e32 v2, v4, v2
+; GFX9-NEXT:    v_min_f32_e32 v1, v1, v3
+; GFX9-NEXT:    v_perm_b32 v1, v1, v2, s4
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_minnum_v3bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX10-NEXT:    v_and_b32_e32 v4, 0xffff0000, v2
-; GFX10-NEXT:    v_and_b32_e32 v5, 0xffff0000, v0
-; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX10-NEXT:    v_max_f32_e32 v3, v3, v3
-; GFX10-NEXT:    v_max_f32_e32 v4, v4, v4
-; GFX10-NEXT:    v_max_f32_e32 v5, v5, v5
-; GFX10-NEXT:    v_max_f32_e32 v2, v2, v2
-; GFX10-NEXT:    v_max_f32_e32 v0, v0, v0
-; GFX10-NEXT:    v_max_f32_e32 v1, v1, v1
+; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
+; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 16, v3
+; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 16, v1
+; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX10-NEXT:    v_min_f32_e32 v4, v5, v4
 ; GFX10-NEXT:    v_min_f32_e32 v0, v0, v2
+; GFX10-NEXT:    v_min_f32_e32 v2, v7, v6
 ; GFX10-NEXT:    v_min_f32_e32 v1, v1, v3
-; GFX10-NEXT:    v_perm_b32 v0, v0, v4, 0x3020706
-; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX10-NEXT:    v_perm_b32 v0, v0, v4, 0x7060302
+; GFX10-NEXT:    v_perm_b32 v1, v1, v2, 0x7060302
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_minnum_v3bf16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xffff0000, v2
-; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_dual_max_f32 v4, v4, v4 :: v_dual_lshlrev_b32 v1, 16, v1
-; GFX11-NEXT:    v_dual_max_f32 v2, v2, v2 :: v_dual_and_b32 v5, 0xffff0000, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_dual_max_f32 v1, v1, v1 :: v_dual_lshlrev_b32 v0, 16, v0
-; GFX11-NEXT:    v_dual_max_f32 v3, v3, v3 :: v_dual_max_f32 v0, v0, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_dual_min_f32 v1, v1, v3 :: v_dual_min_f32 v0, v0, v2
-; GFX11-NEXT:    v_max_f32_e32 v5, v5, v5
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    v_lshlrev_b32_e32 v6, 16, v3
+; GFX11-NEXT:    v_lshlrev_b32_e32 v7, 16, v1
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX11-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_dual_min_f32 v1, v1, v3 :: v_dual_and_b32 v0, 0xffff0000, v0
+; GFX11-NEXT:    v_min_f32_e32 v0, v0, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_min_f32_e32 v4, v5, v4
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_perm_b32 v0, v0, v4, 0x3020706
+; GFX11-NEXT:    v_min_f32_e32 v2, v7, v6
+; GFX11-NEXT:    v_perm_b32 v0, v0, v4, 0x7060302
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT:    v_perm_b32 v1, v1, v2, 0x7060302
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = call <3 x bfloat> @llvm.minnum.v3bf16(<3 x bfloat> %a, <3 x bfloat> %b)
   ret <3 x bfloat> %op
@@ -13762,113 +13539,82 @@ define <4 x bfloat> @v_minnum_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
 ; GFX8-LABEL: v_minnum_v4bf16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_and_b32_e32 v4, 0xffff0000, v3
-; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v1
-; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX8-NEXT:    v_mul_f32_e32 v4, 1.0, v4
-; GFX8-NEXT:    v_mul_f32_e32 v5, 1.0, v5
-; GFX8-NEXT:    v_mul_f32_e32 v3, 1.0, v3
-; GFX8-NEXT:    v_mul_f32_e32 v1, 1.0, v1
+; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
+; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    v_min_f32_e32 v0, v0, v2
 ; GFX8-NEXT:    v_min_f32_e32 v4, v5, v4
+; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX8-NEXT:    v_alignbit_b32 v0, v0, v4, 16
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
+; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v1
+; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX8-NEXT:    v_min_f32_e32 v1, v1, v3
-; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v2
-; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v0
-; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX8-NEXT:    v_mul_f32_e32 v3, 1.0, v3
-; GFX8-NEXT:    v_mul_f32_e32 v5, 1.0, v5
-; GFX8-NEXT:    v_mul_f32_e32 v2, 1.0, v2
-; GFX8-NEXT:    v_mul_f32_e32 v0, 1.0, v0
-; GFX8-NEXT:    v_min_f32_e32 v3, v5, v3
-; GFX8-NEXT:    v_min_f32_e32 v0, v0, v2
-; GFX8-NEXT:    s_mov_b32 s4, 0x3020706
-; GFX8-NEXT:    v_perm_b32 v0, v0, v3, s4
-; GFX8-NEXT:    v_perm_b32 v1, v1, v4, s4
+; GFX8-NEXT:    v_min_f32_e32 v2, v4, v2
+; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX8-NEXT:    v_alignbit_b32 v1, v1, v2, 16
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_minnum_v4bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
-; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
-; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
-; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX9-NEXT:    v_max_f32_e32 v4, v4, v4
-; GFX9-NEXT:    v_max_f32_e32 v5, v5, v5
-; GFX9-NEXT:    v_max_f32_e32 v3, v3, v3
-; GFX9-NEXT:    v_max_f32_e32 v1, v1, v1
+; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
+; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GFX9-NEXT:    v_min_f32_e32 v4, v5, v4
-; GFX9-NEXT:    v_min_f32_e32 v1, v1, v3
-; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v2
-; GFX9-NEXT:    v_and_b32_e32 v5, 0xffff0000, v0
-; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX9-NEXT:    v_max_f32_e32 v3, v3, v3
-; GFX9-NEXT:    v_max_f32_e32 v5, v5, v5
-; GFX9-NEXT:    v_max_f32_e32 v2, v2, v2
-; GFX9-NEXT:    v_max_f32_e32 v0, v0, v0
-; GFX9-NEXT:    v_min_f32_e32 v3, v5, v3
 ; GFX9-NEXT:    v_min_f32_e32 v0, v0, v2
-; GFX9-NEXT:    s_mov_b32 s4, 0x3020706
+; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
+; GFX9-NEXT:    v_perm_b32 v0, v0, v4, s4
+; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
+; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 16, v1
+; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
 ; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX9-NEXT:    v_perm_b32 v0, v0, v3, s4
-; GFX9-NEXT:    v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_min_f32_e32 v2, v4, v2
+; GFX9-NEXT:    v_min_f32_e32 v1, v1, v3
+; GFX9-NEXT:    v_perm_b32 v1, v1, v2, s4
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_minnum_v4bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
-; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
+; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
+; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 16, v3
+; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 16, v1
 ; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
 ; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX10-NEXT:    v_and_b32_e32 v6, 0xffff0000, v2
-; GFX10-NEXT:    v_and_b32_e32 v7, 0xffff0000, v0
-; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX10-NEXT:    v_max_f32_e32 v3, v3, v3
-; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX10-NEXT:    v_max_f32_e32 v1, v1, v1
-; GFX10-NEXT:    v_max_f32_e32 v4, v4, v4
-; GFX10-NEXT:    v_max_f32_e32 v5, v5, v5
-; GFX10-NEXT:    v_max_f32_e32 v6, v6, v6
-; GFX10-NEXT:    v_max_f32_e32 v7, v7, v7
-; GFX10-NEXT:    v_max_f32_e32 v2, v2, v2
-; GFX10-NEXT:    v_max_f32_e32 v0, v0, v0
-; GFX10-NEXT:    v_min_f32_e32 v1, v1, v3
-; GFX10-NEXT:    v_min_f32_e32 v3, v5, v4
-; GFX10-NEXT:    v_min_f32_e32 v4, v7, v6
+; GFX10-NEXT:    v_min_f32_e32 v4, v5, v4
 ; GFX10-NEXT:    v_min_f32_e32 v0, v0, v2
-; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX10-NEXT:    v_perm_b32 v0, v0, v4, 0x3020706
-; GFX10-NEXT:    v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT:    v_min_f32_e32 v2, v7, v6
+; GFX10-NEXT:    v_min_f32_e32 v1, v1, v3
+; GFX10-NEXT:    v_perm_b32 v0, v0, v4, 0x7060302
+; GFX10-NEXT:    v_perm_b32 v1, v1, v2, 0x7060302
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_minnum_v4bf16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_dual_max_f32 v5, v5, v5 :: v_dual_lshlrev_b32 v4, 16, v3
-; GFX11-NEXT:    v_dual_max_f32 v4, v4, v4 :: v_dual_and_b32 v3, 0xffff0000, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v6, 16, v3
+; GFX11-NEXT:    v_lshlrev_b32_e32 v7, 16, v1
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX11-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_dual_min_f32 v1, v1, v3 :: v_dual_and_b32 v0, 0xffff0000, v0
+; GFX11-NEXT:    v_min_f32_e32 v0, v0, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_min_f32_e32 v4, v5, v4
-; GFX11-NEXT:    v_dual_max_f32 v3, v3, v3 :: v_dual_and_b32 v6, 0xffff0000, v2
-; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-NEXT:    v_and_b32_e32 v7, 0xffff0000, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_dual_max_f32 v5, v6, v6 :: v_dual_lshlrev_b32 v0, 16, v0
-; GFX11-NEXT:    v_dual_max_f32 v2, v2, v2 :: v_dual_and_b32 v1, 0xffff0000, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_dual_max_f32 v6, v7, v7 :: v_dual_max_f32 v1, v1, v1
-; GFX11-NEXT:    v_dual_max_f32 v0, v0, v0 :: v_dual_min_f32 v1, v1, v3
-; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v4
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_min_f32_e32 v4, v6, v5
-; GFX11-NEXT:    v_dual_min_f32 v0, v0, v2 :: v_dual_and_b32 v1, 0xffff0000, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_perm_b32 v0, v0, v4, 0x3020706
-; GFX11-NEXT:    v_or_b32_e32 v1, v3, v1
+; GFX11-NEXT:    v_min_f32_e32 v2, v7, v6
+; GFX11-NEXT:    v_perm_b32 v0, v0, v4, 0x7060302
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT:    v_perm_b32 v1, v1, v2, 0x7060302
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = call <4 x bfloat> @llvm.minnum.v4bf16(<4 x bfloat> %a, <4 x bfloat> %b)
   ret <4 x bfloat> %op
@@ -13984,206 +13730,138 @@ define <8 x bfloat> @v_minnum_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
 ; GFX8-LABEL: v_minnum_v8bf16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_and_b32_e32 v8, 0xffff0000, v7
-; GFX8-NEXT:    v_and_b32_e32 v9, 0xffff0000, v3
-; GFX8-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
-; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX8-NEXT:    v_mul_f32_e32 v8, 1.0, v8
-; GFX8-NEXT:    v_mul_f32_e32 v9, 1.0, v9
-; GFX8-NEXT:    v_mul_f32_e32 v7, 1.0, v7
-; GFX8-NEXT:    v_mul_f32_e32 v3, 1.0, v3
+; GFX8-NEXT:    v_lshlrev_b32_e32 v8, 16, v4
+; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 16, v0
+; GFX8-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    v_min_f32_e32 v0, v0, v4
 ; GFX8-NEXT:    v_min_f32_e32 v8, v9, v8
-; GFX8-NEXT:    v_min_f32_e32 v3, v3, v7
-; GFX8-NEXT:    v_and_b32_e32 v7, 0xffff0000, v6
-; GFX8-NEXT:    v_and_b32_e32 v9, 0xffff0000, v2
-; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
-; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX8-NEXT:    v_mul_f32_e32 v7, 1.0, v7
-; GFX8-NEXT:    v_mul_f32_e32 v9, 1.0, v9
-; GFX8-NEXT:    v_mul_f32_e32 v6, 1.0, v6
-; GFX8-NEXT:    v_mul_f32_e32 v2, 1.0, v2
-; GFX8-NEXT:    v_min_f32_e32 v7, v9, v7
-; GFX8-NEXT:    v_min_f32_e32 v2, v2, v6
-; GFX8-NEXT:    v_and_b32_e32 v6, 0xffff0000, v5
-; GFX8-NEXT:    v_and_b32_e32 v9, 0xffff0000, v1
-; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX8-NEXT:    v_mul_f32_e32 v6, 1.0, v6
-; GFX8-NEXT:    v_mul_f32_e32 v9, 1.0, v9
-; GFX8-NEXT:    v_mul_f32_e32 v5, 1.0, v5
-; GFX8-NEXT:    v_mul_f32_e32 v1, 1.0, v1
-; GFX8-NEXT:    v_min_f32_e32 v6, v9, v6
+; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX8-NEXT:    v_alignbit_b32 v0, v0, v8, 16
+; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v5
+; GFX8-NEXT:    v_lshlrev_b32_e32 v8, 16, v1
+; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
+; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX8-NEXT:    v_min_f32_e32 v1, v1, v5
-; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v4
-; GFX8-NEXT:    v_and_b32_e32 v9, 0xffff0000, v0
-; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX8-NEXT:    v_mul_f32_e32 v5, 1.0, v5
-; GFX8-NEXT:    v_mul_f32_e32 v9, 1.0, v9
-; GFX8-NEXT:    v_mul_f32_e32 v4, 1.0, v4
-; GFX8-NEXT:    v_mul_f32_e32 v0, 1.0, v0
-; GFX8-NEXT:    v_min_f32_e32 v5, v9, v5
-; GFX8-NEXT:    v_min_f32_e32 v0, v0, v4
-; GFX8-NEXT:    s_mov_b32 s4, 0x3020706
-; GFX8-NEXT:    v_perm_b32 v0, v0, v5, s4
-; GFX8-NEXT:    v_perm_b32 v1, v1, v6, s4
-; GFX8-NEXT:    v_perm_b32 v2, v2, v7, s4
-; GFX8-NEXT:    v_perm_b32 v3, v3, v8, s4
+; GFX8-NEXT:    v_min_f32_e32 v4, v8, v4
+; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX8-NEXT:    v_alignbit_b32 v1, v1, v4, 16
+; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v6
+; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v2
+; GFX8-NEXT:    v_min_f32_e32 v4, v5, v4
+; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
+; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX8-NEXT:    v_min_f32_e32 v2, v2, v5
+; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX8-NEXT:    v_alignbit_b32 v2, v2, v4, 16
+; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v7
+; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v3
+; GFX8-NEXT:    v_min_f32_e32 v4, v5, v4
+; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v7
+; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX8-NEXT:    v_min_f32_e32 v3, v3, v5
+; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX8-NEXT:    v_alignbit_b32 v3, v3, v4, 16
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_minnum_v8bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v8, 0xffff0000, v7
-; GFX9-NEXT:    v_and_b32_e32 v9, 0xffff0000, v3
-; GFX9-NEXT:    v_max_f32_e32 v8, v8, v8
-; GFX9-NEXT:    v_max_f32_e32 v9, v9, v9
+; GFX9-NEXT:    v_lshlrev_b32_e32 v8, 16, v4
+; GFX9-NEXT:    v_lshlrev_b32_e32 v9, 16, v0
+; GFX9-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GFX9-NEXT:    v_min_f32_e32 v8, v9, v8
-; GFX9-NEXT:    v_and_b32_e32 v9, 0xffff0000, v5
-; GFX9-NEXT:    v_and_b32_e32 v10, 0xffff0000, v1
-; GFX9-NEXT:    v_max_f32_e32 v9, v9, v9
-; GFX9-NEXT:    v_max_f32_e32 v10, v10, v10
-; GFX9-NEXT:    v_min_f32_e32 v9, v10, v9
-; GFX9-NEXT:    v_and_b32_e32 v10, 0xffff0000, v6
-; GFX9-NEXT:    v_and_b32_e32 v11, 0xffff0000, v2
-; GFX9-NEXT:    v_max_f32_e32 v10, v10, v10
-; GFX9-NEXT:    v_max_f32_e32 v11, v11, v11
-; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX9-NEXT:    v_min_f32_e32 v10, v11, v10
-; GFX9-NEXT:    v_and_b32_e32 v11, 0xffff0000, v4
-; GFX9-NEXT:    v_and_b32_e32 v12, 0xffff0000, v0
-; GFX9-NEXT:    v_max_f32_e32 v5, v5, v5
-; GFX9-NEXT:    v_max_f32_e32 v1, v1, v1
-; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX9-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
-; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX9-NEXT:    v_min_f32_e32 v1, v1, v5
-; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 16, v6
-; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX9-NEXT:    v_max_f32_e32 v4, v4, v4
-; GFX9-NEXT:    v_max_f32_e32 v0, v0, v0
-; GFX9-NEXT:    v_max_f32_e32 v11, v11, v11
-; GFX9-NEXT:    v_max_f32_e32 v12, v12, v12
-; GFX9-NEXT:    v_max_f32_e32 v7, v7, v7
-; GFX9-NEXT:    v_max_f32_e32 v3, v3, v3
-; GFX9-NEXT:    v_max_f32_e32 v5, v5, v5
-; GFX9-NEXT:    v_max_f32_e32 v2, v2, v2
 ; GFX9-NEXT:    v_min_f32_e32 v0, v0, v4
-; GFX9-NEXT:    v_and_b32_e32 v4, 0xffff0000, v9
-; GFX9-NEXT:    v_min_f32_e32 v11, v12, v11
-; GFX9-NEXT:    v_min_f32_e32 v3, v3, v7
+; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
+; GFX9-NEXT:    v_perm_b32 v0, v0, v8, s4
+; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 16, v5
+; GFX9-NEXT:    v_lshlrev_b32_e32 v8, 16, v1
+; GFX9-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
+; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX9-NEXT:    v_min_f32_e32 v4, v8, v4
+; GFX9-NEXT:    v_min_f32_e32 v1, v1, v5
+; GFX9-NEXT:    v_perm_b32 v1, v1, v4, s4
+; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 16, v6
+; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 16, v2
+; GFX9-NEXT:    v_min_f32_e32 v4, v5, v4
+; GFX9-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
+; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX9-NEXT:    v_min_f32_e32 v2, v2, v5
-; GFX9-NEXT:    s_mov_b32 s4, 0x3020706
-; GFX9-NEXT:    v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-NEXT:    v_and_b32_e32 v4, 0xffff0000, v8
-; GFX9-NEXT:    v_perm_b32 v0, v0, v11, s4
-; GFX9-NEXT:    v_perm_b32 v2, v2, v10, s4
-; GFX9-NEXT:    v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_perm_b32 v2, v2, v4, s4
+; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 16, v7
+; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 16, v3
+; GFX9-NEXT:    v_min_f32_e32 v4, v5, v4
+; GFX9-NEXT:    v_and_b32_e32 v5, 0xffff0000, v7
+; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX9-NEXT:    v_min_f32_e32 v3, v3, v5
+; GFX9-NEXT:    v_perm_b32 v3, v3, v4, s4
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_minnum_v8bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_and_b32_e32 v8, 0xffff0000, v7
-; GFX10-NEXT:    v_and_b32_e32 v9, 0xffff0000, v3
-; GFX10-NEXT:    v_and_b32_e32 v10, 0xffff0000, v5
-; GFX10-NEXT:    v_and_b32_e32 v11, 0xffff0000, v1
-; GFX10-NEXT:    v_and_b32_e32 v12, 0xffff0000, v2
-; GFX10-NEXT:    v_max_f32_e32 v8, v8, v8
-; GFX10-NEXT:    v_max_f32_e32 v9, v9, v9
-; GFX10-NEXT:    v_max_f32_e32 v10, v10, v10
-; GFX10-NEXT:    v_and_b32_e32 v13, 0xffff0000, v4
-; GFX10-NEXT:    v_and_b32_e32 v14, 0xffff0000, v0
-; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
+; GFX10-NEXT:    v_lshlrev_b32_e32 v8, 16, v4
+; GFX10-NEXT:    v_lshlrev_b32_e32 v9, 16, v0
+; GFX10-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v10, 16, v5
+; GFX10-NEXT:    v_lshlrev_b32_e32 v11, 16, v1
 ; GFX10-NEXT:    v_min_f32_e32 v8, v9, v8
-; GFX10-NEXT:    v_max_f32_e32 v9, v11, v11
-; GFX10-NEXT:    v_and_b32_e32 v11, 0xffff0000, v6
-; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
-; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX10-NEXT:    v_min_f32_e32 v9, v9, v10
-; GFX10-NEXT:    v_max_f32_e32 v10, v11, v11
-; GFX10-NEXT:    v_max_f32_e32 v11, v12, v12
-; GFX10-NEXT:    v_max_f32_e32 v12, v13, v13
-; GFX10-NEXT:    v_max_f32_e32 v13, v14, v14
-; GFX10-NEXT:    v_max_f32_e32 v7, v7, v7
-; GFX10-NEXT:    v_max_f32_e32 v3, v3, v3
-; GFX10-NEXT:    v_max_f32_e32 v5, v5, v5
-; GFX10-NEXT:    v_max_f32_e32 v1, v1, v1
-; GFX10-NEXT:    v_max_f32_e32 v4, v4, v4
-; GFX10-NEXT:    v_max_f32_e32 v0, v0, v0
-; GFX10-NEXT:    v_max_f32_e32 v6, v6, v6
-; GFX10-NEXT:    v_max_f32_e32 v2, v2, v2
-; GFX10-NEXT:    v_min_f32_e32 v10, v11, v10
-; GFX10-NEXT:    v_min_f32_e32 v11, v13, v12
-; GFX10-NEXT:    v_min_f32_e32 v3, v3, v7
-; GFX10-NEXT:    v_min_f32_e32 v1, v1, v5
+; GFX10-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
 ; GFX10-NEXT:    v_min_f32_e32 v0, v0, v4
-; GFX10-NEXT:    v_and_b32_e32 v4, 0xffff0000, v9
-; GFX10-NEXT:    v_min_f32_e32 v2, v2, v6
-; GFX10-NEXT:    v_and_b32_e32 v5, 0xffff0000, v8
-; GFX10-NEXT:    v_perm_b32 v0, v0, v11, 0x3020706
-; GFX10-NEXT:    v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX10-NEXT:    v_perm_b32 v2, v2, v10, 0x3020706
-; GFX10-NEXT:    v_or_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX10-NEXT:    s_setpc_b64 s[30:31]
-;
+; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX10-NEXT:    v_min_f32_e32 v4, v11, v10
+; GFX10-NEXT:    v_lshlrev_b32_e32 v9, 16, v6
+; GFX10-NEXT:    v_lshlrev_b32_e32 v10, 16, v2
+; GFX10-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
+; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX10-NEXT:    v_lshlrev_b32_e32 v11, 16, v7
+; GFX10-NEXT:    v_lshlrev_b32_e32 v12, 16, v3
+; GFX10-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
+; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX10-NEXT:    v_min_f32_e32 v1, v1, v5
+; GFX10-NEXT:    v_min_f32_e32 v5, v10, v9
+; GFX10-NEXT:    v_min_f32_e32 v2, v2, v6
+; GFX10-NEXT:    v_min_f32_e32 v6, v12, v11
+; GFX10-NEXT:    v_min_f32_e32 v3, v3, v7
+; GFX10-NEXT:    v_perm_b32 v0, v0, v8, 0x7060302
+; GFX10-NEXT:    v_perm_b32 v1, v1, v4, 0x7060302
+; GFX10-NEXT:    v_perm_b32 v2, v2, v5, 0x7060302
+; GFX10-NEXT:    v_perm_b32 v3, v3, v6, 0x7060302
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX11-LABEL: v_minnum_v8bf16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_and_b32_e32 v9, 0xffff0000, v3
-; GFX11-NEXT:    v_and_b32_e32 v8, 0xffff0000, v7
-; GFX11-NEXT:    v_and_b32_e32 v11, 0xffff0000, v1
-; GFX11-NEXT:    v_and_b32_e32 v10, 0xffff0000, v5
-; GFX11-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_dual_max_f32 v9, v9, v9 :: v_dual_max_f32 v8, v8, v8
-; GFX11-NEXT:    v_dual_max_f32 v11, v11, v11 :: v_dual_max_f32 v10, v10, v10
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_dual_max_f32 v7, v7, v7 :: v_dual_min_f32 v8, v9, v8
-; GFX11-NEXT:    v_min_f32_e32 v9, v11, v10
-; GFX11-NEXT:    v_and_b32_e32 v11, 0xffff0000, v2
-; GFX11-NEXT:    v_and_b32_e32 v12, 0xffff0000, v6
-; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_dual_max_f32 v11, v11, v11 :: v_dual_lshlrev_b32 v6, 16, v6
-; GFX11-NEXT:    v_max_f32_e32 v10, v12, v12
-; GFX11-NEXT:    v_and_b32_e32 v13, 0xffff0000, v0
-; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX11-NEXT:    v_dual_max_f32 v2, v2, v2 :: v_dual_lshlrev_b32 v5, 16, v5
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_min_f32_e32 v10, v11, v10
-; GFX11-NEXT:    v_dual_max_f32 v13, v13, v13 :: v_dual_and_b32 v12, 0xffff0000, v4
-; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_dual_max_f32 v1, v1, v1 :: v_dual_max_f32 v12, v12, v12
-; GFX11-NEXT:    v_max_f32_e32 v0, v0, v0
-; GFX11-NEXT:    v_dual_max_f32 v4, v4, v4 :: v_dual_max_f32 v3, v3, v3
-; GFX11-NEXT:    v_max_f32_e32 v5, v5, v5
+; GFX11-NEXT:    v_lshlrev_b32_e32 v10, 16, v5
+; GFX11-NEXT:    v_lshlrev_b32_e32 v11, 16, v1
+; GFX11-NEXT:    v_lshlrev_b32_e32 v9, 16, v0
+; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
+; GFX11-NEXT:    v_lshlrev_b32_e32 v12, 16, v3
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX11-NEXT:    v_lshlrev_b32_e32 v8, 16, v4
+; GFX11-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_dual_min_f32 v1, v1, v5 :: v_dual_and_b32 v0, 0xffff0000, v0
+; GFX11-NEXT:    v_min_f32_e32 v0, v0, v4
+; GFX11-NEXT:    v_min_f32_e32 v4, v11, v10
+; GFX11-NEXT:    v_dual_min_f32 v8, v9, v8 :: v_dual_lshlrev_b32 v9, 16, v6
+; GFX11-NEXT:    v_lshlrev_b32_e32 v10, 16, v2
+; GFX11-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX11-NEXT:    v_lshlrev_b32_e32 v11, 16, v7
+; GFX11-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
+; GFX11-NEXT:    v_min_f32_e32 v5, v10, v9
+; GFX11-NEXT:    v_perm_b32 v0, v0, v8, 0x7060302
+; GFX11-NEXT:    v_min_f32_e32 v2, v2, v6
+; GFX11-NEXT:    v_min_f32_e32 v6, v12, v11
+; GFX11-NEXT:    v_min_f32_e32 v3, v3, v7
+; GFX11-NEXT:    v_perm_b32 v1, v1, v4, 0x7060302
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_min_f32_e32 v11, v13, v12
-; GFX11-NEXT:    v_dual_min_f32 v0, v0, v4 :: v_dual_min_f32 v3, v3, v7
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_min_f32_e32 v1, v1, v5
-; GFX11-NEXT:    v_dual_max_f32 v5, v6, v6 :: v_dual_and_b32 v4, 0xffff0000, v9
-; GFX11-NEXT:    v_perm_b32 v0, v0, v11, 0x3020706
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
-; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_dual_min_f32 v2, v2, v5 :: v_dual_and_b32 v5, 0xffff0000, v8
-; GFX11-NEXT:    v_or_b32_e32 v1, v1, v4
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_perm_b32 v2, v2, v10, 0x3020706
-; GFX11-NEXT:    v_or_b32_e32 v3, v3, v5
+; GFX11-NEXT:    v_perm_b32 v2, v2, v5, 0x7060302
+; GFX11-NEXT:    v_perm_b32 v3, v3, v6, 0x7060302
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = call <8 x bfloat> @llvm.minnum.v8bf16(<8 x bfloat> %a, <8 x bfloat> %b)
   ret <8 x bfloat> %op
@@ -14399,356 +14077,252 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
 ; GFX8-LABEL: v_minnum_v16bf16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_and_b32_e32 v16, 0xffff0000, v15
-; GFX8-NEXT:    v_and_b32_e32 v17, 0xffff0000, v7
-; GFX8-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
-; GFX8-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
-; GFX8-NEXT:    v_mul_f32_e32 v16, 1.0, v16
-; GFX8-NEXT:    v_mul_f32_e32 v17, 1.0, v17
-; GFX8-NEXT:    v_mul_f32_e32 v15, 1.0, v15
-; GFX8-NEXT:    v_mul_f32_e32 v7, 1.0, v7
+; GFX8-NEXT:    v_lshlrev_b32_e32 v16, 16, v8
+; GFX8-NEXT:    v_lshlrev_b32_e32 v17, 16, v0
+; GFX8-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
+; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    v_min_f32_e32 v0, v0, v8
 ; GFX8-NEXT:    v_min_f32_e32 v16, v17, v16
-; GFX8-NEXT:    v_min_f32_e32 v7, v7, v15
-; GFX8-NEXT:    v_and_b32_e32 v15, 0xffff0000, v14
-; GFX8-NEXT:    v_and_b32_e32 v17, 0xffff0000, v6
-; GFX8-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
-; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
-; GFX8-NEXT:    v_mul_f32_e32 v15, 1.0, v15
-; GFX8-NEXT:    v_mul_f32_e32 v17, 1.0, v17
-; GFX8-NEXT:    v_mul_f32_e32 v14, 1.0, v14
-; GFX8-NEXT:    v_mul_f32_e32 v6, 1.0, v6
-; GFX8-NEXT:    v_min_f32_e32 v15, v17, v15
-; GFX8-NEXT:    v_min_f32_e32 v6, v6, v14
-; GFX8-NEXT:    v_and_b32_e32 v14, 0xffff0000, v13
-; GFX8-NEXT:    v_and_b32_e32 v17, 0xffff0000, v5
-; GFX8-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
-; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX8-NEXT:    v_mul_f32_e32 v14, 1.0, v14
-; GFX8-NEXT:    v_mul_f32_e32 v17, 1.0, v17
-; GFX8-NEXT:    v_mul_f32_e32 v13, 1.0, v13
-; GFX8-NEXT:    v_mul_f32_e32 v5, 1.0, v5
-; GFX8-NEXT:    v_min_f32_e32 v14, v17, v14
-; GFX8-NEXT:    v_min_f32_e32 v5, v5, v13
-; GFX8-NEXT:    v_and_b32_e32 v13, 0xffff0000, v12
-; GFX8-NEXT:    v_and_b32_e32 v17, 0xffff0000, v4
-; GFX8-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
-; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX8-NEXT:    v_mul_f32_e32 v13, 1.0, v13
-; GFX8-NEXT:    v_mul_f32_e32 v17, 1.0, v17
-; GFX8-NEXT:    v_mul_f32_e32 v12, 1.0, v12
-; GFX8-NEXT:    v_mul_f32_e32 v4, 1.0, v4
-; GFX8-NEXT:    v_min_f32_e32 v13, v17, v13
-; GFX8-NEXT:    v_min_f32_e32 v4, v4, v12
-; GFX8-NEXT:    v_and_b32_e32 v12, 0xffff0000, v11
-; GFX8-NEXT:    v_and_b32_e32 v17, 0xffff0000, v3
-; GFX8-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
-; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX8-NEXT:    v_mul_f32_e32 v12, 1.0, v12
-; GFX8-NEXT:    v_mul_f32_e32 v17, 1.0, v17
-; GFX8-NEXT:    v_mul_f32_e32 v11, 1.0, v11
-; GFX8-NEXT:    v_mul_f32_e32 v3, 1.0, v3
-; GFX8-NEXT:    v_min_f32_e32 v12, v17, v12
-; GFX8-NEXT:    v_min_f32_e32 v3, v3, v11
-; GFX8-NEXT:    v_and_b32_e32 v11, 0xffff0000, v10
-; GFX8-NEXT:    v_and_b32_e32 v17, 0xffff0000, v2
-; GFX8-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
-; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX8-NEXT:    v_mul_f32_e32 v11, 1.0, v11
-; GFX8-NEXT:    v_mul_f32_e32 v17, 1.0, v17
-; GFX8-NEXT:    v_mul_f32_e32 v10, 1.0, v10
-; GFX8-NEXT:    v_mul_f32_e32 v2, 1.0, v2
-; GFX8-NEXT:    v_min_f32_e32 v11, v17, v11
-; GFX8-NEXT:    v_min_f32_e32 v2, v2, v10
-; GFX8-NEXT:    v_and_b32_e32 v10, 0xffff0000, v9
-; GFX8-NEXT:    v_and_b32_e32 v17, 0xffff0000, v1
-; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
-; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX8-NEXT:    v_mul_f32_e32 v10, 1.0, v10
-; GFX8-NEXT:    v_mul_f32_e32 v17, 1.0, v17
-; GFX8-NEXT:    v_mul_f32_e32 v9, 1.0, v9
-; GFX8-NEXT:    v_mul_f32_e32 v1, 1.0, v1
-; GFX8-NEXT:    v_min_f32_e32 v10, v17, v10
+; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX8-NEXT:    v_alignbit_b32 v0, v0, v16, 16
+; GFX8-NEXT:    v_lshlrev_b32_e32 v8, 16, v9
+; GFX8-NEXT:    v_lshlrev_b32_e32 v16, 16, v1
+; GFX8-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
+; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX8-NEXT:    v_min_f32_e32 v1, v1, v9
-; GFX8-NEXT:    v_and_b32_e32 v9, 0xffff0000, v8
-; GFX8-NEXT:    v_and_b32_e32 v17, 0xffff0000, v0
-; GFX8-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
-; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX8-NEXT:    v_mul_f32_e32 v9, 1.0, v9
-; GFX8-NEXT:    v_mul_f32_e32 v17, 1.0, v17
-; GFX8-NEXT:    v_mul_f32_e32 v8, 1.0, v8
-; GFX8-NEXT:    v_mul_f32_e32 v0, 1.0, v0
-; GFX8-NEXT:    v_min_f32_e32 v9, v17, v9
-; GFX8-NEXT:    v_min_f32_e32 v0, v0, v8
-; GFX8-NEXT:    s_mov_b32 s4, 0x3020706
-; GFX8-NEXT:    v_perm_b32 v0, v0, v9, s4
-; GFX8-NEXT:    v_perm_b32 v1, v1, v10, s4
-; GFX8-NEXT:    v_perm_b32 v2, v2, v11, s4
-; GFX8-NEXT:    v_perm_b32 v3, v3, v12, s4
-; GFX8-NEXT:    v_perm_b32 v4, v4, v13, s4
-; GFX8-NEXT:    v_perm_b32 v5, v5, v14, s4
-; GFX8-NEXT:    v_perm_b32 v6, v6, v15, s4
-; GFX8-NEXT:    v_perm_b32 v7, v7, v16, s4
+; GFX8-NEXT:    v_min_f32_e32 v8, v16, v8
+; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX8-NEXT:    v_alignbit_b32 v1, v1, v8, 16
+; GFX8-NEXT:    v_lshlrev_b32_e32 v8, 16, v10
+; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 16, v2
+; GFX8-NEXT:    v_min_f32_e32 v8, v9, v8
+; GFX8-NEXT:    v_and_b32_e32 v9, 0xffff0000, v10
+; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX8-NEXT:    v_min_f32_e32 v2, v2, v9
+; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX8-NEXT:    v_alignbit_b32 v2, v2, v8, 16
+; GFX8-NEXT:    v_lshlrev_b32_e32 v8, 16, v11
+; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 16, v3
+; GFX8-NEXT:    v_min_f32_e32 v8, v9, v8
+; GFX8-NEXT:    v_and_b32_e32 v9, 0xffff0000, v11
+; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX8-NEXT:    v_min_f32_e32 v3, v3, v9
+; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX8-NEXT:    v_alignbit_b32 v3, v3, v8, 16
+; GFX8-NEXT:    v_lshlrev_b32_e32 v8, 16, v12
+; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 16, v4
+; GFX8-NEXT:    v_min_f32_e32 v8, v9, v8
+; GFX8-NEXT:    v_and_b32_e32 v9, 0xffff0000, v12
+; GFX8-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GFX8-NEXT:    v_min_f32_e32 v4, v4, v9
+; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
+; GFX8-NEXT:    v_alignbit_b32 v4, v4, v8, 16
+; GFX8-NEXT:    v_lshlrev_b32_e32 v8, 16, v13
+; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 16, v5
+; GFX8-NEXT:    v_min_f32_e32 v8, v9, v8
+; GFX8-NEXT:    v_and_b32_e32 v9, 0xffff0000, v13
+; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
+; GFX8-NEXT:    v_min_f32_e32 v5, v5, v9
+; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
+; GFX8-NEXT:    v_alignbit_b32 v5, v5, v8, 16
+; GFX8-NEXT:    v_lshlrev_b32_e32 v8, 16, v14
+; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 16, v6
+; GFX8-NEXT:    v_min_f32_e32 v8, v9, v8
+; GFX8-NEXT:    v_and_b32_e32 v9, 0xffff0000, v14
+; GFX8-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
+; GFX8-NEXT:    v_min_f32_e32 v6, v6, v9
+; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
+; GFX8-NEXT:    v_alignbit_b32 v6, v6, v8, 16
+; GFX8-NEXT:    v_lshlrev_b32_e32 v8, 16, v15
+; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 16, v7
+; GFX8-NEXT:    v_min_f32_e32 v8, v9, v8
+; GFX8-NEXT:    v_and_b32_e32 v9, 0xffff0000, v15
+; GFX8-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
+; GFX8-NEXT:    v_min_f32_e32 v7, v7, v9
+; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
+; GFX8-NEXT:    v_alignbit_b32 v7, v7, v8, 16
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_minnum_v16bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v16, 0xffff0000, v15
-; GFX9-NEXT:    v_and_b32_e32 v17, 0xffff0000, v7
-; GFX9-NEXT:    v_max_f32_e32 v16, v16, v16
-; GFX9-NEXT:    v_max_f32_e32 v17, v17, v17
+; GFX9-NEXT:    v_lshlrev_b32_e32 v16, 16, v8
+; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v0
+; GFX9-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GFX9-NEXT:    v_min_f32_e32 v16, v17, v16
-; GFX9-NEXT:    v_and_b32_e32 v17, 0xffff0000, v14
-; GFX9-NEXT:    v_and_b32_e32 v18, 0xffff0000, v6
-; GFX9-NEXT:    v_max_f32_e32 v17, v17, v17
-; GFX9-NEXT:    v_max_f32_e32 v18, v18, v18
-; GFX9-NEXT:    v_min_f32_e32 v17, v18, v17
-; GFX9-NEXT:    v_and_b32_e32 v18, 0xffff0000, v13
-; GFX9-NEXT:    v_and_b32_e32 v19, 0xffff0000, v5
-; GFX9-NEXT:    v_max_f32_e32 v18, v18, v18
-; GFX9-NEXT:    v_max_f32_e32 v19, v19, v19
-; GFX9-NEXT:    v_min_f32_e32 v18, v19, v18
-; GFX9-NEXT:    v_and_b32_e32 v19, 0xffff0000, v12
-; GFX9-NEXT:    v_and_b32_e32 v20, 0xffff0000, v4
-; GFX9-NEXT:    v_max_f32_e32 v19, v19, v19
-; GFX9-NEXT:    v_max_f32_e32 v20, v20, v20
-; GFX9-NEXT:    v_min_f32_e32 v19, v20, v19
-; GFX9-NEXT:    v_and_b32_e32 v20, 0xffff0000, v11
-; GFX9-NEXT:    v_and_b32_e32 v21, 0xffff0000, v3
-; GFX9-NEXT:    v_max_f32_e32 v20, v20, v20
-; GFX9-NEXT:    v_max_f32_e32 v21, v21, v21
-; GFX9-NEXT:    v_min_f32_e32 v20, v21, v20
-; GFX9-NEXT:    v_and_b32_e32 v21, 0xffff0000, v10
-; GFX9-NEXT:    v_and_b32_e32 v22, 0xffff0000, v2
-; GFX9-NEXT:    v_max_f32_e32 v21, v21, v21
-; GFX9-NEXT:    v_max_f32_e32 v22, v22, v22
-; GFX9-NEXT:    v_min_f32_e32 v21, v22, v21
-; GFX9-NEXT:    v_and_b32_e32 v22, 0xffff0000, v9
-; GFX9-NEXT:    v_and_b32_e32 v23, 0xffff0000, v1
-; GFX9-NEXT:    v_max_f32_e32 v22, v22, v22
-; GFX9-NEXT:    v_max_f32_e32 v23, v23, v23
-; GFX9-NEXT:    v_min_f32_e32 v22, v23, v22
-; GFX9-NEXT:    v_and_b32_e32 v23, 0xffff0000, v8
-; GFX9-NEXT:    v_and_b32_e32 v24, 0xffff0000, v0
-; GFX9-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
-; GFX9-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
-; GFX9-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
-; GFX9-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
-; GFX9-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
-; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX9-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
-; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX9-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
-; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX9-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
-; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX9-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
-; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX9-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
-; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX9-NEXT:    v_max_f32_e32 v23, v23, v23
-; GFX9-NEXT:    v_max_f32_e32 v24, v24, v24
-; GFX9-NEXT:    v_max_f32_e32 v15, v15, v15
-; GFX9-NEXT:    v_max_f32_e32 v7, v7, v7
-; GFX9-NEXT:    v_max_f32_e32 v14, v14, v14
-; GFX9-NEXT:    v_max_f32_e32 v6, v6, v6
-; GFX9-NEXT:    v_max_f32_e32 v13, v13, v13
-; GFX9-NEXT:    v_max_f32_e32 v5, v5, v5
-; GFX9-NEXT:    v_max_f32_e32 v12, v12, v12
-; GFX9-NEXT:    v_max_f32_e32 v4, v4, v4
-; GFX9-NEXT:    v_max_f32_e32 v11, v11, v11
-; GFX9-NEXT:    v_max_f32_e32 v3, v3, v3
-; GFX9-NEXT:    v_max_f32_e32 v10, v10, v10
-; GFX9-NEXT:    v_max_f32_e32 v2, v2, v2
-; GFX9-NEXT:    v_max_f32_e32 v9, v9, v9
-; GFX9-NEXT:    v_max_f32_e32 v1, v1, v1
-; GFX9-NEXT:    v_max_f32_e32 v8, v8, v8
-; GFX9-NEXT:    v_max_f32_e32 v0, v0, v0
-; GFX9-NEXT:    v_min_f32_e32 v23, v24, v23
-; GFX9-NEXT:    v_min_f32_e32 v7, v7, v15
-; GFX9-NEXT:    v_min_f32_e32 v6, v6, v14
-; GFX9-NEXT:    v_min_f32_e32 v5, v5, v13
-; GFX9-NEXT:    v_min_f32_e32 v4, v4, v12
-; GFX9-NEXT:    v_min_f32_e32 v3, v3, v11
-; GFX9-NEXT:    v_min_f32_e32 v2, v2, v10
-; GFX9-NEXT:    v_min_f32_e32 v1, v1, v9
 ; GFX9-NEXT:    v_min_f32_e32 v0, v0, v8
-; GFX9-NEXT:    s_mov_b32 s4, 0x3020706
-; GFX9-NEXT:    v_perm_b32 v0, v0, v23, s4
-; GFX9-NEXT:    v_perm_b32 v1, v1, v22, s4
-; GFX9-NEXT:    v_perm_b32 v2, v2, v21, s4
-; GFX9-NEXT:    v_perm_b32 v3, v3, v20, s4
-; GFX9-NEXT:    v_perm_b32 v4, v4, v19, s4
-; GFX9-NEXT:    v_perm_b32 v5, v5, v18, s4
-; GFX9-NEXT:    v_perm_b32 v6, v6, v17, s4
-; GFX9-NEXT:    v_perm_b32 v7, v7, v16, s4
+; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
+; GFX9-NEXT:    v_perm_b32 v0, v0, v16, s4
+; GFX9-NEXT:    v_lshlrev_b32_e32 v8, 16, v9
+; GFX9-NEXT:    v_lshlrev_b32_e32 v16, 16, v1
+; GFX9-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
+; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX9-NEXT:    v_min_f32_e32 v8, v16, v8
+; GFX9-NEXT:    v_min_f32_e32 v1, v1, v9
+; GFX9-NEXT:    v_perm_b32 v1, v1, v8, s4
+; GFX9-NEXT:    v_lshlrev_b32_e32 v8, 16, v10
+; GFX9-NEXT:    v_lshlrev_b32_e32 v9, 16, v2
+; GFX9-NEXT:    v_min_f32_e32 v8, v9, v8
+; GFX9-NEXT:    v_and_b32_e32 v9, 0xffff0000, v10
+; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX9-NEXT:    v_min_f32_e32 v2, v2, v9
+; GFX9-NEXT:    v_perm_b32 v2, v2, v8, s4
+; GFX9-NEXT:    v_lshlrev_b32_e32 v8, 16, v11
+; GFX9-NEXT:    v_lshlrev_b32_e32 v9, 16, v3
+; GFX9-NEXT:    v_min_f32_e32 v8, v9, v8
+; GFX9-NEXT:    v_and_b32_e32 v9, 0xffff0000, v11
+; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX9-NEXT:    v_min_f32_e32 v3, v3, v9
+; GFX9-NEXT:    v_perm_b32 v3, v3, v8, s4
+; GFX9-NEXT:    v_lshlrev_b32_e32 v8, 16, v12
+; GFX9-NEXT:    v_lshlrev_b32_e32 v9, 16, v4
+; GFX9-NEXT:    v_min_f32_e32 v8, v9, v8
+; GFX9-NEXT:    v_and_b32_e32 v9, 0xffff0000, v12
+; GFX9-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GFX9-NEXT:    v_min_f32_e32 v4, v4, v9
+; GFX9-NEXT:    v_perm_b32 v4, v4, v8, s4
+; GFX9-NEXT:    v_lshlrev_b32_e32 v8, 16, v13
+; GFX9-NEXT:    v_lshlrev_b32_e32 v9, 16, v5
+; GFX9-NEXT:    v_min_f32_e32 v8, v9, v8
+; GFX9-NEXT:    v_and_b32_e32 v9, 0xffff0000, v13
+; GFX9-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
+; GFX9-NEXT:    v_min_f32_e32 v5, v5, v9
+; GFX9-NEXT:    v_perm_b32 v5, v5, v8, s4
+; GFX9-NEXT:    v_lshlrev_b32_e32 v8, 16, v14
+; GFX9-NEXT:    v_lshlrev_b32_e32 v9, 16, v6
+; GFX9-NEXT:    v_min_f32_e32 v8, v9, v8
+; GFX9-NEXT:    v_and_b32_e32 v9, 0xffff0000, v14
+; GFX9-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
+; GFX9-NEXT:    v_min_f32_e32 v6, v6, v9
+; GFX9-NEXT:    v_perm_b32 v6, v6, v8, s4
+; GFX9-NEXT:    v_lshlrev_b32_e32 v8, 16, v15
+; GFX9-NEXT:    v_lshlrev_b32_e32 v9, 16, v7
+; GFX9-NEXT:    v_min_f32_e32 v8, v9, v8
+; GFX9-NEXT:    v_and_b32_e32 v9, 0xffff0000, v15
+; GFX9-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
+; GFX9-NEXT:    v_min_f32_e32 v7, v7, v9
+; GFX9-NEXT:    v_perm_b32 v7, v7, v8, s4
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_minnum_v16bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_and_b32_e32 v16, 0xffff0000, v15
-; GFX10-NEXT:    v_and_b32_e32 v17, 0xffff0000, v7
-; GFX10-NEXT:    v_and_b32_e32 v18, 0xffff0000, v14
-; GFX10-NEXT:    v_and_b32_e32 v19, 0xffff0000, v6
-; GFX10-NEXT:    v_and_b32_e32 v20, 0xffff0000, v5
-; GFX10-NEXT:    v_max_f32_e32 v16, v16, v16
-; GFX10-NEXT:    v_max_f32_e32 v17, v17, v17
-; GFX10-NEXT:    v_and_b32_e32 v21, 0xffff0000, v12
-; GFX10-NEXT:    v_and_b32_e32 v22, 0xffff0000, v4
-; GFX10-NEXT:    v_and_b32_e32 v23, 0xffff0000, v3
-; GFX10-NEXT:    v_and_b32_e32 v24, 0xffff0000, v10
+; GFX10-NEXT:    v_lshlrev_b32_e32 v16, 16, v8
+; GFX10-NEXT:    v_lshlrev_b32_e32 v17, 16, v0
+; GFX10-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v18, 16, v1
+; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX10-NEXT:    v_min_f32_e32 v16, v17, v16
-; GFX10-NEXT:    v_max_f32_e32 v17, v18, v18
-; GFX10-NEXT:    v_max_f32_e32 v18, v19, v19
-; GFX10-NEXT:    v_and_b32_e32 v19, 0xffff0000, v13
-; GFX10-NEXT:    v_and_b32_e32 v25, 0xffff0000, v1
-; GFX10-NEXT:    v_and_b32_e32 v26, 0xffff0000, v8
-; GFX10-NEXT:    v_and_b32_e32 v27, 0xffff0000, v0
-; GFX10-NEXT:    v_min_f32_e32 v17, v18, v17
-; GFX10-NEXT:    v_max_f32_e32 v18, v19, v19
-; GFX10-NEXT:    v_max_f32_e32 v19, v20, v20
-; GFX10-NEXT:    v_max_f32_e32 v20, v21, v21
-; GFX10-NEXT:    v_max_f32_e32 v21, v22, v22
-; GFX10-NEXT:    v_and_b32_e32 v22, 0xffff0000, v11
-; GFX10-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
-; GFX10-NEXT:    v_min_f32_e32 v18, v19, v18
-; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
-; GFX10-NEXT:    v_min_f32_e32 v19, v21, v20
-; GFX10-NEXT:    v_max_f32_e32 v20, v22, v22
-; GFX10-NEXT:    v_max_f32_e32 v21, v23, v23
-; GFX10-NEXT:    v_max_f32_e32 v22, v24, v24
-; GFX10-NEXT:    v_and_b32_e32 v23, 0xffff0000, v2
-; GFX10-NEXT:    v_and_b32_e32 v24, 0xffff0000, v9
-; GFX10-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
-; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
-; GFX10-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
-; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX10-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
-; GFX10-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
-; GFX10-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
-; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX10-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
-; GFX10-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
-; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX10-NEXT:    v_max_f32_e32 v23, v23, v23
-; GFX10-NEXT:    v_max_f32_e32 v24, v24, v24
-; GFX10-NEXT:    v_max_f32_e32 v25, v25, v25
-; GFX10-NEXT:    v_max_f32_e32 v26, v26, v26
-; GFX10-NEXT:    v_max_f32_e32 v27, v27, v27
-; GFX10-NEXT:    v_max_f32_e32 v15, v15, v15
-; GFX10-NEXT:    v_max_f32_e32 v7, v7, v7
-; GFX10-NEXT:    v_max_f32_e32 v14, v14, v14
-; GFX10-NEXT:    v_max_f32_e32 v6, v6, v6
-; GFX10-NEXT:    v_max_f32_e32 v13, v13, v13
-; GFX10-NEXT:    v_max_f32_e32 v5, v5, v5
-; GFX10-NEXT:    v_max_f32_e32 v12, v12, v12
-; GFX10-NEXT:    v_max_f32_e32 v11, v11, v11
-; GFX10-NEXT:    v_max_f32_e32 v10, v10, v10
-; GFX10-NEXT:    v_max_f32_e32 v9, v9, v9
-; GFX10-NEXT:    v_max_f32_e32 v8, v8, v8
-; GFX10-NEXT:    v_max_f32_e32 v0, v0, v0
-; GFX10-NEXT:    v_max_f32_e32 v1, v1, v1
-; GFX10-NEXT:    v_max_f32_e32 v2, v2, v2
-; GFX10-NEXT:    v_max_f32_e32 v3, v3, v3
-; GFX10-NEXT:    v_max_f32_e32 v4, v4, v4
-; GFX10-NEXT:    v_min_f32_e32 v20, v21, v20
-; GFX10-NEXT:    v_min_f32_e32 v21, v23, v22
-; GFX10-NEXT:    v_min_f32_e32 v22, v25, v24
-; GFX10-NEXT:    v_min_f32_e32 v23, v27, v26
-; GFX10-NEXT:    v_min_f32_e32 v7, v7, v15
-; GFX10-NEXT:    v_min_f32_e32 v6, v6, v14
-; GFX10-NEXT:    v_min_f32_e32 v5, v5, v13
+; GFX10-NEXT:    v_lshlrev_b32_e32 v17, 16, v9
+; GFX10-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
+; GFX10-NEXT:    v_lshlrev_b32_e32 v19, 16, v10
+; GFX10-NEXT:    v_lshlrev_b32_e32 v20, 16, v2
+; GFX10-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
+; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX10-NEXT:    v_min_f32_e32 v0, v0, v8
+; GFX10-NEXT:    v_min_f32_e32 v8, v18, v17
 ; GFX10-NEXT:    v_min_f32_e32 v1, v1, v9
+; GFX10-NEXT:    v_min_f32_e32 v9, v20, v19
 ; GFX10-NEXT:    v_min_f32_e32 v2, v2, v10
-; GFX10-NEXT:    v_min_f32_e32 v3, v3, v11
+; GFX10-NEXT:    v_perm_b32 v0, v0, v16, 0x7060302
+; GFX10-NEXT:    v_and_b32_e32 v10, 0xffff0000, v11
+; GFX10-NEXT:    v_perm_b32 v1, v1, v8, 0x7060302
+; GFX10-NEXT:    v_lshlrev_b32_e32 v8, 16, v11
+; GFX10-NEXT:    v_perm_b32 v2, v2, v9, 0x7060302
+; GFX10-NEXT:    v_lshlrev_b32_e32 v9, 16, v3
+; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX10-NEXT:    v_lshlrev_b32_e32 v11, 16, v12
+; GFX10-NEXT:    v_lshlrev_b32_e32 v16, 16, v4
+; GFX10-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
+; GFX10-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GFX10-NEXT:    v_lshlrev_b32_e32 v17, 16, v13
+; GFX10-NEXT:    v_lshlrev_b32_e32 v18, 16, v5
+; GFX10-NEXT:    v_min_f32_e32 v8, v9, v8
+; GFX10-NEXT:    v_min_f32_e32 v3, v3, v10
+; GFX10-NEXT:    v_min_f32_e32 v9, v16, v11
 ; GFX10-NEXT:    v_min_f32_e32 v4, v4, v12
-; GFX10-NEXT:    v_perm_b32 v0, v0, v23, 0x3020706
-; GFX10-NEXT:    v_perm_b32 v1, v1, v22, 0x3020706
-; GFX10-NEXT:    v_perm_b32 v2, v2, v21, 0x3020706
-; GFX10-NEXT:    v_perm_b32 v3, v3, v20, 0x3020706
-; GFX10-NEXT:    v_perm_b32 v4, v4, v19, 0x3020706
-; GFX10-NEXT:    v_perm_b32 v5, v5, v18, 0x3020706
-; GFX10-NEXT:    v_perm_b32 v6, v6, v17, 0x3020706
-; GFX10-NEXT:    v_perm_b32 v7, v7, v16, 0x3020706
+; GFX10-NEXT:    v_min_f32_e32 v10, v18, v17
+; GFX10-NEXT:    v_and_b32_e32 v11, 0xffff0000, v13
+; GFX10-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
+; GFX10-NEXT:    v_lshlrev_b32_e32 v12, 16, v14
+; GFX10-NEXT:    v_lshlrev_b32_e32 v13, 16, v6
+; GFX10-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
+; GFX10-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
+; GFX10-NEXT:    v_lshlrev_b32_e32 v16, 16, v15
+; GFX10-NEXT:    v_lshlrev_b32_e32 v17, 16, v7
+; GFX10-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
+; GFX10-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
+; GFX10-NEXT:    v_min_f32_e32 v5, v5, v11
+; GFX10-NEXT:    v_min_f32_e32 v11, v13, v12
+; GFX10-NEXT:    v_min_f32_e32 v6, v6, v14
+; GFX10-NEXT:    v_min_f32_e32 v12, v17, v16
+; GFX10-NEXT:    v_min_f32_e32 v7, v7, v15
+; GFX10-NEXT:    v_perm_b32 v3, v3, v8, 0x7060302
+; GFX10-NEXT:    v_perm_b32 v4, v4, v9, 0x7060302
+; GFX10-NEXT:    v_perm_b32 v5, v5, v10, 0x7060302
+; GFX10-NEXT:    v_perm_b32 v6, v6, v11, 0x7060302
+; GFX10-NEXT:    v_perm_b32 v7, v7, v12, 0x7060302
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_minnum_v16bf16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_and_b32_e32 v17, 0xffff0000, v7
-; GFX11-NEXT:    v_and_b32_e32 v16, 0xffff0000, v15
-; GFX11-NEXT:    v_and_b32_e32 v22, 0xffff0000, v4
-; GFX11-NEXT:    v_and_b32_e32 v20, 0xffff0000, v5
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_dual_max_f32 v17, v17, v17 :: v_dual_and_b32 v18, 0xffff0000, v14
-; GFX11-NEXT:    v_dual_max_f32 v16, v16, v16 :: v_dual_and_b32 v19, 0xffff0000, v6
-; GFX11-NEXT:    v_and_b32_e32 v23, 0xffff0000, v3
-; GFX11-NEXT:    v_and_b32_e32 v24, 0xffff0000, v10
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_dual_min_f32 v16, v17, v16 :: v_dual_and_b32 v25, 0xffff0000, v1
-; GFX11-NEXT:    v_dual_max_f32 v17, v18, v18 :: v_dual_max_f32 v18, v19, v19
-; GFX11-NEXT:    v_and_b32_e32 v19, 0xffff0000, v13
-; GFX11-NEXT:    v_and_b32_e32 v21, 0xffff0000, v12
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_dual_max_f32 v25, v25, v25 :: v_dual_and_b32 v26, 0xffff0000, v8
-; GFX11-NEXT:    v_dual_min_f32 v17, v18, v17 :: v_dual_max_f32 v18, v19, v19
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_dual_max_f32 v19, v20, v20 :: v_dual_max_f32 v20, v21, v21
-; GFX11-NEXT:    v_dual_max_f32 v21, v22, v22 :: v_dual_and_b32 v22, 0xffff0000, v11
-; GFX11-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
-; GFX11-NEXT:    v_dual_min_f32 v18, v19, v18 :: v_dual_lshlrev_b32 v7, 16, v7
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_dual_min_f32 v19, v21, v20 :: v_dual_max_f32 v20, v22, v22
-; GFX11-NEXT:    v_dual_max_f32 v14, v14, v14 :: v_dual_max_f32 v21, v23, v23
-; GFX11-NEXT:    v_dual_max_f32 v22, v24, v24 :: v_dual_lshlrev_b32 v15, 16, v15
-; GFX11-NEXT:    v_and_b32_e32 v23, 0xffff0000, v2
-; GFX11-NEXT:    v_and_b32_e32 v24, 0xffff0000, v9
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_dual_min_f32 v20, v21, v20 :: v_dual_max_f32 v15, v15, v15
-; GFX11-NEXT:    v_dual_max_f32 v26, v26, v26 :: v_dual_and_b32 v27, 0xffff0000, v0
-; GFX11-NEXT:    v_dual_max_f32 v7, v7, v7 :: v_dual_lshlrev_b32 v6, 16, v6
-; GFX11-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
-; GFX11-NEXT:    v_dual_max_f32 v23, v23, v23 :: v_dual_max_f32 v24, v24, v24
-; GFX11-NEXT:    v_dual_max_f32 v27, v27, v27 :: v_dual_max_f32 v6, v6, v6
-; GFX11-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_dual_max_f32 v13, v13, v13 :: v_dual_lshlrev_b32 v10, 16, v10
-; GFX11-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
-; GFX11-NEXT:    v_dual_min_f32 v21, v23, v22 :: v_dual_min_f32 v22, v25, v24
-; GFX11-NEXT:    v_dual_min_f32 v23, v27, v26 :: v_dual_lshlrev_b32 v12, 16, v12
-; GFX11-NEXT:    v_dual_min_f32 v6, v6, v14 :: v_dual_max_f32 v5, v5, v5
-; GFX11-NEXT:    v_dual_max_f32 v10, v10, v10 :: v_dual_max_f32 v11, v11, v11
-; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX11-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
-; GFX11-NEXT:    v_dual_min_f32 v7, v7, v15 :: v_dual_lshlrev_b32 v4, 16, v4
-; GFX11-NEXT:    v_dual_max_f32 v12, v12, v12 :: v_dual_min_f32 v5, v5, v13
-; GFX11-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
-; GFX11-NEXT:    v_dual_max_f32 v0, v0, v0 :: v_dual_lshlrev_b32 v3, 16, v3
-; GFX11-NEXT:    v_dual_max_f32 v9, v9, v9 :: v_dual_lshlrev_b32 v2, 16, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_dual_max_f32 v8, v8, v8 :: v_dual_lshlrev_b32 v1, 16, v1
-; GFX11-NEXT:    v_dual_max_f32 v3, v3, v3 :: v_dual_max_f32 v2, v2, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_dual_max_f32 v4, v4, v4 :: v_dual_max_f32 v1, v1, v1
-; GFX11-NEXT:    v_dual_min_f32 v0, v0, v8 :: v_dual_min_f32 v3, v3, v11
+; GFX11-NEXT:    v_lshlrev_b32_e32 v18, 16, v1
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX11-NEXT:    v_lshlrev_b32_e32 v16, 16, v8
+; GFX11-NEXT:    v_lshlrev_b32_e32 v19, 16, v10
+; GFX11-NEXT:    v_lshlrev_b32_e32 v20, 16, v2
+; GFX11-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX11-NEXT:    v_lshlrev_b32_e32 v17, 16, v0
+; GFX11-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX11-NEXT:    v_min_f32_e32 v2, v2, v10
-; GFX11-NEXT:    v_dual_min_f32 v4, v4, v12 :: v_dual_min_f32 v1, v1, v9
+; GFX11-NEXT:    v_dual_min_f32 v16, v17, v16 :: v_dual_lshlrev_b32 v17, 16, v9
+; GFX11-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_dual_min_f32 v1, v1, v9 :: v_dual_and_b32 v10, 0xffff0000, v11
+; GFX11-NEXT:    v_min_f32_e32 v9, v20, v19
+; GFX11-NEXT:    v_perm_b32 v2, v2, v9, 0x7060302
+; GFX11-NEXT:    v_lshlrev_b32_e32 v9, 16, v3
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_dual_min_f32 v3, v3, v10 :: v_dual_and_b32 v0, 0xffff0000, v0
+; GFX11-NEXT:    v_min_f32_e32 v0, v0, v8
+; GFX11-NEXT:    v_min_f32_e32 v8, v18, v17
+; GFX11-NEXT:    v_lshlrev_b32_e32 v17, 16, v13
+; GFX11-NEXT:    v_lshlrev_b32_e32 v18, 16, v5
+; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
+; GFX11-NEXT:    v_perm_b32 v0, v0, v16, 0x7060302
+; GFX11-NEXT:    v_perm_b32 v1, v1, v8, 0x7060302
+; GFX11-NEXT:    v_lshlrev_b32_e32 v8, 16, v11
+; GFX11-NEXT:    v_lshlrev_b32_e32 v11, 16, v12
+; GFX11-NEXT:    v_lshlrev_b32_e32 v16, 16, v4
+; GFX11-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
+; GFX11-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GFX11-NEXT:    v_dual_min_f32 v10, v18, v17 :: v_dual_lshlrev_b32 v17, 16, v7
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT:    v_dual_min_f32 v4, v4, v12 :: v_dual_and_b32 v7, 0xffff0000, v7
+; GFX11-NEXT:    v_dual_min_f32 v8, v9, v8 :: v_dual_min_f32 v9, v16, v11
+; GFX11-NEXT:    v_lshlrev_b32_e32 v12, 16, v14
+; GFX11-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
+; GFX11-NEXT:    v_lshlrev_b32_e32 v16, 16, v15
+; GFX11-NEXT:    v_and_b32_e32 v11, 0xffff0000, v13
+; GFX11-NEXT:    v_lshlrev_b32_e32 v13, 16, v6
+; GFX11-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
+; GFX11-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
+; GFX11-NEXT:    v_perm_b32 v3, v3, v8, 0x7060302
+; GFX11-NEXT:    v_min_f32_e32 v5, v5, v11
+; GFX11-NEXT:    v_min_f32_e32 v11, v13, v12
+; GFX11-NEXT:    v_min_f32_e32 v12, v17, v16
+; GFX11-NEXT:    v_dual_min_f32 v6, v6, v14 :: v_dual_min_f32 v7, v7, v15
+; GFX11-NEXT:    v_perm_b32 v4, v4, v9, 0x7060302
+; GFX11-NEXT:    v_perm_b32 v5, v5, v10, 0x7060302
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_perm_b32 v0, v0, v23, 0x3020706
-; GFX11-NEXT:    v_perm_b32 v3, v3, v20, 0x3020706
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_perm_b32 v2, v2, v21, 0x3020706
-; GFX11-NEXT:    v_perm_b32 v4, v4, v19, 0x3020706
-; GFX11-NEXT:    v_perm_b32 v1, v1, v22, 0x3020706
-; GFX11-NEXT:    v_perm_b32 v5, v5, v18, 0x3020706
-; GFX11-NEXT:    v_perm_b32 v6, v6, v17, 0x3020706
-; GFX11-NEXT:    v_perm_b32 v7, v7, v16, 0x3020706
+; GFX11-NEXT:    v_perm_b32 v6, v6, v11, 0x7060302
+; GFX11-NEXT:    v_perm_b32 v7, v7, v12, 0x7060302
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = call <16 x bfloat> @llvm.minnum.v16bf16(<16 x bfloat> %a, <16 x bfloat> %b)
   ret <16 x bfloat> %op
@@ -15284,703 +14858,484 @@ define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
 ; GFX8-LABEL: v_minnum_v32bf16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_and_b32_e32 v31, 0xffff0000, v30
-; GFX8-NEXT:    v_and_b32_e32 v32, 0xffff0000, v14
-; GFX8-NEXT:    v_lshlrev_b32_e32 v30, 16, v30
-; GFX8-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
-; GFX8-NEXT:    v_mul_f32_e32 v31, 1.0, v31
-; GFX8-NEXT:    v_mul_f32_e32 v32, 1.0, v32
-; GFX8-NEXT:    v_mul_f32_e32 v30, 1.0, v30
-; GFX8-NEXT:    v_mul_f32_e32 v14, 1.0, v14
+; GFX8-NEXT:    v_lshlrev_b32_e32 v31, 16, v16
+; GFX8-NEXT:    v_lshlrev_b32_e32 v32, 16, v0
+; GFX8-NEXT:    v_and_b32_e32 v16, 0xffff0000, v16
+; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    v_min_f32_e32 v0, v0, v16
 ; GFX8-NEXT:    v_min_f32_e32 v31, v32, v31
-; GFX8-NEXT:    v_min_f32_e32 v14, v14, v30
-; GFX8-NEXT:    v_and_b32_e32 v30, 0xffff0000, v29
-; GFX8-NEXT:    v_and_b32_e32 v32, 0xffff0000, v13
-; GFX8-NEXT:    v_lshlrev_b32_e32 v29, 16, v29
-; GFX8-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
-; GFX8-NEXT:    v_mul_f32_e32 v30, 1.0, v30
-; GFX8-NEXT:    v_mul_f32_e32 v32, 1.0, v32
-; GFX8-NEXT:    v_mul_f32_e32 v29, 1.0, v29
-; GFX8-NEXT:    v_mul_f32_e32 v13, 1.0, v13
-; GFX8-NEXT:    v_min_f32_e32 v30, v32, v30
-; GFX8-NEXT:    v_min_f32_e32 v13, v13, v29
-; GFX8-NEXT:    v_and_b32_e32 v29, 0xffff0000, v28
-; GFX8-NEXT:    v_and_b32_e32 v32, 0xffff0000, v12
-; GFX8-NEXT:    v_lshlrev_b32_e32 v28, 16, v28
-; GFX8-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
-; GFX8-NEXT:    v_mul_f32_e32 v29, 1.0, v29
-; GFX8-NEXT:    v_mul_f32_e32 v32, 1.0, v32
-; GFX8-NEXT:    v_mul_f32_e32 v28, 1.0, v28
-; GFX8-NEXT:    v_mul_f32_e32 v12, 1.0, v12
-; GFX8-NEXT:    v_min_f32_e32 v29, v32, v29
-; GFX8-NEXT:    v_min_f32_e32 v12, v12, v28
-; GFX8-NEXT:    v_and_b32_e32 v28, 0xffff0000, v27
-; GFX8-NEXT:    v_and_b32_e32 v32, 0xffff0000, v11
-; GFX8-NEXT:    v_mul_f32_e32 v28, 1.0, v28
-; GFX8-NEXT:    v_mul_f32_e32 v32, 1.0, v32
-; GFX8-NEXT:    v_min_f32_e32 v28, v32, v28
-; GFX8-NEXT:    buffer_load_dword v32, off, s[0:3], s32
-; GFX8-NEXT:    v_lshlrev_b32_e32 v27, 16, v27
-; GFX8-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
-; GFX8-NEXT:    v_mul_f32_e32 v27, 1.0, v27
-; GFX8-NEXT:    v_mul_f32_e32 v11, 1.0, v11
-; GFX8-NEXT:    v_min_f32_e32 v11, v11, v27
-; GFX8-NEXT:    v_and_b32_e32 v27, 0xffff0000, v15
-; GFX8-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
-; GFX8-NEXT:    v_mul_f32_e32 v27, 1.0, v27
-; GFX8-NEXT:    v_mul_f32_e32 v15, 1.0, v15
-; GFX8-NEXT:    s_mov_b32 s4, 0x3020706
-; GFX8-NEXT:    v_perm_b32 v11, v11, v28, s4
-; GFX8-NEXT:    v_perm_b32 v12, v12, v29, s4
-; GFX8-NEXT:    v_perm_b32 v13, v13, v30, s4
-; GFX8-NEXT:    v_perm_b32 v14, v14, v31, s4
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_and_b32_e32 v33, 0xffff0000, v32
-; GFX8-NEXT:    v_lshlrev_b32_e32 v32, 16, v32
-; GFX8-NEXT:    v_mul_f32_e32 v33, 1.0, v33
-; GFX8-NEXT:    v_mul_f32_e32 v32, 1.0, v32
-; GFX8-NEXT:    v_min_f32_e32 v27, v27, v33
-; GFX8-NEXT:    v_min_f32_e32 v15, v15, v32
-; GFX8-NEXT:    v_and_b32_e32 v32, 0xffff0000, v26
-; GFX8-NEXT:    v_and_b32_e32 v33, 0xffff0000, v10
-; GFX8-NEXT:    v_lshlrev_b32_e32 v26, 16, v26
-; GFX8-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
-; GFX8-NEXT:    v_mul_f32_e32 v32, 1.0, v32
-; GFX8-NEXT:    v_mul_f32_e32 v33, 1.0, v33
-; GFX8-NEXT:    v_mul_f32_e32 v26, 1.0, v26
-; GFX8-NEXT:    v_mul_f32_e32 v10, 1.0, v10
-; GFX8-NEXT:    v_min_f32_e32 v32, v33, v32
-; GFX8-NEXT:    v_min_f32_e32 v10, v10, v26
-; GFX8-NEXT:    v_and_b32_e32 v26, 0xffff0000, v25
-; GFX8-NEXT:    v_and_b32_e32 v33, 0xffff0000, v9
-; GFX8-NEXT:    v_lshlrev_b32_e32 v25, 16, v25
-; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
-; GFX8-NEXT:    v_mul_f32_e32 v26, 1.0, v26
-; GFX8-NEXT:    v_mul_f32_e32 v33, 1.0, v33
-; GFX8-NEXT:    v_mul_f32_e32 v25, 1.0, v25
-; GFX8-NEXT:    v_mul_f32_e32 v9, 1.0, v9
-; GFX8-NEXT:    v_min_f32_e32 v26, v33, v26
-; GFX8-NEXT:    v_min_f32_e32 v9, v9, v25
-; GFX8-NEXT:    v_and_b32_e32 v25, 0xffff0000, v24
-; GFX8-NEXT:    v_and_b32_e32 v33, 0xffff0000, v8
-; GFX8-NEXT:    v_lshlrev_b32_e32 v24, 16, v24
-; GFX8-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
-; GFX8-NEXT:    v_mul_f32_e32 v25, 1.0, v25
-; GFX8-NEXT:    v_mul_f32_e32 v33, 1.0, v33
-; GFX8-NEXT:    v_mul_f32_e32 v24, 1.0, v24
-; GFX8-NEXT:    v_mul_f32_e32 v8, 1.0, v8
-; GFX8-NEXT:    v_min_f32_e32 v25, v33, v25
-; GFX8-NEXT:    v_min_f32_e32 v8, v8, v24
-; GFX8-NEXT:    v_and_b32_e32 v24, 0xffff0000, v23
-; GFX8-NEXT:    v_and_b32_e32 v33, 0xffff0000, v7
-; GFX8-NEXT:    v_lshlrev_b32_e32 v23, 16, v23
-; GFX8-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
-; GFX8-NEXT:    v_mul_f32_e32 v24, 1.0, v24
-; GFX8-NEXT:    v_mul_f32_e32 v33, 1.0, v33
-; GFX8-NEXT:    v_mul_f32_e32 v23, 1.0, v23
-; GFX8-NEXT:    v_mul_f32_e32 v7, 1.0, v7
-; GFX8-NEXT:    v_min_f32_e32 v24, v33, v24
-; GFX8-NEXT:    v_min_f32_e32 v7, v7, v23
-; GFX8-NEXT:    v_and_b32_e32 v23, 0xffff0000, v22
-; GFX8-NEXT:    v_and_b32_e32 v33, 0xffff0000, v6
-; GFX8-NEXT:    v_lshlrev_b32_e32 v22, 16, v22
-; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
-; GFX8-NEXT:    v_mul_f32_e32 v23, 1.0, v23
-; GFX8-NEXT:    v_mul_f32_e32 v33, 1.0, v33
-; GFX8-NEXT:    v_mul_f32_e32 v22, 1.0, v22
-; GFX8-NEXT:    v_mul_f32_e32 v6, 1.0, v6
-; GFX8-NEXT:    v_min_f32_e32 v23, v33, v23
-; GFX8-NEXT:    v_min_f32_e32 v6, v6, v22
-; GFX8-NEXT:    v_and_b32_e32 v22, 0xffff0000, v21
-; GFX8-NEXT:    v_and_b32_e32 v33, 0xffff0000, v5
-; GFX8-NEXT:    v_lshlrev_b32_e32 v21, 16, v21
-; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX8-NEXT:    v_mul_f32_e32 v22, 1.0, v22
-; GFX8-NEXT:    v_mul_f32_e32 v33, 1.0, v33
-; GFX8-NEXT:    v_mul_f32_e32 v21, 1.0, v21
-; GFX8-NEXT:    v_mul_f32_e32 v5, 1.0, v5
-; GFX8-NEXT:    v_min_f32_e32 v22, v33, v22
-; GFX8-NEXT:    v_min_f32_e32 v5, v5, v21
-; GFX8-NEXT:    v_and_b32_e32 v21, 0xffff0000, v20
-; GFX8-NEXT:    v_and_b32_e32 v33, 0xffff0000, v4
-; GFX8-NEXT:    v_lshlrev_b32_e32 v20, 16, v20
-; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX8-NEXT:    v_mul_f32_e32 v21, 1.0, v21
-; GFX8-NEXT:    v_mul_f32_e32 v33, 1.0, v33
-; GFX8-NEXT:    v_mul_f32_e32 v20, 1.0, v20
-; GFX8-NEXT:    v_mul_f32_e32 v4, 1.0, v4
-; GFX8-NEXT:    v_min_f32_e32 v21, v33, v21
-; GFX8-NEXT:    v_min_f32_e32 v4, v4, v20
-; GFX8-NEXT:    v_and_b32_e32 v20, 0xffff0000, v19
-; GFX8-NEXT:    v_and_b32_e32 v33, 0xffff0000, v3
-; GFX8-NEXT:    v_lshlrev_b32_e32 v19, 16, v19
-; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX8-NEXT:    v_mul_f32_e32 v20, 1.0, v20
-; GFX8-NEXT:    v_mul_f32_e32 v33, 1.0, v33
-; GFX8-NEXT:    v_mul_f32_e32 v19, 1.0, v19
-; GFX8-NEXT:    v_mul_f32_e32 v3, 1.0, v3
-; GFX8-NEXT:    v_min_f32_e32 v20, v33, v20
-; GFX8-NEXT:    v_min_f32_e32 v3, v3, v19
-; GFX8-NEXT:    v_and_b32_e32 v19, 0xffff0000, v18
-; GFX8-NEXT:    v_and_b32_e32 v33, 0xffff0000, v2
-; GFX8-NEXT:    v_lshlrev_b32_e32 v18, 16, v18
-; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX8-NEXT:    v_mul_f32_e32 v19, 1.0, v19
-; GFX8-NEXT:    v_mul_f32_e32 v33, 1.0, v33
-; GFX8-NEXT:    v_mul_f32_e32 v18, 1.0, v18
-; GFX8-NEXT:    v_mul_f32_e32 v2, 1.0, v2
-; GFX8-NEXT:    v_min_f32_e32 v19, v33, v19
-; GFX8-NEXT:    v_min_f32_e32 v2, v2, v18
-; GFX8-NEXT:    v_and_b32_e32 v18, 0xffff0000, v17
-; GFX8-NEXT:    v_and_b32_e32 v33, 0xffff0000, v1
-; GFX8-NEXT:    v_lshlrev_b32_e32 v17, 16, v17
-; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX8-NEXT:    v_mul_f32_e32 v18, 1.0, v18
-; GFX8-NEXT:    v_mul_f32_e32 v33, 1.0, v33
-; GFX8-NEXT:    v_mul_f32_e32 v17, 1.0, v17
-; GFX8-NEXT:    v_mul_f32_e32 v1, 1.0, v1
-; GFX8-NEXT:    v_min_f32_e32 v18, v33, v18
+; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX8-NEXT:    v_alignbit_b32 v0, v0, v31, 16
+; GFX8-NEXT:    v_lshlrev_b32_e32 v16, 16, v17
+; GFX8-NEXT:    v_lshlrev_b32_e32 v31, 16, v1
+; GFX8-NEXT:    v_and_b32_e32 v17, 0xffff0000, v17
+; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX8-NEXT:    v_min_f32_e32 v1, v1, v17
-; GFX8-NEXT:    v_and_b32_e32 v17, 0xffff0000, v16
-; GFX8-NEXT:    v_and_b32_e32 v33, 0xffff0000, v0
-; GFX8-NEXT:    v_lshlrev_b32_e32 v16, 16, v16
-; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX8-NEXT:    v_mul_f32_e32 v17, 1.0, v17
-; GFX8-NEXT:    v_mul_f32_e32 v33, 1.0, v33
-; GFX8-NEXT:    v_mul_f32_e32 v16, 1.0, v16
-; GFX8-NEXT:    v_mul_f32_e32 v0, 1.0, v0
-; GFX8-NEXT:    v_min_f32_e32 v17, v33, v17
-; GFX8-NEXT:    v_min_f32_e32 v0, v0, v16
-; GFX8-NEXT:    v_perm_b32 v0, v0, v17, s4
-; GFX8-NEXT:    v_perm_b32 v1, v1, v18, s4
-; GFX8-NEXT:    v_perm_b32 v2, v2, v19, s4
-; GFX8-NEXT:    v_perm_b32 v3, v3, v20, s4
-; GFX8-NEXT:    v_perm_b32 v4, v4, v21, s4
-; GFX8-NEXT:    v_perm_b32 v5, v5, v22, s4
-; GFX8-NEXT:    v_perm_b32 v6, v6, v23, s4
-; GFX8-NEXT:    v_perm_b32 v7, v7, v24, s4
-; GFX8-NEXT:    v_perm_b32 v8, v8, v25, s4
-; GFX8-NEXT:    v_perm_b32 v9, v9, v26, s4
-; GFX8-NEXT:    v_perm_b32 v10, v10, v32, s4
-; GFX8-NEXT:    v_perm_b32 v15, v15, v27, s4
+; GFX8-NEXT:    v_min_f32_e32 v16, v31, v16
+; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX8-NEXT:    v_alignbit_b32 v1, v1, v16, 16
+; GFX8-NEXT:    v_lshlrev_b32_e32 v16, 16, v18
+; GFX8-NEXT:    v_lshlrev_b32_e32 v17, 16, v2
+; GFX8-NEXT:    v_min_f32_e32 v16, v17, v16
+; GFX8-NEXT:    v_and_b32_e32 v17, 0xffff0000, v18
+; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX8-NEXT:    v_min_f32_e32 v2, v2, v17
+; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX8-NEXT:    v_alignbit_b32 v2, v2, v16, 16
+; GFX8-NEXT:    v_lshlrev_b32_e32 v16, 16, v19
+; GFX8-NEXT:    v_lshlrev_b32_e32 v17, 16, v3
+; GFX8-NEXT:    v_min_f32_e32 v16, v17, v16
+; GFX8-NEXT:    v_and_b32_e32 v17, 0xffff0000, v19
+; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX8-NEXT:    v_min_f32_e32 v3, v3, v17
+; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX8-NEXT:    v_alignbit_b32 v3, v3, v16, 16
+; GFX8-NEXT:    v_lshlrev_b32_e32 v16, 16, v20
+; GFX8-NEXT:    v_lshlrev_b32_e32 v17, 16, v4
+; GFX8-NEXT:    v_min_f32_e32 v16, v17, v16
+; GFX8-NEXT:    v_and_b32_e32 v17, 0xffff0000, v20
+; GFX8-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GFX8-NEXT:    v_min_f32_e32 v4, v4, v17
+; GFX8-NEXT:    buffer_load_dword v17, off, s[0:3], s32
+; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
+; GFX8-NEXT:    v_alignbit_b32 v4, v4, v16, 16
+; GFX8-NEXT:    v_lshlrev_b32_e32 v16, 16, v21
+; GFX8-NEXT:    v_lshlrev_b32_e32 v18, 16, v5
+; GFX8-NEXT:    v_min_f32_e32 v16, v18, v16
+; GFX8-NEXT:    v_and_b32_e32 v18, 0xffff0000, v21
+; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
+; GFX8-NEXT:    v_min_f32_e32 v5, v5, v18
+; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
+; GFX8-NEXT:    v_alignbit_b32 v5, v5, v16, 16
+; GFX8-NEXT:    v_lshlrev_b32_e32 v16, 16, v22
+; GFX8-NEXT:    v_lshlrev_b32_e32 v18, 16, v6
+; GFX8-NEXT:    v_min_f32_e32 v16, v18, v16
+; GFX8-NEXT:    v_and_b32_e32 v18, 0xffff0000, v22
+; GFX8-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
+; GFX8-NEXT:    v_min_f32_e32 v6, v6, v18
+; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
+; GFX8-NEXT:    v_alignbit_b32 v6, v6, v16, 16
+; GFX8-NEXT:    v_lshlrev_b32_e32 v16, 16, v23
+; GFX8-NEXT:    v_lshlrev_b32_e32 v18, 16, v7
+; GFX8-NEXT:    v_min_f32_e32 v16, v18, v16
+; GFX8-NEXT:    v_and_b32_e32 v18, 0xffff0000, v23
+; GFX8-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
+; GFX8-NEXT:    v_min_f32_e32 v7, v7, v18
+; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
+; GFX8-NEXT:    v_alignbit_b32 v7, v7, v16, 16
+; GFX8-NEXT:    v_lshlrev_b32_e32 v16, 16, v24
+; GFX8-NEXT:    v_lshlrev_b32_e32 v18, 16, v8
+; GFX8-NEXT:    v_min_f32_e32 v16, v18, v16
+; GFX8-NEXT:    v_and_b32_e32 v18, 0xffff0000, v24
+; GFX8-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
+; GFX8-NEXT:    v_min_f32_e32 v8, v8, v18
+; GFX8-NEXT:    v_lshrrev_b32_e32 v8, 16, v8
+; GFX8-NEXT:    v_alignbit_b32 v8, v8, v16, 16
+; GFX8-NEXT:    v_lshlrev_b32_e32 v16, 16, v25
+; GFX8-NEXT:    v_lshlrev_b32_e32 v18, 16, v9
+; GFX8-NEXT:    v_min_f32_e32 v16, v18, v16
+; GFX8-NEXT:    v_and_b32_e32 v18, 0xffff0000, v25
+; GFX8-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
+; GFX8-NEXT:    v_min_f32_e32 v9, v9, v18
+; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 16, v9
+; GFX8-NEXT:    v_alignbit_b32 v9, v9, v16, 16
+; GFX8-NEXT:    v_lshlrev_b32_e32 v16, 16, v26
+; GFX8-NEXT:    v_lshlrev_b32_e32 v18, 16, v10
+; GFX8-NEXT:    v_min_f32_e32 v16, v18, v16
+; GFX8-NEXT:    v_and_b32_e32 v18, 0xffff0000, v26
+; GFX8-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
+; GFX8-NEXT:    v_min_f32_e32 v10, v10, v18
+; GFX8-NEXT:    v_lshrrev_b32_e32 v10, 16, v10
+; GFX8-NEXT:    v_alignbit_b32 v10, v10, v16, 16
+; GFX8-NEXT:    v_lshlrev_b32_e32 v16, 16, v27
+; GFX8-NEXT:    v_lshlrev_b32_e32 v18, 16, v11
+; GFX8-NEXT:    v_min_f32_e32 v16, v18, v16
+; GFX8-NEXT:    v_and_b32_e32 v18, 0xffff0000, v27
+; GFX8-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
+; GFX8-NEXT:    v_min_f32_e32 v11, v11, v18
+; GFX8-NEXT:    v_lshrrev_b32_e32 v11, 16, v11
+; GFX8-NEXT:    v_alignbit_b32 v11, v11, v16, 16
+; GFX8-NEXT:    v_lshlrev_b32_e32 v16, 16, v28
+; GFX8-NEXT:    v_lshlrev_b32_e32 v18, 16, v12
+; GFX8-NEXT:    v_min_f32_e32 v16, v18, v16
+; GFX8-NEXT:    v_and_b32_e32 v18, 0xffff0000, v28
+; GFX8-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
+; GFX8-NEXT:    v_min_f32_e32 v12, v12, v18
+; GFX8-NEXT:    v_lshrrev_b32_e32 v12, 16, v12
+; GFX8-NEXT:    v_alignbit_b32 v12, v12, v16, 16
+; GFX8-NEXT:    v_lshlrev_b32_e32 v16, 16, v29
+; GFX8-NEXT:    v_lshlrev_b32_e32 v18, 16, v13
+; GFX8-NEXT:    v_min_f32_e32 v16, v18, v16
+; GFX8-NEXT:    v_and_b32_e32 v18, 0xffff0000, v29
+; GFX8-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
+; GFX8-NEXT:    v_min_f32_e32 v13, v13, v18
+; GFX8-NEXT:    v_lshrrev_b32_e32 v13, 16, v13
+; GFX8-NEXT:    v_alignbit_b32 v13, v13, v16, 16
+; GFX8-NEXT:    v_lshlrev_b32_e32 v16, 16, v30
+; GFX8-NEXT:    v_lshlrev_b32_e32 v18, 16, v14
+; GFX8-NEXT:    v_min_f32_e32 v16, v18, v16
+; GFX8-NEXT:    v_and_b32_e32 v18, 0xffff0000, v30
+; GFX8-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
+; GFX8-NEXT:    v_min_f32_e32 v14, v14, v18
+; GFX8-NEXT:    v_lshrrev_b32_e32 v14, 16, v14
+; GFX8-NEXT:    v_alignbit_b32 v14, v14, v16, 16
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_lshlrev_b32_e32 v16, 16, v17
+; GFX8-NEXT:    v_lshlrev_b32_e32 v18, 16, v15
+; GFX8-NEXT:    v_and_b32_e32 v17, 0xffff0000, v17
+; GFX8-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
+; GFX8-NEXT:    v_min_f32_e32 v15, v15, v17
+; GFX8-NEXT:    v_min_f32_e32 v16, v18, v16
+; GFX8-NEXT:    v_lshrrev_b32_e32 v15, 16, v15
+; GFX8-NEXT:    v_alignbit_b32 v15, v15, v16, 16
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_minnum_v32bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    buffer_load_dword v35, off, s[0:3], s32
-; GFX9-NEXT:    v_and_b32_e32 v31, 0xffff0000, v30
-; GFX9-NEXT:    v_and_b32_e32 v32, 0xffff0000, v14
-; GFX9-NEXT:    v_and_b32_e32 v33, 0xffff0000, v29
-; GFX9-NEXT:    v_and_b32_e32 v34, 0xffff0000, v13
-; GFX9-NEXT:    v_and_b32_e32 v36, 0xffff0000, v28
-; GFX9-NEXT:    v_and_b32_e32 v37, 0xffff0000, v12
-; GFX9-NEXT:    v_and_b32_e32 v50, 0xffff0000, v25
-; GFX9-NEXT:    v_and_b32_e32 v51, 0xffff0000, v9
-; GFX9-NEXT:    v_max_f32_e32 v31, v31, v31
-; GFX9-NEXT:    v_max_f32_e32 v32, v32, v32
-; GFX9-NEXT:    v_max_f32_e32 v33, v33, v33
-; GFX9-NEXT:    v_max_f32_e32 v34, v34, v34
-; GFX9-NEXT:    v_max_f32_e32 v36, v36, v36
-; GFX9-NEXT:    v_max_f32_e32 v37, v37, v37
-; GFX9-NEXT:    v_max_f32_e32 v50, v50, v50
-; GFX9-NEXT:    v_max_f32_e32 v51, v51, v51
-; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; GFX9-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; GFX9-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; GFX9-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
-; GFX9-NEXT:    v_and_b32_e32 v38, 0xffff0000, v27
-; GFX9-NEXT:    v_and_b32_e32 v39, 0xffff0000, v11
-; GFX9-NEXT:    v_and_b32_e32 v52, 0xffff0000, v24
-; GFX9-NEXT:    v_and_b32_e32 v53, 0xffff0000, v8
-; GFX9-NEXT:    v_and_b32_e32 v43, 0xffff0000, v21
+; GFX9-NEXT:    v_lshlrev_b32_e32 v31, 16, v16
+; GFX9-NEXT:    v_lshlrev_b32_e32 v32, 16, v0
+; GFX9-NEXT:    v_and_b32_e32 v16, 0xffff0000, v16
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GFX9-NEXT:    v_min_f32_e32 v31, v32, v31
-; GFX9-NEXT:    v_min_f32_e32 v32, v34, v33
-; GFX9-NEXT:    v_min_f32_e32 v33, v37, v36
-; GFX9-NEXT:    v_min_f32_e32 v37, v51, v50
-; GFX9-NEXT:    v_and_b32_e32 v51, 0xffff0000, v5
-; GFX9-NEXT:    v_max_f32_e32 v38, v38, v38
-; GFX9-NEXT:    v_max_f32_e32 v39, v39, v39
-; GFX9-NEXT:    v_max_f32_e32 v52, v52, v52
-; GFX9-NEXT:    v_max_f32_e32 v53, v53, v53
-; GFX9-NEXT:    v_max_f32_e32 v50, v43, v43
-; GFX9-NEXT:    v_max_f32_e32 v51, v51, v51
-; GFX9-NEXT:    v_min_f32_e32 v34, v39, v38
-; GFX9-NEXT:    v_min_f32_e32 v38, v53, v52
-; GFX9-NEXT:    v_min_f32_e32 v50, v51, v50
-; GFX9-NEXT:    v_and_b32_e32 v51, 0xffff0000, v20
-; GFX9-NEXT:    v_and_b32_e32 v52, 0xffff0000, v4
-; GFX9-NEXT:    v_max_f32_e32 v51, v51, v51
-; GFX9-NEXT:    v_max_f32_e32 v52, v52, v52
-; GFX9-NEXT:    v_and_b32_e32 v54, 0xffff0000, v23
-; GFX9-NEXT:    v_and_b32_e32 v55, 0xffff0000, v7
-; GFX9-NEXT:    v_min_f32_e32 v51, v52, v51
-; GFX9-NEXT:    v_and_b32_e32 v52, 0xffff0000, v19
-; GFX9-NEXT:    v_and_b32_e32 v53, 0xffff0000, v3
-; GFX9-NEXT:    v_max_f32_e32 v54, v54, v54
-; GFX9-NEXT:    v_max_f32_e32 v55, v55, v55
-; GFX9-NEXT:    v_max_f32_e32 v52, v52, v52
-; GFX9-NEXT:    v_max_f32_e32 v53, v53, v53
-; GFX9-NEXT:    v_min_f32_e32 v39, v55, v54
-; GFX9-NEXT:    v_min_f32_e32 v52, v53, v52
-; GFX9-NEXT:    v_and_b32_e32 v53, 0xffff0000, v18
-; GFX9-NEXT:    v_and_b32_e32 v54, 0xffff0000, v2
-; GFX9-NEXT:    v_max_f32_e32 v53, v53, v53
-; GFX9-NEXT:    v_max_f32_e32 v54, v54, v54
-; GFX9-NEXT:    v_and_b32_e32 v48, 0xffff0000, v26
-; GFX9-NEXT:    v_and_b32_e32 v49, 0xffff0000, v10
-; GFX9-NEXT:    v_and_b32_e32 v40, 0xffff0000, v22
-; GFX9-NEXT:    v_and_b32_e32 v41, 0xffff0000, v6
-; GFX9-NEXT:    v_min_f32_e32 v53, v54, v53
-; GFX9-NEXT:    v_and_b32_e32 v54, 0xffff0000, v17
-; GFX9-NEXT:    v_and_b32_e32 v55, 0xffff0000, v1
-; GFX9-NEXT:    v_max_f32_e32 v48, v48, v48
-; GFX9-NEXT:    v_max_f32_e32 v49, v49, v49
-; GFX9-NEXT:    v_max_f32_e32 v40, v40, v40
-; GFX9-NEXT:    v_max_f32_e32 v41, v41, v41
-; GFX9-NEXT:    v_max_f32_e32 v54, v54, v54
-; GFX9-NEXT:    v_max_f32_e32 v55, v55, v55
-; GFX9-NEXT:    v_and_b32_e32 v42, 0xffff0000, v15
-; GFX9-NEXT:    v_min_f32_e32 v36, v49, v48
-; GFX9-NEXT:    v_min_f32_e32 v48, v41, v40
-; GFX9-NEXT:    v_min_f32_e32 v54, v55, v54
-; GFX9-NEXT:    v_and_b32_e32 v55, 0xffff0000, v16
-; GFX9-NEXT:    v_and_b32_e32 v40, 0xffff0000, v0
-; GFX9-NEXT:    v_max_f32_e32 v42, v42, v42
-; GFX9-NEXT:    v_max_f32_e32 v55, v55, v55
-; GFX9-NEXT:    v_max_f32_e32 v40, v40, v40
-; GFX9-NEXT:    v_min_f32_e32 v55, v40, v55
-; GFX9-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
-; GFX9-NEXT:    s_waitcnt vmcnt(4)
-; GFX9-NEXT:    v_and_b32_e32 v49, 0xffff0000, v35
-; GFX9-NEXT:    v_max_f32_e32 v49, v49, v49
-; GFX9-NEXT:    v_min_f32_e32 v49, v42, v49
-; GFX9-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; GFX9-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
-; GFX9-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
-; GFX9-NEXT:    v_lshlrev_b32_e32 v35, 16, v35
-; GFX9-NEXT:    v_lshlrev_b32_e32 v30, 16, v30
-; GFX9-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
-; GFX9-NEXT:    v_lshlrev_b32_e32 v29, 16, v29
-; GFX9-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
-; GFX9-NEXT:    v_lshlrev_b32_e32 v28, 16, v28
-; GFX9-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
-; GFX9-NEXT:    v_lshlrev_b32_e32 v27, 16, v27
-; GFX9-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
-; GFX9-NEXT:    v_lshlrev_b32_e32 v26, 16, v26
-; GFX9-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
-; GFX9-NEXT:    v_lshlrev_b32_e32 v25, 16, v25
-; GFX9-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
-; GFX9-NEXT:    v_lshlrev_b32_e32 v24, 16, v24
-; GFX9-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
-; GFX9-NEXT:    v_lshlrev_b32_e32 v23, 16, v23
-; GFX9-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
-; GFX9-NEXT:    v_lshlrev_b32_e32 v22, 16, v22
-; GFX9-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
-; GFX9-NEXT:    v_lshlrev_b32_e32 v21, 16, v21
-; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX9-NEXT:    v_lshlrev_b32_e32 v20, 16, v20
-; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX9-NEXT:    v_lshlrev_b32_e32 v19, 16, v19
-; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX9-NEXT:    v_lshlrev_b32_e32 v18, 16, v18
-; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v17
-; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX9-NEXT:    v_lshlrev_b32_e32 v16, 16, v16
-; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX9-NEXT:    v_max_f32_e32 v35, v35, v35
-; GFX9-NEXT:    v_max_f32_e32 v15, v15, v15
-; GFX9-NEXT:    v_max_f32_e32 v30, v30, v30
-; GFX9-NEXT:    v_max_f32_e32 v14, v14, v14
-; GFX9-NEXT:    v_max_f32_e32 v29, v29, v29
-; GFX9-NEXT:    v_max_f32_e32 v13, v13, v13
-; GFX9-NEXT:    v_max_f32_e32 v28, v28, v28
-; GFX9-NEXT:    v_max_f32_e32 v12, v12, v12
-; GFX9-NEXT:    v_max_f32_e32 v27, v27, v27
-; GFX9-NEXT:    v_max_f32_e32 v11, v11, v11
-; GFX9-NEXT:    v_max_f32_e32 v26, v26, v26
-; GFX9-NEXT:    v_max_f32_e32 v10, v10, v10
-; GFX9-NEXT:    v_max_f32_e32 v25, v25, v25
-; GFX9-NEXT:    v_max_f32_e32 v9, v9, v9
-; GFX9-NEXT:    v_max_f32_e32 v24, v24, v24
-; GFX9-NEXT:    v_max_f32_e32 v8, v8, v8
-; GFX9-NEXT:    v_max_f32_e32 v23, v23, v23
-; GFX9-NEXT:    v_max_f32_e32 v7, v7, v7
-; GFX9-NEXT:    v_max_f32_e32 v22, v22, v22
-; GFX9-NEXT:    v_max_f32_e32 v6, v6, v6
-; GFX9-NEXT:    v_max_f32_e32 v21, v21, v21
-; GFX9-NEXT:    v_max_f32_e32 v5, v5, v5
-; GFX9-NEXT:    v_max_f32_e32 v20, v20, v20
-; GFX9-NEXT:    v_max_f32_e32 v4, v4, v4
-; GFX9-NEXT:    v_max_f32_e32 v19, v19, v19
-; GFX9-NEXT:    v_max_f32_e32 v3, v3, v3
-; GFX9-NEXT:    v_max_f32_e32 v18, v18, v18
-; GFX9-NEXT:    v_max_f32_e32 v2, v2, v2
-; GFX9-NEXT:    v_max_f32_e32 v17, v17, v17
-; GFX9-NEXT:    v_max_f32_e32 v1, v1, v1
-; GFX9-NEXT:    v_max_f32_e32 v16, v16, v16
-; GFX9-NEXT:    v_max_f32_e32 v0, v0, v0
-; GFX9-NEXT:    v_min_f32_e32 v15, v15, v35
-; GFX9-NEXT:    v_min_f32_e32 v14, v14, v30
-; GFX9-NEXT:    v_min_f32_e32 v13, v13, v29
-; GFX9-NEXT:    v_min_f32_e32 v12, v12, v28
-; GFX9-NEXT:    v_min_f32_e32 v11, v11, v27
-; GFX9-NEXT:    v_min_f32_e32 v10, v10, v26
-; GFX9-NEXT:    v_min_f32_e32 v9, v9, v25
-; GFX9-NEXT:    v_min_f32_e32 v8, v8, v24
-; GFX9-NEXT:    v_min_f32_e32 v7, v7, v23
-; GFX9-NEXT:    v_min_f32_e32 v6, v6, v22
-; GFX9-NEXT:    v_min_f32_e32 v5, v5, v21
-; GFX9-NEXT:    v_min_f32_e32 v4, v4, v20
-; GFX9-NEXT:    v_min_f32_e32 v3, v3, v19
-; GFX9-NEXT:    v_min_f32_e32 v2, v2, v18
-; GFX9-NEXT:    v_min_f32_e32 v1, v1, v17
 ; GFX9-NEXT:    v_min_f32_e32 v0, v0, v16
-; GFX9-NEXT:    s_mov_b32 s4, 0x3020706
-; GFX9-NEXT:    v_perm_b32 v0, v0, v55, s4
-; GFX9-NEXT:    v_perm_b32 v1, v1, v54, s4
-; GFX9-NEXT:    v_perm_b32 v2, v2, v53, s4
-; GFX9-NEXT:    v_perm_b32 v3, v3, v52, s4
-; GFX9-NEXT:    v_perm_b32 v4, v4, v51, s4
-; GFX9-NEXT:    v_perm_b32 v5, v5, v50, s4
-; GFX9-NEXT:    v_perm_b32 v6, v6, v48, s4
-; GFX9-NEXT:    v_perm_b32 v7, v7, v39, s4
-; GFX9-NEXT:    v_perm_b32 v8, v8, v38, s4
-; GFX9-NEXT:    v_perm_b32 v9, v9, v37, s4
-; GFX9-NEXT:    v_perm_b32 v10, v10, v36, s4
-; GFX9-NEXT:    v_perm_b32 v11, v11, v34, s4
-; GFX9-NEXT:    v_perm_b32 v12, v12, v33, s4
-; GFX9-NEXT:    v_perm_b32 v13, v13, v32, s4
-; GFX9-NEXT:    v_perm_b32 v14, v14, v31, s4
-; GFX9-NEXT:    v_perm_b32 v15, v15, v49, s4
+; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
+; GFX9-NEXT:    v_perm_b32 v0, v0, v31, s4
+; GFX9-NEXT:    v_lshlrev_b32_e32 v16, 16, v17
+; GFX9-NEXT:    v_lshlrev_b32_e32 v31, 16, v1
+; GFX9-NEXT:    v_and_b32_e32 v17, 0xffff0000, v17
+; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX9-NEXT:    v_min_f32_e32 v16, v31, v16
+; GFX9-NEXT:    v_min_f32_e32 v1, v1, v17
+; GFX9-NEXT:    v_perm_b32 v1, v1, v16, s4
+; GFX9-NEXT:    v_lshlrev_b32_e32 v16, 16, v18
+; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v2
+; GFX9-NEXT:    v_min_f32_e32 v16, v17, v16
+; GFX9-NEXT:    v_and_b32_e32 v17, 0xffff0000, v18
+; GFX9-NEXT:    buffer_load_dword v18, off, s[0:3], s32
+; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX9-NEXT:    v_min_f32_e32 v2, v2, v17
+; GFX9-NEXT:    v_perm_b32 v2, v2, v16, s4
+; GFX9-NEXT:    v_lshlrev_b32_e32 v16, 16, v19
+; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v3
+; GFX9-NEXT:    v_min_f32_e32 v16, v17, v16
+; GFX9-NEXT:    v_and_b32_e32 v17, 0xffff0000, v19
+; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX9-NEXT:    v_min_f32_e32 v3, v3, v17
+; GFX9-NEXT:    v_perm_b32 v3, v3, v16, s4
+; GFX9-NEXT:    v_lshlrev_b32_e32 v16, 16, v20
+; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v4
+; GFX9-NEXT:    v_min_f32_e32 v16, v17, v16
+; GFX9-NEXT:    v_and_b32_e32 v17, 0xffff0000, v20
+; GFX9-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GFX9-NEXT:    v_min_f32_e32 v4, v4, v17
+; GFX9-NEXT:    v_perm_b32 v4, v4, v16, s4
+; GFX9-NEXT:    v_lshlrev_b32_e32 v16, 16, v21
+; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v5
+; GFX9-NEXT:    v_min_f32_e32 v16, v17, v16
+; GFX9-NEXT:    v_and_b32_e32 v17, 0xffff0000, v21
+; GFX9-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
+; GFX9-NEXT:    v_min_f32_e32 v5, v5, v17
+; GFX9-NEXT:    v_perm_b32 v5, v5, v16, s4
+; GFX9-NEXT:    v_lshlrev_b32_e32 v16, 16, v22
+; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v6
+; GFX9-NEXT:    v_min_f32_e32 v16, v17, v16
+; GFX9-NEXT:    v_and_b32_e32 v17, 0xffff0000, v22
+; GFX9-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
+; GFX9-NEXT:    v_min_f32_e32 v6, v6, v17
+; GFX9-NEXT:    v_perm_b32 v6, v6, v16, s4
+; GFX9-NEXT:    v_lshlrev_b32_e32 v16, 16, v23
+; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v7
+; GFX9-NEXT:    v_min_f32_e32 v16, v17, v16
+; GFX9-NEXT:    v_and_b32_e32 v17, 0xffff0000, v23
+; GFX9-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
+; GFX9-NEXT:    v_min_f32_e32 v7, v7, v17
+; GFX9-NEXT:    v_perm_b32 v7, v7, v16, s4
+; GFX9-NEXT:    v_lshlrev_b32_e32 v16, 16, v24
+; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v8
+; GFX9-NEXT:    v_min_f32_e32 v16, v17, v16
+; GFX9-NEXT:    v_and_b32_e32 v17, 0xffff0000, v24
+; GFX9-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
+; GFX9-NEXT:    v_min_f32_e32 v8, v8, v17
+; GFX9-NEXT:    v_perm_b32 v8, v8, v16, s4
+; GFX9-NEXT:    v_lshlrev_b32_e32 v16, 16, v25
+; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v9
+; GFX9-NEXT:    v_min_f32_e32 v16, v17, v16
+; GFX9-NEXT:    v_and_b32_e32 v17, 0xffff0000, v25
+; GFX9-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
+; GFX9-NEXT:    v_min_f32_e32 v9, v9, v17
+; GFX9-NEXT:    v_perm_b32 v9, v9, v16, s4
+; GFX9-NEXT:    v_lshlrev_b32_e32 v16, 16, v26
+; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v10
+; GFX9-NEXT:    v_min_f32_e32 v16, v17, v16
+; GFX9-NEXT:    v_and_b32_e32 v17, 0xffff0000, v26
+; GFX9-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
+; GFX9-NEXT:    v_min_f32_e32 v10, v10, v17
+; GFX9-NEXT:    v_perm_b32 v10, v10, v16, s4
+; GFX9-NEXT:    v_lshlrev_b32_e32 v16, 16, v27
+; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v11
+; GFX9-NEXT:    v_min_f32_e32 v16, v17, v16
+; GFX9-NEXT:    v_and_b32_e32 v17, 0xffff0000, v27
+; GFX9-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
+; GFX9-NEXT:    v_min_f32_e32 v11, v11, v17
+; GFX9-NEXT:    v_perm_b32 v11, v11, v16, s4
+; GFX9-NEXT:    v_lshlrev_b32_e32 v16, 16, v28
+; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v12
+; GFX9-NEXT:    v_min_f32_e32 v16, v17, v16
+; GFX9-NEXT:    v_and_b32_e32 v17, 0xffff0000, v28
+; GFX9-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
+; GFX9-NEXT:    v_min_f32_e32 v12, v12, v17
+; GFX9-NEXT:    v_perm_b32 v12, v12, v16, s4
+; GFX9-NEXT:    v_lshlrev_b32_e32 v16, 16, v29
+; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v13
+; GFX9-NEXT:    v_min_f32_e32 v16, v17, v16
+; GFX9-NEXT:    v_and_b32_e32 v17, 0xffff0000, v29
+; GFX9-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
+; GFX9-NEXT:    v_min_f32_e32 v13, v13, v17
+; GFX9-NEXT:    v_perm_b32 v13, v13, v16, s4
+; GFX9-NEXT:    v_lshlrev_b32_e32 v16, 16, v30
+; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v14
+; GFX9-NEXT:    v_min_f32_e32 v16, v17, v16
+; GFX9-NEXT:    v_and_b32_e32 v17, 0xffff0000, v30
+; GFX9-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
+; GFX9-NEXT:    v_min_f32_e32 v14, v14, v17
+; GFX9-NEXT:    v_perm_b32 v14, v14, v16, s4
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_lshlrev_b32_e32 v16, 16, v18
+; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v15
+; GFX9-NEXT:    v_min_f32_e32 v16, v17, v16
+; GFX9-NEXT:    v_and_b32_e32 v17, 0xffff0000, v18
+; GFX9-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
+; GFX9-NEXT:    v_min_f32_e32 v15, v15, v17
+; GFX9-NEXT:    v_perm_b32 v15, v15, v16, s4
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_minnum_v32bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    buffer_load_dword v31, off, s[0:3], s32
-; GFX10-NEXT:    v_and_b32_e32 v53, 0xffff0000, v24
-; GFX10-NEXT:    v_and_b32_e32 v54, 0xffff0000, v8
-; GFX10-NEXT:    v_and_b32_e32 v55, 0xffff0000, v23
-; GFX10-NEXT:    v_and_b32_e32 v64, 0xffff0000, v7
-; GFX10-NEXT:    v_and_b32_e32 v65, 0xffff0000, v22
-; GFX10-NEXT:    v_and_b32_e32 v66, 0xffff0000, v6
-; GFX10-NEXT:    v_and_b32_e32 v67, 0xffff0000, v21
-; GFX10-NEXT:    v_and_b32_e32 v68, 0xffff0000, v5
-; GFX10-NEXT:    v_max_f32_e32 v53, v53, v53
-; GFX10-NEXT:    v_max_f32_e32 v54, v54, v54
-; GFX10-NEXT:    v_max_f32_e32 v55, v55, v55
-; GFX10-NEXT:    v_max_f32_e32 v64, v64, v64
-; GFX10-NEXT:    v_max_f32_e32 v65, v65, v65
-; GFX10-NEXT:    v_max_f32_e32 v66, v66, v66
-; GFX10-NEXT:    v_max_f32_e32 v67, v67, v67
-; GFX10-NEXT:    v_max_f32_e32 v68, v68, v68
-; GFX10-NEXT:    v_and_b32_e32 v32, 0xffff0000, v30
-; GFX10-NEXT:    v_and_b32_e32 v34, 0xffff0000, v14
-; GFX10-NEXT:    v_and_b32_e32 v35, 0xffff0000, v29
-; GFX10-NEXT:    v_and_b32_e32 v36, 0xffff0000, v13
-; GFX10-NEXT:    v_and_b32_e32 v37, 0xffff0000, v28
-; GFX10-NEXT:    v_and_b32_e32 v38, 0xffff0000, v12
-; GFX10-NEXT:    v_and_b32_e32 v39, 0xffff0000, v27
-; GFX10-NEXT:    v_and_b32_e32 v48, 0xffff0000, v11
-; GFX10-NEXT:    v_and_b32_e32 v49, 0xffff0000, v26
-; GFX10-NEXT:    v_and_b32_e32 v50, 0xffff0000, v10
-; GFX10-NEXT:    v_and_b32_e32 v51, 0xffff0000, v25
-; GFX10-NEXT:    v_and_b32_e32 v52, 0xffff0000, v9
-; GFX10-NEXT:    v_min_f32_e32 v53, v54, v53
-; GFX10-NEXT:    v_and_b32_e32 v54, 0xffff0000, v17
-; GFX10-NEXT:    v_min_f32_e32 v55, v64, v55
-; GFX10-NEXT:    v_and_b32_e32 v64, 0xffff0000, v1
-; GFX10-NEXT:    v_min_f32_e32 v65, v66, v65
-; GFX10-NEXT:    v_and_b32_e32 v66, 0xffff0000, v16
-; GFX10-NEXT:    v_min_f32_e32 v67, v68, v67
-; GFX10-NEXT:    v_and_b32_e32 v68, 0xffff0000, v0
-; GFX10-NEXT:    v_lshlrev_b32_e32 v17, 16, v17
-; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX10-NEXT:    v_lshlrev_b32_e32 v16, 16, v16
-; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX10-NEXT:    v_max_f32_e32 v32, v32, v32
-; GFX10-NEXT:    v_max_f32_e32 v34, v34, v34
-; GFX10-NEXT:    v_max_f32_e32 v35, v35, v35
-; GFX10-NEXT:    v_max_f32_e32 v36, v36, v36
-; GFX10-NEXT:    v_max_f32_e32 v37, v37, v37
-; GFX10-NEXT:    v_max_f32_e32 v38, v38, v38
-; GFX10-NEXT:    v_max_f32_e32 v39, v39, v39
-; GFX10-NEXT:    v_max_f32_e32 v48, v48, v48
-; GFX10-NEXT:    v_max_f32_e32 v49, v49, v49
-; GFX10-NEXT:    v_max_f32_e32 v50, v50, v50
-; GFX10-NEXT:    v_max_f32_e32 v51, v51, v51
-; GFX10-NEXT:    v_max_f32_e32 v52, v52, v52
-; GFX10-NEXT:    v_max_f32_e32 v17, v17, v17
-; GFX10-NEXT:    v_max_f32_e32 v1, v1, v1
-; GFX10-NEXT:    v_max_f32_e32 v16, v16, v16
-; GFX10-NEXT:    v_max_f32_e32 v0, v0, v0
-; GFX10-NEXT:    v_and_b32_e32 v33, 0xffff0000, v15
-; GFX10-NEXT:    v_min_f32_e32 v32, v34, v32
-; GFX10-NEXT:    v_and_b32_e32 v34, 0xffff0000, v20
-; GFX10-NEXT:    v_min_f32_e32 v35, v36, v35
-; GFX10-NEXT:    v_and_b32_e32 v36, 0xffff0000, v4
-; GFX10-NEXT:    v_min_f32_e32 v37, v38, v37
-; GFX10-NEXT:    v_and_b32_e32 v38, 0xffff0000, v19
-; GFX10-NEXT:    v_min_f32_e32 v39, v48, v39
-; GFX10-NEXT:    v_and_b32_e32 v48, 0xffff0000, v3
-; GFX10-NEXT:    v_min_f32_e32 v49, v50, v49
-; GFX10-NEXT:    v_and_b32_e32 v50, 0xffff0000, v18
-; GFX10-NEXT:    v_min_f32_e32 v51, v52, v51
-; GFX10-NEXT:    v_and_b32_e32 v52, 0xffff0000, v2
-; GFX10-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
-; GFX10-NEXT:    v_lshlrev_b32_e32 v30, 16, v30
-; GFX10-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
-; GFX10-NEXT:    v_lshlrev_b32_e32 v29, 16, v29
-; GFX10-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
-; GFX10-NEXT:    v_lshlrev_b32_e32 v28, 16, v28
-; GFX10-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
-; GFX10-NEXT:    v_lshlrev_b32_e32 v27, 16, v27
-; GFX10-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
-; GFX10-NEXT:    v_lshlrev_b32_e32 v26, 16, v26
-; GFX10-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
-; GFX10-NEXT:    v_lshlrev_b32_e32 v25, 16, v25
-; GFX10-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
-; GFX10-NEXT:    v_lshlrev_b32_e32 v24, 16, v24
-; GFX10-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
-; GFX10-NEXT:    v_lshlrev_b32_e32 v23, 16, v23
-; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
-; GFX10-NEXT:    v_lshlrev_b32_e32 v22, 16, v22
-; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
-; GFX10-NEXT:    v_lshlrev_b32_e32 v21, 16, v21
-; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX10-NEXT:    v_lshlrev_b32_e32 v20, 16, v20
-; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX10-NEXT:    v_lshlrev_b32_e32 v19, 16, v19
-; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX10-NEXT:    v_lshlrev_b32_e32 v18, 16, v18
-; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX10-NEXT:    v_lshlrev_b32_e32 v50, 16, v21
+; GFX10-NEXT:    v_lshlrev_b32_e32 v51, 16, v5
+; GFX10-NEXT:    v_and_b32_e32 v21, 0xffff0000, v21
+; GFX10-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
+; GFX10-NEXT:    v_lshlrev_b32_e32 v52, 16, v22
+; GFX10-NEXT:    v_lshlrev_b32_e32 v53, 16, v6
+; GFX10-NEXT:    v_and_b32_e32 v22, 0xffff0000, v22
+; GFX10-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
+; GFX10-NEXT:    v_lshlrev_b32_e32 v54, 16, v23
+; GFX10-NEXT:    v_lshlrev_b32_e32 v55, 16, v7
+; GFX10-NEXT:    v_and_b32_e32 v23, 0xffff0000, v23
+; GFX10-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
+; GFX10-NEXT:    v_lshlrev_b32_e32 v32, 16, v16
+; GFX10-NEXT:    v_lshlrev_b32_e32 v33, 16, v0
+; GFX10-NEXT:    v_and_b32_e32 v16, 0xffff0000, v16
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v34, 16, v17
+; GFX10-NEXT:    v_lshlrev_b32_e32 v35, 16, v1
+; GFX10-NEXT:    v_and_b32_e32 v17, 0xffff0000, v17
+; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX10-NEXT:    v_lshlrev_b32_e32 v36, 16, v18
+; GFX10-NEXT:    v_lshlrev_b32_e32 v37, 16, v2
+; GFX10-NEXT:    v_and_b32_e32 v18, 0xffff0000, v18
+; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX10-NEXT:    v_lshlrev_b32_e32 v38, 16, v19
+; GFX10-NEXT:    v_lshlrev_b32_e32 v39, 16, v3
+; GFX10-NEXT:    v_and_b32_e32 v19, 0xffff0000, v19
+; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX10-NEXT:    v_lshlrev_b32_e32 v48, 16, v20
+; GFX10-NEXT:    v_lshlrev_b32_e32 v49, 16, v4
+; GFX10-NEXT:    v_and_b32_e32 v20, 0xffff0000, v20
+; GFX10-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GFX10-NEXT:    v_min_f32_e32 v5, v5, v21
+; GFX10-NEXT:    v_min_f32_e32 v21, v53, v52
+; GFX10-NEXT:    v_min_f32_e32 v6, v6, v22
+; GFX10-NEXT:    v_min_f32_e32 v22, v55, v54
+; GFX10-NEXT:    v_min_f32_e32 v7, v7, v23
+; GFX10-NEXT:    v_lshlrev_b32_e32 v64, 16, v24
+; GFX10-NEXT:    v_lshlrev_b32_e32 v65, 16, v8
+; GFX10-NEXT:    v_and_b32_e32 v24, 0xffff0000, v24
+; GFX10-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
+; GFX10-NEXT:    v_lshlrev_b32_e32 v66, 16, v25
+; GFX10-NEXT:    v_lshlrev_b32_e32 v67, 16, v9
+; GFX10-NEXT:    v_and_b32_e32 v25, 0xffff0000, v25
+; GFX10-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
+; GFX10-NEXT:    v_lshlrev_b32_e32 v68, 16, v26
+; GFX10-NEXT:    v_min_f32_e32 v32, v33, v32
+; GFX10-NEXT:    v_lshlrev_b32_e32 v33, 16, v10
+; GFX10-NEXT:    v_and_b32_e32 v26, 0xffff0000, v26
+; GFX10-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
 ; GFX10-NEXT:    v_min_f32_e32 v0, v0, v16
+; GFX10-NEXT:    v_lshlrev_b32_e32 v16, 16, v27
+; GFX10-NEXT:    v_min_f32_e32 v34, v35, v34
+; GFX10-NEXT:    v_lshlrev_b32_e32 v35, 16, v11
+; GFX10-NEXT:    v_and_b32_e32 v27, 0xffff0000, v27
+; GFX10-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
 ; GFX10-NEXT:    v_min_f32_e32 v1, v1, v17
-; GFX10-NEXT:    v_max_f32_e32 v33, v33, v33
-; GFX10-NEXT:    v_max_f32_e32 v34, v34, v34
-; GFX10-NEXT:    v_max_f32_e32 v36, v36, v36
-; GFX10-NEXT:    v_max_f32_e32 v38, v38, v38
-; GFX10-NEXT:    v_max_f32_e32 v48, v48, v48
-; GFX10-NEXT:    v_max_f32_e32 v50, v50, v50
-; GFX10-NEXT:    v_max_f32_e32 v52, v52, v52
-; GFX10-NEXT:    v_max_f32_e32 v54, v54, v54
-; GFX10-NEXT:    v_max_f32_e32 v64, v64, v64
-; GFX10-NEXT:    v_max_f32_e32 v66, v66, v66
-; GFX10-NEXT:    v_max_f32_e32 v68, v68, v68
-; GFX10-NEXT:    v_max_f32_e32 v15, v15, v15
-; GFX10-NEXT:    v_max_f32_e32 v30, v30, v30
-; GFX10-NEXT:    v_max_f32_e32 v14, v14, v14
-; GFX10-NEXT:    v_max_f32_e32 v29, v29, v29
-; GFX10-NEXT:    v_max_f32_e32 v13, v13, v13
-; GFX10-NEXT:    v_max_f32_e32 v28, v28, v28
-; GFX10-NEXT:    v_max_f32_e32 v12, v12, v12
-; GFX10-NEXT:    v_max_f32_e32 v27, v27, v27
-; GFX10-NEXT:    v_max_f32_e32 v11, v11, v11
-; GFX10-NEXT:    v_max_f32_e32 v26, v26, v26
-; GFX10-NEXT:    v_max_f32_e32 v10, v10, v10
-; GFX10-NEXT:    v_max_f32_e32 v25, v25, v25
-; GFX10-NEXT:    v_max_f32_e32 v9, v9, v9
-; GFX10-NEXT:    v_max_f32_e32 v24, v24, v24
-; GFX10-NEXT:    v_max_f32_e32 v8, v8, v8
-; GFX10-NEXT:    v_max_f32_e32 v23, v23, v23
-; GFX10-NEXT:    v_max_f32_e32 v7, v7, v7
-; GFX10-NEXT:    v_max_f32_e32 v22, v22, v22
-; GFX10-NEXT:    v_max_f32_e32 v6, v6, v6
-; GFX10-NEXT:    v_max_f32_e32 v21, v21, v21
-; GFX10-NEXT:    v_max_f32_e32 v5, v5, v5
-; GFX10-NEXT:    v_max_f32_e32 v20, v20, v20
-; GFX10-NEXT:    v_max_f32_e32 v4, v4, v4
-; GFX10-NEXT:    v_max_f32_e32 v19, v19, v19
-; GFX10-NEXT:    v_max_f32_e32 v3, v3, v3
-; GFX10-NEXT:    v_max_f32_e32 v18, v18, v18
-; GFX10-NEXT:    v_max_f32_e32 v2, v2, v2
-; GFX10-NEXT:    v_min_f32_e32 v34, v36, v34
-; GFX10-NEXT:    v_min_f32_e32 v36, v48, v38
-; GFX10-NEXT:    v_min_f32_e32 v38, v52, v50
-; GFX10-NEXT:    v_min_f32_e32 v48, v64, v54
-; GFX10-NEXT:    v_min_f32_e32 v50, v68, v66
-; GFX10-NEXT:    v_min_f32_e32 v14, v14, v30
-; GFX10-NEXT:    v_min_f32_e32 v13, v13, v29
-; GFX10-NEXT:    v_min_f32_e32 v12, v12, v28
-; GFX10-NEXT:    v_min_f32_e32 v11, v11, v27
-; GFX10-NEXT:    v_min_f32_e32 v10, v10, v26
-; GFX10-NEXT:    v_min_f32_e32 v9, v9, v25
-; GFX10-NEXT:    v_min_f32_e32 v8, v8, v24
-; GFX10-NEXT:    v_min_f32_e32 v7, v7, v23
-; GFX10-NEXT:    v_min_f32_e32 v6, v6, v22
-; GFX10-NEXT:    v_min_f32_e32 v5, v5, v21
+; GFX10-NEXT:    v_lshlrev_b32_e32 v17, 16, v28
+; GFX10-NEXT:    v_min_f32_e32 v36, v37, v36
+; GFX10-NEXT:    v_lshlrev_b32_e32 v37, 16, v12
+; GFX10-NEXT:    v_and_b32_e32 v28, 0xffff0000, v28
+; GFX10-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
 ; GFX10-NEXT:    v_min_f32_e32 v2, v2, v18
+; GFX10-NEXT:    v_lshlrev_b32_e32 v18, 16, v29
+; GFX10-NEXT:    v_min_f32_e32 v38, v39, v38
+; GFX10-NEXT:    v_lshlrev_b32_e32 v39, 16, v13
+; GFX10-NEXT:    v_and_b32_e32 v29, 0xffff0000, v29
+; GFX10-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
 ; GFX10-NEXT:    v_min_f32_e32 v3, v3, v19
+; GFX10-NEXT:    v_lshlrev_b32_e32 v19, 16, v30
+; GFX10-NEXT:    v_min_f32_e32 v48, v49, v48
+; GFX10-NEXT:    v_lshlrev_b32_e32 v49, 16, v14
+; GFX10-NEXT:    v_and_b32_e32 v30, 0xffff0000, v30
+; GFX10-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
 ; GFX10-NEXT:    v_min_f32_e32 v4, v4, v20
-; GFX10-NEXT:    v_perm_b32 v0, v0, v50, 0x3020706
-; GFX10-NEXT:    v_perm_b32 v1, v1, v48, 0x3020706
-; GFX10-NEXT:    v_perm_b32 v2, v2, v38, 0x3020706
-; GFX10-NEXT:    v_perm_b32 v3, v3, v36, 0x3020706
-; GFX10-NEXT:    v_perm_b32 v4, v4, v34, 0x3020706
-; GFX10-NEXT:    v_perm_b32 v5, v5, v67, 0x3020706
-; GFX10-NEXT:    v_perm_b32 v6, v6, v65, 0x3020706
-; GFX10-NEXT:    v_perm_b32 v7, v7, v55, 0x3020706
-; GFX10-NEXT:    v_perm_b32 v8, v8, v53, 0x3020706
-; GFX10-NEXT:    v_perm_b32 v9, v9, v51, 0x3020706
-; GFX10-NEXT:    v_perm_b32 v10, v10, v49, 0x3020706
-; GFX10-NEXT:    v_perm_b32 v11, v11, v39, 0x3020706
-; GFX10-NEXT:    v_perm_b32 v12, v12, v37, 0x3020706
-; GFX10-NEXT:    v_perm_b32 v13, v13, v35, 0x3020706
-; GFX10-NEXT:    v_perm_b32 v14, v14, v32, 0x3020706
+; GFX10-NEXT:    v_lshlrev_b32_e32 v20, 16, v15
+; GFX10-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
+; GFX10-NEXT:    v_perm_b32 v6, v6, v21, 0x7060302
+; GFX10-NEXT:    v_perm_b32 v7, v7, v22, 0x7060302
+; GFX10-NEXT:    v_min_f32_e32 v50, v51, v50
+; GFX10-NEXT:    v_min_f32_e32 v23, v65, v64
+; GFX10-NEXT:    v_min_f32_e32 v8, v8, v24
+; GFX10-NEXT:    v_min_f32_e32 v24, v67, v66
+; GFX10-NEXT:    v_min_f32_e32 v9, v9, v25
+; GFX10-NEXT:    v_min_f32_e32 v25, v33, v68
+; GFX10-NEXT:    v_min_f32_e32 v10, v10, v26
+; GFX10-NEXT:    v_min_f32_e32 v16, v35, v16
+; GFX10-NEXT:    v_min_f32_e32 v11, v11, v27
+; GFX10-NEXT:    v_min_f32_e32 v17, v37, v17
+; GFX10-NEXT:    v_min_f32_e32 v12, v12, v28
+; GFX10-NEXT:    v_min_f32_e32 v18, v39, v18
+; GFX10-NEXT:    v_min_f32_e32 v13, v13, v29
+; GFX10-NEXT:    v_min_f32_e32 v19, v49, v19
+; GFX10-NEXT:    v_min_f32_e32 v14, v14, v30
+; GFX10-NEXT:    v_perm_b32 v0, v0, v32, 0x7060302
+; GFX10-NEXT:    v_perm_b32 v1, v1, v34, 0x7060302
+; GFX10-NEXT:    v_perm_b32 v2, v2, v36, 0x7060302
+; GFX10-NEXT:    v_perm_b32 v3, v3, v38, 0x7060302
+; GFX10-NEXT:    v_perm_b32 v4, v4, v48, 0x7060302
+; GFX10-NEXT:    v_perm_b32 v5, v5, v50, 0x7060302
+; GFX10-NEXT:    v_perm_b32 v8, v8, v23, 0x7060302
+; GFX10-NEXT:    v_perm_b32 v9, v9, v24, 0x7060302
+; GFX10-NEXT:    v_perm_b32 v10, v10, v25, 0x7060302
+; GFX10-NEXT:    v_perm_b32 v11, v11, v16, 0x7060302
+; GFX10-NEXT:    v_perm_b32 v12, v12, v17, 0x7060302
+; GFX10-NEXT:    v_perm_b32 v13, v13, v18, 0x7060302
+; GFX10-NEXT:    v_perm_b32 v14, v14, v19, 0x7060302
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_and_b32_e32 v16, 0xffff0000, v31
-; GFX10-NEXT:    v_lshlrev_b32_e32 v17, 16, v31
-; GFX10-NEXT:    v_max_f32_e32 v16, v16, v16
-; GFX10-NEXT:    v_max_f32_e32 v17, v17, v17
-; GFX10-NEXT:    v_min_f32_e32 v16, v33, v16
-; GFX10-NEXT:    v_min_f32_e32 v15, v15, v17
-; GFX10-NEXT:    v_perm_b32 v15, v15, v16, 0x3020706
+; GFX10-NEXT:    v_lshlrev_b32_e32 v21, 16, v31
+; GFX10-NEXT:    v_and_b32_e32 v22, 0xffff0000, v31
+; GFX10-NEXT:    v_min_f32_e32 v20, v20, v21
+; GFX10-NEXT:    v_min_f32_e32 v15, v15, v22
+; GFX10-NEXT:    v_perm_b32 v15, v15, v20, 0x7060302
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_minnum_v32bf16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    scratch_load_b32 v31, off, s32
-; GFX11-NEXT:    v_and_b32_e32 v33, 0xffff0000, v30
-; GFX11-NEXT:    v_and_b32_e32 v35, 0xffff0000, v29
-; GFX11-NEXT:    v_and_b32_e32 v34, 0xffff0000, v14
-; GFX11-NEXT:    v_and_b32_e32 v38, 0xffff0000, v12
-; GFX11-NEXT:    v_and_b32_e32 v37, 0xffff0000, v28
-; GFX11-NEXT:    v_and_b32_e32 v39, 0xffff0000, v27
-; GFX11-NEXT:    v_and_b32_e32 v36, 0xffff0000, v13
-; GFX11-NEXT:    v_dual_max_f32 v33, v33, v33 :: v_dual_and_b32 v32, 0xffff0000, v15
-; GFX11-NEXT:    v_and_b32_e32 v49, 0xffff0000, v26
-; GFX11-NEXT:    v_and_b32_e32 v48, 0xffff0000, v11
-; GFX11-NEXT:    v_and_b32_e32 v51, 0xffff0000, v25
-; GFX11-NEXT:    v_and_b32_e32 v50, 0xffff0000, v10
-; GFX11-NEXT:    v_and_b32_e32 v54, 0xffff0000, v8
-; GFX11-NEXT:    v_and_b32_e32 v53, 0xffff0000, v24
-; GFX11-NEXT:    v_and_b32_e32 v55, 0xffff0000, v23
-; GFX11-NEXT:    v_and_b32_e32 v52, 0xffff0000, v9
-; GFX11-NEXT:    v_and_b32_e32 v65, 0xffff0000, v22
-; GFX11-NEXT:    v_and_b32_e32 v67, 0xffff0000, v21
-; GFX11-NEXT:    v_and_b32_e32 v66, 0xffff0000, v6
-; GFX11-NEXT:    v_and_b32_e32 v71, 0xffff0000, v19
-; GFX11-NEXT:    v_and_b32_e32 v68, 0xffff0000, v5
-; GFX11-NEXT:    v_and_b32_e32 v83, 0xffff0000, v17
-; GFX11-NEXT:    v_and_b32_e32 v86, 0xffff0000, v0
-; GFX11-NEXT:    v_and_b32_e32 v85, 0xffff0000, v16
-; GFX11-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
-; GFX11-NEXT:    v_and_b32_e32 v84, 0xffff0000, v1
-; GFX11-NEXT:    v_lshlrev_b32_e32 v25, 16, v25
-; GFX11-NEXT:    v_lshlrev_b32_e32 v26, 16, v26
-; GFX11-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
-; GFX11-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
-; GFX11-NEXT:    v_lshlrev_b32_e32 v21, 16, v21
-; GFX11-NEXT:    v_lshlrev_b32_e32 v22, 16, v22
-; GFX11-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX11-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
-; GFX11-NEXT:    v_lshlrev_b32_e32 v17, 16, v17
-; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-NEXT:    v_dual_max_f32 v35, v35, v35 :: v_dual_max_f32 v34, v34, v34
-; GFX11-NEXT:    v_lshlrev_b32_e32 v16, 16, v16
-; GFX11-NEXT:    v_dual_max_f32 v38, v38, v38 :: v_dual_max_f32 v37, v37, v37
-; GFX11-NEXT:    v_dual_max_f32 v39, v39, v39 :: v_dual_lshlrev_b32 v0, 16, v0
-; GFX11-NEXT:    v_max_f32_e32 v36, v36, v36
-; GFX11-NEXT:    v_dual_max_f32 v65, v65, v65 :: v_dual_and_b32 v64, 0xffff0000, v7
-; GFX11-NEXT:    v_and_b32_e32 v70, 0xffff0000, v4
-; GFX11-NEXT:    v_and_b32_e32 v69, 0xffff0000, v20
-; GFX11-NEXT:    v_and_b32_e32 v81, 0xffff0000, v18
-; GFX11-NEXT:    v_dual_max_f32 v83, v83, v83 :: v_dual_and_b32 v82, 0xffff0000, v2
-; GFX11-NEXT:    v_dual_max_f32 v17, v17, v17 :: v_dual_lshlrev_b32 v18, 16, v18
-; GFX11-NEXT:    v_dual_max_f32 v1, v1, v1 :: v_dual_lshlrev_b32 v2, 16, v2
-; GFX11-NEXT:    v_dual_max_f32 v49, v49, v49 :: v_dual_max_f32 v48, v48, v48
-; GFX11-NEXT:    v_dual_max_f32 v51, v51, v51 :: v_dual_max_f32 v50, v50, v50
-; GFX11-NEXT:    v_dual_max_f32 v54, v54, v54 :: v_dual_max_f32 v53, v53, v53
-; GFX11-NEXT:    v_dual_max_f32 v67, v67, v67 :: v_dual_max_f32 v66, v66, v66
-; GFX11-NEXT:    v_dual_max_f32 v25, v25, v25 :: v_dual_max_f32 v26, v26, v26
-; GFX11-NEXT:    v_dual_max_f32 v9, v9, v9 :: v_dual_max_f32 v10, v10, v10
-; GFX11-NEXT:    v_dual_max_f32 v21, v21, v21 :: v_dual_max_f32 v22, v22, v22
-; GFX11-NEXT:    v_dual_max_f32 v5, v5, v5 :: v_dual_max_f32 v6, v6, v6
-; GFX11-NEXT:    v_dual_min_f32 v33, v34, v33 :: v_dual_max_f32 v16, v16, v16
-; GFX11-NEXT:    v_dual_min_f32 v34, v36, v35 :: v_dual_min_f32 v35, v38, v37
-; GFX11-NEXT:    v_max_f32_e32 v0, v0, v0
-; GFX11-NEXT:    v_dual_max_f32 v81, v81, v81 :: v_dual_and_b32 v80, 0xffff0000, v3
-; GFX11-NEXT:    v_lshlrev_b32_e32 v29, 16, v29
-; GFX11-NEXT:    v_lshlrev_b32_e32 v30, 16, v30
-; GFX11-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
-; GFX11-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
-; GFX11-NEXT:    v_lshlrev_b32_e32 v27, 16, v27
-; GFX11-NEXT:    v_lshlrev_b32_e32 v28, 16, v28
-; GFX11-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
-; GFX11-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
-; GFX11-NEXT:    v_lshlrev_b32_e32 v23, 16, v23
-; GFX11-NEXT:    v_lshlrev_b32_e32 v24, 16, v24
-; GFX11-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
-; GFX11-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
-; GFX11-NEXT:    v_lshlrev_b32_e32 v19, 16, v19
-; GFX11-NEXT:    v_lshlrev_b32_e32 v20, 16, v20
-; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX11-NEXT:    v_dual_max_f32 v70, v70, v70 :: v_dual_max_f32 v69, v69, v69
-; GFX11-NEXT:    v_dual_min_f32 v36, v48, v39 :: v_dual_min_f32 v37, v50, v49
-; GFX11-NEXT:    v_min_f32_e32 v39, v54, v53
-; GFX11-NEXT:    v_dual_min_f32 v10, v10, v26 :: v_dual_min_f32 v1, v1, v17
-; GFX11-NEXT:    v_min_f32_e32 v6, v6, v22
-; GFX11-NEXT:    v_dual_max_f32 v32, v32, v32 :: v_dual_max_f32 v55, v55, v55
-; GFX11-NEXT:    v_max_f32_e32 v52, v52, v52
-; GFX11-NEXT:    v_dual_max_f32 v64, v64, v64 :: v_dual_max_f32 v71, v71, v71
-; GFX11-NEXT:    v_max_f32_e32 v68, v68, v68
-; GFX11-NEXT:    v_max_f32_e32 v80, v80, v80
-; GFX11-NEXT:    v_max_f32_e32 v82, v82, v82
-; GFX11-NEXT:    v_dual_max_f32 v86, v86, v86 :: v_dual_max_f32 v85, v85, v85
-; GFX11-NEXT:    v_dual_max_f32 v15, v15, v15 :: v_dual_max_f32 v84, v84, v84
-; GFX11-NEXT:    v_dual_max_f32 v29, v29, v29 :: v_dual_max_f32 v30, v30, v30
-; GFX11-NEXT:    v_dual_max_f32 v13, v13, v13 :: v_dual_max_f32 v14, v14, v14
-; GFX11-NEXT:    v_dual_max_f32 v27, v27, v27 :: v_dual_max_f32 v28, v28, v28
-; GFX11-NEXT:    v_dual_max_f32 v11, v11, v11 :: v_dual_max_f32 v12, v12, v12
-; GFX11-NEXT:    v_dual_max_f32 v23, v23, v23 :: v_dual_max_f32 v24, v24, v24
-; GFX11-NEXT:    v_dual_max_f32 v7, v7, v7 :: v_dual_max_f32 v8, v8, v8
-; GFX11-NEXT:    v_dual_max_f32 v19, v19, v19 :: v_dual_max_f32 v20, v20, v20
-; GFX11-NEXT:    v_dual_max_f32 v3, v3, v3 :: v_dual_max_f32 v4, v4, v4
-; GFX11-NEXT:    v_max_f32_e32 v18, v18, v18
-; GFX11-NEXT:    v_max_f32_e32 v2, v2, v2
-; GFX11-NEXT:    v_dual_min_f32 v38, v52, v51 :: v_dual_min_f32 v53, v82, v81
-; GFX11-NEXT:    v_dual_min_f32 v48, v64, v55 :: v_dual_min_f32 v55, v86, v85
-; GFX11-NEXT:    v_dual_min_f32 v49, v66, v65 :: v_dual_min_f32 v50, v68, v67
-; GFX11-NEXT:    v_min_f32_e32 v13, v13, v29
-; GFX11-NEXT:    v_dual_min_f32 v51, v70, v69 :: v_dual_min_f32 v52, v80, v71
-; GFX11-NEXT:    v_dual_min_f32 v9, v9, v25 :: v_dual_min_f32 v54, v84, v83
-; GFX11-NEXT:    v_dual_min_f32 v5, v5, v21 :: v_dual_min_f32 v14, v14, v30
-; GFX11-NEXT:    v_dual_min_f32 v11, v11, v27 :: v_dual_min_f32 v12, v12, v28
-; GFX11-NEXT:    v_dual_min_f32 v7, v7, v23 :: v_dual_min_f32 v8, v8, v24
-; GFX11-NEXT:    v_dual_min_f32 v3, v3, v19 :: v_dual_min_f32 v4, v4, v20
-; GFX11-NEXT:    v_perm_b32 v1, v1, v54, 0x3020706
-; GFX11-NEXT:    v_perm_b32 v5, v5, v50, 0x3020706
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_perm_b32 v7, v7, v48, 0x3020706
-; GFX11-NEXT:    v_perm_b32 v3, v3, v52, 0x3020706
-; GFX11-NEXT:    v_perm_b32 v4, v4, v51, 0x3020706
-; GFX11-NEXT:    v_perm_b32 v8, v8, v39, 0x3020706
-; GFX11-NEXT:    v_perm_b32 v9, v9, v38, 0x3020706
-; GFX11-NEXT:    v_perm_b32 v10, v10, v37, 0x3020706
-; GFX11-NEXT:    v_perm_b32 v11, v11, v36, 0x3020706
-; GFX11-NEXT:    v_perm_b32 v12, v12, v35, 0x3020706
-; GFX11-NEXT:    v_perm_b32 v13, v13, v34, 0x3020706
-; GFX11-NEXT:    v_perm_b32 v14, v14, v33, 0x3020706
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_dual_min_f32 v0, v0, v16 :: v_dual_lshlrev_b32 v17, 16, v31
-; GFX11-NEXT:    v_and_b32_e32 v16, 0xffff0000, v31
-; GFX11-NEXT:    v_perm_b32 v6, v6, v49, 0x3020706
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_dual_max_f32 v17, v17, v17 :: v_dual_min_f32 v2, v2, v18
-; GFX11-NEXT:    v_max_f32_e32 v16, v16, v16
-; GFX11-NEXT:    v_perm_b32 v0, v0, v55, 0x3020706
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v68, 16, v26
+; GFX11-NEXT:    v_lshlrev_b32_e32 v69, 16, v10
+; GFX11-NEXT:    v_and_b32_e32 v26, 0xffff0000, v26
+; GFX11-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
+; GFX11-NEXT:    v_lshlrev_b32_e32 v70, 16, v27
+; GFX11-NEXT:    v_lshlrev_b32_e32 v71, 16, v11
+; GFX11-NEXT:    v_lshlrev_b32_e32 v50, 16, v21
+; GFX11-NEXT:    v_lshlrev_b32_e32 v54, 16, v23
+; GFX11-NEXT:    v_lshlrev_b32_e32 v55, 16, v7
+; GFX11-NEXT:    v_lshlrev_b32_e32 v64, 16, v24
+; GFX11-NEXT:    v_lshlrev_b32_e32 v65, 16, v8
+; GFX11-NEXT:    v_and_b32_e32 v24, 0xffff0000, v24
+; GFX11-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
+; GFX11-NEXT:    v_lshlrev_b32_e32 v51, 16, v5
+; GFX11-NEXT:    v_dual_min_f32 v10, v10, v26 :: v_dual_and_b32 v5, 0xffff0000, v5
+; GFX11-NEXT:    v_lshlrev_b32_e32 v66, 16, v25
+; GFX11-NEXT:    v_and_b32_e32 v25, 0xffff0000, v25
+; GFX11-NEXT:    v_and_b32_e32 v27, 0xffff0000, v27
+; GFX11-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
+; GFX11-NEXT:    v_lshlrev_b32_e32 v80, 16, v28
+; GFX11-NEXT:    v_lshlrev_b32_e32 v81, 16, v12
+; GFX11-NEXT:    v_lshlrev_b32_e32 v52, 16, v22
+; GFX11-NEXT:    v_and_b32_e32 v28, 0xffff0000, v28
+; GFX11-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
+; GFX11-NEXT:    v_lshlrev_b32_e32 v53, 16, v6
+; GFX11-NEXT:    v_lshlrev_b32_e32 v82, 16, v29
+; GFX11-NEXT:    v_lshlrev_b32_e32 v83, 16, v13
+; GFX11-NEXT:    v_and_b32_e32 v23, 0xffff0000, v23
+; GFX11-NEXT:    v_and_b32_e32 v29, 0xffff0000, v29
+; GFX11-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
+; GFX11-NEXT:    v_lshlrev_b32_e32 v84, 16, v30
+; GFX11-NEXT:    v_lshlrev_b32_e32 v85, 16, v14
+; GFX11-NEXT:    v_and_b32_e32 v22, 0xffff0000, v22
+; GFX11-NEXT:    v_and_b32_e32 v30, 0xffff0000, v30
+; GFX11-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
+; GFX11-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
+; GFX11-NEXT:    v_lshlrev_b32_e32 v86, 16, v15
+; GFX11-NEXT:    v_lshlrev_b32_e32 v67, 16, v9
+; GFX11-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
+; GFX11-NEXT:    v_lshlrev_b32_e32 v48, 16, v20
+; GFX11-NEXT:    v_dual_min_f32 v11, v11, v27 :: v_dual_and_b32 v20, 0xffff0000, v20
+; GFX11-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
+; GFX11-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
+; GFX11-NEXT:    v_dual_min_f32 v26, v71, v70 :: v_dual_lshlrev_b32 v49, 16, v4
+; GFX11-NEXT:    v_dual_min_f32 v13, v13, v29 :: v_dual_and_b32 v4, 0xffff0000, v4
+; GFX11-NEXT:    v_lshlrev_b32_e32 v35, 16, v1
+; GFX11-NEXT:    v_lshlrev_b32_e32 v37, 16, v2
+; GFX11-NEXT:    v_lshlrev_b32_e32 v38, 16, v19
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-NEXT:    v_min_f32_e32 v4, v4, v20
+; GFX11-NEXT:    v_dual_min_f32 v8, v8, v24 :: v_dual_min_f32 v9, v9, v25
+; GFX11-NEXT:    v_min_f32_e32 v25, v69, v68
+; GFX11-NEXT:    v_dual_min_f32 v20, v51, v50 :: v_dual_lshlrev_b32 v39, 16, v3
+; GFX11-NEXT:    v_min_f32_e32 v27, v81, v80
+; GFX11-NEXT:    v_min_f32_e32 v12, v12, v28
+; GFX11-NEXT:    v_dual_min_f32 v28, v83, v82 :: v_dual_min_f32 v29, v85, v84
+; GFX11-NEXT:    v_dual_min_f32 v6, v6, v22 :: v_dual_and_b32 v3, 0xffff0000, v3
+; GFX11-NEXT:    v_min_f32_e32 v22, v55, v54
+; GFX11-NEXT:    v_lshlrev_b32_e32 v36, 16, v18
+; GFX11-NEXT:    v_lshlrev_b32_e32 v34, 16, v17
+; GFX11-NEXT:    v_and_b32_e32 v17, 0xffff0000, v17
+; GFX11-NEXT:    v_and_b32_e32 v18, 0xffff0000, v18
+; GFX11-NEXT:    v_min_f32_e32 v14, v14, v30
+; GFX11-NEXT:    v_dual_min_f32 v7, v7, v23 :: v_dual_and_b32 v2, 0xffff0000, v2
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX11-NEXT:    v_min_f32_e32 v23, v65, v64
+; GFX11-NEXT:    v_and_b32_e32 v19, 0xffff0000, v19
+; GFX11-NEXT:    v_dual_min_f32 v24, v67, v66 :: v_dual_and_b32 v21, 0xffff0000, v21
+; GFX11-NEXT:    v_min_f32_e32 v2, v2, v18
+; GFX11-NEXT:    v_dual_min_f32 v1, v1, v17 :: v_dual_lshlrev_b32 v32, 16, v16
+; GFX11-NEXT:    v_min_f32_e32 v18, v39, v38
+; GFX11-NEXT:    v_dual_min_f32 v3, v3, v19 :: v_dual_and_b32 v16, 0xffff0000, v16
+; GFX11-NEXT:    v_min_f32_e32 v19, v49, v48
+; GFX11-NEXT:    v_min_f32_e32 v17, v37, v36
+; GFX11-NEXT:    v_lshlrev_b32_e32 v33, 16, v0
+; GFX11-NEXT:    v_dual_min_f32 v5, v5, v21 :: v_dual_and_b32 v0, 0xffff0000, v0
+; GFX11-NEXT:    v_min_f32_e32 v21, v53, v52
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_perm_b32 v2, v2, v17, 0x7060302
+; GFX11-NEXT:    v_perm_b32 v3, v3, v18, 0x7060302
+; GFX11-NEXT:    v_min_f32_e32 v0, v0, v16
+; GFX11-NEXT:    v_min_f32_e32 v16, v35, v34
+; GFX11-NEXT:    v_min_f32_e32 v32, v33, v32
+; GFX11-NEXT:    v_perm_b32 v4, v4, v19, 0x7060302
+; GFX11-NEXT:    v_perm_b32 v5, v5, v20, 0x7060302
+; GFX11-NEXT:    v_perm_b32 v6, v6, v21, 0x7060302
+; GFX11-NEXT:    v_perm_b32 v1, v1, v16, 0x7060302
+; GFX11-NEXT:    v_perm_b32 v0, v0, v32, 0x7060302
+; GFX11-NEXT:    v_perm_b32 v7, v7, v22, 0x7060302
+; GFX11-NEXT:    v_perm_b32 v8, v8, v23, 0x7060302
+; GFX11-NEXT:    v_perm_b32 v9, v9, v24, 0x7060302
+; GFX11-NEXT:    v_perm_b32 v10, v10, v25, 0x7060302
+; GFX11-NEXT:    v_perm_b32 v11, v11, v26, 0x7060302
+; GFX11-NEXT:    v_perm_b32 v12, v12, v27, 0x7060302
+; GFX11-NEXT:    v_perm_b32 v13, v13, v28, 0x7060302
+; GFX11-NEXT:    v_perm_b32 v14, v14, v29, 0x7060302
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v16, 16, v31
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_dual_min_f32 v16, v86, v16 :: v_dual_and_b32 v17, 0xffff0000, v31
 ; GFX11-NEXT:    v_min_f32_e32 v15, v15, v17
-; GFX11-NEXT:    v_perm_b32 v2, v2, v53, 0x3020706
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_min_f32_e32 v16, v32, v16
-; GFX11-NEXT:    v_perm_b32 v15, v15, v16, 0x3020706
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_perm_b32 v15, v15, v16, 0x7060302
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = call <32 x bfloat> @llvm.minnum.v32bf16(<32 x bfloat> %a, <32 x bfloat> %b)
   ret <32 x bfloat> %op
@@ -16021,46 +15376,38 @@ define bfloat @v_maxnum_bf16(bfloat %a, bfloat %b) {
 ; GFX8-LABEL: v_maxnum_bf16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX8-NEXT:    v_mul_f32_e32 v1, 1.0, v1
-; GFX8-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX8-NEXT:    v_max_f32_e32 v0, v0, v1
-; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_maxnum_bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX9-NEXT:    v_max_f32_e32 v1, v1, v1
-; GFX9-NEXT:    v_max_f32_e32 v0, v0, v0
+; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX9-NEXT:    v_max_f32_e32 v0, v0, v1
-; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_maxnum_bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX10-NEXT:    v_max_f32_e32 v1, v1, v1
-; GFX10-NEXT:    v_max_f32_e32 v0, v0, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX10-NEXT:    v_max_f32_e32 v0, v0, v1
-; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_maxnum_bf16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_dual_max_f32 v0, v0, v0 :: v_dual_and_b32 v1, 0xffff0000, v1
-; GFX11-NEXT:    v_max_f32_e32 v1, v1, v1
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_max_f32_e32 v0, v0, v1
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = call bfloat @llvm.maxnum.bf16(bfloat %a, bfloat %b)
   ret bfloat %op
@@ -16104,68 +15451,53 @@ define <2 x bfloat> @v_maxnum_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) {
 ; GFX8-LABEL: v_maxnum_v2bf16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v1
-; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v0
-; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX8-NEXT:    v_mul_f32_e32 v2, 1.0, v2
-; GFX8-NEXT:    v_mul_f32_e32 v3, 1.0, v3
-; GFX8-NEXT:    v_mul_f32_e32 v1, 1.0, v1
-; GFX8-NEXT:    v_mul_f32_e32 v0, 1.0, v0
-; GFX8-NEXT:    v_max_f32_e32 v2, v3, v2
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
+; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
+; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GFX8-NEXT:    v_max_f32_e32 v0, v0, v1
-; GFX8-NEXT:    s_mov_b32 s4, 0x3020706
-; GFX8-NEXT:    v_perm_b32 v0, v0, v2, s4
+; GFX8-NEXT:    v_max_f32_e32 v2, v3, v2
+; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX8-NEXT:    v_alignbit_b32 v0, v0, v2, 16
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_maxnum_v2bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff0000, v1
-; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v0
-; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX9-NEXT:    v_max_f32_e32 v2, v2, v2
-; GFX9-NEXT:    v_max_f32_e32 v3, v3, v3
-; GFX9-NEXT:    v_max_f32_e32 v1, v1, v1
-; GFX9-NEXT:    v_max_f32_e32 v0, v0, v0
+; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
+; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
+; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GFX9-NEXT:    v_max_f32_e32 v2, v3, v2
 ; GFX9-NEXT:    v_max_f32_e32 v0, v0, v1
-; GFX9-NEXT:    s_mov_b32 s4, 0x3020706
+; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
 ; GFX9-NEXT:    v_perm_b32 v0, v0, v2, s4
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_maxnum_v2bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v1
-; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v0
-; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX10-NEXT:    v_max_f32_e32 v2, v2, v2
-; GFX10-NEXT:    v_max_f32_e32 v3, v3, v3
-; GFX10-NEXT:    v_max_f32_e32 v1, v1, v1
-; GFX10-NEXT:    v_max_f32_e32 v0, v0, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
+; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
+; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GFX10-NEXT:    v_max_f32_e32 v2, v3, v2
 ; GFX10-NEXT:    v_max_f32_e32 v0, v0, v1
-; GFX10-NEXT:    v_perm_b32 v0, v0, v2, 0x3020706
+; GFX10-NEXT:    v_perm_b32 v0, v0, v2, 0x7060302
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_maxnum_v2bf16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v0
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v1
-; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_dual_max_f32 v3, v3, v3 :: v_dual_max_f32 v2, v2, v2
-; GFX11-NEXT:    v_dual_max_f32 v0, v0, v0 :: v_dual_max_f32 v1, v1, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_max_f32_e32 v2, v3, v2
+; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX11-NEXT:    v_max_f32_e32 v0, v0, v1
+; GFX11-NEXT:    v_max_f32_e32 v2, v3, v2
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_perm_b32 v0, v0, v2, 0x3020706
+; GFX11-NEXT:    v_perm_b32 v0, v0, v2, 0x7060302
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = call <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> %a, <2 x bfloat> %b)
   ret <2 x bfloat> %op
@@ -16221,91 +15553,82 @@ define <3 x bfloat> @v_maxnum_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) {
 ; GFX8-LABEL: v_maxnum_v3bf16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX8-NEXT:    v_mul_f32_e32 v3, 1.0, v3
-; GFX8-NEXT:    v_mul_f32_e32 v1, 1.0, v1
-; GFX8-NEXT:    v_max_f32_e32 v1, v1, v3
-; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v2
-; GFX8-NEXT:    v_and_b32_e32 v4, 0xffff0000, v0
-; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX8-NEXT:    v_mul_f32_e32 v3, 1.0, v3
-; GFX8-NEXT:    v_mul_f32_e32 v4, 1.0, v4
-; GFX8-NEXT:    v_mul_f32_e32 v2, 1.0, v2
-; GFX8-NEXT:    v_mul_f32_e32 v0, 1.0, v0
-; GFX8-NEXT:    v_max_f32_e32 v3, v4, v3
+; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
+; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GFX8-NEXT:    v_max_f32_e32 v0, v0, v2
-; GFX8-NEXT:    s_mov_b32 s4, 0x3020706
+; GFX8-NEXT:    v_max_f32_e32 v4, v5, v4
+; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX8-NEXT:    v_alignbit_b32 v0, v0, v4, 16
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
+; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v1
+; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX8-NEXT:    v_max_f32_e32 v1, v1, v3
+; GFX8-NEXT:    v_max_f32_e32 v2, v4, v2
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX8-NEXT:    v_perm_b32 v0, v0, v3, s4
+; GFX8-NEXT:    v_alignbit_b32 v1, v1, v2, 16
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_maxnum_v3bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX9-NEXT:    v_max_f32_e32 v3, v3, v3
-; GFX9-NEXT:    v_max_f32_e32 v1, v1, v1
-; GFX9-NEXT:    v_max_f32_e32 v1, v1, v3
-; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v2
-; GFX9-NEXT:    v_and_b32_e32 v4, 0xffff0000, v0
-; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX9-NEXT:    v_max_f32_e32 v3, v3, v3
-; GFX9-NEXT:    v_max_f32_e32 v4, v4, v4
-; GFX9-NEXT:    v_max_f32_e32 v2, v2, v2
-; GFX9-NEXT:    v_max_f32_e32 v0, v0, v0
-; GFX9-NEXT:    v_max_f32_e32 v3, v4, v3
+; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
+; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    v_max_f32_e32 v4, v5, v4
 ; GFX9-NEXT:    v_max_f32_e32 v0, v0, v2
-; GFX9-NEXT:    s_mov_b32 s4, 0x3020706
-; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX9-NEXT:    v_perm_b32 v0, v0, v3, s4
+; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
+; GFX9-NEXT:    v_perm_b32 v0, v0, v4, s4
+; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
+; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 16, v1
+; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX9-NEXT:    v_max_f32_e32 v2, v4, v2
+; GFX9-NEXT:    v_max_f32_e32 v1, v1, v3
+; GFX9-NEXT:    v_perm_b32 v1, v1, v2, s4
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_maxnum_v3bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX10-NEXT:    v_and_b32_e32 v4, 0xffff0000, v2
-; GFX10-NEXT:    v_and_b32_e32 v5, 0xffff0000, v0
-; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX10-NEXT:    v_max_f32_e32 v3, v3, v3
-; GFX10-NEXT:    v_max_f32_e32 v4, v4, v4
-; GFX10-NEXT:    v_max_f32_e32 v5, v5, v5
-; GFX10-NEXT:    v_max_f32_e32 v2, v2, v2
-; GFX10-NEXT:    v_max_f32_e32 v0, v0, v0
-; GFX10-NEXT:    v_max_f32_e32 v1, v1, v1
+; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
+; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 16, v3
+; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 16, v1
+; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX10-NEXT:    v_max_f32_e32 v4, v5, v4
 ; GFX10-NEXT:    v_max_f32_e32 v0, v0, v2
+; GFX10-NEXT:    v_max_f32_e32 v2, v7, v6
 ; GFX10-NEXT:    v_max_f32_e32 v1, v1, v3
-; GFX10-NEXT:    v_perm_b32 v0, v0, v4, 0x3020706
-; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX10-NEXT:    v_perm_b32 v0, v0, v4, 0x7060302
+; GFX10-NEXT:    v_perm_b32 v1, v1, v2, 0x7060302
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_maxnum_v3bf16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xffff0000, v2
-; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_dual_max_f32 v4, v4, v4 :: v_dual_lshlrev_b32 v1, 16, v1
-; GFX11-NEXT:    v_dual_max_f32 v2, v2, v2 :: v_dual_and_b32 v5, 0xffff0000, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_dual_max_f32 v1, v1, v1 :: v_dual_lshlrev_b32 v0, 16, v0
-; GFX11-NEXT:    v_dual_max_f32 v3, v3, v3 :: v_dual_max_f32 v0, v0, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_dual_max_f32 v1, v1, v3 :: v_dual_max_f32 v0, v0, v2
-; GFX11-NEXT:    v_max_f32_e32 v5, v5, v5
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    v_lshlrev_b32_e32 v6, 16, v3
+; GFX11-NEXT:    v_lshlrev_b32_e32 v7, 16, v1
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX11-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_dual_max_f32 v1, v1, v3 :: v_dual_and_b32 v0, 0xffff0000, v0
+; GFX11-NEXT:    v_max_f32_e32 v0, v0, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_max_f32_e32 v4, v5, v4
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_perm_b32 v0, v0, v4, 0x3020706
+; GFX11-NEXT:    v_max_f32_e32 v2, v7, v6
+; GFX11-NEXT:    v_perm_b32 v0, v0, v4, 0x7060302
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT:    v_perm_b32 v1, v1, v2, 0x7060302
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = call <3 x bfloat> @llvm.maxnum.v3bf16(<3 x bfloat> %a, <3 x bfloat> %b)
   ret <3 x bfloat> %op
@@ -16373,113 +15696,82 @@ define <4 x bfloat> @v_maxnum_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
 ; GFX8-LABEL: v_maxnum_v4bf16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_and_b32_e32 v4, 0xffff0000, v3
-; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v1
-; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX8-NEXT:    v_mul_f32_e32 v4, 1.0, v4
-; GFX8-NEXT:    v_mul_f32_e32 v5, 1.0, v5
-; GFX8-NEXT:    v_mul_f32_e32 v3, 1.0, v3
-; GFX8-NEXT:    v_mul_f32_e32 v1, 1.0, v1
+; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
+; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    v_max_f32_e32 v0, v0, v2
 ; GFX8-NEXT:    v_max_f32_e32 v4, v5, v4
+; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX8-NEXT:    v_alignbit_b32 v0, v0, v4, 16
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
+; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v1
+; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX8-NEXT:    v_max_f32_e32 v1, v1, v3
-; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v2
-; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v0
-; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX8-NEXT:    v_mul_f32_e32 v3, 1.0, v3
-; GFX8-NEXT:    v_mul_f32_e32 v5, 1.0, v5
-; GFX8-NEXT:    v_mul_f32_e32 v2, 1.0, v2
-; GFX8-NEXT:    v_mul_f32_e32 v0, 1.0, v0
-; GFX8-NEXT:    v_max_f32_e32 v3, v5, v3
-; GFX8-NEXT:    v_max_f32_e32 v0, v0, v2
-; GFX8-NEXT:    s_mov_b32 s4, 0x3020706
-; GFX8-NEXT:    v_perm_b32 v0, v0, v3, s4
-; GFX8-NEXT:    v_perm_b32 v1, v1, v4, s4
+; GFX8-NEXT:    v_max_f32_e32 v2, v4, v2
+; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX8-NEXT:    v_alignbit_b32 v1, v1, v2, 16
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_maxnum_v4bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
-; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
-; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
-; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX9-NEXT:    v_max_f32_e32 v4, v4, v4
-; GFX9-NEXT:    v_max_f32_e32 v5, v5, v5
-; GFX9-NEXT:    v_max_f32_e32 v3, v3, v3
-; GFX9-NEXT:    v_max_f32_e32 v1, v1, v1
+; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
+; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GFX9-NEXT:    v_max_f32_e32 v4, v5, v4
-; GFX9-NEXT:    v_max_f32_e32 v1, v1, v3
-; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v2
-; GFX9-NEXT:    v_and_b32_e32 v5, 0xffff0000, v0
-; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX9-NEXT:    v_max_f32_e32 v3, v3, v3
-; GFX9-NEXT:    v_max_f32_e32 v5, v5, v5
-; GFX9-NEXT:    v_max_f32_e32 v2, v2, v2
-; GFX9-NEXT:    v_max_f32_e32 v0, v0, v0
-; GFX9-NEXT:    v_max_f32_e32 v3, v5, v3
 ; GFX9-NEXT:    v_max_f32_e32 v0, v0, v2
-; GFX9-NEXT:    s_mov_b32 s4, 0x3020706
+; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
+; GFX9-NEXT:    v_perm_b32 v0, v0, v4, s4
+; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
+; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 16, v1
+; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
 ; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX9-NEXT:    v_perm_b32 v0, v0, v3, s4
-; GFX9-NEXT:    v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_max_f32_e32 v2, v4, v2
+; GFX9-NEXT:    v_max_f32_e32 v1, v1, v3
+; GFX9-NEXT:    v_perm_b32 v1, v1, v2, s4
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_maxnum_v4bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
-; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
+; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
+; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 16, v3
+; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 16, v1
 ; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
 ; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX10-NEXT:    v_and_b32_e32 v6, 0xffff0000, v2
-; GFX10-NEXT:    v_and_b32_e32 v7, 0xffff0000, v0
-; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX10-NEXT:    v_max_f32_e32 v3, v3, v3
-; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX10-NEXT:    v_max_f32_e32 v1, v1, v1
-; GFX10-NEXT:    v_max_f32_e32 v4, v4, v4
-; GFX10-NEXT:    v_max_f32_e32 v5, v5, v5
-; GFX10-NEXT:    v_max_f32_e32 v6, v6, v6
-; GFX10-NEXT:    v_max_f32_e32 v7, v7, v7
-; GFX10-NEXT:    v_max_f32_e32 v2, v2, v2
-; GFX10-NEXT:    v_max_f32_e32 v0, v0, v0
-; GFX10-NEXT:    v_max_f32_e32 v1, v1, v3
-; GFX10-NEXT:    v_max_f32_e32 v3, v5, v4
-; GFX10-NEXT:    v_max_f32_e32 v4, v7, v6
+; GFX10-NEXT:    v_max_f32_e32 v4, v5, v4
 ; GFX10-NEXT:    v_max_f32_e32 v0, v0, v2
-; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX10-NEXT:    v_perm_b32 v0, v0, v4, 0x3020706
-; GFX10-NEXT:    v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT:    v_max_f32_e32 v2, v7, v6
+; GFX10-NEXT:    v_max_f32_e32 v1, v1, v3
+; GFX10-NEXT:    v_perm_b32 v0, v0, v4, 0x7060302
+; GFX10-NEXT:    v_perm_b32 v1, v1, v2, 0x7060302
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_maxnum_v4bf16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_dual_max_f32 v5, v5, v5 :: v_dual_lshlrev_b32 v4, 16, v3
-; GFX11-NEXT:    v_dual_max_f32 v4, v4, v4 :: v_dual_and_b32 v3, 0xffff0000, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v6, 16, v3
+; GFX11-NEXT:    v_lshlrev_b32_e32 v7, 16, v1
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX11-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_dual_max_f32 v1, v1, v3 :: v_dual_and_b32 v0, 0xffff0000, v0
+; GFX11-NEXT:    v_max_f32_e32 v0, v0, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_max_f32_e32 v4, v5, v4
-; GFX11-NEXT:    v_dual_max_f32 v3, v3, v3 :: v_dual_and_b32 v6, 0xffff0000, v2
-; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-NEXT:    v_and_b32_e32 v7, 0xffff0000, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_dual_max_f32 v5, v6, v6 :: v_dual_lshlrev_b32 v0, 16, v0
-; GFX11-NEXT:    v_dual_max_f32 v2, v2, v2 :: v_dual_and_b32 v1, 0xffff0000, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_dual_max_f32 v6, v7, v7 :: v_dual_max_f32 v1, v1, v1
-; GFX11-NEXT:    v_dual_max_f32 v0, v0, v0 :: v_dual_max_f32 v1, v1, v3
-; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v4
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_max_f32_e32 v4, v6, v5
-; GFX11-NEXT:    v_dual_max_f32 v0, v0, v2 :: v_dual_and_b32 v1, 0xffff0000, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_perm_b32 v0, v0, v4, 0x3020706
-; GFX11-NEXT:    v_or_b32_e32 v1, v3, v1
+; GFX11-NEXT:    v_max_f32_e32 v2, v7, v6
+; GFX11-NEXT:    v_perm_b32 v0, v0, v4, 0x7060302
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT:    v_perm_b32 v1, v1, v2, 0x7060302
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = call <4 x bfloat> @llvm.maxnum.v4bf16(<4 x bfloat> %a, <4 x bfloat> %b)
   ret <4 x bfloat> %op
@@ -16595,206 +15887,138 @@ define <8 x bfloat> @v_maxnum_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
 ; GFX8-LABEL: v_maxnum_v8bf16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_and_b32_e32 v8, 0xffff0000, v7
-; GFX8-NEXT:    v_and_b32_e32 v9, 0xffff0000, v3
-; GFX8-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
-; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX8-NEXT:    v_mul_f32_e32 v8, 1.0, v8
-; GFX8-NEXT:    v_mul_f32_e32 v9, 1.0, v9
-; GFX8-NEXT:    v_mul_f32_e32 v7, 1.0, v7
-; GFX8-NEXT:    v_mul_f32_e32 v3, 1.0, v3
+; GFX8-NEXT:    v_lshlrev_b32_e32 v8, 16, v4
+; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 16, v0
+; GFX8-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    v_max_f32_e32 v0, v0, v4
 ; GFX8-NEXT:    v_max_f32_e32 v8, v9, v8
-; GFX8-NEXT:    v_max_f32_e32 v3, v3, v7
-; GFX8-NEXT:    v_and_b32_e32 v7, 0xffff0000, v6
-; GFX8-NEXT:    v_and_b32_e32 v9, 0xffff0000, v2
-; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
-; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX8-NEXT:    v_mul_f32_e32 v7, 1.0, v7
-; GFX8-NEXT:    v_mul_f32_e32 v9, 1.0, v9
-; GFX8-NEXT:    v_mul_f32_e32 v6, 1.0, v6
-; GFX8-NEXT:    v_mul_f32_e32 v2, 1.0, v2
-; GFX8-NEXT:    v_max_f32_e32 v7, v9, v7
-; GFX8-NEXT:    v_max_f32_e32 v2, v2, v6
-; GFX8-NEXT:    v_and_b32_e32 v6, 0xffff0000, v5
-; GFX8-NEXT:    v_and_b32_e32 v9, 0xffff0000, v1
-; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX8-NEXT:    v_mul_f32_e32 v6, 1.0, v6
-; GFX8-NEXT:    v_mul_f32_e32 v9, 1.0, v9
-; GFX8-NEXT:    v_mul_f32_e32 v5, 1.0, v5
-; GFX8-NEXT:    v_mul_f32_e32 v1, 1.0, v1
-; GFX8-NEXT:    v_max_f32_e32 v6, v9, v6
+; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX8-NEXT:    v_alignbit_b32 v0, v0, v8, 16
+; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v5
+; GFX8-NEXT:    v_lshlrev_b32_e32 v8, 16, v1
+; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
+; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX8-NEXT:    v_max_f32_e32 v1, v1, v5
-; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v4
-; GFX8-NEXT:    v_and_b32_e32 v9, 0xffff0000, v0
-; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX8-NEXT:    v_mul_f32_e32 v5, 1.0, v5
-; GFX8-NEXT:    v_mul_f32_e32 v9, 1.0, v9
-; GFX8-NEXT:    v_mul_f32_e32 v4, 1.0, v4
-; GFX8-NEXT:    v_mul_f32_e32 v0, 1.0, v0
-; GFX8-NEXT:    v_max_f32_e32 v5, v9, v5
-; GFX8-NEXT:    v_max_f32_e32 v0, v0, v4
-; GFX8-NEXT:    s_mov_b32 s4, 0x3020706
-; GFX8-NEXT:    v_perm_b32 v0, v0, v5, s4
-; GFX8-NEXT:    v_perm_b32 v1, v1, v6, s4
-; GFX8-NEXT:    v_perm_b32 v2, v2, v7, s4
-; GFX8-NEXT:    v_perm_b32 v3, v3, v8, s4
+; GFX8-NEXT:    v_max_f32_e32 v4, v8, v4
+; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX8-NEXT:    v_alignbit_b32 v1, v1, v4, 16
+; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v6
+; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v2
+; GFX8-NEXT:    v_max_f32_e32 v4, v5, v4
+; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
+; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX8-NEXT:    v_max_f32_e32 v2, v2, v5
+; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX8-NEXT:    v_alignbit_b32 v2, v2, v4, 16
+; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v7
+; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v3
+; GFX8-NEXT:    v_max_f32_e32 v4, v5, v4
+; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v7
+; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX8-NEXT:    v_max_f32_e32 v3, v3, v5
+; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX8-NEXT:    v_alignbit_b32 v3, v3, v4, 16
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_maxnum_v8bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v8, 0xffff0000, v7
-; GFX9-NEXT:    v_and_b32_e32 v9, 0xffff0000, v3
-; GFX9-NEXT:    v_max_f32_e32 v8, v8, v8
-; GFX9-NEXT:    v_max_f32_e32 v9, v9, v9
+; GFX9-NEXT:    v_lshlrev_b32_e32 v8, 16, v4
+; GFX9-NEXT:    v_lshlrev_b32_e32 v9, 16, v0
+; GFX9-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GFX9-NEXT:    v_max_f32_e32 v8, v9, v8
-; GFX9-NEXT:    v_and_b32_e32 v9, 0xffff0000, v5
-; GFX9-NEXT:    v_and_b32_e32 v10, 0xffff0000, v1
-; GFX9-NEXT:    v_max_f32_e32 v9, v9, v9
-; GFX9-NEXT:    v_max_f32_e32 v10, v10, v10
-; GFX9-NEXT:    v_max_f32_e32 v9, v10, v9
-; GFX9-NEXT:    v_and_b32_e32 v10, 0xffff0000, v6
-; GFX9-NEXT:    v_and_b32_e32 v11, 0xffff0000, v2
-; GFX9-NEXT:    v_max_f32_e32 v10, v10, v10
-; GFX9-NEXT:    v_max_f32_e32 v11, v11, v11
-; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX9-NEXT:    v_max_f32_e32 v10, v11, v10
-; GFX9-NEXT:    v_and_b32_e32 v11, 0xffff0000, v4
-; GFX9-NEXT:    v_and_b32_e32 v12, 0xffff0000, v0
-; GFX9-NEXT:    v_max_f32_e32 v5, v5, v5
-; GFX9-NEXT:    v_max_f32_e32 v1, v1, v1
-; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX9-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
-; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX9-NEXT:    v_max_f32_e32 v1, v1, v5
-; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 16, v6
-; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX9-NEXT:    v_max_f32_e32 v4, v4, v4
-; GFX9-NEXT:    v_max_f32_e32 v0, v0, v0
-; GFX9-NEXT:    v_max_f32_e32 v11, v11, v11
-; GFX9-NEXT:    v_max_f32_e32 v12, v12, v12
-; GFX9-NEXT:    v_max_f32_e32 v7, v7, v7
-; GFX9-NEXT:    v_max_f32_e32 v3, v3, v3
-; GFX9-NEXT:    v_max_f32_e32 v5, v5, v5
-; GFX9-NEXT:    v_max_f32_e32 v2, v2, v2
 ; GFX9-NEXT:    v_max_f32_e32 v0, v0, v4
-; GFX9-NEXT:    v_and_b32_e32 v4, 0xffff0000, v9
-; GFX9-NEXT:    v_max_f32_e32 v11, v12, v11
-; GFX9-NEXT:    v_max_f32_e32 v3, v3, v7
+; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
+; GFX9-NEXT:    v_perm_b32 v0, v0, v8, s4
+; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 16, v5
+; GFX9-NEXT:    v_lshlrev_b32_e32 v8, 16, v1
+; GFX9-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
+; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX9-NEXT:    v_max_f32_e32 v4, v8, v4
+; GFX9-NEXT:    v_max_f32_e32 v1, v1, v5
+; GFX9-NEXT:    v_perm_b32 v1, v1, v4, s4
+; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 16, v6
+; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 16, v2
+; GFX9-NEXT:    v_max_f32_e32 v4, v5, v4
+; GFX9-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
+; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX9-NEXT:    v_max_f32_e32 v2, v2, v5
-; GFX9-NEXT:    s_mov_b32 s4, 0x3020706
-; GFX9-NEXT:    v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-NEXT:    v_and_b32_e32 v4, 0xffff0000, v8
-; GFX9-NEXT:    v_perm_b32 v0, v0, v11, s4
-; GFX9-NEXT:    v_perm_b32 v2, v2, v10, s4
-; GFX9-NEXT:    v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_perm_b32 v2, v2, v4, s4
+; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 16, v7
+; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 16, v3
+; GFX9-NEXT:    v_max_f32_e32 v4, v5, v4
+; GFX9-NEXT:    v_and_b32_e32 v5, 0xffff0000, v7
+; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX9-NEXT:    v_max_f32_e32 v3, v3, v5
+; GFX9-NEXT:    v_perm_b32 v3, v3, v4, s4
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_maxnum_v8bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_and_b32_e32 v8, 0xffff0000, v7
-; GFX10-NEXT:    v_and_b32_e32 v9, 0xffff0000, v3
-; GFX10-NEXT:    v_and_b32_e32 v10, 0xffff0000, v5
-; GFX10-NEXT:    v_and_b32_e32 v11, 0xffff0000, v1
-; GFX10-NEXT:    v_and_b32_e32 v12, 0xffff0000, v2
-; GFX10-NEXT:    v_max_f32_e32 v8, v8, v8
-; GFX10-NEXT:    v_max_f32_e32 v9, v9, v9
-; GFX10-NEXT:    v_max_f32_e32 v10, v10, v10
-; GFX10-NEXT:    v_and_b32_e32 v13, 0xffff0000, v4
-; GFX10-NEXT:    v_and_b32_e32 v14, 0xffff0000, v0
-; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
+; GFX10-NEXT:    v_lshlrev_b32_e32 v8, 16, v4
+; GFX10-NEXT:    v_lshlrev_b32_e32 v9, 16, v0
+; GFX10-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v10, 16, v5
+; GFX10-NEXT:    v_lshlrev_b32_e32 v11, 16, v1
 ; GFX10-NEXT:    v_max_f32_e32 v8, v9, v8
-; GFX10-NEXT:    v_max_f32_e32 v9, v11, v11
-; GFX10-NEXT:    v_and_b32_e32 v11, 0xffff0000, v6
-; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
-; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX10-NEXT:    v_max_f32_e32 v9, v9, v10
-; GFX10-NEXT:    v_max_f32_e32 v10, v11, v11
-; GFX10-NEXT:    v_max_f32_e32 v11, v12, v12
-; GFX10-NEXT:    v_max_f32_e32 v12, v13, v13
-; GFX10-NEXT:    v_max_f32_e32 v13, v14, v14
-; GFX10-NEXT:    v_max_f32_e32 v7, v7, v7
-; GFX10-NEXT:    v_max_f32_e32 v3, v3, v3
-; GFX10-NEXT:    v_max_f32_e32 v5, v5, v5
-; GFX10-NEXT:    v_max_f32_e32 v1, v1, v1
-; GFX10-NEXT:    v_max_f32_e32 v4, v4, v4
-; GFX10-NEXT:    v_max_f32_e32 v0, v0, v0
-; GFX10-NEXT:    v_max_f32_e32 v6, v6, v6
-; GFX10-NEXT:    v_max_f32_e32 v2, v2, v2
-; GFX10-NEXT:    v_max_f32_e32 v10, v11, v10
-; GFX10-NEXT:    v_max_f32_e32 v11, v13, v12
-; GFX10-NEXT:    v_max_f32_e32 v3, v3, v7
-; GFX10-NEXT:    v_max_f32_e32 v1, v1, v5
+; GFX10-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
 ; GFX10-NEXT:    v_max_f32_e32 v0, v0, v4
-; GFX10-NEXT:    v_and_b32_e32 v4, 0xffff0000, v9
+; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX10-NEXT:    v_max_f32_e32 v4, v11, v10
+; GFX10-NEXT:    v_lshlrev_b32_e32 v9, 16, v6
+; GFX10-NEXT:    v_lshlrev_b32_e32 v10, 16, v2
+; GFX10-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
+; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX10-NEXT:    v_lshlrev_b32_e32 v11, 16, v7
+; GFX10-NEXT:    v_lshlrev_b32_e32 v12, 16, v3
+; GFX10-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
+; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX10-NEXT:    v_max_f32_e32 v1, v1, v5
+; GFX10-NEXT:    v_max_f32_e32 v5, v10, v9
 ; GFX10-NEXT:    v_max_f32_e32 v2, v2, v6
-; GFX10-NEXT:    v_and_b32_e32 v5, 0xffff0000, v8
-; GFX10-NEXT:    v_perm_b32 v0, v0, v11, 0x3020706
-; GFX10-NEXT:    v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX10-NEXT:    v_perm_b32 v2, v2, v10, 0x3020706
-; GFX10-NEXT:    v_or_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT:    v_max_f32_e32 v6, v12, v11
+; GFX10-NEXT:    v_max_f32_e32 v3, v3, v7
+; GFX10-NEXT:    v_perm_b32 v0, v0, v8, 0x7060302
+; GFX10-NEXT:    v_perm_b32 v1, v1, v4, 0x7060302
+; GFX10-NEXT:    v_perm_b32 v2, v2, v5, 0x7060302
+; GFX10-NEXT:    v_perm_b32 v3, v3, v6, 0x7060302
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_maxnum_v8bf16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_and_b32_e32 v9, 0xffff0000, v3
-; GFX11-NEXT:    v_and_b32_e32 v8, 0xffff0000, v7
-; GFX11-NEXT:    v_and_b32_e32 v11, 0xffff0000, v1
-; GFX11-NEXT:    v_and_b32_e32 v10, 0xffff0000, v5
-; GFX11-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_dual_max_f32 v9, v9, v9 :: v_dual_max_f32 v8, v8, v8
-; GFX11-NEXT:    v_dual_max_f32 v11, v11, v11 :: v_dual_max_f32 v10, v10, v10
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_dual_max_f32 v7, v7, v7 :: v_dual_max_f32 v8, v9, v8
-; GFX11-NEXT:    v_max_f32_e32 v9, v11, v10
-; GFX11-NEXT:    v_and_b32_e32 v11, 0xffff0000, v2
-; GFX11-NEXT:    v_and_b32_e32 v12, 0xffff0000, v6
-; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_dual_max_f32 v11, v11, v11 :: v_dual_lshlrev_b32 v6, 16, v6
-; GFX11-NEXT:    v_max_f32_e32 v10, v12, v12
-; GFX11-NEXT:    v_and_b32_e32 v13, 0xffff0000, v0
-; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX11-NEXT:    v_dual_max_f32 v2, v2, v2 :: v_dual_lshlrev_b32 v5, 16, v5
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_max_f32_e32 v10, v11, v10
-; GFX11-NEXT:    v_dual_max_f32 v13, v13, v13 :: v_dual_and_b32 v12, 0xffff0000, v4
-; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_dual_max_f32 v1, v1, v1 :: v_dual_max_f32 v12, v12, v12
-; GFX11-NEXT:    v_max_f32_e32 v0, v0, v0
-; GFX11-NEXT:    v_dual_max_f32 v4, v4, v4 :: v_dual_max_f32 v3, v3, v3
-; GFX11-NEXT:    v_max_f32_e32 v5, v5, v5
+; GFX11-NEXT:    v_lshlrev_b32_e32 v10, 16, v5
+; GFX11-NEXT:    v_lshlrev_b32_e32 v11, 16, v1
+; GFX11-NEXT:    v_lshlrev_b32_e32 v9, 16, v0
+; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
+; GFX11-NEXT:    v_lshlrev_b32_e32 v12, 16, v3
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX11-NEXT:    v_lshlrev_b32_e32 v8, 16, v4
+; GFX11-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_dual_max_f32 v1, v1, v5 :: v_dual_and_b32 v0, 0xffff0000, v0
+; GFX11-NEXT:    v_max_f32_e32 v0, v0, v4
+; GFX11-NEXT:    v_max_f32_e32 v4, v11, v10
+; GFX11-NEXT:    v_dual_max_f32 v8, v9, v8 :: v_dual_lshlrev_b32 v9, 16, v6
+; GFX11-NEXT:    v_lshlrev_b32_e32 v10, 16, v2
+; GFX11-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX11-NEXT:    v_lshlrev_b32_e32 v11, 16, v7
+; GFX11-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
+; GFX11-NEXT:    v_max_f32_e32 v5, v10, v9
+; GFX11-NEXT:    v_perm_b32 v0, v0, v8, 0x7060302
+; GFX11-NEXT:    v_max_f32_e32 v2, v2, v6
+; GFX11-NEXT:    v_max_f32_e32 v6, v12, v11
+; GFX11-NEXT:    v_max_f32_e32 v3, v3, v7
+; GFX11-NEXT:    v_perm_b32 v1, v1, v4, 0x7060302
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_max_f32_e32 v11, v13, v12
-; GFX11-NEXT:    v_dual_max_f32 v0, v0, v4 :: v_dual_max_f32 v3, v3, v7
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_max_f32_e32 v1, v1, v5
-; GFX11-NEXT:    v_dual_max_f32 v5, v6, v6 :: v_dual_and_b32 v4, 0xffff0000, v9
-; GFX11-NEXT:    v_perm_b32 v0, v0, v11, 0x3020706
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
-; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_dual_max_f32 v2, v2, v5 :: v_dual_and_b32 v5, 0xffff0000, v8
-; GFX11-NEXT:    v_or_b32_e32 v1, v1, v4
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_perm_b32 v2, v2, v10, 0x3020706
-; GFX11-NEXT:    v_or_b32_e32 v3, v3, v5
+; GFX11-NEXT:    v_perm_b32 v2, v2, v5, 0x7060302
+; GFX11-NEXT:    v_perm_b32 v3, v3, v6, 0x7060302
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = call <8 x bfloat> @llvm.maxnum.v8bf16(<8 x bfloat> %a, <8 x bfloat> %b)
   ret <8 x bfloat> %op
@@ -17010,356 +16234,252 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
 ; GFX8-LABEL: v_maxnum_v16bf16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_and_b32_e32 v16, 0xffff0000, v15
-; GFX8-NEXT:    v_and_b32_e32 v17, 0xffff0000, v7
-; GFX8-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
-; GFX8-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
-; GFX8-NEXT:    v_mul_f32_e32 v16, 1.0, v16
-; GFX8-NEXT:    v_mul_f32_e32 v17, 1.0, v17
-; GFX8-NEXT:    v_mul_f32_e32 v15, 1.0, v15
-; GFX8-NEXT:    v_mul_f32_e32 v7, 1.0, v7
+; GFX8-NEXT:    v_lshlrev_b32_e32 v16, 16, v8
+; GFX8-NEXT:    v_lshlrev_b32_e32 v17, 16, v0
+; GFX8-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
+; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    v_max_f32_e32 v0, v0, v8
 ; GFX8-NEXT:    v_max_f32_e32 v16, v17, v16
-; GFX8-NEXT:    v_max_f32_e32 v7, v7, v15
-; GFX8-NEXT:    v_and_b32_e32 v15, 0xffff0000, v14
-; GFX8-NEXT:    v_and_b32_e32 v17, 0xffff0000, v6
-; GFX8-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
-; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
-; GFX8-NEXT:    v_mul_f32_e32 v15, 1.0, v15
-; GFX8-NEXT:    v_mul_f32_e32 v17, 1.0, v17
-; GFX8-NEXT:    v_mul_f32_e32 v14, 1.0, v14
-; GFX8-NEXT:    v_mul_f32_e32 v6, 1.0, v6
-; GFX8-NEXT:    v_max_f32_e32 v15, v17, v15
-; GFX8-NEXT:    v_max_f32_e32 v6, v6, v14
-; GFX8-NEXT:    v_and_b32_e32 v14, 0xffff0000, v13
-; GFX8-NEXT:    v_and_b32_e32 v17, 0xffff0000, v5
-; GFX8-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
-; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX8-NEXT:    v_mul_f32_e32 v14, 1.0, v14
-; GFX8-NEXT:    v_mul_f32_e32 v17, 1.0, v17
-; GFX8-NEXT:    v_mul_f32_e32 v13, 1.0, v13
-; GFX8-NEXT:    v_mul_f32_e32 v5, 1.0, v5
-; GFX8-NEXT:    v_max_f32_e32 v14, v17, v14
-; GFX8-NEXT:    v_max_f32_e32 v5, v5, v13
-; GFX8-NEXT:    v_and_b32_e32 v13, 0xffff0000, v12
-; GFX8-NEXT:    v_and_b32_e32 v17, 0xffff0000, v4
-; GFX8-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
-; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX8-NEXT:    v_mul_f32_e32 v13, 1.0, v13
-; GFX8-NEXT:    v_mul_f32_e32 v17, 1.0, v17
-; GFX8-NEXT:    v_mul_f32_e32 v12, 1.0, v12
-; GFX8-NEXT:    v_mul_f32_e32 v4, 1.0, v4
-; GFX8-NEXT:    v_max_f32_e32 v13, v17, v13
-; GFX8-NEXT:    v_max_f32_e32 v4, v4, v12
-; GFX8-NEXT:    v_and_b32_e32 v12, 0xffff0000, v11
-; GFX8-NEXT:    v_and_b32_e32 v17, 0xffff0000, v3
-; GFX8-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
-; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX8-NEXT:    v_mul_f32_e32 v12, 1.0, v12
-; GFX8-NEXT:    v_mul_f32_e32 v17, 1.0, v17
-; GFX8-NEXT:    v_mul_f32_e32 v11, 1.0, v11
-; GFX8-NEXT:    v_mul_f32_e32 v3, 1.0, v3
-; GFX8-NEXT:    v_max_f32_e32 v12, v17, v12
-; GFX8-NEXT:    v_max_f32_e32 v3, v3, v11
-; GFX8-NEXT:    v_and_b32_e32 v11, 0xffff0000, v10
-; GFX8-NEXT:    v_and_b32_e32 v17, 0xffff0000, v2
-; GFX8-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
-; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX8-NEXT:    v_mul_f32_e32 v11, 1.0, v11
-; GFX8-NEXT:    v_mul_f32_e32 v17, 1.0, v17
-; GFX8-NEXT:    v_mul_f32_e32 v10, 1.0, v10
-; GFX8-NEXT:    v_mul_f32_e32 v2, 1.0, v2
-; GFX8-NEXT:    v_max_f32_e32 v11, v17, v11
-; GFX8-NEXT:    v_max_f32_e32 v2, v2, v10
-; GFX8-NEXT:    v_and_b32_e32 v10, 0xffff0000, v9
-; GFX8-NEXT:    v_and_b32_e32 v17, 0xffff0000, v1
-; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
-; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX8-NEXT:    v_mul_f32_e32 v10, 1.0, v10
-; GFX8-NEXT:    v_mul_f32_e32 v17, 1.0, v17
-; GFX8-NEXT:    v_mul_f32_e32 v9, 1.0, v9
-; GFX8-NEXT:    v_mul_f32_e32 v1, 1.0, v1
-; GFX8-NEXT:    v_max_f32_e32 v10, v17, v10
+; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX8-NEXT:    v_alignbit_b32 v0, v0, v16, 16
+; GFX8-NEXT:    v_lshlrev_b32_e32 v8, 16, v9
+; GFX8-NEXT:    v_lshlrev_b32_e32 v16, 16, v1
+; GFX8-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
+; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX8-NEXT:    v_max_f32_e32 v1, v1, v9
-; GFX8-NEXT:    v_and_b32_e32 v9, 0xffff0000, v8
-; GFX8-NEXT:    v_and_b32_e32 v17, 0xffff0000, v0
-; GFX8-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
-; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX8-NEXT:    v_mul_f32_e32 v9, 1.0, v9
-; GFX8-NEXT:    v_mul_f32_e32 v17, 1.0, v17
-; GFX8-NEXT:    v_mul_f32_e32 v8, 1.0, v8
-; GFX8-NEXT:    v_mul_f32_e32 v0, 1.0, v0
-; GFX8-NEXT:    v_max_f32_e32 v9, v17, v9
-; GFX8-NEXT:    v_max_f32_e32 v0, v0, v8
-; GFX8-NEXT:    s_mov_b32 s4, 0x3020706
-; GFX8-NEXT:    v_perm_b32 v0, v0, v9, s4
-; GFX8-NEXT:    v_perm_b32 v1, v1, v10, s4
-; GFX8-NEXT:    v_perm_b32 v2, v2, v11, s4
-; GFX8-NEXT:    v_perm_b32 v3, v3, v12, s4
-; GFX8-NEXT:    v_perm_b32 v4, v4, v13, s4
-; GFX8-NEXT:    v_perm_b32 v5, v5, v14, s4
-; GFX8-NEXT:    v_perm_b32 v6, v6, v15, s4
-; GFX8-NEXT:    v_perm_b32 v7, v7, v16, s4
+; GFX8-NEXT:    v_max_f32_e32 v8, v16, v8
+; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX8-NEXT:    v_alignbit_b32 v1, v1, v8, 16
+; GFX8-NEXT:    v_lshlrev_b32_e32 v8, 16, v10
+; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 16, v2
+; GFX8-NEXT:    v_max_f32_e32 v8, v9, v8
+; GFX8-NEXT:    v_and_b32_e32 v9, 0xffff0000, v10
+; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX8-NEXT:    v_max_f32_e32 v2, v2, v9
+; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX8-NEXT:    v_alignbit_b32 v2, v2, v8, 16
+; GFX8-NEXT:    v_lshlrev_b32_e32 v8, 16, v11
+; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 16, v3
+; GFX8-NEXT:    v_max_f32_e32 v8, v9, v8
+; GFX8-NEXT:    v_and_b32_e32 v9, 0xffff0000, v11
+; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX8-NEXT:    v_max_f32_e32 v3, v3, v9
+; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX8-NEXT:    v_alignbit_b32 v3, v3, v8, 16
+; GFX8-NEXT:    v_lshlrev_b32_e32 v8, 16, v12
+; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 16, v4
+; GFX8-NEXT:    v_max_f32_e32 v8, v9, v8
+; GFX8-NEXT:    v_and_b32_e32 v9, 0xffff0000, v12
+; GFX8-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GFX8-NEXT:    v_max_f32_e32 v4, v4, v9
+; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
+; GFX8-NEXT:    v_alignbit_b32 v4, v4, v8, 16
+; GFX8-NEXT:    v_lshlrev_b32_e32 v8, 16, v13
+; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 16, v5
+; GFX8-NEXT:    v_max_f32_e32 v8, v9, v8
+; GFX8-NEXT:    v_and_b32_e32 v9, 0xffff0000, v13
+; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
+; GFX8-NEXT:    v_max_f32_e32 v5, v5, v9
+; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
+; GFX8-NEXT:    v_alignbit_b32 v5, v5, v8, 16
+; GFX8-NEXT:    v_lshlrev_b32_e32 v8, 16, v14
+; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 16, v6
+; GFX8-NEXT:    v_max_f32_e32 v8, v9, v8
+; GFX8-NEXT:    v_and_b32_e32 v9, 0xffff0000, v14
+; GFX8-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
+; GFX8-NEXT:    v_max_f32_e32 v6, v6, v9
+; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
+; GFX8-NEXT:    v_alignbit_b32 v6, v6, v8, 16
+; GFX8-NEXT:    v_lshlrev_b32_e32 v8, 16, v15
+; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 16, v7
+; GFX8-NEXT:    v_max_f32_e32 v8, v9, v8
+; GFX8-NEXT:    v_and_b32_e32 v9, 0xffff0000, v15
+; GFX8-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
+; GFX8-NEXT:    v_max_f32_e32 v7, v7, v9
+; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
+; GFX8-NEXT:    v_alignbit_b32 v7, v7, v8, 16
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_maxnum_v16bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v16, 0xffff0000, v15
-; GFX9-NEXT:    v_and_b32_e32 v17, 0xffff0000, v7
-; GFX9-NEXT:    v_max_f32_e32 v16, v16, v16
-; GFX9-NEXT:    v_max_f32_e32 v17, v17, v17
+; GFX9-NEXT:    v_lshlrev_b32_e32 v16, 16, v8
+; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v0
+; GFX9-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GFX9-NEXT:    v_max_f32_e32 v16, v17, v16
-; GFX9-NEXT:    v_and_b32_e32 v17, 0xffff0000, v14
-; GFX9-NEXT:    v_and_b32_e32 v18, 0xffff0000, v6
-; GFX9-NEXT:    v_max_f32_e32 v17, v17, v17
-; GFX9-NEXT:    v_max_f32_e32 v18, v18, v18
-; GFX9-NEXT:    v_max_f32_e32 v17, v18, v17
-; GFX9-NEXT:    v_and_b32_e32 v18, 0xffff0000, v13
-; GFX9-NEXT:    v_and_b32_e32 v19, 0xffff0000, v5
-; GFX9-NEXT:    v_max_f32_e32 v18, v18, v18
-; GFX9-NEXT:    v_max_f32_e32 v19, v19, v19
-; GFX9-NEXT:    v_max_f32_e32 v18, v19, v18
-; GFX9-NEXT:    v_and_b32_e32 v19, 0xffff0000, v12
-; GFX9-NEXT:    v_and_b32_e32 v20, 0xffff0000, v4
-; GFX9-NEXT:    v_max_f32_e32 v19, v19, v19
-; GFX9-NEXT:    v_max_f32_e32 v20, v20, v20
-; GFX9-NEXT:    v_max_f32_e32 v19, v20, v19
-; GFX9-NEXT:    v_and_b32_e32 v20, 0xffff0000, v11
-; GFX9-NEXT:    v_and_b32_e32 v21, 0xffff0000, v3
-; GFX9-NEXT:    v_max_f32_e32 v20, v20, v20
-; GFX9-NEXT:    v_max_f32_e32 v21, v21, v21
-; GFX9-NEXT:    v_max_f32_e32 v20, v21, v20
-; GFX9-NEXT:    v_and_b32_e32 v21, 0xffff0000, v10
-; GFX9-NEXT:    v_and_b32_e32 v22, 0xffff0000, v2
-; GFX9-NEXT:    v_max_f32_e32 v21, v21, v21
-; GFX9-NEXT:    v_max_f32_e32 v22, v22, v22
-; GFX9-NEXT:    v_max_f32_e32 v21, v22, v21
-; GFX9-NEXT:    v_and_b32_e32 v22, 0xffff0000, v9
-; GFX9-NEXT:    v_and_b32_e32 v23, 0xffff0000, v1
-; GFX9-NEXT:    v_max_f32_e32 v22, v22, v22
-; GFX9-NEXT:    v_max_f32_e32 v23, v23, v23
-; GFX9-NEXT:    v_max_f32_e32 v22, v23, v22
-; GFX9-NEXT:    v_and_b32_e32 v23, 0xffff0000, v8
-; GFX9-NEXT:    v_and_b32_e32 v24, 0xffff0000, v0
-; GFX9-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
-; GFX9-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
-; GFX9-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
-; GFX9-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
-; GFX9-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
-; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX9-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
-; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX9-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
-; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX9-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
-; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX9-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
-; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX9-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
-; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX9-NEXT:    v_max_f32_e32 v23, v23, v23
-; GFX9-NEXT:    v_max_f32_e32 v24, v24, v24
-; GFX9-NEXT:    v_max_f32_e32 v15, v15, v15
-; GFX9-NEXT:    v_max_f32_e32 v7, v7, v7
-; GFX9-NEXT:    v_max_f32_e32 v14, v14, v14
-; GFX9-NEXT:    v_max_f32_e32 v6, v6, v6
-; GFX9-NEXT:    v_max_f32_e32 v13, v13, v13
-; GFX9-NEXT:    v_max_f32_e32 v5, v5, v5
-; GFX9-NEXT:    v_max_f32_e32 v12, v12, v12
-; GFX9-NEXT:    v_max_f32_e32 v4, v4, v4
-; GFX9-NEXT:    v_max_f32_e32 v11, v11, v11
-; GFX9-NEXT:    v_max_f32_e32 v3, v3, v3
-; GFX9-NEXT:    v_max_f32_e32 v10, v10, v10
-; GFX9-NEXT:    v_max_f32_e32 v2, v2, v2
-; GFX9-NEXT:    v_max_f32_e32 v9, v9, v9
-; GFX9-NEXT:    v_max_f32_e32 v1, v1, v1
-; GFX9-NEXT:    v_max_f32_e32 v8, v8, v8
-; GFX9-NEXT:    v_max_f32_e32 v0, v0, v0
-; GFX9-NEXT:    v_max_f32_e32 v23, v24, v23
-; GFX9-NEXT:    v_max_f32_e32 v7, v7, v15
-; GFX9-NEXT:    v_max_f32_e32 v6, v6, v14
-; GFX9-NEXT:    v_max_f32_e32 v5, v5, v13
-; GFX9-NEXT:    v_max_f32_e32 v4, v4, v12
-; GFX9-NEXT:    v_max_f32_e32 v3, v3, v11
-; GFX9-NEXT:    v_max_f32_e32 v2, v2, v10
-; GFX9-NEXT:    v_max_f32_e32 v1, v1, v9
 ; GFX9-NEXT:    v_max_f32_e32 v0, v0, v8
-; GFX9-NEXT:    s_mov_b32 s4, 0x3020706
-; GFX9-NEXT:    v_perm_b32 v0, v0, v23, s4
-; GFX9-NEXT:    v_perm_b32 v1, v1, v22, s4
-; GFX9-NEXT:    v_perm_b32 v2, v2, v21, s4
-; GFX9-NEXT:    v_perm_b32 v3, v3, v20, s4
-; GFX9-NEXT:    v_perm_b32 v4, v4, v19, s4
-; GFX9-NEXT:    v_perm_b32 v5, v5, v18, s4
-; GFX9-NEXT:    v_perm_b32 v6, v6, v17, s4
-; GFX9-NEXT:    v_perm_b32 v7, v7, v16, s4
+; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
+; GFX9-NEXT:    v_perm_b32 v0, v0, v16, s4
+; GFX9-NEXT:    v_lshlrev_b32_e32 v8, 16, v9
+; GFX9-NEXT:    v_lshlrev_b32_e32 v16, 16, v1
+; GFX9-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
+; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX9-NEXT:    v_max_f32_e32 v8, v16, v8
+; GFX9-NEXT:    v_max_f32_e32 v1, v1, v9
+; GFX9-NEXT:    v_perm_b32 v1, v1, v8, s4
+; GFX9-NEXT:    v_lshlrev_b32_e32 v8, 16, v10
+; GFX9-NEXT:    v_lshlrev_b32_e32 v9, 16, v2
+; GFX9-NEXT:    v_max_f32_e32 v8, v9, v8
+; GFX9-NEXT:    v_and_b32_e32 v9, 0xffff0000, v10
+; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX9-NEXT:    v_max_f32_e32 v2, v2, v9
+; GFX9-NEXT:    v_perm_b32 v2, v2, v8, s4
+; GFX9-NEXT:    v_lshlrev_b32_e32 v8, 16, v11
+; GFX9-NEXT:    v_lshlrev_b32_e32 v9, 16, v3
+; GFX9-NEXT:    v_max_f32_e32 v8, v9, v8
+; GFX9-NEXT:    v_and_b32_e32 v9, 0xffff0000, v11
+; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX9-NEXT:    v_max_f32_e32 v3, v3, v9
+; GFX9-NEXT:    v_perm_b32 v3, v3, v8, s4
+; GFX9-NEXT:    v_lshlrev_b32_e32 v8, 16, v12
+; GFX9-NEXT:    v_lshlrev_b32_e32 v9, 16, v4
+; GFX9-NEXT:    v_max_f32_e32 v8, v9, v8
+; GFX9-NEXT:    v_and_b32_e32 v9, 0xffff0000, v12
+; GFX9-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GFX9-NEXT:    v_max_f32_e32 v4, v4, v9
+; GFX9-NEXT:    v_perm_b32 v4, v4, v8, s4
+; GFX9-NEXT:    v_lshlrev_b32_e32 v8, 16, v13
+; GFX9-NEXT:    v_lshlrev_b32_e32 v9, 16, v5
+; GFX9-NEXT:    v_max_f32_e32 v8, v9, v8
+; GFX9-NEXT:    v_and_b32_e32 v9, 0xffff0000, v13
+; GFX9-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
+; GFX9-NEXT:    v_max_f32_e32 v5, v5, v9
+; GFX9-NEXT:    v_perm_b32 v5, v5, v8, s4
+; GFX9-NEXT:    v_lshlrev_b32_e32 v8, 16, v14
+; GFX9-NEXT:    v_lshlrev_b32_e32 v9, 16, v6
+; GFX9-NEXT:    v_max_f32_e32 v8, v9, v8
+; GFX9-NEXT:    v_and_b32_e32 v9, 0xffff0000, v14
+; GFX9-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
+; GFX9-NEXT:    v_max_f32_e32 v6, v6, v9
+; GFX9-NEXT:    v_perm_b32 v6, v6, v8, s4
+; GFX9-NEXT:    v_lshlrev_b32_e32 v8, 16, v15
+; GFX9-NEXT:    v_lshlrev_b32_e32 v9, 16, v7
+; GFX9-NEXT:    v_max_f32_e32 v8, v9, v8
+; GFX9-NEXT:    v_and_b32_e32 v9, 0xffff0000, v15
+; GFX9-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
+; GFX9-NEXT:    v_max_f32_e32 v7, v7, v9
+; GFX9-NEXT:    v_perm_b32 v7, v7, v8, s4
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_maxnum_v16bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_and_b32_e32 v16, 0xffff0000, v15
-; GFX10-NEXT:    v_and_b32_e32 v17, 0xffff0000, v7
-; GFX10-NEXT:    v_and_b32_e32 v18, 0xffff0000, v14
-; GFX10-NEXT:    v_and_b32_e32 v19, 0xffff0000, v6
-; GFX10-NEXT:    v_and_b32_e32 v20, 0xffff0000, v5
-; GFX10-NEXT:    v_max_f32_e32 v16, v16, v16
-; GFX10-NEXT:    v_max_f32_e32 v17, v17, v17
-; GFX10-NEXT:    v_and_b32_e32 v21, 0xffff0000, v12
-; GFX10-NEXT:    v_and_b32_e32 v22, 0xffff0000, v4
-; GFX10-NEXT:    v_and_b32_e32 v23, 0xffff0000, v3
-; GFX10-NEXT:    v_and_b32_e32 v24, 0xffff0000, v10
+; GFX10-NEXT:    v_lshlrev_b32_e32 v16, 16, v8
+; GFX10-NEXT:    v_lshlrev_b32_e32 v17, 16, v0
+; GFX10-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v18, 16, v1
+; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX10-NEXT:    v_max_f32_e32 v16, v17, v16
-; GFX10-NEXT:    v_max_f32_e32 v17, v18, v18
-; GFX10-NEXT:    v_max_f32_e32 v18, v19, v19
-; GFX10-NEXT:    v_and_b32_e32 v19, 0xffff0000, v13
-; GFX10-NEXT:    v_and_b32_e32 v25, 0xffff0000, v1
-; GFX10-NEXT:    v_and_b32_e32 v26, 0xffff0000, v8
-; GFX10-NEXT:    v_and_b32_e32 v27, 0xffff0000, v0
-; GFX10-NEXT:    v_max_f32_e32 v17, v18, v17
-; GFX10-NEXT:    v_max_f32_e32 v18, v19, v19
-; GFX10-NEXT:    v_max_f32_e32 v19, v20, v20
-; GFX10-NEXT:    v_max_f32_e32 v20, v21, v21
-; GFX10-NEXT:    v_max_f32_e32 v21, v22, v22
-; GFX10-NEXT:    v_and_b32_e32 v22, 0xffff0000, v11
-; GFX10-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
-; GFX10-NEXT:    v_max_f32_e32 v18, v19, v18
-; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
-; GFX10-NEXT:    v_max_f32_e32 v19, v21, v20
-; GFX10-NEXT:    v_max_f32_e32 v20, v22, v22
-; GFX10-NEXT:    v_max_f32_e32 v21, v23, v23
-; GFX10-NEXT:    v_max_f32_e32 v22, v24, v24
-; GFX10-NEXT:    v_and_b32_e32 v23, 0xffff0000, v2
-; GFX10-NEXT:    v_and_b32_e32 v24, 0xffff0000, v9
-; GFX10-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
-; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
-; GFX10-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
-; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX10-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
-; GFX10-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
-; GFX10-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
-; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX10-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
-; GFX10-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
-; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX10-NEXT:    v_max_f32_e32 v23, v23, v23
-; GFX10-NEXT:    v_max_f32_e32 v24, v24, v24
-; GFX10-NEXT:    v_max_f32_e32 v25, v25, v25
-; GFX10-NEXT:    v_max_f32_e32 v26, v26, v26
-; GFX10-NEXT:    v_max_f32_e32 v27, v27, v27
-; GFX10-NEXT:    v_max_f32_e32 v15, v15, v15
-; GFX10-NEXT:    v_max_f32_e32 v7, v7, v7
-; GFX10-NEXT:    v_max_f32_e32 v14, v14, v14
-; GFX10-NEXT:    v_max_f32_e32 v6, v6, v6
-; GFX10-NEXT:    v_max_f32_e32 v13, v13, v13
-; GFX10-NEXT:    v_max_f32_e32 v5, v5, v5
-; GFX10-NEXT:    v_max_f32_e32 v12, v12, v12
-; GFX10-NEXT:    v_max_f32_e32 v11, v11, v11
-; GFX10-NEXT:    v_max_f32_e32 v10, v10, v10
-; GFX10-NEXT:    v_max_f32_e32 v9, v9, v9
-; GFX10-NEXT:    v_max_f32_e32 v8, v8, v8
-; GFX10-NEXT:    v_max_f32_e32 v0, v0, v0
-; GFX10-NEXT:    v_max_f32_e32 v1, v1, v1
-; GFX10-NEXT:    v_max_f32_e32 v2, v2, v2
-; GFX10-NEXT:    v_max_f32_e32 v3, v3, v3
-; GFX10-NEXT:    v_max_f32_e32 v4, v4, v4
-; GFX10-NEXT:    v_max_f32_e32 v20, v21, v20
-; GFX10-NEXT:    v_max_f32_e32 v21, v23, v22
-; GFX10-NEXT:    v_max_f32_e32 v22, v25, v24
-; GFX10-NEXT:    v_max_f32_e32 v23, v27, v26
-; GFX10-NEXT:    v_max_f32_e32 v7, v7, v15
-; GFX10-NEXT:    v_max_f32_e32 v6, v6, v14
-; GFX10-NEXT:    v_max_f32_e32 v5, v5, v13
+; GFX10-NEXT:    v_lshlrev_b32_e32 v17, 16, v9
+; GFX10-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
+; GFX10-NEXT:    v_lshlrev_b32_e32 v19, 16, v10
+; GFX10-NEXT:    v_lshlrev_b32_e32 v20, 16, v2
+; GFX10-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
+; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX10-NEXT:    v_max_f32_e32 v0, v0, v8
+; GFX10-NEXT:    v_max_f32_e32 v8, v18, v17
 ; GFX10-NEXT:    v_max_f32_e32 v1, v1, v9
+; GFX10-NEXT:    v_max_f32_e32 v9, v20, v19
 ; GFX10-NEXT:    v_max_f32_e32 v2, v2, v10
-; GFX10-NEXT:    v_max_f32_e32 v3, v3, v11
+; GFX10-NEXT:    v_perm_b32 v0, v0, v16, 0x7060302
+; GFX10-NEXT:    v_and_b32_e32 v10, 0xffff0000, v11
+; GFX10-NEXT:    v_perm_b32 v1, v1, v8, 0x7060302
+; GFX10-NEXT:    v_lshlrev_b32_e32 v8, 16, v11
+; GFX10-NEXT:    v_perm_b32 v2, v2, v9, 0x7060302
+; GFX10-NEXT:    v_lshlrev_b32_e32 v9, 16, v3
+; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX10-NEXT:    v_lshlrev_b32_e32 v11, 16, v12
+; GFX10-NEXT:    v_lshlrev_b32_e32 v16, 16, v4
+; GFX10-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
+; GFX10-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GFX10-NEXT:    v_lshlrev_b32_e32 v17, 16, v13
+; GFX10-NEXT:    v_lshlrev_b32_e32 v18, 16, v5
+; GFX10-NEXT:    v_max_f32_e32 v8, v9, v8
+; GFX10-NEXT:    v_max_f32_e32 v3, v3, v10
+; GFX10-NEXT:    v_max_f32_e32 v9, v16, v11
 ; GFX10-NEXT:    v_max_f32_e32 v4, v4, v12
-; GFX10-NEXT:    v_perm_b32 v0, v0, v23, 0x3020706
-; GFX10-NEXT:    v_perm_b32 v1, v1, v22, 0x3020706
-; GFX10-NEXT:    v_perm_b32 v2, v2, v21, 0x3020706
-; GFX10-NEXT:    v_perm_b32 v3, v3, v20, 0x3020706
-; GFX10-NEXT:    v_perm_b32 v4, v4, v19, 0x3020706
-; GFX10-NEXT:    v_perm_b32 v5, v5, v18, 0x3020706
-; GFX10-NEXT:    v_perm_b32 v6, v6, v17, 0x3020706
-; GFX10-NEXT:    v_perm_b32 v7, v7, v16, 0x3020706
+; GFX10-NEXT:    v_max_f32_e32 v10, v18, v17
+; GFX10-NEXT:    v_and_b32_e32 v11, 0xffff0000, v13
+; GFX10-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
+; GFX10-NEXT:    v_lshlrev_b32_e32 v12, 16, v14
+; GFX10-NEXT:    v_lshlrev_b32_e32 v13, 16, v6
+; GFX10-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
+; GFX10-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
+; GFX10-NEXT:    v_lshlrev_b32_e32 v16, 16, v15
+; GFX10-NEXT:    v_lshlrev_b32_e32 v17, 16, v7
+; GFX10-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
+; GFX10-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
+; GFX10-NEXT:    v_max_f32_e32 v5, v5, v11
+; GFX10-NEXT:    v_max_f32_e32 v11, v13, v12
+; GFX10-NEXT:    v_max_f32_e32 v6, v6, v14
+; GFX10-NEXT:    v_max_f32_e32 v12, v17, v16
+; GFX10-NEXT:    v_max_f32_e32 v7, v7, v15
+; GFX10-NEXT:    v_perm_b32 v3, v3, v8, 0x7060302
+; GFX10-NEXT:    v_perm_b32 v4, v4, v9, 0x7060302
+; GFX10-NEXT:    v_perm_b32 v5, v5, v10, 0x7060302
+; GFX10-NEXT:    v_perm_b32 v6, v6, v11, 0x7060302
+; GFX10-NEXT:    v_perm_b32 v7, v7, v12, 0x7060302
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_maxnum_v16bf16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_and_b32_e32 v17, 0xffff0000, v7
-; GFX11-NEXT:    v_and_b32_e32 v16, 0xffff0000, v15
-; GFX11-NEXT:    v_and_b32_e32 v22, 0xffff0000, v4
-; GFX11-NEXT:    v_and_b32_e32 v20, 0xffff0000, v5
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_dual_max_f32 v17, v17, v17 :: v_dual_and_b32 v18, 0xffff0000, v14
-; GFX11-NEXT:    v_dual_max_f32 v16, v16, v16 :: v_dual_and_b32 v19, 0xffff0000, v6
-; GFX11-NEXT:    v_and_b32_e32 v23, 0xffff0000, v3
-; GFX11-NEXT:    v_and_b32_e32 v24, 0xffff0000, v10
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_dual_max_f32 v16, v17, v16 :: v_dual_and_b32 v25, 0xffff0000, v1
-; GFX11-NEXT:    v_dual_max_f32 v17, v18, v18 :: v_dual_max_f32 v18, v19, v19
-; GFX11-NEXT:    v_and_b32_e32 v19, 0xffff0000, v13
-; GFX11-NEXT:    v_and_b32_e32 v21, 0xffff0000, v12
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_dual_max_f32 v25, v25, v25 :: v_dual_and_b32 v26, 0xffff0000, v8
-; GFX11-NEXT:    v_dual_max_f32 v17, v18, v17 :: v_dual_max_f32 v18, v19, v19
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_dual_max_f32 v19, v20, v20 :: v_dual_max_f32 v20, v21, v21
-; GFX11-NEXT:    v_dual_max_f32 v21, v22, v22 :: v_dual_and_b32 v22, 0xffff0000, v11
-; GFX11-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
-; GFX11-NEXT:    v_dual_max_f32 v18, v19, v18 :: v_dual_lshlrev_b32 v7, 16, v7
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_dual_max_f32 v19, v21, v20 :: v_dual_max_f32 v20, v22, v22
-; GFX11-NEXT:    v_dual_max_f32 v14, v14, v14 :: v_dual_max_f32 v21, v23, v23
-; GFX11-NEXT:    v_dual_max_f32 v22, v24, v24 :: v_dual_lshlrev_b32 v15, 16, v15
-; GFX11-NEXT:    v_and_b32_e32 v23, 0xffff0000, v2
-; GFX11-NEXT:    v_and_b32_e32 v24, 0xffff0000, v9
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_dual_max_f32 v20, v21, v20 :: v_dual_max_f32 v15, v15, v15
-; GFX11-NEXT:    v_dual_max_f32 v26, v26, v26 :: v_dual_and_b32 v27, 0xffff0000, v0
-; GFX11-NEXT:    v_dual_max_f32 v7, v7, v7 :: v_dual_lshlrev_b32 v6, 16, v6
-; GFX11-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
-; GFX11-NEXT:    v_dual_max_f32 v23, v23, v23 :: v_dual_max_f32 v24, v24, v24
-; GFX11-NEXT:    v_dual_max_f32 v27, v27, v27 :: v_dual_max_f32 v6, v6, v6
-; GFX11-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_dual_max_f32 v13, v13, v13 :: v_dual_lshlrev_b32 v10, 16, v10
-; GFX11-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
-; GFX11-NEXT:    v_dual_max_f32 v21, v23, v22 :: v_dual_max_f32 v22, v25, v24
-; GFX11-NEXT:    v_dual_max_f32 v23, v27, v26 :: v_dual_lshlrev_b32 v12, 16, v12
-; GFX11-NEXT:    v_dual_max_f32 v6, v6, v14 :: v_dual_max_f32 v5, v5, v5
-; GFX11-NEXT:    v_dual_max_f32 v10, v10, v10 :: v_dual_max_f32 v11, v11, v11
-; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX11-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
-; GFX11-NEXT:    v_dual_max_f32 v7, v7, v15 :: v_dual_lshlrev_b32 v4, 16, v4
-; GFX11-NEXT:    v_dual_max_f32 v12, v12, v12 :: v_dual_max_f32 v5, v5, v13
-; GFX11-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
-; GFX11-NEXT:    v_dual_max_f32 v0, v0, v0 :: v_dual_lshlrev_b32 v3, 16, v3
-; GFX11-NEXT:    v_dual_max_f32 v9, v9, v9 :: v_dual_lshlrev_b32 v2, 16, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_dual_max_f32 v8, v8, v8 :: v_dual_lshlrev_b32 v1, 16, v1
-; GFX11-NEXT:    v_dual_max_f32 v3, v3, v3 :: v_dual_max_f32 v2, v2, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_dual_max_f32 v4, v4, v4 :: v_dual_max_f32 v1, v1, v1
-; GFX11-NEXT:    v_dual_max_f32 v0, v0, v8 :: v_dual_max_f32 v3, v3, v11
+; GFX11-NEXT:    v_lshlrev_b32_e32 v18, 16, v1
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX11-NEXT:    v_lshlrev_b32_e32 v16, 16, v8
+; GFX11-NEXT:    v_lshlrev_b32_e32 v19, 16, v10
+; GFX11-NEXT:    v_lshlrev_b32_e32 v20, 16, v2
+; GFX11-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX11-NEXT:    v_lshlrev_b32_e32 v17, 16, v0
+; GFX11-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX11-NEXT:    v_max_f32_e32 v2, v2, v10
-; GFX11-NEXT:    v_dual_max_f32 v4, v4, v12 :: v_dual_max_f32 v1, v1, v9
+; GFX11-NEXT:    v_dual_max_f32 v16, v17, v16 :: v_dual_lshlrev_b32 v17, 16, v9
+; GFX11-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_dual_max_f32 v1, v1, v9 :: v_dual_and_b32 v10, 0xffff0000, v11
+; GFX11-NEXT:    v_max_f32_e32 v9, v20, v19
+; GFX11-NEXT:    v_perm_b32 v2, v2, v9, 0x7060302
+; GFX11-NEXT:    v_lshlrev_b32_e32 v9, 16, v3
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_dual_max_f32 v3, v3, v10 :: v_dual_and_b32 v0, 0xffff0000, v0
+; GFX11-NEXT:    v_max_f32_e32 v0, v0, v8
+; GFX11-NEXT:    v_max_f32_e32 v8, v18, v17
+; GFX11-NEXT:    v_lshlrev_b32_e32 v17, 16, v13
+; GFX11-NEXT:    v_lshlrev_b32_e32 v18, 16, v5
+; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
+; GFX11-NEXT:    v_perm_b32 v0, v0, v16, 0x7060302
+; GFX11-NEXT:    v_perm_b32 v1, v1, v8, 0x7060302
+; GFX11-NEXT:    v_lshlrev_b32_e32 v8, 16, v11
+; GFX11-NEXT:    v_lshlrev_b32_e32 v11, 16, v12
+; GFX11-NEXT:    v_lshlrev_b32_e32 v16, 16, v4
+; GFX11-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
+; GFX11-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GFX11-NEXT:    v_dual_max_f32 v10, v18, v17 :: v_dual_lshlrev_b32 v17, 16, v7
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT:    v_dual_max_f32 v4, v4, v12 :: v_dual_and_b32 v7, 0xffff0000, v7
+; GFX11-NEXT:    v_dual_max_f32 v8, v9, v8 :: v_dual_max_f32 v9, v16, v11
+; GFX11-NEXT:    v_lshlrev_b32_e32 v12, 16, v14
+; GFX11-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
+; GFX11-NEXT:    v_lshlrev_b32_e32 v16, 16, v15
+; GFX11-NEXT:    v_and_b32_e32 v11, 0xffff0000, v13
+; GFX11-NEXT:    v_lshlrev_b32_e32 v13, 16, v6
+; GFX11-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
+; GFX11-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
+; GFX11-NEXT:    v_perm_b32 v3, v3, v8, 0x7060302
+; GFX11-NEXT:    v_max_f32_e32 v5, v5, v11
+; GFX11-NEXT:    v_max_f32_e32 v11, v13, v12
+; GFX11-NEXT:    v_max_f32_e32 v12, v17, v16
+; GFX11-NEXT:    v_dual_max_f32 v6, v6, v14 :: v_dual_max_f32 v7, v7, v15
+; GFX11-NEXT:    v_perm_b32 v4, v4, v9, 0x7060302
+; GFX11-NEXT:    v_perm_b32 v5, v5, v10, 0x7060302
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_perm_b32 v0, v0, v23, 0x3020706
-; GFX11-NEXT:    v_perm_b32 v3, v3, v20, 0x3020706
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_perm_b32 v2, v2, v21, 0x3020706
-; GFX11-NEXT:    v_perm_b32 v4, v4, v19, 0x3020706
-; GFX11-NEXT:    v_perm_b32 v1, v1, v22, 0x3020706
-; GFX11-NEXT:    v_perm_b32 v5, v5, v18, 0x3020706
-; GFX11-NEXT:    v_perm_b32 v6, v6, v17, 0x3020706
-; GFX11-NEXT:    v_perm_b32 v7, v7, v16, 0x3020706
+; GFX11-NEXT:    v_perm_b32 v6, v6, v11, 0x7060302
+; GFX11-NEXT:    v_perm_b32 v7, v7, v12, 0x7060302
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = call <16 x bfloat> @llvm.maxnum.v16bf16(<16 x bfloat> %a, <16 x bfloat> %b)
   ret <16 x bfloat> %op
@@ -17895,703 +17015,484 @@ define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
 ; GFX8-LABEL: v_maxnum_v32bf16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_and_b32_e32 v31, 0xffff0000, v30
-; GFX8-NEXT:    v_and_b32_e32 v32, 0xffff0000, v14
-; GFX8-NEXT:    v_lshlrev_b32_e32 v30, 16, v30
-; GFX8-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
-; GFX8-NEXT:    v_mul_f32_e32 v31, 1.0, v31
-; GFX8-NEXT:    v_mul_f32_e32 v32, 1.0, v32
-; GFX8-NEXT:    v_mul_f32_e32 v30, 1.0, v30
-; GFX8-NEXT:    v_mul_f32_e32 v14, 1.0, v14
+; GFX8-NEXT:    v_lshlrev_b32_e32 v31, 16, v16
+; GFX8-NEXT:    v_lshlrev_b32_e32 v32, 16, v0
+; GFX8-NEXT:    v_and_b32_e32 v16, 0xffff0000, v16
+; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    v_max_f32_e32 v0, v0, v16
 ; GFX8-NEXT:    v_max_f32_e32 v31, v32, v31
-; GFX8-NEXT:    v_max_f32_e32 v14, v14, v30
-; GFX8-NEXT:    v_and_b32_e32 v30, 0xffff0000, v29
-; GFX8-NEXT:    v_and_b32_e32 v32, 0xffff0000, v13
-; GFX8-NEXT:    v_lshlrev_b32_e32 v29, 16, v29
-; GFX8-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
-; GFX8-NEXT:    v_mul_f32_e32 v30, 1.0, v30
-; GFX8-NEXT:    v_mul_f32_e32 v32, 1.0, v32
-; GFX8-NEXT:    v_mul_f32_e32 v29, 1.0, v29
-; GFX8-NEXT:    v_mul_f32_e32 v13, 1.0, v13
-; GFX8-NEXT:    v_max_f32_e32 v30, v32, v30
-; GFX8-NEXT:    v_max_f32_e32 v13, v13, v29
-; GFX8-NEXT:    v_and_b32_e32 v29, 0xffff0000, v28
-; GFX8-NEXT:    v_and_b32_e32 v32, 0xffff0000, v12
-; GFX8-NEXT:    v_lshlrev_b32_e32 v28, 16, v28
-; GFX8-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
-; GFX8-NEXT:    v_mul_f32_e32 v29, 1.0, v29
-; GFX8-NEXT:    v_mul_f32_e32 v32, 1.0, v32
-; GFX8-NEXT:    v_mul_f32_e32 v28, 1.0, v28
-; GFX8-NEXT:    v_mul_f32_e32 v12, 1.0, v12
-; GFX8-NEXT:    v_max_f32_e32 v29, v32, v29
-; GFX8-NEXT:    v_max_f32_e32 v12, v12, v28
-; GFX8-NEXT:    v_and_b32_e32 v28, 0xffff0000, v27
-; GFX8-NEXT:    v_and_b32_e32 v32, 0xffff0000, v11
-; GFX8-NEXT:    v_mul_f32_e32 v28, 1.0, v28
-; GFX8-NEXT:    v_mul_f32_e32 v32, 1.0, v32
-; GFX8-NEXT:    v_max_f32_e32 v28, v32, v28
-; GFX8-NEXT:    buffer_load_dword v32, off, s[0:3], s32
-; GFX8-NEXT:    v_lshlrev_b32_e32 v27, 16, v27
-; GFX8-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
-; GFX8-NEXT:    v_mul_f32_e32 v27, 1.0, v27
-; GFX8-NEXT:    v_mul_f32_e32 v11, 1.0, v11
-; GFX8-NEXT:    v_max_f32_e32 v11, v11, v27
-; GFX8-NEXT:    v_and_b32_e32 v27, 0xffff0000, v15
-; GFX8-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
-; GFX8-NEXT:    v_mul_f32_e32 v27, 1.0, v27
-; GFX8-NEXT:    v_mul_f32_e32 v15, 1.0, v15
-; GFX8-NEXT:    s_mov_b32 s4, 0x3020706
-; GFX8-NEXT:    v_perm_b32 v11, v11, v28, s4
-; GFX8-NEXT:    v_perm_b32 v12, v12, v29, s4
-; GFX8-NEXT:    v_perm_b32 v13, v13, v30, s4
-; GFX8-NEXT:    v_perm_b32 v14, v14, v31, s4
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_and_b32_e32 v33, 0xffff0000, v32
-; GFX8-NEXT:    v_lshlrev_b32_e32 v32, 16, v32
-; GFX8-NEXT:    v_mul_f32_e32 v33, 1.0, v33
-; GFX8-NEXT:    v_mul_f32_e32 v32, 1.0, v32
-; GFX8-NEXT:    v_max_f32_e32 v27, v27, v33
-; GFX8-NEXT:    v_max_f32_e32 v15, v15, v32
-; GFX8-NEXT:    v_and_b32_e32 v32, 0xffff0000, v26
-; GFX8-NEXT:    v_and_b32_e32 v33, 0xffff0000, v10
-; GFX8-NEXT:    v_lshlrev_b32_e32 v26, 16, v26
-; GFX8-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
-; GFX8-NEXT:    v_mul_f32_e32 v32, 1.0, v32
-; GFX8-NEXT:    v_mul_f32_e32 v33, 1.0, v33
-; GFX8-NEXT:    v_mul_f32_e32 v26, 1.0, v26
-; GFX8-NEXT:    v_mul_f32_e32 v10, 1.0, v10
-; GFX8-NEXT:    v_max_f32_e32 v32, v33, v32
-; GFX8-NEXT:    v_max_f32_e32 v10, v10, v26
-; GFX8-NEXT:    v_and_b32_e32 v26, 0xffff0000, v25
-; GFX8-NEXT:    v_and_b32_e32 v33, 0xffff0000, v9
-; GFX8-NEXT:    v_lshlrev_b32_e32 v25, 16, v25
-; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
-; GFX8-NEXT:    v_mul_f32_e32 v26, 1.0, v26
-; GFX8-NEXT:    v_mul_f32_e32 v33, 1.0, v33
-; GFX8-NEXT:    v_mul_f32_e32 v25, 1.0, v25
-; GFX8-NEXT:    v_mul_f32_e32 v9, 1.0, v9
-; GFX8-NEXT:    v_max_f32_e32 v26, v33, v26
-; GFX8-NEXT:    v_max_f32_e32 v9, v9, v25
-; GFX8-NEXT:    v_and_b32_e32 v25, 0xffff0000, v24
-; GFX8-NEXT:    v_and_b32_e32 v33, 0xffff0000, v8
-; GFX8-NEXT:    v_lshlrev_b32_e32 v24, 16, v24
-; GFX8-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
-; GFX8-NEXT:    v_mul_f32_e32 v25, 1.0, v25
-; GFX8-NEXT:    v_mul_f32_e32 v33, 1.0, v33
-; GFX8-NEXT:    v_mul_f32_e32 v24, 1.0, v24
-; GFX8-NEXT:    v_mul_f32_e32 v8, 1.0, v8
-; GFX8-NEXT:    v_max_f32_e32 v25, v33, v25
-; GFX8-NEXT:    v_max_f32_e32 v8, v8, v24
-; GFX8-NEXT:    v_and_b32_e32 v24, 0xffff0000, v23
-; GFX8-NEXT:    v_and_b32_e32 v33, 0xffff0000, v7
-; GFX8-NEXT:    v_lshlrev_b32_e32 v23, 16, v23
-; GFX8-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
-; GFX8-NEXT:    v_mul_f32_e32 v24, 1.0, v24
-; GFX8-NEXT:    v_mul_f32_e32 v33, 1.0, v33
-; GFX8-NEXT:    v_mul_f32_e32 v23, 1.0, v23
-; GFX8-NEXT:    v_mul_f32_e32 v7, 1.0, v7
-; GFX8-NEXT:    v_max_f32_e32 v24, v33, v24
-; GFX8-NEXT:    v_max_f32_e32 v7, v7, v23
-; GFX8-NEXT:    v_and_b32_e32 v23, 0xffff0000, v22
-; GFX8-NEXT:    v_and_b32_e32 v33, 0xffff0000, v6
-; GFX8-NEXT:    v_lshlrev_b32_e32 v22, 16, v22
-; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
-; GFX8-NEXT:    v_mul_f32_e32 v23, 1.0, v23
-; GFX8-NEXT:    v_mul_f32_e32 v33, 1.0, v33
-; GFX8-NEXT:    v_mul_f32_e32 v22, 1.0, v22
-; GFX8-NEXT:    v_mul_f32_e32 v6, 1.0, v6
-; GFX8-NEXT:    v_max_f32_e32 v23, v33, v23
-; GFX8-NEXT:    v_max_f32_e32 v6, v6, v22
-; GFX8-NEXT:    v_and_b32_e32 v22, 0xffff0000, v21
-; GFX8-NEXT:    v_and_b32_e32 v33, 0xffff0000, v5
-; GFX8-NEXT:    v_lshlrev_b32_e32 v21, 16, v21
-; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX8-NEXT:    v_mul_f32_e32 v22, 1.0, v22
-; GFX8-NEXT:    v_mul_f32_e32 v33, 1.0, v33
-; GFX8-NEXT:    v_mul_f32_e32 v21, 1.0, v21
-; GFX8-NEXT:    v_mul_f32_e32 v5, 1.0, v5
-; GFX8-NEXT:    v_max_f32_e32 v22, v33, v22
-; GFX8-NEXT:    v_max_f32_e32 v5, v5, v21
-; GFX8-NEXT:    v_and_b32_e32 v21, 0xffff0000, v20
-; GFX8-NEXT:    v_and_b32_e32 v33, 0xffff0000, v4
-; GFX8-NEXT:    v_lshlrev_b32_e32 v20, 16, v20
-; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX8-NEXT:    v_mul_f32_e32 v21, 1.0, v21
-; GFX8-NEXT:    v_mul_f32_e32 v33, 1.0, v33
-; GFX8-NEXT:    v_mul_f32_e32 v20, 1.0, v20
-; GFX8-NEXT:    v_mul_f32_e32 v4, 1.0, v4
-; GFX8-NEXT:    v_max_f32_e32 v21, v33, v21
-; GFX8-NEXT:    v_max_f32_e32 v4, v4, v20
-; GFX8-NEXT:    v_and_b32_e32 v20, 0xffff0000, v19
-; GFX8-NEXT:    v_and_b32_e32 v33, 0xffff0000, v3
-; GFX8-NEXT:    v_lshlrev_b32_e32 v19, 16, v19
-; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX8-NEXT:    v_mul_f32_e32 v20, 1.0, v20
-; GFX8-NEXT:    v_mul_f32_e32 v33, 1.0, v33
-; GFX8-NEXT:    v_mul_f32_e32 v19, 1.0, v19
-; GFX8-NEXT:    v_mul_f32_e32 v3, 1.0, v3
-; GFX8-NEXT:    v_max_f32_e32 v20, v33, v20
-; GFX8-NEXT:    v_max_f32_e32 v3, v3, v19
-; GFX8-NEXT:    v_and_b32_e32 v19, 0xffff0000, v18
-; GFX8-NEXT:    v_and_b32_e32 v33, 0xffff0000, v2
-; GFX8-NEXT:    v_lshlrev_b32_e32 v18, 16, v18
-; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX8-NEXT:    v_mul_f32_e32 v19, 1.0, v19
-; GFX8-NEXT:    v_mul_f32_e32 v33, 1.0, v33
-; GFX8-NEXT:    v_mul_f32_e32 v18, 1.0, v18
-; GFX8-NEXT:    v_mul_f32_e32 v2, 1.0, v2
-; GFX8-NEXT:    v_max_f32_e32 v19, v33, v19
-; GFX8-NEXT:    v_max_f32_e32 v2, v2, v18
-; GFX8-NEXT:    v_and_b32_e32 v18, 0xffff0000, v17
-; GFX8-NEXT:    v_and_b32_e32 v33, 0xffff0000, v1
-; GFX8-NEXT:    v_lshlrev_b32_e32 v17, 16, v17
-; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX8-NEXT:    v_mul_f32_e32 v18, 1.0, v18
-; GFX8-NEXT:    v_mul_f32_e32 v33, 1.0, v33
-; GFX8-NEXT:    v_mul_f32_e32 v17, 1.0, v17
-; GFX8-NEXT:    v_mul_f32_e32 v1, 1.0, v1
-; GFX8-NEXT:    v_max_f32_e32 v18, v33, v18
+; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX8-NEXT:    v_alignbit_b32 v0, v0, v31, 16
+; GFX8-NEXT:    v_lshlrev_b32_e32 v16, 16, v17
+; GFX8-NEXT:    v_lshlrev_b32_e32 v31, 16, v1
+; GFX8-NEXT:    v_and_b32_e32 v17, 0xffff0000, v17
+; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX8-NEXT:    v_max_f32_e32 v1, v1, v17
-; GFX8-NEXT:    v_and_b32_e32 v17, 0xffff0000, v16
-; GFX8-NEXT:    v_and_b32_e32 v33, 0xffff0000, v0
-; GFX8-NEXT:    v_lshlrev_b32_e32 v16, 16, v16
-; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX8-NEXT:    v_mul_f32_e32 v17, 1.0, v17
-; GFX8-NEXT:    v_mul_f32_e32 v33, 1.0, v33
-; GFX8-NEXT:    v_mul_f32_e32 v16, 1.0, v16
-; GFX8-NEXT:    v_mul_f32_e32 v0, 1.0, v0
-; GFX8-NEXT:    v_max_f32_e32 v17, v33, v17
-; GFX8-NEXT:    v_max_f32_e32 v0, v0, v16
-; GFX8-NEXT:    v_perm_b32 v0, v0, v17, s4
-; GFX8-NEXT:    v_perm_b32 v1, v1, v18, s4
-; GFX8-NEXT:    v_perm_b32 v2, v2, v19, s4
-; GFX8-NEXT:    v_perm_b32 v3, v3, v20, s4
-; GFX8-NEXT:    v_perm_b32 v4, v4, v21, s4
-; GFX8-NEXT:    v_perm_b32 v5, v5, v22, s4
-; GFX8-NEXT:    v_perm_b32 v6, v6, v23, s4
-; GFX8-NEXT:    v_perm_b32 v7, v7, v24, s4
-; GFX8-NEXT:    v_perm_b32 v8, v8, v25, s4
-; GFX8-NEXT:    v_perm_b32 v9, v9, v26, s4
-; GFX8-NEXT:    v_perm_b32 v10, v10, v32, s4
-; GFX8-NEXT:    v_perm_b32 v15, v15, v27, s4
+; GFX8-NEXT:    v_max_f32_e32 v16, v31, v16
+; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX8-NEXT:    v_alignbit_b32 v1, v1, v16, 16
+; GFX8-NEXT:    v_lshlrev_b32_e32 v16, 16, v18
+; GFX8-NEXT:    v_lshlrev_b32_e32 v17, 16, v2
+; GFX8-NEXT:    v_max_f32_e32 v16, v17, v16
+; GFX8-NEXT:    v_and_b32_e32 v17, 0xffff0000, v18
+; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX8-NEXT:    v_max_f32_e32 v2, v2, v17
+; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX8-NEXT:    v_alignbit_b32 v2, v2, v16, 16
+; GFX8-NEXT:    v_lshlrev_b32_e32 v16, 16, v19
+; GFX8-NEXT:    v_lshlrev_b32_e32 v17, 16, v3
+; GFX8-NEXT:    v_max_f32_e32 v16, v17, v16
+; GFX8-NEXT:    v_and_b32_e32 v17, 0xffff0000, v19
+; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX8-NEXT:    v_max_f32_e32 v3, v3, v17
+; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX8-NEXT:    v_alignbit_b32 v3, v3, v16, 16
+; GFX8-NEXT:    v_lshlrev_b32_e32 v16, 16, v20
+; GFX8-NEXT:    v_lshlrev_b32_e32 v17, 16, v4
+; GFX8-NEXT:    v_max_f32_e32 v16, v17, v16
+; GFX8-NEXT:    v_and_b32_e32 v17, 0xffff0000, v20
+; GFX8-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GFX8-NEXT:    v_max_f32_e32 v4, v4, v17
+; GFX8-NEXT:    buffer_load_dword v17, off, s[0:3], s32
+; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
+; GFX8-NEXT:    v_alignbit_b32 v4, v4, v16, 16
+; GFX8-NEXT:    v_lshlrev_b32_e32 v16, 16, v21
+; GFX8-NEXT:    v_lshlrev_b32_e32 v18, 16, v5
+; GFX8-NEXT:    v_max_f32_e32 v16, v18, v16
+; GFX8-NEXT:    v_and_b32_e32 v18, 0xffff0000, v21
+; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
+; GFX8-NEXT:    v_max_f32_e32 v5, v5, v18
+; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
+; GFX8-NEXT:    v_alignbit_b32 v5, v5, v16, 16
+; GFX8-NEXT:    v_lshlrev_b32_e32 v16, 16, v22
+; GFX8-NEXT:    v_lshlrev_b32_e32 v18, 16, v6
+; GFX8-NEXT:    v_max_f32_e32 v16, v18, v16
+; GFX8-NEXT:    v_and_b32_e32 v18, 0xffff0000, v22
+; GFX8-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
+; GFX8-NEXT:    v_max_f32_e32 v6, v6, v18
+; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
+; GFX8-NEXT:    v_alignbit_b32 v6, v6, v16, 16
+; GFX8-NEXT:    v_lshlrev_b32_e32 v16, 16, v23
+; GFX8-NEXT:    v_lshlrev_b32_e32 v18, 16, v7
+; GFX8-NEXT:    v_max_f32_e32 v16, v18, v16
+; GFX8-NEXT:    v_and_b32_e32 v18, 0xffff0000, v23
+; GFX8-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
+; GFX8-NEXT:    v_max_f32_e32 v7, v7, v18
+; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
+; GFX8-NEXT:    v_alignbit_b32 v7, v7, v16, 16
+; GFX8-NEXT:    v_lshlrev_b32_e32 v16, 16, v24
+; GFX8-NEXT:    v_lshlrev_b32_e32 v18, 16, v8
+; GFX8-NEXT:    v_max_f32_e32 v16, v18, v16
+; GFX8-NEXT:    v_and_b32_e32 v18, 0xffff0000, v24
+; GFX8-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
+; GFX8-NEXT:    v_max_f32_e32 v8, v8, v18
+; GFX8-NEXT:    v_lshrrev_b32_e32 v8, 16, v8
+; GFX8-NEXT:    v_alignbit_b32 v8, v8, v16, 16
+; GFX8-NEXT:    v_lshlrev_b32_e32 v16, 16, v25
+; GFX8-NEXT:    v_lshlrev_b32_e32 v18, 16, v9
+; GFX8-NEXT:    v_max_f32_e32 v16, v18, v16
+; GFX8-NEXT:    v_and_b32_e32 v18, 0xffff0000, v25
+; GFX8-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
+; GFX8-NEXT:    v_max_f32_e32 v9, v9, v18
+; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 16, v9
+; GFX8-NEXT:    v_alignbit_b32 v9, v9, v16, 16
+; GFX8-NEXT:    v_lshlrev_b32_e32 v16, 16, v26
+; GFX8-NEXT:    v_lshlrev_b32_e32 v18, 16, v10
+; GFX8-NEXT:    v_max_f32_e32 v16, v18, v16
+; GFX8-NEXT:    v_and_b32_e32 v18, 0xffff0000, v26
+; GFX8-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
+; GFX8-NEXT:    v_max_f32_e32 v10, v10, v18
+; GFX8-NEXT:    v_lshrrev_b32_e32 v10, 16, v10
+; GFX8-NEXT:    v_alignbit_b32 v10, v10, v16, 16
+; GFX8-NEXT:    v_lshlrev_b32_e32 v16, 16, v27
+; GFX8-NEXT:    v_lshlrev_b32_e32 v18, 16, v11
+; GFX8-NEXT:    v_max_f32_e32 v16, v18, v16
+; GFX8-NEXT:    v_and_b32_e32 v18, 0xffff0000, v27
+; GFX8-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
+; GFX8-NEXT:    v_max_f32_e32 v11, v11, v18
+; GFX8-NEXT:    v_lshrrev_b32_e32 v11, 16, v11
+; GFX8-NEXT:    v_alignbit_b32 v11, v11, v16, 16
+; GFX8-NEXT:    v_lshlrev_b32_e32 v16, 16, v28
+; GFX8-NEXT:    v_lshlrev_b32_e32 v18, 16, v12
+; GFX8-NEXT:    v_max_f32_e32 v16, v18, v16
+; GFX8-NEXT:    v_and_b32_e32 v18, 0xffff0000, v28
+; GFX8-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
+; GFX8-NEXT:    v_max_f32_e32 v12, v12, v18
+; GFX8-NEXT:    v_lshrrev_b32_e32 v12, 16, v12
+; GFX8-NEXT:    v_alignbit_b32 v12, v12, v16, 16
+; GFX8-NEXT:    v_lshlrev_b32_e32 v16, 16, v29
+; GFX8-NEXT:    v_lshlrev_b32_e32 v18, 16, v13
+; GFX8-NEXT:    v_max_f32_e32 v16, v18, v16
+; GFX8-NEXT:    v_and_b32_e32 v18, 0xffff0000, v29
+; GFX8-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
+; GFX8-NEXT:    v_max_f32_e32 v13, v13, v18
+; GFX8-NEXT:    v_lshrrev_b32_e32 v13, 16, v13
+; GFX8-NEXT:    v_alignbit_b32 v13, v13, v16, 16
+; GFX8-NEXT:    v_lshlrev_b32_e32 v16, 16, v30
+; GFX8-NEXT:    v_lshlrev_b32_e32 v18, 16, v14
+; GFX8-NEXT:    v_max_f32_e32 v16, v18, v16
+; GFX8-NEXT:    v_and_b32_e32 v18, 0xffff0000, v30
+; GFX8-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
+; GFX8-NEXT:    v_max_f32_e32 v14, v14, v18
+; GFX8-NEXT:    v_lshrrev_b32_e32 v14, 16, v14
+; GFX8-NEXT:    v_alignbit_b32 v14, v14, v16, 16
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_lshlrev_b32_e32 v16, 16, v17
+; GFX8-NEXT:    v_lshlrev_b32_e32 v18, 16, v15
+; GFX8-NEXT:    v_and_b32_e32 v17, 0xffff0000, v17
+; GFX8-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
+; GFX8-NEXT:    v_max_f32_e32 v15, v15, v17
+; GFX8-NEXT:    v_max_f32_e32 v16, v18, v16
+; GFX8-NEXT:    v_lshrrev_b32_e32 v15, 16, v15
+; GFX8-NEXT:    v_alignbit_b32 v15, v15, v16, 16
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_maxnum_v32bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    buffer_load_dword v35, off, s[0:3], s32
-; GFX9-NEXT:    v_and_b32_e32 v31, 0xffff0000, v30
-; GFX9-NEXT:    v_and_b32_e32 v32, 0xffff0000, v14
-; GFX9-NEXT:    v_and_b32_e32 v33, 0xffff0000, v29
-; GFX9-NEXT:    v_and_b32_e32 v34, 0xffff0000, v13
-; GFX9-NEXT:    v_and_b32_e32 v36, 0xffff0000, v28
-; GFX9-NEXT:    v_and_b32_e32 v37, 0xffff0000, v12
-; GFX9-NEXT:    v_and_b32_e32 v50, 0xffff0000, v25
-; GFX9-NEXT:    v_and_b32_e32 v51, 0xffff0000, v9
-; GFX9-NEXT:    v_max_f32_e32 v31, v31, v31
-; GFX9-NEXT:    v_max_f32_e32 v32, v32, v32
-; GFX9-NEXT:    v_max_f32_e32 v33, v33, v33
-; GFX9-NEXT:    v_max_f32_e32 v34, v34, v34
-; GFX9-NEXT:    v_max_f32_e32 v36, v36, v36
-; GFX9-NEXT:    v_max_f32_e32 v37, v37, v37
-; GFX9-NEXT:    v_max_f32_e32 v50, v50, v50
-; GFX9-NEXT:    v_max_f32_e32 v51, v51, v51
-; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; GFX9-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; GFX9-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; GFX9-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
-; GFX9-NEXT:    v_and_b32_e32 v38, 0xffff0000, v27
-; GFX9-NEXT:    v_and_b32_e32 v39, 0xffff0000, v11
-; GFX9-NEXT:    v_and_b32_e32 v52, 0xffff0000, v24
-; GFX9-NEXT:    v_and_b32_e32 v53, 0xffff0000, v8
-; GFX9-NEXT:    v_and_b32_e32 v43, 0xffff0000, v21
+; GFX9-NEXT:    v_lshlrev_b32_e32 v31, 16, v16
+; GFX9-NEXT:    v_lshlrev_b32_e32 v32, 16, v0
+; GFX9-NEXT:    v_and_b32_e32 v16, 0xffff0000, v16
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GFX9-NEXT:    v_max_f32_e32 v31, v32, v31
-; GFX9-NEXT:    v_max_f32_e32 v32, v34, v33
-; GFX9-NEXT:    v_max_f32_e32 v33, v37, v36
-; GFX9-NEXT:    v_max_f32_e32 v37, v51, v50
-; GFX9-NEXT:    v_and_b32_e32 v51, 0xffff0000, v5
-; GFX9-NEXT:    v_max_f32_e32 v38, v38, v38
-; GFX9-NEXT:    v_max_f32_e32 v39, v39, v39
-; GFX9-NEXT:    v_max_f32_e32 v52, v52, v52
-; GFX9-NEXT:    v_max_f32_e32 v53, v53, v53
-; GFX9-NEXT:    v_max_f32_e32 v50, v43, v43
-; GFX9-NEXT:    v_max_f32_e32 v51, v51, v51
-; GFX9-NEXT:    v_max_f32_e32 v34, v39, v38
-; GFX9-NEXT:    v_max_f32_e32 v38, v53, v52
-; GFX9-NEXT:    v_max_f32_e32 v50, v51, v50
-; GFX9-NEXT:    v_and_b32_e32 v51, 0xffff0000, v20
-; GFX9-NEXT:    v_and_b32_e32 v52, 0xffff0000, v4
-; GFX9-NEXT:    v_max_f32_e32 v51, v51, v51
-; GFX9-NEXT:    v_max_f32_e32 v52, v52, v52
-; GFX9-NEXT:    v_and_b32_e32 v54, 0xffff0000, v23
-; GFX9-NEXT:    v_and_b32_e32 v55, 0xffff0000, v7
-; GFX9-NEXT:    v_max_f32_e32 v51, v52, v51
-; GFX9-NEXT:    v_and_b32_e32 v52, 0xffff0000, v19
-; GFX9-NEXT:    v_and_b32_e32 v53, 0xffff0000, v3
-; GFX9-NEXT:    v_max_f32_e32 v54, v54, v54
-; GFX9-NEXT:    v_max_f32_e32 v55, v55, v55
-; GFX9-NEXT:    v_max_f32_e32 v52, v52, v52
-; GFX9-NEXT:    v_max_f32_e32 v53, v53, v53
-; GFX9-NEXT:    v_max_f32_e32 v39, v55, v54
-; GFX9-NEXT:    v_max_f32_e32 v52, v53, v52
-; GFX9-NEXT:    v_and_b32_e32 v53, 0xffff0000, v18
-; GFX9-NEXT:    v_and_b32_e32 v54, 0xffff0000, v2
-; GFX9-NEXT:    v_max_f32_e32 v53, v53, v53
-; GFX9-NEXT:    v_max_f32_e32 v54, v54, v54
-; GFX9-NEXT:    v_and_b32_e32 v48, 0xffff0000, v26
-; GFX9-NEXT:    v_and_b32_e32 v49, 0xffff0000, v10
-; GFX9-NEXT:    v_and_b32_e32 v40, 0xffff0000, v22
-; GFX9-NEXT:    v_and_b32_e32 v41, 0xffff0000, v6
-; GFX9-NEXT:    v_max_f32_e32 v53, v54, v53
-; GFX9-NEXT:    v_and_b32_e32 v54, 0xffff0000, v17
-; GFX9-NEXT:    v_and_b32_e32 v55, 0xffff0000, v1
-; GFX9-NEXT:    v_max_f32_e32 v48, v48, v48
-; GFX9-NEXT:    v_max_f32_e32 v49, v49, v49
-; GFX9-NEXT:    v_max_f32_e32 v40, v40, v40
-; GFX9-NEXT:    v_max_f32_e32 v41, v41, v41
-; GFX9-NEXT:    v_max_f32_e32 v54, v54, v54
-; GFX9-NEXT:    v_max_f32_e32 v55, v55, v55
-; GFX9-NEXT:    v_and_b32_e32 v42, 0xffff0000, v15
-; GFX9-NEXT:    v_max_f32_e32 v36, v49, v48
-; GFX9-NEXT:    v_max_f32_e32 v48, v41, v40
-; GFX9-NEXT:    v_max_f32_e32 v54, v55, v54
-; GFX9-NEXT:    v_and_b32_e32 v55, 0xffff0000, v16
-; GFX9-NEXT:    v_and_b32_e32 v40, 0xffff0000, v0
-; GFX9-NEXT:    v_max_f32_e32 v42, v42, v42
-; GFX9-NEXT:    v_max_f32_e32 v55, v55, v55
-; GFX9-NEXT:    v_max_f32_e32 v40, v40, v40
-; GFX9-NEXT:    v_max_f32_e32 v55, v40, v55
-; GFX9-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
-; GFX9-NEXT:    s_waitcnt vmcnt(4)
-; GFX9-NEXT:    v_and_b32_e32 v49, 0xffff0000, v35
-; GFX9-NEXT:    v_max_f32_e32 v49, v49, v49
-; GFX9-NEXT:    v_max_f32_e32 v49, v42, v49
-; GFX9-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; GFX9-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
-; GFX9-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
-; GFX9-NEXT:    v_lshlrev_b32_e32 v35, 16, v35
-; GFX9-NEXT:    v_lshlrev_b32_e32 v30, 16, v30
-; GFX9-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
-; GFX9-NEXT:    v_lshlrev_b32_e32 v29, 16, v29
-; GFX9-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
-; GFX9-NEXT:    v_lshlrev_b32_e32 v28, 16, v28
-; GFX9-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
-; GFX9-NEXT:    v_lshlrev_b32_e32 v27, 16, v27
-; GFX9-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
-; GFX9-NEXT:    v_lshlrev_b32_e32 v26, 16, v26
-; GFX9-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
-; GFX9-NEXT:    v_lshlrev_b32_e32 v25, 16, v25
-; GFX9-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
-; GFX9-NEXT:    v_lshlrev_b32_e32 v24, 16, v24
-; GFX9-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
-; GFX9-NEXT:    v_lshlrev_b32_e32 v23, 16, v23
-; GFX9-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
-; GFX9-NEXT:    v_lshlrev_b32_e32 v22, 16, v22
-; GFX9-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
-; GFX9-NEXT:    v_lshlrev_b32_e32 v21, 16, v21
-; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX9-NEXT:    v_lshlrev_b32_e32 v20, 16, v20
-; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX9-NEXT:    v_lshlrev_b32_e32 v19, 16, v19
-; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX9-NEXT:    v_lshlrev_b32_e32 v18, 16, v18
-; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v17
-; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX9-NEXT:    v_lshlrev_b32_e32 v16, 16, v16
-; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX9-NEXT:    v_max_f32_e32 v35, v35, v35
-; GFX9-NEXT:    v_max_f32_e32 v15, v15, v15
-; GFX9-NEXT:    v_max_f32_e32 v30, v30, v30
-; GFX9-NEXT:    v_max_f32_e32 v14, v14, v14
-; GFX9-NEXT:    v_max_f32_e32 v29, v29, v29
-; GFX9-NEXT:    v_max_f32_e32 v13, v13, v13
-; GFX9-NEXT:    v_max_f32_e32 v28, v28, v28
-; GFX9-NEXT:    v_max_f32_e32 v12, v12, v12
-; GFX9-NEXT:    v_max_f32_e32 v27, v27, v27
-; GFX9-NEXT:    v_max_f32_e32 v11, v11, v11
-; GFX9-NEXT:    v_max_f32_e32 v26, v26, v26
-; GFX9-NEXT:    v_max_f32_e32 v10, v10, v10
-; GFX9-NEXT:    v_max_f32_e32 v25, v25, v25
-; GFX9-NEXT:    v_max_f32_e32 v9, v9, v9
-; GFX9-NEXT:    v_max_f32_e32 v24, v24, v24
-; GFX9-NEXT:    v_max_f32_e32 v8, v8, v8
-; GFX9-NEXT:    v_max_f32_e32 v23, v23, v23
-; GFX9-NEXT:    v_max_f32_e32 v7, v7, v7
-; GFX9-NEXT:    v_max_f32_e32 v22, v22, v22
-; GFX9-NEXT:    v_max_f32_e32 v6, v6, v6
-; GFX9-NEXT:    v_max_f32_e32 v21, v21, v21
-; GFX9-NEXT:    v_max_f32_e32 v5, v5, v5
-; GFX9-NEXT:    v_max_f32_e32 v20, v20, v20
-; GFX9-NEXT:    v_max_f32_e32 v4, v4, v4
-; GFX9-NEXT:    v_max_f32_e32 v19, v19, v19
-; GFX9-NEXT:    v_max_f32_e32 v3, v3, v3
-; GFX9-NEXT:    v_max_f32_e32 v18, v18, v18
-; GFX9-NEXT:    v_max_f32_e32 v2, v2, v2
-; GFX9-NEXT:    v_max_f32_e32 v17, v17, v17
-; GFX9-NEXT:    v_max_f32_e32 v1, v1, v1
-; GFX9-NEXT:    v_max_f32_e32 v16, v16, v16
-; GFX9-NEXT:    v_max_f32_e32 v0, v0, v0
-; GFX9-NEXT:    v_max_f32_e32 v15, v15, v35
-; GFX9-NEXT:    v_max_f32_e32 v14, v14, v30
-; GFX9-NEXT:    v_max_f32_e32 v13, v13, v29
-; GFX9-NEXT:    v_max_f32_e32 v12, v12, v28
-; GFX9-NEXT:    v_max_f32_e32 v11, v11, v27
-; GFX9-NEXT:    v_max_f32_e32 v10, v10, v26
-; GFX9-NEXT:    v_max_f32_e32 v9, v9, v25
-; GFX9-NEXT:    v_max_f32_e32 v8, v8, v24
-; GFX9-NEXT:    v_max_f32_e32 v7, v7, v23
-; GFX9-NEXT:    v_max_f32_e32 v6, v6, v22
-; GFX9-NEXT:    v_max_f32_e32 v5, v5, v21
-; GFX9-NEXT:    v_max_f32_e32 v4, v4, v20
-; GFX9-NEXT:    v_max_f32_e32 v3, v3, v19
-; GFX9-NEXT:    v_max_f32_e32 v2, v2, v18
-; GFX9-NEXT:    v_max_f32_e32 v1, v1, v17
 ; GFX9-NEXT:    v_max_f32_e32 v0, v0, v16
-; GFX9-NEXT:    s_mov_b32 s4, 0x3020706
-; GFX9-NEXT:    v_perm_b32 v0, v0, v55, s4
-; GFX9-NEXT:    v_perm_b32 v1, v1, v54, s4
-; GFX9-NEXT:    v_perm_b32 v2, v2, v53, s4
-; GFX9-NEXT:    v_perm_b32 v3, v3, v52, s4
-; GFX9-NEXT:    v_perm_b32 v4, v4, v51, s4
-; GFX9-NEXT:    v_perm_b32 v5, v5, v50, s4
-; GFX9-NEXT:    v_perm_b32 v6, v6, v48, s4
-; GFX9-NEXT:    v_perm_b32 v7, v7, v39, s4
-; GFX9-NEXT:    v_perm_b32 v8, v8, v38, s4
-; GFX9-NEXT:    v_perm_b32 v9, v9, v37, s4
-; GFX9-NEXT:    v_perm_b32 v10, v10, v36, s4
-; GFX9-NEXT:    v_perm_b32 v11, v11, v34, s4
-; GFX9-NEXT:    v_perm_b32 v12, v12, v33, s4
-; GFX9-NEXT:    v_perm_b32 v13, v13, v32, s4
-; GFX9-NEXT:    v_perm_b32 v14, v14, v31, s4
-; GFX9-NEXT:    v_perm_b32 v15, v15, v49, s4
+; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
+; GFX9-NEXT:    v_perm_b32 v0, v0, v31, s4
+; GFX9-NEXT:    v_lshlrev_b32_e32 v16, 16, v17
+; GFX9-NEXT:    v_lshlrev_b32_e32 v31, 16, v1
+; GFX9-NEXT:    v_and_b32_e32 v17, 0xffff0000, v17
+; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX9-NEXT:    v_max_f32_e32 v16, v31, v16
+; GFX9-NEXT:    v_max_f32_e32 v1, v1, v17
+; GFX9-NEXT:    v_perm_b32 v1, v1, v16, s4
+; GFX9-NEXT:    v_lshlrev_b32_e32 v16, 16, v18
+; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v2
+; GFX9-NEXT:    v_max_f32_e32 v16, v17, v16
+; GFX9-NEXT:    v_and_b32_e32 v17, 0xffff0000, v18
+; GFX9-NEXT:    buffer_load_dword v18, off, s[0:3], s32
+; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX9-NEXT:    v_max_f32_e32 v2, v2, v17
+; GFX9-NEXT:    v_perm_b32 v2, v2, v16, s4
+; GFX9-NEXT:    v_lshlrev_b32_e32 v16, 16, v19
+; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v3
+; GFX9-NEXT:    v_max_f32_e32 v16, v17, v16
+; GFX9-NEXT:    v_and_b32_e32 v17, 0xffff0000, v19
+; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX9-NEXT:    v_max_f32_e32 v3, v3, v17
+; GFX9-NEXT:    v_perm_b32 v3, v3, v16, s4
+; GFX9-NEXT:    v_lshlrev_b32_e32 v16, 16, v20
+; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v4
+; GFX9-NEXT:    v_max_f32_e32 v16, v17, v16
+; GFX9-NEXT:    v_and_b32_e32 v17, 0xffff0000, v20
+; GFX9-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GFX9-NEXT:    v_max_f32_e32 v4, v4, v17
+; GFX9-NEXT:    v_perm_b32 v4, v4, v16, s4
+; GFX9-NEXT:    v_lshlrev_b32_e32 v16, 16, v21
+; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v5
+; GFX9-NEXT:    v_max_f32_e32 v16, v17, v16
+; GFX9-NEXT:    v_and_b32_e32 v17, 0xffff0000, v21
+; GFX9-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
+; GFX9-NEXT:    v_max_f32_e32 v5, v5, v17
+; GFX9-NEXT:    v_perm_b32 v5, v5, v16, s4
+; GFX9-NEXT:    v_lshlrev_b32_e32 v16, 16, v22
+; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v6
+; GFX9-NEXT:    v_max_f32_e32 v16, v17, v16
+; GFX9-NEXT:    v_and_b32_e32 v17, 0xffff0000, v22
+; GFX9-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
+; GFX9-NEXT:    v_max_f32_e32 v6, v6, v17
+; GFX9-NEXT:    v_perm_b32 v6, v6, v16, s4
+; GFX9-NEXT:    v_lshlrev_b32_e32 v16, 16, v23
+; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v7
+; GFX9-NEXT:    v_max_f32_e32 v16, v17, v16
+; GFX9-NEXT:    v_and_b32_e32 v17, 0xffff0000, v23
+; GFX9-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
+; GFX9-NEXT:    v_max_f32_e32 v7, v7, v17
+; GFX9-NEXT:    v_perm_b32 v7, v7, v16, s4
+; GFX9-NEXT:    v_lshlrev_b32_e32 v16, 16, v24
+; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v8
+; GFX9-NEXT:    v_max_f32_e32 v16, v17, v16
+; GFX9-NEXT:    v_and_b32_e32 v17, 0xffff0000, v24
+; GFX9-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
+; GFX9-NEXT:    v_max_f32_e32 v8, v8, v17
+; GFX9-NEXT:    v_perm_b32 v8, v8, v16, s4
+; GFX9-NEXT:    v_lshlrev_b32_e32 v16, 16, v25
+; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v9
+; GFX9-NEXT:    v_max_f32_e32 v16, v17, v16
+; GFX9-NEXT:    v_and_b32_e32 v17, 0xffff0000, v25
+; GFX9-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
+; GFX9-NEXT:    v_max_f32_e32 v9, v9, v17
+; GFX9-NEXT:    v_perm_b32 v9, v9, v16, s4
+; GFX9-NEXT:    v_lshlrev_b32_e32 v16, 16, v26
+; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v10
+; GFX9-NEXT:    v_max_f32_e32 v16, v17, v16
+; GFX9-NEXT:    v_and_b32_e32 v17, 0xffff0000, v26
+; GFX9-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
+; GFX9-NEXT:    v_max_f32_e32 v10, v10, v17
+; GFX9-NEXT:    v_perm_b32 v10, v10, v16, s4
+; GFX9-NEXT:    v_lshlrev_b32_e32 v16, 16, v27
+; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v11
+; GFX9-NEXT:    v_max_f32_e32 v16, v17, v16
+; GFX9-NEXT:    v_and_b32_e32 v17, 0xffff0000, v27
+; GFX9-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
+; GFX9-NEXT:    v_max_f32_e32 v11, v11, v17
+; GFX9-NEXT:    v_perm_b32 v11, v11, v16, s4
+; GFX9-NEXT:    v_lshlrev_b32_e32 v16, 16, v28
+; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v12
+; GFX9-NEXT:    v_max_f32_e32 v16, v17, v16
+; GFX9-NEXT:    v_and_b32_e32 v17, 0xffff0000, v28
+; GFX9-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
+; GFX9-NEXT:    v_max_f32_e32 v12, v12, v17
+; GFX9-NEXT:    v_perm_b32 v12, v12, v16, s4
+; GFX9-NEXT:    v_lshlrev_b32_e32 v16, 16, v29
+; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v13
+; GFX9-NEXT:    v_max_f32_e32 v16, v17, v16
+; GFX9-NEXT:    v_and_b32_e32 v17, 0xffff0000, v29
+; GFX9-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
+; GFX9-NEXT:    v_max_f32_e32 v13, v13, v17
+; GFX9-NEXT:    v_perm_b32 v13, v13, v16, s4
+; GFX9-NEXT:    v_lshlrev_b32_e32 v16, 16, v30
+; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v14
+; GFX9-NEXT:    v_max_f32_e32 v16, v17, v16
+; GFX9-NEXT:    v_and_b32_e32 v17, 0xffff0000, v30
+; GFX9-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
+; GFX9-NEXT:    v_max_f32_e32 v14, v14, v17
+; GFX9-NEXT:    v_perm_b32 v14, v14, v16, s4
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_lshlrev_b32_e32 v16, 16, v18
+; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v15
+; GFX9-NEXT:    v_max_f32_e32 v16, v17, v16
+; GFX9-NEXT:    v_and_b32_e32 v17, 0xffff0000, v18
+; GFX9-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
+; GFX9-NEXT:    v_max_f32_e32 v15, v15, v17
+; GFX9-NEXT:    v_perm_b32 v15, v15, v16, s4
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_maxnum_v32bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    buffer_load_dword v31, off, s[0:3], s32
-; GFX10-NEXT:    v_and_b32_e32 v53, 0xffff0000, v24
-; GFX10-NEXT:    v_and_b32_e32 v54, 0xffff0000, v8
-; GFX10-NEXT:    v_and_b32_e32 v55, 0xffff0000, v23
-; GFX10-NEXT:    v_and_b32_e32 v64, 0xffff0000, v7
-; GFX10-NEXT:    v_and_b32_e32 v65, 0xffff0000, v22
-; GFX10-NEXT:    v_and_b32_e32 v66, 0xffff0000, v6
-; GFX10-NEXT:    v_and_b32_e32 v67, 0xffff0000, v21
-; GFX10-NEXT:    v_and_b32_e32 v68, 0xffff0000, v5
-; GFX10-NEXT:    v_max_f32_e32 v53, v53, v53
-; GFX10-NEXT:    v_max_f32_e32 v54, v54, v54
-; GFX10-NEXT:    v_max_f32_e32 v55, v55, v55
-; GFX10-NEXT:    v_max_f32_e32 v64, v64, v64
-; GFX10-NEXT:    v_max_f32_e32 v65, v65, v65
-; GFX10-NEXT:    v_max_f32_e32 v66, v66, v66
-; GFX10-NEXT:    v_max_f32_e32 v67, v67, v67
-; GFX10-NEXT:    v_max_f32_e32 v68, v68, v68
-; GFX10-NEXT:    v_and_b32_e32 v32, 0xffff0000, v30
-; GFX10-NEXT:    v_and_b32_e32 v34, 0xffff0000, v14
-; GFX10-NEXT:    v_and_b32_e32 v35, 0xffff0000, v29
-; GFX10-NEXT:    v_and_b32_e32 v36, 0xffff0000, v13
-; GFX10-NEXT:    v_and_b32_e32 v37, 0xffff0000, v28
-; GFX10-NEXT:    v_and_b32_e32 v38, 0xffff0000, v12
-; GFX10-NEXT:    v_and_b32_e32 v39, 0xffff0000, v27
-; GFX10-NEXT:    v_and_b32_e32 v48, 0xffff0000, v11
-; GFX10-NEXT:    v_and_b32_e32 v49, 0xffff0000, v26
-; GFX10-NEXT:    v_and_b32_e32 v50, 0xffff0000, v10
-; GFX10-NEXT:    v_and_b32_e32 v51, 0xffff0000, v25
-; GFX10-NEXT:    v_and_b32_e32 v52, 0xffff0000, v9
-; GFX10-NEXT:    v_max_f32_e32 v53, v54, v53
-; GFX10-NEXT:    v_and_b32_e32 v54, 0xffff0000, v17
-; GFX10-NEXT:    v_max_f32_e32 v55, v64, v55
-; GFX10-NEXT:    v_and_b32_e32 v64, 0xffff0000, v1
-; GFX10-NEXT:    v_max_f32_e32 v65, v66, v65
-; GFX10-NEXT:    v_and_b32_e32 v66, 0xffff0000, v16
-; GFX10-NEXT:    v_max_f32_e32 v67, v68, v67
-; GFX10-NEXT:    v_and_b32_e32 v68, 0xffff0000, v0
-; GFX10-NEXT:    v_lshlrev_b32_e32 v17, 16, v17
-; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX10-NEXT:    v_lshlrev_b32_e32 v16, 16, v16
-; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX10-NEXT:    v_max_f32_e32 v32, v32, v32
-; GFX10-NEXT:    v_max_f32_e32 v34, v34, v34
-; GFX10-NEXT:    v_max_f32_e32 v35, v35, v35
-; GFX10-NEXT:    v_max_f32_e32 v36, v36, v36
-; GFX10-NEXT:    v_max_f32_e32 v37, v37, v37
-; GFX10-NEXT:    v_max_f32_e32 v38, v38, v38
-; GFX10-NEXT:    v_max_f32_e32 v39, v39, v39
-; GFX10-NEXT:    v_max_f32_e32 v48, v48, v48
-; GFX10-NEXT:    v_max_f32_e32 v49, v49, v49
-; GFX10-NEXT:    v_max_f32_e32 v50, v50, v50
-; GFX10-NEXT:    v_max_f32_e32 v51, v51, v51
-; GFX10-NEXT:    v_max_f32_e32 v52, v52, v52
-; GFX10-NEXT:    v_max_f32_e32 v17, v17, v17
-; GFX10-NEXT:    v_max_f32_e32 v1, v1, v1
-; GFX10-NEXT:    v_max_f32_e32 v16, v16, v16
-; GFX10-NEXT:    v_max_f32_e32 v0, v0, v0
-; GFX10-NEXT:    v_and_b32_e32 v33, 0xffff0000, v15
-; GFX10-NEXT:    v_max_f32_e32 v32, v34, v32
-; GFX10-NEXT:    v_and_b32_e32 v34, 0xffff0000, v20
-; GFX10-NEXT:    v_max_f32_e32 v35, v36, v35
-; GFX10-NEXT:    v_and_b32_e32 v36, 0xffff0000, v4
-; GFX10-NEXT:    v_max_f32_e32 v37, v38, v37
-; GFX10-NEXT:    v_and_b32_e32 v38, 0xffff0000, v19
-; GFX10-NEXT:    v_max_f32_e32 v39, v48, v39
-; GFX10-NEXT:    v_and_b32_e32 v48, 0xffff0000, v3
-; GFX10-NEXT:    v_max_f32_e32 v49, v50, v49
-; GFX10-NEXT:    v_and_b32_e32 v50, 0xffff0000, v18
-; GFX10-NEXT:    v_max_f32_e32 v51, v52, v51
-; GFX10-NEXT:    v_and_b32_e32 v52, 0xffff0000, v2
-; GFX10-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
-; GFX10-NEXT:    v_lshlrev_b32_e32 v30, 16, v30
-; GFX10-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
-; GFX10-NEXT:    v_lshlrev_b32_e32 v29, 16, v29
-; GFX10-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
-; GFX10-NEXT:    v_lshlrev_b32_e32 v28, 16, v28
-; GFX10-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
-; GFX10-NEXT:    v_lshlrev_b32_e32 v27, 16, v27
-; GFX10-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
-; GFX10-NEXT:    v_lshlrev_b32_e32 v26, 16, v26
-; GFX10-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
-; GFX10-NEXT:    v_lshlrev_b32_e32 v25, 16, v25
-; GFX10-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
-; GFX10-NEXT:    v_lshlrev_b32_e32 v24, 16, v24
-; GFX10-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
-; GFX10-NEXT:    v_lshlrev_b32_e32 v23, 16, v23
-; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
-; GFX10-NEXT:    v_lshlrev_b32_e32 v22, 16, v22
-; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
-; GFX10-NEXT:    v_lshlrev_b32_e32 v21, 16, v21
-; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX10-NEXT:    v_lshlrev_b32_e32 v20, 16, v20
-; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX10-NEXT:    v_lshlrev_b32_e32 v19, 16, v19
-; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX10-NEXT:    v_lshlrev_b32_e32 v18, 16, v18
-; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX10-NEXT:    v_lshlrev_b32_e32 v50, 16, v21
+; GFX10-NEXT:    v_lshlrev_b32_e32 v51, 16, v5
+; GFX10-NEXT:    v_and_b32_e32 v21, 0xffff0000, v21
+; GFX10-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
+; GFX10-NEXT:    v_lshlrev_b32_e32 v52, 16, v22
+; GFX10-NEXT:    v_lshlrev_b32_e32 v53, 16, v6
+; GFX10-NEXT:    v_and_b32_e32 v22, 0xffff0000, v22
+; GFX10-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
+; GFX10-NEXT:    v_lshlrev_b32_e32 v54, 16, v23
+; GFX10-NEXT:    v_lshlrev_b32_e32 v55, 16, v7
+; GFX10-NEXT:    v_and_b32_e32 v23, 0xffff0000, v23
+; GFX10-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
+; GFX10-NEXT:    v_lshlrev_b32_e32 v32, 16, v16
+; GFX10-NEXT:    v_lshlrev_b32_e32 v33, 16, v0
+; GFX10-NEXT:    v_and_b32_e32 v16, 0xffff0000, v16
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v34, 16, v17
+; GFX10-NEXT:    v_lshlrev_b32_e32 v35, 16, v1
+; GFX10-NEXT:    v_and_b32_e32 v17, 0xffff0000, v17
+; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX10-NEXT:    v_lshlrev_b32_e32 v36, 16, v18
+; GFX10-NEXT:    v_lshlrev_b32_e32 v37, 16, v2
+; GFX10-NEXT:    v_and_b32_e32 v18, 0xffff0000, v18
+; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX10-NEXT:    v_lshlrev_b32_e32 v38, 16, v19
+; GFX10-NEXT:    v_lshlrev_b32_e32 v39, 16, v3
+; GFX10-NEXT:    v_and_b32_e32 v19, 0xffff0000, v19
+; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX10-NEXT:    v_lshlrev_b32_e32 v48, 16, v20
+; GFX10-NEXT:    v_lshlrev_b32_e32 v49, 16, v4
+; GFX10-NEXT:    v_and_b32_e32 v20, 0xffff0000, v20
+; GFX10-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GFX10-NEXT:    v_max_f32_e32 v5, v5, v21
+; GFX10-NEXT:    v_max_f32_e32 v21, v53, v52
+; GFX10-NEXT:    v_max_f32_e32 v6, v6, v22
+; GFX10-NEXT:    v_max_f32_e32 v22, v55, v54
+; GFX10-NEXT:    v_max_f32_e32 v7, v7, v23
+; GFX10-NEXT:    v_lshlrev_b32_e32 v64, 16, v24
+; GFX10-NEXT:    v_lshlrev_b32_e32 v65, 16, v8
+; GFX10-NEXT:    v_and_b32_e32 v24, 0xffff0000, v24
+; GFX10-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
+; GFX10-NEXT:    v_lshlrev_b32_e32 v66, 16, v25
+; GFX10-NEXT:    v_lshlrev_b32_e32 v67, 16, v9
+; GFX10-NEXT:    v_and_b32_e32 v25, 0xffff0000, v25
+; GFX10-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
+; GFX10-NEXT:    v_lshlrev_b32_e32 v68, 16, v26
+; GFX10-NEXT:    v_max_f32_e32 v32, v33, v32
+; GFX10-NEXT:    v_lshlrev_b32_e32 v33, 16, v10
+; GFX10-NEXT:    v_and_b32_e32 v26, 0xffff0000, v26
+; GFX10-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
 ; GFX10-NEXT:    v_max_f32_e32 v0, v0, v16
+; GFX10-NEXT:    v_lshlrev_b32_e32 v16, 16, v27
+; GFX10-NEXT:    v_max_f32_e32 v34, v35, v34
+; GFX10-NEXT:    v_lshlrev_b32_e32 v35, 16, v11
+; GFX10-NEXT:    v_and_b32_e32 v27, 0xffff0000, v27
+; GFX10-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
 ; GFX10-NEXT:    v_max_f32_e32 v1, v1, v17
-; GFX10-NEXT:    v_max_f32_e32 v33, v33, v33
-; GFX10-NEXT:    v_max_f32_e32 v34, v34, v34
-; GFX10-NEXT:    v_max_f32_e32 v36, v36, v36
-; GFX10-NEXT:    v_max_f32_e32 v38, v38, v38
-; GFX10-NEXT:    v_max_f32_e32 v48, v48, v48
-; GFX10-NEXT:    v_max_f32_e32 v50, v50, v50
-; GFX10-NEXT:    v_max_f32_e32 v52, v52, v52
-; GFX10-NEXT:    v_max_f32_e32 v54, v54, v54
-; GFX10-NEXT:    v_max_f32_e32 v64, v64, v64
-; GFX10-NEXT:    v_max_f32_e32 v66, v66, v66
-; GFX10-NEXT:    v_max_f32_e32 v68, v68, v68
-; GFX10-NEXT:    v_max_f32_e32 v15, v15, v15
-; GFX10-NEXT:    v_max_f32_e32 v30, v30, v30
-; GFX10-NEXT:    v_max_f32_e32 v14, v14, v14
-; GFX10-NEXT:    v_max_f32_e32 v29, v29, v29
-; GFX10-NEXT:    v_max_f32_e32 v13, v13, v13
-; GFX10-NEXT:    v_max_f32_e32 v28, v28, v28
-; GFX10-NEXT:    v_max_f32_e32 v12, v12, v12
-; GFX10-NEXT:    v_max_f32_e32 v27, v27, v27
-; GFX10-NEXT:    v_max_f32_e32 v11, v11, v11
-; GFX10-NEXT:    v_max_f32_e32 v26, v26, v26
-; GFX10-NEXT:    v_max_f32_e32 v10, v10, v10
-; GFX10-NEXT:    v_max_f32_e32 v25, v25, v25
-; GFX10-NEXT:    v_max_f32_e32 v9, v9, v9
-; GFX10-NEXT:    v_max_f32_e32 v24, v24, v24
-; GFX10-NEXT:    v_max_f32_e32 v8, v8, v8
-; GFX10-NEXT:    v_max_f32_e32 v23, v23, v23
-; GFX10-NEXT:    v_max_f32_e32 v7, v7, v7
-; GFX10-NEXT:    v_max_f32_e32 v22, v22, v22
-; GFX10-NEXT:    v_max_f32_e32 v6, v6, v6
-; GFX10-NEXT:    v_max_f32_e32 v21, v21, v21
-; GFX10-NEXT:    v_max_f32_e32 v5, v5, v5
-; GFX10-NEXT:    v_max_f32_e32 v20, v20, v20
-; GFX10-NEXT:    v_max_f32_e32 v4, v4, v4
-; GFX10-NEXT:    v_max_f32_e32 v19, v19, v19
-; GFX10-NEXT:    v_max_f32_e32 v3, v3, v3
-; GFX10-NEXT:    v_max_f32_e32 v18, v18, v18
-; GFX10-NEXT:    v_max_f32_e32 v2, v2, v2
-; GFX10-NEXT:    v_max_f32_e32 v34, v36, v34
-; GFX10-NEXT:    v_max_f32_e32 v36, v48, v38
-; GFX10-NEXT:    v_max_f32_e32 v38, v52, v50
-; GFX10-NEXT:    v_max_f32_e32 v48, v64, v54
-; GFX10-NEXT:    v_max_f32_e32 v50, v68, v66
-; GFX10-NEXT:    v_max_f32_e32 v14, v14, v30
-; GFX10-NEXT:    v_max_f32_e32 v13, v13, v29
-; GFX10-NEXT:    v_max_f32_e32 v12, v12, v28
-; GFX10-NEXT:    v_max_f32_e32 v11, v11, v27
-; GFX10-NEXT:    v_max_f32_e32 v10, v10, v26
-; GFX10-NEXT:    v_max_f32_e32 v9, v9, v25
-; GFX10-NEXT:    v_max_f32_e32 v8, v8, v24
-; GFX10-NEXT:    v_max_f32_e32 v7, v7, v23
-; GFX10-NEXT:    v_max_f32_e32 v6, v6, v22
-; GFX10-NEXT:    v_max_f32_e32 v5, v5, v21
+; GFX10-NEXT:    v_lshlrev_b32_e32 v17, 16, v28
+; GFX10-NEXT:    v_max_f32_e32 v36, v37, v36
+; GFX10-NEXT:    v_lshlrev_b32_e32 v37, 16, v12
+; GFX10-NEXT:    v_and_b32_e32 v28, 0xffff0000, v28
+; GFX10-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
 ; GFX10-NEXT:    v_max_f32_e32 v2, v2, v18
+; GFX10-NEXT:    v_lshlrev_b32_e32 v18, 16, v29
+; GFX10-NEXT:    v_max_f32_e32 v38, v39, v38
+; GFX10-NEXT:    v_lshlrev_b32_e32 v39, 16, v13
+; GFX10-NEXT:    v_and_b32_e32 v29, 0xffff0000, v29
+; GFX10-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
 ; GFX10-NEXT:    v_max_f32_e32 v3, v3, v19
+; GFX10-NEXT:    v_lshlrev_b32_e32 v19, 16, v30
+; GFX10-NEXT:    v_max_f32_e32 v48, v49, v48
+; GFX10-NEXT:    v_lshlrev_b32_e32 v49, 16, v14
+; GFX10-NEXT:    v_and_b32_e32 v30, 0xffff0000, v30
+; GFX10-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
 ; GFX10-NEXT:    v_max_f32_e32 v4, v4, v20
-; GFX10-NEXT:    v_perm_b32 v0, v0, v50, 0x3020706
-; GFX10-NEXT:    v_perm_b32 v1, v1, v48, 0x3020706
-; GFX10-NEXT:    v_perm_b32 v2, v2, v38, 0x3020706
-; GFX10-NEXT:    v_perm_b32 v3, v3, v36, 0x3020706
-; GFX10-NEXT:    v_perm_b32 v4, v4, v34, 0x3020706
-; GFX10-NEXT:    v_perm_b32 v5, v5, v67, 0x3020706
-; GFX10-NEXT:    v_perm_b32 v6, v6, v65, 0x3020706
-; GFX10-NEXT:    v_perm_b32 v7, v7, v55, 0x3020706
-; GFX10-NEXT:    v_perm_b32 v8, v8, v53, 0x3020706
-; GFX10-NEXT:    v_perm_b32 v9, v9, v51, 0x3020706
-; GFX10-NEXT:    v_perm_b32 v10, v10, v49, 0x3020706
-; GFX10-NEXT:    v_perm_b32 v11, v11, v39, 0x3020706
-; GFX10-NEXT:    v_perm_b32 v12, v12, v37, 0x3020706
-; GFX10-NEXT:    v_perm_b32 v13, v13, v35, 0x3020706
-; GFX10-NEXT:    v_perm_b32 v14, v14, v32, 0x3020706
+; GFX10-NEXT:    v_lshlrev_b32_e32 v20, 16, v15
+; GFX10-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
+; GFX10-NEXT:    v_perm_b32 v6, v6, v21, 0x7060302
+; GFX10-NEXT:    v_perm_b32 v7, v7, v22, 0x7060302
+; GFX10-NEXT:    v_max_f32_e32 v50, v51, v50
+; GFX10-NEXT:    v_max_f32_e32 v23, v65, v64
+; GFX10-NEXT:    v_max_f32_e32 v8, v8, v24
+; GFX10-NEXT:    v_max_f32_e32 v24, v67, v66
+; GFX10-NEXT:    v_max_f32_e32 v9, v9, v25
+; GFX10-NEXT:    v_max_f32_e32 v25, v33, v68
+; GFX10-NEXT:    v_max_f32_e32 v10, v10, v26
+; GFX10-NEXT:    v_max_f32_e32 v16, v35, v16
+; GFX10-NEXT:    v_max_f32_e32 v11, v11, v27
+; GFX10-NEXT:    v_max_f32_e32 v17, v37, v17
+; GFX10-NEXT:    v_max_f32_e32 v12, v12, v28
+; GFX10-NEXT:    v_max_f32_e32 v18, v39, v18
+; GFX10-NEXT:    v_max_f32_e32 v13, v13, v29
+; GFX10-NEXT:    v_max_f32_e32 v19, v49, v19
+; GFX10-NEXT:    v_max_f32_e32 v14, v14, v30
+; GFX10-NEXT:    v_perm_b32 v0, v0, v32, 0x7060302
+; GFX10-NEXT:    v_perm_b32 v1, v1, v34, 0x7060302
+; GFX10-NEXT:    v_perm_b32 v2, v2, v36, 0x7060302
+; GFX10-NEXT:    v_perm_b32 v3, v3, v38, 0x7060302
+; GFX10-NEXT:    v_perm_b32 v4, v4, v48, 0x7060302
+; GFX10-NEXT:    v_perm_b32 v5, v5, v50, 0x7060302
+; GFX10-NEXT:    v_perm_b32 v8, v8, v23, 0x7060302
+; GFX10-NEXT:    v_perm_b32 v9, v9, v24, 0x7060302
+; GFX10-NEXT:    v_perm_b32 v10, v10, v25, 0x7060302
+; GFX10-NEXT:    v_perm_b32 v11, v11, v16, 0x7060302
+; GFX10-NEXT:    v_perm_b32 v12, v12, v17, 0x7060302
+; GFX10-NEXT:    v_perm_b32 v13, v13, v18, 0x7060302
+; GFX10-NEXT:    v_perm_b32 v14, v14, v19, 0x7060302
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_and_b32_e32 v16, 0xffff0000, v31
-; GFX10-NEXT:    v_lshlrev_b32_e32 v17, 16, v31
-; GFX10-NEXT:    v_max_f32_e32 v16, v16, v16
-; GFX10-NEXT:    v_max_f32_e32 v17, v17, v17
-; GFX10-NEXT:    v_max_f32_e32 v16, v33, v16
-; GFX10-NEXT:    v_max_f32_e32 v15, v15, v17
-; GFX10-NEXT:    v_perm_b32 v15, v15, v16, 0x3020706
+; GFX10-NEXT:    v_lshlrev_b32_e32 v21, 16, v31
+; GFX10-NEXT:    v_and_b32_e32 v22, 0xffff0000, v31
+; GFX10-NEXT:    v_max_f32_e32 v20, v20, v21
+; GFX10-NEXT:    v_max_f32_e32 v15, v15, v22
+; GFX10-NEXT:    v_perm_b32 v15, v15, v20, 0x7060302
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_maxnum_v32bf16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    scratch_load_b32 v31, off, s32
-; GFX11-NEXT:    v_and_b32_e32 v33, 0xffff0000, v30
-; GFX11-NEXT:    v_and_b32_e32 v35, 0xffff0000, v29
-; GFX11-NEXT:    v_and_b32_e32 v34, 0xffff0000, v14
-; GFX11-NEXT:    v_and_b32_e32 v38, 0xffff0000, v12
-; GFX11-NEXT:    v_and_b32_e32 v37, 0xffff0000, v28
-; GFX11-NEXT:    v_and_b32_e32 v39, 0xffff0000, v27
-; GFX11-NEXT:    v_and_b32_e32 v36, 0xffff0000, v13
-; GFX11-NEXT:    v_dual_max_f32 v33, v33, v33 :: v_dual_and_b32 v32, 0xffff0000, v15
-; GFX11-NEXT:    v_and_b32_e32 v49, 0xffff0000, v26
-; GFX11-NEXT:    v_and_b32_e32 v48, 0xffff0000, v11
-; GFX11-NEXT:    v_and_b32_e32 v51, 0xffff0000, v25
-; GFX11-NEXT:    v_and_b32_e32 v50, 0xffff0000, v10
-; GFX11-NEXT:    v_and_b32_e32 v54, 0xffff0000, v8
-; GFX11-NEXT:    v_and_b32_e32 v53, 0xffff0000, v24
-; GFX11-NEXT:    v_and_b32_e32 v55, 0xffff0000, v23
-; GFX11-NEXT:    v_and_b32_e32 v52, 0xffff0000, v9
-; GFX11-NEXT:    v_and_b32_e32 v65, 0xffff0000, v22
-; GFX11-NEXT:    v_and_b32_e32 v67, 0xffff0000, v21
-; GFX11-NEXT:    v_and_b32_e32 v66, 0xffff0000, v6
-; GFX11-NEXT:    v_and_b32_e32 v71, 0xffff0000, v19
-; GFX11-NEXT:    v_and_b32_e32 v68, 0xffff0000, v5
-; GFX11-NEXT:    v_and_b32_e32 v83, 0xffff0000, v17
-; GFX11-NEXT:    v_and_b32_e32 v86, 0xffff0000, v0
-; GFX11-NEXT:    v_and_b32_e32 v85, 0xffff0000, v16
-; GFX11-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
-; GFX11-NEXT:    v_and_b32_e32 v84, 0xffff0000, v1
-; GFX11-NEXT:    v_lshlrev_b32_e32 v25, 16, v25
-; GFX11-NEXT:    v_lshlrev_b32_e32 v26, 16, v26
-; GFX11-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
-; GFX11-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
-; GFX11-NEXT:    v_lshlrev_b32_e32 v21, 16, v21
-; GFX11-NEXT:    v_lshlrev_b32_e32 v22, 16, v22
-; GFX11-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX11-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
-; GFX11-NEXT:    v_lshlrev_b32_e32 v17, 16, v17
-; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-NEXT:    v_dual_max_f32 v35, v35, v35 :: v_dual_max_f32 v34, v34, v34
-; GFX11-NEXT:    v_lshlrev_b32_e32 v16, 16, v16
-; GFX11-NEXT:    v_dual_max_f32 v38, v38, v38 :: v_dual_max_f32 v37, v37, v37
-; GFX11-NEXT:    v_dual_max_f32 v39, v39, v39 :: v_dual_lshlrev_b32 v0, 16, v0
-; GFX11-NEXT:    v_max_f32_e32 v36, v36, v36
-; GFX11-NEXT:    v_dual_max_f32 v65, v65, v65 :: v_dual_and_b32 v64, 0xffff0000, v7
-; GFX11-NEXT:    v_and_b32_e32 v70, 0xffff0000, v4
-; GFX11-NEXT:    v_and_b32_e32 v69, 0xffff0000, v20
-; GFX11-NEXT:    v_and_b32_e32 v81, 0xffff0000, v18
-; GFX11-NEXT:    v_dual_max_f32 v83, v83, v83 :: v_dual_and_b32 v82, 0xffff0000, v2
-; GFX11-NEXT:    v_dual_max_f32 v17, v17, v17 :: v_dual_lshlrev_b32 v18, 16, v18
-; GFX11-NEXT:    v_dual_max_f32 v1, v1, v1 :: v_dual_lshlrev_b32 v2, 16, v2
-; GFX11-NEXT:    v_dual_max_f32 v49, v49, v49 :: v_dual_max_f32 v48, v48, v48
-; GFX11-NEXT:    v_dual_max_f32 v51, v51, v51 :: v_dual_max_f32 v50, v50, v50
-; GFX11-NEXT:    v_dual_max_f32 v54, v54, v54 :: v_dual_max_f32 v53, v53, v53
-; GFX11-NEXT:    v_dual_max_f32 v67, v67, v67 :: v_dual_max_f32 v66, v66, v66
-; GFX11-NEXT:    v_dual_max_f32 v25, v25, v25 :: v_dual_max_f32 v26, v26, v26
-; GFX11-NEXT:    v_dual_max_f32 v9, v9, v9 :: v_dual_max_f32 v10, v10, v10
-; GFX11-NEXT:    v_dual_max_f32 v21, v21, v21 :: v_dual_max_f32 v22, v22, v22
-; GFX11-NEXT:    v_dual_max_f32 v5, v5, v5 :: v_dual_max_f32 v6, v6, v6
-; GFX11-NEXT:    v_dual_max_f32 v33, v34, v33 :: v_dual_max_f32 v16, v16, v16
-; GFX11-NEXT:    v_dual_max_f32 v34, v36, v35 :: v_dual_max_f32 v35, v38, v37
-; GFX11-NEXT:    v_max_f32_e32 v0, v0, v0
-; GFX11-NEXT:    v_dual_max_f32 v81, v81, v81 :: v_dual_and_b32 v80, 0xffff0000, v3
-; GFX11-NEXT:    v_lshlrev_b32_e32 v29, 16, v29
-; GFX11-NEXT:    v_lshlrev_b32_e32 v30, 16, v30
-; GFX11-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
-; GFX11-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
-; GFX11-NEXT:    v_lshlrev_b32_e32 v27, 16, v27
-; GFX11-NEXT:    v_lshlrev_b32_e32 v28, 16, v28
-; GFX11-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
-; GFX11-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
-; GFX11-NEXT:    v_lshlrev_b32_e32 v23, 16, v23
-; GFX11-NEXT:    v_lshlrev_b32_e32 v24, 16, v24
-; GFX11-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
-; GFX11-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
-; GFX11-NEXT:    v_lshlrev_b32_e32 v19, 16, v19
-; GFX11-NEXT:    v_lshlrev_b32_e32 v20, 16, v20
-; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX11-NEXT:    v_dual_max_f32 v70, v70, v70 :: v_dual_max_f32 v69, v69, v69
-; GFX11-NEXT:    v_dual_max_f32 v36, v48, v39 :: v_dual_max_f32 v37, v50, v49
-; GFX11-NEXT:    v_max_f32_e32 v39, v54, v53
-; GFX11-NEXT:    v_dual_max_f32 v10, v10, v26 :: v_dual_max_f32 v1, v1, v17
-; GFX11-NEXT:    v_max_f32_e32 v6, v6, v22
-; GFX11-NEXT:    v_dual_max_f32 v32, v32, v32 :: v_dual_max_f32 v55, v55, v55
-; GFX11-NEXT:    v_max_f32_e32 v52, v52, v52
-; GFX11-NEXT:    v_dual_max_f32 v64, v64, v64 :: v_dual_max_f32 v71, v71, v71
-; GFX11-NEXT:    v_max_f32_e32 v68, v68, v68
-; GFX11-NEXT:    v_max_f32_e32 v80, v80, v80
-; GFX11-NEXT:    v_max_f32_e32 v82, v82, v82
-; GFX11-NEXT:    v_dual_max_f32 v86, v86, v86 :: v_dual_max_f32 v85, v85, v85
-; GFX11-NEXT:    v_dual_max_f32 v15, v15, v15 :: v_dual_max_f32 v84, v84, v84
-; GFX11-NEXT:    v_dual_max_f32 v29, v29, v29 :: v_dual_max_f32 v30, v30, v30
-; GFX11-NEXT:    v_dual_max_f32 v13, v13, v13 :: v_dual_max_f32 v14, v14, v14
-; GFX11-NEXT:    v_dual_max_f32 v27, v27, v27 :: v_dual_max_f32 v28, v28, v28
-; GFX11-NEXT:    v_dual_max_f32 v11, v11, v11 :: v_dual_max_f32 v12, v12, v12
-; GFX11-NEXT:    v_dual_max_f32 v23, v23, v23 :: v_dual_max_f32 v24, v24, v24
-; GFX11-NEXT:    v_dual_max_f32 v7, v7, v7 :: v_dual_max_f32 v8, v8, v8
-; GFX11-NEXT:    v_dual_max_f32 v19, v19, v19 :: v_dual_max_f32 v20, v20, v20
-; GFX11-NEXT:    v_dual_max_f32 v3, v3, v3 :: v_dual_max_f32 v4, v4, v4
-; GFX11-NEXT:    v_max_f32_e32 v18, v18, v18
-; GFX11-NEXT:    v_max_f32_e32 v2, v2, v2
-; GFX11-NEXT:    v_dual_max_f32 v38, v52, v51 :: v_dual_max_f32 v53, v82, v81
-; GFX11-NEXT:    v_dual_max_f32 v48, v64, v55 :: v_dual_max_f32 v55, v86, v85
-; GFX11-NEXT:    v_dual_max_f32 v49, v66, v65 :: v_dual_max_f32 v50, v68, v67
-; GFX11-NEXT:    v_max_f32_e32 v13, v13, v29
-; GFX11-NEXT:    v_dual_max_f32 v51, v70, v69 :: v_dual_max_f32 v52, v80, v71
-; GFX11-NEXT:    v_dual_max_f32 v9, v9, v25 :: v_dual_max_f32 v54, v84, v83
-; GFX11-NEXT:    v_dual_max_f32 v5, v5, v21 :: v_dual_max_f32 v14, v14, v30
-; GFX11-NEXT:    v_dual_max_f32 v11, v11, v27 :: v_dual_max_f32 v12, v12, v28
-; GFX11-NEXT:    v_dual_max_f32 v7, v7, v23 :: v_dual_max_f32 v8, v8, v24
-; GFX11-NEXT:    v_dual_max_f32 v3, v3, v19 :: v_dual_max_f32 v4, v4, v20
-; GFX11-NEXT:    v_perm_b32 v1, v1, v54, 0x3020706
-; GFX11-NEXT:    v_perm_b32 v5, v5, v50, 0x3020706
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_perm_b32 v7, v7, v48, 0x3020706
-; GFX11-NEXT:    v_perm_b32 v3, v3, v52, 0x3020706
-; GFX11-NEXT:    v_perm_b32 v4, v4, v51, 0x3020706
-; GFX11-NEXT:    v_perm_b32 v8, v8, v39, 0x3020706
-; GFX11-NEXT:    v_perm_b32 v9, v9, v38, 0x3020706
-; GFX11-NEXT:    v_perm_b32 v10, v10, v37, 0x3020706
-; GFX11-NEXT:    v_perm_b32 v11, v11, v36, 0x3020706
-; GFX11-NEXT:    v_perm_b32 v12, v12, v35, 0x3020706
-; GFX11-NEXT:    v_perm_b32 v13, v13, v34, 0x3020706
-; GFX11-NEXT:    v_perm_b32 v14, v14, v33, 0x3020706
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_dual_max_f32 v0, v0, v16 :: v_dual_lshlrev_b32 v17, 16, v31
-; GFX11-NEXT:    v_and_b32_e32 v16, 0xffff0000, v31
-; GFX11-NEXT:    v_perm_b32 v6, v6, v49, 0x3020706
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_dual_max_f32 v17, v17, v17 :: v_dual_max_f32 v2, v2, v18
-; GFX11-NEXT:    v_max_f32_e32 v16, v16, v16
-; GFX11-NEXT:    v_perm_b32 v0, v0, v55, 0x3020706
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v68, 16, v26
+; GFX11-NEXT:    v_lshlrev_b32_e32 v69, 16, v10
+; GFX11-NEXT:    v_and_b32_e32 v26, 0xffff0000, v26
+; GFX11-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
+; GFX11-NEXT:    v_lshlrev_b32_e32 v70, 16, v27
+; GFX11-NEXT:    v_lshlrev_b32_e32 v71, 16, v11
+; GFX11-NEXT:    v_lshlrev_b32_e32 v50, 16, v21
+; GFX11-NEXT:    v_lshlrev_b32_e32 v54, 16, v23
+; GFX11-NEXT:    v_lshlrev_b32_e32 v55, 16, v7
+; GFX11-NEXT:    v_lshlrev_b32_e32 v64, 16, v24
+; GFX11-NEXT:    v_lshlrev_b32_e32 v65, 16, v8
+; GFX11-NEXT:    v_and_b32_e32 v24, 0xffff0000, v24
+; GFX11-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
+; GFX11-NEXT:    v_lshlrev_b32_e32 v51, 16, v5
+; GFX11-NEXT:    v_dual_max_f32 v10, v10, v26 :: v_dual_and_b32 v5, 0xffff0000, v5
+; GFX11-NEXT:    v_lshlrev_b32_e32 v66, 16, v25
+; GFX11-NEXT:    v_and_b32_e32 v25, 0xffff0000, v25
+; GFX11-NEXT:    v_and_b32_e32 v27, 0xffff0000, v27
+; GFX11-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
+; GFX11-NEXT:    v_lshlrev_b32_e32 v80, 16, v28
+; GFX11-NEXT:    v_lshlrev_b32_e32 v81, 16, v12
+; GFX11-NEXT:    v_lshlrev_b32_e32 v52, 16, v22
+; GFX11-NEXT:    v_and_b32_e32 v28, 0xffff0000, v28
+; GFX11-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
+; GFX11-NEXT:    v_lshlrev_b32_e32 v53, 16, v6
+; GFX11-NEXT:    v_lshlrev_b32_e32 v82, 16, v29
+; GFX11-NEXT:    v_lshlrev_b32_e32 v83, 16, v13
+; GFX11-NEXT:    v_and_b32_e32 v23, 0xffff0000, v23
+; GFX11-NEXT:    v_and_b32_e32 v29, 0xffff0000, v29
+; GFX11-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
+; GFX11-NEXT:    v_lshlrev_b32_e32 v84, 16, v30
+; GFX11-NEXT:    v_lshlrev_b32_e32 v85, 16, v14
+; GFX11-NEXT:    v_and_b32_e32 v22, 0xffff0000, v22
+; GFX11-NEXT:    v_and_b32_e32 v30, 0xffff0000, v30
+; GFX11-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
+; GFX11-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
+; GFX11-NEXT:    v_lshlrev_b32_e32 v86, 16, v15
+; GFX11-NEXT:    v_lshlrev_b32_e32 v67, 16, v9
+; GFX11-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
+; GFX11-NEXT:    v_lshlrev_b32_e32 v48, 16, v20
+; GFX11-NEXT:    v_dual_max_f32 v11, v11, v27 :: v_dual_and_b32 v20, 0xffff0000, v20
+; GFX11-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
+; GFX11-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
+; GFX11-NEXT:    v_dual_max_f32 v26, v71, v70 :: v_dual_lshlrev_b32 v49, 16, v4
+; GFX11-NEXT:    v_dual_max_f32 v13, v13, v29 :: v_dual_and_b32 v4, 0xffff0000, v4
+; GFX11-NEXT:    v_lshlrev_b32_e32 v35, 16, v1
+; GFX11-NEXT:    v_lshlrev_b32_e32 v37, 16, v2
+; GFX11-NEXT:    v_lshlrev_b32_e32 v38, 16, v19
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-NEXT:    v_max_f32_e32 v4, v4, v20
+; GFX11-NEXT:    v_dual_max_f32 v8, v8, v24 :: v_dual_max_f32 v9, v9, v25
+; GFX11-NEXT:    v_max_f32_e32 v25, v69, v68
+; GFX11-NEXT:    v_dual_max_f32 v20, v51, v50 :: v_dual_lshlrev_b32 v39, 16, v3
+; GFX11-NEXT:    v_max_f32_e32 v27, v81, v80
+; GFX11-NEXT:    v_max_f32_e32 v12, v12, v28
+; GFX11-NEXT:    v_dual_max_f32 v28, v83, v82 :: v_dual_max_f32 v29, v85, v84
+; GFX11-NEXT:    v_dual_max_f32 v6, v6, v22 :: v_dual_and_b32 v3, 0xffff0000, v3
+; GFX11-NEXT:    v_max_f32_e32 v22, v55, v54
+; GFX11-NEXT:    v_lshlrev_b32_e32 v36, 16, v18
+; GFX11-NEXT:    v_lshlrev_b32_e32 v34, 16, v17
+; GFX11-NEXT:    v_and_b32_e32 v17, 0xffff0000, v17
+; GFX11-NEXT:    v_and_b32_e32 v18, 0xffff0000, v18
+; GFX11-NEXT:    v_max_f32_e32 v14, v14, v30
+; GFX11-NEXT:    v_dual_max_f32 v7, v7, v23 :: v_dual_and_b32 v2, 0xffff0000, v2
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX11-NEXT:    v_max_f32_e32 v23, v65, v64
+; GFX11-NEXT:    v_and_b32_e32 v19, 0xffff0000, v19
+; GFX11-NEXT:    v_dual_max_f32 v24, v67, v66 :: v_dual_and_b32 v21, 0xffff0000, v21
+; GFX11-NEXT:    v_max_f32_e32 v2, v2, v18
+; GFX11-NEXT:    v_dual_max_f32 v1, v1, v17 :: v_dual_lshlrev_b32 v32, 16, v16
+; GFX11-NEXT:    v_max_f32_e32 v18, v39, v38
+; GFX11-NEXT:    v_dual_max_f32 v3, v3, v19 :: v_dual_and_b32 v16, 0xffff0000, v16
+; GFX11-NEXT:    v_max_f32_e32 v19, v49, v48
+; GFX11-NEXT:    v_max_f32_e32 v17, v37, v36
+; GFX11-NEXT:    v_lshlrev_b32_e32 v33, 16, v0
+; GFX11-NEXT:    v_dual_max_f32 v5, v5, v21 :: v_dual_and_b32 v0, 0xffff0000, v0
+; GFX11-NEXT:    v_max_f32_e32 v21, v53, v52
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_perm_b32 v2, v2, v17, 0x7060302
+; GFX11-NEXT:    v_perm_b32 v3, v3, v18, 0x7060302
+; GFX11-NEXT:    v_max_f32_e32 v0, v0, v16
+; GFX11-NEXT:    v_max_f32_e32 v16, v35, v34
+; GFX11-NEXT:    v_max_f32_e32 v32, v33, v32
+; GFX11-NEXT:    v_perm_b32 v4, v4, v19, 0x7060302
+; GFX11-NEXT:    v_perm_b32 v5, v5, v20, 0x7060302
+; GFX11-NEXT:    v_perm_b32 v6, v6, v21, 0x7060302
+; GFX11-NEXT:    v_perm_b32 v1, v1, v16, 0x7060302
+; GFX11-NEXT:    v_perm_b32 v0, v0, v32, 0x7060302
+; GFX11-NEXT:    v_perm_b32 v7, v7, v22, 0x7060302
+; GFX11-NEXT:    v_perm_b32 v8, v8, v23, 0x7060302
+; GFX11-NEXT:    v_perm_b32 v9, v9, v24, 0x7060302
+; GFX11-NEXT:    v_perm_b32 v10, v10, v25, 0x7060302
+; GFX11-NEXT:    v_perm_b32 v11, v11, v26, 0x7060302
+; GFX11-NEXT:    v_perm_b32 v12, v12, v27, 0x7060302
+; GFX11-NEXT:    v_perm_b32 v13, v13, v28, 0x7060302
+; GFX11-NEXT:    v_perm_b32 v14, v14, v29, 0x7060302
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v16, 16, v31
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_dual_max_f32 v16, v86, v16 :: v_dual_and_b32 v17, 0xffff0000, v31
 ; GFX11-NEXT:    v_max_f32_e32 v15, v15, v17
-; GFX11-NEXT:    v_perm_b32 v2, v2, v53, 0x3020706
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_max_f32_e32 v16, v32, v16
-; GFX11-NEXT:    v_perm_b32 v15, v15, v16, 0x3020706
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_perm_b32 v15, v15, v16, 0x7060302
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = call <32 x bfloat> @llvm.maxnum.v32bf16(<32 x bfloat> %a, <32 x bfloat> %b)
   ret <32 x bfloat> %op
@@ -18653,7 +17554,7 @@ define bfloat @v_sqrt_bf16(bfloat %a) {
 ; GFX8-LABEL: v_sqrt_bf16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX8-NEXT:    s_mov_b32 s4, 0xf800000
 ; GFX8-NEXT:    v_mul_f32_e32 v1, 0x4f800000, v0
 ; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
@@ -18672,13 +17573,13 @@ define bfloat @v_sqrt_bf16(bfloat %a) {
 ; GFX8-NEXT:    v_mov_b32_e32 v2, 0x260
 ; GFX8-NEXT:    v_cmp_class_f32_e32 vcc, v0, v2
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
-; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_sqrt_bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX9-NEXT:    s_mov_b32 s4, 0xf800000
 ; GFX9-NEXT:    v_mul_f32_e32 v1, 0x4f800000, v0
 ; GFX9-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
@@ -18697,13 +17598,13 @@ define bfloat @v_sqrt_bf16(bfloat %a) {
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0x260
 ; GFX9-NEXT:    v_cmp_class_f32_e32 vcc, v0, v2
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
-; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_sqrt_bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX10-NEXT:    v_mul_f32_e32 v1, 0x4f800000, v0
 ; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0xf800000, v0
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
@@ -18720,13 +17621,13 @@ define bfloat @v_sqrt_bf16(bfloat %a) {
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc_lo
 ; GFX10-NEXT:    v_cmp_class_f32_e64 vcc_lo, v0, 0x260
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
-; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_sqrt_bf16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_mul_f32_e32 v1, 0x4f800000, v0
 ; GFX11-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0xf800000, v0
@@ -18751,7 +17652,7 @@ define bfloat @v_sqrt_bf16(bfloat %a) {
 ; GFX11-NEXT:    v_cmp_class_f32_e64 vcc_lo, v0, 0x260
 ; GFX11-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = call bfloat @llvm.sqrt.bf16(bfloat %a)
   ret bfloat %op
@@ -18779,34 +17680,34 @@ define bfloat @v_ldexp_bf16_i32(bfloat %a, i32 %b) {
 ; GFX8-LABEL: v_ldexp_bf16_i32:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX8-NEXT:    v_ldexp_f32 v0, v0, v1
-; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_ldexp_bf16_i32:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX9-NEXT:    v_ldexp_f32 v0, v0, v1
-; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_ldexp_bf16_i32:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX10-NEXT:    v_ldexp_f32 v0, v0, v1
-; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_ldexp_bf16_i32:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_ldexp_f32 v0, v0, v1
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = call bfloat @llvm.ldexp.bf16.i32(bfloat %a, i32 %b)
   ret bfloat %op
@@ -18840,28 +17741,28 @@ define { bfloat, i16 } @v_frexp_bf16_i16(bfloat %a) {
 ; GFX8-LABEL: v_frexp_bf16_i16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX8-NEXT:    v_frexp_exp_i32_f32_e32 v1, v0
 ; GFX8-NEXT:    v_frexp_mant_f32_e32 v0, v0
-; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_frexp_bf16_i16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX9-NEXT:    v_frexp_exp_i32_f32_e32 v1, v0
-; GFX9-NEXT:    v_frexp_mant_f32_e32 v0, v0
-; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v0
+; GFX9-NEXT:    v_frexp_mant_f32_e32 v0, v1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX9-NEXT:    v_frexp_exp_i32_f32_e32 v1, v1
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_frexp_bf16_i16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v0
 ; GFX10-NEXT:    v_frexp_mant_f32_e32 v0, v1
 ; GFX10-NEXT:    v_frexp_exp_i32_f32_e32 v1, v1
-; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %op = call { bfloat, i16 } @llvm.frexp.bf16.i16(bfloat %a)
   ret { bfloat, i16 } %op
@@ -18929,7 +17830,7 @@ define bfloat @v_log_bf16(bfloat %a) {
 ; GFX8-LABEL: v_log_bf16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX8-NEXT:    s_mov_b32 s4, 0x800000
 ; GFX8-NEXT:    v_mov_b32_e32 v1, 0x4f800000
 ; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
@@ -18951,13 +17852,13 @@ define bfloat @v_log_bf16(bfloat %a) {
 ; GFX8-NEXT:    v_mov_b32_e32 v1, 0x41b17218
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; GFX8-NEXT:    v_sub_f32_e32 v0, v0, v1
-; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_log_bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX9-NEXT:    s_mov_b32 s4, 0x800000
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0x4f800000
 ; GFX9-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
@@ -18976,13 +17877,13 @@ define bfloat @v_log_bf16(bfloat %a) {
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0x41b17218
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; GFX9-NEXT:    v_sub_f32_e32 v0, v0, v1
-; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_log_bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo
 ; GFX10-NEXT:    v_mul_f32_e32 v0, v0, v1
@@ -18995,13 +17896,13 @@ define bfloat @v_log_bf16(bfloat %a) {
 ; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s4
 ; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0, 0x41b17218, vcc_lo
 ; GFX10-NEXT:    v_sub_f32_e32 v0, v0, v1
-; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_log_bf16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
 ; GFX11-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo
@@ -19020,7 +17921,7 @@ define bfloat @v_log_bf16(bfloat %a) {
 ; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, 0x41b17218, vcc_lo
 ; GFX11-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = call bfloat @llvm.log.bf16(bfloat %a)
   ret bfloat %op
@@ -19062,7 +17963,7 @@ define bfloat @v_log2_bf16(bfloat %a) {
 ; GFX8-LABEL: v_log2_bf16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX8-NEXT:    s_mov_b32 s4, 0x800000
 ; GFX8-NEXT:    v_mov_b32_e32 v1, 0x4f800000
 ; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
@@ -19072,52 +17973,52 @@ define bfloat @v_log2_bf16(bfloat %a) {
 ; GFX8-NEXT:    v_mov_b32_e32 v1, 0x42000000
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; GFX8-NEXT:    v_sub_f32_e32 v0, v0, v1
-; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_log2_bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX9-NEXT:    s_mov_b32 s4, 0x800000
-; GFX9-NEXT:    v_mov_b32_e32 v1, 0x4f800000
 ; GFX9-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
-; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0x4f800000
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, 1.0, v2, vcc
+; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v2
 ; GFX9-NEXT:    v_log_f32_e32 v0, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0x42000000
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; GFX9-NEXT:    v_sub_f32_e32 v0, v0, v1
-; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_log2_bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
-; GFX10-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo
-; GFX10-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, 1.0, 0x4f800000, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo
+; GFX10-NEXT:    v_mul_f32_e32 v0, v0, v2
 ; GFX10-NEXT:    v_log_f32_e32 v0, v0
 ; GFX10-NEXT:    v_sub_f32_e32 v0, v0, v1
-; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_log2_bf16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
-; GFX11-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo
-; GFX11-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX11-NEXT:    v_cndmask_b32_e64 v2, 1.0, 0x4f800000, vcc_lo
 ; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_mul_f32_e32 v0, v0, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_log_f32_e32 v0, v0
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX11-NEXT:    v_sub_f32_e32 v0, v0, v1
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = call bfloat @llvm.log2.bf16(bfloat %a)
   ret bfloat %op
@@ -19180,7 +18081,7 @@ define bfloat @v_log10_bf16(bfloat %a) {
 ; GFX8-LABEL: v_log10_bf16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX8-NEXT:    s_mov_b32 s4, 0x800000
 ; GFX8-NEXT:    v_mov_b32_e32 v1, 0x4f800000
 ; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
@@ -19202,13 +18103,13 @@ define bfloat @v_log10_bf16(bfloat %a) {
 ; GFX8-NEXT:    v_mov_b32_e32 v1, 0x411a209b
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; GFX8-NEXT:    v_sub_f32_e32 v0, v0, v1
-; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_log10_bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX9-NEXT:    s_mov_b32 s4, 0x800000
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0x4f800000
 ; GFX9-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
@@ -19227,13 +18128,13 @@ define bfloat @v_log10_bf16(bfloat %a) {
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0x411a209b
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; GFX9-NEXT:    v_sub_f32_e32 v0, v0, v1
-; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_log10_bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo
 ; GFX10-NEXT:    v_mul_f32_e32 v0, v0, v1
@@ -19246,13 +18147,13 @@ define bfloat @v_log10_bf16(bfloat %a) {
 ; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s4
 ; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0, 0x411a209b, vcc_lo
 ; GFX10-NEXT:    v_sub_f32_e32 v0, v0, v1
-; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_log10_bf16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
 ; GFX11-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo
@@ -19271,7 +18172,7 @@ define bfloat @v_log10_bf16(bfloat %a) {
 ; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, 0x411a209b, vcc_lo
 ; GFX11-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = call bfloat @llvm.log10.bf16(bfloat %a)
   ret bfloat %op
@@ -19337,7 +18238,7 @@ define bfloat @v_exp_bf16(bfloat %a) {
 ; GFX8-LABEL: v_exp_bf16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX8-NEXT:    v_sub_f32_e32 v3, v0, v0
 ; GFX8-NEXT:    v_mul_f32_e32 v1, 0x3fb8a000, v0
 ; GFX8-NEXT:    v_mul_f32_e32 v4, 0x39a3b295, v3
@@ -19358,23 +18259,23 @@ define bfloat @v_exp_bf16(bfloat %a) {
 ; GFX8-NEXT:    v_mov_b32_e32 v2, 0x7f800000
 ; GFX8-NEXT:    v_cmp_nlt_f32_e32 vcc, s4, v0
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
-; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_exp_bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX9-NEXT:    s_mov_b32 s4, 0x3fb8aa3b
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX9-NEXT:    v_mul_f32_e32 v1, 0x3fb8aa3b, v0
-; GFX9-NEXT:    v_fma_f32 v2, v0, s4, -v1
+; GFX9-NEXT:    s_mov_b32 s4, 0x3fb8aa3b
+; GFX9-NEXT:    v_rndne_f32_e32 v2, v1
+; GFX9-NEXT:    v_sub_f32_e32 v3, v1, v2
+; GFX9-NEXT:    v_fma_f32 v1, v0, s4, -v1
 ; GFX9-NEXT:    s_mov_b32 s4, 0x32a5705f
-; GFX9-NEXT:    v_rndne_f32_e32 v3, v1
-; GFX9-NEXT:    v_fma_f32 v2, v0, s4, v2
-; GFX9-NEXT:    v_sub_f32_e32 v1, v1, v3
-; GFX9-NEXT:    v_add_f32_e32 v1, v1, v2
+; GFX9-NEXT:    v_fma_f32 v1, v0, s4, v1
+; GFX9-NEXT:    v_add_f32_e32 v1, v3, v1
 ; GFX9-NEXT:    v_exp_f32_e32 v1, v1
-; GFX9-NEXT:    v_cvt_i32_f32_e32 v2, v3
+; GFX9-NEXT:    v_cvt_i32_f32_e32 v2, v2
 ; GFX9-NEXT:    s_mov_b32 s4, 0xc2ce8ed0
 ; GFX9-NEXT:    v_cmp_ngt_f32_e32 vcc, s4, v0
 ; GFX9-NEXT:    s_mov_b32 s4, 0x42b17218
@@ -19383,44 +18284,44 @@ define bfloat @v_exp_bf16(bfloat %a) {
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0x7f800000
 ; GFX9-NEXT:    v_cmp_nlt_f32_e32 vcc, s4, v0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
-; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_exp_bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX10-NEXT:    v_mul_f32_e32 v1, 0x3fb8aa3b, v0
 ; GFX10-NEXT:    v_cmp_ngt_f32_e32 vcc_lo, 0xc2ce8ed0, v0
-; GFX10-NEXT:    v_fma_f32 v2, 0x3fb8aa3b, v0, -v1
-; GFX10-NEXT:    v_rndne_f32_e32 v3, v1
-; GFX10-NEXT:    v_fmamk_f32 v2, v0, 0x32a5705f, v2
-; GFX10-NEXT:    v_sub_f32_e32 v1, v1, v3
-; GFX10-NEXT:    v_add_f32_e32 v1, v1, v2
-; GFX10-NEXT:    v_cvt_i32_f32_e32 v2, v3
+; GFX10-NEXT:    v_rndne_f32_e32 v2, v1
+; GFX10-NEXT:    v_fma_f32 v3, 0x3fb8aa3b, v0, -v1
+; GFX10-NEXT:    v_sub_f32_e32 v1, v1, v2
+; GFX10-NEXT:    v_fmamk_f32 v3, v0, 0x32a5705f, v3
+; GFX10-NEXT:    v_cvt_i32_f32_e32 v2, v2
+; GFX10-NEXT:    v_add_f32_e32 v1, v1, v3
 ; GFX10-NEXT:    v_exp_f32_e32 v1, v1
 ; GFX10-NEXT:    v_ldexp_f32 v1, v1, v2
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc_lo
 ; GFX10-NEXT:    v_cmp_nlt_f32_e32 vcc_lo, 0x42b17218, v0
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, 0x7f800000, v1, vcc_lo
-; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_exp_bf16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_mul_f32_e32 v1, 0x3fb8aa3b, v0
-; GFX11-NEXT:    v_fma_f32 v2, 0x3fb8aa3b, v0, -v1
-; GFX11-NEXT:    v_rndne_f32_e32 v3, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_sub_f32_e32 v1, v1, v3
+; GFX11-NEXT:    v_rndne_f32_e32 v2, v1
+; GFX11-NEXT:    v_fma_f32 v3, 0x3fb8aa3b, v0, -v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_sub_f32_e32 v1, v1, v2
+; GFX11-NEXT:    v_fmamk_f32 v3, v0, 0x32a5705f, v3
+; GFX11-NEXT:    v_cvt_i32_f32_e32 v2, v2
 ; GFX11-NEXT:    v_cmp_ngt_f32_e32 vcc_lo, 0xc2ce8ed0, v0
-; GFX11-NEXT:    v_fmamk_f32 v2, v0, 0x32a5705f, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_add_f32_e32 v1, v1, v2
-; GFX11-NEXT:    v_cvt_i32_f32_e32 v2, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_add_f32_e32 v1, v1, v3
 ; GFX11-NEXT:    v_exp_f32_e32 v1, v1
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX11-NEXT:    v_ldexp_f32 v1, v1, v2
@@ -19429,7 +18330,7 @@ define bfloat @v_exp_bf16(bfloat %a) {
 ; GFX11-NEXT:    v_cmp_nlt_f32_e32 vcc_lo, 0x42b17218, v0
 ; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7f800000, v1, vcc_lo
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = call bfloat @llvm.exp.bf16(bfloat %a)
   ret bfloat %op
@@ -19471,7 +18372,7 @@ define bfloat @v_exp2_bf16(bfloat %a) {
 ; GFX8-LABEL: v_exp2_bf16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX8-NEXT:    s_mov_b32 s4, 0xc2fc0000
 ; GFX8-NEXT:    v_mov_b32_e32 v1, 0x42800000
 ; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
@@ -19481,52 +18382,52 @@ define bfloat @v_exp2_bf16(bfloat %a) {
 ; GFX8-NEXT:    v_mov_b32_e32 v1, 0x1f800000
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
 ; GFX8-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_exp2_bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX9-NEXT:    s_mov_b32 s4, 0xc2fc0000
-; GFX9-NEXT:    v_mov_b32_e32 v1, 0x42800000
 ; GFX9-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
-; GFX9-NEXT:    v_add_f32_e32 v0, v0, v1
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0x42800000
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
+; GFX9-NEXT:    v_add_f32_e32 v0, v0, v2
 ; GFX9-NEXT:    v_exp_f32_e32 v0, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0x1f800000
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
 ; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_exp2_bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0
-; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0, 0x42800000, vcc_lo
-; GFX10-NEXT:    v_add_f32_e32 v0, v0, v1
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 0x42800000, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x1f800000, vcc_lo
+; GFX10-NEXT:    v_add_f32_e32 v0, v0, v2
 ; GFX10-NEXT:    v_exp_f32_e32 v0, v0
 ; GFX10-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_exp2_bf16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0
-; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, 0x42800000, vcc_lo
-; GFX11-NEXT:    v_add_f32_e32 v0, v0, v1
+; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 0x42800000, vcc_lo
 ; GFX11-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x1f800000, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_add_f32_e32 v0, v0, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_exp_f32_e32 v0, v0
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX11-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = call bfloat @llvm.exp2.bf16(bfloat %a)
   ret bfloat %op
@@ -19588,7 +18489,7 @@ define bfloat @v_exp10_bf16(bfloat %a) {
 ; GFX8-LABEL: v_exp10_bf16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX8-NEXT:    v_sub_f32_e32 v3, v0, v0
 ; GFX8-NEXT:    v_mul_f32_e32 v1, 0x40549000, v0
 ; GFX8-NEXT:    v_mul_f32_e32 v4, 0x3a2784bc, v3
@@ -19609,23 +18510,23 @@ define bfloat @v_exp10_bf16(bfloat %a) {
 ; GFX8-NEXT:    v_mov_b32_e32 v2, 0x7f800000
 ; GFX8-NEXT:    v_cmp_nlt_f32_e32 vcc, s4, v0
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
-; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_exp10_bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX9-NEXT:    s_mov_b32 s4, 0x40549a78
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX9-NEXT:    v_mul_f32_e32 v1, 0x40549a78, v0
-; GFX9-NEXT:    v_fma_f32 v2, v0, s4, -v1
+; GFX9-NEXT:    s_mov_b32 s4, 0x40549a78
+; GFX9-NEXT:    v_rndne_f32_e32 v2, v1
+; GFX9-NEXT:    v_sub_f32_e32 v3, v1, v2
+; GFX9-NEXT:    v_fma_f32 v1, v0, s4, -v1
 ; GFX9-NEXT:    s_mov_b32 s4, 0x33979a37
-; GFX9-NEXT:    v_rndne_f32_e32 v3, v1
-; GFX9-NEXT:    v_fma_f32 v2, v0, s4, v2
-; GFX9-NEXT:    v_sub_f32_e32 v1, v1, v3
-; GFX9-NEXT:    v_add_f32_e32 v1, v1, v2
+; GFX9-NEXT:    v_fma_f32 v1, v0, s4, v1
+; GFX9-NEXT:    v_add_f32_e32 v1, v3, v1
 ; GFX9-NEXT:    v_exp_f32_e32 v1, v1
-; GFX9-NEXT:    v_cvt_i32_f32_e32 v2, v3
+; GFX9-NEXT:    v_cvt_i32_f32_e32 v2, v2
 ; GFX9-NEXT:    s_mov_b32 s4, 0xc23369f4
 ; GFX9-NEXT:    v_cmp_ngt_f32_e32 vcc, s4, v0
 ; GFX9-NEXT:    s_mov_b32 s4, 0x421a209b
@@ -19634,44 +18535,44 @@ define bfloat @v_exp10_bf16(bfloat %a) {
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0x7f800000
 ; GFX9-NEXT:    v_cmp_nlt_f32_e32 vcc, s4, v0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
-; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_exp10_bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX10-NEXT:    v_mul_f32_e32 v1, 0x40549a78, v0
 ; GFX10-NEXT:    v_cmp_ngt_f32_e32 vcc_lo, 0xc23369f4, v0
-; GFX10-NEXT:    v_fma_f32 v2, 0x40549a78, v0, -v1
-; GFX10-NEXT:    v_rndne_f32_e32 v3, v1
-; GFX10-NEXT:    v_fmamk_f32 v2, v0, 0x33979a37, v2
-; GFX10-NEXT:    v_sub_f32_e32 v1, v1, v3
-; GFX10-NEXT:    v_add_f32_e32 v1, v1, v2
-; GFX10-NEXT:    v_cvt_i32_f32_e32 v2, v3
+; GFX10-NEXT:    v_rndne_f32_e32 v2, v1
+; GFX10-NEXT:    v_fma_f32 v3, 0x40549a78, v0, -v1
+; GFX10-NEXT:    v_sub_f32_e32 v1, v1, v2
+; GFX10-NEXT:    v_fmamk_f32 v3, v0, 0x33979a37, v3
+; GFX10-NEXT:    v_cvt_i32_f32_e32 v2, v2
+; GFX10-NEXT:    v_add_f32_e32 v1, v1, v3
 ; GFX10-NEXT:    v_exp_f32_e32 v1, v1
 ; GFX10-NEXT:    v_ldexp_f32 v1, v1, v2
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc_lo
 ; GFX10-NEXT:    v_cmp_nlt_f32_e32 vcc_lo, 0x421a209b, v0
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, 0x7f800000, v1, vcc_lo
-; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_exp10_bf16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_mul_f32_e32 v1, 0x40549a78, v0
-; GFX11-NEXT:    v_fma_f32 v2, 0x40549a78, v0, -v1
-; GFX11-NEXT:    v_rndne_f32_e32 v3, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_sub_f32_e32 v1, v1, v3
+; GFX11-NEXT:    v_rndne_f32_e32 v2, v1
+; GFX11-NEXT:    v_fma_f32 v3, 0x40549a78, v0, -v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_sub_f32_e32 v1, v1, v2
+; GFX11-NEXT:    v_fmamk_f32 v3, v0, 0x33979a37, v3
+; GFX11-NEXT:    v_cvt_i32_f32_e32 v2, v2
 ; GFX11-NEXT:    v_cmp_ngt_f32_e32 vcc_lo, 0xc23369f4, v0
-; GFX11-NEXT:    v_fmamk_f32 v2, v0, 0x33979a37, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_add_f32_e32 v1, v1, v2
-; GFX11-NEXT:    v_cvt_i32_f32_e32 v2, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_add_f32_e32 v1, v1, v3
 ; GFX11-NEXT:    v_exp_f32_e32 v1, v1
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX11-NEXT:    v_ldexp_f32 v1, v1, v2
@@ -19680,7 +18581,7 @@ define bfloat @v_exp10_bf16(bfloat %a) {
 ; GFX11-NEXT:    v_cmp_nlt_f32_e32 vcc_lo, 0x421a209b, v0
 ; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7f800000, v1, vcc_lo
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = call bfloat @llvm.exp10.bf16(bfloat %a)
   ret bfloat %op
@@ -19708,34 +18609,34 @@ define bfloat @v_ceil_bf16(bfloat %a) {
 ; GFX8-LABEL: v_ceil_bf16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX8-NEXT:    v_ceil_f32_e32 v0, v0
-; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_ceil_bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX9-NEXT:    v_ceil_f32_e32 v0, v0
-; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_ceil_bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX10-NEXT:    v_ceil_f32_e32 v0, v0
-; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_ceil_bf16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_ceil_f32_e32 v0, v0
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = call bfloat @llvm.ceil.bf16(bfloat %a)
   ret bfloat %op
@@ -19763,34 +18664,34 @@ define bfloat @v_trunc_bf16(bfloat %a) {
 ; GFX8-LABEL: v_trunc_bf16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX8-NEXT:    v_trunc_f32_e32 v0, v0
-; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_trunc_bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX9-NEXT:    v_trunc_f32_e32 v0, v0
-; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_trunc_bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX10-NEXT:    v_trunc_f32_e32 v0, v0
-; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_trunc_bf16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_trunc_f32_e32 v0, v0
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = call bfloat @llvm.trunc.bf16(bfloat %a)
   ret bfloat %op
@@ -19818,34 +18719,34 @@ define bfloat @v_rint_bf16(bfloat %a) {
 ; GFX8-LABEL: v_rint_bf16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX8-NEXT:    v_rndne_f32_e32 v0, v0
-; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_rint_bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX9-NEXT:    v_rndne_f32_e32 v0, v0
-; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_rint_bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX10-NEXT:    v_rndne_f32_e32 v0, v0
-; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_rint_bf16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_rndne_f32_e32 v0, v0
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = call bfloat @llvm.rint.bf16(bfloat %a)
   ret bfloat %op
@@ -19873,34 +18774,34 @@ define bfloat @v_nearbyint_bf16(bfloat %a) {
 ; GFX8-LABEL: v_nearbyint_bf16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX8-NEXT:    v_rndne_f32_e32 v0, v0
-; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_nearbyint_bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX9-NEXT:    v_rndne_f32_e32 v0, v0
-; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_nearbyint_bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX10-NEXT:    v_rndne_f32_e32 v0, v0
-; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_nearbyint_bf16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_rndne_f32_e32 v0, v0
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = call bfloat @llvm.nearbyint.bf16(bfloat %a)
   ret bfloat %op
@@ -19940,7 +18841,7 @@ define bfloat @v_round_bf16(bfloat %a) {
 ; GFX8-LABEL: v_round_bf16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX8-NEXT:    v_trunc_f32_e32 v1, v0
 ; GFX8-NEXT:    v_sub_f32_e32 v2, v0, v1
 ; GFX8-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v2|, 0.5
@@ -19948,13 +18849,13 @@ define bfloat @v_round_bf16(bfloat %a) {
 ; GFX8-NEXT:    s_brev_b32 s4, -2
 ; GFX8-NEXT:    v_bfi_b32 v0, s4, v2, v0
 ; GFX8-NEXT:    v_add_f32_e32 v0, v1, v0
-; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_round_bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX9-NEXT:    v_trunc_f32_e32 v1, v0
 ; GFX9-NEXT:    v_sub_f32_e32 v2, v0, v1
 ; GFX9-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v2|, 0.5
@@ -19962,26 +18863,26 @@ define bfloat @v_round_bf16(bfloat %a) {
 ; GFX9-NEXT:    s_brev_b32 s4, -2
 ; GFX9-NEXT:    v_bfi_b32 v0, s4, v2, v0
 ; GFX9-NEXT:    v_add_f32_e32 v0, v1, v0
-; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_round_bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX10-NEXT:    v_trunc_f32_e32 v1, v0
 ; GFX10-NEXT:    v_sub_f32_e32 v2, v0, v1
 ; GFX10-NEXT:    v_cmp_ge_f32_e64 s4, |v2|, 0.5
 ; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 1.0, s4
 ; GFX10-NEXT:    v_bfi_b32 v0, 0x7fffffff, v2, v0
 ; GFX10-NEXT:    v_add_f32_e32 v0, v1, v0
-; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_round_bf16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_trunc_f32_e32 v1, v0
 ; GFX11-NEXT:    v_sub_f32_e32 v2, v0, v1
@@ -19992,7 +18893,7 @@ define bfloat @v_round_bf16(bfloat %a) {
 ; GFX11-NEXT:    v_bfi_b32 v0, 0x7fffffff, v2, v0
 ; GFX11-NEXT:    v_add_f32_e32 v0, v1, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = call bfloat @llvm.round.bf16(bfloat %a)
   ret bfloat %op
@@ -20020,34 +18921,34 @@ define bfloat @v_roundeven_bf16(bfloat %a) {
 ; GFX8-LABEL: v_roundeven_bf16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX8-NEXT:    v_rndne_f32_e32 v0, v0
-; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_roundeven_bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX9-NEXT:    v_rndne_f32_e32 v0, v0
-; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_roundeven_bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX10-NEXT:    v_rndne_f32_e32 v0, v0
-; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_roundeven_bf16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_rndne_f32_e32 v0, v0
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = call bfloat @llvm.roundeven.bf16(bfloat %a)
   ret bfloat %op
@@ -20075,34 +18976,34 @@ define bfloat @v_floor_bf16(bfloat %a) {
 ; GFX8-LABEL: v_floor_bf16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX8-NEXT:    v_floor_f32_e32 v0, v0
-; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_floor_bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX9-NEXT:    v_floor_f32_e32 v0, v0
-; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_floor_bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX10-NEXT:    v_floor_f32_e32 v0, v0
-; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_floor_bf16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_floor_f32_e32 v0, v0
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = call bfloat @llvm.floor.bf16(bfloat %a)
   ret bfloat %op
@@ -20124,21 +19025,34 @@ define bfloat @v_canonicalize_bf16(bfloat %a) {
 ; GFX8-LABEL: v_canonicalize_bf16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX8-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_canonicalize_bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX9-NEXT:    v_max_f32_e32 v0, v0, v0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_canonicalize_bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX10-NEXT:    v_max_f32_e32 v0, v0, v0
+; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_canonicalize_bf16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_max_f32_e32 v0, v0, v0
+; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = call bfloat @llvm.canonicalize.bf16(bfloat %a)
   ret bfloat %op
@@ -20214,8 +19128,8 @@ define i1 @v_fcmp_oeq_bf16(bfloat %a, bfloat %b) {
 ; GFX8-LABEL: v_fcmp_oeq_bf16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, v0, v1
 ; GFX8-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
@@ -20223,8 +19137,8 @@ define i1 @v_fcmp_oeq_bf16(bfloat %a, bfloat %b) {
 ; GFX9-LABEL: v_fcmp_oeq_bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX9-NEXT:    v_cmp_eq_f32_e32 vcc, v0, v1
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
@@ -20232,8 +19146,8 @@ define i1 @v_fcmp_oeq_bf16(bfloat %a, bfloat %b) {
 ; GFX10-LABEL: v_fcmp_oeq_bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, v0, v1
 ; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
@@ -20241,8 +19155,8 @@ define i1 @v_fcmp_oeq_bf16(bfloat %a, bfloat %b) {
 ; GFX11-LABEL: v_fcmp_oeq_bf16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_cmp_eq_f32_e32 vcc_lo, v0, v1
 ; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
@@ -20273,8 +19187,8 @@ define i1 @v_fcmp_ogt_bf16(bfloat %a, bfloat %b) {
 ; GFX8-LABEL: v_fcmp_ogt_bf16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v0, v1
 ; GFX8-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
@@ -20282,8 +19196,8 @@ define i1 @v_fcmp_ogt_bf16(bfloat %a, bfloat %b) {
 ; GFX9-LABEL: v_fcmp_ogt_bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX9-NEXT:    v_cmp_gt_f32_e32 vcc, v0, v1
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
@@ -20291,8 +19205,8 @@ define i1 @v_fcmp_ogt_bf16(bfloat %a, bfloat %b) {
 ; GFX10-LABEL: v_fcmp_ogt_bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v0, v1
 ; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
@@ -20300,8 +19214,8 @@ define i1 @v_fcmp_ogt_bf16(bfloat %a, bfloat %b) {
 ; GFX11-LABEL: v_fcmp_ogt_bf16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v0, v1
 ; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
@@ -20332,8 +19246,8 @@ define i1 @v_fcmp_oge_bf16(bfloat %a, bfloat %b) {
 ; GFX8-LABEL: v_fcmp_oge_bf16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX8-NEXT:    v_cmp_ge_f32_e32 vcc, v0, v1
 ; GFX8-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
@@ -20341,8 +19255,8 @@ define i1 @v_fcmp_oge_bf16(bfloat %a, bfloat %b) {
 ; GFX9-LABEL: v_fcmp_oge_bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX9-NEXT:    v_cmp_ge_f32_e32 vcc, v0, v1
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
@@ -20350,8 +19264,8 @@ define i1 @v_fcmp_oge_bf16(bfloat %a, bfloat %b) {
 ; GFX10-LABEL: v_fcmp_oge_bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX10-NEXT:    v_cmp_ge_f32_e32 vcc_lo, v0, v1
 ; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
@@ -20359,8 +19273,8 @@ define i1 @v_fcmp_oge_bf16(bfloat %a, bfloat %b) {
 ; GFX11-LABEL: v_fcmp_oge_bf16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_cmp_ge_f32_e32 vcc_lo, v0, v1
 ; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
@@ -20391,8 +19305,8 @@ define i1 @v_fcmp_olt_bf16(bfloat %a, bfloat %b) {
 ; GFX8-LABEL: v_fcmp_olt_bf16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
 ; GFX8-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
@@ -20400,8 +19314,8 @@ define i1 @v_fcmp_olt_bf16(bfloat %a, bfloat %b) {
 ; GFX9-LABEL: v_fcmp_olt_bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX9-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
@@ -20409,8 +19323,8 @@ define i1 @v_fcmp_olt_bf16(bfloat %a, bfloat %b) {
 ; GFX10-LABEL: v_fcmp_olt_bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v0, v1
 ; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
@@ -20418,8 +19332,8 @@ define i1 @v_fcmp_olt_bf16(bfloat %a, bfloat %b) {
 ; GFX11-LABEL: v_fcmp_olt_bf16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v0, v1
 ; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
@@ -20450,8 +19364,8 @@ define i1 @v_fcmp_ole_bf16(bfloat %a, bfloat %b) {
 ; GFX8-LABEL: v_fcmp_ole_bf16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX8-NEXT:    v_cmp_le_f32_e32 vcc, v0, v1
 ; GFX8-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
@@ -20459,8 +19373,8 @@ define i1 @v_fcmp_ole_bf16(bfloat %a, bfloat %b) {
 ; GFX9-LABEL: v_fcmp_ole_bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX9-NEXT:    v_cmp_le_f32_e32 vcc, v0, v1
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
@@ -20468,8 +19382,8 @@ define i1 @v_fcmp_ole_bf16(bfloat %a, bfloat %b) {
 ; GFX10-LABEL: v_fcmp_ole_bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX10-NEXT:    v_cmp_le_f32_e32 vcc_lo, v0, v1
 ; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
@@ -20477,8 +19391,8 @@ define i1 @v_fcmp_ole_bf16(bfloat %a, bfloat %b) {
 ; GFX11-LABEL: v_fcmp_ole_bf16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_cmp_le_f32_e32 vcc_lo, v0, v1
 ; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
@@ -20509,8 +19423,8 @@ define i1 @v_fcmp_one_bf16(bfloat %a, bfloat %b) {
 ; GFX8-LABEL: v_fcmp_one_bf16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX8-NEXT:    v_cmp_lg_f32_e32 vcc, v0, v1
 ; GFX8-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
@@ -20518,8 +19432,8 @@ define i1 @v_fcmp_one_bf16(bfloat %a, bfloat %b) {
 ; GFX9-LABEL: v_fcmp_one_bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX9-NEXT:    v_cmp_lg_f32_e32 vcc, v0, v1
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
@@ -20527,8 +19441,8 @@ define i1 @v_fcmp_one_bf16(bfloat %a, bfloat %b) {
 ; GFX10-LABEL: v_fcmp_one_bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX10-NEXT:    v_cmp_lg_f32_e32 vcc_lo, v0, v1
 ; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
@@ -20536,8 +19450,8 @@ define i1 @v_fcmp_one_bf16(bfloat %a, bfloat %b) {
 ; GFX11-LABEL: v_fcmp_one_bf16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_cmp_lg_f32_e32 vcc_lo, v0, v1
 ; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
@@ -20568,8 +19482,8 @@ define i1 @v_fcmp_uno_bf16(bfloat %a, bfloat %b) {
 ; GFX8-LABEL: v_fcmp_uno_bf16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v0, v1
 ; GFX8-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
@@ -20577,8 +19491,8 @@ define i1 @v_fcmp_uno_bf16(bfloat %a, bfloat %b) {
 ; GFX9-LABEL: v_fcmp_uno_bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v0, v1
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
@@ -20586,8 +19500,8 @@ define i1 @v_fcmp_uno_bf16(bfloat %a, bfloat %b) {
 ; GFX10-LABEL: v_fcmp_uno_bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v1
 ; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
@@ -20595,8 +19509,8 @@ define i1 @v_fcmp_uno_bf16(bfloat %a, bfloat %b) {
 ; GFX11-LABEL: v_fcmp_uno_bf16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v1
 ; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
@@ -20627,8 +19541,8 @@ define i1 @v_fcmp_ueq_bf16(bfloat %a, bfloat %b) {
 ; GFX8-LABEL: v_fcmp_ueq_bf16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX8-NEXT:    v_cmp_nlg_f32_e32 vcc, v0, v1
 ; GFX8-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
@@ -20636,8 +19550,8 @@ define i1 @v_fcmp_ueq_bf16(bfloat %a, bfloat %b) {
 ; GFX9-LABEL: v_fcmp_ueq_bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX9-NEXT:    v_cmp_nlg_f32_e32 vcc, v0, v1
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
@@ -20645,8 +19559,8 @@ define i1 @v_fcmp_ueq_bf16(bfloat %a, bfloat %b) {
 ; GFX10-LABEL: v_fcmp_ueq_bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX10-NEXT:    v_cmp_nlg_f32_e32 vcc_lo, v0, v1
 ; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
@@ -20654,8 +19568,8 @@ define i1 @v_fcmp_ueq_bf16(bfloat %a, bfloat %b) {
 ; GFX11-LABEL: v_fcmp_ueq_bf16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_cmp_nlg_f32_e32 vcc_lo, v0, v1
 ; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
@@ -20686,8 +19600,8 @@ define i1 @v_fcmp_ugt_bf16(bfloat %a, bfloat %b) {
 ; GFX8-LABEL: v_fcmp_ugt_bf16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX8-NEXT:    v_cmp_nle_f32_e32 vcc, v0, v1
 ; GFX8-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
@@ -20695,8 +19609,8 @@ define i1 @v_fcmp_ugt_bf16(bfloat %a, bfloat %b) {
 ; GFX9-LABEL: v_fcmp_ugt_bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX9-NEXT:    v_cmp_nle_f32_e32 vcc, v0, v1
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
@@ -20704,8 +19618,8 @@ define i1 @v_fcmp_ugt_bf16(bfloat %a, bfloat %b) {
 ; GFX10-LABEL: v_fcmp_ugt_bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX10-NEXT:    v_cmp_nle_f32_e32 vcc_lo, v0, v1
 ; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
@@ -20713,8 +19627,8 @@ define i1 @v_fcmp_ugt_bf16(bfloat %a, bfloat %b) {
 ; GFX11-LABEL: v_fcmp_ugt_bf16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_cmp_nle_f32_e32 vcc_lo, v0, v1
 ; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
@@ -20745,8 +19659,8 @@ define i1 @v_fcmp_uge_bf16(bfloat %a, bfloat %b) {
 ; GFX8-LABEL: v_fcmp_uge_bf16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX8-NEXT:    v_cmp_nlt_f32_e32 vcc, v0, v1
 ; GFX8-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
@@ -20754,8 +19668,8 @@ define i1 @v_fcmp_uge_bf16(bfloat %a, bfloat %b) {
 ; GFX9-LABEL: v_fcmp_uge_bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX9-NEXT:    v_cmp_nlt_f32_e32 vcc, v0, v1
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
@@ -20763,8 +19677,8 @@ define i1 @v_fcmp_uge_bf16(bfloat %a, bfloat %b) {
 ; GFX10-LABEL: v_fcmp_uge_bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX10-NEXT:    v_cmp_nlt_f32_e32 vcc_lo, v0, v1
 ; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
@@ -20772,8 +19686,8 @@ define i1 @v_fcmp_uge_bf16(bfloat %a, bfloat %b) {
 ; GFX11-LABEL: v_fcmp_uge_bf16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_cmp_nlt_f32_e32 vcc_lo, v0, v1
 ; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
@@ -20804,8 +19718,8 @@ define i1 @v_fcmp_ult_bf16(bfloat %a, bfloat %b) {
 ; GFX8-LABEL: v_fcmp_ult_bf16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX8-NEXT:    v_cmp_nge_f32_e32 vcc, v0, v1
 ; GFX8-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
@@ -20813,8 +19727,8 @@ define i1 @v_fcmp_ult_bf16(bfloat %a, bfloat %b) {
 ; GFX9-LABEL: v_fcmp_ult_bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX9-NEXT:    v_cmp_nge_f32_e32 vcc, v0, v1
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
@@ -20822,8 +19736,8 @@ define i1 @v_fcmp_ult_bf16(bfloat %a, bfloat %b) {
 ; GFX10-LABEL: v_fcmp_ult_bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX10-NEXT:    v_cmp_nge_f32_e32 vcc_lo, v0, v1
 ; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
@@ -20831,8 +19745,8 @@ define i1 @v_fcmp_ult_bf16(bfloat %a, bfloat %b) {
 ; GFX11-LABEL: v_fcmp_ult_bf16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_cmp_nge_f32_e32 vcc_lo, v0, v1
 ; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
@@ -20863,8 +19777,8 @@ define i1 @v_fcmp_ule_bf16(bfloat %a, bfloat %b) {
 ; GFX8-LABEL: v_fcmp_ule_bf16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX8-NEXT:    v_cmp_ngt_f32_e32 vcc, v0, v1
 ; GFX8-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
@@ -20872,8 +19786,8 @@ define i1 @v_fcmp_ule_bf16(bfloat %a, bfloat %b) {
 ; GFX9-LABEL: v_fcmp_ule_bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX9-NEXT:    v_cmp_ngt_f32_e32 vcc, v0, v1
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
@@ -20881,8 +19795,8 @@ define i1 @v_fcmp_ule_bf16(bfloat %a, bfloat %b) {
 ; GFX10-LABEL: v_fcmp_ule_bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX10-NEXT:    v_cmp_ngt_f32_e32 vcc_lo, v0, v1
 ; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
@@ -20890,8 +19804,8 @@ define i1 @v_fcmp_ule_bf16(bfloat %a, bfloat %b) {
 ; GFX11-LABEL: v_fcmp_ule_bf16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_cmp_ngt_f32_e32 vcc_lo, v0, v1
 ; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
@@ -20922,8 +19836,8 @@ define i1 @v_fcmp_une_bf16(bfloat %a, bfloat %b) {
 ; GFX8-LABEL: v_fcmp_une_bf16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX8-NEXT:    v_cmp_neq_f32_e32 vcc, v0, v1
 ; GFX8-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
@@ -20931,8 +19845,8 @@ define i1 @v_fcmp_une_bf16(bfloat %a, bfloat %b) {
 ; GFX9-LABEL: v_fcmp_une_bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX9-NEXT:    v_cmp_neq_f32_e32 vcc, v0, v1
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
@@ -20940,8 +19854,8 @@ define i1 @v_fcmp_une_bf16(bfloat %a, bfloat %b) {
 ; GFX10-LABEL: v_fcmp_une_bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX10-NEXT:    v_cmp_neq_f32_e32 vcc_lo, v0, v1
 ; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
@@ -20949,8 +19863,8 @@ define i1 @v_fcmp_une_bf16(bfloat %a, bfloat %b) {
 ; GFX11-LABEL: v_fcmp_une_bf16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_cmp_neq_f32_e32 vcc_lo, v0, v1
 ; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
@@ -21025,41 +19939,34 @@ define bfloat @v_copysign_bf16_bf16(bfloat %mag, bfloat %sign) {
 ; GFX8-LABEL: v_copysign_bf16_bf16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_mov_b32_e32 v2, 0x7fff
-; GFX8-NEXT:    v_and_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-NEXT:    v_and_b32_e32 v1, 0x80000000, v1
-; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff8000, v1
+; GFX8-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_copysign_bf16_bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
-; GFX9-NEXT:    v_and_b32_sdwa v0, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-NEXT:    v_and_b32_e32 v1, 0x80000000, v1
-; GFX9-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff8000, v1
+; GFX9-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
+; GFX9-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_copysign_bf16_bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v2, 0x7fff
-; GFX10-NEXT:    v_and_b32_e32 v1, 0x80000000, v1
-; GFX10-NEXT:    v_and_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX10-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff8000, v1
+; GFX10-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
+; GFX10-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_copysign_bf16_bf16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX11-NEXT:    v_and_b32_e32 v1, 0x80000000, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff8000, v1
 ; GFX11-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
-; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = call bfloat @llvm.copysign.bf16(bfloat %mag, bfloat %sign)
   ret bfloat %op
@@ -21089,45 +19996,36 @@ define bfloat @v_copysign_bf16_s_bf16(bfloat %mag, bfloat inreg %sign) {
 ; GFX8-LABEL: v_copysign_bf16_s_bf16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    s_and_b32 s4, s4, 0x80000000
-; GFX8-NEXT:    v_mov_b32_e32 v1, 0x7fff
-; GFX8-NEXT:    s_lshr_b32 s4, s4, 16
-; GFX8-NEXT:    v_and_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-NEXT:    v_mov_b32_e32 v1, s4
-; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT:    v_mov_b32_e32 v1, 0xffff8000
+; GFX8-NEXT:    v_and_b32_e32 v1, s4, v1
+; GFX8-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_copysign_bf16_s_bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_movk_i32 s5, 0x7fff
-; GFX9-NEXT:    s_and_b32 s4, s4, 0x80000000
-; GFX9-NEXT:    v_and_b32_sdwa v0, v0, s5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-NEXT:    s_lshr_b32 s4, s4, 16
-; GFX9-NEXT:    v_or_b32_sdwa v0, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0xffff8000
+; GFX9-NEXT:    v_and_b32_e32 v1, s4, v1
+; GFX9-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
+; GFX9-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_copysign_bf16_s_bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v1, 0x7fff
-; GFX10-NEXT:    s_and_b32 s4, s4, 0x80000000
-; GFX10-NEXT:    s_lshr_b32 s4, s4, 16
-; GFX10-NEXT:    v_and_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX10-NEXT:    v_or_b32_sdwa v0, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX10-NEXT:    v_and_b32_e64 v1, 0xffff8000, s4
+; GFX10-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
+; GFX10-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_copysign_bf16_s_bf16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX11-NEXT:    s_and_b32 s0, s0, 0x80000000
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    s_lshr_b32 s0, s0, 16
+; GFX11-NEXT:    v_and_b32_e64 v1, 0xffff8000, s0
 ; GFX11-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_or_b32_e32 v0, s0, v0
-; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = call bfloat @llvm.copysign.bf16(bfloat %mag, bfloat %sign)
   ret bfloat %op
@@ -21157,43 +20055,36 @@ define bfloat @v_copysign_s_bf16_bf16(bfloat inreg %mag, bfloat %sign) {
 ; GFX8-LABEL: v_copysign_s_bf16_bf16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    s_lshr_b32 s4, s4, 16
 ; GFX8-NEXT:    v_mov_b32_e32 v1, 0x7fff
+; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff8000, v0
 ; GFX8-NEXT:    v_and_b32_e32 v1, s4, v1
-; GFX8-NEXT:    v_and_b32_e32 v0, 0x80000000, v0
-; GFX8-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT:    v_or_b32_e32 v0, v1, v0
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_copysign_s_bf16_bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_lshr_b32 s4, s4, 16
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0x7fff
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff8000, v0
 ; GFX9-NEXT:    v_and_b32_e32 v1, s4, v1
-; GFX9-NEXT:    v_and_b32_e32 v0, 0x80000000, v0
-; GFX9-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT:    v_or_b32_e32 v0, v1, v0
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_copysign_s_bf16_bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    s_lshr_b32 s4, s4, 16
-; GFX10-NEXT:    v_and_b32_e32 v0, 0x80000000, v0
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff8000, v0
 ; GFX10-NEXT:    v_and_b32_e64 v1, 0x7fff, s4
-; GFX10-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX10-NEXT:    v_or_b32_e32 v0, v1, v0
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_copysign_s_bf16_bf16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_and_b32_e32 v0, 0x80000000, v0
-; GFX11-NEXT:    s_lshr_b32 s0, s0, 16
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff8000, v0
 ; GFX11-NEXT:    v_and_b32_e64 v1, 0x7fff, s0
-; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_or_b32_e32 v0, v1, v0
-; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = call bfloat @llvm.copysign.bf16(bfloat %mag, bfloat %sign)
   ret bfloat %op
@@ -21223,41 +20114,35 @@ define bfloat @v_copysign_bf16_f32(bfloat %mag, float %sign.f32) {
 ; GFX8-LABEL: v_copysign_bf16_f32:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_mov_b32_e32 v2, 0x7fff
-; GFX8-NEXT:    v_and_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX8-NEXT:    v_and_b32_e32 v1, 0x80000000, v1
-; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_copysign_bf16_f32:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
-; GFX9-NEXT:    v_and_b32_sdwa v0, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX9-NEXT:    v_and_b32_e32 v1, 0x80000000, v1
-; GFX9-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_copysign_bf16_f32:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v2, 0x7fff
+; GFX10-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX10-NEXT:    v_and_b32_e32 v1, 0x80000000, v1
-; GFX10-NEXT:    v_and_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX10-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX10-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_copysign_bf16_f32:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX11-NEXT:    v_and_b32_e32 v1, 0x80000000, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %sign = fptrunc float %sign.f32 to bfloat
   %op = call bfloat @llvm.copysign.bf16(bfloat %mag, bfloat %sign)
@@ -21288,41 +20173,35 @@ define bfloat @v_copysign_bf16_f64(bfloat %mag, double %sign.f64) {
 ; GFX8-LABEL: v_copysign_bf16_f64:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_mov_b32_e32 v1, 0x7fff
-; GFX8-NEXT:    v_and_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX8-NEXT:    v_and_b32_e32 v1, 0x80000000, v2
-; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_copysign_bf16_f64:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
-; GFX9-NEXT:    v_and_b32_sdwa v0, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX9-NEXT:    v_and_b32_e32 v1, 0x80000000, v2
-; GFX9-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_copysign_bf16_f64:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v1, 0x7fff
-; GFX10-NEXT:    v_and_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX10-NEXT:    v_and_b32_e32 v1, 0x80000000, v2
-; GFX10-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX10-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_copysign_bf16_f64:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX11-NEXT:    v_and_b32_e32 v1, 0x80000000, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %sign = fptrunc double %sign.f64 to bfloat
   %op = call bfloat @llvm.copysign.bf16(bfloat %mag, bfloat %sign)
@@ -21353,40 +20232,34 @@ define bfloat @v_copysign_bf16_f16(bfloat %mag, half %sign.f16) {
 ; GFX8-LABEL: v_copysign_bf16_f16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_mov_b32_e32 v2, 0x7fff
 ; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff8000, v1
-; GFX8-NEXT:    v_and_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_copysign_bf16_f16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
 ; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff8000, v1
-; GFX9-NEXT:    v_and_b32_sdwa v0, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX9-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
+; GFX9-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_copysign_bf16_f16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v2, 0x7fff
 ; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff8000, v1
-; GFX10-NEXT:    v_and_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX10-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX10-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
+; GFX10-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_copysign_bf16_f16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff8000, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
-; GFX11-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %sign = bitcast half %sign.f16 to bfloat
   %op = call bfloat @llvm.copysign.bf16(bfloat %mag, bfloat %sign)
@@ -21412,50 +20285,43 @@ define amdgpu_ps i32 @s_copysign_bf16_bf16(bfloat inreg %mag, bfloat inreg %sign
 ;
 ; GFX8-LABEL: s_copysign_bf16_bf16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_lshr_b32 s0, s0, 16
-; GFX8-NEXT:    v_mov_b32_e32 v0, 0x7fff
-; GFX8-NEXT:    v_and_b32_e32 v0, s0, v0
-; GFX8-NEXT:    s_and_b32 s0, s1, 0x80000000
-; GFX8-NEXT:    s_lshr_b32 s0, s0, 16
-; GFX8-NEXT:    v_or_b32_e32 v0, s0, v0
+; GFX8-NEXT:    v_mov_b32_e32 v0, 0xffff8000
+; GFX8-NEXT:    v_mov_b32_e32 v1, 0x7fff
+; GFX8-NEXT:    v_and_b32_e32 v0, s1, v0
+; GFX8-NEXT:    v_and_b32_e32 v1, s0, v1
+; GFX8-NEXT:    v_or_b32_e32 v0, v1, v0
 ; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX8-NEXT:    ; return to shader part epilog
 ;
 ; GFX9-LABEL: s_copysign_bf16_bf16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_lshr_b32 s0, s0, 16
-; GFX9-NEXT:    v_mov_b32_e32 v0, 0x7fff
-; GFX9-NEXT:    v_and_b32_e32 v0, s0, v0
-; GFX9-NEXT:    s_and_b32 s0, s1, 0x80000000
-; GFX9-NEXT:    s_lshr_b32 s0, s0, 16
-; GFX9-NEXT:    v_or_b32_e32 v0, s0, v0
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0xffff8000
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0x7fff
+; GFX9-NEXT:    v_and_b32_e32 v0, s1, v0
+; GFX9-NEXT:    v_and_b32_e32 v1, s0, v1
+; GFX9-NEXT:    v_or_b32_e32 v0, v1, v0
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX9-NEXT:    ; return to shader part epilog
 ;
 ; GFX10-LABEL: s_copysign_bf16_bf16:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_lshr_b32 s0, s0, 16
-; GFX10-NEXT:    v_and_b32_e64 v0, 0x7fff, s0
-; GFX10-NEXT:    s_and_b32 s0, s1, 0x80000000
-; GFX10-NEXT:    s_lshr_b32 s0, s0, 16
-; GFX10-NEXT:    v_or_b32_e32 v0, s0, v0
+; GFX10-NEXT:    v_and_b32_e64 v0, 0xffff8000, s1
+; GFX10-NEXT:    v_and_b32_e64 v1, 0x7fff, s0
+; GFX10-NEXT:    v_or_b32_e32 v0, v1, v0
 ; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX10-NEXT:    ; return to shader part epilog
 ;
 ; GFX11-LABEL: s_copysign_bf16_bf16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_lshr_b32 s0, s0, 16
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT:    v_and_b32_e64 v0, 0x7fff, s0
-; GFX11-NEXT:    s_and_b32 s0, s1, 0x80000000
-; GFX11-NEXT:    s_lshr_b32 s0, s0, 16
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT:    v_or_b32_e32 v0, s0, v0
+; GFX11-NEXT:    v_and_b32_e64 v0, 0xffff8000, s1
+; GFX11-NEXT:    v_and_b32_e64 v1, 0x7fff, s0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_or_b32_e32 v0, v1, v0
 ; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX11-NEXT:    ; return to shader part epilog
   %op = call bfloat @llvm.copysign.bf16(bfloat %mag, bfloat %sign)
@@ -21484,7 +20350,6 @@ define amdgpu_ps i32 @s_copysign_bf16_f32(bfloat inreg %mag, float inreg %sign.f
 ;
 ; GFX8-LABEL: s_copysign_bf16_f32:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_lshr_b32 s0, s0, 16
 ; GFX8-NEXT:    v_mov_b32_e32 v0, 0x7fff
 ; GFX8-NEXT:    v_and_b32_e32 v0, s0, v0
 ; GFX8-NEXT:    s_and_b32 s0, s1, 0x80000000
@@ -21496,7 +20361,6 @@ define amdgpu_ps i32 @s_copysign_bf16_f32(bfloat inreg %mag, float inreg %sign.f
 ;
 ; GFX9-LABEL: s_copysign_bf16_f32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_lshr_b32 s0, s0, 16
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0x7fff
 ; GFX9-NEXT:    v_and_b32_e32 v0, s0, v0
 ; GFX9-NEXT:    s_and_b32 s0, s1, 0x80000000
@@ -21508,7 +20372,6 @@ define amdgpu_ps i32 @s_copysign_bf16_f32(bfloat inreg %mag, float inreg %sign.f
 ;
 ; GFX10-LABEL: s_copysign_bf16_f32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_lshr_b32 s0, s0, 16
 ; GFX10-NEXT:    v_and_b32_e64 v0, 0x7fff, s0
 ; GFX10-NEXT:    s_and_b32 s0, s1, 0x80000000
 ; GFX10-NEXT:    s_lshr_b32 s0, s0, 16
@@ -21519,10 +20382,9 @@ define amdgpu_ps i32 @s_copysign_bf16_f32(bfloat inreg %mag, float inreg %sign.f
 ;
 ; GFX11-LABEL: s_copysign_bf16_f32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_lshr_b32 s0, s0, 16
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX11-NEXT:    v_and_b32_e64 v0, 0x7fff, s0
 ; GFX11-NEXT:    s_and_b32 s0, s1, 0x80000000
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_lshr_b32 s0, s0, 16
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
 ; GFX11-NEXT:    v_or_b32_e32 v0, s0, v0
@@ -21557,7 +20419,6 @@ define amdgpu_ps i32 @s_copysign_bf16_f64(bfloat inreg %mag, double inreg %sign.
 ;
 ; GFX8-LABEL: s_copysign_bf16_f64:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_lshr_b32 s0, s0, 16
 ; GFX8-NEXT:    v_mov_b32_e32 v0, 0x7fff
 ; GFX8-NEXT:    v_and_b32_e32 v0, s0, v0
 ; GFX8-NEXT:    s_and_b32 s0, s2, 0x80000000
@@ -21569,7 +20430,6 @@ define amdgpu_ps i32 @s_copysign_bf16_f64(bfloat inreg %mag, double inreg %sign.
 ;
 ; GFX9-LABEL: s_copysign_bf16_f64:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_lshr_b32 s0, s0, 16
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0x7fff
 ; GFX9-NEXT:    v_and_b32_e32 v0, s0, v0
 ; GFX9-NEXT:    s_and_b32 s0, s2, 0x80000000
@@ -21581,7 +20441,6 @@ define amdgpu_ps i32 @s_copysign_bf16_f64(bfloat inreg %mag, double inreg %sign.
 ;
 ; GFX10-LABEL: s_copysign_bf16_f64:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_lshr_b32 s0, s0, 16
 ; GFX10-NEXT:    v_and_b32_e64 v0, 0x7fff, s0
 ; GFX10-NEXT:    s_and_b32 s0, s2, 0x80000000
 ; GFX10-NEXT:    s_lshr_b32 s0, s0, 16
@@ -21592,10 +20451,9 @@ define amdgpu_ps i32 @s_copysign_bf16_f64(bfloat inreg %mag, double inreg %sign.
 ;
 ; GFX11-LABEL: s_copysign_bf16_f64:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_lshr_b32 s0, s0, 16
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX11-NEXT:    v_and_b32_e64 v0, 0x7fff, s0
 ; GFX11-NEXT:    s_and_b32 s0, s2, 0x80000000
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_lshr_b32 s0, s0, 16
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
 ; GFX11-NEXT:    v_or_b32_e32 v0, s0, v0
@@ -21632,7 +20490,6 @@ define amdgpu_ps i32 @s_copysign_bf16_f16(bfloat inreg %mag, half inreg %sign.f1
 ;
 ; GFX8-LABEL: s_copysign_bf16_f16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_lshr_b32 s0, s0, 16
 ; GFX8-NEXT:    v_mov_b32_e32 v0, 0xffff8000
 ; GFX8-NEXT:    v_mov_b32_e32 v1, 0x7fff
 ; GFX8-NEXT:    v_and_b32_e32 v0, s1, v0
@@ -21644,7 +20501,6 @@ define amdgpu_ps i32 @s_copysign_bf16_f16(bfloat inreg %mag, half inreg %sign.f1
 ;
 ; GFX9-LABEL: s_copysign_bf16_f16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_lshr_b32 s0, s0, 16
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0xffff8000
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0x7fff
 ; GFX9-NEXT:    v_and_b32_e32 v0, s1, v0
@@ -21656,7 +20512,6 @@ define amdgpu_ps i32 @s_copysign_bf16_f16(bfloat inreg %mag, half inreg %sign.f1
 ;
 ; GFX10-LABEL: s_copysign_bf16_f16:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_lshr_b32 s0, s0, 16
 ; GFX10-NEXT:    v_and_b32_e64 v0, 0xffff8000, s1
 ; GFX10-NEXT:    v_and_b32_e64 v1, 0x7fff, s0
 ; GFX10-NEXT:    v_or_b32_e32 v0, v1, v0
@@ -21666,7 +20521,6 @@ define amdgpu_ps i32 @s_copysign_bf16_f16(bfloat inreg %mag, half inreg %sign.f1
 ;
 ; GFX11-LABEL: s_copysign_bf16_f16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_lshr_b32 s0, s0, 16
 ; GFX11-NEXT:    v_and_b32_e64 v0, 0xffff8000, s1
 ; GFX11-NEXT:    v_and_b32_e64 v1, 0x7fff, s0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -21703,6 +20557,7 @@ define float @v_copysign_f32_bf16(float %mag, bfloat %sign.bf16) {
 ; GFX8-LABEL: v_copysign_f32_bf16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX8-NEXT:    s_brev_b32 s4, -2
 ; GFX8-NEXT:    v_bfi_b32 v0, s4, v0, v1
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
@@ -21710,6 +20565,7 @@ define float @v_copysign_f32_bf16(float %mag, bfloat %sign.bf16) {
 ; GFX9-LABEL: v_copysign_f32_bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX9-NEXT:    s_brev_b32 s4, -2
 ; GFX9-NEXT:    v_bfi_b32 v0, s4, v0, v1
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
@@ -21717,12 +20573,15 @@ define float @v_copysign_f32_bf16(float %mag, bfloat %sign.bf16) {
 ; GFX10-LABEL: v_copysign_f32_bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX10-NEXT:    v_bfi_b32 v0, 0x7fffffff, v0, v1
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_copysign_f32_bf16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_bfi_b32 v0, 0x7fffffff, v0, v1
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %sign = fpext bfloat %sign.bf16 to float
@@ -21751,32 +20610,32 @@ define amdgpu_ps i32 @s_copysign_f32_bf16(float inreg %mag, bfloat inreg %sign.b
 ;
 ; GFX8-LABEL: s_copysign_f32_bf16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_brev_b32 s2, -2
-; GFX8-NEXT:    v_mov_b32_e32 v0, s0
-; GFX8-NEXT:    v_mov_b32_e32 v1, s1
-; GFX8-NEXT:    v_bfi_b32 v0, s2, v0, v1
+; GFX8-NEXT:    v_lshlrev_b32_e64 v0, 16, s1
+; GFX8-NEXT:    s_brev_b32 s1, -2
+; GFX8-NEXT:    v_mov_b32_e32 v1, s0
+; GFX8-NEXT:    v_bfi_b32 v0, s1, v1, v0
 ; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX8-NEXT:    ; return to shader part epilog
 ;
 ; GFX9-LABEL: s_copysign_f32_bf16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_brev_b32 s2, -2
-; GFX9-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NEXT:    v_bfi_b32 v0, s2, v0, v1
+; GFX9-NEXT:    v_lshlrev_b32_e64 v0, 16, s1
+; GFX9-NEXT:    s_brev_b32 s1, -2
+; GFX9-NEXT:    v_mov_b32_e32 v1, s0
+; GFX9-NEXT:    v_bfi_b32 v0, s1, v1, v0
 ; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX9-NEXT:    ; return to shader part epilog
 ;
 ; GFX10-LABEL: s_copysign_f32_bf16:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    v_mov_b32_e32 v0, s1
+; GFX10-NEXT:    v_lshlrev_b32_e64 v0, 16, s1
 ; GFX10-NEXT:    v_bfi_b32 v0, 0x7fffffff, s0, v0
 ; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX10-NEXT:    ; return to shader part epilog
 ;
 ; GFX11-LABEL: s_copysign_f32_bf16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-NEXT:    v_lshlrev_b32_e64 v0, 16, s1
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_bfi_b32 v0, 0x7fffffff, s0, v0
 ; GFX11-NEXT:    v_readfirstlane_b32 s0, v0
@@ -21816,7 +20675,6 @@ define half @v_copysign_f16_bf16(half %mag, bfloat %sign.bf16) {
 ; GFX8-LABEL: v_copysign_f16_bf16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
 ; GFX8-NEXT:    s_movk_i32 s4, 0x7fff
 ; GFX8-NEXT:    v_bfi_b32 v0, s4, v0, v1
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
@@ -21824,7 +20682,6 @@ define half @v_copysign_f16_bf16(half %mag, bfloat %sign.bf16) {
 ; GFX9-LABEL: v_copysign_f16_bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
 ; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
 ; GFX9-NEXT:    v_bfi_b32 v0, s4, v0, v1
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
@@ -21832,15 +20689,12 @@ define half @v_copysign_f16_bf16(half %mag, bfloat %sign.bf16) {
 ; GFX10-LABEL: v_copysign_f16_bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
 ; GFX10-NEXT:    v_bfi_b32 v0, 0x7fff, v0, v1
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_copysign_f16_bf16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_bfi_b32 v0, 0x7fff, v0, v1
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %sign = bitcast bfloat %sign.bf16 to half
@@ -21875,7 +20729,6 @@ define amdgpu_ps i32 @s_copysign_f16_bf16(half inreg %mag, bfloat inreg %sign.bf
 ;
 ; GFX8-LABEL: s_copysign_f16_bf16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_lshr_b32 s1, s1, 16
 ; GFX8-NEXT:    s_movk_i32 s2, 0x7fff
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s1
@@ -21886,7 +20739,6 @@ define amdgpu_ps i32 @s_copysign_f16_bf16(half inreg %mag, bfloat inreg %sign.bf
 ;
 ; GFX9-LABEL: s_copysign_f16_bf16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_lshr_b32 s1, s1, 16
 ; GFX9-NEXT:    s_movk_i32 s2, 0x7fff
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s1
@@ -21897,7 +20749,6 @@ define amdgpu_ps i32 @s_copysign_f16_bf16(half inreg %mag, bfloat inreg %sign.bf
 ;
 ; GFX10-LABEL: s_copysign_f16_bf16:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_lshr_b32 s1, s1, 16
 ; GFX10-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX10-NEXT:    v_bfi_b32 v0, 0x7fff, s0, v0
 ; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff, v0
@@ -21906,12 +20757,11 @@ define amdgpu_ps i32 @s_copysign_f16_bf16(half inreg %mag, bfloat inreg %sign.bf
 ;
 ; GFX11-LABEL: s_copysign_f16_bf16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_lshr_b32 s1, s1, 16
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_mov_b32_e32 v0, s1
-; GFX11-NEXT:    v_bfi_b32 v0, 0x7fff, s0, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_bfi_b32 v0, 0x7fff, s0, v0
 ; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX11-NEXT:    ; return to shader part epilog
   %sign = bitcast bfloat %sign.bf16 to half
@@ -21942,6 +20792,7 @@ define double @v_copysign_f64_bf16(double %mag, bfloat %sign.bf16) {
 ; GFX8-LABEL: v_copysign_f64_bf16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX8-NEXT:    s_brev_b32 s4, -2
 ; GFX8-NEXT:    v_bfi_b32 v1, s4, v1, v2
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
@@ -21949,6 +20800,7 @@ define double @v_copysign_f64_bf16(double %mag, bfloat %sign.bf16) {
 ; GFX9-LABEL: v_copysign_f64_bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX9-NEXT:    s_brev_b32 s4, -2
 ; GFX9-NEXT:    v_bfi_b32 v1, s4, v1, v2
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
@@ -21956,12 +20808,15 @@ define double @v_copysign_f64_bf16(double %mag, bfloat %sign.bf16) {
 ; GFX10-LABEL: v_copysign_f64_bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX10-NEXT:    v_bfi_b32 v1, 0x7fffffff, v1, v2
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_copysign_f64_bf16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_bfi_b32 v1, 0x7fffffff, v1, v2
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %sign = fpext bfloat %sign.bf16 to double
@@ -21990,32 +20845,32 @@ define amdgpu_ps <2 x i32> @s_copysign_f64_bf16(double inreg %mag, bfloat inreg
 ;
 ; GFX8-LABEL: s_copysign_f64_bf16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_brev_b32 s3, -2
-; GFX8-NEXT:    v_mov_b32_e32 v0, s1
-; GFX8-NEXT:    v_mov_b32_e32 v1, s2
-; GFX8-NEXT:    v_bfi_b32 v0, s3, v0, v1
+; GFX8-NEXT:    v_lshlrev_b32_e64 v0, 16, s2
+; GFX8-NEXT:    s_brev_b32 s2, -2
+; GFX8-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8-NEXT:    v_bfi_b32 v0, s2, v1, v0
 ; GFX8-NEXT:    v_readfirstlane_b32 s1, v0
 ; GFX8-NEXT:    ; return to shader part epilog
 ;
 ; GFX9-LABEL: s_copysign_f64_bf16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_brev_b32 s3, -2
-; GFX9-NEXT:    v_mov_b32_e32 v0, s1
-; GFX9-NEXT:    v_mov_b32_e32 v1, s2
-; GFX9-NEXT:    v_bfi_b32 v0, s3, v0, v1
+; GFX9-NEXT:    v_lshlrev_b32_e64 v0, 16, s2
+; GFX9-NEXT:    s_brev_b32 s2, -2
+; GFX9-NEXT:    v_mov_b32_e32 v1, s1
+; GFX9-NEXT:    v_bfi_b32 v0, s2, v1, v0
 ; GFX9-NEXT:    v_readfirstlane_b32 s1, v0
 ; GFX9-NEXT:    ; return to shader part epilog
 ;
 ; GFX10-LABEL: s_copysign_f64_bf16:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    v_mov_b32_e32 v0, s2
+; GFX10-NEXT:    v_lshlrev_b32_e64 v0, 16, s2
 ; GFX10-NEXT:    v_bfi_b32 v0, 0x7fffffff, s1, v0
 ; GFX10-NEXT:    v_readfirstlane_b32 s1, v0
 ; GFX10-NEXT:    ; return to shader part epilog
 ;
 ; GFX11-LABEL: s_copysign_f64_bf16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-NEXT:    v_lshlrev_b32_e64 v0, 16, s2
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_bfi_b32 v0, 0x7fffffff, s1, v0
 ; GFX11-NEXT:    v_readfirstlane_b32 s1, v0
@@ -22050,28 +20905,28 @@ define i16 @v_fptosi_bf16_to_i16(bfloat %x) {
 ; GFX8-LABEL: v_fptosi_bf16_to_i16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX8-NEXT:    v_cvt_i32_f32_e32 v0, v0
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_fptosi_bf16_to_i16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX9-NEXT:    v_cvt_i32_f32_e32 v0, v0
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fptosi_bf16_to_i16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX10-NEXT:    v_cvt_i32_f32_e32 v0, v0
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fptosi_bf16_to_i16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_cvt_i32_f32_e32 v0, v0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
@@ -22188,13 +21043,13 @@ define <3 x i16> @v_fptosi_v3bf16_to_v3i16(<3 x bfloat> %x) {
 ; GFX8-LABEL: v_fptosi_v3bf16_to_v3i16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
-; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX8-NEXT:    v_cvt_i32_f32_e32 v2, v2
-; GFX8-NEXT:    v_cvt_i32_f32_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX8-NEXT:    v_cvt_i32_f32_sdwa v2, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+; GFX8-NEXT:    v_cvt_i32_f32_e32 v0, v0
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX8-NEXT:    v_cvt_i32_f32_e32 v1, v1
-; GFX8-NEXT:    v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_fptosi_v3bf16_to_v3i16:
@@ -22285,65 +21140,65 @@ define <4 x i16> @v_fptosi_v4bf16_to_v4i16(<4 x bfloat> %x) {
 ; GFX8-LABEL: v_fptosi_v4bf16_to_v4i16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
-; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
-; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX8-NEXT:    v_cvt_i32_f32_e32 v2, v2
-; GFX8-NEXT:    v_cvt_i32_f32_e32 v3, v3
-; GFX8-NEXT:    v_cvt_i32_f32_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
-; GFX8-NEXT:    v_cvt_i32_f32_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
-; GFX8-NEXT:    v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX8-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v1
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-NEXT:    v_cvt_i32_f32_sdwa v2, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+; GFX8-NEXT:    v_cvt_i32_f32_e32 v0, v0
+; GFX8-NEXT:    v_cvt_i32_f32_sdwa v3, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+; GFX8-NEXT:    v_cvt_i32_f32_e32 v1, v1
+; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_fptosi_v4bf16_to_v4i16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
-; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
+; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 16, v1
+; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX9-NEXT:    v_cvt_i32_f32_e32 v2, v2
-; GFX9-NEXT:    v_cvt_i32_f32_e32 v3, v3
 ; GFX9-NEXT:    v_cvt_i32_f32_e32 v0, v0
+; GFX9-NEXT:    v_cvt_i32_f32_e32 v3, v3
 ; GFX9-NEXT:    v_cvt_i32_f32_e32 v1, v1
 ; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
-; GFX9-NEXT:    v_perm_b32 v0, v0, v3, s4
-; GFX9-NEXT:    v_perm_b32 v1, v1, v2, s4
+; GFX9-NEXT:    v_perm_b32 v0, v0, v2, s4
+; GFX9-NEXT:    v_perm_b32 v1, v1, v3, s4
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fptosi_v4bf16_to_v4i16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
-; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
 ; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v1
 ; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX10-NEXT:    v_cvt_i32_f32_e32 v2, v2
-; GFX10-NEXT:    v_cvt_i32_f32_e32 v3, v3
 ; GFX10-NEXT:    v_cvt_i32_f32_e32 v0, v0
+; GFX10-NEXT:    v_cvt_i32_f32_e32 v3, v3
 ; GFX10-NEXT:    v_cvt_i32_f32_e32 v1, v1
-; GFX10-NEXT:    v_perm_b32 v0, v0, v3, 0x5040100
-; GFX10-NEXT:    v_perm_b32 v1, v1, v2, 0x5040100
+; GFX10-NEXT:    v_perm_b32 v0, v0, v2, 0x5040100
+; GFX10-NEXT:    v_perm_b32 v1, v1, v3, 0x5040100
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fptosi_v4bf16_to_v4i16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
-; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
 ; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v1
 ; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX11-NEXT:    v_cvt_i32_f32_e32 v2, v2
-; GFX11-NEXT:    v_cvt_i32_f32_e32 v3, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX11-NEXT:    v_cvt_i32_f32_e32 v0, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_cvt_i32_f32_e32 v3, v3
 ; GFX11-NEXT:    v_cvt_i32_f32_e32 v1, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_perm_b32 v0, v0, v3, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v1, v1, v2, 0x5040100
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_perm_b32 v0, v0, v2, 0x5040100
+; GFX11-NEXT:    v_perm_b32 v1, v1, v3, 0x5040100
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = fptosi <4 x bfloat> %x to <4 x i16>
   ret <4 x i16> %op
@@ -22367,28 +21222,28 @@ define i32 @v_fptosi_bf16_to_i32(bfloat %x) {
 ; GFX8-LABEL: v_fptosi_bf16_to_i32:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX8-NEXT:    v_cvt_i32_f32_e32 v0, v0
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_fptosi_bf16_to_i32:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX9-NEXT:    v_cvt_i32_f32_e32 v0, v0
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fptosi_bf16_to_i32:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX10-NEXT:    v_cvt_i32_f32_e32 v0, v0
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fptosi_bf16_to_i32:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_cvt_i32_f32_e32 v0, v0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
@@ -22663,7 +21518,7 @@ define i64 @v_fptosi_bf16_to_i64(bfloat %x) {
 ; GFX8-LABEL: v_fptosi_bf16_to_i64:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX8-NEXT:    v_trunc_f32_e32 v0, v0
 ; GFX8-NEXT:    s_mov_b32 s4, 0x2f800000
 ; GFX8-NEXT:    v_mul_f32_e64 v1, |v0|, s4
@@ -22682,26 +21537,26 @@ define i64 @v_fptosi_bf16_to_i64(bfloat %x) {
 ; GFX9-LABEL: v_fptosi_bf16_to_i64:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX9-NEXT:    v_trunc_f32_e32 v0, v0
 ; GFX9-NEXT:    s_mov_b32 s4, 0x2f800000
 ; GFX9-NEXT:    v_mul_f32_e64 v1, |v0|, s4
 ; GFX9-NEXT:    v_floor_f32_e32 v1, v1
 ; GFX9-NEXT:    s_mov_b32 s4, 0xcf800000
-; GFX9-NEXT:    v_fma_f32 v2, v1, s4, |v0|
-; GFX9-NEXT:    v_cvt_u32_f32_e32 v2, v2
+; GFX9-NEXT:    v_cvt_u32_f32_e32 v2, v1
+; GFX9-NEXT:    v_fma_f32 v1, v1, s4, |v0|
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; GFX9-NEXT:    v_ashrrev_i32_e32 v3, 31, v0
-; GFX9-NEXT:    v_xor_b32_e32 v0, v2, v3
-; GFX9-NEXT:    v_xor_b32_e32 v1, v1, v3
+; GFX9-NEXT:    v_xor_b32_e32 v2, v2, v3
+; GFX9-NEXT:    v_xor_b32_e32 v0, v1, v3
 ; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, v0, v3
-; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v3, vcc
+; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v2, v3, vcc
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fptosi_bf16_to_i64:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX10-NEXT:    v_trunc_f32_e32 v0, v0
 ; GFX10-NEXT:    v_mul_f32_e64 v1, 0x2f800000, |v0|
 ; GFX10-NEXT:    v_ashrrev_i32_e32 v3, 31, v0
@@ -22718,7 +21573,7 @@ define i64 @v_fptosi_bf16_to_i64(bfloat %x) {
 ; GFX11-LABEL: v_fptosi_bf16_to_i64:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_trunc_f32_e32 v0, v0
 ; GFX11-NEXT:    v_mul_f32_e64 v1, 0x2f800000, |v0|
@@ -23557,21 +22412,21 @@ define bfloat @v_sitofp_i16_to_bf16(i16 %x) {
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
-; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_sitofp_i16_to_bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
-; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_sitofp_i16_to_bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
-; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_sitofp_i16_to_bf16:
@@ -23580,7 +22435,7 @@ define bfloat @v_sitofp_i16_to_bf16(i16 %x) {
 ; GFX11-NEXT:    v_bfe_i32 v0, v0, 0, 16
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_cvt_f32_i32_e32 v0, v0
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = sitofp i16 %x to bfloat
   ret bfloat %op
@@ -23614,8 +22469,8 @@ define <2 x bfloat> @v_sitofp_v2i16_to_v2bf16(<2 x i16> %x) {
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_cvt_f32_i32_sdwa v1, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX8-NEXT:    v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
-; GFX8-NEXT:    s_mov_b32 s4, 0x3020706
-; GFX8-NEXT:    v_perm_b32 v0, v0, v1, s4
+; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX8-NEXT:    v_alignbit_b32 v0, v1, v0, 16
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_sitofp_v2i16_to_v2bf16:
@@ -23623,8 +22478,8 @@ define <2 x bfloat> @v_sitofp_v2i16_to_v2bf16(<2 x i16> %x) {
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_cvt_f32_i32_sdwa v1, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX9-NEXT:    v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
-; GFX9-NEXT:    s_mov_b32 s4, 0x3020706
-; GFX9-NEXT:    v_perm_b32 v0, v0, v1, s4
+; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
+; GFX9-NEXT:    v_perm_b32 v0, v1, v0, s4
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_sitofp_v2i16_to_v2bf16:
@@ -23632,7 +22487,7 @@ define <2 x bfloat> @v_sitofp_v2i16_to_v2bf16(<2 x i16> %x) {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_cvt_f32_i32_sdwa v1, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX10-NEXT:    v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
-; GFX10-NEXT:    v_perm_b32 v0, v0, v1, 0x3020706
+; GFX10-NEXT:    v_perm_b32 v0, v1, v0, 0x7060302
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_sitofp_v2i16_to_v2bf16:
@@ -23644,7 +22499,7 @@ define <2 x bfloat> @v_sitofp_v2i16_to_v2bf16(<2 x i16> %x) {
 ; GFX11-NEXT:    v_cvt_f32_i32_e32 v1, v1
 ; GFX11-NEXT:    v_cvt_f32_i32_e32 v0, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_perm_b32 v0, v0, v1, 0x3020706
+; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x7060302
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = sitofp <2 x i16> %x to <2 x bfloat>
   ret <2 x bfloat> %op
@@ -23682,23 +22537,26 @@ define <3 x bfloat> @v_sitofp_v3i16_to_v3bf16(<3 x i16> %x) {
 ; GFX8-LABEL: v_sitofp_v3i16_to_v3bf16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_cvt_f32_i32_sdwa v1, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
 ; GFX8-NEXT:    v_cvt_f32_i32_sdwa v2, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX8-NEXT:    v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
-; GFX8-NEXT:    s_mov_b32 s4, 0x3020706
-; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX8-NEXT:    v_perm_b32 v0, v0, v2, s4
+; GFX8-NEXT:    v_cvt_f32_i32_sdwa v3, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX8-NEXT:    v_cvt_f32_i32_sdwa v1, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX8-NEXT:    v_alignbit_b32 v0, v2, v0, 16
+; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v3
+; GFX8-NEXT:    v_alignbit_b32 v1, v2, v1, 16
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_sitofp_v3i16_to_v3bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_cvt_f32_i32_sdwa v1, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
 ; GFX9-NEXT:    v_cvt_f32_i32_sdwa v2, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX9-NEXT:    v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
-; GFX9-NEXT:    s_mov_b32 s4, 0x3020706
-; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX9-NEXT:    v_perm_b32 v0, v0, v2, s4
+; GFX9-NEXT:    v_cvt_f32_i32_sdwa v3, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX9-NEXT:    v_cvt_f32_i32_sdwa v1, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
+; GFX9-NEXT:    v_perm_b32 v0, v2, v0, s4
+; GFX9-NEXT:    v_perm_b32 v1, v3, v1, s4
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_sitofp_v3i16_to_v3bf16:
@@ -23706,9 +22564,10 @@ define <3 x bfloat> @v_sitofp_v3i16_to_v3bf16(<3 x i16> %x) {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_cvt_f32_i32_sdwa v2, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX10-NEXT:    v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+; GFX10-NEXT:    v_cvt_f32_i32_sdwa v3, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX10-NEXT:    v_cvt_f32_i32_sdwa v1, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
-; GFX10-NEXT:    v_perm_b32 v0, v0, v2, 0x3020706
-; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX10-NEXT:    v_perm_b32 v0, v2, v0, 0x7060302
+; GFX10-NEXT:    v_perm_b32 v1, v3, v1, 0x7060302
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_sitofp_v3i16_to_v3bf16:
@@ -23716,15 +22575,17 @@ define <3 x bfloat> @v_sitofp_v3i16_to_v3bf16(<3 x i16> %x) {
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_ashrrev_i32_e32 v2, 16, v0
 ; GFX11-NEXT:    v_bfe_i32 v0, v0, 0, 16
+; GFX11-NEXT:    v_ashrrev_i32_e32 v3, 16, v1
 ; GFX11-NEXT:    v_bfe_i32 v1, v1, 0, 16
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX11-NEXT:    v_cvt_f32_i32_e32 v2, v2
 ; GFX11-NEXT:    v_cvt_f32_i32_e32 v0, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_cvt_f32_i32_e32 v3, v3
 ; GFX11-NEXT:    v_cvt_f32_i32_e32 v1, v1
-; GFX11-NEXT:    v_perm_b32 v0, v0, v2, 0x3020706
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_perm_b32 v0, v2, v0, 0x7060302
+; GFX11-NEXT:    v_perm_b32 v1, v3, v1, 0x7060302
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = sitofp <3 x i16> %x to <3 x bfloat>
   ret <3 x bfloat> %op
@@ -23768,59 +22629,55 @@ define <4 x bfloat> @v_sitofp_v4i16_to_v4bf16(<4 x i16> %x) {
 ; GFX8-LABEL: v_sitofp_v4i16_to_v4bf16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_cvt_f32_i32_sdwa v2, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX8-NEXT:    v_cvt_f32_i32_sdwa v3, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX8-NEXT:    v_cvt_f32_i32_sdwa v2, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX8-NEXT:    v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+; GFX8-NEXT:    v_cvt_f32_i32_sdwa v3, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX8-NEXT:    v_cvt_f32_i32_sdwa v1, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
-; GFX8-NEXT:    s_mov_b32 s4, 0x3020706
-; GFX8-NEXT:    v_perm_b32 v0, v0, v3, s4
-; GFX8-NEXT:    v_perm_b32 v1, v1, v2, s4
+; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX8-NEXT:    v_alignbit_b32 v0, v2, v0, 16
+; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v3
+; GFX8-NEXT:    v_alignbit_b32 v1, v2, v1, 16
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_sitofp_v4i16_to_v4bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_cvt_f32_i32_sdwa v2, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-NEXT:    v_cvt_f32_i32_sdwa v1, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
-; GFX9-NEXT:    v_cvt_f32_i32_sdwa v3, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX9-NEXT:    v_cvt_f32_i32_sdwa v2, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX9-NEXT:    v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
-; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
-; GFX9-NEXT:    s_mov_b32 s4, 0x3020706
-; GFX9-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-NEXT:    v_perm_b32 v0, v0, v3, s4
+; GFX9-NEXT:    v_cvt_f32_i32_sdwa v3, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX9-NEXT:    v_cvt_f32_i32_sdwa v1, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
+; GFX9-NEXT:    v_perm_b32 v0, v2, v0, s4
+; GFX9-NEXT:    v_perm_b32 v1, v3, v1, s4
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_sitofp_v4i16_to_v4bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_cvt_f32_i32_sdwa v2, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-NEXT:    v_cvt_f32_i32_sdwa v3, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-NEXT:    v_cvt_f32_i32_sdwa v1, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+; GFX10-NEXT:    v_cvt_f32_i32_sdwa v2, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX10-NEXT:    v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
-; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
-; GFX10-NEXT:    v_perm_b32 v0, v0, v3, 0x3020706
-; GFX10-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT:    v_cvt_f32_i32_sdwa v3, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX10-NEXT:    v_cvt_f32_i32_sdwa v1, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+; GFX10-NEXT:    v_perm_b32 v0, v2, v0, 0x7060302
+; GFX10-NEXT:    v_perm_b32 v1, v3, v1, 0x7060302
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_sitofp_v4i16_to_v4bf16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_bfe_i32 v2, v1, 0, 16
-; GFX11-NEXT:    v_ashrrev_i32_e32 v1, 16, v1
-; GFX11-NEXT:    v_ashrrev_i32_e32 v3, 16, v0
+; GFX11-NEXT:    v_ashrrev_i32_e32 v2, 16, v0
 ; GFX11-NEXT:    v_bfe_i32 v0, v0, 0, 16
+; GFX11-NEXT:    v_ashrrev_i32_e32 v3, 16, v1
+; GFX11-NEXT:    v_bfe_i32 v1, v1, 0, 16
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX11-NEXT:    v_cvt_f32_i32_e32 v2, v2
-; GFX11-NEXT:    v_cvt_f32_i32_e32 v1, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_cvt_f32_i32_e32 v3, v3
 ; GFX11-NEXT:    v_cvt_f32_i32_e32 v0, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX11-NEXT:    v_cvt_f32_i32_e32 v3, v3
+; GFX11-NEXT:    v_cvt_f32_i32_e32 v1, v1
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_perm_b32 v0, v0, v3, 0x3020706
-; GFX11-NEXT:    v_or_b32_e32 v1, v2, v1
+; GFX11-NEXT:    v_perm_b32 v0, v2, v0, 0x7060302
+; GFX11-NEXT:    v_perm_b32 v1, v3, v1, 0x7060302
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = sitofp <4 x i16> %x to <4 x bfloat>
   ret <4 x bfloat> %op
@@ -23845,21 +22702,21 @@ define bfloat @v_sitofp_i32_to_bf16(i32 %x) {
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_cvt_f32_i32_e32 v0, v0
-; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_sitofp_i32_to_bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_cvt_f32_i32_e32 v0, v0
-; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_sitofp_i32_to_bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_cvt_f32_i32_e32 v0, v0
-; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_sitofp_i32_to_bf16:
@@ -23867,7 +22724,7 @@ define bfloat @v_sitofp_i32_to_bf16(i32 %x) {
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_cvt_f32_i32_e32 v0, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = sitofp i32 %x to bfloat
   ret bfloat %op
@@ -23897,34 +22754,34 @@ define <2 x bfloat> @v_sitofp_v2i32_to_v2bf16(<2 x i32> %x) {
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_cvt_f32_i32_e32 v1, v1
 ; GFX8-NEXT:    v_cvt_f32_i32_e32 v0, v0
-; GFX8-NEXT:    s_mov_b32 s4, 0x3020706
-; GFX8-NEXT:    v_perm_b32 v0, v0, v1, s4
+; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX8-NEXT:    v_alignbit_b32 v0, v1, v0, 16
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_sitofp_v2i32_to_v2bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_cvt_f32_i32_e32 v1, v1
 ; GFX9-NEXT:    v_cvt_f32_i32_e32 v0, v0
-; GFX9-NEXT:    s_mov_b32 s4, 0x3020706
-; GFX9-NEXT:    v_perm_b32 v0, v0, v1, s4
+; GFX9-NEXT:    v_cvt_f32_i32_e32 v1, v1
+; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
+; GFX9-NEXT:    v_perm_b32 v0, v1, v0, s4
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_sitofp_v2i32_to_v2bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_cvt_f32_i32_e32 v1, v1
 ; GFX10-NEXT:    v_cvt_f32_i32_e32 v0, v0
-; GFX10-NEXT:    v_perm_b32 v0, v0, v1, 0x3020706
+; GFX10-NEXT:    v_cvt_f32_i32_e32 v1, v1
+; GFX10-NEXT:    v_perm_b32 v0, v1, v0, 0x7060302
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_sitofp_v2i32_to_v2bf16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_cvt_f32_i32_e32 v1, v1
 ; GFX11-NEXT:    v_cvt_f32_i32_e32 v0, v0
+; GFX11-NEXT:    v_cvt_f32_i32_e32 v1, v1
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_perm_b32 v0, v0, v1, 0x3020706
+; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x7060302
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = sitofp <2 x i32> %x to <2 x bfloat>
   ret <2 x bfloat> %op
@@ -23956,45 +22813,34 @@ define <3 x bfloat> @v_sitofp_v3i32_to_v3bf16(<3 x i32> %x) {
 ; GFX8-LABEL: v_sitofp_v3i32_to_v3bf16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_cvt_f32_i32_e32 v2, v2
-; GFX8-NEXT:    v_cvt_f32_i32_e32 v3, v1
+; GFX8-NEXT:    v_cvt_f32_i32_e32 v1, v1
 ; GFX8-NEXT:    v_cvt_f32_i32_e32 v0, v0
-; GFX8-NEXT:    s_mov_b32 s4, 0x3020706
+; GFX8-NEXT:    v_cvt_f32_i32_e32 v2, v2
+; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX8-NEXT:    v_alignbit_b32 v0, v1, v0, 16
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
-; GFX8-NEXT:    v_perm_b32 v0, v0, v3, s4
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_sitofp_v3i32_to_v3bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_cvt_f32_i32_e32 v2, v2
-; GFX9-NEXT:    v_cvt_f32_i32_e32 v3, v1
 ; GFX9-NEXT:    v_cvt_f32_i32_e32 v0, v0
-; GFX9-NEXT:    s_mov_b32 s4, 0x3020706
-; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
-; GFX9-NEXT:    v_perm_b32 v0, v0, v3, s4
+; GFX9-NEXT:    v_cvt_f32_i32_e32 v1, v1
+; GFX9-NEXT:    v_cvt_f32_i32_e32 v2, v2
+; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
+; GFX9-NEXT:    v_perm_b32 v0, v1, v0, s4
+; GFX9-NEXT:    v_alignbit_b32 v1, s4, v2, 16
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_sitofp_v3i32_to_v3bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_cvt_f32_i32_e32 v1, v1
 ; GFX10-NEXT:    v_cvt_f32_i32_e32 v0, v0
+; GFX10-NEXT:    v_cvt_f32_i32_e32 v1, v1
 ; GFX10-NEXT:    v_cvt_f32_i32_e32 v2, v2
-; GFX10-NEXT:    v_perm_b32 v0, v0, v1, 0x3020706
-; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
+; GFX10-NEXT:    v_perm_b32 v0, v1, v0, 0x7060302
+; GFX10-NEXT:    v_alignbit_b32 v1, s4, v2, 16
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: v_sitofp_v3i32_to_v3bf16:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_cvt_f32_i32_e32 v1, v1
-; GFX11-NEXT:    v_cvt_f32_i32_e32 v0, v0
-; GFX11-NEXT:    v_cvt_f32_i32_e32 v2, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_perm_b32 v0, v0, v1, 0x3020706
-; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = sitofp <3 x i32> %x to <3 x bfloat>
   ret <3 x bfloat> %op
 }
@@ -24029,53 +22875,49 @@ define <4 x bfloat> @v_sitofp_v4i32_to_v4bf16(<4 x i32> %x) {
 ; GFX8-LABEL: v_sitofp_v4i32_to_v4bf16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_cvt_f32_i32_e32 v3, v3
 ; GFX8-NEXT:    v_cvt_f32_i32_e32 v1, v1
 ; GFX8-NEXT:    v_cvt_f32_i32_e32 v0, v0
+; GFX8-NEXT:    v_cvt_f32_i32_e32 v3, v3
 ; GFX8-NEXT:    v_cvt_f32_i32_e32 v2, v2
-; GFX8-NEXT:    s_mov_b32 s4, 0x3020706
-; GFX8-NEXT:    v_perm_b32 v0, v0, v1, s4
-; GFX8-NEXT:    v_perm_b32 v1, v2, v3, s4
+; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX8-NEXT:    v_alignbit_b32 v0, v1, v0, 16
+; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v3
+; GFX8-NEXT:    v_alignbit_b32 v1, v1, v2, 16
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_sitofp_v4i32_to_v4bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_cvt_f32_i32_e32 v1, v1
 ; GFX9-NEXT:    v_cvt_f32_i32_e32 v0, v0
-; GFX9-NEXT:    v_cvt_f32_i32_e32 v3, v3
+; GFX9-NEXT:    v_cvt_f32_i32_e32 v1, v1
 ; GFX9-NEXT:    v_cvt_f32_i32_e32 v2, v2
-; GFX9-NEXT:    s_mov_b32 s4, 0x3020706
-; GFX9-NEXT:    v_perm_b32 v0, v0, v1, s4
-; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v3
-; GFX9-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_cvt_f32_i32_e32 v3, v3
+; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
+; GFX9-NEXT:    v_perm_b32 v0, v1, v0, s4
+; GFX9-NEXT:    v_perm_b32 v1, v3, v2, s4
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_sitofp_v4i32_to_v4bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_cvt_f32_i32_e32 v3, v3
-; GFX10-NEXT:    v_cvt_f32_i32_e32 v2, v2
-; GFX10-NEXT:    v_cvt_f32_i32_e32 v1, v1
 ; GFX10-NEXT:    v_cvt_f32_i32_e32 v0, v0
-; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
-; GFX10-NEXT:    v_perm_b32 v0, v0, v1, 0x3020706
-; GFX10-NEXT:    v_or_b32_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT:    v_cvt_f32_i32_e32 v1, v1
+; GFX10-NEXT:    v_cvt_f32_i32_e32 v2, v2
+; GFX10-NEXT:    v_cvt_f32_i32_e32 v3, v3
+; GFX10-NEXT:    v_perm_b32 v0, v1, v0, 0x7060302
+; GFX10-NEXT:    v_perm_b32 v1, v3, v2, 0x7060302
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_sitofp_v4i32_to_v4bf16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_cvt_f32_i32_e32 v0, v0
+; GFX11-NEXT:    v_cvt_f32_i32_e32 v1, v1
 ; GFX11-NEXT:    v_cvt_f32_i32_e32 v2, v2
 ; GFX11-NEXT:    v_cvt_f32_i32_e32 v3, v3
-; GFX11-NEXT:    v_cvt_f32_i32_e32 v1, v1
-; GFX11-NEXT:    v_cvt_f32_i32_e32 v0, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_perm_b32 v0, v0, v1, 0x3020706
-; GFX11-NEXT:    v_or_b32_e32 v1, v2, v3
+; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x7060302
+; GFX11-NEXT:    v_perm_b32 v1, v3, v2, 0x7060302
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = sitofp <4 x i32> %x to <4 x bfloat>
   ret <4 x bfloat> %op
@@ -24133,7 +22975,7 @@ define bfloat @v_sitofp_i64_to_bf16(i64 %x) {
 ; GFX8-NEXT:    v_cvt_f32_i32_e32 v0, v0
 ; GFX8-NEXT:    v_sub_u32_e32 v1, vcc, 32, v2
 ; GFX8-NEXT:    v_ldexp_f32 v0, v0, v1
-; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_sitofp_i64_to_bf16:
@@ -24151,7 +22993,7 @@ define bfloat @v_sitofp_i64_to_bf16(i64 %x) {
 ; GFX9-NEXT:    v_cvt_f32_i32_e32 v0, v0
 ; GFX9-NEXT:    v_sub_u32_e32 v1, 32, v2
 ; GFX9-NEXT:    v_ldexp_f32 v0, v0, v1
-; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_sitofp_i64_to_bf16:
@@ -24169,7 +23011,7 @@ define bfloat @v_sitofp_i64_to_bf16(i64 %x) {
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v1, 32, v2
 ; GFX10-NEXT:    v_cvt_f32_i32_e32 v0, v0
 ; GFX10-NEXT:    v_ldexp_f32 v0, v0, v1
-; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_sitofp_i64_to_bf16:
@@ -24192,7 +23034,7 @@ define bfloat @v_sitofp_i64_to_bf16(i64 %x) {
 ; GFX11-NEXT:    v_cvt_f32_i32_e32 v0, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_ldexp_f32 v0, v0, v1
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = sitofp i64 %x to bfloat
   ret bfloat %op
@@ -24264,72 +23106,72 @@ define <2 x bfloat> @v_sitofp_v2i64_to_v2bf16(<2 x i64> %x) {
 ; GFX8-LABEL: v_sitofp_v2i64_to_v2bf16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_xor_b32_e32 v5, v2, v3
-; GFX8-NEXT:    v_ffbh_i32_e32 v4, v3
+; GFX8-NEXT:    v_xor_b32_e32 v5, v0, v1
+; GFX8-NEXT:    v_ffbh_i32_e32 v4, v1
 ; GFX8-NEXT:    v_ashrrev_i32_e32 v5, 31, v5
 ; GFX8-NEXT:    v_add_u32_e32 v4, vcc, -1, v4
 ; GFX8-NEXT:    v_add_u32_e32 v5, vcc, 32, v5
 ; GFX8-NEXT:    v_min_u32_e32 v4, v4, v5
-; GFX8-NEXT:    v_lshlrev_b64 v[2:3], v4, v[2:3]
-; GFX8-NEXT:    v_xor_b32_e32 v5, v0, v1
-; GFX8-NEXT:    v_min_u32_e32 v2, 1, v2
-; GFX8-NEXT:    v_or_b32_e32 v2, v3, v2
-; GFX8-NEXT:    v_ffbh_i32_e32 v3, v1
-; GFX8-NEXT:    v_ashrrev_i32_e32 v5, 31, v5
-; GFX8-NEXT:    v_add_u32_e32 v3, vcc, -1, v3
-; GFX8-NEXT:    v_add_u32_e32 v5, vcc, 32, v5
-; GFX8-NEXT:    v_min_u32_e32 v3, v3, v5
-; GFX8-NEXT:    v_lshlrev_b64 v[0:1], v3, v[0:1]
-; GFX8-NEXT:    v_cvt_f32_i32_e32 v2, v2
+; GFX8-NEXT:    v_lshlrev_b64 v[0:1], v4, v[0:1]
+; GFX8-NEXT:    v_min_u32_e32 v0, 1, v0
+; GFX8-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX8-NEXT:    v_xor_b32_e32 v1, v2, v3
+; GFX8-NEXT:    v_cvt_f32_i32_e32 v5, v0
+; GFX8-NEXT:    v_ffbh_i32_e32 v0, v3
+; GFX8-NEXT:    v_ashrrev_i32_e32 v1, 31, v1
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, -1, v0
+; GFX8-NEXT:    v_add_u32_e32 v1, vcc, 32, v1
+; GFX8-NEXT:    v_min_u32_e32 v6, v0, v1
+; GFX8-NEXT:    v_lshlrev_b64 v[0:1], v6, v[2:3]
+; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, 32, v4
 ; GFX8-NEXT:    v_min_u32_e32 v0, 1, v0
 ; GFX8-NEXT:    v_or_b32_e32 v0, v1, v0
 ; GFX8-NEXT:    v_cvt_f32_i32_e32 v0, v0
-; GFX8-NEXT:    v_sub_u32_e32 v4, vcc, 32, v4
-; GFX8-NEXT:    v_ldexp_f32 v1, v2, v4
-; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, 32, v3
+; GFX8-NEXT:    v_ldexp_f32 v1, v5, v2
+; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, 32, v6
 ; GFX8-NEXT:    v_ldexp_f32 v0, v0, v2
-; GFX8-NEXT:    s_mov_b32 s4, 0x3020706
-; GFX8-NEXT:    v_perm_b32 v0, v0, v1, s4
+; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX8-NEXT:    v_alignbit_b32 v0, v0, v1, 16
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_sitofp_v2i64_to_v2bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_xor_b32_e32 v5, v2, v3
-; GFX9-NEXT:    v_ffbh_i32_e32 v4, v3
+; GFX9-NEXT:    v_xor_b32_e32 v5, v0, v1
+; GFX9-NEXT:    v_ffbh_i32_e32 v4, v1
 ; GFX9-NEXT:    v_ashrrev_i32_e32 v5, 31, v5
 ; GFX9-NEXT:    v_add_u32_e32 v4, -1, v4
 ; GFX9-NEXT:    v_add_u32_e32 v5, 32, v5
 ; GFX9-NEXT:    v_min_u32_e32 v4, v4, v5
-; GFX9-NEXT:    v_lshlrev_b64 v[2:3], v4, v[2:3]
-; GFX9-NEXT:    v_xor_b32_e32 v5, v0, v1
-; GFX9-NEXT:    v_min_u32_e32 v2, 1, v2
-; GFX9-NEXT:    v_or_b32_e32 v2, v3, v2
-; GFX9-NEXT:    v_ffbh_i32_e32 v3, v1
-; GFX9-NEXT:    v_ashrrev_i32_e32 v5, 31, v5
-; GFX9-NEXT:    v_add_u32_e32 v3, -1, v3
-; GFX9-NEXT:    v_add_u32_e32 v5, 32, v5
-; GFX9-NEXT:    v_min_u32_e32 v3, v3, v5
-; GFX9-NEXT:    v_lshlrev_b64 v[0:1], v3, v[0:1]
-; GFX9-NEXT:    v_cvt_f32_i32_e32 v2, v2
+; GFX9-NEXT:    v_lshlrev_b64 v[0:1], v4, v[0:1]
+; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
+; GFX9-NEXT:    v_min_u32_e32 v0, 1, v0
+; GFX9-NEXT:    v_or_b32_e32 v5, v1, v0
+; GFX9-NEXT:    v_xor_b32_e32 v1, v2, v3
+; GFX9-NEXT:    v_ffbh_i32_e32 v0, v3
+; GFX9-NEXT:    v_ashrrev_i32_e32 v1, 31, v1
+; GFX9-NEXT:    v_add_u32_e32 v0, -1, v0
+; GFX9-NEXT:    v_add_u32_e32 v1, 32, v1
+; GFX9-NEXT:    v_min_u32_e32 v6, v0, v1
+; GFX9-NEXT:    v_lshlrev_b64 v[0:1], v6, v[2:3]
+; GFX9-NEXT:    v_cvt_f32_i32_e32 v2, v5
 ; GFX9-NEXT:    v_min_u32_e32 v0, 1, v0
 ; GFX9-NEXT:    v_or_b32_e32 v0, v1, v0
 ; GFX9-NEXT:    v_cvt_f32_i32_e32 v0, v0
 ; GFX9-NEXT:    v_sub_u32_e32 v1, 32, v4
 ; GFX9-NEXT:    v_ldexp_f32 v1, v2, v1
-; GFX9-NEXT:    v_sub_u32_e32 v2, 32, v3
+; GFX9-NEXT:    v_sub_u32_e32 v2, 32, v6
 ; GFX9-NEXT:    v_ldexp_f32 v0, v0, v2
-; GFX9-NEXT:    s_mov_b32 s4, 0x3020706
 ; GFX9-NEXT:    v_perm_b32 v0, v0, v1, s4
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_sitofp_v2i64_to_v2bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_xor_b32_e32 v4, v2, v3
-; GFX10-NEXT:    v_xor_b32_e32 v5, v0, v1
-; GFX10-NEXT:    v_ffbh_i32_e32 v6, v3
-; GFX10-NEXT:    v_ffbh_i32_e32 v7, v1
+; GFX10-NEXT:    v_xor_b32_e32 v4, v0, v1
+; GFX10-NEXT:    v_xor_b32_e32 v5, v2, v3
+; GFX10-NEXT:    v_ffbh_i32_e32 v6, v1
+; GFX10-NEXT:    v_ffbh_i32_e32 v7, v3
 ; GFX10-NEXT:    v_ashrrev_i32_e32 v4, 31, v4
 ; GFX10-NEXT:    v_ashrrev_i32_e32 v5, 31, v5
 ; GFX10-NEXT:    v_add_nc_u32_e32 v6, -1, v6
@@ -24338,28 +23180,28 @@ define <2 x bfloat> @v_sitofp_v2i64_to_v2bf16(<2 x i64> %x) {
 ; GFX10-NEXT:    v_add_nc_u32_e32 v5, 32, v5
 ; GFX10-NEXT:    v_min_u32_e32 v4, v6, v4
 ; GFX10-NEXT:    v_min_u32_e32 v5, v7, v5
-; GFX10-NEXT:    v_lshlrev_b64 v[2:3], v4, v[2:3]
-; GFX10-NEXT:    v_lshlrev_b64 v[0:1], v5, v[0:1]
-; GFX10-NEXT:    v_min_u32_e32 v2, 1, v2
+; GFX10-NEXT:    v_lshlrev_b64 v[0:1], v4, v[0:1]
+; GFX10-NEXT:    v_lshlrev_b64 v[2:3], v5, v[2:3]
 ; GFX10-NEXT:    v_min_u32_e32 v0, 1, v0
-; GFX10-NEXT:    v_or_b32_e32 v2, v3, v2
+; GFX10-NEXT:    v_min_u32_e32 v2, 1, v2
 ; GFX10-NEXT:    v_or_b32_e32 v0, v1, v0
-; GFX10-NEXT:    v_sub_nc_u32_e32 v1, 32, v4
+; GFX10-NEXT:    v_or_b32_e32 v1, v3, v2
+; GFX10-NEXT:    v_sub_nc_u32_e32 v2, 32, v4
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v3, 32, v5
-; GFX10-NEXT:    v_cvt_f32_i32_e32 v2, v2
 ; GFX10-NEXT:    v_cvt_f32_i32_e32 v0, v0
-; GFX10-NEXT:    v_ldexp_f32 v1, v2, v1
-; GFX10-NEXT:    v_ldexp_f32 v0, v0, v3
-; GFX10-NEXT:    v_perm_b32 v0, v0, v1, 0x3020706
+; GFX10-NEXT:    v_cvt_f32_i32_e32 v1, v1
+; GFX10-NEXT:    v_ldexp_f32 v0, v0, v2
+; GFX10-NEXT:    v_ldexp_f32 v1, v1, v3
+; GFX10-NEXT:    v_perm_b32 v0, v1, v0, 0x7060302
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_sitofp_v2i64_to_v2bf16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_xor_b32_e32 v4, v2, v3
-; GFX11-NEXT:    v_xor_b32_e32 v5, v0, v1
-; GFX11-NEXT:    v_cls_i32_e32 v6, v3
-; GFX11-NEXT:    v_cls_i32_e32 v7, v1
+; GFX11-NEXT:    v_xor_b32_e32 v4, v0, v1
+; GFX11-NEXT:    v_xor_b32_e32 v5, v2, v3
+; GFX11-NEXT:    v_cls_i32_e32 v6, v1
+; GFX11-NEXT:    v_cls_i32_e32 v7, v3
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX11-NEXT:    v_ashrrev_i32_e32 v4, 31, v4
 ; GFX11-NEXT:    v_ashrrev_i32_e32 v5, 31, v5
@@ -24373,24 +23215,24 @@ define <2 x bfloat> @v_sitofp_v2i64_to_v2bf16(<2 x i64> %x) {
 ; GFX11-NEXT:    v_min_u32_e32 v4, v6, v4
 ; GFX11-NEXT:    v_min_u32_e32 v5, v7, v5
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_lshlrev_b64 v[2:3], v4, v[2:3]
-; GFX11-NEXT:    v_lshlrev_b64 v[0:1], v5, v[0:1]
+; GFX11-NEXT:    v_lshlrev_b64 v[0:1], v4, v[0:1]
+; GFX11-NEXT:    v_lshlrev_b64 v[2:3], v5, v[2:3]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_min_u32_e32 v2, 1, v2
 ; GFX11-NEXT:    v_min_u32_e32 v0, 1, v0
+; GFX11-NEXT:    v_min_u32_e32 v2, 1, v2
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_or_b32_e32 v2, v3, v2
 ; GFX11-NEXT:    v_or_b32_e32 v0, v1, v0
-; GFX11-NEXT:    v_sub_nc_u32_e32 v1, 32, v4
+; GFX11-NEXT:    v_or_b32_e32 v1, v3, v2
+; GFX11-NEXT:    v_sub_nc_u32_e32 v2, 32, v4
 ; GFX11-NEXT:    v_sub_nc_u32_e32 v3, 32, v5
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_cvt_f32_i32_e32 v2, v2
 ; GFX11-NEXT:    v_cvt_f32_i32_e32 v0, v0
+; GFX11-NEXT:    v_cvt_f32_i32_e32 v1, v1
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_ldexp_f32 v1, v2, v1
-; GFX11-NEXT:    v_ldexp_f32 v0, v0, v3
+; GFX11-NEXT:    v_ldexp_f32 v0, v0, v2
+; GFX11-NEXT:    v_ldexp_f32 v1, v1, v3
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_perm_b32 v0, v0, v1, 0x3020706
+; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x7060302
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = sitofp <2 x i64> %x to <2 x bfloat>
   ret <2 x bfloat> %op
@@ -24488,188 +23330,133 @@ define <3 x bfloat> @v_sitofp_v3i64_to_v3bf16(<3 x i64> %x) {
 ; GFX8-LABEL: v_sitofp_v3i64_to_v3bf16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_xor_b32_e32 v7, v4, v5
-; GFX8-NEXT:    v_ffbh_i32_e32 v6, v5
+; GFX8-NEXT:    v_xor_b32_e32 v7, v0, v1
+; GFX8-NEXT:    v_ffbh_i32_e32 v6, v1
 ; GFX8-NEXT:    v_ashrrev_i32_e32 v7, 31, v7
 ; GFX8-NEXT:    v_add_u32_e32 v6, vcc, -1, v6
 ; GFX8-NEXT:    v_add_u32_e32 v7, vcc, 32, v7
 ; GFX8-NEXT:    v_min_u32_e32 v6, v6, v7
-; GFX8-NEXT:    v_lshlrev_b64 v[4:5], v6, v[4:5]
-; GFX8-NEXT:    s_mov_b32 s4, 0x3020706
-; GFX8-NEXT:    v_min_u32_e32 v4, 1, v4
-; GFX8-NEXT:    v_or_b32_e32 v4, v5, v4
-; GFX8-NEXT:    v_cvt_f32_i32_e32 v4, v4
-; GFX8-NEXT:    v_sub_u32_e32 v5, vcc, 32, v6
-; GFX8-NEXT:    v_xor_b32_e32 v6, v2, v3
-; GFX8-NEXT:    v_ldexp_f32 v5, v4, v5
-; GFX8-NEXT:    v_ffbh_i32_e32 v4, v3
-; GFX8-NEXT:    v_ashrrev_i32_e32 v6, 31, v6
-; GFX8-NEXT:    v_add_u32_e32 v4, vcc, -1, v4
-; GFX8-NEXT:    v_add_u32_e32 v6, vcc, 32, v6
-; GFX8-NEXT:    v_min_u32_e32 v6, v4, v6
-; GFX8-NEXT:    v_lshlrev_b64 v[3:4], v6, v[2:3]
-; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v5
-; GFX8-NEXT:    v_min_u32_e32 v3, 1, v3
-; GFX8-NEXT:    v_xor_b32_e32 v5, v0, v1
-; GFX8-NEXT:    v_or_b32_e32 v3, v4, v3
-; GFX8-NEXT:    v_ffbh_i32_e32 v4, v1
-; GFX8-NEXT:    v_ashrrev_i32_e32 v5, 31, v5
-; GFX8-NEXT:    v_add_u32_e32 v4, vcc, -1, v4
-; GFX8-NEXT:    v_add_u32_e32 v5, vcc, 32, v5
-; GFX8-NEXT:    v_min_u32_e32 v4, v4, v5
-; GFX8-NEXT:    v_lshlrev_b64 v[0:1], v4, v[0:1]
-; GFX8-NEXT:    v_cvt_f32_i32_e32 v3, v3
+; GFX8-NEXT:    v_lshlrev_b64 v[0:1], v6, v[0:1]
+; GFX8-NEXT:    v_min_u32_e32 v0, 1, v0
+; GFX8-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX8-NEXT:    v_xor_b32_e32 v1, v2, v3
+; GFX8-NEXT:    v_cvt_f32_i32_e32 v7, v0
+; GFX8-NEXT:    v_ffbh_i32_e32 v0, v3
+; GFX8-NEXT:    v_ashrrev_i32_e32 v1, 31, v1
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, -1, v0
+; GFX8-NEXT:    v_add_u32_e32 v1, vcc, 32, v1
+; GFX8-NEXT:    v_min_u32_e32 v8, v0, v1
+; GFX8-NEXT:    v_lshlrev_b64 v[0:1], v8, v[2:3]
+; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, 32, v6
 ; GFX8-NEXT:    v_min_u32_e32 v0, 1, v0
 ; GFX8-NEXT:    v_or_b32_e32 v0, v1, v0
 ; GFX8-NEXT:    v_cvt_f32_i32_e32 v0, v0
-; GFX8-NEXT:    v_sub_u32_e32 v5, vcc, 32, v6
-; GFX8-NEXT:    v_ldexp_f32 v1, v3, v5
-; GFX8-NEXT:    v_sub_u32_e32 v3, vcc, 32, v4
-; GFX8-NEXT:    v_ldexp_f32 v0, v0, v3
-; GFX8-NEXT:    v_perm_b32 v0, v0, v1, s4
-; GFX8-NEXT:    v_mov_b32_e32 v1, v2
+; GFX8-NEXT:    v_sub_u32_e32 v1, vcc, 32, v8
+; GFX8-NEXT:    v_ldexp_f32 v2, v7, v2
+; GFX8-NEXT:    v_ldexp_f32 v3, v0, v1
+; GFX8-NEXT:    v_xor_b32_e32 v1, v4, v5
+; GFX8-NEXT:    v_ffbh_i32_e32 v0, v5
+; GFX8-NEXT:    v_ashrrev_i32_e32 v1, 31, v1
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, -1, v0
+; GFX8-NEXT:    v_add_u32_e32 v1, vcc, 32, v1
+; GFX8-NEXT:    v_min_u32_e32 v6, v0, v1
+; GFX8-NEXT:    v_lshlrev_b64 v[0:1], v6, v[4:5]
+; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX8-NEXT:    v_min_u32_e32 v0, 1, v0
+; GFX8-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX8-NEXT:    v_cvt_f32_i32_e32 v1, v0
+; GFX8-NEXT:    v_alignbit_b32 v0, v3, v2, 16
+; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, 32, v6
+; GFX8-NEXT:    v_ldexp_f32 v1, v1, v2
+; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_sitofp_v3i64_to_v3bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_xor_b32_e32 v7, v4, v5
-; GFX9-NEXT:    v_ffbh_i32_e32 v6, v5
+; GFX9-NEXT:    v_xor_b32_e32 v7, v0, v1
+; GFX9-NEXT:    v_ffbh_i32_e32 v6, v1
 ; GFX9-NEXT:    v_ashrrev_i32_e32 v7, 31, v7
 ; GFX9-NEXT:    v_add_u32_e32 v6, -1, v6
 ; GFX9-NEXT:    v_add_u32_e32 v7, 32, v7
 ; GFX9-NEXT:    v_min_u32_e32 v6, v6, v7
-; GFX9-NEXT:    v_lshlrev_b64 v[4:5], v6, v[4:5]
-; GFX9-NEXT:    s_mov_b32 s4, 0x3020706
-; GFX9-NEXT:    v_min_u32_e32 v4, 1, v4
-; GFX9-NEXT:    v_or_b32_e32 v4, v5, v4
-; GFX9-NEXT:    v_cvt_f32_i32_e32 v4, v4
-; GFX9-NEXT:    v_sub_u32_e32 v5, 32, v6
-; GFX9-NEXT:    v_xor_b32_e32 v6, v2, v3
-; GFX9-NEXT:    v_ashrrev_i32_e32 v6, 31, v6
-; GFX9-NEXT:    v_ldexp_f32 v5, v4, v5
-; GFX9-NEXT:    v_ffbh_i32_e32 v4, v3
-; GFX9-NEXT:    v_add_u32_e32 v4, -1, v4
-; GFX9-NEXT:    v_add_u32_e32 v6, 32, v6
-; GFX9-NEXT:    v_min_u32_e32 v6, v4, v6
-; GFX9-NEXT:    v_lshlrev_b64 v[3:4], v6, v[2:3]
-; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v5
-; GFX9-NEXT:    v_min_u32_e32 v3, 1, v3
-; GFX9-NEXT:    v_xor_b32_e32 v5, v0, v1
-; GFX9-NEXT:    v_or_b32_e32 v3, v4, v3
-; GFX9-NEXT:    v_ffbh_i32_e32 v4, v1
-; GFX9-NEXT:    v_ashrrev_i32_e32 v5, 31, v5
-; GFX9-NEXT:    v_add_u32_e32 v4, -1, v4
-; GFX9-NEXT:    v_add_u32_e32 v5, 32, v5
-; GFX9-NEXT:    v_min_u32_e32 v4, v4, v5
-; GFX9-NEXT:    v_lshlrev_b64 v[0:1], v4, v[0:1]
-; GFX9-NEXT:    v_cvt_f32_i32_e32 v3, v3
+; GFX9-NEXT:    v_lshlrev_b64 v[0:1], v6, v[0:1]
+; GFX9-NEXT:    v_sub_u32_e32 v6, 32, v6
 ; GFX9-NEXT:    v_min_u32_e32 v0, 1, v0
 ; GFX9-NEXT:    v_or_b32_e32 v0, v1, v0
-; GFX9-NEXT:    v_cvt_f32_i32_e32 v0, v0
-; GFX9-NEXT:    v_sub_u32_e32 v1, 32, v6
-; GFX9-NEXT:    v_ldexp_f32 v1, v3, v1
-; GFX9-NEXT:    v_sub_u32_e32 v3, 32, v4
-; GFX9-NEXT:    v_ldexp_f32 v0, v0, v3
-; GFX9-NEXT:    v_perm_b32 v0, v0, v1, s4
-; GFX9-NEXT:    v_mov_b32_e32 v1, v2
+; GFX9-NEXT:    v_xor_b32_e32 v1, v2, v3
+; GFX9-NEXT:    v_cvt_f32_i32_e32 v7, v0
+; GFX9-NEXT:    v_ffbh_i32_e32 v0, v3
+; GFX9-NEXT:    v_ashrrev_i32_e32 v1, 31, v1
+; GFX9-NEXT:    v_add_u32_e32 v0, -1, v0
+; GFX9-NEXT:    v_add_u32_e32 v1, 32, v1
+; GFX9-NEXT:    v_min_u32_e32 v8, v0, v1
+; GFX9-NEXT:    v_lshlrev_b64 v[0:1], v8, v[2:3]
+; GFX9-NEXT:    v_ldexp_f32 v2, v7, v6
+; GFX9-NEXT:    v_min_u32_e32 v0, 1, v0
+; GFX9-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX9-NEXT:    v_xor_b32_e32 v1, v4, v5
+; GFX9-NEXT:    v_cvt_f32_i32_e32 v3, v0
+; GFX9-NEXT:    v_ffbh_i32_e32 v0, v5
+; GFX9-NEXT:    v_ashrrev_i32_e32 v1, 31, v1
+; GFX9-NEXT:    v_add_u32_e32 v0, -1, v0
+; GFX9-NEXT:    v_add_u32_e32 v1, 32, v1
+; GFX9-NEXT:    v_min_u32_e32 v7, v0, v1
+; GFX9-NEXT:    v_lshlrev_b64 v[0:1], v7, v[4:5]
+; GFX9-NEXT:    v_sub_u32_e32 v6, 32, v8
+; GFX9-NEXT:    v_min_u32_e32 v0, 1, v0
+; GFX9-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX9-NEXT:    v_cvt_f32_i32_e32 v1, v0
+; GFX9-NEXT:    v_ldexp_f32 v3, v3, v6
+; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
+; GFX9-NEXT:    v_perm_b32 v0, v3, v2, s4
+; GFX9-NEXT:    v_sub_u32_e32 v2, 32, v7
+; GFX9-NEXT:    v_ldexp_f32 v1, v1, v2
+; GFX9-NEXT:    v_alignbit_b32 v1, s4, v1, 16
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_sitofp_v3i64_to_v3bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_xor_b32_e32 v7, v2, v3
-; GFX10-NEXT:    v_xor_b32_e32 v8, v4, v5
-; GFX10-NEXT:    v_xor_b32_e32 v9, v0, v1
-; GFX10-NEXT:    v_ffbh_i32_e32 v6, v5
+; GFX10-NEXT:    v_xor_b32_e32 v7, v0, v1
+; GFX10-NEXT:    v_xor_b32_e32 v8, v2, v3
+; GFX10-NEXT:    v_xor_b32_e32 v9, v4, v5
+; GFX10-NEXT:    v_ffbh_i32_e32 v6, v1
 ; GFX10-NEXT:    v_ffbh_i32_e32 v10, v3
 ; GFX10-NEXT:    v_ashrrev_i32_e32 v7, 31, v7
-; GFX10-NEXT:    v_ffbh_i32_e32 v11, v1
-; GFX10-NEXT:    v_ashrrev_i32_e32 v9, 31, v9
 ; GFX10-NEXT:    v_ashrrev_i32_e32 v8, 31, v8
+; GFX10-NEXT:    v_ffbh_i32_e32 v11, v5
+; GFX10-NEXT:    v_ashrrev_i32_e32 v9, 31, v9
 ; GFX10-NEXT:    v_add_nc_u32_e32 v6, -1, v6
-; GFX10-NEXT:    v_add_nc_u32_e32 v10, -1, v10
 ; GFX10-NEXT:    v_add_nc_u32_e32 v7, 32, v7
+; GFX10-NEXT:    v_add_nc_u32_e32 v10, -1, v10
+; GFX10-NEXT:    v_add_nc_u32_e32 v8, 32, v8
 ; GFX10-NEXT:    v_add_nc_u32_e32 v11, -1, v11
 ; GFX10-NEXT:    v_add_nc_u32_e32 v9, 32, v9
-; GFX10-NEXT:    v_add_nc_u32_e32 v8, 32, v8
-; GFX10-NEXT:    v_min_u32_e32 v7, v10, v7
-; GFX10-NEXT:    v_min_u32_e32 v9, v11, v9
-; GFX10-NEXT:    v_min_u32_e32 v6, v6, v8
-; GFX10-NEXT:    v_lshlrev_b64 v[2:3], v7, v[2:3]
-; GFX10-NEXT:    v_lshlrev_b64 v[0:1], v9, v[0:1]
-; GFX10-NEXT:    v_lshlrev_b64 v[4:5], v6, v[4:5]
+; GFX10-NEXT:    v_min_u32_e32 v6, v6, v7
+; GFX10-NEXT:    v_min_u32_e32 v7, v10, v8
+; GFX10-NEXT:    v_min_u32_e32 v8, v11, v9
+; GFX10-NEXT:    v_lshlrev_b64 v[0:1], v6, v[0:1]
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v6, 32, v6
-; GFX10-NEXT:    v_min_u32_e32 v2, 1, v2
+; GFX10-NEXT:    v_lshlrev_b64 v[2:3], v7, v[2:3]
+; GFX10-NEXT:    v_lshlrev_b64 v[4:5], v8, v[4:5]
 ; GFX10-NEXT:    v_min_u32_e32 v0, 1, v0
+; GFX10-NEXT:    v_min_u32_e32 v2, 1, v2
 ; GFX10-NEXT:    v_min_u32_e32 v4, 1, v4
-; GFX10-NEXT:    v_or_b32_e32 v2, v3, v2
-; GFX10-NEXT:    v_sub_nc_u32_e32 v3, 32, v7
 ; GFX10-NEXT:    v_or_b32_e32 v0, v1, v0
-; GFX10-NEXT:    v_or_b32_e32 v1, v5, v4
-; GFX10-NEXT:    v_sub_nc_u32_e32 v4, 32, v9
-; GFX10-NEXT:    v_cvt_f32_i32_e32 v2, v2
+; GFX10-NEXT:    v_or_b32_e32 v1, v3, v2
+; GFX10-NEXT:    v_sub_nc_u32_e32 v3, 32, v7
+; GFX10-NEXT:    v_or_b32_e32 v2, v5, v4
+; GFX10-NEXT:    v_sub_nc_u32_e32 v4, 32, v8
 ; GFX10-NEXT:    v_cvt_f32_i32_e32 v0, v0
 ; GFX10-NEXT:    v_cvt_f32_i32_e32 v1, v1
-; GFX10-NEXT:    v_ldexp_f32 v2, v2, v3
-; GFX10-NEXT:    v_ldexp_f32 v0, v0, v4
-; GFX10-NEXT:    v_ldexp_f32 v1, v1, v6
-; GFX10-NEXT:    v_perm_b32 v0, v0, v2, 0x3020706
-; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX10-NEXT:    v_cvt_f32_i32_e32 v2, v2
+; GFX10-NEXT:    v_ldexp_f32 v0, v0, v6
+; GFX10-NEXT:    v_ldexp_f32 v1, v1, v3
+; GFX10-NEXT:    v_ldexp_f32 v2, v2, v4
+; GFX10-NEXT:    v_perm_b32 v0, v1, v0, 0x7060302
+; GFX10-NEXT:    v_alignbit_b32 v1, s4, v2, 16
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: v_sitofp_v3i64_to_v3bf16:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_xor_b32_e32 v7, v2, v3
-; GFX11-NEXT:    v_xor_b32_e32 v8, v4, v5
-; GFX11-NEXT:    v_xor_b32_e32 v9, v0, v1
-; GFX11-NEXT:    v_cls_i32_e32 v6, v5
-; GFX11-NEXT:    v_cls_i32_e32 v10, v3
-; GFX11-NEXT:    v_ashrrev_i32_e32 v7, 31, v7
-; GFX11-NEXT:    v_cls_i32_e32 v11, v1
-; GFX11-NEXT:    v_ashrrev_i32_e32 v9, 31, v9
-; GFX11-NEXT:    v_ashrrev_i32_e32 v8, 31, v8
-; GFX11-NEXT:    v_add_nc_u32_e32 v6, -1, v6
-; GFX11-NEXT:    v_add_nc_u32_e32 v10, -1, v10
-; GFX11-NEXT:    v_add_nc_u32_e32 v7, 32, v7
-; GFX11-NEXT:    v_add_nc_u32_e32 v11, -1, v11
-; GFX11-NEXT:    v_add_nc_u32_e32 v9, 32, v9
-; GFX11-NEXT:    v_add_nc_u32_e32 v8, 32, v8
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_min_u32_e32 v7, v10, v7
-; GFX11-NEXT:    v_min_u32_e32 v9, v11, v9
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_min_u32_e32 v6, v6, v8
-; GFX11-NEXT:    v_lshlrev_b64 v[2:3], v7, v[2:3]
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_lshlrev_b64 v[0:1], v9, v[0:1]
-; GFX11-NEXT:    v_lshlrev_b64 v[4:5], v6, v[4:5]
-; GFX11-NEXT:    v_sub_nc_u32_e32 v6, 32, v6
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_min_u32_e32 v2, 1, v2
-; GFX11-NEXT:    v_min_u32_e32 v0, 1, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_min_u32_e32 v4, 1, v4
-; GFX11-NEXT:    v_or_b32_e32 v2, v3, v2
-; GFX11-NEXT:    v_sub_nc_u32_e32 v3, 32, v7
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_or_b32_e32 v0, v1, v0
-; GFX11-NEXT:    v_or_b32_e32 v1, v5, v4
-; GFX11-NEXT:    v_sub_nc_u32_e32 v4, 32, v9
-; GFX11-NEXT:    v_cvt_f32_i32_e32 v2, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_cvt_f32_i32_e32 v0, v0
-; GFX11-NEXT:    v_cvt_f32_i32_e32 v1, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_ldexp_f32 v2, v2, v3
-; GFX11-NEXT:    v_ldexp_f32 v0, v0, v4
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_ldexp_f32 v1, v1, v6
-; GFX11-NEXT:    v_perm_b32 v0, v0, v2, 0x3020706
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = sitofp <3 x i64> %x to <3 x bfloat>
   ret <3 x bfloat> %op
 }
@@ -24792,234 +23579,236 @@ define <4 x bfloat> @v_sitofp_v4i64_to_v4bf16(<4 x i64> %x) {
 ; GFX8-LABEL: v_sitofp_v4i64_to_v4bf16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_xor_b32_e32 v9, v6, v7
-; GFX8-NEXT:    v_ffbh_i32_e32 v8, v7
-; GFX8-NEXT:    v_ashrrev_i32_e32 v9, 31, v9
-; GFX8-NEXT:    v_add_u32_e32 v8, vcc, -1, v8
-; GFX8-NEXT:    v_add_u32_e32 v9, vcc, 32, v9
-; GFX8-NEXT:    v_min_u32_e32 v8, v8, v9
-; GFX8-NEXT:    v_lshlrev_b64 v[6:7], v8, v[6:7]
-; GFX8-NEXT:    v_xor_b32_e32 v9, v4, v5
-; GFX8-NEXT:    v_min_u32_e32 v6, 1, v6
-; GFX8-NEXT:    v_or_b32_e32 v6, v7, v6
-; GFX8-NEXT:    v_sub_u32_e32 v7, vcc, 32, v8
-; GFX8-NEXT:    v_ffbh_i32_e32 v8, v5
+; GFX8-NEXT:    v_xor_b32_e32 v9, v0, v1
+; GFX8-NEXT:    v_ffbh_i32_e32 v8, v1
 ; GFX8-NEXT:    v_ashrrev_i32_e32 v9, 31, v9
 ; GFX8-NEXT:    v_add_u32_e32 v8, vcc, -1, v8
 ; GFX8-NEXT:    v_add_u32_e32 v9, vcc, 32, v9
 ; GFX8-NEXT:    v_min_u32_e32 v8, v8, v9
-; GFX8-NEXT:    v_cvt_f32_i32_e32 v6, v6
-; GFX8-NEXT:    v_lshlrev_b64 v[4:5], v8, v[4:5]
-; GFX8-NEXT:    s_mov_b32 s4, 0x3020706
-; GFX8-NEXT:    v_min_u32_e32 v4, 1, v4
-; GFX8-NEXT:    v_or_b32_e32 v4, v5, v4
-; GFX8-NEXT:    v_sub_u32_e32 v5, vcc, 32, v8
-; GFX8-NEXT:    v_xor_b32_e32 v8, v2, v3
-; GFX8-NEXT:    v_ldexp_f32 v6, v6, v7
-; GFX8-NEXT:    v_ffbh_i32_e32 v7, v3
-; GFX8-NEXT:    v_ashrrev_i32_e32 v8, 31, v8
-; GFX8-NEXT:    v_cvt_f32_i32_e32 v4, v4
-; GFX8-NEXT:    v_add_u32_e32 v7, vcc, -1, v7
-; GFX8-NEXT:    v_add_u32_e32 v8, vcc, 32, v8
-; GFX8-NEXT:    v_min_u32_e32 v7, v7, v8
-; GFX8-NEXT:    v_lshlrev_b64 v[2:3], v7, v[2:3]
-; GFX8-NEXT:    v_ldexp_f32 v4, v4, v5
-; GFX8-NEXT:    v_min_u32_e32 v2, 1, v2
-; GFX8-NEXT:    v_xor_b32_e32 v5, v0, v1
-; GFX8-NEXT:    v_or_b32_e32 v2, v3, v2
-; GFX8-NEXT:    v_ffbh_i32_e32 v3, v1
-; GFX8-NEXT:    v_ashrrev_i32_e32 v5, 31, v5
-; GFX8-NEXT:    v_add_u32_e32 v3, vcc, -1, v3
-; GFX8-NEXT:    v_add_u32_e32 v5, vcc, 32, v5
-; GFX8-NEXT:    v_min_u32_e32 v3, v3, v5
-; GFX8-NEXT:    v_lshlrev_b64 v[0:1], v3, v[0:1]
-; GFX8-NEXT:    v_cvt_f32_i32_e32 v2, v2
+; GFX8-NEXT:    v_lshlrev_b64 v[0:1], v8, v[0:1]
+; GFX8-NEXT:    v_min_u32_e32 v0, 1, v0
+; GFX8-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX8-NEXT:    v_xor_b32_e32 v1, v2, v3
+; GFX8-NEXT:    v_cvt_f32_i32_e32 v9, v0
+; GFX8-NEXT:    v_ffbh_i32_e32 v0, v3
+; GFX8-NEXT:    v_ashrrev_i32_e32 v1, 31, v1
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, -1, v0
+; GFX8-NEXT:    v_add_u32_e32 v1, vcc, 32, v1
+; GFX8-NEXT:    v_min_u32_e32 v10, v0, v1
+; GFX8-NEXT:    v_lshlrev_b64 v[0:1], v10, v[2:3]
+; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, 32, v8
 ; GFX8-NEXT:    v_min_u32_e32 v0, 1, v0
 ; GFX8-NEXT:    v_or_b32_e32 v0, v1, v0
 ; GFX8-NEXT:    v_cvt_f32_i32_e32 v0, v0
-; GFX8-NEXT:    v_sub_u32_e32 v5, vcc, 32, v7
-; GFX8-NEXT:    v_ldexp_f32 v1, v2, v5
-; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, 32, v3
-; GFX8-NEXT:    v_ldexp_f32 v0, v0, v2
-; GFX8-NEXT:    v_perm_b32 v0, v0, v1, s4
-; GFX8-NEXT:    v_perm_b32 v1, v4, v6, s4
+; GFX8-NEXT:    v_ldexp_f32 v3, v9, v2
+; GFX8-NEXT:    v_sub_u32_e32 v1, vcc, 32, v10
+; GFX8-NEXT:    v_xor_b32_e32 v2, v4, v5
+; GFX8-NEXT:    v_ldexp_f32 v0, v0, v1
+; GFX8-NEXT:    v_ffbh_i32_e32 v1, v5
+; GFX8-NEXT:    v_ashrrev_i32_e32 v2, 31, v2
+; GFX8-NEXT:    v_add_u32_e32 v1, vcc, -1, v1
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 32, v2
+; GFX8-NEXT:    v_min_u32_e32 v8, v1, v2
+; GFX8-NEXT:    v_lshlrev_b64 v[1:2], v8, v[4:5]
+; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX8-NEXT:    v_min_u32_e32 v1, 1, v1
+; GFX8-NEXT:    v_or_b32_e32 v1, v2, v1
+; GFX8-NEXT:    v_xor_b32_e32 v2, v6, v7
+; GFX8-NEXT:    v_alignbit_b32 v0, v0, v3, 16
+; GFX8-NEXT:    v_cvt_f32_i32_e32 v3, v1
+; GFX8-NEXT:    v_ffbh_i32_e32 v1, v7
+; GFX8-NEXT:    v_ashrrev_i32_e32 v2, 31, v2
+; GFX8-NEXT:    v_add_u32_e32 v1, vcc, -1, v1
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 32, v2
+; GFX8-NEXT:    v_min_u32_e32 v4, v1, v2
+; GFX8-NEXT:    v_lshlrev_b64 v[1:2], v4, v[6:7]
+; GFX8-NEXT:    v_sub_u32_e32 v5, vcc, 32, v8
+; GFX8-NEXT:    v_min_u32_e32 v1, 1, v1
+; GFX8-NEXT:    v_or_b32_e32 v1, v2, v1
+; GFX8-NEXT:    v_cvt_f32_i32_e32 v1, v1
+; GFX8-NEXT:    v_ldexp_f32 v2, v3, v5
+; GFX8-NEXT:    v_sub_u32_e32 v3, vcc, 32, v4
+; GFX8-NEXT:    v_ldexp_f32 v1, v1, v3
+; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX8-NEXT:    v_alignbit_b32 v1, v1, v2, 16
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_sitofp_v4i64_to_v4bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_xor_b32_e32 v9, v4, v5
-; GFX9-NEXT:    v_ffbh_i32_e32 v8, v5
+; GFX9-NEXT:    v_xor_b32_e32 v9, v0, v1
+; GFX9-NEXT:    v_ffbh_i32_e32 v8, v1
 ; GFX9-NEXT:    v_ashrrev_i32_e32 v9, 31, v9
 ; GFX9-NEXT:    v_add_u32_e32 v8, -1, v8
 ; GFX9-NEXT:    v_add_u32_e32 v9, 32, v9
 ; GFX9-NEXT:    v_min_u32_e32 v8, v8, v9
-; GFX9-NEXT:    v_lshlrev_b64 v[4:5], v8, v[4:5]
-; GFX9-NEXT:    v_sub_u32_e32 v8, 32, v8
-; GFX9-NEXT:    v_min_u32_e32 v4, 1, v4
-; GFX9-NEXT:    v_or_b32_e32 v4, v5, v4
-; GFX9-NEXT:    v_xor_b32_e32 v5, v6, v7
-; GFX9-NEXT:    v_cvt_f32_i32_e32 v9, v4
-; GFX9-NEXT:    v_ffbh_i32_e32 v4, v7
-; GFX9-NEXT:    v_ashrrev_i32_e32 v5, 31, v5
-; GFX9-NEXT:    v_add_u32_e32 v4, -1, v4
-; GFX9-NEXT:    v_add_u32_e32 v5, 32, v5
-; GFX9-NEXT:    v_min_u32_e32 v10, v4, v5
-; GFX9-NEXT:    v_lshlrev_b64 v[4:5], v10, v[6:7]
-; GFX9-NEXT:    v_ldexp_f32 v6, v9, v8
-; GFX9-NEXT:    v_min_u32_e32 v4, 1, v4
-; GFX9-NEXT:    v_xor_b32_e32 v8, v2, v3
-; GFX9-NEXT:    v_or_b32_e32 v4, v5, v4
-; GFX9-NEXT:    v_ffbh_i32_e32 v7, v3
-; GFX9-NEXT:    v_ashrrev_i32_e32 v8, 31, v8
-; GFX9-NEXT:    v_cvt_f32_i32_e32 v4, v4
-; GFX9-NEXT:    v_add_u32_e32 v7, -1, v7
-; GFX9-NEXT:    v_add_u32_e32 v8, 32, v8
-; GFX9-NEXT:    v_min_u32_e32 v7, v7, v8
-; GFX9-NEXT:    v_lshlrev_b64 v[2:3], v7, v[2:3]
-; GFX9-NEXT:    v_sub_u32_e32 v5, 32, v10
-; GFX9-NEXT:    v_ldexp_f32 v4, v4, v5
-; GFX9-NEXT:    v_min_u32_e32 v2, 1, v2
-; GFX9-NEXT:    v_xor_b32_e32 v5, v0, v1
-; GFX9-NEXT:    v_or_b32_e32 v2, v3, v2
-; GFX9-NEXT:    v_ffbh_i32_e32 v3, v1
-; GFX9-NEXT:    v_ashrrev_i32_e32 v5, 31, v5
-; GFX9-NEXT:    v_add_u32_e32 v3, -1, v3
-; GFX9-NEXT:    v_add_u32_e32 v5, 32, v5
-; GFX9-NEXT:    v_min_u32_e32 v3, v3, v5
-; GFX9-NEXT:    v_lshlrev_b64 v[0:1], v3, v[0:1]
-; GFX9-NEXT:    v_cvt_f32_i32_e32 v2, v2
+; GFX9-NEXT:    v_lshlrev_b64 v[0:1], v8, v[0:1]
+; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
+; GFX9-NEXT:    v_min_u32_e32 v0, 1, v0
+; GFX9-NEXT:    v_or_b32_e32 v9, v1, v0
+; GFX9-NEXT:    v_xor_b32_e32 v1, v2, v3
+; GFX9-NEXT:    v_ffbh_i32_e32 v0, v3
+; GFX9-NEXT:    v_ashrrev_i32_e32 v1, 31, v1
+; GFX9-NEXT:    v_add_u32_e32 v0, -1, v0
+; GFX9-NEXT:    v_add_u32_e32 v1, 32, v1
+; GFX9-NEXT:    v_min_u32_e32 v10, v0, v1
+; GFX9-NEXT:    v_lshlrev_b64 v[0:1], v10, v[2:3]
+; GFX9-NEXT:    v_cvt_f32_i32_e32 v2, v9
 ; GFX9-NEXT:    v_min_u32_e32 v0, 1, v0
 ; GFX9-NEXT:    v_or_b32_e32 v0, v1, v0
 ; GFX9-NEXT:    v_cvt_f32_i32_e32 v0, v0
-; GFX9-NEXT:    v_sub_u32_e32 v1, 32, v7
-; GFX9-NEXT:    v_ldexp_f32 v1, v2, v1
-; GFX9-NEXT:    v_sub_u32_e32 v2, 32, v3
-; GFX9-NEXT:    v_ldexp_f32 v0, v0, v2
-; GFX9-NEXT:    s_mov_b32 s4, 0x3020706
-; GFX9-NEXT:    v_perm_b32 v0, v0, v1, s4
-; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v4
-; GFX9-NEXT:    v_or_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: v_sitofp_v4i64_to_v4bf16:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_xor_b32_e32 v8, v4, v5
-; GFX10-NEXT:    v_ffbh_i32_e32 v9, v5
-; GFX10-NEXT:    v_xor_b32_e32 v11, v6, v7
-; GFX10-NEXT:    v_ffbh_i32_e32 v10, v7
-; GFX10-NEXT:    v_xor_b32_e32 v13, v2, v3
-; GFX10-NEXT:    v_ashrrev_i32_e32 v8, 31, v8
-; GFX10-NEXT:    v_add_nc_u32_e32 v9, -1, v9
-; GFX10-NEXT:    v_xor_b32_e32 v15, v0, v1
-; GFX10-NEXT:    v_ashrrev_i32_e32 v11, 31, v11
-; GFX10-NEXT:    v_ffbh_i32_e32 v12, v3
+; GFX9-NEXT:    v_sub_u32_e32 v1, 32, v8
+; GFX9-NEXT:    v_ldexp_f32 v3, v2, v1
+; GFX9-NEXT:    v_sub_u32_e32 v1, 32, v10
+; GFX9-NEXT:    v_xor_b32_e32 v2, v4, v5
+; GFX9-NEXT:    v_ldexp_f32 v0, v0, v1
+; GFX9-NEXT:    v_ffbh_i32_e32 v1, v5
+; GFX9-NEXT:    v_ashrrev_i32_e32 v2, 31, v2
+; GFX9-NEXT:    v_add_u32_e32 v1, -1, v1
+; GFX9-NEXT:    v_add_u32_e32 v2, 32, v2
+; GFX9-NEXT:    v_min_u32_e32 v8, v1, v2
+; GFX9-NEXT:    v_lshlrev_b64 v[1:2], v8, v[4:5]
+; GFX9-NEXT:    v_perm_b32 v0, v0, v3, s4
+; GFX9-NEXT:    v_min_u32_e32 v1, 1, v1
+; GFX9-NEXT:    v_or_b32_e32 v3, v2, v1
+; GFX9-NEXT:    v_xor_b32_e32 v2, v6, v7
+; GFX9-NEXT:    v_ffbh_i32_e32 v1, v7
+; GFX9-NEXT:    v_ashrrev_i32_e32 v2, 31, v2
+; GFX9-NEXT:    v_add_u32_e32 v1, -1, v1
+; GFX9-NEXT:    v_add_u32_e32 v2, 32, v2
+; GFX9-NEXT:    v_min_u32_e32 v4, v1, v2
+; GFX9-NEXT:    v_lshlrev_b64 v[1:2], v4, v[6:7]
+; GFX9-NEXT:    v_cvt_f32_i32_e32 v3, v3
+; GFX9-NEXT:    v_min_u32_e32 v1, 1, v1
+; GFX9-NEXT:    v_or_b32_e32 v1, v2, v1
+; GFX9-NEXT:    v_cvt_f32_i32_e32 v1, v1
+; GFX9-NEXT:    v_sub_u32_e32 v2, 32, v8
+; GFX9-NEXT:    v_ldexp_f32 v2, v3, v2
+; GFX9-NEXT:    v_sub_u32_e32 v3, 32, v4
+; GFX9-NEXT:    v_ldexp_f32 v1, v1, v3
+; GFX9-NEXT:    v_perm_b32 v1, v1, v2, s4
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_sitofp_v4i64_to_v4bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_xor_b32_e32 v8, v0, v1
+; GFX10-NEXT:    v_ffbh_i32_e32 v9, v1
+; GFX10-NEXT:    v_ffbh_i32_e32 v10, v3
+; GFX10-NEXT:    v_xor_b32_e32 v11, v2, v3
+; GFX10-NEXT:    v_xor_b32_e32 v13, v4, v5
+; GFX10-NEXT:    v_ashrrev_i32_e32 v8, 31, v8
+; GFX10-NEXT:    v_add_nc_u32_e32 v9, -1, v9
+; GFX10-NEXT:    v_xor_b32_e32 v15, v6, v7
+; GFX10-NEXT:    v_ffbh_i32_e32 v12, v5
+; GFX10-NEXT:    v_ffbh_i32_e32 v14, v7
 ; GFX10-NEXT:    v_add_nc_u32_e32 v8, 32, v8
-; GFX10-NEXT:    v_ffbh_i32_e32 v14, v1
-; GFX10-NEXT:    v_add_nc_u32_e32 v11, 32, v11
+; GFX10-NEXT:    v_ashrrev_i32_e32 v11, 31, v11
 ; GFX10-NEXT:    v_add_nc_u32_e32 v12, -1, v12
+; GFX10-NEXT:    v_add_nc_u32_e32 v14, -1, v14
 ; GFX10-NEXT:    v_min_u32_e32 v8, v9, v8
 ; GFX10-NEXT:    v_add_nc_u32_e32 v9, -1, v10
 ; GFX10-NEXT:    v_ashrrev_i32_e32 v10, 31, v13
 ; GFX10-NEXT:    v_ashrrev_i32_e32 v13, 31, v15
-; GFX10-NEXT:    v_add_nc_u32_e32 v14, -1, v14
-; GFX10-NEXT:    v_lshlrev_b64 v[4:5], v8, v[4:5]
-; GFX10-NEXT:    v_min_u32_e32 v9, v9, v11
+; GFX10-NEXT:    v_add_nc_u32_e32 v11, 32, v11
+; GFX10-NEXT:    v_lshlrev_b64 v[0:1], v8, v[0:1]
 ; GFX10-NEXT:    v_add_nc_u32_e32 v10, 32, v10
 ; GFX10-NEXT:    v_add_nc_u32_e32 v13, 32, v13
-; GFX10-NEXT:    v_lshlrev_b64 v[6:7], v9, v[6:7]
+; GFX10-NEXT:    v_min_u32_e32 v9, v9, v11
+; GFX10-NEXT:    v_min_u32_e32 v0, 1, v0
 ; GFX10-NEXT:    v_min_u32_e32 v10, v12, v10
 ; GFX10-NEXT:    v_min_u32_e32 v11, v14, v13
-; GFX10-NEXT:    v_min_u32_e32 v4, 1, v4
-; GFX10-NEXT:    v_lshlrev_b64 v[2:3], v10, v[2:3]
-; GFX10-NEXT:    v_lshlrev_b64 v[0:1], v11, v[0:1]
-; GFX10-NEXT:    v_min_u32_e32 v6, 1, v6
-; GFX10-NEXT:    v_or_b32_e32 v4, v5, v4
+; GFX10-NEXT:    v_lshlrev_b64 v[2:3], v9, v[2:3]
+; GFX10-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX10-NEXT:    v_lshlrev_b64 v[4:5], v10, v[4:5]
+; GFX10-NEXT:    v_lshlrev_b64 v[6:7], v11, v[6:7]
 ; GFX10-NEXT:    v_min_u32_e32 v2, 1, v2
-; GFX10-NEXT:    v_min_u32_e32 v0, 1, v0
-; GFX10-NEXT:    v_or_b32_e32 v5, v7, v6
-; GFX10-NEXT:    v_cvt_f32_i32_e32 v4, v4
-; GFX10-NEXT:    v_sub_nc_u32_e32 v6, 32, v10
+; GFX10-NEXT:    v_cvt_f32_i32_e32 v0, v0
+; GFX10-NEXT:    v_min_u32_e32 v1, 1, v4
+; GFX10-NEXT:    v_min_u32_e32 v4, 1, v6
 ; GFX10-NEXT:    v_or_b32_e32 v2, v3, v2
-; GFX10-NEXT:    v_or_b32_e32 v0, v1, v0
-; GFX10-NEXT:    v_cvt_f32_i32_e32 v1, v5
-; GFX10-NEXT:    v_sub_nc_u32_e32 v5, 32, v9
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v3, 32, v8
+; GFX10-NEXT:    v_sub_nc_u32_e32 v6, 32, v10
+; GFX10-NEXT:    v_or_b32_e32 v1, v5, v1
+; GFX10-NEXT:    v_or_b32_e32 v4, v7, v4
 ; GFX10-NEXT:    v_cvt_f32_i32_e32 v2, v2
-; GFX10-NEXT:    v_cvt_f32_i32_e32 v0, v0
+; GFX10-NEXT:    v_sub_nc_u32_e32 v5, 32, v9
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v7, 32, v11
-; GFX10-NEXT:    v_ldexp_f32 v1, v1, v5
-; GFX10-NEXT:    v_ldexp_f32 v3, v4, v3
-; GFX10-NEXT:    v_ldexp_f32 v2, v2, v6
-; GFX10-NEXT:    v_ldexp_f32 v0, v0, v7
-; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX10-NEXT:    v_perm_b32 v0, v0, v2, 0x3020706
-; GFX10-NEXT:    v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT:    v_cvt_f32_i32_e32 v1, v1
+; GFX10-NEXT:    v_cvt_f32_i32_e32 v4, v4
+; GFX10-NEXT:    v_ldexp_f32 v0, v0, v3
+; GFX10-NEXT:    v_ldexp_f32 v2, v2, v5
+; GFX10-NEXT:    v_ldexp_f32 v1, v1, v6
+; GFX10-NEXT:    v_ldexp_f32 v3, v4, v7
+; GFX10-NEXT:    v_perm_b32 v0, v2, v0, 0x7060302
+; GFX10-NEXT:    v_perm_b32 v1, v3, v1, 0x7060302
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_sitofp_v4i64_to_v4bf16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_xor_b32_e32 v8, v4, v5
-; GFX11-NEXT:    v_cls_i32_e32 v9, v5
-; GFX11-NEXT:    v_xor_b32_e32 v11, v6, v7
-; GFX11-NEXT:    v_cls_i32_e32 v10, v7
-; GFX11-NEXT:    v_xor_b32_e32 v13, v2, v3
+; GFX11-NEXT:    v_xor_b32_e32 v8, v0, v1
+; GFX11-NEXT:    v_cls_i32_e32 v9, v1
+; GFX11-NEXT:    v_cls_i32_e32 v10, v3
+; GFX11-NEXT:    v_xor_b32_e32 v11, v2, v3
+; GFX11-NEXT:    v_xor_b32_e32 v13, v4, v5
 ; GFX11-NEXT:    v_ashrrev_i32_e32 v8, 31, v8
 ; GFX11-NEXT:    v_add_nc_u32_e32 v9, -1, v9
-; GFX11-NEXT:    v_xor_b32_e32 v14, v0, v1
-; GFX11-NEXT:    v_ashrrev_i32_e32 v11, 31, v11
-; GFX11-NEXT:    v_cls_i32_e32 v12, v3
+; GFX11-NEXT:    v_xor_b32_e32 v15, v6, v7
+; GFX11-NEXT:    v_cls_i32_e32 v12, v5
+; GFX11-NEXT:    v_cls_i32_e32 v14, v7
 ; GFX11-NEXT:    v_add_nc_u32_e32 v8, 32, v8
-; GFX11-NEXT:    v_add_nc_u32_e32 v10, -1, v10
-; GFX11-NEXT:    v_ashrrev_i32_e32 v14, 31, v14
-; GFX11-NEXT:    v_add_nc_u32_e32 v11, 32, v11
+; GFX11-NEXT:    v_ashrrev_i32_e32 v11, 31, v11
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX11-NEXT:    v_add_nc_u32_e32 v12, -1, v12
+; GFX11-NEXT:    v_add_nc_u32_e32 v14, -1, v14
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
 ; GFX11-NEXT:    v_min_u32_e32 v8, v9, v8
-; GFX11-NEXT:    v_ashrrev_i32_e32 v9, 31, v13
-; GFX11-NEXT:    v_cls_i32_e32 v13, v1
-; GFX11-NEXT:    v_add_nc_u32_e32 v14, 32, v14
-; GFX11-NEXT:    v_min_u32_e32 v10, v10, v11
-; GFX11-NEXT:    v_lshlrev_b64 v[4:5], v8, v[4:5]
-; GFX11-NEXT:    v_add_nc_u32_e32 v9, 32, v9
-; GFX11-NEXT:    v_add_nc_u32_e32 v13, -1, v13
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_lshlrev_b64 v[6:7], v10, v[6:7]
-; GFX11-NEXT:    v_min_u32_e32 v9, v12, v9
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_min_u32_e32 v11, v13, v14
-; GFX11-NEXT:    v_min_u32_e32 v4, 1, v4
-; GFX11-NEXT:    v_lshlrev_b64 v[2:3], v9, v[2:3]
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_lshlrev_b64 v[0:1], v11, v[0:1]
-; GFX11-NEXT:    v_or_b32_e32 v4, v5, v4
-; GFX11-NEXT:    v_min_u32_e32 v5, 1, v6
-; GFX11-NEXT:    v_sub_nc_u32_e32 v6, 32, v8
-; GFX11-NEXT:    v_min_u32_e32 v2, 1, v2
+; GFX11-NEXT:    v_add_nc_u32_e32 v9, -1, v10
+; GFX11-NEXT:    v_ashrrev_i32_e32 v10, 31, v13
+; GFX11-NEXT:    v_ashrrev_i32_e32 v13, 31, v15
+; GFX11-NEXT:    v_add_nc_u32_e32 v11, 32, v11
+; GFX11-NEXT:    v_lshlrev_b64 v[0:1], v8, v[0:1]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_add_nc_u32_e32 v10, 32, v10
+; GFX11-NEXT:    v_add_nc_u32_e32 v13, 32, v13
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_min_u32_e32 v9, v9, v11
 ; GFX11-NEXT:    v_min_u32_e32 v0, 1, v0
-; GFX11-NEXT:    v_cvt_f32_i32_e32 v4, v4
-; GFX11-NEXT:    v_or_b32_e32 v5, v7, v5
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_or_b32_e32 v2, v3, v2
+; GFX11-NEXT:    v_min_u32_e32 v10, v12, v10
+; GFX11-NEXT:    v_min_u32_e32 v11, v14, v13
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_lshlrev_b64 v[2:3], v9, v[2:3]
 ; GFX11-NEXT:    v_or_b32_e32 v0, v1, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_ldexp_f32 v1, v4, v6
-; GFX11-NEXT:    v_cvt_f32_i32_e32 v3, v5
-; GFX11-NEXT:    v_sub_nc_u32_e32 v4, 32, v10
+; GFX11-NEXT:    v_lshlrev_b64 v[4:5], v10, v[4:5]
+; GFX11-NEXT:    v_lshlrev_b64 v[6:7], v11, v[6:7]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_min_u32_e32 v2, 1, v2
+; GFX11-NEXT:    v_cvt_f32_i32_e32 v0, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_min_u32_e32 v1, 1, v4
+; GFX11-NEXT:    v_min_u32_e32 v4, 1, v6
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-NEXT:    v_or_b32_e32 v2, v3, v2
+; GFX11-NEXT:    v_sub_nc_u32_e32 v3, 32, v8
+; GFX11-NEXT:    v_sub_nc_u32_e32 v6, 32, v10
+; GFX11-NEXT:    v_or_b32_e32 v1, v5, v1
+; GFX11-NEXT:    v_or_b32_e32 v4, v7, v4
 ; GFX11-NEXT:    v_cvt_f32_i32_e32 v2, v2
 ; GFX11-NEXT:    v_sub_nc_u32_e32 v5, 32, v9
-; GFX11-NEXT:    v_cvt_f32_i32_e32 v0, v0
-; GFX11-NEXT:    v_sub_nc_u32_e32 v6, 32, v11
-; GFX11-NEXT:    v_ldexp_f32 v3, v3, v4
-; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    v_sub_nc_u32_e32 v7, 32, v11
+; GFX11-NEXT:    v_cvt_f32_i32_e32 v1, v1
+; GFX11-NEXT:    v_cvt_f32_i32_e32 v4, v4
+; GFX11-NEXT:    v_ldexp_f32 v0, v0, v3
 ; GFX11-NEXT:    v_ldexp_f32 v2, v2, v5
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_ldexp_f32 v0, v0, v6
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_perm_b32 v0, v0, v2, 0x3020706
-; GFX11-NEXT:    v_or_b32_e32 v1, v1, v3
+; GFX11-NEXT:    v_ldexp_f32 v1, v1, v6
+; GFX11-NEXT:    v_ldexp_f32 v3, v4, v7
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_perm_b32 v0, v2, v0, 0x7060302
+; GFX11-NEXT:    v_perm_b32 v1, v3, v1, 0x7060302
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = sitofp <4 x i64> %x to <4 x bfloat>
   ret <4 x bfloat> %op
@@ -25046,21 +23835,21 @@ define bfloat @v_uitofp_i16_to_bf16(i16 %x) {
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
-; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_uitofp_i16_to_bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
-; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_uitofp_i16_to_bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
-; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_uitofp_i16_to_bf16:
@@ -25069,7 +23858,7 @@ define bfloat @v_uitofp_i16_to_bf16(i16 %x) {
 ; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_cvt_f32_u32_e32 v0, v0
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = uitofp i16 %x to bfloat
   ret bfloat %op
@@ -25103,37 +23892,37 @@ define <2 x bfloat> @v_uitofp_v2i16_to_v2bf16(<2 x i16> %x) {
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_cvt_f32_u32_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX8-NEXT:    v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
-; GFX8-NEXT:    s_mov_b32 s4, 0x3020706
-; GFX8-NEXT:    v_perm_b32 v0, v0, v1, s4
+; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX8-NEXT:    v_alignbit_b32 v0, v1, v0, 16
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_uitofp_v2i16_to_v2bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_cvt_f32_u32_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-NEXT:    v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
-; GFX9-NEXT:    s_mov_b32 s4, 0x3020706
+; GFX9-NEXT:    v_cvt_f32_u32_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+; GFX9-NEXT:    v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
 ; GFX9-NEXT:    v_perm_b32 v0, v0, v1, s4
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_uitofp_v2i16_to_v2bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_cvt_f32_u32_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-NEXT:    v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
-; GFX10-NEXT:    v_perm_b32 v0, v0, v1, 0x3020706
+; GFX10-NEXT:    v_cvt_f32_u32_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+; GFX10-NEXT:    v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX10-NEXT:    v_perm_b32 v0, v0, v1, 0x7060302
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_uitofp_v2i16_to_v2bf16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff, v0
+; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_cvt_f32_u32_e32 v1, v1
 ; GFX11-NEXT:    v_cvt_f32_u32_e32 v0, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_perm_b32 v0, v0, v1, 0x3020706
+; GFX11-NEXT:    v_perm_b32 v0, v0, v1, 0x7060302
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = uitofp <2 x i16> %x to <2 x bfloat>
   ret <2 x bfloat> %op
@@ -25171,49 +23960,55 @@ define <3 x bfloat> @v_uitofp_v3i16_to_v3bf16(<3 x i16> %x) {
 ; GFX8-LABEL: v_uitofp_v3i16_to_v3bf16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_cvt_f32_u32_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
 ; GFX8-NEXT:    v_cvt_f32_u32_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX8-NEXT:    v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
-; GFX8-NEXT:    s_mov_b32 s4, 0x3020706
-; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX8-NEXT:    v_perm_b32 v0, v0, v2, s4
+; GFX8-NEXT:    v_cvt_f32_u32_sdwa v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX8-NEXT:    v_cvt_f32_u32_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX8-NEXT:    v_alignbit_b32 v0, v2, v0, 16
+; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v3
+; GFX8-NEXT:    v_alignbit_b32 v1, v2, v1, 16
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_uitofp_v3i16_to_v3bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_cvt_f32_u32_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
-; GFX9-NEXT:    v_cvt_f32_u32_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-NEXT:    v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
-; GFX9-NEXT:    s_mov_b32 s4, 0x3020706
-; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX9-NEXT:    v_cvt_f32_u32_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+; GFX9-NEXT:    v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX9-NEXT:    v_cvt_f32_u32_sdwa v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+; GFX9-NEXT:    v_cvt_f32_u32_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
 ; GFX9-NEXT:    v_perm_b32 v0, v0, v2, s4
+; GFX9-NEXT:    v_perm_b32 v1, v1, v3, s4
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_uitofp_v3i16_to_v3bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_cvt_f32_u32_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-NEXT:    v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
-; GFX10-NEXT:    v_cvt_f32_u32_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
-; GFX10-NEXT:    v_perm_b32 v0, v0, v2, 0x3020706
-; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX10-NEXT:    v_cvt_f32_u32_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+; GFX10-NEXT:    v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX10-NEXT:    v_cvt_f32_u32_sdwa v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+; GFX10-NEXT:    v_cvt_f32_u32_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX10-NEXT:    v_perm_b32 v0, v0, v2, 0x7060302
+; GFX10-NEXT:    v_perm_b32 v1, v1, v3, 0x7060302
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_uitofp_v3i16_to_v3bf16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff, v0
+; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff, v1
+; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX11-NEXT:    v_cvt_f32_u32_e32 v2, v2
 ; GFX11-NEXT:    v_cvt_f32_u32_e32 v0, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_cvt_f32_u32_e32 v3, v3
 ; GFX11-NEXT:    v_cvt_f32_u32_e32 v1, v1
-; GFX11-NEXT:    v_perm_b32 v0, v0, v2, 0x3020706
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_perm_b32 v0, v0, v2, 0x7060302
+; GFX11-NEXT:    v_perm_b32 v1, v1, v3, 0x7060302
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = uitofp <3 x i16> %x to <3 x bfloat>
   ret <3 x bfloat> %op
@@ -25257,59 +24052,55 @@ define <4 x bfloat> @v_uitofp_v4i16_to_v4bf16(<4 x i16> %x) {
 ; GFX8-LABEL: v_uitofp_v4i16_to_v4bf16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_cvt_f32_u32_sdwa v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX8-NEXT:    v_cvt_f32_u32_sdwa v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX8-NEXT:    v_cvt_f32_u32_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX8-NEXT:    v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+; GFX8-NEXT:    v_cvt_f32_u32_sdwa v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX8-NEXT:    v_cvt_f32_u32_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
-; GFX8-NEXT:    s_mov_b32 s4, 0x3020706
-; GFX8-NEXT:    v_perm_b32 v0, v0, v3, s4
-; GFX8-NEXT:    v_perm_b32 v1, v1, v2, s4
+; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX8-NEXT:    v_alignbit_b32 v0, v2, v0, 16
+; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v3
+; GFX8-NEXT:    v_alignbit_b32 v1, v2, v1, 16
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_uitofp_v4i16_to_v4bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_cvt_f32_u32_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-NEXT:    v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
-; GFX9-NEXT:    v_cvt_f32_u32_sdwa v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-NEXT:    v_cvt_f32_u32_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
-; GFX9-NEXT:    s_mov_b32 s4, 0x3020706
+; GFX9-NEXT:    v_cvt_f32_u32_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+; GFX9-NEXT:    v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX9-NEXT:    v_cvt_f32_u32_sdwa v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+; GFX9-NEXT:    v_cvt_f32_u32_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
 ; GFX9-NEXT:    v_perm_b32 v0, v0, v2, s4
-; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff0000, v3
-; GFX9-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_perm_b32 v1, v1, v3, s4
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_uitofp_v4i16_to_v4bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_cvt_f32_u32_sdwa v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-NEXT:    v_cvt_f32_u32_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
-; GFX10-NEXT:    v_cvt_f32_u32_sdwa v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-NEXT:    v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
-; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
-; GFX10-NEXT:    v_perm_b32 v0, v0, v3, 0x3020706
-; GFX10-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT:    v_cvt_f32_u32_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+; GFX10-NEXT:    v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX10-NEXT:    v_cvt_f32_u32_sdwa v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+; GFX10-NEXT:    v_cvt_f32_u32_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX10-NEXT:    v_perm_b32 v0, v0, v2, 0x7060302
+; GFX10-NEXT:    v_perm_b32 v1, v1, v3, 0x7060302
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_uitofp_v4i16_to_v4bf16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff, v1
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff, v0
+; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff, v1
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX11-NEXT:    v_cvt_f32_u32_e32 v2, v2
-; GFX11-NEXT:    v_cvt_f32_u32_e32 v1, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_cvt_f32_u32_e32 v3, v3
 ; GFX11-NEXT:    v_cvt_f32_u32_e32 v0, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX11-NEXT:    v_cvt_f32_u32_e32 v3, v3
+; GFX11-NEXT:    v_cvt_f32_u32_e32 v1, v1
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_perm_b32 v0, v0, v3, 0x3020706
-; GFX11-NEXT:    v_or_b32_e32 v1, v2, v1
+; GFX11-NEXT:    v_perm_b32 v0, v0, v2, 0x7060302
+; GFX11-NEXT:    v_perm_b32 v1, v1, v3, 0x7060302
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = uitofp <4 x i16> %x to <4 x bfloat>
   ret <4 x bfloat> %op
@@ -25334,21 +24125,21 @@ define bfloat @v_uitofp_i32_to_bf16(i32 %x) {
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_cvt_f32_u32_e32 v0, v0
-; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_uitofp_i32_to_bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, v0
-; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_uitofp_i32_to_bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_cvt_f32_u32_e32 v0, v0
-; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_uitofp_i32_to_bf16:
@@ -25356,7 +24147,7 @@ define bfloat @v_uitofp_i32_to_bf16(i32 %x) {
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_cvt_f32_u32_e32 v0, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = uitofp i32 %x to bfloat
   ret bfloat %op
@@ -25386,34 +24177,34 @@ define <2 x bfloat> @v_uitofp_v2i32_to_v2bf16(<2 x i32> %x) {
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_cvt_f32_u32_e32 v1, v1
 ; GFX8-NEXT:    v_cvt_f32_u32_e32 v0, v0
-; GFX8-NEXT:    s_mov_b32 s4, 0x3020706
-; GFX8-NEXT:    v_perm_b32 v0, v0, v1, s4
+; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX8-NEXT:    v_alignbit_b32 v0, v1, v0, 16
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_uitofp_v2i32_to_v2bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, v1
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, v0
-; GFX9-NEXT:    s_mov_b32 s4, 0x3020706
-; GFX9-NEXT:    v_perm_b32 v0, v0, v1, s4
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, v1
+; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
+; GFX9-NEXT:    v_perm_b32 v0, v1, v0, s4
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_uitofp_v2i32_to_v2bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_cvt_f32_u32_e32 v1, v1
 ; GFX10-NEXT:    v_cvt_f32_u32_e32 v0, v0
-; GFX10-NEXT:    v_perm_b32 v0, v0, v1, 0x3020706
+; GFX10-NEXT:    v_cvt_f32_u32_e32 v1, v1
+; GFX10-NEXT:    v_perm_b32 v0, v1, v0, 0x7060302
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_uitofp_v2i32_to_v2bf16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_cvt_f32_u32_e32 v1, v1
 ; GFX11-NEXT:    v_cvt_f32_u32_e32 v0, v0
+; GFX11-NEXT:    v_cvt_f32_u32_e32 v1, v1
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_perm_b32 v0, v0, v1, 0x3020706
+; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x7060302
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = uitofp <2 x i32> %x to <2 x bfloat>
   ret <2 x bfloat> %op
@@ -25445,45 +24236,34 @@ define <3 x bfloat> @v_uitofp_v3i32_to_v3bf16(<3 x i32> %x) {
 ; GFX8-LABEL: v_uitofp_v3i32_to_v3bf16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_cvt_f32_u32_e32 v2, v2
-; GFX8-NEXT:    v_cvt_f32_u32_e32 v3, v1
+; GFX8-NEXT:    v_cvt_f32_u32_e32 v1, v1
 ; GFX8-NEXT:    v_cvt_f32_u32_e32 v0, v0
-; GFX8-NEXT:    s_mov_b32 s4, 0x3020706
+; GFX8-NEXT:    v_cvt_f32_u32_e32 v2, v2
+; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX8-NEXT:    v_alignbit_b32 v0, v1, v0, 16
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
-; GFX8-NEXT:    v_perm_b32 v0, v0, v3, s4
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_uitofp_v3i32_to_v3bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v2, v2
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v3, v1
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, v0
-; GFX9-NEXT:    s_mov_b32 s4, 0x3020706
-; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
-; GFX9-NEXT:    v_perm_b32 v0, v0, v3, s4
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, v1
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v2, v2
+; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
+; GFX9-NEXT:    v_perm_b32 v0, v1, v0, s4
+; GFX9-NEXT:    v_alignbit_b32 v1, s4, v2, 16
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_uitofp_v3i32_to_v3bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_cvt_f32_u32_e32 v1, v1
 ; GFX10-NEXT:    v_cvt_f32_u32_e32 v0, v0
+; GFX10-NEXT:    v_cvt_f32_u32_e32 v1, v1
 ; GFX10-NEXT:    v_cvt_f32_u32_e32 v2, v2
-; GFX10-NEXT:    v_perm_b32 v0, v0, v1, 0x3020706
-; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
+; GFX10-NEXT:    v_perm_b32 v0, v1, v0, 0x7060302
+; GFX10-NEXT:    v_alignbit_b32 v1, s4, v2, 16
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: v_uitofp_v3i32_to_v3bf16:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_cvt_f32_u32_e32 v1, v1
-; GFX11-NEXT:    v_cvt_f32_u32_e32 v0, v0
-; GFX11-NEXT:    v_cvt_f32_u32_e32 v2, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_perm_b32 v0, v0, v1, 0x3020706
-; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = uitofp <3 x i32> %x to <3 x bfloat>
   ret <3 x bfloat> %op
 }
@@ -25518,53 +24298,49 @@ define <4 x bfloat> @v_uitofp_v4i32_to_v4bf16(<4 x i32> %x) {
 ; GFX8-LABEL: v_uitofp_v4i32_to_v4bf16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_cvt_f32_u32_e32 v3, v3
 ; GFX8-NEXT:    v_cvt_f32_u32_e32 v1, v1
 ; GFX8-NEXT:    v_cvt_f32_u32_e32 v0, v0
+; GFX8-NEXT:    v_cvt_f32_u32_e32 v3, v3
 ; GFX8-NEXT:    v_cvt_f32_u32_e32 v2, v2
-; GFX8-NEXT:    s_mov_b32 s4, 0x3020706
-; GFX8-NEXT:    v_perm_b32 v0, v0, v1, s4
-; GFX8-NEXT:    v_perm_b32 v1, v2, v3, s4
+; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX8-NEXT:    v_alignbit_b32 v0, v1, v0, 16
+; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v3
+; GFX8-NEXT:    v_alignbit_b32 v1, v1, v2, 16
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_uitofp_v4i32_to_v4bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, v1
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, v0
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v3, v3
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, v1
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v2, v2
-; GFX9-NEXT:    s_mov_b32 s4, 0x3020706
-; GFX9-NEXT:    v_perm_b32 v0, v0, v1, s4
-; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v3
-; GFX9-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v3, v3
+; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
+; GFX9-NEXT:    v_perm_b32 v0, v1, v0, s4
+; GFX9-NEXT:    v_perm_b32 v1, v3, v2, s4
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_uitofp_v4i32_to_v4bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_cvt_f32_u32_e32 v3, v3
-; GFX10-NEXT:    v_cvt_f32_u32_e32 v2, v2
-; GFX10-NEXT:    v_cvt_f32_u32_e32 v1, v1
 ; GFX10-NEXT:    v_cvt_f32_u32_e32 v0, v0
-; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
-; GFX10-NEXT:    v_perm_b32 v0, v0, v1, 0x3020706
-; GFX10-NEXT:    v_or_b32_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT:    v_cvt_f32_u32_e32 v1, v1
+; GFX10-NEXT:    v_cvt_f32_u32_e32 v2, v2
+; GFX10-NEXT:    v_cvt_f32_u32_e32 v3, v3
+; GFX10-NEXT:    v_perm_b32 v0, v1, v0, 0x7060302
+; GFX10-NEXT:    v_perm_b32 v1, v3, v2, 0x7060302
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_uitofp_v4i32_to_v4bf16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_cvt_f32_u32_e32 v0, v0
+; GFX11-NEXT:    v_cvt_f32_u32_e32 v1, v1
 ; GFX11-NEXT:    v_cvt_f32_u32_e32 v2, v2
 ; GFX11-NEXT:    v_cvt_f32_u32_e32 v3, v3
-; GFX11-NEXT:    v_cvt_f32_u32_e32 v1, v1
-; GFX11-NEXT:    v_cvt_f32_u32_e32 v0, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_perm_b32 v0, v0, v1, 0x3020706
-; GFX11-NEXT:    v_or_b32_e32 v1, v2, v3
+; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x7060302
+; GFX11-NEXT:    v_perm_b32 v1, v3, v2, 0x7060302
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = uitofp <4 x i32> %x to <4 x bfloat>
   ret <4 x bfloat> %op
@@ -25610,7 +24386,7 @@ define bfloat @v_uitofp_i64_to_bf16(i64 %x) {
 ; GFX8-NEXT:    v_cvt_f32_u32_e32 v0, v0
 ; GFX8-NEXT:    v_sub_u32_e32 v1, vcc, 32, v2
 ; GFX8-NEXT:    v_ldexp_f32 v0, v0, v1
-; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_uitofp_i64_to_bf16:
@@ -25624,7 +24400,7 @@ define bfloat @v_uitofp_i64_to_bf16(i64 %x) {
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, v0
 ; GFX9-NEXT:    v_sub_u32_e32 v1, 32, v2
 ; GFX9-NEXT:    v_ldexp_f32 v0, v0, v1
-; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_uitofp_i64_to_bf16:
@@ -25638,7 +24414,7 @@ define bfloat @v_uitofp_i64_to_bf16(i64 %x) {
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v1, 32, v2
 ; GFX10-NEXT:    v_cvt_f32_u32_e32 v0, v0
 ; GFX10-NEXT:    v_ldexp_f32 v0, v0, v1
-; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_uitofp_i64_to_bf16:
@@ -25656,7 +24432,7 @@ define bfloat @v_uitofp_i64_to_bf16(i64 %x) {
 ; GFX11-NEXT:    v_cvt_f32_u32_e32 v0, v0
 ; GFX11-NEXT:    v_ldexp_f32 v0, v0, v1
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = uitofp i64 %x to bfloat
   ret bfloat %op
@@ -25712,45 +24488,45 @@ define <2 x bfloat> @v_uitofp_v2i64_to_v2bf16(<2 x i64> %x) {
 ; GFX8-LABEL: v_uitofp_v2i64_to_v2bf16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_ffbh_u32_e32 v4, v3
+; GFX8-NEXT:    v_ffbh_u32_e32 v4, v1
 ; GFX8-NEXT:    v_min_u32_e32 v4, 32, v4
-; GFX8-NEXT:    v_lshlrev_b64 v[2:3], v4, v[2:3]
-; GFX8-NEXT:    v_sub_u32_e32 v4, vcc, 32, v4
-; GFX8-NEXT:    v_min_u32_e32 v2, 1, v2
-; GFX8-NEXT:    v_or_b32_e32 v2, v3, v2
-; GFX8-NEXT:    v_ffbh_u32_e32 v3, v1
-; GFX8-NEXT:    v_min_u32_e32 v3, 32, v3
-; GFX8-NEXT:    v_lshlrev_b64 v[0:1], v3, v[0:1]
-; GFX8-NEXT:    v_cvt_f32_u32_e32 v2, v2
+; GFX8-NEXT:    v_lshlrev_b64 v[0:1], v4, v[0:1]
+; GFX8-NEXT:    v_min_u32_e32 v0, 1, v0
+; GFX8-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX8-NEXT:    v_cvt_f32_u32_e32 v5, v0
+; GFX8-NEXT:    v_ffbh_u32_e32 v0, v3
+; GFX8-NEXT:    v_min_u32_e32 v6, 32, v0
+; GFX8-NEXT:    v_lshlrev_b64 v[0:1], v6, v[2:3]
+; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, 32, v4
 ; GFX8-NEXT:    v_min_u32_e32 v0, 1, v0
 ; GFX8-NEXT:    v_or_b32_e32 v0, v1, v0
 ; GFX8-NEXT:    v_cvt_f32_u32_e32 v0, v0
-; GFX8-NEXT:    v_ldexp_f32 v1, v2, v4
-; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, 32, v3
+; GFX8-NEXT:    v_ldexp_f32 v1, v5, v2
+; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, 32, v6
 ; GFX8-NEXT:    v_ldexp_f32 v0, v0, v2
-; GFX8-NEXT:    s_mov_b32 s4, 0x3020706
-; GFX8-NEXT:    v_perm_b32 v0, v0, v1, s4
+; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX8-NEXT:    v_alignbit_b32 v0, v0, v1, 16
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_uitofp_v2i64_to_v2bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_ffbh_u32_e32 v4, v3
+; GFX9-NEXT:    v_ffbh_u32_e32 v4, v1
 ; GFX9-NEXT:    v_min_u32_e32 v4, 32, v4
-; GFX9-NEXT:    v_lshlrev_b64 v[2:3], v4, v[2:3]
-; GFX9-NEXT:    s_mov_b32 s4, 0x3020706
-; GFX9-NEXT:    v_min_u32_e32 v2, 1, v2
-; GFX9-NEXT:    v_or_b32_e32 v2, v3, v2
-; GFX9-NEXT:    v_ffbh_u32_e32 v3, v1
-; GFX9-NEXT:    v_min_u32_e32 v3, 32, v3
-; GFX9-NEXT:    v_lshlrev_b64 v[0:1], v3, v[0:1]
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v2, v2
+; GFX9-NEXT:    v_lshlrev_b64 v[0:1], v4, v[0:1]
+; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
+; GFX9-NEXT:    v_min_u32_e32 v0, 1, v0
+; GFX9-NEXT:    v_or_b32_e32 v5, v1, v0
+; GFX9-NEXT:    v_ffbh_u32_e32 v0, v3
+; GFX9-NEXT:    v_min_u32_e32 v6, 32, v0
+; GFX9-NEXT:    v_lshlrev_b64 v[0:1], v6, v[2:3]
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v2, v5
 ; GFX9-NEXT:    v_min_u32_e32 v0, 1, v0
 ; GFX9-NEXT:    v_or_b32_e32 v0, v1, v0
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, v0
 ; GFX9-NEXT:    v_sub_u32_e32 v1, 32, v4
 ; GFX9-NEXT:    v_ldexp_f32 v1, v2, v1
-; GFX9-NEXT:    v_sub_u32_e32 v2, 32, v3
+; GFX9-NEXT:    v_sub_u32_e32 v2, 32, v6
 ; GFX9-NEXT:    v_ldexp_f32 v0, v0, v2
 ; GFX9-NEXT:    v_perm_b32 v0, v0, v1, s4
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
@@ -25758,52 +24534,52 @@ define <2 x bfloat> @v_uitofp_v2i64_to_v2bf16(<2 x i64> %x) {
 ; GFX10-LABEL: v_uitofp_v2i64_to_v2bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_ffbh_u32_e32 v4, v3
-; GFX10-NEXT:    v_ffbh_u32_e32 v5, v1
+; GFX10-NEXT:    v_ffbh_u32_e32 v4, v1
+; GFX10-NEXT:    v_ffbh_u32_e32 v5, v3
 ; GFX10-NEXT:    v_min_u32_e32 v4, 32, v4
 ; GFX10-NEXT:    v_min_u32_e32 v5, 32, v5
-; GFX10-NEXT:    v_lshlrev_b64 v[2:3], v4, v[2:3]
-; GFX10-NEXT:    v_lshlrev_b64 v[0:1], v5, v[0:1]
-; GFX10-NEXT:    v_min_u32_e32 v2, 1, v2
+; GFX10-NEXT:    v_lshlrev_b64 v[0:1], v4, v[0:1]
+; GFX10-NEXT:    v_lshlrev_b64 v[2:3], v5, v[2:3]
 ; GFX10-NEXT:    v_min_u32_e32 v0, 1, v0
-; GFX10-NEXT:    v_or_b32_e32 v2, v3, v2
+; GFX10-NEXT:    v_min_u32_e32 v2, 1, v2
 ; GFX10-NEXT:    v_or_b32_e32 v0, v1, v0
-; GFX10-NEXT:    v_sub_nc_u32_e32 v1, 32, v4
+; GFX10-NEXT:    v_or_b32_e32 v1, v3, v2
+; GFX10-NEXT:    v_sub_nc_u32_e32 v2, 32, v4
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v3, 32, v5
-; GFX10-NEXT:    v_cvt_f32_u32_e32 v2, v2
 ; GFX10-NEXT:    v_cvt_f32_u32_e32 v0, v0
-; GFX10-NEXT:    v_ldexp_f32 v1, v2, v1
-; GFX10-NEXT:    v_ldexp_f32 v0, v0, v3
-; GFX10-NEXT:    v_perm_b32 v0, v0, v1, 0x3020706
+; GFX10-NEXT:    v_cvt_f32_u32_e32 v1, v1
+; GFX10-NEXT:    v_ldexp_f32 v0, v0, v2
+; GFX10-NEXT:    v_ldexp_f32 v1, v1, v3
+; GFX10-NEXT:    v_perm_b32 v0, v1, v0, 0x7060302
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_uitofp_v2i64_to_v2bf16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_clz_i32_u32_e32 v4, v3
-; GFX11-NEXT:    v_clz_i32_u32_e32 v5, v1
+; GFX11-NEXT:    v_clz_i32_u32_e32 v4, v1
+; GFX11-NEXT:    v_clz_i32_u32_e32 v5, v3
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_min_u32_e32 v4, 32, v4
 ; GFX11-NEXT:    v_min_u32_e32 v5, 32, v5
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_lshlrev_b64 v[2:3], v4, v[2:3]
-; GFX11-NEXT:    v_lshlrev_b64 v[0:1], v5, v[0:1]
+; GFX11-NEXT:    v_lshlrev_b64 v[0:1], v4, v[0:1]
+; GFX11-NEXT:    v_lshlrev_b64 v[2:3], v5, v[2:3]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_min_u32_e32 v2, 1, v2
 ; GFX11-NEXT:    v_min_u32_e32 v0, 1, v0
+; GFX11-NEXT:    v_min_u32_e32 v2, 1, v2
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_or_b32_e32 v2, v3, v2
 ; GFX11-NEXT:    v_or_b32_e32 v0, v1, v0
-; GFX11-NEXT:    v_sub_nc_u32_e32 v1, 32, v4
+; GFX11-NEXT:    v_or_b32_e32 v1, v3, v2
+; GFX11-NEXT:    v_sub_nc_u32_e32 v2, 32, v4
 ; GFX11-NEXT:    v_sub_nc_u32_e32 v3, 32, v5
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_cvt_f32_u32_e32 v2, v2
 ; GFX11-NEXT:    v_cvt_f32_u32_e32 v0, v0
+; GFX11-NEXT:    v_cvt_f32_u32_e32 v1, v1
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_ldexp_f32 v1, v2, v1
-; GFX11-NEXT:    v_ldexp_f32 v0, v0, v3
+; GFX11-NEXT:    v_ldexp_f32 v0, v0, v2
+; GFX11-NEXT:    v_ldexp_f32 v1, v1, v3
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_perm_b32 v0, v0, v1, 0x3020706
+; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x7060302
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = uitofp <2 x i64> %x to <2 x bfloat>
   ret <2 x bfloat> %op
@@ -25877,139 +24653,97 @@ define <3 x bfloat> @v_uitofp_v3i64_to_v3bf16(<3 x i64> %x) {
 ; GFX8-LABEL: v_uitofp_v3i64_to_v3bf16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_ffbh_u32_e32 v6, v5
+; GFX8-NEXT:    v_ffbh_u32_e32 v6, v1
 ; GFX8-NEXT:    v_min_u32_e32 v6, 32, v6
-; GFX8-NEXT:    v_lshlrev_b64 v[4:5], v6, v[4:5]
-; GFX8-NEXT:    s_mov_b32 s4, 0x3020706
-; GFX8-NEXT:    v_min_u32_e32 v4, 1, v4
-; GFX8-NEXT:    v_or_b32_e32 v4, v5, v4
-; GFX8-NEXT:    v_cvt_f32_u32_e32 v4, v4
-; GFX8-NEXT:    v_sub_u32_e32 v5, vcc, 32, v6
-; GFX8-NEXT:    v_ldexp_f32 v5, v4, v5
-; GFX8-NEXT:    v_ffbh_u32_e32 v4, v3
-; GFX8-NEXT:    v_min_u32_e32 v6, 32, v4
-; GFX8-NEXT:    v_lshlrev_b64 v[3:4], v6, v[2:3]
-; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v5
-; GFX8-NEXT:    v_min_u32_e32 v3, 1, v3
-; GFX8-NEXT:    v_or_b32_e32 v3, v4, v3
-; GFX8-NEXT:    v_ffbh_u32_e32 v4, v1
-; GFX8-NEXT:    v_min_u32_e32 v4, 32, v4
-; GFX8-NEXT:    v_lshlrev_b64 v[0:1], v4, v[0:1]
-; GFX8-NEXT:    v_cvt_f32_u32_e32 v3, v3
+; GFX8-NEXT:    v_lshlrev_b64 v[0:1], v6, v[0:1]
+; GFX8-NEXT:    v_min_u32_e32 v0, 1, v0
+; GFX8-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX8-NEXT:    v_cvt_f32_u32_e32 v7, v0
+; GFX8-NEXT:    v_ffbh_u32_e32 v0, v3
+; GFX8-NEXT:    v_min_u32_e32 v8, 32, v0
+; GFX8-NEXT:    v_lshlrev_b64 v[0:1], v8, v[2:3]
+; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, 32, v6
 ; GFX8-NEXT:    v_min_u32_e32 v0, 1, v0
 ; GFX8-NEXT:    v_or_b32_e32 v0, v1, v0
 ; GFX8-NEXT:    v_cvt_f32_u32_e32 v0, v0
-; GFX8-NEXT:    v_sub_u32_e32 v5, vcc, 32, v6
-; GFX8-NEXT:    v_ldexp_f32 v1, v3, v5
-; GFX8-NEXT:    v_sub_u32_e32 v3, vcc, 32, v4
-; GFX8-NEXT:    v_ldexp_f32 v0, v0, v3
-; GFX8-NEXT:    v_perm_b32 v0, v0, v1, s4
-; GFX8-NEXT:    v_mov_b32_e32 v1, v2
+; GFX8-NEXT:    v_sub_u32_e32 v1, vcc, 32, v8
+; GFX8-NEXT:    v_ldexp_f32 v2, v7, v2
+; GFX8-NEXT:    v_ldexp_f32 v3, v0, v1
+; GFX8-NEXT:    v_ffbh_u32_e32 v0, v5
+; GFX8-NEXT:    v_min_u32_e32 v6, 32, v0
+; GFX8-NEXT:    v_lshlrev_b64 v[0:1], v6, v[4:5]
+; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX8-NEXT:    v_min_u32_e32 v0, 1, v0
+; GFX8-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX8-NEXT:    v_cvt_f32_u32_e32 v1, v0
+; GFX8-NEXT:    v_alignbit_b32 v0, v3, v2, 16
+; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, 32, v6
+; GFX8-NEXT:    v_ldexp_f32 v1, v1, v2
+; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_uitofp_v3i64_to_v3bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_ffbh_u32_e32 v6, v5
+; GFX9-NEXT:    v_ffbh_u32_e32 v6, v1
 ; GFX9-NEXT:    v_min_u32_e32 v6, 32, v6
-; GFX9-NEXT:    v_lshlrev_b64 v[4:5], v6, v[4:5]
-; GFX9-NEXT:    s_mov_b32 s4, 0x3020706
-; GFX9-NEXT:    v_min_u32_e32 v4, 1, v4
-; GFX9-NEXT:    v_or_b32_e32 v4, v5, v4
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v4, v4
-; GFX9-NEXT:    v_sub_u32_e32 v5, 32, v6
-; GFX9-NEXT:    v_ldexp_f32 v5, v4, v5
-; GFX9-NEXT:    v_ffbh_u32_e32 v4, v3
-; GFX9-NEXT:    v_min_u32_e32 v6, 32, v4
-; GFX9-NEXT:    v_lshlrev_b64 v[3:4], v6, v[2:3]
-; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v5
-; GFX9-NEXT:    v_min_u32_e32 v3, 1, v3
-; GFX9-NEXT:    v_or_b32_e32 v3, v4, v3
-; GFX9-NEXT:    v_ffbh_u32_e32 v4, v1
-; GFX9-NEXT:    v_min_u32_e32 v4, 32, v4
-; GFX9-NEXT:    v_lshlrev_b64 v[0:1], v4, v[0:1]
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v3, v3
+; GFX9-NEXT:    v_lshlrev_b64 v[0:1], v6, v[0:1]
+; GFX9-NEXT:    v_sub_u32_e32 v6, 32, v6
 ; GFX9-NEXT:    v_min_u32_e32 v0, 1, v0
 ; GFX9-NEXT:    v_or_b32_e32 v0, v1, v0
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, v0
-; GFX9-NEXT:    v_sub_u32_e32 v1, 32, v6
-; GFX9-NEXT:    v_ldexp_f32 v1, v3, v1
-; GFX9-NEXT:    v_sub_u32_e32 v3, 32, v4
-; GFX9-NEXT:    v_ldexp_f32 v0, v0, v3
-; GFX9-NEXT:    v_perm_b32 v0, v0, v1, s4
-; GFX9-NEXT:    v_mov_b32_e32 v1, v2
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v7, v0
+; GFX9-NEXT:    v_ffbh_u32_e32 v0, v3
+; GFX9-NEXT:    v_min_u32_e32 v8, 32, v0
+; GFX9-NEXT:    v_lshlrev_b64 v[0:1], v8, v[2:3]
+; GFX9-NEXT:    v_ldexp_f32 v2, v7, v6
+; GFX9-NEXT:    v_min_u32_e32 v0, 1, v0
+; GFX9-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v3, v0
+; GFX9-NEXT:    v_ffbh_u32_e32 v0, v5
+; GFX9-NEXT:    v_min_u32_e32 v7, 32, v0
+; GFX9-NEXT:    v_lshlrev_b64 v[0:1], v7, v[4:5]
+; GFX9-NEXT:    v_sub_u32_e32 v6, 32, v8
+; GFX9-NEXT:    v_min_u32_e32 v0, 1, v0
+; GFX9-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, v0
+; GFX9-NEXT:    v_ldexp_f32 v3, v3, v6
+; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
+; GFX9-NEXT:    v_perm_b32 v0, v3, v2, s4
+; GFX9-NEXT:    v_sub_u32_e32 v2, 32, v7
+; GFX9-NEXT:    v_ldexp_f32 v1, v1, v2
+; GFX9-NEXT:    v_alignbit_b32 v1, s4, v1, 16
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_uitofp_v3i64_to_v3bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_ffbh_u32_e32 v6, v3
-; GFX10-NEXT:    v_ffbh_u32_e32 v7, v1
+; GFX10-NEXT:    v_ffbh_u32_e32 v6, v1
+; GFX10-NEXT:    v_ffbh_u32_e32 v7, v3
 ; GFX10-NEXT:    v_ffbh_u32_e32 v8, v5
 ; GFX10-NEXT:    v_min_u32_e32 v6, 32, v6
 ; GFX10-NEXT:    v_min_u32_e32 v7, 32, v7
 ; GFX10-NEXT:    v_min_u32_e32 v8, 32, v8
-; GFX10-NEXT:    v_lshlrev_b64 v[2:3], v6, v[2:3]
-; GFX10-NEXT:    v_lshlrev_b64 v[0:1], v7, v[0:1]
+; GFX10-NEXT:    v_lshlrev_b64 v[0:1], v6, v[0:1]
+; GFX10-NEXT:    v_lshlrev_b64 v[2:3], v7, v[2:3]
 ; GFX10-NEXT:    v_lshlrev_b64 v[4:5], v8, v[4:5]
-; GFX10-NEXT:    v_sub_nc_u32_e32 v8, 32, v8
-; GFX10-NEXT:    v_min_u32_e32 v2, 1, v2
+; GFX10-NEXT:    v_sub_nc_u32_e32 v6, 32, v6
 ; GFX10-NEXT:    v_min_u32_e32 v0, 1, v0
+; GFX10-NEXT:    v_min_u32_e32 v2, 1, v2
 ; GFX10-NEXT:    v_min_u32_e32 v4, 1, v4
-; GFX10-NEXT:    v_or_b32_e32 v2, v3, v2
 ; GFX10-NEXT:    v_or_b32_e32 v0, v1, v0
-; GFX10-NEXT:    v_or_b32_e32 v1, v5, v4
-; GFX10-NEXT:    v_sub_nc_u32_e32 v3, 32, v6
-; GFX10-NEXT:    v_sub_nc_u32_e32 v4, 32, v7
-; GFX10-NEXT:    v_cvt_f32_u32_e32 v2, v2
+; GFX10-NEXT:    v_or_b32_e32 v1, v3, v2
+; GFX10-NEXT:    v_or_b32_e32 v2, v5, v4
+; GFX10-NEXT:    v_sub_nc_u32_e32 v3, 32, v7
+; GFX10-NEXT:    v_sub_nc_u32_e32 v4, 32, v8
 ; GFX10-NEXT:    v_cvt_f32_u32_e32 v0, v0
 ; GFX10-NEXT:    v_cvt_f32_u32_e32 v1, v1
-; GFX10-NEXT:    v_ldexp_f32 v2, v2, v3
-; GFX10-NEXT:    v_ldexp_f32 v0, v0, v4
-; GFX10-NEXT:    v_ldexp_f32 v1, v1, v8
-; GFX10-NEXT:    v_perm_b32 v0, v0, v2, 0x3020706
-; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX10-NEXT:    v_cvt_f32_u32_e32 v2, v2
+; GFX10-NEXT:    v_ldexp_f32 v0, v0, v6
+; GFX10-NEXT:    v_ldexp_f32 v1, v1, v3
+; GFX10-NEXT:    v_ldexp_f32 v2, v2, v4
+; GFX10-NEXT:    v_perm_b32 v0, v1, v0, 0x7060302
+; GFX10-NEXT:    v_alignbit_b32 v1, s4, v2, 16
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: v_uitofp_v3i64_to_v3bf16:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_clz_i32_u32_e32 v6, v3
-; GFX11-NEXT:    v_clz_i32_u32_e32 v7, v1
-; GFX11-NEXT:    v_clz_i32_u32_e32 v8, v5
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_min_u32_e32 v6, 32, v6
-; GFX11-NEXT:    v_min_u32_e32 v7, 32, v7
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_min_u32_e32 v8, 32, v8
-; GFX11-NEXT:    v_lshlrev_b64 v[2:3], v6, v[2:3]
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_lshlrev_b64 v[0:1], v7, v[0:1]
-; GFX11-NEXT:    v_lshlrev_b64 v[4:5], v8, v[4:5]
-; GFX11-NEXT:    v_sub_nc_u32_e32 v8, 32, v8
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_min_u32_e32 v2, 1, v2
-; GFX11-NEXT:    v_min_u32_e32 v0, 1, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_min_u32_e32 v4, 1, v4
-; GFX11-NEXT:    v_or_b32_e32 v2, v3, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_or_b32_e32 v0, v1, v0
-; GFX11-NEXT:    v_or_b32_e32 v1, v5, v4
-; GFX11-NEXT:    v_sub_nc_u32_e32 v3, 32, v6
-; GFX11-NEXT:    v_sub_nc_u32_e32 v4, 32, v7
-; GFX11-NEXT:    v_cvt_f32_u32_e32 v2, v2
-; GFX11-NEXT:    v_cvt_f32_u32_e32 v0, v0
-; GFX11-NEXT:    v_cvt_f32_u32_e32 v1, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_ldexp_f32 v2, v2, v3
-; GFX11-NEXT:    v_ldexp_f32 v0, v0, v4
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_ldexp_f32 v1, v1, v8
-; GFX11-NEXT:    v_perm_b32 v0, v0, v2, 0x3020706
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = uitofp <3 x i64> %x to <3 x bfloat>
   ret <3 x bfloat> %op
 }
@@ -26100,131 +24834,130 @@ define <4 x bfloat> @v_uitofp_v4i64_to_v4bf16(<4 x i64> %x) {
 ; GFX8-LABEL: v_uitofp_v4i64_to_v4bf16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_ffbh_u32_e32 v8, v7
+; GFX8-NEXT:    v_ffbh_u32_e32 v8, v1
 ; GFX8-NEXT:    v_min_u32_e32 v8, 32, v8
-; GFX8-NEXT:    v_lshlrev_b64 v[6:7], v8, v[6:7]
-; GFX8-NEXT:    s_mov_b32 s4, 0x3020706
-; GFX8-NEXT:    v_min_u32_e32 v6, 1, v6
-; GFX8-NEXT:    v_or_b32_e32 v6, v7, v6
-; GFX8-NEXT:    v_cvt_f32_u32_e32 v6, v6
-; GFX8-NEXT:    v_sub_u32_e32 v7, vcc, 32, v8
-; GFX8-NEXT:    v_ffbh_u32_e32 v8, v5
-; GFX8-NEXT:    v_ldexp_f32 v6, v6, v7
-; GFX8-NEXT:    v_ffbh_u32_e32 v7, v3
-; GFX8-NEXT:    v_min_u32_e32 v7, 32, v7
-; GFX8-NEXT:    v_lshlrev_b64 v[2:3], v7, v[2:3]
-; GFX8-NEXT:    v_min_u32_e32 v8, 32, v8
-; GFX8-NEXT:    v_min_u32_e32 v2, 1, v2
-; GFX8-NEXT:    v_lshlrev_b64 v[4:5], v8, v[4:5]
-; GFX8-NEXT:    v_or_b32_e32 v2, v3, v2
-; GFX8-NEXT:    v_ffbh_u32_e32 v3, v1
-; GFX8-NEXT:    v_min_u32_e32 v3, 32, v3
-; GFX8-NEXT:    v_min_u32_e32 v4, 1, v4
-; GFX8-NEXT:    v_lshlrev_b64 v[0:1], v3, v[0:1]
-; GFX8-NEXT:    v_or_b32_e32 v4, v5, v4
-; GFX8-NEXT:    v_cvt_f32_u32_e32 v4, v4
+; GFX8-NEXT:    v_lshlrev_b64 v[0:1], v8, v[0:1]
+; GFX8-NEXT:    v_min_u32_e32 v0, 1, v0
+; GFX8-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX8-NEXT:    v_cvt_f32_u32_e32 v9, v0
+; GFX8-NEXT:    v_ffbh_u32_e32 v0, v3
+; GFX8-NEXT:    v_min_u32_e32 v10, 32, v0
+; GFX8-NEXT:    v_lshlrev_b64 v[0:1], v10, v[2:3]
+; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, 32, v8
 ; GFX8-NEXT:    v_min_u32_e32 v0, 1, v0
-; GFX8-NEXT:    v_cvt_f32_u32_e32 v2, v2
 ; GFX8-NEXT:    v_or_b32_e32 v0, v1, v0
 ; GFX8-NEXT:    v_cvt_f32_u32_e32 v0, v0
+; GFX8-NEXT:    v_sub_u32_e32 v1, vcc, 32, v10
+; GFX8-NEXT:    v_ldexp_f32 v3, v9, v2
+; GFX8-NEXT:    v_ldexp_f32 v0, v0, v1
+; GFX8-NEXT:    v_ffbh_u32_e32 v1, v5
+; GFX8-NEXT:    v_min_u32_e32 v8, 32, v1
+; GFX8-NEXT:    v_lshlrev_b64 v[1:2], v8, v[4:5]
+; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX8-NEXT:    v_min_u32_e32 v1, 1, v1
+; GFX8-NEXT:    v_or_b32_e32 v1, v2, v1
+; GFX8-NEXT:    v_alignbit_b32 v0, v0, v3, 16
+; GFX8-NEXT:    v_cvt_f32_u32_e32 v3, v1
+; GFX8-NEXT:    v_ffbh_u32_e32 v1, v7
+; GFX8-NEXT:    v_min_u32_e32 v4, 32, v1
+; GFX8-NEXT:    v_lshlrev_b64 v[1:2], v4, v[6:7]
 ; GFX8-NEXT:    v_sub_u32_e32 v5, vcc, 32, v8
-; GFX8-NEXT:    v_ldexp_f32 v4, v4, v5
-; GFX8-NEXT:    v_sub_u32_e32 v5, vcc, 32, v7
-; GFX8-NEXT:    v_ldexp_f32 v1, v2, v5
-; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, 32, v3
-; GFX8-NEXT:    v_ldexp_f32 v0, v0, v2
-; GFX8-NEXT:    v_perm_b32 v0, v0, v1, s4
-; GFX8-NEXT:    v_perm_b32 v1, v4, v6, s4
+; GFX8-NEXT:    v_min_u32_e32 v1, 1, v1
+; GFX8-NEXT:    v_or_b32_e32 v1, v2, v1
+; GFX8-NEXT:    v_cvt_f32_u32_e32 v1, v1
+; GFX8-NEXT:    v_ldexp_f32 v2, v3, v5
+; GFX8-NEXT:    v_sub_u32_e32 v3, vcc, 32, v4
+; GFX8-NEXT:    v_ldexp_f32 v1, v1, v3
+; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX8-NEXT:    v_alignbit_b32 v1, v1, v2, 16
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_uitofp_v4i64_to_v4bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_ffbh_u32_e32 v8, v5
+; GFX9-NEXT:    v_ffbh_u32_e32 v8, v1
 ; GFX9-NEXT:    v_min_u32_e32 v8, 32, v8
-; GFX9-NEXT:    v_lshlrev_b64 v[4:5], v8, v[4:5]
-; GFX9-NEXT:    v_sub_u32_e32 v8, 32, v8
-; GFX9-NEXT:    v_min_u32_e32 v4, 1, v4
-; GFX9-NEXT:    v_or_b32_e32 v4, v5, v4
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v9, v4
-; GFX9-NEXT:    v_ffbh_u32_e32 v4, v7
-; GFX9-NEXT:    v_min_u32_e32 v10, 32, v4
-; GFX9-NEXT:    v_lshlrev_b64 v[4:5], v10, v[6:7]
-; GFX9-NEXT:    v_ffbh_u32_e32 v7, v3
-; GFX9-NEXT:    v_min_u32_e32 v7, 32, v7
-; GFX9-NEXT:    v_lshlrev_b64 v[2:3], v7, v[2:3]
-; GFX9-NEXT:    v_min_u32_e32 v4, 1, v4
-; GFX9-NEXT:    v_min_u32_e32 v2, 1, v2
-; GFX9-NEXT:    v_or_b32_e32 v2, v3, v2
-; GFX9-NEXT:    v_ffbh_u32_e32 v3, v1
-; GFX9-NEXT:    v_min_u32_e32 v3, 32, v3
-; GFX9-NEXT:    v_lshlrev_b64 v[0:1], v3, v[0:1]
-; GFX9-NEXT:    v_or_b32_e32 v4, v5, v4
+; GFX9-NEXT:    v_lshlrev_b64 v[0:1], v8, v[0:1]
+; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
+; GFX9-NEXT:    v_min_u32_e32 v0, 1, v0
+; GFX9-NEXT:    v_or_b32_e32 v9, v1, v0
+; GFX9-NEXT:    v_ffbh_u32_e32 v0, v3
+; GFX9-NEXT:    v_min_u32_e32 v10, 32, v0
+; GFX9-NEXT:    v_lshlrev_b64 v[0:1], v10, v[2:3]
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v2, v9
 ; GFX9-NEXT:    v_min_u32_e32 v0, 1, v0
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v2, v2
 ; GFX9-NEXT:    v_or_b32_e32 v0, v1, v0
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v4, v4
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, v0
-; GFX9-NEXT:    v_sub_u32_e32 v1, 32, v7
-; GFX9-NEXT:    v_sub_u32_e32 v5, 32, v10
-; GFX9-NEXT:    v_ldexp_f32 v1, v2, v1
-; GFX9-NEXT:    v_sub_u32_e32 v2, 32, v3
-; GFX9-NEXT:    v_ldexp_f32 v4, v4, v5
-; GFX9-NEXT:    v_ldexp_f32 v0, v0, v2
-; GFX9-NEXT:    s_mov_b32 s4, 0x3020706
-; GFX9-NEXT:    v_ldexp_f32 v6, v9, v8
-; GFX9-NEXT:    v_perm_b32 v0, v0, v1, s4
-; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v4
-; GFX9-NEXT:    v_or_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_sub_u32_e32 v1, 32, v8
+; GFX9-NEXT:    v_ldexp_f32 v3, v2, v1
+; GFX9-NEXT:    v_sub_u32_e32 v1, 32, v10
+; GFX9-NEXT:    v_ldexp_f32 v0, v0, v1
+; GFX9-NEXT:    v_ffbh_u32_e32 v1, v5
+; GFX9-NEXT:    v_min_u32_e32 v8, 32, v1
+; GFX9-NEXT:    v_lshlrev_b64 v[1:2], v8, v[4:5]
+; GFX9-NEXT:    v_perm_b32 v0, v0, v3, s4
+; GFX9-NEXT:    v_min_u32_e32 v1, 1, v1
+; GFX9-NEXT:    v_or_b32_e32 v3, v2, v1
+; GFX9-NEXT:    v_ffbh_u32_e32 v1, v7
+; GFX9-NEXT:    v_min_u32_e32 v4, 32, v1
+; GFX9-NEXT:    v_lshlrev_b64 v[1:2], v4, v[6:7]
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v3, v3
+; GFX9-NEXT:    v_min_u32_e32 v1, 1, v1
+; GFX9-NEXT:    v_or_b32_e32 v1, v2, v1
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, v1
+; GFX9-NEXT:    v_sub_u32_e32 v2, 32, v8
+; GFX9-NEXT:    v_ldexp_f32 v2, v3, v2
+; GFX9-NEXT:    v_sub_u32_e32 v3, 32, v4
+; GFX9-NEXT:    v_ldexp_f32 v1, v1, v3
+; GFX9-NEXT:    v_perm_b32 v1, v1, v2, s4
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_uitofp_v4i64_to_v4bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_ffbh_u32_e32 v8, v5
-; GFX10-NEXT:    v_ffbh_u32_e32 v9, v7
-; GFX10-NEXT:    v_ffbh_u32_e32 v10, v3
-; GFX10-NEXT:    v_ffbh_u32_e32 v11, v1
+; GFX10-NEXT:    v_ffbh_u32_e32 v8, v1
+; GFX10-NEXT:    v_ffbh_u32_e32 v9, v3
+; GFX10-NEXT:    v_ffbh_u32_e32 v10, v5
+; GFX10-NEXT:    v_ffbh_u32_e32 v11, v7
 ; GFX10-NEXT:    v_min_u32_e32 v8, 32, v8
 ; GFX10-NEXT:    v_min_u32_e32 v9, 32, v9
 ; GFX10-NEXT:    v_min_u32_e32 v10, 32, v10
 ; GFX10-NEXT:    v_min_u32_e32 v11, 32, v11
-; GFX10-NEXT:    v_lshlrev_b64 v[4:5], v8, v[4:5]
-; GFX10-NEXT:    v_lshlrev_b64 v[6:7], v9, v[6:7]
-; GFX10-NEXT:    v_lshlrev_b64 v[2:3], v10, v[2:3]
-; GFX10-NEXT:    v_lshlrev_b64 v[0:1], v11, v[0:1]
+; GFX10-NEXT:    v_lshlrev_b64 v[0:1], v8, v[0:1]
+; GFX10-NEXT:    v_lshlrev_b64 v[2:3], v9, v[2:3]
+; GFX10-NEXT:    v_lshlrev_b64 v[4:5], v10, v[4:5]
+; GFX10-NEXT:    v_lshlrev_b64 v[6:7], v11, v[6:7]
+; GFX10-NEXT:    v_sub_nc_u32_e32 v8, 32, v8
+; GFX10-NEXT:    v_min_u32_e32 v0, 1, v0
+; GFX10-NEXT:    v_min_u32_e32 v2, 1, v2
 ; GFX10-NEXT:    v_min_u32_e32 v4, 1, v4
 ; GFX10-NEXT:    v_min_u32_e32 v6, 1, v6
-; GFX10-NEXT:    v_min_u32_e32 v2, 1, v2
-; GFX10-NEXT:    v_min_u32_e32 v0, 1, v0
-; GFX10-NEXT:    v_or_b32_e32 v4, v5, v4
-; GFX10-NEXT:    v_or_b32_e32 v5, v7, v6
-; GFX10-NEXT:    v_or_b32_e32 v2, v3, v2
 ; GFX10-NEXT:    v_or_b32_e32 v0, v1, v0
-; GFX10-NEXT:    v_sub_nc_u32_e32 v6, 32, v8
-; GFX10-NEXT:    v_cvt_f32_u32_e32 v1, v4
-; GFX10-NEXT:    v_cvt_f32_u32_e32 v3, v5
-; GFX10-NEXT:    v_sub_nc_u32_e32 v4, 32, v9
+; GFX10-NEXT:    v_or_b32_e32 v1, v3, v2
+; GFX10-NEXT:    v_or_b32_e32 v2, v5, v4
+; GFX10-NEXT:    v_or_b32_e32 v4, v7, v6
+; GFX10-NEXT:    v_sub_nc_u32_e32 v3, 32, v9
+; GFX10-NEXT:    v_cvt_f32_u32_e32 v0, v0
+; GFX10-NEXT:    v_cvt_f32_u32_e32 v1, v1
 ; GFX10-NEXT:    v_cvt_f32_u32_e32 v2, v2
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v5, 32, v10
-; GFX10-NEXT:    v_cvt_f32_u32_e32 v0, v0
-; GFX10-NEXT:    v_sub_nc_u32_e32 v7, 32, v11
-; GFX10-NEXT:    v_ldexp_f32 v3, v3, v4
-; GFX10-NEXT:    v_ldexp_f32 v1, v1, v6
+; GFX10-NEXT:    v_sub_nc_u32_e32 v6, 32, v11
+; GFX10-NEXT:    v_cvt_f32_u32_e32 v4, v4
+; GFX10-NEXT:    v_ldexp_f32 v0, v0, v8
+; GFX10-NEXT:    v_ldexp_f32 v1, v1, v3
 ; GFX10-NEXT:    v_ldexp_f32 v2, v2, v5
-; GFX10-NEXT:    v_ldexp_f32 v0, v0, v7
-; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
-; GFX10-NEXT:    v_perm_b32 v0, v0, v2, 0x3020706
-; GFX10-NEXT:    v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT:    v_ldexp_f32 v3, v4, v6
+; GFX10-NEXT:    v_perm_b32 v0, v1, v0, 0x7060302
+; GFX10-NEXT:    v_perm_b32 v1, v3, v2, 0x7060302
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_uitofp_v4i64_to_v4bf16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_clz_i32_u32_e32 v8, v5
-; GFX11-NEXT:    v_clz_i32_u32_e32 v9, v7
-; GFX11-NEXT:    v_clz_i32_u32_e32 v10, v3
-; GFX11-NEXT:    v_clz_i32_u32_e32 v11, v1
+; GFX11-NEXT:    v_clz_i32_u32_e32 v8, v1
+; GFX11-NEXT:    v_clz_i32_u32_e32 v9, v3
+; GFX11-NEXT:    v_clz_i32_u32_e32 v10, v5
+; GFX11-NEXT:    v_clz_i32_u32_e32 v11, v7
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX11-NEXT:    v_min_u32_e32 v8, 32, v8
 ; GFX11-NEXT:    v_min_u32_e32 v9, 32, v9
@@ -26232,40 +24965,37 @@ define <4 x bfloat> @v_uitofp_v4i64_to_v4bf16(<4 x i64> %x) {
 ; GFX11-NEXT:    v_min_u32_e32 v10, 32, v10
 ; GFX11-NEXT:    v_min_u32_e32 v11, 32, v11
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_lshlrev_b64 v[4:5], v8, v[4:5]
-; GFX11-NEXT:    v_lshlrev_b64 v[6:7], v9, v[6:7]
+; GFX11-NEXT:    v_lshlrev_b64 v[0:1], v8, v[0:1]
+; GFX11-NEXT:    v_lshlrev_b64 v[2:3], v9, v[2:3]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_lshlrev_b64 v[2:3], v10, v[2:3]
-; GFX11-NEXT:    v_lshlrev_b64 v[0:1], v11, v[0:1]
+; GFX11-NEXT:    v_lshlrev_b64 v[4:5], v10, v[4:5]
+; GFX11-NEXT:    v_lshlrev_b64 v[6:7], v11, v[6:7]
 ; GFX11-NEXT:    v_sub_nc_u32_e32 v8, 32, v8
-; GFX11-NEXT:    v_sub_nc_u32_e32 v9, 32, v9
+; GFX11-NEXT:    v_min_u32_e32 v0, 1, v0
+; GFX11-NEXT:    v_min_u32_e32 v2, 1, v2
 ; GFX11-NEXT:    v_min_u32_e32 v4, 1, v4
 ; GFX11-NEXT:    v_min_u32_e32 v6, 1, v6
-; GFX11-NEXT:    v_min_u32_e32 v2, 1, v2
-; GFX11-NEXT:    v_min_u32_e32 v0, 1, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_or_b32_e32 v4, v5, v4
-; GFX11-NEXT:    v_or_b32_e32 v5, v7, v6
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_or_b32_e32 v2, v3, v2
 ; GFX11-NEXT:    v_or_b32_e32 v0, v1, v0
-; GFX11-NEXT:    v_sub_nc_u32_e32 v6, 32, v10
-; GFX11-NEXT:    v_cvt_f32_u32_e32 v1, v4
-; GFX11-NEXT:    v_cvt_f32_u32_e32 v3, v5
-; GFX11-NEXT:    v_sub_nc_u32_e32 v4, 32, v11
-; GFX11-NEXT:    v_cvt_f32_u32_e32 v2, v2
-; GFX11-NEXT:    v_cvt_f32_u32_e32 v0, v0
-; GFX11-NEXT:    v_ldexp_f32 v1, v1, v8
-; GFX11-NEXT:    v_ldexp_f32 v3, v3, v9
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_ldexp_f32 v2, v2, v6
-; GFX11-NEXT:    v_ldexp_f32 v0, v0, v4
+; GFX11-NEXT:    v_or_b32_e32 v1, v3, v2
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_perm_b32 v0, v0, v2, 0x3020706
-; GFX11-NEXT:    v_or_b32_e32 v1, v1, v3
+; GFX11-NEXT:    v_or_b32_e32 v2, v5, v4
+; GFX11-NEXT:    v_or_b32_e32 v4, v7, v6
+; GFX11-NEXT:    v_sub_nc_u32_e32 v3, 32, v9
+; GFX11-NEXT:    v_cvt_f32_u32_e32 v0, v0
+; GFX11-NEXT:    v_cvt_f32_u32_e32 v1, v1
+; GFX11-NEXT:    v_cvt_f32_u32_e32 v2, v2
+; GFX11-NEXT:    v_sub_nc_u32_e32 v5, 32, v10
+; GFX11-NEXT:    v_sub_nc_u32_e32 v6, 32, v11
+; GFX11-NEXT:    v_cvt_f32_u32_e32 v4, v4
+; GFX11-NEXT:    v_ldexp_f32 v0, v0, v8
+; GFX11-NEXT:    v_ldexp_f32 v1, v1, v3
+; GFX11-NEXT:    v_ldexp_f32 v2, v2, v5
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_ldexp_f32 v3, v4, v6
+; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x7060302
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT:    v_perm_b32 v1, v3, v2, 0x7060302
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = uitofp <4 x i64> %x to <4 x bfloat>
   ret <4 x bfloat> %op
@@ -26294,46 +25024,33 @@ define bfloat @v_select_bf16(i1 %cond, bfloat %a, bfloat %b) {
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
 ; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_select_bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
-; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_select_bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX10-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc_lo
-; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_select_bf16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
 ; GFX11-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = select i1 %cond, bfloat %a, bfloat %b
   ret bfloat %op
@@ -26364,50 +25081,37 @@ define bfloat @v_select_fneg_lhs_bf16(i1 %cond, bfloat %a, bfloat %b) {
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX8-NEXT:    v_xor_b32_e32 v1, 0x80000000, v1
-; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
-; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX8-NEXT:    v_xor_b32_e32 v1, 0xffff8000, v1
 ; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_select_fneg_lhs_bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX9-NEXT:    v_xor_b32_e32 v1, 0x80000000, v1
-; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
-; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX9-NEXT:    v_xor_b32_e32 v1, 0xffff8000, v1
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
-; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_select_fneg_lhs_bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_xor_b32_e32 v1, 0x80000000, v1
 ; GFX10-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX10-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
-; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX10-NEXT:    v_xor_b32_e32 v1, 0xffff8000, v1
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc_lo
-; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_select_fneg_lhs_bf16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_xor_b32_e32 v1, 0x80000000, v1
 ; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    v_xor_b32_e32 v1, 0xffff8000, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc_lo
-; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %neg.a = fneg bfloat %a
   %op = select i1 %cond, bfloat %neg.a, bfloat %b
@@ -26439,50 +25143,37 @@ define bfloat @v_select_fneg_rhs_bf16(i1 %cond, bfloat %a, bfloat %b) {
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX8-NEXT:    v_xor_b32_e32 v2, 0x80000000, v2
-; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX8-NEXT:    v_xor_b32_e32 v2, 0xffff8000, v2
 ; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_select_fneg_rhs_bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX9-NEXT:    v_xor_b32_e32 v2, 0x80000000, v2
-; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX9-NEXT:    v_xor_b32_e32 v2, 0xffff8000, v2
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
-; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_select_fneg_rhs_bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_xor_b32_e32 v2, 0x80000000, v2
 ; GFX10-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX10-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX10-NEXT:    v_xor_b32_e32 v2, 0xffff8000, v2
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc_lo
-; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_select_fneg_rhs_bf16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_xor_b32_e32 v2, 0x80000000, v2
 ; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-NEXT:    v_xor_b32_e32 v2, 0xffff8000, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc_lo
-; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %neg.b = fneg bfloat %b
   %op = select i1 %cond, bfloat %a, bfloat %neg.b
@@ -26524,11 +25215,11 @@ define <2 x bfloat> @v_select_v2bf16(i1 %cond, <2 x bfloat> %a, <2 x bfloat> %b)
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
-; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
 ; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v4, v3, vcc
+; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
@@ -26537,13 +25228,13 @@ define <2 x bfloat> @v_select_v2bf16(i1 %cond, <2 x bfloat> %a, <2 x bfloat> %b)
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
-; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v4, v3, vcc
-; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX9-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
+; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
+; GFX9-NEXT:    v_perm_b32 v0, v1, v0, s4
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_select_v2bf16:
@@ -26555,8 +25246,7 @@ define <2 x bfloat> @v_select_v2bf16(i1 %cond, <2 x bfloat> %a, <2 x bfloat> %b)
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v4, v3, vcc_lo
-; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX10-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_select_v2bf16:
@@ -26568,11 +25258,8 @@ define <2 x bfloat> @v_select_v2bf16(i1 %cond, <2 x bfloat> %a, <2 x bfloat> %b)
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
 ; GFX11-NEXT:    v_dual_cndmask_b32 v0, v2, v1 :: v_dual_cndmask_b32 v1, v4, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = select i1 %cond, <2 x bfloat> %a, <2 x bfloat> %b
   ret <2 x bfloat> %op
@@ -26611,11 +25298,11 @@ define <2 x bfloat> @v_vselect_v2bf16(<2 x i1> %cond, <2 x bfloat> %a, <2 x bflo
 ; GFX8-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX8-NEXT:    v_and_b32_e32 v1, 1, v1
 ; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
-; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v3
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
 ; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v1
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v5, v4, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v3, v2, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
@@ -26626,28 +25313,27 @@ define <2 x bfloat> @v_vselect_v2bf16(<2 x i1> %cond, <2 x bfloat> %a, <2 x bflo
 ; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX9-NEXT:    v_and_b32_e32 v1, 1, v1
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
-; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 16, v3
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v1
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v4, vcc
-; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX9-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v2, vcc
+; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
+; GFX9-NEXT:    v_perm_b32 v0, v1, v0, s4
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_vselect_v2bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX10-NEXT:    v_and_b32_e32 v1, 1, v1
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v5, 16, v3
-; GFX10-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v1
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v5, v4, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc_lo
-; GFX10-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v1
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v5, v4, vcc_lo
+; GFX10-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_vselect_v2bf16:
@@ -26660,12 +25346,9 @@ define <2 x bfloat> @v_vselect_v2bf16(<2 x i1> %cond, <2 x bfloat> %a, <2 x bflo
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
 ; GFX11-NEXT:    v_dual_cndmask_b32 v0, v3, v2 :: v_dual_and_b32 v1, 1, v1
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_cndmask_b32_e32 v1, v5, v4, vcc_lo
-; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = select <2 x i1> %cond, <2 x bfloat> %a, <2 x bfloat> %b
   ret <2 x bfloat> %op
@@ -26694,8 +25377,6 @@ define amdgpu_ps i32 @s_select_bf16(bfloat inreg %a, bfloat inreg %b, i32 %c) {
 ;
 ; GFX8-LABEL: s_select_bf16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_lshr_b32 s0, s0, 16
-; GFX8-NEXT:    s_lshr_b32 s1, s1, 16
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
@@ -26706,8 +25387,6 @@ define amdgpu_ps i32 @s_select_bf16(bfloat inreg %a, bfloat inreg %b, i32 %c) {
 ;
 ; GFX9-LABEL: s_select_bf16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_lshr_b32 s0, s0, 16
-; GFX9-NEXT:    s_lshr_b32 s1, s1, 16
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
@@ -26718,25 +25397,21 @@ define amdgpu_ps i32 @s_select_bf16(bfloat inreg %a, bfloat inreg %b, i32 %c) {
 ;
 ; GFX10-LABEL: s_select_bf16:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_lshr_b32 s0, s0, 16
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, s0
-; GFX10-NEXT:    s_lshr_b32 s0, s1, 16
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, s0, v1, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, s1, v1, vcc_lo
 ; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX10-NEXT:    ; return to shader part epilog
 ;
 ; GFX11-LABEL: s_select_bf16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_lshr_b32 s0, s0, 16
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX11-NEXT:    v_mov_b32_e32 v1, s0
-; GFX11-NEXT:    s_lshr_b32 s0, s1, 16
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, s0, v1, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, s1, v1, vcc_lo
 ; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX11-NEXT:    ; return to shader part epilog
   %cond = icmp eq i32 %c, 0
@@ -26810,23 +25485,22 @@ define amdgpu_ps i32 @s_select_v2bf16(<2 x bfloat> inreg %a, <2 x bfloat> inreg
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s0
-; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
-; GFX9-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT:    s_mov_b32 s0, 0x5040100
+; GFX9-NEXT:    v_perm_b32 v0, v0, v1, s0
 ; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX9-NEXT:    ; return to shader part epilog
 ;
 ; GFX10-LABEL: s_select_v2bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_lshr_b32 s2, s0, 16
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX10-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, s2
-; GFX10-NEXT:    s_lshr_b32 s2, s1, 16
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, s2, v1, vcc_lo
-; GFX10-NEXT:    v_mov_b32_e32 v1, s0
-; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, s1, v1, vcc_lo
-; GFX10-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX10-NEXT:    s_lshr_b32 s3, s1, 16
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, s3, v1, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, s1, v2, vcc_lo
+; GFX10-NEXT:    v_perm_b32 v0, v0, v1, 0x5040100
 ; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX10-NEXT:    ; return to shader part epilog
 ;
@@ -26838,11 +25512,10 @@ define amdgpu_ps i32 @s_select_v2bf16(<2 x bfloat> inreg %a, <2 x bfloat> inreg
 ; GFX11-NEXT:    s_lshr_b32 s3, s1, 16
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
 ; GFX11-NEXT:    v_cndmask_b32_e32 v0, s3, v1, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_dual_cndmask_b32 v1, s1, v2 :: v_dual_lshlrev_b32 v0, 16, v0
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, s1, v2, vcc_lo
+; GFX11-NEXT:    v_perm_b32 v0, v0, v1, 0x5040100
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX11-NEXT:    ; return to shader part epilog
   %cond = icmp eq i32 %c, 0
@@ -26913,9 +25586,9 @@ define amdgpu_ps i32 @s_vselect_v2bf16(<2 x bfloat> inreg %a, <2 x bfloat> inreg
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s0
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc
-; GFX9-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT:    s_mov_b32 s0, 0x5040100
+; GFX9-NEXT:    v_perm_b32 v0, v1, v0, s0
 ; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX9-NEXT:    ; return to shader part epilog
 ;
@@ -26924,13 +25597,12 @@ define amdgpu_ps i32 @s_vselect_v2bf16(<2 x bfloat> inreg %a, <2 x bfloat> inreg
 ; GFX10-NEXT:    s_lshr_b32 s2, s0, 16
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
 ; GFX10-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-NEXT:    s_lshr_b32 s2, s1, 16
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, s2, v2, vcc_lo
-; GFX10-NEXT:    v_mov_b32_e32 v2, s0
+; GFX10-NEXT:    v_mov_b32_e32 v3, s0
+; GFX10-NEXT:    s_lshr_b32 s0, s1, 16
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, s0, v2, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, s1, v2, vcc_lo
-; GFX10-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, s1, v3, vcc_lo
+; GFX10-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
 ; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX10-NEXT:    ; return to shader part epilog
 ;
@@ -26943,11 +25615,10 @@ define amdgpu_ps i32 @s_vselect_v2bf16(<2 x bfloat> inreg %a, <2 x bfloat> inreg
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
 ; GFX11-NEXT:    v_cndmask_b32_e32 v1, s0, v2, vcc_lo
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_dual_cndmask_b32 v0, s1, v3 :: v_dual_lshlrev_b32 v1, 16, v1
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, s1, v3, vcc_lo
+; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX11-NEXT:    ; return to shader part epilog
   %cond = icmp eq <2 x i32> %c, zeroinitializer
@@ -27000,30 +25671,37 @@ define <3 x bfloat> @v_select_v3bf16(i1 %cond, <3 x bfloat> %a, <3 x bfloat> %b)
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
-; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 16, v3
 ; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v3, v1, vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v6, v5, vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
+; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff, v2
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v4, v2, vcc
+; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v4
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX8-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_select_v3bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
-; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 16, v3
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v1, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v6, v5, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
-; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX9-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff, v2
+; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
+; GFX9-NEXT:    v_perm_b32 v0, v1, v0, s4
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v4, v2, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v4
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc
+; GFX9-NEXT:    v_perm_b32 v1, v2, v1, s4
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_select_v3bf16:
@@ -27032,13 +25710,15 @@ define <3 x bfloat> @v_select_v3bf16(i1 %cond, <3 x bfloat> %a, <3 x bfloat> %b)
 ; GFX10-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v6, 16, v3
+; GFX10-NEXT:    v_lshrrev_b32_e32 v7, 16, v4
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v6, v5, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc_lo
+; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 16, v2
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc_lo
-; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX10-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff, v2
+; GFX10-NEXT:    v_cndmask_b32_e32 v3, v6, v5, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v4, v7, v0, vcc_lo
+; GFX10-NEXT:    v_perm_b32 v0, v3, v1, 0x5040100
+; GFX10-NEXT:    v_perm_b32 v1, v4, v2, 0x5040100
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_select_v3bf16:
@@ -27046,17 +25726,17 @@ define <3 x bfloat> @v_select_v3bf16(i1 %cond, <3 x bfloat> %a, <3 x bfloat> %b)
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v6, 16, v3
+; GFX11-NEXT:    v_lshrrev_b32_e32 v7, 16, v4
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_3)
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v3
-; GFX11-NEXT:    v_dual_cndmask_b32 v1, v3, v1 :: v_dual_cndmask_b32 v2, v4, v2
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v0, v5, vcc_lo
+; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v2
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc_lo
+; GFX11-NEXT:    v_dual_cndmask_b32 v3, v6, v5 :: v_dual_cndmask_b32 v2, v4, v2
+; GFX11-NEXT:    v_cndmask_b32_e32 v4, v7, v0, vcc_lo
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_or_b32_e32 v0, v1, v0
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff, v2
+; GFX11-NEXT:    v_perm_b32 v0, v3, v1, 0x5040100
+; GFX11-NEXT:    v_perm_b32 v1, v4, v2, 0x5040100
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = select i1 %cond, <3 x bfloat> %a, <3 x bfloat> %b
   ret <3 x bfloat> %op
@@ -27113,38 +25793,37 @@ define <4 x bfloat> @v_select_v4bf16(i1 %cond, <4 x bfloat> %a, <4 x bfloat> %b)
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 16, v2
-; GFX8-NEXT:    v_lshrrev_b32_e32 v8, 16, v4
 ; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
-; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 16, v3
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v8, v7, vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v0
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v3, v1, vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v6, v5, vcc
+; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX8-NEXT:    v_or_b32_sdwa v1, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v4, v2, vcc
+; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v4
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX8-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_select_v4bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
-; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 16, v3
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 16, v2
-; GFX9-NEXT:    v_lshrrev_b32_e32 v8, 16, v4
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v1, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v6, v5, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v4, v8, v7, vcc
-; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX9-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v4
-; GFX9-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
+; GFX9-NEXT:    v_perm_b32 v0, v1, v0, s4
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v4, v2, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v4
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc
+; GFX9-NEXT:    v_perm_b32 v1, v2, v1, s4
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_select_v4bf16:
@@ -27153,17 +25832,15 @@ define <4 x bfloat> @v_select_v4bf16(i1 %cond, <4 x bfloat> %a, <4 x bfloat> %b)
 ; GFX10-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v6, 16, v3
-; GFX10-NEXT:    v_lshrrev_b32_e32 v7, 16, v2
+; GFX10-NEXT:    v_lshrrev_b32_e32 v7, 16, v4
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 16, v4
-; GFX10-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v4, v6, v5, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v7, vcc_lo
+; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 16, v2
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc_lo
-; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v4
-; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v0
-; GFX10-NEXT:    v_or_b32_sdwa v0, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX10-NEXT:    v_or_b32_sdwa v1, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-NEXT:    v_cndmask_b32_e32 v3, v6, v5, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v4, v7, v0, vcc_lo
+; GFX10-NEXT:    v_perm_b32 v0, v3, v1, 0x5040100
+; GFX10-NEXT:    v_perm_b32 v1, v4, v2, 0x5040100
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_select_v4bf16:
@@ -27173,20 +25850,15 @@ define <4 x bfloat> @v_select_v4bf16(i1 %cond, <4 x bfloat> %a, <4 x bfloat> %b)
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v6, 16, v3
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v7, 16, v4
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_3)
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v2
-; GFX11-NEXT:    v_dual_cndmask_b32 v1, v3, v1 :: v_dual_cndmask_b32 v2, v4, v2
-; GFX11-NEXT:    v_dual_cndmask_b32 v3, v6, v5 :: v_dual_cndmask_b32 v0, v7, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 16, v0
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc_lo
+; GFX11-NEXT:    v_dual_cndmask_b32 v3, v6, v5 :: v_dual_cndmask_b32 v2, v4, v2
+; GFX11-NEXT:    v_cndmask_b32_e32 v4, v7, v0, vcc_lo
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_or_b32_e32 v0, v1, v3
-; GFX11-NEXT:    v_or_b32_e32 v1, v2, v4
+; GFX11-NEXT:    v_perm_b32 v0, v3, v1, 0x5040100
+; GFX11-NEXT:    v_perm_b32 v1, v4, v2, 0x5040100
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = select i1 %cond, <4 x bfloat> %a, <4 x bfloat> %b
   ret <4 x bfloat> %op
@@ -27259,105 +25931,92 @@ define <6 x bfloat> @v_select_v6bf16(i1 %cond, <6 x bfloat> %a, <6 x bfloat> %b)
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX8-NEXT:    v_lshrrev_b32_e32 v11, 16, v1
-; GFX8-NEXT:    v_lshrrev_b32_e32 v12, 16, v4
 ; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v12, v11, vcc
-; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 16, v3
-; GFX8-NEXT:    v_lshrrev_b32_e32 v10, 16, v6
+; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 16, v2
-; GFX8-NEXT:    v_lshrrev_b32_e32 v8, 16, v5
 ; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v10, v9, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v5, v2, vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v2, v8, v7, vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v3, v6, v3, vcc
+; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v5
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX8-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX8-NEXT:    v_or_b32_sdwa v2, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v6, v3, vcc
+; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v6
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v4, v3, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX8-NEXT:    v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_select_v6bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX9-NEXT:    v_lshrrev_b32_e32 v11, 16, v1
-; GFX9-NEXT:    v_lshrrev_b32_e32 v12, 16, v4
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v12, v11, vcc
-; GFX9-NEXT:    v_lshrrev_b32_e32 v9, 16, v3
-; GFX9-NEXT:    v_lshrrev_b32_e32 v10, 16, v6
-; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 16, v2
-; GFX9-NEXT:    v_lshrrev_b32_e32 v8, 16, v5
-; GFX9-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v10, v9, vcc
-; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 16, v1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
+; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
+; GFX9-NEXT:    v_perm_b32 v0, v1, v0, s4
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v2, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, v8, v7, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v3, v6, v3, vcc
-; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX9-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX9-NEXT:    v_or_b32_sdwa v2, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 16, v5
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
+; GFX9-NEXT:    v_perm_b32 v1, v2, v1, s4
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v6, v3, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 16, v6
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v4, v3, vcc
+; GFX9-NEXT:    v_perm_b32 v2, v3, v2, s4
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_select_v6bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX10-NEXT:    v_lshrrev_b32_e32 v7, 16, v2
-; GFX10-NEXT:    v_lshrrev_b32_e32 v8, 16, v1
-; GFX10-NEXT:    v_lshrrev_b32_e32 v9, 16, v4
-; GFX10-NEXT:    v_lshrrev_b32_e32 v10, 16, v3
+; GFX10-NEXT:    v_lshrrev_b32_e32 v7, 16, v1
+; GFX10-NEXT:    v_lshrrev_b32_e32 v8, 16, v5
+; GFX10-NEXT:    v_lshrrev_b32_e32 v9, 16, v3
+; GFX10-NEXT:    v_lshrrev_b32_e32 v10, 16, v6
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 16, v5
-; GFX10-NEXT:    v_lshrrev_b32_e32 v11, 16, v6
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v4, v9, v8, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v7, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v7, v11, v10, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v3, v6, v3, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc_lo
+; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v4
+; GFX10-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
 ; GFX10-NEXT:    v_cndmask_b32_e32 v2, v5, v2, vcc_lo
-; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
-; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 16, v7
-; GFX10-NEXT:    v_or_b32_sdwa v0, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX10-NEXT:    v_or_b32_sdwa v1, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX10-NEXT:    v_or_b32_sdwa v2, v3, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-NEXT:    v_cndmask_b32_e32 v3, v6, v3, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v5, v10, v9, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v7, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v4, v8, v4, vcc_lo
+; GFX10-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
+; GFX10-NEXT:    v_perm_b32 v1, v4, v2, 0x5040100
+; GFX10-NEXT:    v_perm_b32 v2, v5, v3, 0x5040100
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_select_v6bf16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v9, 16, v2
-; GFX11-NEXT:    v_lshrrev_b32_e32 v10, 16, v5
+; GFX11-NEXT:    v_lshrrev_b32_e32 v7, 16, v1
 ; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX11-NEXT:    v_lshrrev_b32_e32 v7, 16, v3
-; GFX11-NEXT:    v_lshrrev_b32_e32 v8, 16, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v8, 16, v5
+; GFX11-NEXT:    v_lshrrev_b32_e32 v9, 16, v3
+; GFX11-NEXT:    v_lshrrev_b32_e32 v10, 16, v6
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc_lo
+; GFX11-NEXT:    v_dual_cndmask_b32 v0, v4, v1 :: v_dual_cndmask_b32 v3, v6, v3
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v4
-; GFX11-NEXT:    v_lshrrev_b32_e32 v4, 16, v6
+; GFX11-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
 ; GFX11-NEXT:    v_dual_cndmask_b32 v2, v5, v2 :: v_dual_cndmask_b32 v5, v10, v9
-; GFX11-NEXT:    v_cndmask_b32_e32 v3, v6, v3, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_cndmask_b32_e32 v1, v1, v8, vcc_lo
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT:    v_dual_cndmask_b32 v4, v4, v7 :: v_dual_lshlrev_b32 v5, 16, v5
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff, v3
-; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX11-NEXT:    v_or_b32_e32 v1, v2, v5
-; GFX11-NEXT:    v_or_b32_e32 v2, v3, v4
+; GFX11-NEXT:    v_dual_cndmask_b32 v1, v1, v7 :: v_dual_cndmask_b32 v4, v8, v4
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
+; GFX11-NEXT:    v_perm_b32 v1, v4, v2, 0x5040100
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-NEXT:    v_perm_b32 v2, v5, v3, 0x5040100
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = select i1 %cond, <6 x bfloat> %a, <6 x bfloat> %b
   ret <6 x bfloat> %op
@@ -27446,62 +26105,59 @@ define <8 x bfloat> @v_select_v8bf16(i1 %cond, <8 x bfloat> %a, <8 x bfloat> %b)
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX8-NEXT:    v_lshrrev_b32_e32 v15, 16, v4
-; GFX8-NEXT:    v_lshrrev_b32_e32 v16, 16, v8
 ; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX8-NEXT:    v_lshrrev_b32_e32 v11, 16, v2
-; GFX8-NEXT:    v_lshrrev_b32_e32 v12, 16, v6
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v16, v15, vcc
-; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 16, v1
-; GFX8-NEXT:    v_lshrrev_b32_e32 v10, 16, v5
-; GFX8-NEXT:    v_cndmask_b32_e32 v4, v8, v4, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v8, 16, v0
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v12, v11, vcc
-; GFX8-NEXT:    v_lshrrev_b32_e32 v13, 16, v3
-; GFX8-NEXT:    v_lshrrev_b32_e32 v14, 16, v7
-; GFX8-NEXT:    v_cndmask_b32_e32 v2, v6, v2, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v0
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v5, v1, vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v10, v9, vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v3, v7, v3, vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v7, v14, v13, vcc
+; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX8-NEXT:    v_or_b32_sdwa v1, v2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v7
-; GFX8-NEXT:    v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX8-NEXT:    v_or_b32_sdwa v3, v4, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v6, v2, vcc
+; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v6
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v5, v2, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX8-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v7, v3, vcc
+; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v7
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX8-NEXT:    v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v8, v4, vcc
+; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
+; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v8
+; GFX8-NEXT:    v_cndmask_b32_e32 v4, v5, v4, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; GFX8-NEXT:    v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_select_v8bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX9-NEXT:    v_lshrrev_b32_e32 v9, 16, v1
-; GFX9-NEXT:    v_lshrrev_b32_e32 v10, 16, v5
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX9-NEXT:    v_lshrrev_b32_e32 v11, 16, v2
-; GFX9-NEXT:    v_lshrrev_b32_e32 v12, 16, v6
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v5, v1, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v10, v9, vcc
-; GFX9-NEXT:    v_lshrrev_b32_e32 v13, 16, v3
-; GFX9-NEXT:    v_lshrrev_b32_e32 v14, 16, v7
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, v6, v2, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v6, v12, v11, vcc
-; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX9-NEXT:    v_lshrrev_b32_e32 v15, 16, v4
-; GFX9-NEXT:    v_lshrrev_b32_e32 v16, 16, v8
-; GFX9-NEXT:    v_cndmask_b32_e32 v3, v7, v3, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v7, v14, v13, vcc
-; GFX9-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v6
-; GFX9-NEXT:    v_cndmask_b32_e32 v4, v8, v4, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v8, v16, v15, vcc
-; GFX9-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v7
-; GFX9-NEXT:    v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 16, v8
-; GFX9-NEXT:    v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
+; GFX9-NEXT:    v_perm_b32 v0, v1, v0, s4
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v6, v2, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 16, v6
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v5, v2, vcc
+; GFX9-NEXT:    v_perm_b32 v1, v2, v1, s4
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v7, v3, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 16, v7
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
+; GFX9-NEXT:    v_perm_b32 v2, v3, v2, s4
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v8, v4, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
+; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 16, v8
+; GFX9-NEXT:    v_cndmask_b32_e32 v4, v5, v4, vcc
+; GFX9-NEXT:    v_perm_b32 v3, v4, v3, s4
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_select_v8bf16:
@@ -27510,63 +26166,52 @@ define <8 x bfloat> @v_select_v8bf16(i1 %cond, <8 x bfloat> %a, <8 x bfloat> %b)
 ; GFX10-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v9, 16, v1
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v10, 16, v5
-; GFX10-NEXT:    v_lshrrev_b32_e32 v11, 16, v2
-; GFX10-NEXT:    v_lshrrev_b32_e32 v12, 16, v4
-; GFX10-NEXT:    v_lshrrev_b32_e32 v13, 16, v8
+; GFX10-NEXT:    v_lshrrev_b32_e32 v11, 16, v4
+; GFX10-NEXT:    v_lshrrev_b32_e32 v12, 16, v8
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 16, v6
-; GFX10-NEXT:    v_lshrrev_b32_e32 v14, 16, v3
-; GFX10-NEXT:    v_lshrrev_b32_e32 v15, 16, v7
-; GFX10-NEXT:    v_cndmask_b32_e32 v4, v8, v4, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v8, v13, v12, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v5, v1, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v10, v9, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v5, v6, v2, vcc_lo
+; GFX10-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX10-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
+; GFX10-NEXT:    v_lshrrev_b32_e32 v9, 16, v3
+; GFX10-NEXT:    v_lshrrev_b32_e32 v10, 16, v7
 ; GFX10-NEXT:    v_cndmask_b32_e32 v3, v7, v3, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v7, v10, v9, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v11, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v9, v15, v14, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v4, v8, v4, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e32 v2, v6, v2, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc_lo
-; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 16, v7
-; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 16, v0
-; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 16, v9
-; GFX10-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
-; GFX10-NEXT:    v_or_b32_sdwa v0, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX10-NEXT:    v_or_b32_sdwa v1, v2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX10-NEXT:    v_or_b32_sdwa v2, v3, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX10-NEXT:    v_or_b32_sdwa v3, v4, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-NEXT:    v_cndmask_b32_e32 v7, v12, v11, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v6, v10, v9, vcc_lo
+; GFX10-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
+; GFX10-NEXT:    v_perm_b32 v1, v2, v5, 0x5040100
+; GFX10-NEXT:    v_perm_b32 v2, v6, v3, 0x5040100
+; GFX10-NEXT:    v_perm_b32 v3, v7, v4, 0x5040100
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_select_v8bf16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v9, 16, v1
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v10, 16, v5
-; GFX11-NEXT:    v_lshrrev_b32_e32 v11, 16, v2
-; GFX11-NEXT:    v_lshrrev_b32_e32 v12, 16, v4
+; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX11-NEXT:    v_lshrrev_b32_e32 v11, 16, v4
+; GFX11-NEXT:    v_lshrrev_b32_e32 v12, 16, v8
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3)
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v6
-; GFX11-NEXT:    v_dual_cndmask_b32 v13, v7, v3 :: v_dual_cndmask_b32 v4, v8, v4
-; GFX11-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc_lo
-; GFX11-NEXT:    v_lshrrev_b32_e32 v8, 16, v8
-; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
-; GFX11-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
-; GFX11-NEXT:    v_dual_cndmask_b32 v5, v10, v9 :: v_dual_and_b32 v4, 0xffff, v4
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_cndmask_b32_e32 v8, v8, v12, vcc_lo
-; GFX11-NEXT:    v_dual_cndmask_b32 v2, v6, v2 :: v_dual_and_b32 v1, 0xffff, v1
-; GFX11-NEXT:    v_dual_cndmask_b32 v0, v0, v11 :: v_dual_lshlrev_b32 v5, 16, v5
-; GFX11-NEXT:    v_dual_cndmask_b32 v3, v7, v3 :: v_dual_and_b32 v12, 0xffff, v13
+; GFX11-NEXT:    v_dual_cndmask_b32 v0, v5, v1 :: v_dual_cndmask_b32 v5, v6, v2
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, v10, v9, vcc_lo
+; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
+; GFX11-NEXT:    v_lshrrev_b32_e32 v9, 16, v3
+; GFX11-NEXT:    v_lshrrev_b32_e32 v10, 16, v7
+; GFX11-NEXT:    v_dual_cndmask_b32 v3, v7, v3 :: v_dual_cndmask_b32 v4, v8, v4
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_dual_cndmask_b32 v2, v6, v2 :: v_dual_cndmask_b32 v7, v12, v11
+; GFX11-NEXT:    v_cndmask_b32_e32 v6, v10, v9, vcc_lo
+; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX11-NEXT:    v_lshlrev_b32_e32 v6, 16, v0
-; GFX11-NEXT:    v_lshlrev_b32_e32 v7, 16, v8
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-NEXT:    v_or_b32_e32 v0, v1, v5
-; GFX11-NEXT:    v_or_b32_e32 v1, v2, v6
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX11-NEXT:    v_or_b32_e32 v2, v12, v3
-; GFX11-NEXT:    v_or_b32_e32 v3, v4, v7
+; GFX11-NEXT:    v_perm_b32 v1, v2, v5, 0x5040100
+; GFX11-NEXT:    v_perm_b32 v2, v6, v3, 0x5040100
+; GFX11-NEXT:    v_perm_b32 v3, v7, v4, 0x5040100
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = select i1 %cond, <8 x bfloat> %a, <8 x bfloat> %b
   ret <8 x bfloat> %op
@@ -27727,110 +26372,103 @@ define <16 x bfloat> @v_select_v16bf16(i1 %cond, <16 x bfloat> %a, <16 x bfloat>
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX8-NEXT:    v_lshrrev_b32_e32 v20, 16, v8
 ; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v16
-; GFX8-NEXT:    v_cndmask_b32_e32 v20, v0, v20, vcc
-; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v7
-; GFX8-NEXT:    v_cndmask_b32_e32 v8, v16, v8, vcc
-; GFX8-NEXT:    v_lshrrev_b32_e32 v16, 16, v15
-; GFX8-NEXT:    v_cndmask_b32_e32 v16, v16, v0, vcc
-; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v6
-; GFX8-NEXT:    v_cndmask_b32_e32 v7, v15, v7, vcc
-; GFX8-NEXT:    v_lshrrev_b32_e32 v15, 16, v14
-; GFX8-NEXT:    v_cndmask_b32_e32 v15, v15, v0, vcc
-; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v5
-; GFX8-NEXT:    v_cndmask_b32_e32 v6, v14, v6, vcc
-; GFX8-NEXT:    v_lshrrev_b32_e32 v14, 16, v13
-; GFX8-NEXT:    v_cndmask_b32_e32 v14, v14, v0, vcc
-; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v4
-; GFX8-NEXT:    v_cndmask_b32_e32 v5, v13, v5, vcc
-; GFX8-NEXT:    v_lshrrev_b32_e32 v13, 16, v12
-; GFX8-NEXT:    v_cndmask_b32_e32 v13, v13, v0, vcc
-; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v3
-; GFX8-NEXT:    v_cndmask_b32_e32 v4, v12, v4, vcc
-; GFX8-NEXT:    v_lshrrev_b32_e32 v12, 16, v11
-; GFX8-NEXT:    v_lshrrev_b32_e32 v17, 16, v1
-; GFX8-NEXT:    v_lshrrev_b32_e32 v18, 16, v9
-; GFX8-NEXT:    v_lshrrev_b32_e32 v19, 16, v2
-; GFX8-NEXT:    v_cndmask_b32_e32 v12, v12, v0, vcc
-; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v10
-; GFX8-NEXT:    v_cndmask_b32_e32 v3, v11, v3, vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v11, v0, v19, vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v18, v17, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v9, v1, vcc
+; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 16, v9
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v9, v1, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX8-NEXT:    v_cndmask_b32_e32 v2, v10, v2, vcc
-; GFX8-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v11
-; GFX8-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v12
-; GFX8-NEXT:    v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v13
-; GFX8-NEXT:    v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v14
-; GFX8-NEXT:    v_or_b32_sdwa v4, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v15
-; GFX8-NEXT:    v_or_b32_sdwa v5, v6, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v16
-; GFX8-NEXT:    v_or_b32_sdwa v6, v7, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX8-NEXT:    v_lshlrev_b32_e32 v7, 16, v20
-; GFX8-NEXT:    v_or_b32_sdwa v7, v8, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v10, v2, vcc
+; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 16, v10
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v9, v2, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX8-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v11, v3, vcc
+; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 16, v11
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v9, v3, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX8-NEXT:    v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v12, v4, vcc
+; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
+; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 16, v12
+; GFX8-NEXT:    v_cndmask_b32_e32 v4, v9, v4, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; GFX8-NEXT:    v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    v_cndmask_b32_e32 v4, v13, v5, vcc
+; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
+; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 16, v13
+; GFX8-NEXT:    v_cndmask_b32_e32 v5, v9, v5, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX8-NEXT:    v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    v_cndmask_b32_e32 v5, v14, v6, vcc
+; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
+; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 16, v14
+; GFX8-NEXT:    v_cndmask_b32_e32 v6, v9, v6, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
+; GFX8-NEXT:    v_or_b32_sdwa v5, v5, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    v_cndmask_b32_e32 v6, v15, v7, vcc
+; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
+; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 16, v15
+; GFX8-NEXT:    v_cndmask_b32_e32 v7, v9, v7, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
+; GFX8-NEXT:    v_or_b32_sdwa v6, v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    v_cndmask_b32_e32 v7, v16, v8, vcc
+; GFX8-NEXT:    v_lshrrev_b32_e32 v8, 16, v8
+; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 16, v16
+; GFX8-NEXT:    v_cndmask_b32_e32 v8, v9, v8, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
+; GFX8-NEXT:    v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_select_v16bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX9-NEXT:    v_lshrrev_b32_e32 v20, 16, v8
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v16
-; GFX9-NEXT:    v_cndmask_b32_e32 v20, v0, v20, vcc
-; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v7
-; GFX9-NEXT:    v_cndmask_b32_e32 v8, v16, v8, vcc
-; GFX9-NEXT:    v_lshrrev_b32_e32 v16, 16, v15
-; GFX9-NEXT:    v_cndmask_b32_e32 v16, v16, v0, vcc
-; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v6
-; GFX9-NEXT:    v_cndmask_b32_e32 v7, v15, v7, vcc
-; GFX9-NEXT:    v_lshrrev_b32_e32 v15, 16, v14
-; GFX9-NEXT:    v_cndmask_b32_e32 v15, v15, v0, vcc
-; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v5
-; GFX9-NEXT:    v_cndmask_b32_e32 v6, v14, v6, vcc
-; GFX9-NEXT:    v_lshrrev_b32_e32 v14, 16, v13
-; GFX9-NEXT:    v_cndmask_b32_e32 v14, v14, v0, vcc
-; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v4
-; GFX9-NEXT:    v_cndmask_b32_e32 v5, v13, v5, vcc
-; GFX9-NEXT:    v_lshrrev_b32_e32 v13, 16, v12
-; GFX9-NEXT:    v_cndmask_b32_e32 v13, v13, v0, vcc
-; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v3
-; GFX9-NEXT:    v_cndmask_b32_e32 v4, v12, v4, vcc
-; GFX9-NEXT:    v_lshrrev_b32_e32 v12, 16, v11
-; GFX9-NEXT:    v_lshrrev_b32_e32 v17, 16, v1
-; GFX9-NEXT:    v_lshrrev_b32_e32 v18, 16, v9
-; GFX9-NEXT:    v_lshrrev_b32_e32 v19, 16, v2
-; GFX9-NEXT:    v_cndmask_b32_e32 v12, v12, v0, vcc
-; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v10
-; GFX9-NEXT:    v_cndmask_b32_e32 v3, v11, v3, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v11, v0, v19, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v18, v17, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v9, v1, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v9, 16, v9
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v9, v1, vcc
-; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, v10, v2, vcc
-; GFX9-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v11
-; GFX9-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v12
-; GFX9-NEXT:    v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 16, v13
-; GFX9-NEXT:    v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 16, v14
-; GFX9-NEXT:    v_or_b32_sdwa v4, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 16, v15
-; GFX9-NEXT:    v_or_b32_sdwa v5, v6, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX9-NEXT:    v_lshlrev_b32_e32 v6, 16, v16
-; GFX9-NEXT:    v_or_b32_sdwa v6, v7, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX9-NEXT:    v_lshlrev_b32_e32 v7, 16, v20
-; GFX9-NEXT:    v_or_b32_sdwa v7, v8, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
+; GFX9-NEXT:    v_perm_b32 v0, v1, v0, s4
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v10, v2, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX9-NEXT:    v_lshrrev_b32_e32 v9, 16, v10
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v9, v2, vcc
+; GFX9-NEXT:    v_perm_b32 v1, v2, v1, s4
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v11, v3, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX9-NEXT:    v_lshrrev_b32_e32 v9, 16, v11
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v9, v3, vcc
+; GFX9-NEXT:    v_perm_b32 v2, v3, v2, s4
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v12, v4, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
+; GFX9-NEXT:    v_lshrrev_b32_e32 v9, 16, v12
+; GFX9-NEXT:    v_cndmask_b32_e32 v4, v9, v4, vcc
+; GFX9-NEXT:    v_perm_b32 v3, v4, v3, s4
+; GFX9-NEXT:    v_cndmask_b32_e32 v4, v13, v5, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
+; GFX9-NEXT:    v_lshrrev_b32_e32 v9, 16, v13
+; GFX9-NEXT:    v_cndmask_b32_e32 v5, v9, v5, vcc
+; GFX9-NEXT:    v_perm_b32 v4, v5, v4, s4
+; GFX9-NEXT:    v_cndmask_b32_e32 v5, v14, v6, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
+; GFX9-NEXT:    v_lshrrev_b32_e32 v9, 16, v14
+; GFX9-NEXT:    v_cndmask_b32_e32 v6, v9, v6, vcc
+; GFX9-NEXT:    v_perm_b32 v5, v6, v5, s4
+; GFX9-NEXT:    v_cndmask_b32_e32 v6, v15, v7, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
+; GFX9-NEXT:    v_lshrrev_b32_e32 v9, 16, v15
+; GFX9-NEXT:    v_cndmask_b32_e32 v7, v9, v7, vcc
+; GFX9-NEXT:    v_perm_b32 v6, v7, v6, s4
+; GFX9-NEXT:    v_cndmask_b32_e32 v7, v16, v8, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v8, 16, v8
+; GFX9-NEXT:    v_lshrrev_b32_e32 v9, 16, v16
+; GFX9-NEXT:    v_cndmask_b32_e32 v8, v9, v8, vcc
+; GFX9-NEXT:    v_perm_b32 v7, v8, v7, s4
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_select_v16bf16:
@@ -27839,53 +26477,45 @@ define <16 x bfloat> @v_select_v16bf16(i1 %cond, <16 x bfloat> %a, <16 x bfloat>
 ; GFX10-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v17, 16, v1
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v18, 16, v9
-; GFX10-NEXT:    v_lshrrev_b32_e32 v19, 16, v2
-; GFX10-NEXT:    v_lshrrev_b32_e32 v20, 16, v10
-; GFX10-NEXT:    v_lshrrev_b32_e32 v21, 16, v3
-; GFX10-NEXT:    v_lshrrev_b32_e32 v22, 16, v11
-; GFX10-NEXT:    v_lshrrev_b32_e32 v27, 16, v7
+; GFX10-NEXT:    v_lshrrev_b32_e32 v19, 16, v10
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 16, v15
-; GFX10-NEXT:    v_lshrrev_b32_e32 v23, 16, v4
-; GFX10-NEXT:    v_lshrrev_b32_e32 v24, 16, v12
-; GFX10-NEXT:    v_lshrrev_b32_e32 v25, 16, v5
-; GFX10-NEXT:    v_lshrrev_b32_e32 v26, 16, v13
-; GFX10-NEXT:    v_lshrrev_b32_e32 v28, 16, v8
-; GFX10-NEXT:    v_lshrrev_b32_e32 v29, 16, v16
-; GFX10-NEXT:    v_lshrrev_b32_e32 v30, 16, v6
-; GFX10-NEXT:    v_lshrrev_b32_e32 v31, 16, v14
-; GFX10-NEXT:    v_cndmask_b32_e32 v8, v16, v8, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v16, v0, v27, vcc_lo
+; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 16, v2
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v9, v1, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v9, v18, v17, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, v10, v2, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v10, v19, v0, vcc_lo
+; GFX10-NEXT:    v_lshrrev_b32_e32 v17, 16, v3
+; GFX10-NEXT:    v_lshrrev_b32_e32 v18, 16, v11
+; GFX10-NEXT:    v_perm_b32 v0, v9, v1, 0x5040100
+; GFX10-NEXT:    v_lshrrev_b32_e32 v9, 16, v4
+; GFX10-NEXT:    v_perm_b32 v1, v10, v2, 0x5040100
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, v11, v3, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v3, v18, v17, vcc_lo
+; GFX10-NEXT:    v_lshrrev_b32_e32 v10, 16, v12
+; GFX10-NEXT:    v_lshrrev_b32_e32 v11, 16, v5
+; GFX10-NEXT:    v_lshrrev_b32_e32 v17, 16, v13
 ; GFX10-NEXT:    v_cndmask_b32_e32 v5, v13, v5, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v22, v21, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v13, v18, v17, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v17, v20, v19, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v28, v29, v28, vcc_lo
+; GFX10-NEXT:    v_perm_b32 v2, v3, v2, 0x5040100
+; GFX10-NEXT:    v_cndmask_b32_e32 v3, v12, v4, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v4, v10, v9, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v9, v17, v11, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v10, v14, v6, vcc_lo
+; GFX10-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
+; GFX10-NEXT:    v_lshrrev_b32_e32 v11, 16, v14
+; GFX10-NEXT:    v_lshrrev_b32_e32 v12, 16, v7
+; GFX10-NEXT:    v_lshrrev_b32_e32 v13, 16, v15
+; GFX10-NEXT:    v_lshrrev_b32_e32 v14, 16, v8
+; GFX10-NEXT:    v_lshrrev_b32_e32 v17, 16, v16
+; GFX10-NEXT:    v_cndmask_b32_e32 v6, v11, v6, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e32 v7, v15, v7, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v15, v31, v30, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v6, v14, v6, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v14, v26, v25, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v3, v11, v3, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v2, v10, v2, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v9, v1, vcc_lo
-; GFX10-NEXT:    v_lshlrev_b32_e32 v9, 16, v13
-; GFX10-NEXT:    v_lshlrev_b32_e32 v10, 16, v17
-; GFX10-NEXT:    v_lshlrev_b32_e32 v11, 16, v0
-; GFX10-NEXT:    v_cndmask_b32_e32 v13, v24, v23, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v4, v12, v4, vcc_lo
-; GFX10-NEXT:    v_or_b32_sdwa v0, v1, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX10-NEXT:    v_or_b32_sdwa v1, v2, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX10-NEXT:    v_or_b32_sdwa v2, v3, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v13
-; GFX10-NEXT:    v_lshlrev_b32_e32 v9, 16, v14
-; GFX10-NEXT:    v_lshlrev_b32_e32 v10, 16, v15
-; GFX10-NEXT:    v_lshlrev_b32_e32 v11, 16, v16
-; GFX10-NEXT:    v_lshlrev_b32_e32 v12, 16, v28
-; GFX10-NEXT:    v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX10-NEXT:    v_or_b32_sdwa v4, v5, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX10-NEXT:    v_or_b32_sdwa v5, v6, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX10-NEXT:    v_or_b32_sdwa v6, v7, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX10-NEXT:    v_or_b32_sdwa v7, v8, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-NEXT:    v_cndmask_b32_e32 v11, v13, v12, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v8, v16, v8, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v12, v17, v14, vcc_lo
+; GFX10-NEXT:    v_perm_b32 v3, v4, v3, 0x5040100
+; GFX10-NEXT:    v_perm_b32 v4, v9, v5, 0x5040100
+; GFX10-NEXT:    v_perm_b32 v5, v6, v10, 0x5040100
+; GFX10-NEXT:    v_perm_b32 v6, v11, v7, 0x5040100
+; GFX10-NEXT:    v_perm_b32 v7, v12, v8, 0x5040100
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_select_v16bf16:
@@ -27894,55 +26524,45 @@ define <16 x bfloat> @v_select_v16bf16(i1 %cond, <16 x bfloat> %a, <16 x bfloat>
 ; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v17, 16, v1
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v18, 16, v9
-; GFX11-NEXT:    v_lshrrev_b32_e32 v27, 16, v7
-; GFX11-NEXT:    v_lshrrev_b32_e32 v19, 16, v2
+; GFX11-NEXT:    v_lshrrev_b32_e32 v19, 16, v10
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4)
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v15
-; GFX11-NEXT:    v_lshrrev_b32_e32 v20, 16, v10
-; GFX11-NEXT:    v_lshrrev_b32_e32 v21, 16, v3
-; GFX11-NEXT:    v_lshrrev_b32_e32 v22, 16, v11
-; GFX11-NEXT:    v_cndmask_b32_e32 v7, v15, v7, vcc_lo
-; GFX11-NEXT:    v_lshrrev_b32_e32 v28, 16, v8
-; GFX11-NEXT:    v_lshrrev_b32_e32 v29, 16, v16
-; GFX11-NEXT:    v_cndmask_b32_e32 v8, v16, v8, vcc_lo
-; GFX11-NEXT:    v_dual_cndmask_b32 v16, v0, v27 :: v_dual_cndmask_b32 v1, v9, v1
+; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v2
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, v9, v1, vcc_lo
 ; GFX11-NEXT:    v_cndmask_b32_e32 v9, v18, v17, vcc_lo
-; GFX11-NEXT:    v_lshrrev_b32_e32 v23, 16, v4
-; GFX11-NEXT:    v_lshrrev_b32_e32 v24, 16, v12
-; GFX11-NEXT:    v_lshrrev_b32_e32 v25, 16, v5
-; GFX11-NEXT:    v_lshrrev_b32_e32 v26, 16, v13
-; GFX11-NEXT:    v_lshrrev_b32_e32 v30, 16, v6
-; GFX11-NEXT:    v_lshrrev_b32_e32 v31, 16, v14
-; GFX11-NEXT:    v_dual_cndmask_b32 v6, v14, v6 :: v_dual_cndmask_b32 v3, v11, v3
-; GFX11-NEXT:    v_dual_cndmask_b32 v2, v10, v2 :: v_dual_and_b32 v1, 0xffff, v1
-; GFX11-NEXT:    v_dual_cndmask_b32 v10, v20, v19 :: v_dual_lshlrev_b32 v9, 16, v9
-; GFX11-NEXT:    v_cndmask_b32_e32 v4, v12, v4, vcc_lo
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v22, v21, vcc_lo
-; GFX11-NEXT:    v_dual_cndmask_b32 v28, v29, v28 :: v_dual_cndmask_b32 v15, v31, v30
-; GFX11-NEXT:    v_dual_cndmask_b32 v14, v26, v25 :: v_dual_and_b32 v3, 0xffff, v3
-; GFX11-NEXT:    v_dual_cndmask_b32 v5, v13, v5 :: v_dual_and_b32 v2, 0xffff, v2
-; GFX11-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
-; GFX11-NEXT:    v_dual_cndmask_b32 v12, v24, v23 :: v_dual_lshlrev_b32 v11, 16, v0
-; GFX11-NEXT:    v_or_b32_e32 v0, v1, v9
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff, v5
-; GFX11-NEXT:    v_or_b32_e32 v1, v2, v10
+; GFX11-NEXT:    v_cndmask_b32_e32 v2, v10, v2, vcc_lo
+; GFX11-NEXT:    v_cndmask_b32_e32 v10, v19, v0, vcc_lo
+; GFX11-NEXT:    v_lshrrev_b32_e32 v17, 16, v3
+; GFX11-NEXT:    v_lshrrev_b32_e32 v18, 16, v11
+; GFX11-NEXT:    v_perm_b32 v0, v9, v1, 0x5040100
+; GFX11-NEXT:    v_lshrrev_b32_e32 v9, 16, v4
+; GFX11-NEXT:    v_perm_b32 v1, v10, v2, 0x5040100
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT:    v_or_b32_e32 v2, v3, v11
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff, v4
-; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 16, v12
-; GFX11-NEXT:    v_lshlrev_b32_e32 v9, 16, v14
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xffff, v6
-; GFX11-NEXT:    v_lshlrev_b32_e32 v10, 16, v15
-; GFX11-NEXT:    v_and_b32_e32 v7, 0xffff, v7
-; GFX11-NEXT:    v_lshlrev_b32_e32 v11, 16, v16
-; GFX11-NEXT:    v_and_b32_e32 v8, 0xffff, v8
-; GFX11-NEXT:    v_lshlrev_b32_e32 v12, 16, v28
-; GFX11-NEXT:    v_or_b32_e32 v3, v3, v4
-; GFX11-NEXT:    v_or_b32_e32 v4, v5, v9
-; GFX11-NEXT:    v_or_b32_e32 v5, v6, v10
-; GFX11-NEXT:    v_or_b32_e32 v6, v7, v11
-; GFX11-NEXT:    v_or_b32_e32 v7, v8, v12
+; GFX11-NEXT:    v_dual_cndmask_b32 v2, v11, v3 :: v_dual_cndmask_b32 v3, v18, v17
+; GFX11-NEXT:    v_lshrrev_b32_e32 v10, 16, v12
+; GFX11-NEXT:    v_lshrrev_b32_e32 v11, 16, v5
+; GFX11-NEXT:    v_lshrrev_b32_e32 v17, 16, v13
+; GFX11-NEXT:    v_cndmask_b32_e32 v5, v13, v5, vcc_lo
+; GFX11-NEXT:    v_perm_b32 v2, v3, v2, 0x5040100
+; GFX11-NEXT:    v_dual_cndmask_b32 v3, v12, v4 :: v_dual_cndmask_b32 v4, v10, v9
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-NEXT:    v_dual_cndmask_b32 v9, v17, v11 :: v_dual_cndmask_b32 v10, v14, v6
+; GFX11-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
+; GFX11-NEXT:    v_lshrrev_b32_e32 v11, 16, v14
+; GFX11-NEXT:    v_lshrrev_b32_e32 v12, 16, v7
+; GFX11-NEXT:    v_lshrrev_b32_e32 v13, 16, v15
+; GFX11-NEXT:    v_lshrrev_b32_e32 v14, 16, v8
+; GFX11-NEXT:    v_lshrrev_b32_e32 v17, 16, v16
+; GFX11-NEXT:    v_cndmask_b32_e32 v6, v11, v6, vcc_lo
+; GFX11-NEXT:    v_dual_cndmask_b32 v7, v15, v7 :: v_dual_cndmask_b32 v8, v16, v8
+; GFX11-NEXT:    v_cndmask_b32_e32 v11, v13, v12, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-NEXT:    v_cndmask_b32_e32 v12, v17, v14, vcc_lo
+; GFX11-NEXT:    v_perm_b32 v3, v4, v3, 0x5040100
+; GFX11-NEXT:    v_perm_b32 v4, v9, v5, 0x5040100
+; GFX11-NEXT:    v_perm_b32 v5, v6, v10, 0x5040100
+; GFX11-NEXT:    v_perm_b32 v6, v11, v7, 0x5040100
+; GFX11-NEXT:    v_perm_b32 v7, v12, v8, 0x5040100
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = select i1 %cond, <16 x bfloat> %a, <16 x bfloat> %b
   ret <16 x bfloat> %op
@@ -28360,106 +26980,106 @@ define <32 x bfloat> @v_select_v32bf16(i1 %cond, <32 x bfloat> %a, <32 x bfloat>
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX8-NEXT:    v_lshrrev_b32_e32 v31, 16, v14
-; GFX8-NEXT:    v_lshrrev_b32_e32 v32, 16, v30
-; GFX8-NEXT:    v_cndmask_b32_e32 v14, v30, v14, vcc
-; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v13
-; GFX8-NEXT:    v_lshrrev_b32_e32 v30, 16, v29
-; GFX8-NEXT:    v_cndmask_b32_e32 v30, v30, v0, vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v13, v29, v13, vcc
-; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v12
-; GFX8-NEXT:    v_lshrrev_b32_e32 v29, 16, v28
-; GFX8-NEXT:    v_cndmask_b32_e32 v29, v29, v0, vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v12, v28, v12, vcc
-; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v11
-; GFX8-NEXT:    v_lshrrev_b32_e32 v28, 16, v27
-; GFX8-NEXT:    v_cndmask_b32_e32 v28, v28, v0, vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v11, v27, v11, vcc
-; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v10
-; GFX8-NEXT:    v_lshrrev_b32_e32 v27, 16, v26
-; GFX8-NEXT:    v_cndmask_b32_e32 v27, v27, v0, vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v10, v26, v10, vcc
-; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v9
-; GFX8-NEXT:    v_lshrrev_b32_e32 v26, 16, v25
-; GFX8-NEXT:    v_cndmask_b32_e32 v26, v26, v0, vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v9, v25, v9, vcc
-; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v8
-; GFX8-NEXT:    v_lshrrev_b32_e32 v25, 16, v24
-; GFX8-NEXT:    v_cndmask_b32_e32 v25, v25, v0, vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v8, v24, v8, vcc
-; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v7
-; GFX8-NEXT:    v_lshrrev_b32_e32 v24, 16, v23
-; GFX8-NEXT:    v_cndmask_b32_e32 v24, v24, v0, vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v7, v23, v7, vcc
-; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v6
-; GFX8-NEXT:    v_lshrrev_b32_e32 v23, 16, v22
-; GFX8-NEXT:    v_cndmask_b32_e32 v23, v23, v0, vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v6, v22, v6, vcc
-; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v5
-; GFX8-NEXT:    v_lshrrev_b32_e32 v22, 16, v21
-; GFX8-NEXT:    v_cndmask_b32_e32 v31, v32, v31, vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v22, v22, v0, vcc
-; GFX8-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:4
-; GFX8-NEXT:    buffer_load_dword v32, off, s[0:3], s32
-; GFX8-NEXT:    v_lshrrev_b32_e32 v33, 16, v16
-; GFX8-NEXT:    v_cndmask_b32_e32 v5, v21, v5, vcc
-; GFX8-NEXT:    v_lshrrev_b32_e32 v21, 16, v20
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v17, v1, vcc
+; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX8-NEXT:    v_lshrrev_b32_e32 v17, 16, v17
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v17, v1, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v18, v2, vcc
+; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX8-NEXT:    v_lshrrev_b32_e32 v17, 16, v18
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v17, v2, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX8-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v19, v3, vcc
+; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX8-NEXT:    v_lshrrev_b32_e32 v17, 16, v19
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v17, v3, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX8-NEXT:    v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v20, v4, vcc
+; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
+; GFX8-NEXT:    v_lshrrev_b32_e32 v17, 16, v20
+; GFX8-NEXT:    v_cndmask_b32_e32 v4, v17, v4, vcc
+; GFX8-NEXT:    buffer_load_dword v17, off, s[0:3], s32
+; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; GFX8-NEXT:    v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    v_cndmask_b32_e32 v4, v21, v5, vcc
+; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
+; GFX8-NEXT:    v_lshrrev_b32_e32 v18, 16, v21
+; GFX8-NEXT:    v_cndmask_b32_e32 v5, v18, v5, vcc
+; GFX8-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:4
+; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX8-NEXT:    v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    v_cndmask_b32_e32 v5, v22, v6, vcc
+; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
+; GFX8-NEXT:    v_lshrrev_b32_e32 v19, 16, v22
+; GFX8-NEXT:    v_cndmask_b32_e32 v6, v19, v6, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
+; GFX8-NEXT:    v_or_b32_sdwa v5, v5, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    v_cndmask_b32_e32 v6, v23, v7, vcc
+; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
+; GFX8-NEXT:    v_lshrrev_b32_e32 v19, 16, v23
+; GFX8-NEXT:    v_cndmask_b32_e32 v7, v19, v7, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
+; GFX8-NEXT:    v_or_b32_sdwa v6, v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    v_cndmask_b32_e32 v7, v24, v8, vcc
+; GFX8-NEXT:    v_lshrrev_b32_e32 v8, 16, v8
+; GFX8-NEXT:    v_lshrrev_b32_e32 v19, 16, v24
+; GFX8-NEXT:    v_cndmask_b32_e32 v8, v19, v8, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
+; GFX8-NEXT:    v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    v_cndmask_b32_e32 v8, v25, v9, vcc
+; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 16, v9
+; GFX8-NEXT:    v_lshrrev_b32_e32 v19, 16, v25
+; GFX8-NEXT:    v_cndmask_b32_e32 v9, v19, v9, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
+; GFX8-NEXT:    v_or_b32_sdwa v8, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    v_cndmask_b32_e32 v9, v26, v10, vcc
+; GFX8-NEXT:    v_lshrrev_b32_e32 v10, 16, v10
+; GFX8-NEXT:    v_lshrrev_b32_e32 v19, 16, v26
+; GFX8-NEXT:    v_cndmask_b32_e32 v10, v19, v10, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
+; GFX8-NEXT:    v_or_b32_sdwa v9, v9, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    v_cndmask_b32_e32 v10, v27, v11, vcc
+; GFX8-NEXT:    v_lshrrev_b32_e32 v11, 16, v11
+; GFX8-NEXT:    v_lshrrev_b32_e32 v19, 16, v27
+; GFX8-NEXT:    v_cndmask_b32_e32 v11, v19, v11, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
+; GFX8-NEXT:    v_or_b32_sdwa v10, v10, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    v_cndmask_b32_e32 v11, v28, v12, vcc
+; GFX8-NEXT:    v_lshrrev_b32_e32 v12, 16, v12
+; GFX8-NEXT:    v_lshrrev_b32_e32 v19, 16, v28
+; GFX8-NEXT:    v_cndmask_b32_e32 v12, v19, v12, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
+; GFX8-NEXT:    v_or_b32_sdwa v11, v11, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    v_cndmask_b32_e32 v12, v29, v13, vcc
+; GFX8-NEXT:    v_lshrrev_b32_e32 v13, 16, v13
+; GFX8-NEXT:    v_lshrrev_b32_e32 v19, 16, v29
+; GFX8-NEXT:    v_cndmask_b32_e32 v13, v19, v13, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
+; GFX8-NEXT:    v_or_b32_sdwa v12, v12, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    v_cndmask_b32_e32 v13, v30, v14, vcc
+; GFX8-NEXT:    v_lshrrev_b32_e32 v14, 16, v14
+; GFX8-NEXT:    v_lshrrev_b32_e32 v19, 16, v30
+; GFX8-NEXT:    v_cndmask_b32_e32 v14, v19, v14, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
+; GFX8-NEXT:    v_or_b32_sdwa v13, v13, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT:    s_waitcnt vmcnt(1)
-; GFX8-NEXT:    v_cndmask_b32_e32 v16, v0, v16, vcc
-; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX8-NEXT:    v_cndmask_b32_e32 v33, v0, v33, vcc
-; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v15
+; GFX8-NEXT:    v_cndmask_b32_e32 v14, v17, v15, vcc
+; GFX8-NEXT:    v_lshrrev_b32_e32 v15, 16, v15
+; GFX8-NEXT:    v_lshrrev_b32_e32 v17, 16, v17
+; GFX8-NEXT:    v_cndmask_b32_e32 v15, v17, v15, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
+; GFX8-NEXT:    v_or_b32_sdwa v14, v14, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_cndmask_b32_e32 v15, v32, v15, vcc
-; GFX8-NEXT:    v_lshrrev_b32_e32 v32, 16, v32
-; GFX8-NEXT:    v_cndmask_b32_e32 v32, v32, v0, vcc
-; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v4
-; GFX8-NEXT:    v_cndmask_b32_e32 v21, v21, v0, vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v4, v20, v4, vcc
-; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v3
-; GFX8-NEXT:    v_lshrrev_b32_e32 v20, 16, v19
-; GFX8-NEXT:    v_cndmask_b32_e32 v20, v20, v0, vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v3, v19, v3, vcc
-; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v2
-; GFX8-NEXT:    v_lshrrev_b32_e32 v19, 16, v18
-; GFX8-NEXT:    v_cndmask_b32_e32 v19, v19, v0, vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v2, v18, v2, vcc
-; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v1
-; GFX8-NEXT:    v_lshrrev_b32_e32 v18, 16, v17
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v18, v0, vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v17, v1, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX8-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v19
-; GFX8-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v20
-; GFX8-NEXT:    v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v21
-; GFX8-NEXT:    v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v22
-; GFX8-NEXT:    v_or_b32_sdwa v4, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v23
-; GFX8-NEXT:    v_or_b32_sdwa v5, v6, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v24
-; GFX8-NEXT:    v_or_b32_sdwa v6, v7, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX8-NEXT:    v_lshlrev_b32_e32 v7, 16, v25
-; GFX8-NEXT:    v_or_b32_sdwa v7, v8, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX8-NEXT:    v_lshlrev_b32_e32 v8, 16, v26
-; GFX8-NEXT:    v_or_b32_sdwa v8, v9, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 16, v27
-; GFX8-NEXT:    v_or_b32_sdwa v9, v10, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX8-NEXT:    v_lshlrev_b32_e32 v10, 16, v28
-; GFX8-NEXT:    v_or_b32_sdwa v10, v11, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX8-NEXT:    v_lshlrev_b32_e32 v11, 16, v29
-; GFX8-NEXT:    v_or_b32_sdwa v11, v12, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX8-NEXT:    v_lshlrev_b32_e32 v12, 16, v30
-; GFX8-NEXT:    v_or_b32_sdwa v12, v13, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX8-NEXT:    v_lshlrev_b32_e32 v13, 16, v31
-; GFX8-NEXT:    v_or_b32_sdwa v13, v14, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX8-NEXT:    v_lshlrev_b32_e32 v14, 16, v32
-; GFX8-NEXT:    v_or_b32_sdwa v14, v15, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX8-NEXT:    v_lshlrev_b32_e32 v15, 16, v33
-; GFX8-NEXT:    v_or_b32_sdwa v15, v16, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    v_cndmask_b32_e32 v15, v18, v16, vcc
+; GFX8-NEXT:    v_lshrrev_b32_e32 v16, 16, v16
+; GFX8-NEXT:    v_lshrrev_b32_e32 v17, 16, v18
+; GFX8-NEXT:    v_cndmask_b32_e32 v16, v17, v16, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v16, 16, v16
+; GFX8-NEXT:    v_or_b32_sdwa v15, v15, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_select_v32bf16:
@@ -28467,106 +27087,91 @@ define <32 x bfloat> @v_select_v32bf16(i1 %cond, <32 x bfloat> %a, <32 x bfloat>
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX9-NEXT:    v_lshrrev_b32_e32 v31, 16, v14
-; GFX9-NEXT:    v_lshrrev_b32_e32 v32, 16, v30
-; GFX9-NEXT:    v_cndmask_b32_e32 v14, v30, v14, vcc
-; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v13
-; GFX9-NEXT:    v_lshrrev_b32_e32 v30, 16, v29
-; GFX9-NEXT:    v_cndmask_b32_e32 v30, v30, v0, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v13, v29, v13, vcc
-; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v12
-; GFX9-NEXT:    v_lshrrev_b32_e32 v29, 16, v28
-; GFX9-NEXT:    v_cndmask_b32_e32 v29, v29, v0, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v12, v28, v12, vcc
-; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v11
-; GFX9-NEXT:    v_lshrrev_b32_e32 v28, 16, v27
-; GFX9-NEXT:    v_cndmask_b32_e32 v28, v28, v0, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v11, v27, v11, vcc
-; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v10
-; GFX9-NEXT:    v_lshrrev_b32_e32 v27, 16, v26
-; GFX9-NEXT:    v_cndmask_b32_e32 v27, v27, v0, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v10, v26, v10, vcc
-; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v9
-; GFX9-NEXT:    v_lshrrev_b32_e32 v26, 16, v25
-; GFX9-NEXT:    v_cndmask_b32_e32 v26, v26, v0, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v9, v25, v9, vcc
-; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v8
-; GFX9-NEXT:    v_lshrrev_b32_e32 v25, 16, v24
-; GFX9-NEXT:    v_cndmask_b32_e32 v25, v25, v0, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v8, v24, v8, vcc
-; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v7
-; GFX9-NEXT:    v_lshrrev_b32_e32 v24, 16, v23
-; GFX9-NEXT:    v_cndmask_b32_e32 v24, v24, v0, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v7, v23, v7, vcc
-; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v6
-; GFX9-NEXT:    v_lshrrev_b32_e32 v23, 16, v22
-; GFX9-NEXT:    v_cndmask_b32_e32 v23, v23, v0, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v6, v22, v6, vcc
-; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v5
-; GFX9-NEXT:    v_lshrrev_b32_e32 v22, 16, v21
-; GFX9-NEXT:    v_cndmask_b32_e32 v31, v32, v31, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v22, v22, v0, vcc
-; GFX9-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:4
-; GFX9-NEXT:    buffer_load_dword v32, off, s[0:3], s32
-; GFX9-NEXT:    v_lshrrev_b32_e32 v33, 16, v16
-; GFX9-NEXT:    v_cndmask_b32_e32 v5, v21, v5, vcc
-; GFX9-NEXT:    v_lshrrev_b32_e32 v21, 16, v20
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v17, v1, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v17, 16, v17
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v17, v1, vcc
+; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
+; GFX9-NEXT:    v_perm_b32 v0, v1, v0, s4
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v18, v2, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX9-NEXT:    v_lshrrev_b32_e32 v17, 16, v18
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v17, v2, vcc
+; GFX9-NEXT:    v_perm_b32 v1, v2, v1, s4
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v19, v3, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX9-NEXT:    v_lshrrev_b32_e32 v17, 16, v19
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v17, v3, vcc
+; GFX9-NEXT:    v_perm_b32 v2, v3, v2, s4
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v20, v4, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
+; GFX9-NEXT:    v_lshrrev_b32_e32 v17, 16, v20
+; GFX9-NEXT:    v_cndmask_b32_e32 v4, v17, v4, vcc
+; GFX9-NEXT:    buffer_load_dword v17, off, s[0:3], s32
+; GFX9-NEXT:    v_perm_b32 v3, v4, v3, s4
+; GFX9-NEXT:    v_cndmask_b32_e32 v4, v21, v5, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
+; GFX9-NEXT:    v_lshrrev_b32_e32 v18, 16, v21
+; GFX9-NEXT:    v_cndmask_b32_e32 v5, v18, v5, vcc
+; GFX9-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:4
+; GFX9-NEXT:    v_perm_b32 v4, v5, v4, s4
+; GFX9-NEXT:    v_cndmask_b32_e32 v5, v22, v6, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
+; GFX9-NEXT:    v_lshrrev_b32_e32 v19, 16, v22
+; GFX9-NEXT:    v_cndmask_b32_e32 v6, v19, v6, vcc
+; GFX9-NEXT:    v_perm_b32 v5, v6, v5, s4
+; GFX9-NEXT:    v_cndmask_b32_e32 v6, v23, v7, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
+; GFX9-NEXT:    v_lshrrev_b32_e32 v19, 16, v23
+; GFX9-NEXT:    v_cndmask_b32_e32 v7, v19, v7, vcc
+; GFX9-NEXT:    v_perm_b32 v6, v7, v6, s4
+; GFX9-NEXT:    v_cndmask_b32_e32 v7, v24, v8, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v8, 16, v8
+; GFX9-NEXT:    v_lshrrev_b32_e32 v19, 16, v24
+; GFX9-NEXT:    v_cndmask_b32_e32 v8, v19, v8, vcc
+; GFX9-NEXT:    v_perm_b32 v7, v8, v7, s4
+; GFX9-NEXT:    v_cndmask_b32_e32 v8, v25, v9, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v9, 16, v9
+; GFX9-NEXT:    v_lshrrev_b32_e32 v19, 16, v25
+; GFX9-NEXT:    v_cndmask_b32_e32 v9, v19, v9, vcc
+; GFX9-NEXT:    v_perm_b32 v8, v9, v8, s4
+; GFX9-NEXT:    v_cndmask_b32_e32 v9, v26, v10, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v10, 16, v10
+; GFX9-NEXT:    v_lshrrev_b32_e32 v19, 16, v26
+; GFX9-NEXT:    v_cndmask_b32_e32 v10, v19, v10, vcc
+; GFX9-NEXT:    v_perm_b32 v9, v10, v9, s4
+; GFX9-NEXT:    v_cndmask_b32_e32 v10, v27, v11, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v11, 16, v11
+; GFX9-NEXT:    v_lshrrev_b32_e32 v19, 16, v27
+; GFX9-NEXT:    v_cndmask_b32_e32 v11, v19, v11, vcc
+; GFX9-NEXT:    v_perm_b32 v10, v11, v10, s4
+; GFX9-NEXT:    v_cndmask_b32_e32 v11, v28, v12, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v12, 16, v12
+; GFX9-NEXT:    v_lshrrev_b32_e32 v19, 16, v28
+; GFX9-NEXT:    v_cndmask_b32_e32 v12, v19, v12, vcc
+; GFX9-NEXT:    v_perm_b32 v11, v12, v11, s4
+; GFX9-NEXT:    v_cndmask_b32_e32 v12, v29, v13, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v13, 16, v13
+; GFX9-NEXT:    v_lshrrev_b32_e32 v19, 16, v29
+; GFX9-NEXT:    v_cndmask_b32_e32 v13, v19, v13, vcc
+; GFX9-NEXT:    v_perm_b32 v12, v13, v12, s4
+; GFX9-NEXT:    v_cndmask_b32_e32 v13, v30, v14, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v14, 16, v14
+; GFX9-NEXT:    v_lshrrev_b32_e32 v19, 16, v30
+; GFX9-NEXT:    v_cndmask_b32_e32 v14, v19, v14, vcc
+; GFX9-NEXT:    v_perm_b32 v13, v14, v13, s4
 ; GFX9-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-NEXT:    v_cndmask_b32_e32 v16, v0, v16, vcc
-; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX9-NEXT:    v_cndmask_b32_e32 v33, v0, v33, vcc
-; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v15
+; GFX9-NEXT:    v_cndmask_b32_e32 v14, v17, v15, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v15, 16, v15
+; GFX9-NEXT:    v_lshrrev_b32_e32 v17, 16, v17
+; GFX9-NEXT:    v_cndmask_b32_e32 v15, v17, v15, vcc
+; GFX9-NEXT:    v_perm_b32 v14, v15, v14, s4
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_cndmask_b32_e32 v15, v32, v15, vcc
-; GFX9-NEXT:    v_lshrrev_b32_e32 v32, 16, v32
-; GFX9-NEXT:    v_cndmask_b32_e32 v32, v32, v0, vcc
-; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v4
-; GFX9-NEXT:    v_cndmask_b32_e32 v21, v21, v0, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v4, v20, v4, vcc
-; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v3
-; GFX9-NEXT:    v_lshrrev_b32_e32 v20, 16, v19
-; GFX9-NEXT:    v_cndmask_b32_e32 v20, v20, v0, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v3, v19, v3, vcc
-; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v2
-; GFX9-NEXT:    v_lshrrev_b32_e32 v19, 16, v18
-; GFX9-NEXT:    v_cndmask_b32_e32 v19, v19, v0, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, v18, v2, vcc
-; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v1
-; GFX9-NEXT:    v_lshrrev_b32_e32 v18, 16, v17
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v18, v0, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v17, v1, vcc
-; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX9-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v19
-; GFX9-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v20
-; GFX9-NEXT:    v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 16, v21
-; GFX9-NEXT:    v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 16, v22
-; GFX9-NEXT:    v_or_b32_sdwa v4, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 16, v23
-; GFX9-NEXT:    v_or_b32_sdwa v5, v6, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX9-NEXT:    v_lshlrev_b32_e32 v6, 16, v24
-; GFX9-NEXT:    v_or_b32_sdwa v6, v7, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX9-NEXT:    v_lshlrev_b32_e32 v7, 16, v25
-; GFX9-NEXT:    v_or_b32_sdwa v7, v8, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX9-NEXT:    v_lshlrev_b32_e32 v8, 16, v26
-; GFX9-NEXT:    v_or_b32_sdwa v8, v9, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX9-NEXT:    v_lshlrev_b32_e32 v9, 16, v27
-; GFX9-NEXT:    v_or_b32_sdwa v9, v10, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX9-NEXT:    v_lshlrev_b32_e32 v10, 16, v28
-; GFX9-NEXT:    v_or_b32_sdwa v10, v11, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX9-NEXT:    v_lshlrev_b32_e32 v11, 16, v29
-; GFX9-NEXT:    v_or_b32_sdwa v11, v12, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX9-NEXT:    v_lshlrev_b32_e32 v12, 16, v30
-; GFX9-NEXT:    v_or_b32_sdwa v12, v13, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX9-NEXT:    v_lshlrev_b32_e32 v13, 16, v31
-; GFX9-NEXT:    v_or_b32_sdwa v13, v14, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX9-NEXT:    v_lshlrev_b32_e32 v14, 16, v32
-; GFX9-NEXT:    v_or_b32_sdwa v14, v15, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX9-NEXT:    v_lshlrev_b32_e32 v15, 16, v33
-; GFX9-NEXT:    v_or_b32_sdwa v15, v16, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT:    v_cndmask_b32_e32 v15, v18, v16, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v16, 16, v16
+; GFX9-NEXT:    v_lshrrev_b32_e32 v17, 16, v18
+; GFX9-NEXT:    v_cndmask_b32_e32 v16, v17, v16, vcc
+; GFX9-NEXT:    v_perm_b32 v15, v16, v15, s4
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_select_v32bf16:
@@ -28576,19 +27181,10 @@ define <32 x bfloat> @v_select_v32bf16(i1 %cond, <32 x bfloat> %a, <32 x bfloat>
 ; GFX10-NEXT:    buffer_load_dword v31, off, s[0:3], s32
 ; GFX10-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:4
 ; GFX10-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX10-NEXT:    v_lshrrev_b32_e32 v67, 16, v13
-; GFX10-NEXT:    v_lshrrev_b32_e32 v68, 16, v29
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v33, 16, v1
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v34, 16, v17
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v35, 16, v2
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v36, 16, v18
-; GFX10-NEXT:    v_lshrrev_b32_e32 v64, 16, v14
-; GFX10-NEXT:    v_lshrrev_b32_e32 v65, 16, v30
-; GFX10-NEXT:    v_cndmask_b32_e32 v67, v68, v67, vcc_lo
-; GFX10-NEXT:    v_lshrrev_b32_e32 v68, 16, v11
-; GFX10-NEXT:    v_cndmask_b32_e32 v13, v29, v13, vcc_lo
-; GFX10-NEXT:    v_lshrrev_b32_e32 v29, 16, v27
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v37, 16, v3
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v38, 16, v19
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v39, 16, v4
@@ -28599,96 +27195,103 @@ define <32 x bfloat> @v_select_v32bf16(i1 %cond, <32 x bfloat> %a, <32 x bfloat>
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v52, 16, v22
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v53, 16, v7
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v54, 16, v23
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v55, 16, v8
-; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 16, v24
-; GFX10-NEXT:    v_lshrrev_b32_e32 v66, 16, v9
-; GFX10-NEXT:    v_cndmask_b32_e32 v64, v65, v64, vcc_lo
-; GFX10-NEXT:    v_lshrrev_b32_e32 v65, 16, v12
-; GFX10-NEXT:    v_cndmask_b32_e32 v14, v30, v14, vcc_lo
-; GFX10-NEXT:    v_lshrrev_b32_e32 v30, 16, v28
-; GFX10-NEXT:    v_cndmask_b32_e32 v29, v29, v68, vcc_lo
-; GFX10-NEXT:    v_lshrrev_b32_e32 v68, 16, v25
-; GFX10-NEXT:    v_cndmask_b32_e32 v9, v25, v9, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v25, v34, v33, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v33, v36, v35, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v30, v30, v65, vcc_lo
-; GFX10-NEXT:    v_lshrrev_b32_e32 v65, 16, v10
-; GFX10-NEXT:    v_cndmask_b32_e32 v12, v28, v12, vcc_lo
-; GFX10-NEXT:    v_lshrrev_b32_e32 v28, 16, v26
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v55, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v8, v24, v8, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v24, v54, v53, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v7, v23, v7, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v23, v52, v51, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v6, v22, v6, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v22, v50, v49, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v5, v21, v5, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v21, v48, v39, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v4, v20, v4, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v20, v38, v37, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v2, v18, v2, vcc_lo
+; GFX10-NEXT:    v_lshrrev_b32_e32 v64, 16, v24
+; GFX10-NEXT:    v_lshrrev_b32_e32 v65, 16, v9
+; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 16, v25
+; GFX10-NEXT:    v_lshrrev_b32_e32 v66, 16, v10
+; GFX10-NEXT:    v_lshrrev_b32_e32 v67, 16, v26
+; GFX10-NEXT:    v_lshrrev_b32_e32 v68, 16, v11
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v17, v1, vcc_lo
-; GFX10-NEXT:    v_lshlrev_b32_e32 v17, 16, v25
-; GFX10-NEXT:    v_lshlrev_b32_e32 v18, 16, v33
-; GFX10-NEXT:    v_cndmask_b32_e32 v11, v27, v11, vcc_lo
-; GFX10-NEXT:    v_lshrrev_b32_e32 v27, 16, v16
-; GFX10-NEXT:    v_cndmask_b32_e32 v28, v28, v65, vcc_lo
-; GFX10-NEXT:    v_lshrrev_b32_e32 v65, 16, v15
+; GFX10-NEXT:    v_lshrrev_b32_e32 v17, 16, v27
+; GFX10-NEXT:    v_cndmask_b32_e32 v33, v34, v33, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, v18, v2, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v35, v36, v35, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e32 v3, v19, v3, vcc_lo
-; GFX10-NEXT:    v_lshlrev_b32_e32 v19, 16, v20
-; GFX10-NEXT:    v_lshlrev_b32_e32 v20, 16, v21
-; GFX10-NEXT:    v_lshlrev_b32_e32 v21, 16, v22
-; GFX10-NEXT:    v_lshlrev_b32_e32 v22, 16, v23
-; GFX10-NEXT:    v_lshlrev_b32_e32 v23, 16, v24
-; GFX10-NEXT:    v_lshlrev_b32_e32 v24, 16, v0
-; GFX10-NEXT:    v_or_b32_sdwa v0, v1, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX10-NEXT:    v_or_b32_sdwa v1, v2, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-NEXT:    v_cndmask_b32_e32 v37, v38, v37, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v4, v20, v4, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v39, v48, v39, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v5, v21, v5, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v49, v50, v49, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v6, v22, v6, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v22, v52, v51, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v7, v23, v7, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v23, v54, v53, vcc_lo
+; GFX10-NEXT:    v_lshrrev_b32_e32 v34, 16, v12
+; GFX10-NEXT:    v_lshrrev_b32_e32 v18, 16, v28
+; GFX10-NEXT:    v_lshrrev_b32_e32 v36, 16, v13
+; GFX10-NEXT:    v_lshrrev_b32_e32 v19, 16, v29
+; GFX10-NEXT:    v_lshrrev_b32_e32 v38, 16, v14
+; GFX10-NEXT:    v_lshrrev_b32_e32 v20, 16, v30
+; GFX10-NEXT:    v_lshrrev_b32_e32 v48, 16, v15
+; GFX10-NEXT:    v_lshrrev_b32_e32 v21, 16, v16
+; GFX10-NEXT:    v_cndmask_b32_e32 v8, v24, v8, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v24, v64, v55, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v9, v25, v9, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v25, v0, v65, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e32 v10, v26, v10, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v26, v68, v66, vcc_lo
-; GFX10-NEXT:    v_lshlrev_b32_e32 v33, 16, v64
-; GFX10-NEXT:    v_or_b32_sdwa v2, v3, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX10-NEXT:    v_or_b32_sdwa v3, v4, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX10-NEXT:    v_or_b32_sdwa v4, v5, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX10-NEXT:    v_lshlrev_b32_e32 v25, 16, v26
-; GFX10-NEXT:    v_lshlrev_b32_e32 v26, 16, v28
-; GFX10-NEXT:    v_lshlrev_b32_e32 v28, 16, v29
-; GFX10-NEXT:    v_lshlrev_b32_e32 v29, 16, v30
-; GFX10-NEXT:    v_lshlrev_b32_e32 v30, 16, v67
-; GFX10-NEXT:    v_or_b32_sdwa v5, v6, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX10-NEXT:    v_or_b32_sdwa v6, v7, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX10-NEXT:    v_or_b32_sdwa v7, v8, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX10-NEXT:    v_or_b32_sdwa v8, v9, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX10-NEXT:    v_or_b32_sdwa v9, v10, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX10-NEXT:    v_or_b32_sdwa v10, v11, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX10-NEXT:    v_or_b32_sdwa v11, v12, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX10-NEXT:    v_or_b32_sdwa v12, v13, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX10-NEXT:    v_or_b32_sdwa v13, v14, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-NEXT:    v_cndmask_b32_e32 v26, v67, v66, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v11, v27, v11, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v17, v17, v68, vcc_lo
+; GFX10-NEXT:    v_perm_b32 v0, v33, v1, 0x5040100
+; GFX10-NEXT:    v_perm_b32 v1, v35, v2, 0x5040100
+; GFX10-NEXT:    v_perm_b32 v2, v37, v3, 0x5040100
+; GFX10-NEXT:    v_perm_b32 v3, v39, v4, 0x5040100
+; GFX10-NEXT:    v_perm_b32 v4, v49, v5, 0x5040100
+; GFX10-NEXT:    v_perm_b32 v5, v22, v6, 0x5040100
+; GFX10-NEXT:    v_perm_b32 v6, v23, v7, 0x5040100
+; GFX10-NEXT:    v_cndmask_b32_e32 v12, v28, v12, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v18, v18, v34, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v13, v29, v13, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v19, v19, v36, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v14, v30, v14, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v20, v20, v38, vcc_lo
+; GFX10-NEXT:    v_perm_b32 v7, v24, v8, 0x5040100
+; GFX10-NEXT:    v_perm_b32 v8, v25, v9, 0x5040100
+; GFX10-NEXT:    v_perm_b32 v9, v26, v10, 0x5040100
+; GFX10-NEXT:    v_perm_b32 v10, v17, v11, 0x5040100
+; GFX10-NEXT:    v_perm_b32 v11, v18, v12, 0x5040100
+; GFX10-NEXT:    v_perm_b32 v12, v19, v13, 0x5040100
+; GFX10-NEXT:    v_perm_b32 v13, v20, v14, 0x5040100
 ; GFX10-NEXT:    s_waitcnt vmcnt(1)
-; GFX10-NEXT:    v_lshrrev_b32_e32 v17, 16, v31
+; GFX10-NEXT:    v_lshrrev_b32_e32 v22, 16, v31
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_lshrrev_b32_e32 v18, 16, v32
-; GFX10-NEXT:    v_cndmask_b32_e32 v16, v32, v16, vcc_lo
+; GFX10-NEXT:    v_lshrrev_b32_e32 v23, 16, v32
 ; GFX10-NEXT:    v_cndmask_b32_e32 v15, v31, v15, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v17, v17, v65, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v18, v18, v27, vcc_lo
-; GFX10-NEXT:    v_lshlrev_b32_e32 v17, 16, v17
-; GFX10-NEXT:    v_lshlrev_b32_e32 v18, 16, v18
-; GFX10-NEXT:    v_or_b32_sdwa v14, v15, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX10-NEXT:    v_or_b32_sdwa v15, v16, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-NEXT:    v_cndmask_b32_e32 v16, v32, v16, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v17, v22, v48, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v21, v23, v21, vcc_lo
+; GFX10-NEXT:    v_perm_b32 v14, v17, v15, 0x5040100
+; GFX10-NEXT:    v_perm_b32 v15, v21, v16, 0x5040100
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_select_v32bf16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    scratch_load_b32 v31, off, s32 offset:4
-; GFX11-NEXT:    scratch_load_b32 v32, off, s32
+; GFX11-NEXT:    scratch_load_b32 v31, off, s32
+; GFX11-NEXT:    scratch_load_b32 v32, off, s32 offset:4
 ; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v33, 16, v1
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v34, 16, v17
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v35, 16, v2
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v36, 16, v18
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
+; GFX11-NEXT:    v_lshrrev_b32_e32 v69, 16, v11
+; GFX11-NEXT:    v_lshrrev_b32_e32 v70, 16, v27
+; GFX11-NEXT:    v_lshrrev_b32_e32 v71, 16, v12
+; GFX11-NEXT:    v_lshrrev_b32_e32 v80, 16, v28
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, v17, v1, vcc_lo
+; GFX11-NEXT:    v_lshrrev_b32_e32 v81, 16, v13
+; GFX11-NEXT:    v_lshrrev_b32_e32 v82, 16, v29
+; GFX11-NEXT:    v_lshrrev_b32_e32 v83, 16, v14
+; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v30
+; GFX11-NEXT:    v_cndmask_b32_e32 v17, v34, v33, vcc_lo
+; GFX11-NEXT:    v_cndmask_b32_e32 v2, v18, v2, vcc_lo
+; GFX11-NEXT:    v_cndmask_b32_e32 v18, v36, v35, vcc_lo
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v37, 16, v3
+; GFX11-NEXT:    v_cndmask_b32_e32 v3, v19, v3, vcc_lo
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v38, 16, v19
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v39, 16, v4
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v48, 16, v20
@@ -28704,92 +27307,46 @@ define <32 x bfloat> @v_select_v32bf16(i1 %cond, <32 x bfloat> %a, <32 x bfloat>
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v66, 16, v25
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v67, 16, v10
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v68, 16, v26
-; GFX11-NEXT:    v_lshrrev_b32_e32 v69, 16, v11
-; GFX11-NEXT:    v_lshrrev_b32_e32 v70, 16, v27
-; GFX11-NEXT:    v_lshrrev_b32_e32 v71, 16, v12
-; GFX11-NEXT:    v_lshrrev_b32_e32 v80, 16, v28
-; GFX11-NEXT:    v_lshrrev_b32_e32 v81, 16, v13
-; GFX11-NEXT:    v_lshrrev_b32_e32 v82, 16, v29
-; GFX11-NEXT:    v_lshrrev_b32_e32 v83, 16, v15
-; GFX11-NEXT:    v_lshrrev_b32_e32 v84, 16, v16
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX11-NEXT:    v_lshrrev_b32_e32 v86, 16, v14
-; GFX11-NEXT:    v_lshrrev_b32_e32 v87, 16, v30
+; GFX11-NEXT:    v_lshrrev_b32_e32 v84, 16, v15
+; GFX11-NEXT:    v_lshrrev_b32_e32 v85, 16, v16
+; GFX11-NEXT:    v_dual_cndmask_b32 v10, v26, v10 :: v_dual_cndmask_b32 v11, v27, v11
+; GFX11-NEXT:    v_dual_cndmask_b32 v27, v70, v69 :: v_dual_cndmask_b32 v12, v28, v12
+; GFX11-NEXT:    v_dual_cndmask_b32 v28, v80, v71 :: v_dual_cndmask_b32 v13, v29, v13
+; GFX11-NEXT:    v_cndmask_b32_e32 v29, v82, v81, vcc_lo
 ; GFX11-NEXT:    v_cndmask_b32_e32 v14, v30, v14, vcc_lo
-; GFX11-NEXT:    v_dual_cndmask_b32 v30, v82, v81 :: v_dual_cndmask_b32 v11, v27, v11
-; GFX11-NEXT:    v_dual_cndmask_b32 v12, v28, v12 :: v_dual_cndmask_b32 v7, v23, v7
-; GFX11-NEXT:    v_dual_cndmask_b32 v28, v70, v69 :: v_dual_cndmask_b32 v27, v68, v67
-; GFX11-NEXT:    v_cndmask_b32_e32 v10, v26, v10, vcc_lo
-; GFX11-NEXT:    v_dual_cndmask_b32 v26, v66, v65 :: v_dual_cndmask_b32 v23, v52, v51
-; GFX11-NEXT:    v_dual_cndmask_b32 v8, v24, v8 :: v_dual_cndmask_b32 v5, v21, v5
-; GFX11-NEXT:    v_dual_cndmask_b32 v24, v54, v53 :: v_dual_cndmask_b32 v21, v48, v39
-; GFX11-NEXT:    v_dual_cndmask_b32 v6, v22, v6 :: v_dual_cndmask_b32 v1, v17, v1
-; GFX11-NEXT:    v_dual_cndmask_b32 v22, v50, v49 :: v_dual_cndmask_b32 v3, v19, v3
-; GFX11-NEXT:    v_dual_cndmask_b32 v4, v20, v4 :: v_dual_cndmask_b32 v17, v34, v33
-; GFX11-NEXT:    v_cndmask_b32_e32 v20, v38, v37, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_dual_cndmask_b32 v2, v18, v2 :: v_dual_and_b32 v1, 0xffff, v1
-; GFX11-NEXT:    v_dual_cndmask_b32 v18, v36, v35 :: v_dual_lshlrev_b32 v17, 16, v17
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX11-NEXT:    v_lshlrev_b32_e32 v19, 16, v20
-; GFX11-NEXT:    v_lshlrev_b32_e32 v18, 16, v18
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xffff, v4
-; GFX11-NEXT:    v_lshlrev_b32_e32 v20, 16, v21
-; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff, v5
-; GFX11-NEXT:    v_lshlrev_b32_e32 v21, 16, v22
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xffff, v6
-; GFX11-NEXT:    v_lshlrev_b32_e32 v22, 16, v23
-; GFX11-NEXT:    v_and_b32_e32 v7, 0xffff, v7
-; GFX11-NEXT:    v_lshlrev_b32_e32 v23, 16, v24
-; GFX11-NEXT:    v_and_b32_e32 v8, 0xffff, v8
-; GFX11-NEXT:    v_and_b32_e32 v11, 0xffff, v11
-; GFX11-NEXT:    v_dual_cndmask_b32 v13, v29, v13 :: v_dual_and_b32 v12, 0xffff, v12
-; GFX11-NEXT:    v_dual_cndmask_b32 v29, v80, v71 :: v_dual_and_b32 v14, 0xffff, v14
-; GFX11-NEXT:    v_cndmask_b32_e32 v9, v25, v9, vcc_lo
-; GFX11-NEXT:    v_dual_cndmask_b32 v25, v64, v55 :: v_dual_and_b32 v10, 0xffff, v10
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_and_b32_e32 v13, 0xffff, v13
-; GFX11-NEXT:    v_and_b32_e32 v9, 0xffff, v9
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX11-NEXT:    v_lshlrev_b32_e32 v24, 16, v25
-; GFX11-NEXT:    v_lshlrev_b32_e32 v25, 16, v26
-; GFX11-NEXT:    v_lshlrev_b32_e32 v26, 16, v27
-; GFX11-NEXT:    v_lshlrev_b32_e32 v27, 16, v28
-; GFX11-NEXT:    v_lshlrev_b32_e32 v28, 16, v29
-; GFX11-NEXT:    v_lshlrev_b32_e32 v29, 16, v30
+; GFX11-NEXT:    v_cndmask_b32_e32 v30, v0, v83, vcc_lo
+; GFX11-NEXT:    v_perm_b32 v0, v17, v1, 0x5040100
+; GFX11-NEXT:    v_perm_b32 v1, v18, v2, 0x5040100
+; GFX11-NEXT:    v_dual_cndmask_b32 v19, v38, v37 :: v_dual_cndmask_b32 v4, v20, v4
+; GFX11-NEXT:    v_dual_cndmask_b32 v20, v48, v39 :: v_dual_cndmask_b32 v5, v21, v5
+; GFX11-NEXT:    v_cndmask_b32_e32 v21, v50, v49, vcc_lo
+; GFX11-NEXT:    v_dual_cndmask_b32 v6, v22, v6 :: v_dual_cndmask_b32 v7, v23, v7
+; GFX11-NEXT:    v_dual_cndmask_b32 v22, v52, v51 :: v_dual_cndmask_b32 v23, v54, v53
+; GFX11-NEXT:    v_cndmask_b32_e32 v8, v24, v8, vcc_lo
+; GFX11-NEXT:    v_dual_cndmask_b32 v24, v64, v55 :: v_dual_cndmask_b32 v9, v25, v9
+; GFX11-NEXT:    v_dual_cndmask_b32 v25, v66, v65 :: v_dual_cndmask_b32 v26, v68, v67
+; GFX11-NEXT:    v_perm_b32 v2, v19, v3, 0x5040100
+; GFX11-NEXT:    v_perm_b32 v3, v20, v4, 0x5040100
+; GFX11-NEXT:    v_perm_b32 v4, v21, v5, 0x5040100
+; GFX11-NEXT:    v_perm_b32 v5, v22, v6, 0x5040100
+; GFX11-NEXT:    v_perm_b32 v6, v23, v7, 0x5040100
+; GFX11-NEXT:    v_perm_b32 v7, v24, v8, 0x5040100
+; GFX11-NEXT:    v_perm_b32 v8, v25, v9, 0x5040100
+; GFX11-NEXT:    v_perm_b32 v9, v26, v10, 0x5040100
+; GFX11-NEXT:    v_perm_b32 v10, v27, v11, 0x5040100
+; GFX11-NEXT:    v_perm_b32 v11, v28, v12, 0x5040100
+; GFX11-NEXT:    v_perm_b32 v12, v29, v13, 0x5040100
 ; GFX11-NEXT:    s_waitcnt vmcnt(1)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v85, 16, v31
+; GFX11-NEXT:    v_lshrrev_b32_e32 v17, 16, v31
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v32
-; GFX11-NEXT:    v_dual_cndmask_b32 v16, v31, v16 :: v_dual_cndmask_b32 v15, v32, v15
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_dual_cndmask_b32 v31, v87, v86 :: v_dual_cndmask_b32 v84, v85, v84
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v0, v83, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_and_b32_e32 v16, 0xffff, v16
-; GFX11-NEXT:    v_and_b32_e32 v15, 0xffff, v15
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT:    v_lshlrev_b32_e32 v30, 16, v31
-; GFX11-NEXT:    v_lshlrev_b32_e32 v32, 16, v84
-; GFX11-NEXT:    v_lshlrev_b32_e32 v31, 16, v0
-; GFX11-NEXT:    v_or_b32_e32 v0, v1, v17
-; GFX11-NEXT:    v_or_b32_e32 v1, v2, v18
-; GFX11-NEXT:    v_or_b32_e32 v2, v3, v19
-; GFX11-NEXT:    v_or_b32_e32 v3, v4, v20
-; GFX11-NEXT:    v_or_b32_e32 v4, v5, v21
-; GFX11-NEXT:    v_or_b32_e32 v5, v6, v22
-; GFX11-NEXT:    v_or_b32_e32 v6, v7, v23
-; GFX11-NEXT:    v_or_b32_e32 v7, v8, v24
-; GFX11-NEXT:    v_or_b32_e32 v8, v9, v25
-; GFX11-NEXT:    v_or_b32_e32 v9, v10, v26
-; GFX11-NEXT:    v_or_b32_e32 v10, v11, v27
-; GFX11-NEXT:    v_or_b32_e32 v11, v12, v28
-; GFX11-NEXT:    v_or_b32_e32 v12, v13, v29
-; GFX11-NEXT:    v_or_b32_e32 v13, v14, v30
-; GFX11-NEXT:    v_or_b32_e32 v14, v15, v31
-; GFX11-NEXT:    v_or_b32_e32 v15, v16, v32
+; GFX11-NEXT:    v_lshrrev_b32_e32 v18, 16, v32
+; GFX11-NEXT:    v_dual_cndmask_b32 v15, v31, v15 :: v_dual_cndmask_b32 v16, v32, v16
+; GFX11-NEXT:    v_perm_b32 v13, v30, v14, 0x5040100
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_dual_cndmask_b32 v17, v17, v84 :: v_dual_cndmask_b32 v18, v18, v85
+; GFX11-NEXT:    v_perm_b32 v14, v17, v15, 0x5040100
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT:    v_perm_b32 v15, v18, v16, 0x5040100
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = select i1 %cond, <32 x bfloat> %a, <32 x bfloat> %b
   ret <32 x bfloat> %op
@@ -28875,9 +27432,9 @@ define amdgpu_ps <2 x i32> @s_select_v3bf16(<3 x bfloat> inreg %a, <3 x bfloat>
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s0
-; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
-; GFX9-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT:    s_mov_b32 s0, 0x5040100
+; GFX9-NEXT:    v_perm_b32 v0, v0, v1, s0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
@@ -28889,19 +27446,18 @@ define amdgpu_ps <2 x i32> @s_select_v3bf16(<3 x bfloat> inreg %a, <3 x bfloat>
 ; GFX10-LABEL: s_select_v3bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_lshr_b32 s4, s0, 16
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10-NEXT:    v_mov_b32_e32 v1, s4
-; GFX10-NEXT:    s_lshr_b32 s4, s2, 16
 ; GFX10-NEXT:    v_mov_b32_e32 v2, s0
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, s4, v1, vcc_lo
-; GFX10-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX10-NEXT:    v_mov_b32_e32 v0, s1
+; GFX10-NEXT:    s_lshr_b32 s5, s2, 16
 ; GFX10-NEXT:    v_cndmask_b32_e32 v2, s2, v2, vcc_lo
-; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, s3, v1, vcc_lo
-; GFX10-NEXT:    v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
-; GFX10-NEXT:    v_readfirstlane_b32 s1, v1
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, s5, v1, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, s3, v0, vcc_lo
+; GFX10-NEXT:    v_perm_b32 v1, v1, v2, 0x5040100
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX10-NEXT:    v_readfirstlane_b32 s0, v1
+; GFX10-NEXT:    v_readfirstlane_b32 s1, v0
 ; GFX10-NEXT:    ; return to shader part epilog
 ;
 ; GFX11-LABEL: s_select_v3bf16:
@@ -28909,20 +27465,19 @@ define amdgpu_ps <2 x i32> @s_select_v3bf16(<3 x bfloat> inreg %a, <3 x bfloat>
 ; GFX11-NEXT:    s_lshr_b32 s4, s0, 16
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX11-NEXT:    v_dual_mov_b32 v1, s4 :: v_dual_mov_b32 v2, s0
+; GFX11-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX11-NEXT:    s_lshr_b32 s5, s2, 16
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, s5, v1, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_dual_cndmask_b32 v1, s2, v2 :: v_dual_lshlrev_b32 v0, 16, v0
-; GFX11-NEXT:    v_dual_mov_b32 v2, s1 :: v_dual_and_b32 v1, 0xffff, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_cndmask_b32_e32 v2, s3, v2, vcc_lo
-; GFX11-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, s5, v1, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_cndmask_b32_e32 v2, s2, v2, vcc_lo
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, s3, v0, vcc_lo
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff, v2
-; GFX11-NEXT:    v_readfirstlane_b32 s0, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT:    v_readfirstlane_b32 s1, v1
+; GFX11-NEXT:    v_perm_b32 v1, v1, v2, 0x5040100
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_readfirstlane_b32 s0, v1
+; GFX11-NEXT:    v_readfirstlane_b32 s1, v0
 ; GFX11-NEXT:    ; return to shader part epilog
   %cond = icmp eq i32 %c, 0
   %op = select i1 %cond, <3 x bfloat> %a, <3 x bfloat> %b
@@ -29002,21 +27557,21 @@ define amdgpu_ps <2 x i32> @s_select_v4bf16(<4 x bfloat> inreg %a, <4 x bfloat>
 ;
 ; GFX8-LABEL: s_select_v4bf16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_lshr_b32 s6, s1, 16
-; GFX8-NEXT:    s_lshr_b32 s7, s3, 16
-; GFX8-NEXT:    v_mov_b32_e32 v1, s7
-; GFX8-NEXT:    v_mov_b32_e32 v2, s6
+; GFX8-NEXT:    s_lshr_b32 s4, s1, 16
+; GFX8-NEXT:    s_lshr_b32 s5, s3, 16
+; GFX8-NEXT:    v_mov_b32_e32 v1, s5
+; GFX8-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s1
-; GFX8-NEXT:    s_lshr_b32 s4, s0, 16
-; GFX8-NEXT:    s_lshr_b32 s5, s2, 16
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
+; GFX8-NEXT:    s_lshr_b32 s1, s0, 16
+; GFX8-NEXT:    s_lshr_b32 s3, s2, 16
 ; GFX8-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX8-NEXT:    v_mov_b32_e32 v1, s5
-; GFX8-NEXT:    v_mov_b32_e32 v2, s4
+; GFX8-NEXT:    v_mov_b32_e32 v1, s3
+; GFX8-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX8-NEXT:    v_mov_b32_e32 v3, s0
@@ -29029,76 +27584,68 @@ define amdgpu_ps <2 x i32> @s_select_v4bf16(<4 x bfloat> inreg %a, <4 x bfloat>
 ;
 ; GFX9-LABEL: s_select_v4bf16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_lshr_b32 s6, s1, 16
-; GFX9-NEXT:    s_lshr_b32 s7, s3, 16
-; GFX9-NEXT:    v_mov_b32_e32 v1, s7
-; GFX9-NEXT:    v_mov_b32_e32 v2, s6
+; GFX9-NEXT:    s_lshr_b32 s4, s1, 16
+; GFX9-NEXT:    s_lshr_b32 s5, s3, 16
+; GFX9-NEXT:    v_mov_b32_e32 v1, s5
+; GFX9-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s1
-; GFX9-NEXT:    s_lshr_b32 s4, s0, 16
-; GFX9-NEXT:    s_lshr_b32 s5, s2, 16
-; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
-; GFX9-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX9-NEXT:    v_mov_b32_e32 v1, s5
-; GFX9-NEXT:    v_mov_b32_e32 v2, s4
+; GFX9-NEXT:    s_mov_b32 s1, 0x5040100
+; GFX9-NEXT:    s_lshr_b32 s3, s0, 16
+; GFX9-NEXT:    s_lshr_b32 s4, s2, 16
+; GFX9-NEXT:    v_perm_b32 v0, v0, v1, s1
+; GFX9-NEXT:    v_mov_b32_e32 v1, s4
+; GFX9-NEXT:    v_mov_b32_e32 v2, s3
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s0
-; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
-; GFX9-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT:    v_perm_b32 v1, v1, v2, s1
 ; GFX9-NEXT:    v_readfirstlane_b32 s0, v1
 ; GFX9-NEXT:    v_readfirstlane_b32 s1, v0
 ; GFX9-NEXT:    ; return to shader part epilog
 ;
 ; GFX10-LABEL: s_select_v4bf16:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_lshr_b32 s4, s0, 16
-; GFX10-NEXT:    s_lshr_b32 s5, s1, 16
-; GFX10-NEXT:    v_mov_b32_e32 v2, s4
-; GFX10-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-NEXT:    s_lshr_b32 s4, s1, 16
+; GFX10-NEXT:    s_lshr_b32 s6, s0, 16
+; GFX10-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10-NEXT:    s_lshr_b32 s6, s3, 16
-; GFX10-NEXT:    s_lshr_b32 s4, s2, 16
-; GFX10-NEXT:    v_mov_b32_e32 v3, s0
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, s6, v1, vcc_lo
-; GFX10-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-NEXT:    v_cndmask_b32_e32 v2, s4, v2, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v3, s2, v3, vcc_lo
-; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, s3, v1, vcc_lo
-; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX10-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX10-NEXT:    v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX10-NEXT:    v_readfirstlane_b32 s1, v0
-; GFX10-NEXT:    v_readfirstlane_b32 s0, v2
+; GFX10-NEXT:    v_mov_b32_e32 v0, s0
+; GFX10-NEXT:    v_mov_b32_e32 v3, s1
+; GFX10-NEXT:    s_lshr_b32 s5, s3, 16
+; GFX10-NEXT:    s_lshr_b32 s0, s2, 16
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, s5, v1, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, s0, v2, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, s2, v0, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v3, s3, v3, vcc_lo
+; GFX10-NEXT:    v_perm_b32 v0, v2, v0, 0x5040100
+; GFX10-NEXT:    v_perm_b32 v1, v1, v3, 0x5040100
+; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX10-NEXT:    v_readfirstlane_b32 s1, v1
 ; GFX10-NEXT:    ; return to shader part epilog
 ;
 ; GFX11-LABEL: s_select_v4bf16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_lshr_b32 s4, s0, 16
-; GFX11-NEXT:    s_lshr_b32 s5, s1, 16
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT:    v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v1, s5
+; GFX11-NEXT:    s_lshr_b32 s4, s1, 16
+; GFX11-NEXT:    s_lshr_b32 s6, s0, 16
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    v_dual_mov_b32 v1, s4 :: v_dual_mov_b32 v2, s6
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s1
-; GFX11-NEXT:    s_lshr_b32 s6, s3, 16
+; GFX11-NEXT:    s_lshr_b32 s5, s3, 16
 ; GFX11-NEXT:    s_lshr_b32 s0, s2, 16
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, s5, v1, vcc_lo
 ; GFX11-NEXT:    v_cndmask_b32_e32 v2, s0, v2, vcc_lo
-; GFX11-NEXT:    v_cndmask_b32_e32 v1, s6, v1, vcc_lo
 ; GFX11-NEXT:    v_cndmask_b32_e32 v0, s2, v0, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_dual_cndmask_b32 v3, s3, v3 :: v_dual_lshlrev_b32 v2, 16, v2
-; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff, v3
+; GFX11-NEXT:    v_cndmask_b32_e32 v3, s3, v3, vcc_lo
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_or_b32_e32 v0, v0, v2
-; GFX11-NEXT:    v_or_b32_e32 v1, v3, v1
+; GFX11-NEXT:    v_perm_b32 v0, v2, v0, 0x5040100
+; GFX11-NEXT:    v_perm_b32 v1, v1, v3, 0x5040100
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX11-NEXT:    v_readfirstlane_b32 s1, v1
@@ -29174,22 +27721,22 @@ define amdgpu_ps <2 x i32> @s_vselect_v4bf16(<4 x bfloat> inreg %a, <4 x bfloat>
 ;
 ; GFX8-LABEL: s_vselect_v4bf16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_lshr_b32 s6, s1, 16
-; GFX8-NEXT:    s_lshr_b32 s7, s3, 16
-; GFX8-NEXT:    v_mov_b32_e32 v4, s7
-; GFX8-NEXT:    v_mov_b32_e32 v5, s6
+; GFX8-NEXT:    s_lshr_b32 s4, s1, 16
+; GFX8-NEXT:    s_lshr_b32 s5, s3, 16
+; GFX8-NEXT:    v_mov_b32_e32 v4, s5
+; GFX8-NEXT:    v_mov_b32_e32 v5, s4
 ; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
 ; GFX8-NEXT:    v_cndmask_b32_e32 v3, v4, v5, vcc
 ; GFX8-NEXT:    v_mov_b32_e32 v4, s3
 ; GFX8-NEXT:    v_mov_b32_e32 v5, s1
 ; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
-; GFX8-NEXT:    s_lshr_b32 s4, s0, 16
-; GFX8-NEXT:    s_lshr_b32 s5, s2, 16
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; GFX8-NEXT:    v_cndmask_b32_e32 v2, v4, v5, vcc
+; GFX8-NEXT:    s_lshr_b32 s1, s0, 16
+; GFX8-NEXT:    s_lshr_b32 s3, s2, 16
 ; GFX8-NEXT:    v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX8-NEXT:    v_mov_b32_e32 v3, s5
-; GFX8-NEXT:    v_mov_b32_e32 v4, s4
+; GFX8-NEXT:    v_mov_b32_e32 v3, s3
+; GFX8-NEXT:    v_mov_b32_e32 v4, s1
 ; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
 ; GFX8-NEXT:    v_mov_b32_e32 v3, s2
@@ -29204,30 +27751,29 @@ define amdgpu_ps <2 x i32> @s_vselect_v4bf16(<4 x bfloat> inreg %a, <4 x bfloat>
 ;
 ; GFX9-LABEL: s_vselect_v4bf16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_lshr_b32 s6, s1, 16
-; GFX9-NEXT:    s_lshr_b32 s7, s3, 16
-; GFX9-NEXT:    v_mov_b32_e32 v4, s7
-; GFX9-NEXT:    v_mov_b32_e32 v5, s6
+; GFX9-NEXT:    s_lshr_b32 s4, s1, 16
+; GFX9-NEXT:    s_lshr_b32 s5, s3, 16
+; GFX9-NEXT:    v_mov_b32_e32 v4, s5
+; GFX9-NEXT:    v_mov_b32_e32 v5, s4
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
 ; GFX9-NEXT:    v_cndmask_b32_e32 v3, v4, v5, vcc
 ; GFX9-NEXT:    v_mov_b32_e32 v4, s3
 ; GFX9-NEXT:    v_mov_b32_e32 v5, s1
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
-; GFX9-NEXT:    s_lshr_b32 s4, s0, 16
-; GFX9-NEXT:    s_lshr_b32 s5, s2, 16
-; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; GFX9-NEXT:    v_cndmask_b32_e32 v2, v4, v5, vcc
-; GFX9-NEXT:    v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX9-NEXT:    v_mov_b32_e32 v3, s5
-; GFX9-NEXT:    v_mov_b32_e32 v4, s4
+; GFX9-NEXT:    s_mov_b32 s1, 0x5040100
+; GFX9-NEXT:    s_lshr_b32 s3, s0, 16
+; GFX9-NEXT:    s_lshr_b32 s4, s2, 16
+; GFX9-NEXT:    v_perm_b32 v2, v3, v2, s1
+; GFX9-NEXT:    v_mov_b32_e32 v3, s4
+; GFX9-NEXT:    v_mov_b32_e32 v4, s3
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s2
 ; GFX9-NEXT:    v_mov_b32_e32 v4, s0
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v4, vcc
-; GFX9-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT:    v_perm_b32 v0, v1, v0, s1
 ; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX9-NEXT:    v_readfirstlane_b32 s1, v2
 ; GFX9-NEXT:    ; return to shader part epilog
@@ -29237,55 +27783,50 @@ define amdgpu_ps <2 x i32> @s_vselect_v4bf16(<4 x bfloat> inreg %a, <4 x bfloat>
 ; GFX10-NEXT:    s_lshr_b32 s4, s1, 16
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v3
 ; GFX10-NEXT:    v_mov_b32_e32 v4, s4
+; GFX10-NEXT:    s_lshr_b32 s4, s3, 16
 ; GFX10-NEXT:    s_lshr_b32 s5, s0, 16
-; GFX10-NEXT:    s_lshr_b32 s4, s2, 16
-; GFX10-NEXT:    v_mov_b32_e32 v5, s5
-; GFX10-NEXT:    s_lshr_b32 s5, s3, 16
-; GFX10-NEXT:    v_cndmask_b32_e32 v3, s5, v4, vcc_lo
+; GFX10-NEXT:    v_mov_b32_e32 v6, s0
+; GFX10-NEXT:    s_lshr_b32 s0, s2, 16
+; GFX10-NEXT:    v_cndmask_b32_e32 v3, s4, v4, vcc_lo
+; GFX10-NEXT:    v_mov_b32_e32 v4, s5
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
-; GFX10-NEXT:    v_mov_b32_e32 v4, s0
-; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, s4, v5, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v5, s1
-; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, s2, v4, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, s0, v4, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, s2, v6, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v2
-; GFX10-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
 ; GFX10-NEXT:    v_cndmask_b32_e32 v2, s3, v5, vcc_lo
 ; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
-; GFX10-NEXT:    v_or_b32_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-NEXT:    v_perm_b32 v1, v3, v2, 0x5040100
 ; GFX10-NEXT:    v_readfirstlane_b32 s1, v1
 ; GFX10-NEXT:    ; return to shader part epilog
 ;
 ; GFX11-LABEL: s_vselect_v4bf16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_lshr_b32 s4, s1, 16
-; GFX11-NEXT:    s_lshr_b32 s5, s0, 16
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v3
-; GFX11-NEXT:    s_lshr_b32 s6, s3, 16
-; GFX11-NEXT:    s_lshr_b32 s4, s2, 16
-; GFX11-NEXT:    v_cndmask_b32_e32 v3, s6, v4, vcc_lo
+; GFX11-NEXT:    v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s1
+; GFX11-NEXT:    s_lshr_b32 s4, s3, 16
+; GFX11-NEXT:    s_lshr_b32 s5, s0, 16
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_cndmask_b32_e32 v3, s4, v4, vcc_lo
+; GFX11-NEXT:    v_mov_b32_e32 v4, s5
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_dual_mov_b32 v4, s0 :: v_dual_lshlrev_b32 v3, 16, v3
-; GFX11-NEXT:    v_cndmask_b32_e32 v1, s4, v5, vcc_lo
+; GFX11-NEXT:    v_mov_b32_e32 v6, s0
+; GFX11-NEXT:    s_lshr_b32 s0, s2, 16
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, s0, v4, vcc_lo
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-NEXT:    v_dual_mov_b32 v6, s1 :: v_dual_lshlrev_b32 v1, 16, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, s2, v4, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, s2, v6, vcc_lo
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v2
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_cndmask_b32_e32 v2, s3, v6, vcc_lo
-; GFX11-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
+; GFX11-NEXT:    v_cndmask_b32_e32 v2, s3, v5, vcc_lo
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; GFX11-NEXT:    v_readfirstlane_b32 s0, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_or_b32_e32 v1, v2, v3
+; GFX11-NEXT:    v_perm_b32 v1, v3, v2, 0x5040100
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_readfirstlane_b32 s1, v1
 ; GFX11-NEXT:    ; return to shader part epilog
   %cond = icmp eq <4 x i32> %c, zeroinitializer
@@ -29346,107 +27887,98 @@ define <4 x bfloat> @v_vselect_v4bf16(<4 x i1> %cond, <4 x bfloat> %a, <4 x bflo
 ; GFX8-LABEL: v_vselect_v4bf16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_and_b32_e32 v2, 1, v2
-; GFX8-NEXT:    v_and_b32_e32 v3, 1, v3
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v2
-; GFX8-NEXT:    v_lshrrev_b32_e32 v10, 16, v5
-; GFX8-NEXT:    v_lshrrev_b32_e32 v11, 16, v7
 ; GFX8-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX8-NEXT:    v_cndmask_b32_e32 v2, v7, v5, vcc
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v3
 ; GFX8-NEXT:    v_and_b32_e32 v1, 1, v1
-; GFX8-NEXT:    v_cndmask_b32_e32 v3, v11, v10, vcc
 ; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX8-NEXT:    v_lshrrev_b32_e32 v8, 16, v4
-; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 16, v6
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v6, v4, vcc
+; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
+; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
 ; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v1
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v9, v8, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX8-NEXT:    v_and_b32_e32 v2, 1, v2
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v6, v4, vcc
+; GFX8-NEXT:    v_and_b32_e32 v3, 1, v3
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v2
 ; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX8-NEXT:    v_or_b32_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v7, v5, vcc
+; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v5
+; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v7
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v3
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX8-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_vselect_v4bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v2, 1, v2
-; GFX9-NEXT:    v_and_b32_e32 v3, 1, v3
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v2
-; GFX9-NEXT:    v_lshrrev_b32_e32 v10, 16, v5
-; GFX9-NEXT:    v_lshrrev_b32_e32 v11, 16, v7
 ; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, v7, v5, vcc
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v3
 ; GFX9-NEXT:    v_and_b32_e32 v1, 1, v1
-; GFX9-NEXT:    v_cndmask_b32_e32 v3, v11, v10, vcc
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX9-NEXT:    v_lshrrev_b32_e32 v8, 16, v4
-; GFX9-NEXT:    v_lshrrev_b32_e32 v9, 16, v6
+; GFX9-NEXT:    v_and_b32_e32 v2, 1, v2
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v6, v4, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
+; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v1
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v9, v8, vcc
-; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX9-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
-; GFX9-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT:    v_and_b32_e32 v3, 1, v3
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v6, v4, vcc
+; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v2
+; GFX9-NEXT:    v_perm_b32 v0, v1, v0, s4
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v7, v5, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v5
+; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 16, v7
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v3
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
+; GFX9-NEXT:    v_perm_b32 v1, v2, v1, s4
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_vselect_v4bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_and_b32_e32 v3, 1, v3
-; GFX10-NEXT:    v_lshrrev_b32_e32 v10, 16, v5
-; GFX10-NEXT:    v_lshrrev_b32_e32 v11, 16, v7
+; GFX10-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX10-NEXT:    v_and_b32_e32 v1, 1, v1
+; GFX10-NEXT:    v_and_b32_e32 v2, 1, v2
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v8, 16, v4
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v3
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v9, 16, v6
-; GFX10-NEXT:    v_and_b32_e32 v2, 1, v2
-; GFX10-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX10-NEXT:    v_cndmask_b32_e32 v3, v11, v10, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
+; GFX10-NEXT:    v_and_b32_e32 v3, 1, v3
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v6, v4, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v1
-; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX10-NEXT:    v_lshrrev_b32_e32 v4, 16, v5
+; GFX10-NEXT:    v_lshrrev_b32_e32 v6, 16, v7
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v9, v8, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v2
-; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX10-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
 ; GFX10-NEXT:    v_cndmask_b32_e32 v2, v7, v5, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v6, v4, vcc_lo
-; GFX10-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX10-NEXT:    v_or_b32_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v3
+; GFX10-NEXT:    v_cndmask_b32_e32 v3, v6, v4, vcc_lo
+; GFX10-NEXT:    v_perm_b32 v1, v3, v2, 0x5040100
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_vselect_v4bf16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX11-NEXT:    v_and_b32_e32 v1, 1, v1
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v8, 16, v4
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v9, 16, v6
-; GFX11-NEXT:    v_and_b32_e32 v2, 1, v2
-; GFX11-NEXT:    v_lshrrev_b32_e32 v10, 16, v5
-; GFX11-NEXT:    v_lshrrev_b32_e32 v11, 16, v7
-; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v2
-; GFX11-NEXT:    v_dual_cndmask_b32 v2, v7, v5 :: v_dual_and_b32 v3, 1, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX11-NEXT:    v_and_b32_e32 v1, 1, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v6, v4, vcc_lo
+; GFX11-NEXT:    v_dual_cndmask_b32 v0, v6, v4 :: v_dual_and_b32 v3, 1, v3
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    v_and_b32_e32 v2, 1, v2
+; GFX11-NEXT:    v_lshrrev_b32_e32 v4, 16, v5
+; GFX11-NEXT:    v_lshrrev_b32_e32 v6, 16, v7
 ; GFX11-NEXT:    v_cndmask_b32_e32 v1, v9, v8, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v2
+; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
+; GFX11-NEXT:    v_cndmask_b32_e32 v2, v7, v5, vcc_lo
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v3
-; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-NEXT:    v_cndmask_b32_e32 v3, v11, v10, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-NEXT:    v_cndmask_b32_e32 v3, v6, v4, vcc_lo
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_or_b32_e32 v1, v2, v3
+; GFX11-NEXT:    v_perm_b32 v1, v3, v2, 0x5040100
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = select <4 x i1> %cond, <4 x bfloat> %a, <4 x bfloat> %b
   ret <4 x bfloat> %op
@@ -29530,187 +28062,173 @@ define <8 x bfloat> @v_vselect_v8bf16(<8 x i1> %cond, <8 x bfloat> %a, <8 x bflo
 ; GFX8-LABEL: v_vselect_v8bf16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_and_b32_e32 v6, 1, v6
-; GFX8-NEXT:    v_and_b32_e32 v7, 1, v7
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v6
-; GFX8-NEXT:    v_lshrrev_b32_e32 v22, 16, v11
-; GFX8-NEXT:    v_lshrrev_b32_e32 v23, 16, v15
-; GFX8-NEXT:    v_and_b32_e32 v4, 1, v4
-; GFX8-NEXT:    v_cndmask_b32_e32 v6, v15, v11, vcc
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v7
-; GFX8-NEXT:    v_and_b32_e32 v5, 1, v5
-; GFX8-NEXT:    v_cndmask_b32_e32 v7, v23, v22, vcc
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v4
-; GFX8-NEXT:    v_lshrrev_b32_e32 v20, 16, v10
-; GFX8-NEXT:    v_lshrrev_b32_e32 v21, 16, v14
-; GFX8-NEXT:    v_and_b32_e32 v2, 1, v2
-; GFX8-NEXT:    v_cndmask_b32_e32 v4, v14, v10, vcc
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v5
-; GFX8-NEXT:    v_and_b32_e32 v3, 1, v3
-; GFX8-NEXT:    v_cndmask_b32_e32 v5, v21, v20, vcc
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v2
-; GFX8-NEXT:    v_lshrrev_b32_e32 v18, 16, v9
-; GFX8-NEXT:    v_lshrrev_b32_e32 v19, 16, v13
 ; GFX8-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX8-NEXT:    v_cndmask_b32_e32 v2, v13, v9, vcc
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v3
 ; GFX8-NEXT:    v_and_b32_e32 v1, 1, v1
-; GFX8-NEXT:    v_cndmask_b32_e32 v3, v19, v18, vcc
 ; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX8-NEXT:    v_lshrrev_b32_e32 v16, 16, v8
-; GFX8-NEXT:    v_lshrrev_b32_e32 v17, 16, v12
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v12, v8, vcc
+; GFX8-NEXT:    v_lshrrev_b32_e32 v8, 16, v8
+; GFX8-NEXT:    v_lshrrev_b32_e32 v12, 16, v12
 ; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v1
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v17, v16, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX8-NEXT:    v_and_b32_e32 v2, 1, v2
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v12, v8, vcc
+; GFX8-NEXT:    v_and_b32_e32 v3, 1, v3
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX8-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v2
 ; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX8-NEXT:    v_or_b32_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v5
-; GFX8-NEXT:    v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX8-NEXT:    v_or_b32_sdwa v3, v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v13, v9, vcc
+; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v9
+; GFX8-NEXT:    v_lshrrev_b32_e32 v8, 16, v13
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v3
+; GFX8-NEXT:    v_and_b32_e32 v4, 1, v4
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v8, v2, vcc
+; GFX8-NEXT:    v_and_b32_e32 v5, 1, v5
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v4
+; GFX8-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v14, v10, vcc
+; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v10
+; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v14
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v5
+; GFX8-NEXT:    v_and_b32_e32 v6, 1, v6
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v4, v3, vcc
+; GFX8-NEXT:    v_and_b32_e32 v7, 1, v7
+; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v6
+; GFX8-NEXT:    v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v15, v11, vcc
+; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v11
+; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v15
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v7
+; GFX8-NEXT:    v_cndmask_b32_e32 v4, v5, v4, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; GFX8-NEXT:    v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_vselect_v8bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v6, 1, v6
-; GFX9-NEXT:    v_and_b32_e32 v7, 1, v7
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v6
-; GFX9-NEXT:    v_lshrrev_b32_e32 v22, 16, v11
-; GFX9-NEXT:    v_lshrrev_b32_e32 v23, 16, v15
-; GFX9-NEXT:    v_and_b32_e32 v4, 1, v4
-; GFX9-NEXT:    v_cndmask_b32_e32 v6, v15, v11, vcc
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v7
-; GFX9-NEXT:    v_and_b32_e32 v5, 1, v5
-; GFX9-NEXT:    v_cndmask_b32_e32 v7, v23, v22, vcc
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v4
-; GFX9-NEXT:    v_lshrrev_b32_e32 v20, 16, v10
-; GFX9-NEXT:    v_lshrrev_b32_e32 v21, 16, v14
-; GFX9-NEXT:    v_and_b32_e32 v2, 1, v2
-; GFX9-NEXT:    v_cndmask_b32_e32 v4, v14, v10, vcc
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v5
-; GFX9-NEXT:    v_and_b32_e32 v3, 1, v3
-; GFX9-NEXT:    v_cndmask_b32_e32 v5, v21, v20, vcc
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v2
-; GFX9-NEXT:    v_lshrrev_b32_e32 v18, 16, v9
-; GFX9-NEXT:    v_lshrrev_b32_e32 v19, 16, v13
 ; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, v13, v9, vcc
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v3
 ; GFX9-NEXT:    v_and_b32_e32 v1, 1, v1
-; GFX9-NEXT:    v_cndmask_b32_e32 v3, v19, v18, vcc
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX9-NEXT:    v_lshrrev_b32_e32 v16, 16, v8
-; GFX9-NEXT:    v_lshrrev_b32_e32 v17, 16, v12
+; GFX9-NEXT:    v_and_b32_e32 v2, 1, v2
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v12, v8, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v8, 16, v8
+; GFX9-NEXT:    v_lshrrev_b32_e32 v12, 16, v12
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v1
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v17, v16, vcc
-; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX9-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
-; GFX9-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v5
-; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 16, v7
-; GFX9-NEXT:    v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX9-NEXT:    v_or_b32_sdwa v3, v6, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT:    v_and_b32_e32 v3, 1, v3
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v12, v8, vcc
+; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v2
+; GFX9-NEXT:    v_and_b32_e32 v4, 1, v4
+; GFX9-NEXT:    v_perm_b32 v0, v1, v0, s4
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v13, v9, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v9
+; GFX9-NEXT:    v_lshrrev_b32_e32 v8, 16, v13
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v3
+; GFX9-NEXT:    v_and_b32_e32 v5, 1, v5
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v8, v2, vcc
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v4
+; GFX9-NEXT:    v_and_b32_e32 v6, 1, v6
+; GFX9-NEXT:    v_perm_b32 v1, v2, v1, s4
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v14, v10, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v10
+; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 16, v14
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v5
+; GFX9-NEXT:    v_and_b32_e32 v7, 1, v7
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v4, v3, vcc
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v6
+; GFX9-NEXT:    v_perm_b32 v2, v3, v2, s4
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v15, v11, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 16, v11
+; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 16, v15
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v7
+; GFX9-NEXT:    v_cndmask_b32_e32 v4, v5, v4, vcc
+; GFX9-NEXT:    v_perm_b32 v3, v4, v3, s4
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_vselect_v8bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_and_b32_e32 v6, 1, v6
-; GFX10-NEXT:    v_and_b32_e32 v7, 1, v7
-; GFX10-NEXT:    v_lshrrev_b32_e32 v22, 16, v11
-; GFX10-NEXT:    v_lshrrev_b32_e32 v23, 16, v15
-; GFX10-NEXT:    v_and_b32_e32 v4, 1, v4
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v6
-; GFX10-NEXT:    v_and_b32_e32 v5, 1, v5
-; GFX10-NEXT:    v_lshrrev_b32_e32 v20, 16, v10
-; GFX10-NEXT:    v_lshrrev_b32_e32 v21, 16, v14
-; GFX10-NEXT:    v_and_b32_e32 v3, 1, v3
-; GFX10-NEXT:    v_cndmask_b32_e32 v6, v15, v11, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v7
-; GFX10-NEXT:    v_lshrrev_b32_e32 v18, 16, v9
-; GFX10-NEXT:    v_lshrrev_b32_e32 v19, 16, v13
+; GFX10-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX10-NEXT:    v_and_b32_e32 v1, 1, v1
+; GFX10-NEXT:    v_and_b32_e32 v2, 1, v2
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v16, 16, v8
-; GFX10-NEXT:    v_cndmask_b32_e32 v7, v23, v22, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v4
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v17, 16, v12
-; GFX10-NEXT:    v_and_b32_e32 v2, 1, v2
-; GFX10-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
-; GFX10-NEXT:    v_cndmask_b32_e32 v4, v14, v10, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v5
-; GFX10-NEXT:    v_cndmask_b32_e32 v5, v21, v20, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v3
-; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX10-NEXT:    v_cndmask_b32_e32 v3, v19, v18, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
+; GFX10-NEXT:    v_and_b32_e32 v3, 1, v3
+; GFX10-NEXT:    v_and_b32_e32 v4, 1, v4
+; GFX10-NEXT:    v_and_b32_e32 v5, 1, v5
+; GFX10-NEXT:    v_and_b32_e32 v6, 1, v6
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v12, v8, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v1
-; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX10-NEXT:    v_lshrrev_b32_e32 v8, 16, v9
+; GFX10-NEXT:    v_lshrrev_b32_e32 v12, 16, v13
+; GFX10-NEXT:    v_and_b32_e32 v7, 1, v7
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v17, v16, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v2
-; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX10-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
 ; GFX10-NEXT:    v_cndmask_b32_e32 v2, v13, v9, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v12, v8, vcc_lo
-; GFX10-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX10-NEXT:    v_or_b32_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX10-NEXT:    v_or_b32_sdwa v2, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX10-NEXT:    v_or_b32_sdwa v3, v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v3
+; GFX10-NEXT:    v_lshrrev_b32_e32 v9, 16, v14
+; GFX10-NEXT:    v_cndmask_b32_e32 v3, v12, v8, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v4
+; GFX10-NEXT:    v_lshrrev_b32_e32 v8, 16, v10
+; GFX10-NEXT:    v_lshrrev_b32_e32 v12, 16, v15
+; GFX10-NEXT:    v_perm_b32 v1, v3, v2, 0x5040100
+; GFX10-NEXT:    v_cndmask_b32_e32 v4, v14, v10, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v5
+; GFX10-NEXT:    v_lshrrev_b32_e32 v10, 16, v11
+; GFX10-NEXT:    v_cndmask_b32_e32 v5, v9, v8, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v6
+; GFX10-NEXT:    v_perm_b32 v2, v5, v4, 0x5040100
+; GFX10-NEXT:    v_cndmask_b32_e32 v6, v15, v11, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v7
+; GFX10-NEXT:    v_cndmask_b32_e32 v7, v12, v10, vcc_lo
+; GFX10-NEXT:    v_perm_b32 v3, v7, v6, 0x5040100
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_vselect_v8bf16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX11-NEXT:    v_and_b32_e32 v1, 1, v1
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v16, 16, v8
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v17, 16, v12
-; GFX11-NEXT:    v_and_b32_e32 v6, 1, v6
-; GFX11-NEXT:    v_lshrrev_b32_e32 v18, 16, v9
-; GFX11-NEXT:    v_lshrrev_b32_e32 v19, 16, v13
-; GFX11-NEXT:    v_and_b32_e32 v4, 1, v4
-; GFX11-NEXT:    v_and_b32_e32 v7, 1, v7
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v6
-; GFX11-NEXT:    v_and_b32_e32 v1, 1, v1
-; GFX11-NEXT:    v_lshrrev_b32_e32 v20, 16, v10
-; GFX11-NEXT:    v_lshrrev_b32_e32 v21, 16, v14
-; GFX11-NEXT:    v_lshrrev_b32_e32 v22, 16, v11
-; GFX11-NEXT:    v_cndmask_b32_e32 v6, v15, v11, vcc_lo
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v4
-; GFX11-NEXT:    v_lshrrev_b32_e32 v23, 16, v15
-; GFX11-NEXT:    v_and_b32_e32 v3, 1, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
+; GFX11-NEXT:    v_dual_cndmask_b32 v0, v12, v8 :: v_dual_and_b32 v7, 1, v7
+; GFX11-NEXT:    v_lshrrev_b32_e32 v8, 16, v9
+; GFX11-NEXT:    v_lshrrev_b32_e32 v12, 16, v13
 ; GFX11-NEXT:    v_and_b32_e32 v2, 1, v2
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xffff, v6
-; GFX11-NEXT:    v_cndmask_b32_e32 v4, v14, v10, vcc_lo
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v7
-; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_dual_cndmask_b32 v7, v23, v22 :: v_dual_and_b32 v4, 0xffff, v4
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v1
+; GFX11-NEXT:    v_dual_cndmask_b32 v1, v17, v16 :: v_dual_and_b32 v6, 1, v6
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v2
-; GFX11-NEXT:    v_dual_cndmask_b32 v2, v13, v9 :: v_dual_lshlrev_b32 v7, 16, v7
+; GFX11-NEXT:    v_and_b32_e32 v3, 1, v3
+; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
+; GFX11-NEXT:    v_cndmask_b32_e32 v2, v13, v9, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3)
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX11-NEXT:    v_cndmask_b32_e32 v3, v19, v18, vcc_lo
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX11-NEXT:    v_and_b32_e32 v5, 1, v5
-; GFX11-NEXT:    v_dual_cndmask_b32 v0, v12, v8 :: v_dual_lshlrev_b32 v3, 16, v3
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT:    v_cndmask_b32_e32 v1, v17, v16, vcc_lo
+; GFX11-NEXT:    v_lshrrev_b32_e32 v9, 16, v14
+; GFX11-NEXT:    v_cndmask_b32_e32 v3, v12, v8, vcc_lo
+; GFX11-NEXT:    v_lshrrev_b32_e32 v12, 16, v15
+; GFX11-NEXT:    v_and_b32_e32 v4, 1, v4
+; GFX11-NEXT:    v_lshrrev_b32_e32 v8, 16, v10
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_perm_b32 v1, v3, v2, 0x5040100
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v4
+; GFX11-NEXT:    v_dual_cndmask_b32 v4, v14, v10 :: v_dual_and_b32 v5, 1, v5
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v5
-; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-NEXT:    v_cndmask_b32_e32 v5, v21, v20, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX11-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX11-NEXT:    v_or_b32_e32 v1, v2, v3
-; GFX11-NEXT:    v_or_b32_e32 v3, v6, v7
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX11-NEXT:    v_or_b32_e32 v2, v4, v5
+; GFX11-NEXT:    v_lshrrev_b32_e32 v10, 16, v11
+; GFX11-NEXT:    v_cndmask_b32_e32 v5, v9, v8, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v6
+; GFX11-NEXT:    v_perm_b32 v2, v5, v4, 0x5040100
+; GFX11-NEXT:    v_cndmask_b32_e32 v6, v15, v11, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v7
+; GFX11-NEXT:    v_cndmask_b32_e32 v7, v12, v10, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_perm_b32 v3, v7, v6, 0x5040100
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = select <8 x i1> %cond, <8 x bfloat> %a, <8 x bfloat> %b
   ret <8 x bfloat> %op
@@ -29951,358 +28469,326 @@ define <16 x bfloat> @v_vselect_v16bf16(<16 x i1> %cond, <16 x bfloat> %a, <16 x
 ; GFX8-LABEL: v_vselect_v16bf16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_and_b32_e32 v13, 1, v13
-; GFX8-NEXT:    v_lshrrev_b32_e32 v31, 16, v22
-; GFX8-NEXT:    v_lshrrev_b32_e32 v32, 16, v30
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v13
-; GFX8-NEXT:    v_cndmask_b32_e32 v13, v32, v31, vcc
-; GFX8-NEXT:    buffer_load_dword v31, off, s[0:3], s32
-; GFX8-NEXT:    v_and_b32_e32 v12, 1, v12
-; GFX8-NEXT:    v_and_b32_e32 v11, 1, v11
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v12
-; GFX8-NEXT:    v_lshrrev_b32_e32 v12, 16, v21
-; GFX8-NEXT:    v_cndmask_b32_e32 v22, v30, v22, vcc
-; GFX8-NEXT:    v_lshrrev_b32_e32 v30, 16, v29
-; GFX8-NEXT:    v_and_b32_e32 v10, 1, v10
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v11
-; GFX8-NEXT:    v_and_b32_e32 v9, 1, v9
-; GFX8-NEXT:    v_cndmask_b32_e32 v12, v30, v12, vcc
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v10
-; GFX8-NEXT:    v_lshrrev_b32_e32 v11, 16, v20
-; GFX8-NEXT:    v_lshrrev_b32_e32 v30, 16, v28
+; GFX8-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
+; GFX8-NEXT:    v_and_b32_e32 v0, 1, v1
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[4:5], 1, v0
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v24, v16, vcc
+; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v16
+; GFX8-NEXT:    v_lshrrev_b32_e32 v16, 16, v24
+; GFX8-NEXT:    v_cndmask_b32_e64 v1, v16, v1, s[4:5]
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    v_and_b32_e32 v1, 1, v2
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v1
+; GFX8-NEXT:    v_and_b32_e32 v1, 1, v3
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[4:5], 1, v1
+; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v17
+; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v25
+; GFX8-NEXT:    v_cndmask_b32_e64 v2, v3, v2, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v25, v17, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX8-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    v_and_b32_e32 v2, 1, v4
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v2
+; GFX8-NEXT:    v_and_b32_e32 v2, 1, v5
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[4:5], 1, v2
+; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v18
+; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v26
+; GFX8-NEXT:    v_cndmask_b32_e64 v3, v4, v3, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v26, v18, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX8-NEXT:    v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    v_and_b32_e32 v3, 1, v6
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v3
+; GFX8-NEXT:    v_and_b32_e32 v3, 1, v7
+; GFX8-NEXT:    buffer_load_dword v7, off, s[0:3], s32
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[4:5], 1, v3
+; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v19
+; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v27
+; GFX8-NEXT:    v_cndmask_b32_e64 v4, v5, v4, s[4:5]
 ; GFX8-NEXT:    v_and_b32_e32 v8, 1, v8
-; GFX8-NEXT:    v_cndmask_b32_e32 v21, v29, v21, vcc
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v9
-; GFX8-NEXT:    v_and_b32_e32 v7, 1, v7
-; GFX8-NEXT:    v_cndmask_b32_e32 v11, v30, v11, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v27, v19, vcc
+; GFX8-NEXT:    v_and_b32_e32 v9, 1, v9
+; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
 ; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v8
-; GFX8-NEXT:    v_lshrrev_b32_e32 v10, 16, v19
-; GFX8-NEXT:    v_lshrrev_b32_e32 v29, 16, v27
-; GFX8-NEXT:    v_and_b32_e32 v6, 1, v6
-; GFX8-NEXT:    v_cndmask_b32_e32 v20, v28, v20, vcc
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v7
-; GFX8-NEXT:    v_and_b32_e32 v5, 1, v5
-; GFX8-NEXT:    v_cndmask_b32_e32 v7, v29, v10, vcc
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v6
-; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 16, v18
-; GFX8-NEXT:    v_lshrrev_b32_e32 v30, 16, v26
-; GFX8-NEXT:    v_and_b32_e32 v4, 1, v4
-; GFX8-NEXT:    v_cndmask_b32_e32 v6, v27, v19, vcc
+; GFX8-NEXT:    v_and_b32_e32 v5, 1, v11
+; GFX8-NEXT:    v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    v_cndmask_b32_e32 v4, v28, v20, vcc
+; GFX8-NEXT:    v_lshrrev_b32_e32 v8, 16, v20
+; GFX8-NEXT:    v_lshrrev_b32_e32 v11, 16, v28
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v9
+; GFX8-NEXT:    v_and_b32_e32 v10, 1, v10
+; GFX8-NEXT:    v_cndmask_b32_e32 v8, v11, v8, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v10
+; GFX8-NEXT:    v_or_b32_sdwa v4, v4, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    v_cndmask_b32_e32 v8, v29, v21, vcc
+; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 16, v21
+; GFX8-NEXT:    v_lshrrev_b32_e32 v10, 16, v29
 ; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v5
+; GFX8-NEXT:    v_and_b32_e32 v12, 1, v12
+; GFX8-NEXT:    v_cndmask_b32_e32 v5, v10, v9, vcc
+; GFX8-NEXT:    v_and_b32_e32 v6, 1, v13
+; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v12
+; GFX8-NEXT:    v_or_b32_sdwa v5, v8, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    v_cndmask_b32_e32 v8, v30, v22, vcc
+; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 16, v22
+; GFX8-NEXT:    v_lshrrev_b32_e32 v10, 16, v30
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v6
 ; GFX8-NEXT:    v_and_b32_e32 v14, 1, v14
-; GFX8-NEXT:    v_cndmask_b32_e32 v5, v30, v9, vcc
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v4
+; GFX8-NEXT:    v_cndmask_b32_e32 v6, v10, v9, vcc
 ; GFX8-NEXT:    v_and_b32_e32 v15, 1, v15
-; GFX8-NEXT:    v_cndmask_b32_e32 v4, v26, v18, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
 ; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v14
-; GFX8-NEXT:    v_lshrrev_b32_e32 v8, 16, v23
-; GFX8-NEXT:    v_and_b32_e32 v3, 1, v3
-; GFX8-NEXT:    v_lshrrev_b32_e32 v28, 16, v17
-; GFX8-NEXT:    v_and_b32_e32 v2, 1, v2
-; GFX8-NEXT:    v_and_b32_e32 v1, 1, v1
-; GFX8-NEXT:    v_lshrrev_b32_e32 v14, 16, v24
-; GFX8-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX8-NEXT:    v_or_b32_sdwa v6, v8, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 16, v23
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_cndmask_b32_e32 v9, v31, v23, vcc
-; GFX8-NEXT:    v_lshrrev_b32_e32 v10, 16, v31
+; GFX8-NEXT:    v_cndmask_b32_e32 v8, v7, v23, vcc
+; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
 ; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v15
-; GFX8-NEXT:    v_cndmask_b32_e32 v8, v10, v8, vcc
-; GFX8-NEXT:    v_lshrrev_b32_e32 v10, 16, v25
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v3
-; GFX8-NEXT:    v_cndmask_b32_e32 v3, v10, v28, vcc
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v2
-; GFX8-NEXT:    v_lshrrev_b32_e32 v10, 16, v16
-; GFX8-NEXT:    v_cndmask_b32_e32 v2, v25, v17, vcc
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v1
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v14, v10, vcc
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v24, v16, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
-; GFX8-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v5
-; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v7
-; GFX8-NEXT:    v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX8-NEXT:    v_or_b32_sdwa v3, v6, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v11
-; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v12
-; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v13
-; GFX8-NEXT:    v_lshlrev_b32_e32 v7, 16, v8
-; GFX8-NEXT:    v_or_b32_sdwa v4, v20, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX8-NEXT:    v_or_b32_sdwa v5, v21, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX8-NEXT:    v_or_b32_sdwa v6, v22, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX8-NEXT:    v_or_b32_sdwa v7, v9, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    v_cndmask_b32_e32 v7, v7, v9, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
+; GFX8-NEXT:    v_or_b32_sdwa v7, v8, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_vselect_v16bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v13, 1, v13
-; GFX9-NEXT:    v_lshrrev_b32_e32 v31, 16, v22
-; GFX9-NEXT:    v_lshrrev_b32_e32 v32, 16, v30
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v13
-; GFX9-NEXT:    v_cndmask_b32_e32 v13, v32, v31, vcc
-; GFX9-NEXT:    buffer_load_dword v31, off, s[0:3], s32
-; GFX9-NEXT:    v_and_b32_e32 v12, 1, v12
-; GFX9-NEXT:    v_and_b32_e32 v11, 1, v11
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v12
-; GFX9-NEXT:    v_lshrrev_b32_e32 v12, 16, v21
-; GFX9-NEXT:    v_cndmask_b32_e32 v22, v30, v22, vcc
-; GFX9-NEXT:    v_lshrrev_b32_e32 v30, 16, v29
-; GFX9-NEXT:    v_and_b32_e32 v10, 1, v10
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v11
-; GFX9-NEXT:    v_and_b32_e32 v9, 1, v9
-; GFX9-NEXT:    v_cndmask_b32_e32 v12, v30, v12, vcc
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v10
-; GFX9-NEXT:    v_lshrrev_b32_e32 v11, 16, v20
-; GFX9-NEXT:    v_lshrrev_b32_e32 v30, 16, v28
+; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
+; GFX9-NEXT:    v_and_b32_e32 v0, 1, v1
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[4:5], 1, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v24, v16, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v16
+; GFX9-NEXT:    v_lshrrev_b32_e32 v16, 16, v24
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, v16, v1, s[4:5]
+; GFX9-NEXT:    s_mov_b32 s6, 0x5040100
+; GFX9-NEXT:    v_perm_b32 v0, v1, v0, s6
+; GFX9-NEXT:    v_and_b32_e32 v1, 1, v2
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v1
+; GFX9-NEXT:    v_and_b32_e32 v1, 1, v3
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[4:5], 1, v1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v17
+; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v25
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v25, v17, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, v3, v2, s[4:5]
+; GFX9-NEXT:    v_perm_b32 v1, v2, v1, s6
+; GFX9-NEXT:    v_and_b32_e32 v2, 1, v4
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v2
+; GFX9-NEXT:    v_and_b32_e32 v2, 1, v5
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[4:5], 1, v2
+; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v18
+; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 16, v26
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v26, v18, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v3, v4, v3, s[4:5]
+; GFX9-NEXT:    v_perm_b32 v2, v3, v2, s6
+; GFX9-NEXT:    v_and_b32_e32 v3, 1, v6
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v3
+; GFX9-NEXT:    v_and_b32_e32 v3, 1, v7
+; GFX9-NEXT:    buffer_load_dword v7, off, s[0:3], s32
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[4:5], 1, v3
+; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 16, v19
+; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 16, v27
 ; GFX9-NEXT:    v_and_b32_e32 v8, 1, v8
-; GFX9-NEXT:    v_cndmask_b32_e32 v21, v29, v21, vcc
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v9
-; GFX9-NEXT:    v_and_b32_e32 v7, 1, v7
-; GFX9-NEXT:    v_cndmask_b32_e32 v11, v30, v11, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v27, v19, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v4, v5, v4, s[4:5]
+; GFX9-NEXT:    v_and_b32_e32 v9, 1, v9
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v8
-; GFX9-NEXT:    v_lshrrev_b32_e32 v10, 16, v19
-; GFX9-NEXT:    v_lshrrev_b32_e32 v29, 16, v27
-; GFX9-NEXT:    v_and_b32_e32 v6, 1, v6
-; GFX9-NEXT:    v_cndmask_b32_e32 v20, v28, v20, vcc
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v7
-; GFX9-NEXT:    v_and_b32_e32 v5, 1, v5
-; GFX9-NEXT:    v_cndmask_b32_e32 v7, v29, v10, vcc
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v6
-; GFX9-NEXT:    v_lshrrev_b32_e32 v9, 16, v18
-; GFX9-NEXT:    v_lshrrev_b32_e32 v30, 16, v26
-; GFX9-NEXT:    v_and_b32_e32 v4, 1, v4
-; GFX9-NEXT:    v_cndmask_b32_e32 v6, v27, v19, vcc
+; GFX9-NEXT:    v_and_b32_e32 v5, 1, v11
+; GFX9-NEXT:    v_and_b32_e32 v10, 1, v10
+; GFX9-NEXT:    v_perm_b32 v3, v4, v3, s6
+; GFX9-NEXT:    v_cndmask_b32_e32 v4, v28, v20, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v8, 16, v20
+; GFX9-NEXT:    v_lshrrev_b32_e32 v11, 16, v28
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v9
+; GFX9-NEXT:    v_cndmask_b32_e32 v8, v11, v8, vcc
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v10
+; GFX9-NEXT:    v_and_b32_e32 v12, 1, v12
+; GFX9-NEXT:    v_perm_b32 v4, v8, v4, s6
+; GFX9-NEXT:    v_cndmask_b32_e32 v8, v29, v21, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v9, 16, v21
+; GFX9-NEXT:    v_lshrrev_b32_e32 v10, 16, v29
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v5
+; GFX9-NEXT:    v_and_b32_e32 v6, 1, v13
+; GFX9-NEXT:    v_cndmask_b32_e32 v5, v10, v9, vcc
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v12
 ; GFX9-NEXT:    v_and_b32_e32 v14, 1, v14
-; GFX9-NEXT:    v_cndmask_b32_e32 v5, v30, v9, vcc
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v4
+; GFX9-NEXT:    v_perm_b32 v5, v5, v8, s6
+; GFX9-NEXT:    v_cndmask_b32_e32 v8, v30, v22, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v9, 16, v22
+; GFX9-NEXT:    v_lshrrev_b32_e32 v10, 16, v30
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v6
 ; GFX9-NEXT:    v_and_b32_e32 v15, 1, v15
-; GFX9-NEXT:    v_cndmask_b32_e32 v4, v26, v18, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v6, v10, v9, vcc
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v14
-; GFX9-NEXT:    v_lshrrev_b32_e32 v8, 16, v23
-; GFX9-NEXT:    v_and_b32_e32 v3, 1, v3
-; GFX9-NEXT:    v_lshrrev_b32_e32 v28, 16, v17
-; GFX9-NEXT:    v_and_b32_e32 v2, 1, v2
-; GFX9-NEXT:    v_and_b32_e32 v1, 1, v1
-; GFX9-NEXT:    v_lshrrev_b32_e32 v14, 16, v24
-; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX9-NEXT:    v_perm_b32 v6, v6, v8, s6
+; GFX9-NEXT:    v_lshrrev_b32_e32 v9, 16, v23
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_cndmask_b32_e32 v9, v31, v23, vcc
-; GFX9-NEXT:    v_lshrrev_b32_e32 v10, 16, v31
+; GFX9-NEXT:    v_cndmask_b32_e32 v8, v7, v23, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v15
-; GFX9-NEXT:    v_cndmask_b32_e32 v8, v10, v8, vcc
-; GFX9-NEXT:    v_lshrrev_b32_e32 v10, 16, v25
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v3
-; GFX9-NEXT:    v_cndmask_b32_e32 v3, v10, v28, vcc
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v2
-; GFX9-NEXT:    v_lshrrev_b32_e32 v10, 16, v16
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, v25, v17, vcc
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v1
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v14, v10, vcc
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v24, v16, vcc
-; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX9-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
-; GFX9-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v5
-; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 16, v7
-; GFX9-NEXT:    v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX9-NEXT:    v_or_b32_sdwa v3, v6, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 16, v11
-; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 16, v12
-; GFX9-NEXT:    v_lshlrev_b32_e32 v6, 16, v13
-; GFX9-NEXT:    v_lshlrev_b32_e32 v7, 16, v8
-; GFX9-NEXT:    v_or_b32_sdwa v4, v20, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX9-NEXT:    v_or_b32_sdwa v5, v21, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX9-NEXT:    v_or_b32_sdwa v6, v22, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX9-NEXT:    v_or_b32_sdwa v7, v9, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT:    v_cndmask_b32_e32 v7, v7, v9, vcc
+; GFX9-NEXT:    v_perm_b32 v7, v7, v8, s6
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_vselect_v16bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    buffer_load_dword v31, off, s[0:3], s32
-; GFX10-NEXT:    v_and_b32_e32 v13, 1, v13
-; GFX10-NEXT:    v_lshrrev_b32_e32 v52, 16, v22
-; GFX10-NEXT:    v_lshrrev_b32_e32 v53, 16, v30
-; GFX10-NEXT:    v_and_b32_e32 v12, 1, v12
-; GFX10-NEXT:    v_and_b32_e32 v11, 1, v11
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v13
-; GFX10-NEXT:    v_lshrrev_b32_e32 v50, 16, v21
-; GFX10-NEXT:    v_lshrrev_b32_e32 v51, 16, v29
-; GFX10-NEXT:    v_and_b32_e32 v10, 1, v10
-; GFX10-NEXT:    v_and_b32_e32 v9, 1, v9
-; GFX10-NEXT:    v_cndmask_b32_e32 v13, v53, v52, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v12
-; GFX10-NEXT:    v_lshrrev_b32_e32 v48, 16, v20
-; GFX10-NEXT:    v_lshrrev_b32_e32 v49, 16, v28
-; GFX10-NEXT:    v_and_b32_e32 v8, 1, v8
-; GFX10-NEXT:    v_and_b32_e32 v7, 1, v7
-; GFX10-NEXT:    v_cndmask_b32_e32 v12, v30, v22, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v11
-; GFX10-NEXT:    v_lshrrev_b32_e32 v38, 16, v19
-; GFX10-NEXT:    v_lshrrev_b32_e32 v39, 16, v27
-; GFX10-NEXT:    v_and_b32_e32 v6, 1, v6
-; GFX10-NEXT:    v_and_b32_e32 v5, 1, v5
-; GFX10-NEXT:    v_cndmask_b32_e32 v11, v51, v50, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v10
-; GFX10-NEXT:    v_lshrrev_b32_e32 v36, 16, v18
-; GFX10-NEXT:    v_lshrrev_b32_e32 v37, 16, v26
-; GFX10-NEXT:    v_and_b32_e32 v4, 1, v4
-; GFX10-NEXT:    v_and_b32_e32 v3, 1, v3
-; GFX10-NEXT:    v_cndmask_b32_e32 v10, v29, v21, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v9
-; GFX10-NEXT:    v_lshrrev_b32_e32 v34, 16, v17
-; GFX10-NEXT:    v_lshrrev_b32_e32 v35, 16, v25
+; GFX10-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX10-NEXT:    v_and_b32_e32 v1, 1, v1
+; GFX10-NEXT:    v_and_b32_e32 v2, 1, v2
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v32, 16, v16
-; GFX10-NEXT:    v_cndmask_b32_e32 v9, v49, v48, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v8
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v33, 16, v24
-; GFX10-NEXT:    v_and_b32_e32 v2, 1, v2
-; GFX10-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX10-NEXT:    v_and_b32_e32 v15, 1, v15
-; GFX10-NEXT:    v_cndmask_b32_e32 v8, v28, v20, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v7
-; GFX10-NEXT:    v_lshrrev_b32_e32 v54, 16, v23
-; GFX10-NEXT:    v_and_b32_e32 v14, 1, v14
-; GFX10-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
-; GFX10-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
-; GFX10-NEXT:    v_cndmask_b32_e32 v7, v39, v38, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v6
-; GFX10-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
-; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
-; GFX10-NEXT:    v_cndmask_b32_e32 v6, v27, v19, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v5
-; GFX10-NEXT:    v_cndmask_b32_e32 v5, v37, v36, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v4
-; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX10-NEXT:    v_cndmask_b32_e32 v4, v26, v18, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v3
-; GFX10-NEXT:    v_cndmask_b32_e32 v3, v35, v34, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
+; GFX10-NEXT:    v_and_b32_e32 v3, 1, v3
+; GFX10-NEXT:    v_and_b32_e32 v4, 1, v4
+; GFX10-NEXT:    v_lshrrev_b32_e32 v34, 16, v17
+; GFX10-NEXT:    v_lshrrev_b32_e32 v35, 16, v25
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v24, v16, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v1
-; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX10-NEXT:    v_and_b32_e32 v5, 1, v5
+; GFX10-NEXT:    v_and_b32_e32 v6, 1, v6
+; GFX10-NEXT:    v_lshrrev_b32_e32 v36, 16, v18
+; GFX10-NEXT:    v_lshrrev_b32_e32 v37, 16, v26
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v33, v32, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v2
-; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX10-NEXT:    v_and_b32_e32 v7, 1, v7
+; GFX10-NEXT:    v_and_b32_e32 v8, 1, v8
+; GFX10-NEXT:    v_lshrrev_b32_e32 v38, 16, v19
+; GFX10-NEXT:    v_lshrrev_b32_e32 v39, 16, v27
 ; GFX10-NEXT:    v_cndmask_b32_e32 v2, v25, v17, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v24, v16, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v15
-; GFX10-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX10-NEXT:    v_or_b32_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX10-NEXT:    v_or_b32_sdwa v2, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX10-NEXT:    v_or_b32_sdwa v3, v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX10-NEXT:    v_or_b32_sdwa v4, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX10-NEXT:    v_or_b32_sdwa v5, v10, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX10-NEXT:    v_or_b32_sdwa v6, v12, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_lshrrev_b32_e32 v16, 16, v31
-; GFX10-NEXT:    v_cndmask_b32_e32 v15, v16, v54, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v3
+; GFX10-NEXT:    v_and_b32_e32 v9, 1, v9
+; GFX10-NEXT:    v_and_b32_e32 v10, 1, v10
+; GFX10-NEXT:    v_lshrrev_b32_e32 v48, 16, v20
+; GFX10-NEXT:    v_lshrrev_b32_e32 v49, 16, v28
+; GFX10-NEXT:    v_cndmask_b32_e32 v3, v35, v34, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v4
+; GFX10-NEXT:    v_and_b32_e32 v11, 1, v11
+; GFX10-NEXT:    v_and_b32_e32 v12, 1, v12
+; GFX10-NEXT:    v_lshrrev_b32_e32 v50, 16, v21
+; GFX10-NEXT:    v_lshrrev_b32_e32 v51, 16, v29
+; GFX10-NEXT:    v_cndmask_b32_e32 v4, v26, v18, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v5
+; GFX10-NEXT:    v_and_b32_e32 v13, 1, v13
+; GFX10-NEXT:    v_and_b32_e32 v14, 1, v14
+; GFX10-NEXT:    v_lshrrev_b32_e32 v52, 16, v22
+; GFX10-NEXT:    v_lshrrev_b32_e32 v53, 16, v30
+; GFX10-NEXT:    v_cndmask_b32_e32 v5, v37, v36, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v6
+; GFX10-NEXT:    v_and_b32_e32 v15, 1, v15
+; GFX10-NEXT:    v_lshrrev_b32_e32 v54, 16, v23
+; GFX10-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
+; GFX10-NEXT:    v_perm_b32 v1, v3, v2, 0x5040100
+; GFX10-NEXT:    v_cndmask_b32_e32 v6, v27, v19, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v7
+; GFX10-NEXT:    v_perm_b32 v2, v5, v4, 0x5040100
+; GFX10-NEXT:    v_cndmask_b32_e32 v7, v39, v38, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v8
+; GFX10-NEXT:    v_cndmask_b32_e32 v8, v28, v20, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v9
+; GFX10-NEXT:    v_cndmask_b32_e32 v9, v49, v48, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v10
+; GFX10-NEXT:    v_perm_b32 v4, v9, v8, 0x5040100
+; GFX10-NEXT:    v_cndmask_b32_e32 v10, v29, v21, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v11
+; GFX10-NEXT:    v_cndmask_b32_e32 v11, v51, v50, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v12
+; GFX10-NEXT:    v_perm_b32 v5, v11, v10, 0x5040100
+; GFX10-NEXT:    v_cndmask_b32_e32 v12, v30, v22, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v13
+; GFX10-NEXT:    v_cndmask_b32_e32 v13, v53, v52, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v14
-; GFX10-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 16, v31
 ; GFX10-NEXT:    v_cndmask_b32_e32 v14, v31, v23, vcc_lo
-; GFX10-NEXT:    v_or_b32_sdwa v7, v14, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v15
+; GFX10-NEXT:    v_cndmask_b32_e32 v15, v3, v54, vcc_lo
+; GFX10-NEXT:    v_perm_b32 v3, v7, v6, 0x5040100
+; GFX10-NEXT:    v_perm_b32 v6, v13, v12, 0x5040100
+; GFX10-NEXT:    v_perm_b32 v7, v15, v14, 0x5040100
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_vselect_v16bf16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    scratch_load_b32 v31, off, s32
-; GFX11-NEXT:    v_and_b32_e32 v13, 1, v13
+; GFX11-NEXT:    v_lshrrev_b32_e32 v54, 16, v23
+; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX11-NEXT:    v_and_b32_e32 v1, 1, v1
+; GFX11-NEXT:    v_lshrrev_b32_e32 v32, 16, v16
+; GFX11-NEXT:    v_lshrrev_b32_e32 v33, 16, v24
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v52, 16, v22
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
+; GFX11-NEXT:    v_and_b32_e32 v15, 1, v15
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v53, 16, v30
-; GFX11-NEXT:    v_lshrrev_b32_e32 v54, 16, v23
-; GFX11-NEXT:    v_and_b32_e32 v12, 1, v12
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v13
 ; GFX11-NEXT:    v_and_b32_e32 v2, 1, v2
+; GFX11-NEXT:    v_dual_cndmask_b32 v0, v24, v16 :: v_dual_and_b32 v3, 1, v3
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v1
+; GFX11-NEXT:    v_and_b32_e32 v14, 1, v14
+; GFX11-NEXT:    v_lshrrev_b32_e32 v34, 16, v17
+; GFX11-NEXT:    v_lshrrev_b32_e32 v35, 16, v25
+; GFX11-NEXT:    v_lshrrev_b32_e32 v50, 16, v21
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, v33, v32, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v2
 ; GFX11-NEXT:    v_and_b32_e32 v11, 1, v11
+; GFX11-NEXT:    v_lshrrev_b32_e32 v51, 16, v29
+; GFX11-NEXT:    v_and_b32_e32 v4, 1, v4
+; GFX11-NEXT:    v_and_b32_e32 v5, 1, v5
+; GFX11-NEXT:    v_cndmask_b32_e32 v2, v25, v17, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v3
+; GFX11-NEXT:    v_and_b32_e32 v12, 1, v12
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v36, 16, v18
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v37, 16, v26
-; GFX11-NEXT:    v_cndmask_b32_e32 v13, v53, v52, vcc_lo
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v12
-; GFX11-NEXT:    v_lshrrev_b32_e32 v50, 16, v21
-; GFX11-NEXT:    v_lshrrev_b32_e32 v51, 16, v29
-; GFX11-NEXT:    v_and_b32_e32 v1, 1, v1
-; GFX11-NEXT:    v_and_b32_e32 v10, 1, v10
-; GFX11-NEXT:    v_cndmask_b32_e32 v12, v30, v22, vcc_lo
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v11
-; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX11-NEXT:    v_and_b32_e32 v9, 1, v9
-; GFX11-NEXT:    v_lshrrev_b32_e32 v34, 16, v17
-; GFX11-NEXT:    v_lshrrev_b32_e32 v35, 16, v25
-; GFX11-NEXT:    v_cndmask_b32_e32 v11, v51, v50, vcc_lo
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v10
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v48, 16, v20
+; GFX11-NEXT:    v_cndmask_b32_e32 v3, v35, v34, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v4
+; GFX11-NEXT:    v_and_b32_e32 v13, 1, v13
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v49, 16, v28
-; GFX11-NEXT:    v_and_b32_e32 v3, 1, v3
-; GFX11-NEXT:    v_and_b32_e32 v8, 1, v8
-; GFX11-NEXT:    v_cndmask_b32_e32 v10, v29, v21, vcc_lo
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v9
 ; GFX11-NEXT:    v_and_b32_e32 v6, 1, v6
-; GFX11-NEXT:    v_and_b32_e32 v7, 1, v7
+; GFX11-NEXT:    v_dual_cndmask_b32 v4, v26, v18 :: v_dual_and_b32 v7, 1, v7
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v5
+; GFX11-NEXT:    v_and_b32_e32 v10, 1, v10
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v38, 16, v19
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v39, 16, v27
-; GFX11-NEXT:    v_cndmask_b32_e32 v9, v49, v48, vcc_lo
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v8
-; GFX11-NEXT:    v_and_b32_e32 v5, 1, v5
-; GFX11-NEXT:    v_lshrrev_b32_e32 v32, 16, v16
-; GFX11-NEXT:    v_lshrrev_b32_e32 v33, 16, v24
-; GFX11-NEXT:    v_dual_cndmask_b32 v8, v28, v20 :: v_dual_and_b32 v15, 1, v15
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v7
-; GFX11-NEXT:    v_and_b32_e32 v4, 1, v4
-; GFX11-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
-; GFX11-NEXT:    v_and_b32_e32 v10, 0xffff, v10
-; GFX11-NEXT:    v_dual_cndmask_b32 v7, v39, v38 :: v_dual_and_b32 v8, 0xffff, v8
+; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
+; GFX11-NEXT:    v_cndmask_b32_e32 v5, v37, v36, vcc_lo
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v6
-; GFX11-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
-; GFX11-NEXT:    v_and_b32_e32 v12, 0xffff, v12
-; GFX11-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
-; GFX11-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
+; GFX11-NEXT:    v_and_b32_e32 v9, 1, v9
+; GFX11-NEXT:    v_perm_b32 v1, v3, v2, 0x5040100
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_perm_b32 v2, v5, v4, 0x5040100
 ; GFX11-NEXT:    v_cndmask_b32_e32 v6, v27, v19, vcc_lo
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v5
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_dual_cndmask_b32 v5, v37, v36 :: v_dual_and_b32 v6, 0xffff, v6
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v3
-; GFX11-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX11-NEXT:    v_cndmask_b32_e32 v3, v35, v34, vcc_lo
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v1
-; GFX11-NEXT:    v_and_b32_e32 v14, 1, v14
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-NEXT:    v_cndmask_b32_e32 v1, v33, v32, vcc_lo
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v24, v16, vcc_lo
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v2
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT:    v_cndmask_b32_e32 v2, v25, v17, vcc_lo
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v4
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX11-NEXT:    v_cndmask_b32_e32 v4, v26, v18, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v7
+; GFX11-NEXT:    v_dual_cndmask_b32 v7, v39, v38 :: v_dual_and_b32 v8, 1, v8
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v8
+; GFX11-NEXT:    v_cndmask_b32_e32 v8, v28, v20, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v9
+; GFX11-NEXT:    v_cndmask_b32_e32 v9, v49, v48, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v10
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_perm_b32 v4, v9, v8, 0x5040100
+; GFX11-NEXT:    v_cndmask_b32_e32 v10, v29, v21, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v11
+; GFX11-NEXT:    v_cndmask_b32_e32 v11, v51, v50, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v12
+; GFX11-NEXT:    v_perm_b32 v5, v11, v10, 0x5040100
+; GFX11-NEXT:    v_cndmask_b32_e32 v12, v30, v22, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v13
+; GFX11-NEXT:    v_cndmask_b32_e32 v13, v53, v52, vcc_lo
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v14
-; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xffff, v4
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v31
 ; GFX11-NEXT:    v_cndmask_b32_e32 v14, v31, v23, vcc_lo
-; GFX11-NEXT:    v_lshrrev_b32_e32 v16, 16, v31
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v15
-; GFX11-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX11-NEXT:    v_or_b32_e32 v1, v2, v3
-; GFX11-NEXT:    v_and_b32_e32 v14, 0xffff, v14
-; GFX11-NEXT:    v_or_b32_e32 v2, v4, v5
-; GFX11-NEXT:    v_cndmask_b32_e32 v15, v16, v54, vcc_lo
-; GFX11-NEXT:    v_or_b32_e32 v3, v6, v7
-; GFX11-NEXT:    v_or_b32_e32 v4, v8, v9
-; GFX11-NEXT:    v_or_b32_e32 v5, v10, v11
-; GFX11-NEXT:    v_or_b32_e32 v6, v12, v13
-; GFX11-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_or_b32_e32 v7, v14, v15
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_cndmask_b32_e32 v15, v3, v54, vcc_lo
+; GFX11-NEXT:    v_perm_b32 v3, v7, v6, 0x5040100
+; GFX11-NEXT:    v_perm_b32 v6, v13, v12, 0x5040100
+; GFX11-NEXT:    v_perm_b32 v7, v15, v14, 0x5040100
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = select <16 x i1> %cond, <16 x bfloat> %a, <16 x bfloat> %b
   ret <16 x bfloat> %op
@@ -30760,463 +29246,571 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x
 ; GFX8-LABEL: v_vselect_v32bf16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_and_b32_e32 v22, 1, v22
-; GFX8-NEXT:    v_cmp_eq_u32_e64 s[6:7], 1, v22
-; GFX8-NEXT:    v_and_b32_e32 v22, 1, v23
-; GFX8-NEXT:    v_cmp_eq_u32_e64 s[8:9], 1, v22
-; GFX8-NEXT:    buffer_load_ushort v22, off, s[0:3], s32
-; GFX8-NEXT:    v_and_b32_e32 v28, 1, v28
-; GFX8-NEXT:    v_and_b32_e32 v24, 1, v24
-; GFX8-NEXT:    v_cmp_eq_u32_e64 s[10:11], 1, v28
-; GFX8-NEXT:    v_and_b32_e32 v28, 1, v29
-; GFX8-NEXT:    v_cmp_eq_u32_e64 s[18:19], 1, v24
-; GFX8-NEXT:    v_and_b32_e32 v24, 1, v25
-; GFX8-NEXT:    v_cmp_eq_u32_e64 s[12:13], 1, v28
-; GFX8-NEXT:    v_cmp_eq_u32_e64 s[20:21], 1, v24
-; GFX8-NEXT:    v_and_b32_e32 v26, 1, v26
-; GFX8-NEXT:    v_cmp_eq_u32_e64 s[14:15], 1, v26
-; GFX8-NEXT:    v_and_b32_e32 v26, 1, v27
-; GFX8-NEXT:    v_cmp_eq_u32_e64 s[16:17], 1, v26
-; GFX8-NEXT:    v_and_b32_e32 v30, 1, v30
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v30
+; GFX8-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX8-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
+; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX8-NEXT:    v_writelane_b32 v31, s30, 0
+; GFX8-NEXT:    v_writelane_b32 v31, s31, 1
+; GFX8-NEXT:    v_writelane_b32 v31, s34, 2
+; GFX8-NEXT:    v_writelane_b32 v31, s35, 3
+; GFX8-NEXT:    v_writelane_b32 v31, s36, 4
+; GFX8-NEXT:    v_writelane_b32 v31, s37, 5
 ; GFX8-NEXT:    v_and_b32_e32 v21, 1, v21
-; GFX8-NEXT:    v_and_b32_e32 v20, 1, v20
-; GFX8-NEXT:    v_and_b32_e32 v19, 1, v19
 ; GFX8-NEXT:    v_and_b32_e32 v18, 1, v18
+; GFX8-NEXT:    v_writelane_b32 v31, s38, 6
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[22:23], 1, v21
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[28:29], 1, v18
+; GFX8-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:68
+; GFX8-NEXT:    buffer_load_dword v21, off, s[0:3], s32 offset:4
 ; GFX8-NEXT:    v_and_b32_e32 v17, 1, v17
 ; GFX8-NEXT:    v_and_b32_e32 v16, 1, v16
+; GFX8-NEXT:    v_writelane_b32 v31, s39, 7
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[30:31], 1, v17
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[34:35], 1, v16
+; GFX8-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:72
+; GFX8-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:8
 ; GFX8-NEXT:    v_and_b32_e32 v15, 1, v15
 ; GFX8-NEXT:    v_and_b32_e32 v14, 1, v14
+; GFX8-NEXT:    v_writelane_b32 v31, s40, 8
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[36:37], 1, v15
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[38:39], 1, v14
+; GFX8-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:76
+; GFX8-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:12
+; GFX8-NEXT:    v_writelane_b32 v31, s41, 9
+; GFX8-NEXT:    v_writelane_b32 v31, s42, 10
 ; GFX8-NEXT:    v_and_b32_e32 v13, 1, v13
 ; GFX8-NEXT:    v_and_b32_e32 v12, 1, v12
-; GFX8-NEXT:    v_and_b32_e32 v11, 1, v11
-; GFX8-NEXT:    v_and_b32_e32 v10, 1, v10
-; GFX8-NEXT:    v_and_b32_e32 v9, 1, v9
+; GFX8-NEXT:    v_writelane_b32 v31, s43, 11
+; GFX8-NEXT:    v_and_b32_e32 v20, 1, v20
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[40:41], 1, v13
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[42:43], 1, v12
+; GFX8-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:80
+; GFX8-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:16
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[24:25], 1, v20
+; GFX8-NEXT:    buffer_load_ushort v20, off, s[0:3], s32
+; GFX8-NEXT:    v_writelane_b32 v31, s44, 12
+; GFX8-NEXT:    v_writelane_b32 v31, s45, 13
+; GFX8-NEXT:    v_writelane_b32 v31, s46, 14
+; GFX8-NEXT:    v_writelane_b32 v31, s47, 15
+; GFX8-NEXT:    v_writelane_b32 v31, s48, 16
+; GFX8-NEXT:    v_writelane_b32 v31, s49, 17
+; GFX8-NEXT:    v_writelane_b32 v31, s50, 18
+; GFX8-NEXT:    v_writelane_b32 v31, s51, 19
+; GFX8-NEXT:    v_writelane_b32 v31, s52, 20
+; GFX8-NEXT:    v_writelane_b32 v31, s53, 21
+; GFX8-NEXT:    v_writelane_b32 v31, s54, 22
+; GFX8-NEXT:    v_writelane_b32 v31, s55, 23
+; GFX8-NEXT:    v_writelane_b32 v31, s56, 24
+; GFX8-NEXT:    v_writelane_b32 v31, s57, 25
+; GFX8-NEXT:    v_writelane_b32 v31, s58, 26
+; GFX8-NEXT:    v_writelane_b32 v31, s59, 27
+; GFX8-NEXT:    v_writelane_b32 v31, s60, 28
+; GFX8-NEXT:    v_writelane_b32 v31, s61, 29
+; GFX8-NEXT:    v_writelane_b32 v31, s62, 30
+; GFX8-NEXT:    v_writelane_b32 v31, s63, 31
+; GFX8-NEXT:    v_writelane_b32 v31, s64, 32
 ; GFX8-NEXT:    v_and_b32_e32 v8, 1, v8
 ; GFX8-NEXT:    v_and_b32_e32 v7, 1, v7
-; GFX8-NEXT:    v_and_b32_e32 v6, 1, v6
-; GFX8-NEXT:    v_and_b32_e32 v5, 1, v5
-; GFX8-NEXT:    v_and_b32_e32 v4, 1, v4
-; GFX8-NEXT:    v_and_b32_e32 v3, 1, v3
+; GFX8-NEXT:    v_writelane_b32 v31, s65, 33
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[50:51], 1, v8
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[52:53], 1, v7
+; GFX8-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:84
+; GFX8-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:20
 ; GFX8-NEXT:    v_and_b32_e32 v2, 1, v2
 ; GFX8-NEXT:    v_and_b32_e32 v1, 1, v1
+; GFX8-NEXT:    v_writelane_b32 v31, s66, 34
+; GFX8-NEXT:    v_and_b32_e32 v3, 1, v3
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[62:63], 1, v2
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[64:65], 1, v1
 ; GFX8-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_writelane_b32 v31, s67, 35
+; GFX8-NEXT:    v_and_b32_e32 v6, 1, v6
+; GFX8-NEXT:    v_and_b32_e32 v5, 1, v5
+; GFX8-NEXT:    v_and_b32_e32 v4, 1, v4
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[60:61], 1, v3
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[66:67], 1, v0
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[54:55], 1, v6
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[56:57], 1, v5
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[58:59], 1, v4
+; GFX8-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:88
+; GFX8-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:24
+; GFX8-NEXT:    v_and_b32_e32 v10, 1, v10
+; GFX8-NEXT:    v_and_b32_e32 v9, 1, v9
+; GFX8-NEXT:    v_and_b32_e32 v11, 1, v11
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[46:47], 1, v10
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[48:49], 1, v9
+; GFX8-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:92
+; GFX8-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:28
+; GFX8-NEXT:    v_and_b32_e32 v25, 1, v25
+; GFX8-NEXT:    v_and_b32_e32 v24, 1, v24
+; GFX8-NEXT:    v_and_b32_e32 v23, 1, v23
 ; GFX8-NEXT:    v_and_b32_e32 v22, 1, v22
-; GFX8-NEXT:    v_cmp_eq_u32_e64 s[4:5], 1, v22
-; GFX8-NEXT:    buffer_load_dword v22, off, s[0:3], s32 offset:60
-; GFX8-NEXT:    buffer_load_dword v23, off, s[0:3], s32 offset:124
-; GFX8-NEXT:    s_waitcnt vmcnt(1)
-; GFX8-NEXT:    v_lshrrev_b32_e32 v24, 16, v22
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_cndmask_b32_e64 v22, v23, v22, s[10:11]
-; GFX8-NEXT:    v_lshrrev_b32_e32 v23, 16, v23
-; GFX8-NEXT:    v_cndmask_b32_e64 v23, v23, v24, s[12:13]
-; GFX8-NEXT:    buffer_load_dword v24, off, s[0:3], s32 offset:56
-; GFX8-NEXT:    buffer_load_dword v25, off, s[0:3], s32 offset:120
-; GFX8-NEXT:    s_waitcnt vmcnt(1)
-; GFX8-NEXT:    v_lshrrev_b32_e32 v26, 16, v24
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_cndmask_b32_e64 v24, v25, v24, s[14:15]
-; GFX8-NEXT:    v_lshrrev_b32_e32 v25, 16, v25
-; GFX8-NEXT:    v_cndmask_b32_e64 v25, v25, v26, s[16:17]
-; GFX8-NEXT:    buffer_load_dword v26, off, s[0:3], s32 offset:52
-; GFX8-NEXT:    buffer_load_dword v27, off, s[0:3], s32 offset:116
-; GFX8-NEXT:    s_waitcnt vmcnt(1)
-; GFX8-NEXT:    v_lshrrev_b32_e32 v28, 16, v26
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_cndmask_b32_e64 v26, v27, v26, s[18:19]
-; GFX8-NEXT:    v_lshrrev_b32_e32 v27, 16, v27
-; GFX8-NEXT:    v_cndmask_b32_e64 v27, v27, v28, s[20:21]
-; GFX8-NEXT:    buffer_load_dword v28, off, s[0:3], s32 offset:48
-; GFX8-NEXT:    buffer_load_dword v29, off, s[0:3], s32 offset:112
-; GFX8-NEXT:    s_waitcnt vmcnt(1)
-; GFX8-NEXT:    v_lshrrev_b32_e32 v30, 16, v28
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_cndmask_b32_e64 v28, v29, v28, s[6:7]
-; GFX8-NEXT:    v_lshrrev_b32_e32 v29, 16, v29
-; GFX8-NEXT:    v_cndmask_b32_e64 v30, v29, v30, s[8:9]
-; GFX8-NEXT:    buffer_load_dword v29, off, s[0:3], s32 offset:64
-; GFX8-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:128
-; GFX8-NEXT:    s_waitcnt vmcnt(1)
-; GFX8-NEXT:    v_lshrrev_b32_e32 v32, 16, v29
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_cndmask_b32_e32 v29, v31, v29, vcc
-; GFX8-NEXT:    v_lshrrev_b32_e32 v31, 16, v31
-; GFX8-NEXT:    v_cndmask_b32_e64 v31, v31, v32, s[4:5]
-; GFX8-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:44
-; GFX8-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:108
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v21
-; GFX8-NEXT:    s_waitcnt vmcnt(1)
-; GFX8-NEXT:    v_lshrrev_b32_e32 v21, 16, v32
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_lshrrev_b32_e32 v34, 16, v33
-; GFX8-NEXT:    v_cndmask_b32_e32 v21, v34, v21, vcc
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v20
-; GFX8-NEXT:    v_cndmask_b32_e32 v20, v33, v32, vcc
-; GFX8-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:40
-; GFX8-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:104
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v19
-; GFX8-NEXT:    s_waitcnt vmcnt(1)
-; GFX8-NEXT:    v_lshrrev_b32_e32 v19, 16, v32
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_lshrrev_b32_e32 v34, 16, v33
-; GFX8-NEXT:    v_cndmask_b32_e32 v19, v34, v19, vcc
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v18
-; GFX8-NEXT:    v_cndmask_b32_e32 v18, v33, v32, vcc
-; GFX8-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:36
-; GFX8-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:100
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v17
-; GFX8-NEXT:    s_waitcnt vmcnt(1)
-; GFX8-NEXT:    v_lshrrev_b32_e32 v17, 16, v32
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_lshrrev_b32_e32 v34, 16, v33
-; GFX8-NEXT:    v_cndmask_b32_e32 v17, v34, v17, vcc
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v16
-; GFX8-NEXT:    v_cndmask_b32_e32 v16, v33, v32, vcc
-; GFX8-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:32
-; GFX8-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:96
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v15
-; GFX8-NEXT:    s_waitcnt vmcnt(1)
-; GFX8-NEXT:    v_lshrrev_b32_e32 v15, 16, v32
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_lshrrev_b32_e32 v34, 16, v33
-; GFX8-NEXT:    v_cndmask_b32_e32 v15, v34, v15, vcc
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v14
-; GFX8-NEXT:    v_cndmask_b32_e32 v14, v33, v32, vcc
-; GFX8-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:28
-; GFX8-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:92
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v13
-; GFX8-NEXT:    s_waitcnt vmcnt(1)
-; GFX8-NEXT:    v_lshrrev_b32_e32 v13, 16, v32
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_lshrrev_b32_e32 v34, 16, v33
-; GFX8-NEXT:    v_cndmask_b32_e32 v13, v34, v13, vcc
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v12
-; GFX8-NEXT:    v_cndmask_b32_e32 v12, v33, v32, vcc
-; GFX8-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:24
-; GFX8-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:88
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v11
-; GFX8-NEXT:    s_waitcnt vmcnt(1)
-; GFX8-NEXT:    v_lshrrev_b32_e32 v11, 16, v32
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_lshrrev_b32_e32 v34, 16, v33
-; GFX8-NEXT:    v_cndmask_b32_e32 v11, v34, v11, vcc
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v10
-; GFX8-NEXT:    v_cndmask_b32_e32 v10, v33, v32, vcc
-; GFX8-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:20
-; GFX8-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:84
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v9
-; GFX8-NEXT:    s_waitcnt vmcnt(1)
-; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 16, v32
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_lshrrev_b32_e32 v34, 16, v33
-; GFX8-NEXT:    v_cndmask_b32_e32 v9, v34, v9, vcc
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v8
-; GFX8-NEXT:    v_cndmask_b32_e32 v8, v33, v32, vcc
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v7
-; GFX8-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:16
-; GFX8-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:80
-; GFX8-NEXT:    s_waitcnt vmcnt(1)
-; GFX8-NEXT:    v_lshrrev_b32_e32 v33, 16, v7
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_lshrrev_b32_e32 v34, 16, v32
-; GFX8-NEXT:    v_cndmask_b32_e32 v33, v34, v33, vcc
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v6
-; GFX8-NEXT:    v_cndmask_b32_e32 v6, v32, v7, vcc
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v5
-; GFX8-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:12
-; GFX8-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:76
-; GFX8-NEXT:    s_waitcnt vmcnt(1)
-; GFX8-NEXT:    v_lshrrev_b32_e32 v32, 16, v5
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_lshrrev_b32_e32 v34, 16, v7
-; GFX8-NEXT:    v_cndmask_b32_e32 v32, v34, v32, vcc
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v4
-; GFX8-NEXT:    v_cndmask_b32_e32 v4, v7, v5, vcc
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v3
-; GFX8-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:8
-; GFX8-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:72
+; GFX8-NEXT:    v_and_b32_e32 v19, 1, v19
+; GFX8-NEXT:    s_waitcnt vmcnt(14)
+; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v18
+; GFX8-NEXT:    s_waitcnt vmcnt(13)
+; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v21
+; GFX8-NEXT:    v_cndmask_b32_e64 v1, v2, v1, s[64:65]
+; GFX8-NEXT:    v_cndmask_b32_e64 v0, v18, v21, s[66:67]
+; GFX8-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:36
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-NEXT:    s_waitcnt vmcnt(13)
+; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v16
+; GFX8-NEXT:    s_waitcnt vmcnt(12)
+; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v17
+; GFX8-NEXT:    v_cndmask_b32_e64 v2, v3, v2, s[60:61]
+; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    v_cndmask_b32_e64 v1, v16, v17, s[62:63]
+; GFX8-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:40
+; GFX8-NEXT:    buffer_load_dword v21, off, s[0:3], s32 offset:44
+; GFX8-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:56
+; GFX8-NEXT:    s_waitcnt vmcnt(13)
+; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v15
+; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v14
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX8-NEXT:    v_cndmask_b32_e64 v3, v4, v3, s[56:57]
+; GFX8-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    v_cndmask_b32_e64 v2, v14, v15, s[58:59]
+; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX8-NEXT:    v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    s_waitcnt vmcnt(11)
+; GFX8-NEXT:    v_cndmask_b32_e64 v3, v12, v13, s[54:55]
+; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v13
+; GFX8-NEXT:    v_lshrrev_b32_e32 v12, 16, v12
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[44:45], 1, v11
+; GFX8-NEXT:    s_waitcnt vmcnt(10)
+; GFX8-NEXT:    v_and_b32_e32 v11, 1, v20
+; GFX8-NEXT:    v_cndmask_b32_e64 v4, v12, v4, s[52:53]
+; GFX8-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:128
+; GFX8-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:116
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[12:13], 1, v25
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[14:15], 1, v24
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[16:17], 1, v23
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[20:21], 1, v22
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[26:27], 1, v19
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[18:19], 1, v11
+; GFX8-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:112
+; GFX8-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:108
+; GFX8-NEXT:    buffer_load_dword v22, off, s[0:3], s32 offset:96
+; GFX8-NEXT:    buffer_load_dword v23, off, s[0:3], s32 offset:32
+; GFX8-NEXT:    buffer_load_dword v24, off, s[0:3], s32 offset:104
+; GFX8-NEXT:    buffer_load_dword v25, off, s[0:3], s32 offset:100
+; GFX8-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:48
+; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; GFX8-NEXT:    v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:120
+; GFX8-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:124
+; GFX8-NEXT:    v_and_b32_e32 v26, 1, v26
+; GFX8-NEXT:    v_and_b32_e32 v28, 1, v28
+; GFX8-NEXT:    v_and_b32_e32 v27, 1, v27
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[10:11], 1, v26
+; GFX8-NEXT:    v_and_b32_e32 v29, 1, v29
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[6:7], 1, v28
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[8:9], 1, v27
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[4:5], 1, v29
+; GFX8-NEXT:    v_and_b32_e32 v30, 1, v30
+; GFX8-NEXT:    s_waitcnt vmcnt(14)
+; GFX8-NEXT:    v_cndmask_b32_e64 v4, v7, v8, s[50:51]
+; GFX8-NEXT:    v_lshrrev_b32_e32 v8, 16, v8
+; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
+; GFX8-NEXT:    v_cndmask_b32_e64 v7, v7, v8, s[48:49]
+; GFX8-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
+; GFX8-NEXT:    v_or_b32_sdwa v4, v4, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v30
+; GFX8-NEXT:    v_readlane_b32 s67, v31, 35
+; GFX8-NEXT:    v_readlane_b32 s66, v31, 34
+; GFX8-NEXT:    v_readlane_b32 s65, v31, 33
+; GFX8-NEXT:    v_readlane_b32 s64, v31, 32
+; GFX8-NEXT:    v_readlane_b32 s63, v31, 31
+; GFX8-NEXT:    v_readlane_b32 s62, v31, 30
+; GFX8-NEXT:    v_readlane_b32 s61, v31, 29
+; GFX8-NEXT:    v_readlane_b32 s60, v31, 28
+; GFX8-NEXT:    v_readlane_b32 s59, v31, 27
+; GFX8-NEXT:    v_readlane_b32 s58, v31, 26
+; GFX8-NEXT:    v_cndmask_b32_e64 v7, v5, v6, s[46:47]
+; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
+; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
+; GFX8-NEXT:    v_cndmask_b32_e64 v5, v5, v6, s[44:45]
+; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX8-NEXT:    v_or_b32_sdwa v5, v7, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 16, v10
+; GFX8-NEXT:    v_lshrrev_b32_e32 v8, 16, v9
+; GFX8-NEXT:    v_cndmask_b32_e64 v6, v9, v10, s[42:43]
+; GFX8-NEXT:    v_cndmask_b32_e64 v7, v8, v7, s[40:41]
+; GFX8-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
+; GFX8-NEXT:    v_or_b32_sdwa v6, v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    v_readlane_b32 s57, v31, 25
+; GFX8-NEXT:    v_readlane_b32 s56, v31, 24
+; GFX8-NEXT:    v_readlane_b32 s55, v31, 23
+; GFX8-NEXT:    v_readlane_b32 s54, v31, 22
+; GFX8-NEXT:    v_readlane_b32 s53, v31, 21
+; GFX8-NEXT:    v_readlane_b32 s52, v31, 20
+; GFX8-NEXT:    v_readlane_b32 s51, v31, 19
+; GFX8-NEXT:    v_readlane_b32 s50, v31, 18
+; GFX8-NEXT:    v_readlane_b32 s49, v31, 17
+; GFX8-NEXT:    v_readlane_b32 s48, v31, 16
+; GFX8-NEXT:    v_readlane_b32 s47, v31, 15
+; GFX8-NEXT:    v_readlane_b32 s46, v31, 14
+; GFX8-NEXT:    v_readlane_b32 s45, v31, 13
+; GFX8-NEXT:    v_readlane_b32 s44, v31, 12
+; GFX8-NEXT:    v_readlane_b32 s43, v31, 11
+; GFX8-NEXT:    v_readlane_b32 s42, v31, 10
+; GFX8-NEXT:    v_readlane_b32 s41, v31, 9
+; GFX8-NEXT:    v_readlane_b32 s40, v31, 8
+; GFX8-NEXT:    s_waitcnt vmcnt(6)
+; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 16, v22
+; GFX8-NEXT:    s_waitcnt vmcnt(5)
+; GFX8-NEXT:    v_lshrrev_b32_e32 v8, 16, v23
+; GFX8-NEXT:    v_cndmask_b32_e64 v8, v9, v8, s[36:37]
+; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 16, v18
+; GFX8-NEXT:    s_waitcnt vmcnt(3)
+; GFX8-NEXT:    v_lshrrev_b32_e32 v10, 16, v25
+; GFX8-NEXT:    v_cndmask_b32_e64 v7, v22, v23, s[38:39]
+; GFX8-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
+; GFX8-NEXT:    v_cndmask_b32_e64 v9, v10, v9, s[30:31]
+; GFX8-NEXT:    v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    v_cndmask_b32_e64 v8, v25, v18, s[34:35]
+; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
+; GFX8-NEXT:    v_or_b32_sdwa v8, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    v_cndmask_b32_e64 v9, v24, v16, s[28:29]
+; GFX8-NEXT:    v_lshrrev_b32_e32 v10, 16, v16
+; GFX8-NEXT:    v_lshrrev_b32_e32 v16, 16, v24
+; GFX8-NEXT:    v_cndmask_b32_e64 v10, v16, v10, s[26:27]
+; GFX8-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:52
+; GFX8-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
+; GFX8-NEXT:    v_or_b32_sdwa v9, v9, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    v_cndmask_b32_e64 v10, v11, v21, s[24:25]
+; GFX8-NEXT:    v_lshrrev_b32_e32 v16, 16, v21
+; GFX8-NEXT:    v_lshrrev_b32_e32 v11, 16, v11
+; GFX8-NEXT:    v_cndmask_b32_e64 v11, v11, v16, s[22:23]
+; GFX8-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:60
+; GFX8-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
+; GFX8-NEXT:    v_or_b32_sdwa v10, v10, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    s_waitcnt vmcnt(4)
+; GFX8-NEXT:    v_cndmask_b32_e64 v11, v19, v20, s[20:21]
+; GFX8-NEXT:    v_lshrrev_b32_e32 v20, 16, v20
+; GFX8-NEXT:    v_lshrrev_b32_e32 v19, 16, v19
+; GFX8-NEXT:    v_cndmask_b32_e64 v19, v19, v20, s[16:17]
+; GFX8-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:64
+; GFX8-NEXT:    v_lshlrev_b32_e32 v19, 16, v19
+; GFX8-NEXT:    v_or_b32_sdwa v11, v11, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    v_readlane_b32 s39, v31, 7
+; GFX8-NEXT:    v_readlane_b32 s38, v31, 6
+; GFX8-NEXT:    v_readlane_b32 s37, v31, 5
+; GFX8-NEXT:    v_readlane_b32 s36, v31, 4
+; GFX8-NEXT:    v_readlane_b32 s35, v31, 3
+; GFX8-NEXT:    v_readlane_b32 s34, v31, 2
+; GFX8-NEXT:    v_readlane_b32 s31, v31, 1
+; GFX8-NEXT:    v_readlane_b32 s30, v31, 0
+; GFX8-NEXT:    s_waitcnt vmcnt(2)
+; GFX8-NEXT:    v_cndmask_b32_e64 v19, v12, v18, s[14:15]
+; GFX8-NEXT:    v_lshrrev_b32_e32 v18, 16, v18
+; GFX8-NEXT:    v_lshrrev_b32_e32 v12, 16, v12
+; GFX8-NEXT:    v_cndmask_b32_e64 v12, v12, v18, s[12:13]
+; GFX8-NEXT:    v_cndmask_b32_e64 v18, v13, v17, s[10:11]
+; GFX8-NEXT:    v_lshrrev_b32_e32 v17, 16, v17
+; GFX8-NEXT:    v_lshrrev_b32_e32 v13, 16, v13
+; GFX8-NEXT:    v_cndmask_b32_e64 v13, v13, v17, s[8:9]
 ; GFX8-NEXT:    s_waitcnt vmcnt(1)
-; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 16, v3
+; GFX8-NEXT:    v_cndmask_b32_e64 v17, v14, v16, s[6:7]
+; GFX8-NEXT:    v_lshrrev_b32_e32 v16, 16, v16
+; GFX8-NEXT:    v_lshrrev_b32_e32 v14, 16, v14
+; GFX8-NEXT:    v_cndmask_b32_e64 v14, v14, v16, s[4:5]
+; GFX8-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
+; GFX8-NEXT:    v_or_b32_sdwa v14, v17, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_lshrrev_b32_e32 v34, 16, v5
-; GFX8-NEXT:    v_cndmask_b32_e32 v7, v34, v7, vcc
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v2
-; GFX8-NEXT:    v_cndmask_b32_e32 v2, v5, v3, vcc
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v1
-; GFX8-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:4
-; GFX8-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:68
-; GFX8-NEXT:    s_waitcnt vmcnt(1)
-; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
+; GFX8-NEXT:    v_cndmask_b32_e32 v16, v15, v20, vcc
+; GFX8-NEXT:    v_lshrrev_b32_e32 v17, 16, v20
+; GFX8-NEXT:    v_lshrrev_b32_e32 v15, 16, v15
+; GFX8-NEXT:    v_cndmask_b32_e64 v15, v15, v17, s[18:19]
+; GFX8-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
+; GFX8-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
+; GFX8-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
+; GFX8-NEXT:    v_or_b32_sdwa v12, v19, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_sdwa v13, v18, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_sdwa v15, v16, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX8-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
+; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_lshrrev_b32_e32 v34, 16, v3
-; GFX8-NEXT:    v_cndmask_b32_e32 v5, v34, v5, vcc
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v3, v1, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v5
-; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v7
-; GFX8-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v32
-; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v33
-; GFX8-NEXT:    v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX8-NEXT:    v_or_b32_sdwa v3, v6, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v9
-; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v11
-; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v13
-; GFX8-NEXT:    v_lshlrev_b32_e32 v7, 16, v15
-; GFX8-NEXT:    v_or_b32_sdwa v4, v8, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX8-NEXT:    v_or_b32_sdwa v5, v10, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX8-NEXT:    v_or_b32_sdwa v6, v12, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX8-NEXT:    v_or_b32_sdwa v7, v14, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX8-NEXT:    v_lshlrev_b32_e32 v8, 16, v17
-; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 16, v19
-; GFX8-NEXT:    v_lshlrev_b32_e32 v10, 16, v21
-; GFX8-NEXT:    v_lshlrev_b32_e32 v11, 16, v30
-; GFX8-NEXT:    v_lshlrev_b32_e32 v12, 16, v27
-; GFX8-NEXT:    v_lshlrev_b32_e32 v13, 16, v25
-; GFX8-NEXT:    v_lshlrev_b32_e32 v14, 16, v23
-; GFX8-NEXT:    v_lshlrev_b32_e32 v15, 16, v31
-; GFX8-NEXT:    v_or_b32_sdwa v8, v16, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX8-NEXT:    v_or_b32_sdwa v9, v18, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX8-NEXT:    v_or_b32_sdwa v10, v20, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX8-NEXT:    v_or_b32_sdwa v11, v28, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX8-NEXT:    v_or_b32_sdwa v12, v26, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX8-NEXT:    v_or_b32_sdwa v13, v24, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX8-NEXT:    v_or_b32_sdwa v14, v22, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX8-NEXT:    v_or_b32_sdwa v15, v29, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_vselect_v32bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v22, 1, v22
-; GFX9-NEXT:    v_cmp_eq_u32_e64 s[6:7], 1, v22
-; GFX9-NEXT:    v_and_b32_e32 v22, 1, v23
-; GFX9-NEXT:    v_cmp_eq_u32_e64 s[8:9], 1, v22
-; GFX9-NEXT:    buffer_load_ushort v22, off, s[0:3], s32
-; GFX9-NEXT:    v_and_b32_e32 v28, 1, v28
-; GFX9-NEXT:    v_and_b32_e32 v24, 1, v24
-; GFX9-NEXT:    v_cmp_eq_u32_e64 s[10:11], 1, v28
-; GFX9-NEXT:    v_and_b32_e32 v28, 1, v29
-; GFX9-NEXT:    v_cmp_eq_u32_e64 s[18:19], 1, v24
-; GFX9-NEXT:    v_and_b32_e32 v24, 1, v25
-; GFX9-NEXT:    v_cmp_eq_u32_e64 s[12:13], 1, v28
-; GFX9-NEXT:    v_cmp_eq_u32_e64 s[20:21], 1, v24
-; GFX9-NEXT:    v_and_b32_e32 v26, 1, v26
-; GFX9-NEXT:    v_cmp_eq_u32_e64 s[14:15], 1, v26
-; GFX9-NEXT:    v_and_b32_e32 v26, 1, v27
-; GFX9-NEXT:    v_cmp_eq_u32_e64 s[16:17], 1, v26
-; GFX9-NEXT:    v_and_b32_e32 v30, 1, v30
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v30
+; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9-NEXT:    v_writelane_b32 v31, s30, 0
+; GFX9-NEXT:    v_writelane_b32 v31, s31, 1
+; GFX9-NEXT:    v_writelane_b32 v31, s34, 2
+; GFX9-NEXT:    v_writelane_b32 v31, s35, 3
+; GFX9-NEXT:    v_writelane_b32 v31, s36, 4
+; GFX9-NEXT:    v_writelane_b32 v31, s37, 5
+; GFX9-NEXT:    v_writelane_b32 v31, s38, 6
+; GFX9-NEXT:    v_writelane_b32 v31, s39, 7
+; GFX9-NEXT:    v_writelane_b32 v31, s40, 8
+; GFX9-NEXT:    v_writelane_b32 v31, s41, 9
+; GFX9-NEXT:    v_writelane_b32 v31, s42, 10
+; GFX9-NEXT:    v_writelane_b32 v31, s43, 11
+; GFX9-NEXT:    v_writelane_b32 v31, s44, 12
+; GFX9-NEXT:    v_writelane_b32 v31, s45, 13
+; GFX9-NEXT:    v_writelane_b32 v31, s46, 14
+; GFX9-NEXT:    v_writelane_b32 v31, s47, 15
+; GFX9-NEXT:    v_writelane_b32 v31, s48, 16
+; GFX9-NEXT:    v_writelane_b32 v31, s49, 17
+; GFX9-NEXT:    v_writelane_b32 v31, s50, 18
+; GFX9-NEXT:    v_writelane_b32 v31, s51, 19
 ; GFX9-NEXT:    v_and_b32_e32 v21, 1, v21
-; GFX9-NEXT:    v_and_b32_e32 v20, 1, v20
-; GFX9-NEXT:    v_and_b32_e32 v19, 1, v19
 ; GFX9-NEXT:    v_and_b32_e32 v18, 1, v18
+; GFX9-NEXT:    v_writelane_b32 v31, s52, 20
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[22:23], 1, v21
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[28:29], 1, v18
+; GFX9-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:68
+; GFX9-NEXT:    buffer_load_dword v21, off, s[0:3], s32 offset:4
 ; GFX9-NEXT:    v_and_b32_e32 v17, 1, v17
 ; GFX9-NEXT:    v_and_b32_e32 v16, 1, v16
+; GFX9-NEXT:    v_writelane_b32 v31, s53, 21
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[30:31], 1, v17
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[34:35], 1, v16
+; GFX9-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:72
+; GFX9-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:8
+; GFX9-NEXT:    v_writelane_b32 v31, s54, 22
 ; GFX9-NEXT:    v_and_b32_e32 v15, 1, v15
 ; GFX9-NEXT:    v_and_b32_e32 v14, 1, v14
+; GFX9-NEXT:    v_writelane_b32 v31, s55, 23
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[36:37], 1, v15
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[38:39], 1, v14
+; GFX9-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:76
+; GFX9-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:12
+; GFX9-NEXT:    v_writelane_b32 v31, s56, 24
 ; GFX9-NEXT:    v_and_b32_e32 v13, 1, v13
 ; GFX9-NEXT:    v_and_b32_e32 v12, 1, v12
-; GFX9-NEXT:    v_and_b32_e32 v11, 1, v11
-; GFX9-NEXT:    v_and_b32_e32 v10, 1, v10
-; GFX9-NEXT:    v_and_b32_e32 v9, 1, v9
-; GFX9-NEXT:    v_and_b32_e32 v8, 1, v8
-; GFX9-NEXT:    v_and_b32_e32 v7, 1, v7
-; GFX9-NEXT:    v_and_b32_e32 v6, 1, v6
+; GFX9-NEXT:    v_writelane_b32 v31, s57, 25
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[40:41], 1, v13
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[42:43], 1, v12
+; GFX9-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:80
+; GFX9-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:16
+; GFX9-NEXT:    v_writelane_b32 v31, s58, 26
 ; GFX9-NEXT:    v_and_b32_e32 v5, 1, v5
 ; GFX9-NEXT:    v_and_b32_e32 v4, 1, v4
-; GFX9-NEXT:    v_and_b32_e32 v3, 1, v3
+; GFX9-NEXT:    v_writelane_b32 v31, s59, 27
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[56:57], 1, v5
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[58:59], 1, v4
+; GFX9-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:84
+; GFX9-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:20
+; GFX9-NEXT:    v_and_b32_e32 v20, 1, v20
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[24:25], 1, v20
+; GFX9-NEXT:    buffer_load_ushort v20, off, s[0:3], s32
+; GFX9-NEXT:    v_writelane_b32 v31, s60, 28
+; GFX9-NEXT:    v_writelane_b32 v31, s61, 29
+; GFX9-NEXT:    v_writelane_b32 v31, s62, 30
+; GFX9-NEXT:    v_writelane_b32 v31, s63, 31
+; GFX9-NEXT:    v_writelane_b32 v31, s64, 32
+; GFX9-NEXT:    v_writelane_b32 v31, s65, 33
+; GFX9-NEXT:    v_writelane_b32 v31, s66, 34
 ; GFX9-NEXT:    v_and_b32_e32 v2, 1, v2
 ; GFX9-NEXT:    v_and_b32_e32 v1, 1, v1
 ; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_writelane_b32 v31, s67, 35
+; GFX9-NEXT:    v_and_b32_e32 v3, 1, v3
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[62:63], 1, v2
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[64:65], 1, v1
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[66:67], 1, v0
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[60:61], 1, v3
+; GFX9-NEXT:    v_and_b32_e32 v6, 1, v6
+; GFX9-NEXT:    v_and_b32_e32 v7, 1, v7
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[54:55], 1, v6
+; GFX9-NEXT:    v_and_b32_e32 v8, 1, v8
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[52:53], 1, v7
+; GFX9-NEXT:    v_and_b32_e32 v9, 1, v9
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[50:51], 1, v8
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[48:49], 1, v9
+; GFX9-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:88
+; GFX9-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:24
+; GFX9-NEXT:    v_and_b32_e32 v24, 1, v24
+; GFX9-NEXT:    v_and_b32_e32 v11, 1, v11
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[14:15], 1, v24
+; GFX9-NEXT:    v_and_b32_e32 v23, 1, v23
 ; GFX9-NEXT:    v_and_b32_e32 v22, 1, v22
-; GFX9-NEXT:    v_cmp_eq_u32_e64 s[4:5], 1, v22
-; GFX9-NEXT:    buffer_load_dword v22, off, s[0:3], s32 offset:60
-; GFX9-NEXT:    buffer_load_dword v23, off, s[0:3], s32 offset:124
-; GFX9-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-NEXT:    v_lshrrev_b32_e32 v24, 16, v22
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_cndmask_b32_e64 v22, v23, v22, s[10:11]
-; GFX9-NEXT:    v_lshrrev_b32_e32 v23, 16, v23
-; GFX9-NEXT:    v_cndmask_b32_e64 v23, v23, v24, s[12:13]
-; GFX9-NEXT:    buffer_load_dword v24, off, s[0:3], s32 offset:56
-; GFX9-NEXT:    buffer_load_dword v25, off, s[0:3], s32 offset:120
-; GFX9-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-NEXT:    v_lshrrev_b32_e32 v26, 16, v24
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_cndmask_b32_e64 v24, v25, v24, s[14:15]
-; GFX9-NEXT:    v_lshrrev_b32_e32 v25, 16, v25
-; GFX9-NEXT:    v_cndmask_b32_e64 v25, v25, v26, s[16:17]
-; GFX9-NEXT:    buffer_load_dword v26, off, s[0:3], s32 offset:52
-; GFX9-NEXT:    buffer_load_dword v27, off, s[0:3], s32 offset:116
-; GFX9-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-NEXT:    v_lshrrev_b32_e32 v28, 16, v26
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_cndmask_b32_e64 v26, v27, v26, s[18:19]
-; GFX9-NEXT:    v_lshrrev_b32_e32 v27, 16, v27
-; GFX9-NEXT:    v_cndmask_b32_e64 v27, v27, v28, s[20:21]
-; GFX9-NEXT:    buffer_load_dword v28, off, s[0:3], s32 offset:48
-; GFX9-NEXT:    buffer_load_dword v29, off, s[0:3], s32 offset:112
-; GFX9-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-NEXT:    v_lshrrev_b32_e32 v30, 16, v28
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_cndmask_b32_e64 v28, v29, v28, s[6:7]
-; GFX9-NEXT:    v_lshrrev_b32_e32 v29, 16, v29
-; GFX9-NEXT:    v_cndmask_b32_e64 v30, v29, v30, s[8:9]
-; GFX9-NEXT:    buffer_load_dword v29, off, s[0:3], s32 offset:64
-; GFX9-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:128
-; GFX9-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-NEXT:    v_lshrrev_b32_e32 v32, 16, v29
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_cndmask_b32_e32 v29, v31, v29, vcc
-; GFX9-NEXT:    v_lshrrev_b32_e32 v31, 16, v31
-; GFX9-NEXT:    v_cndmask_b32_e64 v31, v31, v32, s[4:5]
-; GFX9-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:44
-; GFX9-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:108
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v21
-; GFX9-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-NEXT:    v_lshrrev_b32_e32 v21, 16, v32
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_lshrrev_b32_e32 v34, 16, v33
-; GFX9-NEXT:    v_cndmask_b32_e32 v21, v34, v21, vcc
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v20
-; GFX9-NEXT:    v_cndmask_b32_e32 v20, v33, v32, vcc
-; GFX9-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:40
-; GFX9-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:104
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v19
-; GFX9-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-NEXT:    v_lshrrev_b32_e32 v19, 16, v32
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_lshrrev_b32_e32 v34, 16, v33
-; GFX9-NEXT:    v_cndmask_b32_e32 v19, v34, v19, vcc
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v18
-; GFX9-NEXT:    v_cndmask_b32_e32 v18, v33, v32, vcc
-; GFX9-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:36
-; GFX9-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:100
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v17
-; GFX9-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-NEXT:    v_lshrrev_b32_e32 v17, 16, v32
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_lshrrev_b32_e32 v34, 16, v33
-; GFX9-NEXT:    v_cndmask_b32_e32 v17, v34, v17, vcc
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v16
-; GFX9-NEXT:    v_cndmask_b32_e32 v16, v33, v32, vcc
-; GFX9-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:32
-; GFX9-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:96
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v15
-; GFX9-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-NEXT:    v_lshrrev_b32_e32 v15, 16, v32
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_lshrrev_b32_e32 v34, 16, v33
-; GFX9-NEXT:    v_cndmask_b32_e32 v15, v34, v15, vcc
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v14
-; GFX9-NEXT:    v_cndmask_b32_e32 v14, v33, v32, vcc
-; GFX9-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:28
-; GFX9-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:92
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v13
-; GFX9-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-NEXT:    v_lshrrev_b32_e32 v13, 16, v32
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_lshrrev_b32_e32 v34, 16, v33
-; GFX9-NEXT:    v_cndmask_b32_e32 v13, v34, v13, vcc
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v12
-; GFX9-NEXT:    v_cndmask_b32_e32 v12, v33, v32, vcc
-; GFX9-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:24
-; GFX9-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:88
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v11
-; GFX9-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-NEXT:    v_lshrrev_b32_e32 v11, 16, v32
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_lshrrev_b32_e32 v34, 16, v33
-; GFX9-NEXT:    v_cndmask_b32_e32 v11, v34, v11, vcc
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v10
-; GFX9-NEXT:    v_cndmask_b32_e32 v10, v33, v32, vcc
-; GFX9-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:20
-; GFX9-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:84
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v9
-; GFX9-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-NEXT:    v_lshrrev_b32_e32 v9, 16, v32
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_lshrrev_b32_e32 v34, 16, v33
-; GFX9-NEXT:    v_cndmask_b32_e32 v9, v34, v9, vcc
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v8
-; GFX9-NEXT:    v_cndmask_b32_e32 v8, v33, v32, vcc
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v7
-; GFX9-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:16
-; GFX9-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:80
-; GFX9-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-NEXT:    v_lshrrev_b32_e32 v33, 16, v7
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_lshrrev_b32_e32 v34, 16, v32
-; GFX9-NEXT:    v_cndmask_b32_e32 v33, v34, v33, vcc
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v6
-; GFX9-NEXT:    v_cndmask_b32_e32 v6, v32, v7, vcc
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v5
-; GFX9-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:12
-; GFX9-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:76
+; GFX9-NEXT:    v_and_b32_e32 v19, 1, v19
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[44:45], 1, v11
+; GFX9-NEXT:    v_and_b32_e32 v10, 1, v10
+; GFX9-NEXT:    buffer_load_dword v24, off, s[0:3], s32 offset:48
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[16:17], 1, v23
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[20:21], 1, v22
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[26:27], 1, v19
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[46:47], 1, v10
+; GFX9-NEXT:    v_and_b32_e32 v26, 1, v26
+; GFX9-NEXT:    v_and_b32_e32 v25, 1, v25
+; GFX9-NEXT:    v_and_b32_e32 v28, 1, v28
+; GFX9-NEXT:    v_and_b32_e32 v27, 1, v27
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[10:11], 1, v26
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[12:13], 1, v25
+; GFX9-NEXT:    v_and_b32_e32 v29, 1, v29
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[6:7], 1, v28
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[8:9], 1, v27
+; GFX9-NEXT:    v_and_b32_e32 v30, 1, v30
+; GFX9-NEXT:    s_waitcnt vmcnt(13)
+; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v18
+; GFX9-NEXT:    s_waitcnt vmcnt(12)
+; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v21
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v18, v21, s[66:67]
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, v2, v1, s[64:65]
+; GFX9-NEXT:    s_mov_b32 s64, 0x5040100
+; GFX9-NEXT:    v_perm_b32 v0, v1, v0, s64
+; GFX9-NEXT:    s_waitcnt vmcnt(11)
+; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v16
+; GFX9-NEXT:    s_waitcnt vmcnt(10)
+; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v17
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, v16, v17, s[62:63]
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, v3, v2, s[60:61]
+; GFX9-NEXT:    v_perm_b32 v1, v2, v1, s64
+; GFX9-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:36
+; GFX9-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:40
+; GFX9-NEXT:    buffer_load_dword v21, off, s[0:3], s32 offset:44
+; GFX9-NEXT:    s_waitcnt vmcnt(11)
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, v14, v15, s[58:59]
+; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v15
+; GFX9-NEXT:    v_lshrrev_b32_e32 v14, 16, v14
+; GFX9-NEXT:    v_cndmask_b32_e64 v3, v14, v3, s[56:57]
+; GFX9-NEXT:    v_perm_b32 v2, v3, v2, s64
+; GFX9-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:124
+; GFX9-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:128
+; GFX9-NEXT:    s_waitcnt vmcnt(11)
+; GFX9-NEXT:    v_cndmask_b32_e64 v3, v12, v13, s[54:55]
+; GFX9-NEXT:    v_lshrrev_b32_e32 v13, 16, v13
+; GFX9-NEXT:    v_lshrrev_b32_e32 v12, 16, v12
+; GFX9-NEXT:    v_cndmask_b32_e64 v12, v12, v13, s[52:53]
+; GFX9-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:120
+; GFX9-NEXT:    v_perm_b32 v3, v12, v3, s64
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[4:5], 1, v29
+; GFX9-NEXT:    s_waitcnt vmcnt(10)
+; GFX9-NEXT:    v_cndmask_b32_e64 v12, v4, v5, s[50:51]
+; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
+; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
+; GFX9-NEXT:    v_cndmask_b32_e64 v4, v4, v5, s[48:49]
+; GFX9-NEXT:    v_perm_b32 v4, v4, v12, s64
+; GFX9-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:116
+; GFX9-NEXT:    s_waitcnt vmcnt(10)
+; GFX9-NEXT:    v_and_b32_e32 v11, 1, v20
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[18:19], 1, v11
+; GFX9-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:92
+; GFX9-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:28
+; GFX9-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:112
+; GFX9-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:108
+; GFX9-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:104
+; GFX9-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:100
+; GFX9-NEXT:    buffer_load_dword v22, off, s[0:3], s32 offset:96
+; GFX9-NEXT:    buffer_load_dword v23, off, s[0:3], s32 offset:32
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v30
+; GFX9-NEXT:    v_readlane_b32 s67, v31, 35
+; GFX9-NEXT:    v_readlane_b32 s66, v31, 34
+; GFX9-NEXT:    v_readlane_b32 s65, v31, 33
+; GFX9-NEXT:    v_readlane_b32 s63, v31, 31
+; GFX9-NEXT:    v_readlane_b32 s62, v31, 30
+; GFX9-NEXT:    v_readlane_b32 s61, v31, 29
+; GFX9-NEXT:    v_readlane_b32 s60, v31, 28
+; GFX9-NEXT:    v_readlane_b32 s59, v31, 27
+; GFX9-NEXT:    v_readlane_b32 s58, v31, 26
+; GFX9-NEXT:    v_readlane_b32 s57, v31, 25
+; GFX9-NEXT:    v_readlane_b32 s56, v31, 24
+; GFX9-NEXT:    v_readlane_b32 s55, v31, 23
+; GFX9-NEXT:    v_readlane_b32 s54, v31, 22
+; GFX9-NEXT:    v_readlane_b32 s53, v31, 21
+; GFX9-NEXT:    v_readlane_b32 s52, v31, 20
+; GFX9-NEXT:    v_readlane_b32 s51, v31, 19
+; GFX9-NEXT:    v_readlane_b32 s50, v31, 18
+; GFX9-NEXT:    v_readlane_b32 s49, v31, 17
+; GFX9-NEXT:    s_waitcnt vmcnt(16)
+; GFX9-NEXT:    v_cndmask_b32_e64 v5, v6, v7, s[46:47]
+; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
+; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
+; GFX9-NEXT:    v_cndmask_b32_e64 v6, v6, v7, s[44:45]
+; GFX9-NEXT:    v_perm_b32 v5, v6, v5, s64
+; GFX9-NEXT:    v_readlane_b32 s48, v31, 16
+; GFX9-NEXT:    v_readlane_b32 s47, v31, 15
+; GFX9-NEXT:    v_readlane_b32 s46, v31, 14
+; GFX9-NEXT:    v_readlane_b32 s45, v31, 13
+; GFX9-NEXT:    v_readlane_b32 s44, v31, 12
+; GFX9-NEXT:    s_waitcnt vmcnt(6)
+; GFX9-NEXT:    v_cndmask_b32_e64 v6, v8, v9, s[42:43]
+; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 16, v9
+; GFX9-NEXT:    v_lshrrev_b32_e32 v8, 16, v8
+; GFX9-NEXT:    v_cndmask_b32_e64 v7, v8, v7, s[40:41]
+; GFX9-NEXT:    v_perm_b32 v6, v7, v6, s64
 ; GFX9-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-NEXT:    v_lshrrev_b32_e32 v32, 16, v5
+; GFX9-NEXT:    v_lshrrev_b32_e32 v9, 16, v22
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_lshrrev_b32_e32 v34, 16, v7
-; GFX9-NEXT:    v_cndmask_b32_e32 v32, v34, v32, vcc
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v4
-; GFX9-NEXT:    v_cndmask_b32_e32 v4, v7, v5, vcc
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v3
-; GFX9-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:8
-; GFX9-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:72
+; GFX9-NEXT:    v_lshrrev_b32_e32 v8, 16, v23
+; GFX9-NEXT:    v_cndmask_b32_e64 v7, v22, v23, s[38:39]
+; GFX9-NEXT:    v_cndmask_b32_e64 v8, v9, v8, s[36:37]
+; GFX9-NEXT:    v_lshrrev_b32_e32 v9, 16, v18
+; GFX9-NEXT:    v_lshrrev_b32_e32 v17, 16, v20
+; GFX9-NEXT:    v_perm_b32 v7, v8, v7, s64
+; GFX9-NEXT:    v_cndmask_b32_e64 v8, v20, v18, s[34:35]
+; GFX9-NEXT:    v_cndmask_b32_e64 v9, v17, v9, s[30:31]
+; GFX9-NEXT:    v_perm_b32 v8, v9, v8, s64
+; GFX9-NEXT:    v_cndmask_b32_e64 v9, v11, v16, s[28:29]
+; GFX9-NEXT:    v_lshrrev_b32_e32 v16, 16, v16
+; GFX9-NEXT:    v_lshrrev_b32_e32 v11, 16, v11
+; GFX9-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:52
+; GFX9-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:56
+; GFX9-NEXT:    v_cndmask_b32_e64 v11, v11, v16, s[26:27]
+; GFX9-NEXT:    v_perm_b32 v9, v11, v9, s64
+; GFX9-NEXT:    v_cndmask_b32_e64 v11, v10, v21, s[24:25]
+; GFX9-NEXT:    v_lshrrev_b32_e32 v16, 16, v21
+; GFX9-NEXT:    v_lshrrev_b32_e32 v10, 16, v10
+; GFX9-NEXT:    v_cndmask_b32_e64 v10, v10, v16, s[22:23]
+; GFX9-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:60
+; GFX9-NEXT:    v_perm_b32 v10, v10, v11, s64
+; GFX9-NEXT:    v_cndmask_b32_e64 v11, v19, v24, s[20:21]
+; GFX9-NEXT:    v_lshrrev_b32_e32 v20, 16, v24
+; GFX9-NEXT:    v_lshrrev_b32_e32 v19, 16, v19
+; GFX9-NEXT:    v_cndmask_b32_e64 v19, v19, v20, s[16:17]
+; GFX9-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:64
+; GFX9-NEXT:    v_perm_b32 v11, v19, v11, s64
+; GFX9-NEXT:    v_readlane_b32 s43, v31, 11
+; GFX9-NEXT:    v_readlane_b32 s42, v31, 10
+; GFX9-NEXT:    v_readlane_b32 s41, v31, 9
+; GFX9-NEXT:    v_readlane_b32 s40, v31, 8
+; GFX9-NEXT:    v_readlane_b32 s39, v31, 7
+; GFX9-NEXT:    v_readlane_b32 s38, v31, 6
+; GFX9-NEXT:    v_readlane_b32 s37, v31, 5
+; GFX9-NEXT:    v_readlane_b32 s36, v31, 4
+; GFX9-NEXT:    v_readlane_b32 s35, v31, 3
+; GFX9-NEXT:    v_readlane_b32 s34, v31, 2
+; GFX9-NEXT:    v_readlane_b32 s31, v31, 1
+; GFX9-NEXT:    v_readlane_b32 s30, v31, 0
+; GFX9-NEXT:    s_waitcnt vmcnt(3)
+; GFX9-NEXT:    v_cndmask_b32_e64 v19, v12, v18, s[14:15]
+; GFX9-NEXT:    v_lshrrev_b32_e32 v18, 16, v18
+; GFX9-NEXT:    v_lshrrev_b32_e32 v12, 16, v12
+; GFX9-NEXT:    v_cndmask_b32_e64 v12, v12, v18, s[12:13]
+; GFX9-NEXT:    s_waitcnt vmcnt(2)
+; GFX9-NEXT:    v_cndmask_b32_e64 v18, v13, v17, s[10:11]
+; GFX9-NEXT:    v_lshrrev_b32_e32 v17, 16, v17
+; GFX9-NEXT:    v_lshrrev_b32_e32 v13, 16, v13
+; GFX9-NEXT:    v_cndmask_b32_e64 v13, v13, v17, s[8:9]
 ; GFX9-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 16, v3
+; GFX9-NEXT:    v_cndmask_b32_e64 v17, v14, v16, s[6:7]
+; GFX9-NEXT:    v_lshrrev_b32_e32 v16, 16, v16
+; GFX9-NEXT:    v_lshrrev_b32_e32 v14, 16, v14
+; GFX9-NEXT:    v_cndmask_b32_e64 v14, v14, v16, s[4:5]
+; GFX9-NEXT:    v_perm_b32 v14, v14, v17, s64
+; GFX9-NEXT:    v_perm_b32 v12, v12, v19, s64
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_lshrrev_b32_e32 v34, 16, v5
-; GFX9-NEXT:    v_cndmask_b32_e32 v7, v34, v7, vcc
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v2
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, v5, v3, vcc
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v1
-; GFX9-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:4
-; GFX9-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:68
-; GFX9-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
+; GFX9-NEXT:    v_cndmask_b32_e32 v16, v15, v20, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v17, 16, v20
+; GFX9-NEXT:    v_lshrrev_b32_e32 v15, 16, v15
+; GFX9-NEXT:    v_cndmask_b32_e64 v15, v15, v17, s[18:19]
+; GFX9-NEXT:    v_perm_b32 v13, v13, v18, s64
+; GFX9-NEXT:    v_perm_b32 v15, v15, v16, s64
+; GFX9-NEXT:    v_readlane_b32 s64, v31, 32
+; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX9-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
+; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_lshrrev_b32_e32 v34, 16, v3
-; GFX9-NEXT:    v_cndmask_b32_e32 v5, v34, v5, vcc
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v1, vcc
-; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v5
-; GFX9-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v7
-; GFX9-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v32
-; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 16, v33
-; GFX9-NEXT:    v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX9-NEXT:    v_or_b32_sdwa v3, v6, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 16, v9
-; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 16, v11
-; GFX9-NEXT:    v_lshlrev_b32_e32 v6, 16, v13
-; GFX9-NEXT:    v_lshlrev_b32_e32 v7, 16, v15
-; GFX9-NEXT:    v_or_b32_sdwa v4, v8, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX9-NEXT:    v_or_b32_sdwa v5, v10, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX9-NEXT:    v_or_b32_sdwa v6, v12, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX9-NEXT:    v_or_b32_sdwa v7, v14, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX9-NEXT:    v_lshlrev_b32_e32 v8, 16, v17
-; GFX9-NEXT:    v_lshlrev_b32_e32 v9, 16, v19
-; GFX9-NEXT:    v_lshlrev_b32_e32 v10, 16, v21
-; GFX9-NEXT:    v_lshlrev_b32_e32 v11, 16, v30
-; GFX9-NEXT:    v_lshlrev_b32_e32 v12, 16, v27
-; GFX9-NEXT:    v_lshlrev_b32_e32 v13, 16, v25
-; GFX9-NEXT:    v_lshlrev_b32_e32 v14, 16, v23
-; GFX9-NEXT:    v_lshlrev_b32_e32 v15, 16, v31
-; GFX9-NEXT:    v_or_b32_sdwa v8, v16, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX9-NEXT:    v_or_b32_sdwa v9, v18, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX9-NEXT:    v_or_b32_sdwa v10, v20, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX9-NEXT:    v_or_b32_sdwa v11, v28, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX9-NEXT:    v_or_b32_sdwa v12, v26, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX9-NEXT:    v_or_b32_sdwa v13, v24, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX9-NEXT:    v_or_b32_sdwa v14, v22, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX9-NEXT:    v_or_b32_sdwa v15, v29, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_vselect_v32bf16:
@@ -31226,233 +29820,205 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x
 ; GFX10-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s4
-; GFX10-NEXT:    v_and_b32_e32 v20, 1, v20
-; GFX10-NEXT:    v_and_b32_e32 v22, 1, v22
-; GFX10-NEXT:    v_and_b32_e32 v24, 1, v24
-; GFX10-NEXT:    v_and_b32_e32 v30, 1, v30
+; GFX10-NEXT:    v_and_b32_e32 v3, 1, v3
 ; GFX10-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX10-NEXT:    v_and_b32_e32 v2, 1, v2
+; GFX10-NEXT:    v_and_b32_e32 v1, 1, v1
 ; GFX10-NEXT:    v_and_b32_e32 v4, 1, v4
-; GFX10-NEXT:    v_and_b32_e32 v6, 1, v6
+; GFX10-NEXT:    v_cmp_eq_u32_e64 s6, 1, v3
+; GFX10-NEXT:    v_and_b32_e32 v3, 1, v6
 ; GFX10-NEXT:    v_and_b32_e32 v8, 1, v8
 ; GFX10-NEXT:    v_and_b32_e32 v10, 1, v10
 ; GFX10-NEXT:    v_and_b32_e32 v12, 1, v12
+; GFX10-NEXT:    v_and_b32_e32 v14, 1, v14
+; GFX10-NEXT:    v_and_b32_e32 v16, 1, v16
 ; GFX10-NEXT:    s_clause 0x15
+; GFX10-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:68
+; GFX10-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:4
+; GFX10-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:72
+; GFX10-NEXT:    buffer_load_dword v35, off, s[0:3], s32 offset:8
 ; GFX10-NEXT:    buffer_load_ushort v36, off, s[0:3], s32
-; GFX10-NEXT:    buffer_load_dword v37, off, s[0:3], s32 offset:64
-; GFX10-NEXT:    buffer_load_dword v38, off, s[0:3], s32 offset:128
-; GFX10-NEXT:    buffer_load_dword v39, off, s[0:3], s32 offset:4
-; GFX10-NEXT:    buffer_load_dword v48, off, s[0:3], s32 offset:68
-; GFX10-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:52
-; GFX10-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:56
-; GFX10-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:60
-; GFX10-NEXT:    buffer_load_dword v35, off, s[0:3], s32 offset:124
-; GFX10-NEXT:    buffer_load_dword v49, off, s[0:3], s32 offset:8
-; GFX10-NEXT:    buffer_load_dword v50, off, s[0:3], s32 offset:72
-; GFX10-NEXT:    buffer_load_dword v51, off, s[0:3], s32 offset:12
-; GFX10-NEXT:    buffer_load_dword v52, off, s[0:3], s32 offset:76
-; GFX10-NEXT:    buffer_load_dword v53, off, s[0:3], s32 offset:16
-; GFX10-NEXT:    buffer_load_dword v54, off, s[0:3], s32 offset:80
-; GFX10-NEXT:    buffer_load_dword v55, off, s[0:3], s32 offset:20
-; GFX10-NEXT:    buffer_load_dword v64, off, s[0:3], s32 offset:84
-; GFX10-NEXT:    buffer_load_dword v65, off, s[0:3], s32 offset:24
-; GFX10-NEXT:    buffer_load_dword v66, off, s[0:3], s32 offset:88
-; GFX10-NEXT:    buffer_load_dword v67, off, s[0:3], s32 offset:28
-; GFX10-NEXT:    buffer_load_dword v68, off, s[0:3], s32 offset:92
-; GFX10-NEXT:    buffer_load_dword v69, off, s[0:3], s32 offset:32
-; GFX10-NEXT:    v_cmp_eq_u32_e64 s5, 1, v20
-; GFX10-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:96
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v22
-; GFX10-NEXT:    buffer_load_dword v22, off, s[0:3], s32 offset:36
-; GFX10-NEXT:    v_cmp_eq_u32_e64 s4, 1, v24
-; GFX10-NEXT:    buffer_load_dword v24, off, s[0:3], s32 offset:100
-; GFX10-NEXT:    v_cmp_eq_u32_e64 s6, 1, v30
-; GFX10-NEXT:    buffer_load_dword v30, off, s[0:3], s32 offset:40
-; GFX10-NEXT:    v_cmp_eq_u32_e64 s7, 1, v0
-; GFX10-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:104
-; GFX10-NEXT:    v_cmp_eq_u32_e64 s8, 1, v2
-; GFX10-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:44
-; GFX10-NEXT:    v_cmp_eq_u32_e64 s9, 1, v4
-; GFX10-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:108
-; GFX10-NEXT:    v_cmp_eq_u32_e64 s10, 1, v6
-; GFX10-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:48
-; GFX10-NEXT:    v_cmp_eq_u32_e64 s11, 1, v8
-; GFX10-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:120
-; GFX10-NEXT:    v_cmp_eq_u32_e64 s12, 1, v10
-; GFX10-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:116
-; GFX10-NEXT:    v_cmp_eq_u32_e64 s13, 1, v12
-; GFX10-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:112
+; GFX10-NEXT:    buffer_load_dword v37, off, s[0:3], s32 offset:76
+; GFX10-NEXT:    buffer_load_dword v38, off, s[0:3], s32 offset:12
+; GFX10-NEXT:    buffer_load_dword v39, off, s[0:3], s32 offset:80
+; GFX10-NEXT:    buffer_load_dword v48, off, s[0:3], s32 offset:16
+; GFX10-NEXT:    buffer_load_dword v49, off, s[0:3], s32 offset:20
+; GFX10-NEXT:    buffer_load_dword v50, off, s[0:3], s32 offset:84
+; GFX10-NEXT:    buffer_load_dword v51, off, s[0:3], s32 offset:88
+; GFX10-NEXT:    buffer_load_dword v52, off, s[0:3], s32 offset:24
+; GFX10-NEXT:    buffer_load_dword v53, off, s[0:3], s32 offset:92
+; GFX10-NEXT:    buffer_load_dword v54, off, s[0:3], s32 offset:28
+; GFX10-NEXT:    buffer_load_dword v55, off, s[0:3], s32 offset:96
+; GFX10-NEXT:    buffer_load_dword v64, off, s[0:3], s32 offset:32
+; GFX10-NEXT:    buffer_load_dword v65, off, s[0:3], s32 offset:36
+; GFX10-NEXT:    buffer_load_dword v66, off, s[0:3], s32 offset:104
+; GFX10-NEXT:    buffer_load_dword v67, off, s[0:3], s32 offset:40
+; GFX10-NEXT:    buffer_load_dword v68, off, s[0:3], s32 offset:100
+; GFX10-NEXT:    buffer_load_dword v69, off, s[0:3], s32 offset:52
+; GFX10-NEXT:    v_cmp_eq_u32_e64 s4, 1, v0
+; GFX10-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:112
+; GFX10-NEXT:    v_cmp_eq_u32_e64 s5, 1, v2
+; GFX10-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:48
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v1
+; GFX10-NEXT:    v_cmp_eq_u32_e64 s7, 1, v4
+; GFX10-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:120
+; GFX10-NEXT:    v_cmp_eq_u32_e64 s8, 1, v3
+; GFX10-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:56
+; GFX10-NEXT:    v_cmp_eq_u32_e64 s9, 1, v8
+; GFX10-NEXT:    s_clause 0x1
+; GFX10-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:116
+; GFX10-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:108
+; GFX10-NEXT:    v_cmp_eq_u32_e64 s10, 1, v10
+; GFX10-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:124
+; GFX10-NEXT:    v_cmp_eq_u32_e64 s11, 1, v12
+; GFX10-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:60
+; GFX10-NEXT:    v_cmp_eq_u32_e64 s12, 1, v14
+; GFX10-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:128
+; GFX10-NEXT:    v_cmp_eq_u32_e64 s13, 1, v16
+; GFX10-NEXT:    s_clause 0x1
+; GFX10-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:64
+; GFX10-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:44
 ; GFX10-NEXT:    v_writelane_b32 v31, s30, 0
-; GFX10-NEXT:    v_and_b32_e32 v1, 1, v1
-; GFX10-NEXT:    v_and_b32_e32 v14, 1, v14
-; GFX10-NEXT:    v_and_b32_e32 v16, 1, v16
-; GFX10-NEXT:    v_and_b32_e32 v18, 1, v18
-; GFX10-NEXT:    v_writelane_b32 v31, s31, 1
-; GFX10-NEXT:    v_and_b32_e32 v26, 1, v26
+; GFX10-NEXT:    v_and_b32_e32 v30, 1, v30
 ; GFX10-NEXT:    v_and_b32_e32 v28, 1, v28
-; GFX10-NEXT:    v_and_b32_e32 v3, 1, v3
-; GFX10-NEXT:    v_and_b32_e32 v5, 1, v5
-; GFX10-NEXT:    v_writelane_b32 v31, s34, 2
-; GFX10-NEXT:    v_and_b32_e32 v7, 1, v7
-; GFX10-NEXT:    v_and_b32_e32 v9, 1, v9
-; GFX10-NEXT:    v_and_b32_e32 v11, 1, v11
-; GFX10-NEXT:    v_and_b32_e32 v13, 1, v13
-; GFX10-NEXT:    v_and_b32_e32 v15, 1, v15
+; GFX10-NEXT:    v_and_b32_e32 v26, 1, v26
+; GFX10-NEXT:    v_and_b32_e32 v24, 1, v24
+; GFX10-NEXT:    v_writelane_b32 v31, s31, 1
+; GFX10-NEXT:    v_and_b32_e32 v22, 1, v22
+; GFX10-NEXT:    v_and_b32_e32 v20, 1, v20
 ; GFX10-NEXT:    v_and_b32_e32 v17, 1, v17
-; GFX10-NEXT:    v_and_b32_e32 v19, 1, v19
-; GFX10-NEXT:    v_and_b32_e32 v21, 1, v21
-; GFX10-NEXT:    v_and_b32_e32 v23, 1, v23
-; GFX10-NEXT:    v_and_b32_e32 v25, 1, v25
-; GFX10-NEXT:    v_and_b32_e32 v27, 1, v27
+; GFX10-NEXT:    v_and_b32_e32 v9, 1, v9
+; GFX10-NEXT:    v_and_b32_e32 v7, 1, v7
+; GFX10-NEXT:    v_writelane_b32 v31, s34, 2
 ; GFX10-NEXT:    v_and_b32_e32 v29, 1, v29
-; GFX10-NEXT:    v_cmp_eq_u32_e64 s14, 1, v14
-; GFX10-NEXT:    v_cmp_eq_u32_e64 s15, 1, v16
-; GFX10-NEXT:    v_cmp_eq_u32_e64 s16, 1, v18
-; GFX10-NEXT:    v_cmp_eq_u32_e64 s17, 1, v28
-; GFX10-NEXT:    v_cmp_eq_u32_e64 s19, 1, v26
-; GFX10-NEXT:    v_cmp_eq_u32_e64 s31, 1, v1
+; GFX10-NEXT:    v_and_b32_e32 v27, 1, v27
+; GFX10-NEXT:    v_and_b32_e32 v25, 1, v25
+; GFX10-NEXT:    v_and_b32_e32 v23, 1, v23
+; GFX10-NEXT:    v_and_b32_e32 v21, 1, v21
+; GFX10-NEXT:    v_and_b32_e32 v19, 1, v19
+; GFX10-NEXT:    v_and_b32_e32 v18, 1, v18
+; GFX10-NEXT:    v_and_b32_e32 v15, 1, v15
+; GFX10-NEXT:    v_and_b32_e32 v13, 1, v13
+; GFX10-NEXT:    v_and_b32_e32 v11, 1, v11
+; GFX10-NEXT:    v_and_b32_e32 v5, 1, v5
+; GFX10-NEXT:    v_cmp_eq_u32_e64 s15, 1, v20
+; GFX10-NEXT:    v_cmp_eq_u32_e64 s16, 1, v22
+; GFX10-NEXT:    v_cmp_eq_u32_e64 s17, 1, v24
+; GFX10-NEXT:    v_cmp_eq_u32_e64 s18, 1, v26
+; GFX10-NEXT:    v_cmp_eq_u32_e64 s19, 1, v28
+; GFX10-NEXT:    v_cmp_eq_u32_e64 s20, 1, v30
+; GFX10-NEXT:    v_cmp_eq_u32_e64 s22, 1, v7
+; GFX10-NEXT:    v_cmp_eq_u32_e64 s23, 1, v9
+; GFX10-NEXT:    v_cmp_eq_u32_e64 s27, 1, v17
 ; GFX10-NEXT:    v_writelane_b32 v31, s35, 3
-; GFX10-NEXT:    v_cmp_eq_u32_e64 s18, 1, v29
-; GFX10-NEXT:    v_cmp_eq_u32_e64 s20, 1, v27
-; GFX10-NEXT:    v_cmp_eq_u32_e64 s21, 1, v25
-; GFX10-NEXT:    v_cmp_eq_u32_e64 s22, 1, v23
-; GFX10-NEXT:    v_cmp_eq_u32_e64 s23, 1, v21
-; GFX10-NEXT:    v_cmp_eq_u32_e64 s24, 1, v19
-; GFX10-NEXT:    v_cmp_eq_u32_e64 s25, 1, v17
+; GFX10-NEXT:    v_cmp_eq_u32_e64 s14, 1, v18
+; GFX10-NEXT:    v_cmp_eq_u32_e64 s21, 1, v5
+; GFX10-NEXT:    v_cmp_eq_u32_e64 s24, 1, v11
+; GFX10-NEXT:    v_cmp_eq_u32_e64 s25, 1, v13
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 s26, 1, v15
-; GFX10-NEXT:    v_cmp_eq_u32_e64 s27, 1, v13
-; GFX10-NEXT:    v_cmp_eq_u32_e64 s28, 1, v11
-; GFX10-NEXT:    v_cmp_eq_u32_e64 s29, 1, v7
-; GFX10-NEXT:    v_cmp_eq_u32_e64 s30, 1, v3
-; GFX10-NEXT:    v_cmp_eq_u32_e64 s34, 1, v5
-; GFX10-NEXT:    v_cmp_eq_u32_e64 s35, 1, v9
+; GFX10-NEXT:    v_cmp_eq_u32_e64 s28, 1, v19
+; GFX10-NEXT:    v_cmp_eq_u32_e64 s29, 1, v21
+; GFX10-NEXT:    v_cmp_eq_u32_e64 s30, 1, v23
+; GFX10-NEXT:    v_cmp_eq_u32_e64 s31, 1, v25
+; GFX10-NEXT:    v_cmp_eq_u32_e64 s34, 1, v27
+; GFX10-NEXT:    v_cmp_eq_u32_e64 s35, 1, v29
 ; GFX10-NEXT:    s_waitcnt vmcnt(32)
-; GFX10-NEXT:    v_and_b32_e32 v1, 1, v36
+; GFX10-NEXT:    v_lshrrev_b32_e32 v9, 16, v32
 ; GFX10-NEXT:    s_waitcnt vmcnt(31)
-; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 16, v37
-; GFX10-NEXT:    s_waitcnt vmcnt(30)
-; GFX10-NEXT:    v_lshrrev_b32_e32 v5, 16, v38
-; GFX10-NEXT:    v_cndmask_b32_e64 v15, v38, v37, s6
+; GFX10-NEXT:    v_lshrrev_b32_e32 v7, 16, v33
+; GFX10-NEXT:    v_cndmask_b32_e64 v5, v32, v33, s4
 ; GFX10-NEXT:    s_waitcnt vmcnt(29)
-; GFX10-NEXT:    v_lshrrev_b32_e32 v7, 16, v39
+; GFX10-NEXT:    v_cndmask_b32_e64 v11, v34, v35, s5
 ; GFX10-NEXT:    s_waitcnt vmcnt(28)
-; GFX10-NEXT:    v_cndmask_b32_e64 v9, v48, v39, s7
-; GFX10-NEXT:    v_lshrrev_b32_e32 v11, 16, v48
-; GFX10-NEXT:    s_waitcnt vmcnt(23)
-; GFX10-NEXT:    v_lshrrev_b32_e32 v13, 16, v49
+; GFX10-NEXT:    v_and_b32_e32 v17, 1, v36
+; GFX10-NEXT:    v_lshrrev_b32_e32 v13, 16, v35
+; GFX10-NEXT:    v_lshrrev_b32_e32 v15, 16, v34
+; GFX10-NEXT:    s_waitcnt vmcnt(26)
+; GFX10-NEXT:    v_cndmask_b32_e64 v18, v37, v38, s7
+; GFX10-NEXT:    v_lshrrev_b32_e32 v19, 16, v38
+; GFX10-NEXT:    v_lshrrev_b32_e32 v20, 16, v37
+; GFX10-NEXT:    s_waitcnt vmcnt(24)
+; GFX10-NEXT:    v_cndmask_b32_e64 v21, v39, v48, s8
+; GFX10-NEXT:    v_lshrrev_b32_e32 v22, 16, v48
+; GFX10-NEXT:    v_lshrrev_b32_e32 v23, 16, v39
 ; GFX10-NEXT:    s_waitcnt vmcnt(22)
-; GFX10-NEXT:    v_cndmask_b32_e64 v14, v50, v49, s8
-; GFX10-NEXT:    v_lshrrev_b32_e32 v16, 16, v50
-; GFX10-NEXT:    s_waitcnt vmcnt(21)
-; GFX10-NEXT:    v_lshrrev_b32_e32 v17, 16, v51
+; GFX10-NEXT:    v_cndmask_b32_e64 v24, v50, v49, s9
+; GFX10-NEXT:    v_lshrrev_b32_e32 v25, 16, v49
+; GFX10-NEXT:    v_lshrrev_b32_e32 v26, 16, v50
 ; GFX10-NEXT:    s_waitcnt vmcnt(20)
-; GFX10-NEXT:    v_cndmask_b32_e64 v18, v52, v51, s9
-; GFX10-NEXT:    v_lshrrev_b32_e32 v19, 16, v52
-; GFX10-NEXT:    s_waitcnt vmcnt(19)
-; GFX10-NEXT:    v_lshrrev_b32_e32 v21, 16, v53
+; GFX10-NEXT:    v_cndmask_b32_e64 v27, v51, v52, s10
+; GFX10-NEXT:    v_lshrrev_b32_e32 v28, 16, v52
+; GFX10-NEXT:    v_lshrrev_b32_e32 v29, 16, v51
 ; GFX10-NEXT:    s_waitcnt vmcnt(18)
-; GFX10-NEXT:    v_cndmask_b32_e64 v23, v54, v53, s10
-; GFX10-NEXT:    v_lshrrev_b32_e32 v25, 16, v54
-; GFX10-NEXT:    s_waitcnt vmcnt(17)
-; GFX10-NEXT:    v_lshrrev_b32_e32 v26, 16, v55
+; GFX10-NEXT:    v_cndmask_b32_e64 v30, v53, v54, s11
+; GFX10-NEXT:    v_lshrrev_b32_e32 v32, 16, v54
+; GFX10-NEXT:    v_lshrrev_b32_e32 v33, 16, v53
 ; GFX10-NEXT:    s_waitcnt vmcnt(16)
-; GFX10-NEXT:    v_cndmask_b32_e64 v27, v64, v55, s11
-; GFX10-NEXT:    v_lshrrev_b32_e32 v28, 16, v64
-; GFX10-NEXT:    s_waitcnt vmcnt(15)
-; GFX10-NEXT:    v_lshrrev_b32_e32 v29, 16, v65
-; GFX10-NEXT:    s_waitcnt vmcnt(14)
-; GFX10-NEXT:    v_cndmask_b32_e64 v36, v66, v65, s12
-; GFX10-NEXT:    v_lshrrev_b32_e32 v37, 16, v66
-; GFX10-NEXT:    s_waitcnt vmcnt(13)
-; GFX10-NEXT:    v_lshrrev_b32_e32 v38, 16, v67
+; GFX10-NEXT:    v_cndmask_b32_e64 v34, v55, v64, s12
+; GFX10-NEXT:    v_lshrrev_b32_e32 v35, 16, v64
+; GFX10-NEXT:    v_lshrrev_b32_e32 v36, 16, v55
 ; GFX10-NEXT:    s_waitcnt vmcnt(12)
-; GFX10-NEXT:    v_lshrrev_b32_e32 v48, 16, v68
-; GFX10-NEXT:    s_waitcnt vmcnt(11)
-; GFX10-NEXT:    v_lshrrev_b32_e32 v49, 16, v69
-; GFX10-NEXT:    s_waitcnt vmcnt(10)
-; GFX10-NEXT:    v_cndmask_b32_e64 v50, v20, v69, s14
-; GFX10-NEXT:    v_lshrrev_b32_e32 v20, 16, v20
+; GFX10-NEXT:    v_cndmask_b32_e64 v37, v68, v65, s13
+; GFX10-NEXT:    v_lshrrev_b32_e32 v38, 16, v65
+; GFX10-NEXT:    v_lshrrev_b32_e32 v39, 16, v68
+; GFX10-NEXT:    v_lshrrev_b32_e32 v49, 16, v67
+; GFX10-NEXT:    v_lshrrev_b32_e32 v50, 16, v66
 ; GFX10-NEXT:    s_waitcnt vmcnt(9)
-; GFX10-NEXT:    v_lshrrev_b32_e32 v51, 16, v22
-; GFX10-NEXT:    s_waitcnt vmcnt(8)
-; GFX10-NEXT:    v_cndmask_b32_e64 v22, v24, v22, s15
-; GFX10-NEXT:    v_lshrrev_b32_e32 v24, 16, v24
-; GFX10-NEXT:    s_waitcnt vmcnt(7)
-; GFX10-NEXT:    v_lshrrev_b32_e32 v52, 16, v30
-; GFX10-NEXT:    s_waitcnt vmcnt(6)
-; GFX10-NEXT:    v_cndmask_b32_e64 v30, v0, v30, s16
+; GFX10-NEXT:    v_cndmask_b32_e64 v52, v0, v2, s16
+; GFX10-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX10-NEXT:    s_waitcnt vmcnt(5)
-; GFX10-NEXT:    v_lshrrev_b32_e32 v53, 16, v2
-; GFX10-NEXT:    v_lshrrev_b32_e32 v54, 16, v34
-; GFX10-NEXT:    v_cndmask_b32_e64 v34, v35, v34, s17
-; GFX10-NEXT:    v_lshrrev_b32_e32 v35, 16, v35
-; GFX10-NEXT:    s_waitcnt vmcnt(4)
-; GFX10-NEXT:    v_cndmask_b32_e64 v55, v4, v2, s5
-; GFX10-NEXT:    v_lshrrev_b32_e32 v2, 16, v4
-; GFX10-NEXT:    s_waitcnt vmcnt(3)
-; GFX10-NEXT:    v_lshrrev_b32_e32 v4, 16, v6
-; GFX10-NEXT:    v_lshrrev_b32_e32 v64, 16, v32
-; GFX10-NEXT:    v_lshrrev_b32_e32 v65, 16, v33
-; GFX10-NEXT:    s_waitcnt vmcnt(2)
-; GFX10-NEXT:    v_cndmask_b32_e64 v33, v8, v33, s19
+; GFX10-NEXT:    s_waitcnt vmcnt(6)
+; GFX10-NEXT:    v_cndmask_b32_e64 v53, v8, v69, s17
+; GFX10-NEXT:    v_lshrrev_b32_e32 v54, 16, v69
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v8, 16, v8
-; GFX10-NEXT:    s_waitcnt vmcnt(1)
-; GFX10-NEXT:    v_cndmask_b32_e64 v32, v10, v32, s4
-; GFX10-NEXT:    v_lshrrev_b32_e32 v10, 16, v10
+; GFX10-NEXT:    v_cndmask_b32_e64 v55, v4, v3, s18
+; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX10-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
+; GFX10-NEXT:    s_waitcnt vmcnt(3)
+; GFX10-NEXT:    v_cndmask_b32_e64 v64, v10, v12, s19
+; GFX10-NEXT:    v_lshrrev_b32_e32 v12, 16, v12
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_cndmask_b32_e32 v66, v12, v6, vcc_lo
-; GFX10-NEXT:    v_lshrrev_b32_e32 v6, 16, v12
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v1
-; GFX10-NEXT:    v_cndmask_b32_e64 v1, v35, v54, s18
-; GFX10-NEXT:    v_cndmask_b32_e64 v8, v8, v65, s20
-; GFX10-NEXT:    v_cndmask_b32_e64 v10, v10, v64, s21
-; GFX10-NEXT:    v_cndmask_b32_e64 v4, v6, v4, s22
-; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, v53, s23
-; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, v52, s24
-; GFX10-NEXT:    v_cndmask_b32_e64 v6, v24, v51, s25
-; GFX10-NEXT:    v_cndmask_b32_e64 v12, v20, v49, s26
-; GFX10-NEXT:    v_cndmask_b32_e64 v20, v48, v38, s27
-; GFX10-NEXT:    v_cndmask_b32_e64 v24, v37, v29, s28
-; GFX10-NEXT:    v_cndmask_b32_e64 v21, v25, v21, s29
-; GFX10-NEXT:    v_cndmask_b32_e64 v7, v11, v7, s31
-; GFX10-NEXT:    v_cndmask_b32_e64 v11, v16, v13, s30
-; GFX10-NEXT:    v_cndmask_b32_e64 v13, v19, v17, s34
-; GFX10-NEXT:    v_cndmask_b32_e64 v16, v28, v26, s35
-; GFX10-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e64 v39, v68, v67, s13
-; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 16, v7
-; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 16, v11
-; GFX10-NEXT:    v_lshlrev_b32_e32 v11, 16, v13
-; GFX10-NEXT:    v_lshlrev_b32_e32 v13, 16, v21
-; GFX10-NEXT:    v_lshlrev_b32_e32 v16, 16, v16
-; GFX10-NEXT:    v_lshlrev_b32_e32 v17, 16, v24
-; GFX10-NEXT:    v_lshlrev_b32_e32 v19, 16, v20
-; GFX10-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
-; GFX10-NEXT:    v_lshlrev_b32_e32 v20, 16, v6
-; GFX10-NEXT:    v_lshlrev_b32_e32 v21, 16, v0
-; GFX10-NEXT:    v_lshlrev_b32_e32 v24, 16, v2
-; GFX10-NEXT:    v_lshlrev_b32_e32 v25, 16, v4
-; GFX10-NEXT:    v_lshlrev_b32_e32 v26, 16, v10
-; GFX10-NEXT:    v_lshlrev_b32_e32 v28, 16, v8
-; GFX10-NEXT:    v_lshlrev_b32_e32 v29, 16, v1
-; GFX10-NEXT:    v_lshlrev_b32_e32 v35, 16, v3
-; GFX10-NEXT:    v_or_b32_sdwa v0, v9, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX10-NEXT:    v_or_b32_sdwa v1, v14, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX10-NEXT:    v_or_b32_sdwa v2, v18, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX10-NEXT:    v_or_b32_sdwa v3, v23, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX10-NEXT:    v_or_b32_sdwa v4, v27, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX10-NEXT:    v_or_b32_sdwa v5, v36, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX10-NEXT:    v_or_b32_sdwa v6, v39, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX10-NEXT:    v_or_b32_sdwa v7, v50, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX10-NEXT:    v_or_b32_sdwa v8, v22, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX10-NEXT:    v_or_b32_sdwa v9, v30, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX10-NEXT:    v_or_b32_sdwa v10, v55, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX10-NEXT:    v_or_b32_sdwa v11, v66, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX10-NEXT:    v_or_b32_sdwa v12, v32, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX10-NEXT:    v_or_b32_sdwa v13, v33, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX10-NEXT:    v_or_b32_sdwa v14, v34, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX10-NEXT:    v_or_b32_sdwa v15, v15, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-NEXT:    v_cndmask_b32_e64 v51, v1, v6, s15
+; GFX10-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
+; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX10-NEXT:    v_lshrrev_b32_e32 v10, 16, v10
+; GFX10-NEXT:    v_cndmask_b32_e64 v65, v14, v16, s20
+; GFX10-NEXT:    v_lshrrev_b32_e32 v16, 16, v16
+; GFX10-NEXT:    v_lshrrev_b32_e32 v14, 16, v14
+; GFX10-NEXT:    v_cndmask_b32_e32 v7, v9, v7, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v17
+; GFX10-NEXT:    v_cndmask_b32_e64 v48, v66, v67, s14
+; GFX10-NEXT:    v_cndmask_b32_e64 v9, v15, v13, s6
+; GFX10-NEXT:    v_cndmask_b32_e64 v13, v20, v19, s21
+; GFX10-NEXT:    v_cndmask_b32_e64 v15, v23, v22, s22
+; GFX10-NEXT:    v_cndmask_b32_e64 v19, v26, v25, s23
+; GFX10-NEXT:    v_cndmask_b32_e64 v20, v29, v28, s24
+; GFX10-NEXT:    v_cndmask_b32_e64 v22, v33, v32, s25
+; GFX10-NEXT:    v_cndmask_b32_e64 v23, v36, v35, s26
+; GFX10-NEXT:    v_cndmask_b32_e64 v25, v39, v38, s27
+; GFX10-NEXT:    v_cndmask_b32_e64 v26, v50, v49, s28
+; GFX10-NEXT:    v_cndmask_b32_e64 v28, v1, v6, s29
+; GFX10-NEXT:    v_cndmask_b32_e64 v17, v0, v2, s30
+; GFX10-NEXT:    v_cndmask_b32_e64 v29, v8, v54, s31
+; GFX10-NEXT:    v_cndmask_b32_e64 v32, v4, v3, s34
+; GFX10-NEXT:    v_cndmask_b32_e64 v33, v10, v12, s35
+; GFX10-NEXT:    v_cndmask_b32_e32 v16, v14, v16, vcc_lo
+; GFX10-NEXT:    v_perm_b32 v0, v7, v5, 0x5040100
+; GFX10-NEXT:    v_perm_b32 v1, v9, v11, 0x5040100
+; GFX10-NEXT:    v_perm_b32 v2, v13, v18, 0x5040100
+; GFX10-NEXT:    v_perm_b32 v3, v15, v21, 0x5040100
+; GFX10-NEXT:    v_perm_b32 v4, v19, v24, 0x5040100
+; GFX10-NEXT:    v_perm_b32 v5, v20, v27, 0x5040100
+; GFX10-NEXT:    v_perm_b32 v6, v22, v30, 0x5040100
+; GFX10-NEXT:    v_perm_b32 v7, v23, v34, 0x5040100
+; GFX10-NEXT:    v_perm_b32 v8, v25, v37, 0x5040100
+; GFX10-NEXT:    v_perm_b32 v9, v26, v48, 0x5040100
+; GFX10-NEXT:    v_perm_b32 v10, v28, v51, 0x5040100
+; GFX10-NEXT:    v_perm_b32 v11, v17, v52, 0x5040100
+; GFX10-NEXT:    v_perm_b32 v12, v29, v53, 0x5040100
+; GFX10-NEXT:    v_perm_b32 v13, v32, v55, 0x5040100
+; GFX10-NEXT:    v_perm_b32 v14, v33, v64, 0x5040100
+; GFX10-NEXT:    v_perm_b32 v15, v16, v65, 0x5040100
 ; GFX10-NEXT:    v_readlane_b32 s35, v31, 3
 ; GFX10-NEXT:    v_readlane_b32 s34, v31, 2
 ; GFX10-NEXT:    v_readlane_b32 s31, v31, 1
@@ -31468,250 +30034,207 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_clause 0x20
-; GFX11-NEXT:    scratch_load_b32 v31, off, s32 offset:4
+; GFX11-NEXT:    scratch_load_u16 v31, off, s32
 ; GFX11-NEXT:    scratch_load_b32 v32, off, s32 offset:68
-; GFX11-NEXT:    scratch_load_b32 v33, off, s32 offset:8
+; GFX11-NEXT:    scratch_load_b32 v33, off, s32 offset:4
 ; GFX11-NEXT:    scratch_load_b32 v34, off, s32 offset:72
-; GFX11-NEXT:    scratch_load_b32 v35, off, s32 offset:12
+; GFX11-NEXT:    scratch_load_b32 v35, off, s32 offset:8
 ; GFX11-NEXT:    scratch_load_b32 v36, off, s32 offset:76
-; GFX11-NEXT:    scratch_load_b32 v37, off, s32 offset:16
+; GFX11-NEXT:    scratch_load_b32 v37, off, s32 offset:12
 ; GFX11-NEXT:    scratch_load_b32 v38, off, s32 offset:80
-; GFX11-NEXT:    scratch_load_b32 v39, off, s32 offset:20
+; GFX11-NEXT:    scratch_load_b32 v39, off, s32 offset:16
 ; GFX11-NEXT:    scratch_load_b32 v48, off, s32 offset:84
-; GFX11-NEXT:    scratch_load_b32 v49, off, s32 offset:24
+; GFX11-NEXT:    scratch_load_b32 v49, off, s32 offset:20
 ; GFX11-NEXT:    scratch_load_b32 v50, off, s32 offset:88
-; GFX11-NEXT:    scratch_load_b32 v51, off, s32 offset:28
+; GFX11-NEXT:    scratch_load_b32 v51, off, s32 offset:24
 ; GFX11-NEXT:    scratch_load_b32 v52, off, s32 offset:92
-; GFX11-NEXT:    scratch_load_b32 v53, off, s32 offset:32
+; GFX11-NEXT:    scratch_load_b32 v53, off, s32 offset:28
 ; GFX11-NEXT:    scratch_load_b32 v54, off, s32 offset:96
-; GFX11-NEXT:    scratch_load_b32 v55, off, s32 offset:36
+; GFX11-NEXT:    scratch_load_b32 v55, off, s32 offset:32
 ; GFX11-NEXT:    scratch_load_b32 v64, off, s32 offset:100
-; GFX11-NEXT:    scratch_load_b32 v65, off, s32 offset:40
+; GFX11-NEXT:    scratch_load_b32 v65, off, s32 offset:36
 ; GFX11-NEXT:    scratch_load_b32 v66, off, s32 offset:104
-; GFX11-NEXT:    scratch_load_b32 v67, off, s32 offset:44
+; GFX11-NEXT:    scratch_load_b32 v67, off, s32 offset:40
 ; GFX11-NEXT:    scratch_load_b32 v68, off, s32 offset:108
-; GFX11-NEXT:    scratch_load_b32 v69, off, s32 offset:48
+; GFX11-NEXT:    scratch_load_b32 v69, off, s32 offset:44
 ; GFX11-NEXT:    scratch_load_b32 v70, off, s32 offset:112
-; GFX11-NEXT:    scratch_load_b32 v71, off, s32 offset:52
+; GFX11-NEXT:    scratch_load_b32 v71, off, s32 offset:48
 ; GFX11-NEXT:    scratch_load_b32 v80, off, s32 offset:116
-; GFX11-NEXT:    scratch_load_b32 v81, off, s32 offset:56
+; GFX11-NEXT:    scratch_load_b32 v81, off, s32 offset:52
 ; GFX11-NEXT:    scratch_load_b32 v82, off, s32 offset:120
-; GFX11-NEXT:    scratch_load_b32 v83, off, s32 offset:60
+; GFX11-NEXT:    scratch_load_b32 v83, off, s32 offset:56
 ; GFX11-NEXT:    scratch_load_b32 v84, off, s32 offset:124
-; GFX11-NEXT:    scratch_load_b32 v85, off, s32 offset:64
+; GFX11-NEXT:    scratch_load_b32 v85, off, s32 offset:60
 ; GFX11-NEXT:    scratch_load_b32 v86, off, s32 offset:128
-; GFX11-NEXT:    scratch_load_u16 v87, off, s32
-; GFX11-NEXT:    s_waitcnt vmcnt(32)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v96, 16, v31
-; GFX11-NEXT:    s_waitcnt vmcnt(31)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v97, 16, v32
+; GFX11-NEXT:    scratch_load_b32 v87, off, s32 offset:64
+; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX11-NEXT:    v_and_b32_e32 v2, 1, v2
+; GFX11-NEXT:    v_and_b32_e32 v4, 1, v4
+; GFX11-NEXT:    v_and_b32_e32 v6, 1, v6
+; GFX11-NEXT:    v_and_b32_e32 v8, 1, v8
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
+; GFX11-NEXT:    v_and_b32_e32 v27, 1, v27
+; GFX11-NEXT:    v_and_b32_e32 v10, 1, v10
+; GFX11-NEXT:    v_and_b32_e32 v12, 1, v12
+; GFX11-NEXT:    v_and_b32_e32 v14, 1, v14
 ; GFX11-NEXT:    s_waitcnt vmcnt(30)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v98, 16, v33
-; GFX11-NEXT:    s_waitcnt vmcnt(29)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v99, 16, v34
-; GFX11-NEXT:    v_and_b32_e32 v28, 1, v28
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, v32, v33, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v2
+; GFX11-NEXT:    v_and_b32_e32 v29, 1, v29
+; GFX11-NEXT:    v_lshrrev_b32_e32 v33, 16, v33
+; GFX11-NEXT:    v_lshrrev_b32_e32 v32, 16, v32
 ; GFX11-NEXT:    v_and_b32_e32 v30, 1, v30
 ; GFX11-NEXT:    s_waitcnt vmcnt(28)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v100, 16, v35
-; GFX11-NEXT:    s_waitcnt vmcnt(27)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v101, 16, v36
-; GFX11-NEXT:    v_and_b32_e32 v26, 1, v26
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v28
-; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX11-NEXT:    v_cndmask_b32_e32 v2, v34, v35, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v4
+; GFX11-NEXT:    v_and_b32_e32 v23, 1, v23
+; GFX11-NEXT:    v_lshrrev_b32_e32 v35, 16, v35
+; GFX11-NEXT:    v_lshrrev_b32_e32 v34, 16, v34
+; GFX11-NEXT:    v_and_b32_e32 v28, 1, v28
 ; GFX11-NEXT:    s_waitcnt vmcnt(26)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v102, 16, v37
-; GFX11-NEXT:    s_waitcnt vmcnt(25)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v103, 16, v38
-; GFX11-NEXT:    v_and_b32_e32 v24, 1, v24
+; GFX11-NEXT:    v_cndmask_b32_e32 v4, v36, v37, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v6
+; GFX11-NEXT:    v_and_b32_e32 v25, 1, v25
+; GFX11-NEXT:    v_lshrrev_b32_e32 v37, 16, v37
+; GFX11-NEXT:    v_lshrrev_b32_e32 v36, 16, v36
+; GFX11-NEXT:    v_and_b32_e32 v26, 1, v26
 ; GFX11-NEXT:    s_waitcnt vmcnt(24)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v112, 16, v39
-; GFX11-NEXT:    s_waitcnt vmcnt(23)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v113, 16, v48
-; GFX11-NEXT:    v_and_b32_e32 v22, 1, v22
+; GFX11-NEXT:    v_cndmask_b32_e32 v6, v38, v39, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v8
+; GFX11-NEXT:    v_and_b32_e32 v19, 1, v19
+; GFX11-NEXT:    v_lshrrev_b32_e32 v39, 16, v39
+; GFX11-NEXT:    v_lshrrev_b32_e32 v38, 16, v38
+; GFX11-NEXT:    v_and_b32_e32 v24, 1, v24
 ; GFX11-NEXT:    s_waitcnt vmcnt(22)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v114, 16, v49
-; GFX11-NEXT:    s_waitcnt vmcnt(21)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v115, 16, v50
-; GFX11-NEXT:    v_and_b32_e32 v20, 1, v20
+; GFX11-NEXT:    v_cndmask_b32_e32 v8, v48, v49, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v10
+; GFX11-NEXT:    v_and_b32_e32 v21, 1, v21
+; GFX11-NEXT:    v_lshrrev_b32_e32 v49, 16, v49
+; GFX11-NEXT:    v_lshrrev_b32_e32 v48, 16, v48
+; GFX11-NEXT:    v_and_b32_e32 v22, 1, v22
 ; GFX11-NEXT:    s_waitcnt vmcnt(20)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v116, 16, v51
-; GFX11-NEXT:    s_waitcnt vmcnt(19)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v117, 16, v52
-; GFX11-NEXT:    v_and_b32_e32 v18, 1, v18
+; GFX11-NEXT:    v_cndmask_b32_e32 v10, v50, v51, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v12
+; GFX11-NEXT:    v_and_b32_e32 v15, 1, v15
+; GFX11-NEXT:    v_lshrrev_b32_e32 v51, 16, v51
+; GFX11-NEXT:    v_lshrrev_b32_e32 v50, 16, v50
+; GFX11-NEXT:    v_and_b32_e32 v20, 1, v20
 ; GFX11-NEXT:    s_waitcnt vmcnt(18)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v118, 16, v53
-; GFX11-NEXT:    s_waitcnt vmcnt(17)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v119, 16, v54
-; GFX11-NEXT:    v_and_b32_e32 v16, 1, v16
+; GFX11-NEXT:    v_cndmask_b32_e32 v12, v52, v53, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v14
+; GFX11-NEXT:    v_and_b32_e32 v17, 1, v17
+; GFX11-NEXT:    v_lshrrev_b32_e32 v53, 16, v53
+; GFX11-NEXT:    v_lshrrev_b32_e32 v52, 16, v52
+; GFX11-NEXT:    v_and_b32_e32 v18, 1, v18
 ; GFX11-NEXT:    s_waitcnt vmcnt(16)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v128, 16, v55
-; GFX11-NEXT:    s_waitcnt vmcnt(15)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v129, 16, v64
+; GFX11-NEXT:    v_cndmask_b32_e32 v14, v54, v55, vcc_lo
+; GFX11-NEXT:    v_lshrrev_b32_e32 v55, 16, v55
+; GFX11-NEXT:    v_lshrrev_b32_e32 v54, 16, v54
+; GFX11-NEXT:    v_and_b32_e32 v16, 1, v16
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v16
+; GFX11-NEXT:    s_waitcnt vmcnt(14)
+; GFX11-NEXT:    v_dual_cndmask_b32 v16, v64, v65 :: v_dual_and_b32 v11, 1, v11
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v18
+; GFX11-NEXT:    v_and_b32_e32 v13, 1, v13
+; GFX11-NEXT:    v_lshrrev_b32_e32 v65, 16, v65
+; GFX11-NEXT:    v_lshrrev_b32_e32 v64, 16, v64
+; GFX11-NEXT:    s_waitcnt vmcnt(12)
+; GFX11-NEXT:    v_cndmask_b32_e32 v18, v66, v67, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v20
+; GFX11-NEXT:    v_and_b32_e32 v7, 1, v7
+; GFX11-NEXT:    v_lshrrev_b32_e32 v67, 16, v67
+; GFX11-NEXT:    v_lshrrev_b32_e32 v66, 16, v66
+; GFX11-NEXT:    s_waitcnt vmcnt(10)
+; GFX11-NEXT:    v_cndmask_b32_e32 v20, v68, v69, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v22
+; GFX11-NEXT:    v_and_b32_e32 v9, 1, v9
+; GFX11-NEXT:    v_lshrrev_b32_e32 v69, 16, v69
+; GFX11-NEXT:    v_lshrrev_b32_e32 v68, 16, v68
+; GFX11-NEXT:    s_waitcnt vmcnt(8)
+; GFX11-NEXT:    v_cndmask_b32_e32 v22, v70, v71, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v24
+; GFX11-NEXT:    v_and_b32_e32 v3, 1, v3
+; GFX11-NEXT:    v_lshrrev_b32_e32 v71, 16, v71
+; GFX11-NEXT:    v_lshrrev_b32_e32 v70, 16, v70
+; GFX11-NEXT:    s_waitcnt vmcnt(6)
+; GFX11-NEXT:    v_cndmask_b32_e32 v24, v80, v81, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v26
+; GFX11-NEXT:    v_and_b32_e32 v5, 1, v5
+; GFX11-NEXT:    v_lshrrev_b32_e32 v81, 16, v81
+; GFX11-NEXT:    v_lshrrev_b32_e32 v80, 16, v80
 ; GFX11-NEXT:    s_waitcnt vmcnt(4)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v28, 16, v83
-; GFX11-NEXT:    s_waitcnt vmcnt(3)
-; GFX11-NEXT:    v_cndmask_b32_e32 v83, v84, v83, vcc_lo
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v30
-; GFX11-NEXT:    v_and_b32_e32 v2, 1, v2
+; GFX11-NEXT:    v_cndmask_b32_e32 v26, v82, v83, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v28
+; GFX11-NEXT:    v_and_b32_e32 v31, 1, v31
+; GFX11-NEXT:    v_lshrrev_b32_e32 v83, 16, v83
+; GFX11-NEXT:    v_lshrrev_b32_e32 v82, 16, v82
 ; GFX11-NEXT:    s_waitcnt vmcnt(2)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v30, 16, v85
-; GFX11-NEXT:    v_and_b32_e32 v14, 1, v14
-; GFX11-NEXT:    v_lshrrev_b32_e32 v130, 16, v65
-; GFX11-NEXT:    s_waitcnt vmcnt(1)
-; GFX11-NEXT:    v_cndmask_b32_e32 v85, v86, v85, vcc_lo
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v26
-; GFX11-NEXT:    v_and_b32_e32 v3, 1, v3
-; GFX11-NEXT:    v_lshrrev_b32_e32 v131, 16, v66
-; GFX11-NEXT:    v_and_b32_e32 v12, 1, v12
-; GFX11-NEXT:    v_lshrrev_b32_e32 v132, 16, v67
-; GFX11-NEXT:    v_cndmask_b32_e32 v26, v82, v81, vcc_lo
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v24
+; GFX11-NEXT:    v_cndmask_b32_e32 v28, v84, v85, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v30
 ; GFX11-NEXT:    v_and_b32_e32 v1, 1, v1
-; GFX11-NEXT:    v_lshrrev_b32_e32 v133, 16, v68
-; GFX11-NEXT:    v_and_b32_e32 v10, 1, v10
-; GFX11-NEXT:    v_lshrrev_b32_e32 v134, 16, v69
-; GFX11-NEXT:    v_cndmask_b32_e32 v24, v80, v71, vcc_lo
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v22
-; GFX11-NEXT:    v_and_b32_e32 v7, 1, v7
-; GFX11-NEXT:    v_lshrrev_b32_e32 v135, 16, v70
-; GFX11-NEXT:    v_and_b32_e32 v8, 1, v8
-; GFX11-NEXT:    v_lshrrev_b32_e32 v144, 16, v71
-; GFX11-NEXT:    v_cndmask_b32_e32 v22, v70, v69, vcc_lo
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v20
-; GFX11-NEXT:    v_and_b32_e32 v5, 1, v5
-; GFX11-NEXT:    v_lshrrev_b32_e32 v145, 16, v80
-; GFX11-NEXT:    v_and_b32_e32 v6, 1, v6
-; GFX11-NEXT:    v_lshrrev_b32_e32 v146, 16, v81
-; GFX11-NEXT:    v_cndmask_b32_e32 v20, v68, v67, vcc_lo
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v18
-; GFX11-NEXT:    v_and_b32_e32 v11, 1, v11
-; GFX11-NEXT:    v_lshrrev_b32_e32 v147, 16, v82
-; GFX11-NEXT:    v_and_b32_e32 v4, 1, v4
+; GFX11-NEXT:    v_lshrrev_b32_e32 v85, 16, v85
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v84, 16, v84
-; GFX11-NEXT:    v_cndmask_b32_e32 v18, v66, v65, vcc_lo
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v16
-; GFX11-NEXT:    v_and_b32_e32 v9, 1, v9
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_and_b32_e32 v87, 1, v87
-; GFX11-NEXT:    v_lshrrev_b32_e32 v86, 16, v86
-; GFX11-NEXT:    v_and_b32_e32 v18, 0xffff, v18
-; GFX11-NEXT:    v_cndmask_b32_e32 v16, v64, v55, vcc_lo
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v14
-; GFX11-NEXT:    v_and_b32_e32 v15, 1, v15
-; GFX11-NEXT:    v_and_b32_e32 v20, 0xffff, v20
-; GFX11-NEXT:    v_and_b32_e32 v22, 0xffff, v22
-; GFX11-NEXT:    v_and_b32_e32 v16, 0xffff, v16
-; GFX11-NEXT:    v_cndmask_b32_e32 v14, v54, v53, vcc_lo
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v12
-; GFX11-NEXT:    v_and_b32_e32 v13, 1, v13
-; GFX11-NEXT:    v_and_b32_e32 v24, 0xffff, v24
-; GFX11-NEXT:    v_and_b32_e32 v26, 0xffff, v26
-; GFX11-NEXT:    v_and_b32_e32 v14, 0xffff, v14
-; GFX11-NEXT:    v_cndmask_b32_e32 v12, v52, v51, vcc_lo
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v10
-; GFX11-NEXT:    v_and_b32_e32 v19, 1, v19
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_and_b32_e32 v12, 0xffff, v12
-; GFX11-NEXT:    v_cndmask_b32_e32 v10, v50, v49, vcc_lo
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v8
-; GFX11-NEXT:    v_and_b32_e32 v17, 1, v17
-; GFX11-NEXT:    v_and_b32_e32 v10, 0xffff, v10
-; GFX11-NEXT:    v_cndmask_b32_e32 v8, v48, v39, vcc_lo
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v6
-; GFX11-NEXT:    v_and_b32_e32 v23, 1, v23
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_and_b32_e32 v8, 0xffff, v8
-; GFX11-NEXT:    v_cndmask_b32_e32 v6, v38, v37, vcc_lo
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v4
-; GFX11-NEXT:    v_and_b32_e32 v21, 1, v21
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xffff, v6
-; GFX11-NEXT:    v_cndmask_b32_e32 v4, v36, v35, vcc_lo
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v2
-; GFX11-NEXT:    v_and_b32_e32 v27, 1, v27
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xffff, v4
-; GFX11-NEXT:    v_cndmask_b32_e32 v2, v34, v33, vcc_lo
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX11-NEXT:    v_and_b32_e32 v25, 1, v25
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v32, v31, vcc_lo
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v87
-; GFX11-NEXT:    v_and_b32_e32 v29, 1, v29
-; GFX11-NEXT:    v_and_b32_e32 v31, 0xffff, v85
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT:    v_cndmask_b32_e32 v30, v86, v30, vcc_lo
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v29
-; GFX11-NEXT:    v_and_b32_e32 v29, 0xffff, v83
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_lshlrev_b32_e32 v30, 16, v30
-; GFX11-NEXT:    v_cndmask_b32_e32 v28, v84, v28, vcc_lo
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v27
-; GFX11-NEXT:    v_dual_cndmask_b32 v27, v147, v146 :: v_dual_lshlrev_b32 v28, 16, v28
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v25
+; GFX11-NEXT:    v_cndmask_b32_e32 v30, v86, v87, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v1
+; GFX11-NEXT:    v_lshrrev_b32_e32 v87, 16, v87
+; GFX11-NEXT:    v_lshrrev_b32_e32 v86, 16, v86
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, v32, v33, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v3
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_lshlrev_b32_e32 v27, 16, v27
-; GFX11-NEXT:    v_cndmask_b32_e32 v25, v145, v144, vcc_lo
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v23
-; GFX11-NEXT:    v_lshlrev_b32_e32 v25, 16, v25
-; GFX11-NEXT:    v_cndmask_b32_e32 v23, v135, v134, vcc_lo
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v21
+; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
+; GFX11-NEXT:    v_cndmask_b32_e32 v3, v34, v35, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v5
+; GFX11-NEXT:    v_perm_b32 v1, v3, v2, 0x5040100
+; GFX11-NEXT:    v_cndmask_b32_e32 v5, v36, v37, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v7
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_lshlrev_b32_e32 v23, 16, v23
-; GFX11-NEXT:    v_cndmask_b32_e32 v21, v133, v132, vcc_lo
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v19
-; GFX11-NEXT:    v_lshlrev_b32_e32 v21, 16, v21
-; GFX11-NEXT:    v_cndmask_b32_e32 v19, v131, v130, vcc_lo
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v17
+; GFX11-NEXT:    v_perm_b32 v2, v5, v4, 0x5040100
+; GFX11-NEXT:    v_cndmask_b32_e32 v7, v38, v39, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v9
+; GFX11-NEXT:    v_perm_b32 v3, v7, v6, 0x5040100
+; GFX11-NEXT:    v_cndmask_b32_e32 v9, v48, v49, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v11
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_lshlrev_b32_e32 v19, 16, v19
-; GFX11-NEXT:    v_cndmask_b32_e32 v17, v129, v128, vcc_lo
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v15
-; GFX11-NEXT:    v_lshlrev_b32_e32 v17, 16, v17
-; GFX11-NEXT:    v_cndmask_b32_e32 v15, v119, v118, vcc_lo
+; GFX11-NEXT:    v_perm_b32 v4, v9, v8, 0x5040100
+; GFX11-NEXT:    v_cndmask_b32_e32 v11, v50, v51, vcc_lo
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v13
+; GFX11-NEXT:    v_perm_b32 v5, v11, v10, 0x5040100
+; GFX11-NEXT:    v_cndmask_b32_e32 v13, v52, v53, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v15
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
-; GFX11-NEXT:    v_cndmask_b32_e32 v13, v117, v116, vcc_lo
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v11
-; GFX11-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
-; GFX11-NEXT:    v_cndmask_b32_e32 v11, v115, v114, vcc_lo
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v9
+; GFX11-NEXT:    v_perm_b32 v6, v13, v12, 0x5040100
+; GFX11-NEXT:    v_cndmask_b32_e32 v15, v54, v55, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v17
+; GFX11-NEXT:    v_perm_b32 v7, v15, v14, 0x5040100
+; GFX11-NEXT:    v_cndmask_b32_e32 v17, v64, v65, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v19
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
-; GFX11-NEXT:    v_cndmask_b32_e32 v9, v113, v112, vcc_lo
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v7
-; GFX11-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
-; GFX11-NEXT:    v_cndmask_b32_e32 v7, v103, v102, vcc_lo
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v5
+; GFX11-NEXT:    v_perm_b32 v8, v17, v16, 0x5040100
+; GFX11-NEXT:    v_cndmask_b32_e32 v19, v66, v67, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v21
+; GFX11-NEXT:    v_perm_b32 v9, v19, v18, 0x5040100
+; GFX11-NEXT:    v_cndmask_b32_e32 v21, v68, v69, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v23
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
-; GFX11-NEXT:    v_cndmask_b32_e32 v5, v101, v100, vcc_lo
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v3
-; GFX11-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX11-NEXT:    v_cndmask_b32_e32 v3, v99, v98, vcc_lo
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v1
+; GFX11-NEXT:    v_perm_b32 v10, v21, v20, 0x5040100
+; GFX11-NEXT:    v_cndmask_b32_e32 v23, v70, v71, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v25
+; GFX11-NEXT:    v_perm_b32 v11, v23, v22, 0x5040100
+; GFX11-NEXT:    v_cndmask_b32_e32 v25, v80, v81, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v27
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_perm_b32 v12, v25, v24, 0x5040100
+; GFX11-NEXT:    v_cndmask_b32_e32 v27, v82, v83, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v29
+; GFX11-NEXT:    v_perm_b32 v13, v27, v26, 0x5040100
+; GFX11-NEXT:    v_cndmask_b32_e32 v29, v84, v85, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v31
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-NEXT:    v_cndmask_b32_e32 v1, v97, v96, vcc_lo
-; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX11-NEXT:    v_or_b32_e32 v1, v2, v3
-; GFX11-NEXT:    v_or_b32_e32 v2, v4, v5
-; GFX11-NEXT:    v_or_b32_e32 v3, v6, v7
-; GFX11-NEXT:    v_or_b32_e32 v4, v8, v9
-; GFX11-NEXT:    v_or_b32_e32 v5, v10, v11
-; GFX11-NEXT:    v_or_b32_e32 v6, v12, v13
-; GFX11-NEXT:    v_or_b32_e32 v7, v14, v15
-; GFX11-NEXT:    v_or_b32_e32 v8, v16, v17
-; GFX11-NEXT:    v_or_b32_e32 v9, v18, v19
-; GFX11-NEXT:    v_or_b32_e32 v10, v20, v21
-; GFX11-NEXT:    v_or_b32_e32 v11, v22, v23
-; GFX11-NEXT:    v_or_b32_e32 v12, v24, v25
-; GFX11-NEXT:    v_or_b32_e32 v13, v26, v27
-; GFX11-NEXT:    v_or_b32_e32 v14, v29, v28
-; GFX11-NEXT:    v_or_b32_e32 v15, v31, v30
+; GFX11-NEXT:    v_perm_b32 v14, v29, v28, 0x5040100
+; GFX11-NEXT:    v_cndmask_b32_e32 v31, v86, v87, vcc_lo
+; GFX11-NEXT:    v_perm_b32 v15, v31, v30, 0x5040100
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = select <32 x i1> %cond, <32 x bfloat> %a, <32 x bfloat> %b
   ret <32 x bfloat> %op
@@ -31746,42 +30269,42 @@ define bfloat @v_fma_bf16(bfloat %a, bfloat %b, bfloat %c) {
 ; GFX8-LABEL: v_fma_bf16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
-; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX8-NEXT:    v_fma_f32 v0, v0, v1, v2
-; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_fma_bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
-; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX9-NEXT:    v_fma_f32 v0, v0, v1, v2
-; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fma_bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
-; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX10-NEXT:    v_fmac_f32_e32 v2, v0, v1
-; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v2
+; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 16, v2
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fma_bf16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_fmac_f32_e32 v2, v0, v1
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v2
+; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v2
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = call bfloat @llvm.fma.bf16(bfloat %a, bfloat %b, bfloat %c)
   ret bfloat %op
@@ -31821,59 +30344,59 @@ define <2 x bfloat> @v_fma_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat>
 ; GFX8-LABEL: v_fma_v2bf16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v2
-; GFX8-NEXT:    v_and_b32_e32 v4, 0xffff0000, v1
-; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v0
-; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX8-NEXT:    v_fma_f32 v3, v5, v4, v3
+; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
+; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v1
+; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
+; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GFX8-NEXT:    v_fma_f32 v0, v0, v1, v2
-; GFX8-NEXT:    s_mov_b32 s4, 0x3020706
-; GFX8-NEXT:    v_perm_b32 v0, v0, v3, s4
+; GFX8-NEXT:    v_fma_f32 v3, v5, v4, v3
+; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX8-NEXT:    v_alignbit_b32 v0, v0, v3, 16
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_fma_v2bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v2
-; GFX9-NEXT:    v_and_b32_e32 v4, 0xffff0000, v1
-; GFX9-NEXT:    v_and_b32_e32 v5, 0xffff0000, v0
-; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
+; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 16, v1
+; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
+; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GFX9-NEXT:    v_fma_f32 v3, v5, v4, v3
 ; GFX9-NEXT:    v_fma_f32 v0, v0, v1, v2
-; GFX9-NEXT:    s_mov_b32 s4, 0x3020706
+; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
 ; GFX9-NEXT:    v_perm_b32 v0, v0, v3, s4
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fma_v2bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v2
-; GFX10-NEXT:    v_and_b32_e32 v4, 0xffff0000, v1
-; GFX10-NEXT:    v_and_b32_e32 v5, 0xffff0000, v0
-; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
+; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v1
+; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
+; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GFX10-NEXT:    v_fmac_f32_e32 v3, v5, v4
 ; GFX10-NEXT:    v_fmac_f32_e32 v2, v0, v1
-; GFX10-NEXT:    v_perm_b32 v0, v2, v3, 0x3020706
+; GFX10-NEXT:    v_perm_b32 v0, v2, v3, 0x7060302
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fma_v2bf16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xffff0000, v1
-; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff0000, v0
-; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v2
-; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 16, v1
+; GFX11-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_dual_fmac_f32 v2, v0, v1 :: v_dual_fmac_f32 v3, v5, v4
-; GFX11-NEXT:    v_perm_b32 v0, v2, v3, 0x3020706
+; GFX11-NEXT:    v_perm_b32 v0, v2, v3, 0x7060302
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = call <2 x bfloat> @llvm.fma.v2bf16(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat> %c)
   ret <2 x bfloat> %op
@@ -31923,80 +30446,97 @@ define <3 x bfloat> @v_fma_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b, <3 x bfloat>
 ; GFX8-LABEL: v_fma_v3bf16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX8-NEXT:    v_fma_f32 v1, v1, v3, v5
-; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v4
-; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
-; GFX8-NEXT:    v_and_b32_e32 v6, 0xffff0000, v0
-; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX8-NEXT:    v_fma_f32 v3, v6, v5, v3
+; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v4
+; GFX8-NEXT:    v_lshlrev_b32_e32 v7, 16, v2
+; GFX8-NEXT:    v_lshlrev_b32_e32 v8, 16, v0
+; GFX8-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GFX8-NEXT:    v_fma_f32 v0, v0, v2, v4
-; GFX8-NEXT:    s_mov_b32 s4, 0x3020706
+; GFX8-NEXT:    v_fma_f32 v6, v8, v7, v6
+; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX8-NEXT:    v_alignbit_b32 v0, v0, v6, 16
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v5
+; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
+; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v1
+; GFX8-NEXT:    v_fma_f32 v2, v6, v4, v2
+; GFX8-NEXT:    v_and_b32_e32 v4, 0xffff0000, v5
+; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX8-NEXT:    v_fma_f32 v1, v1, v3, v4
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX8-NEXT:    v_perm_b32 v0, v0, v3, s4
+; GFX8-NEXT:    v_alignbit_b32 v1, v1, v2, 16
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_fma_v3bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX9-NEXT:    v_fma_f32 v1, v1, v3, v5
-; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v4
-; GFX9-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
-; GFX9-NEXT:    v_and_b32_e32 v6, 0xffff0000, v0
-; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX9-NEXT:    v_fma_f32 v3, v6, v5, v3
+; GFX9-NEXT:    v_lshlrev_b32_e32 v6, 16, v4
+; GFX9-NEXT:    v_lshlrev_b32_e32 v7, 16, v2
+; GFX9-NEXT:    v_lshlrev_b32_e32 v8, 16, v0
+; GFX9-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    v_fma_f32 v6, v8, v7, v6
 ; GFX9-NEXT:    v_fma_f32 v0, v0, v2, v4
-; GFX9-NEXT:    s_mov_b32 s4, 0x3020706
-; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX9-NEXT:    v_perm_b32 v0, v0, v3, s4
+; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
+; GFX9-NEXT:    v_perm_b32 v0, v0, v6, s4
+; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v5
+; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
+; GFX9-NEXT:    v_lshlrev_b32_e32 v6, 16, v1
+; GFX9-NEXT:    v_fma_f32 v2, v6, v4, v2
+; GFX9-NEXT:    v_and_b32_e32 v4, 0xffff0000, v5
+; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX9-NEXT:    v_fma_f32 v1, v1, v3, v4
+; GFX9-NEXT:    v_perm_b32 v1, v1, v2, s4
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fma_v3bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX10-NEXT:    v_and_b32_e32 v6, 0xffff0000, v4
-; GFX10-NEXT:    v_and_b32_e32 v7, 0xffff0000, v2
-; GFX10-NEXT:    v_and_b32_e32 v8, 0xffff0000, v0
-; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 16, v4
+; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 16, v2
+; GFX10-NEXT:    v_lshlrev_b32_e32 v8, 16, v0
+; GFX10-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v9, 16, v5
+; GFX10-NEXT:    v_lshlrev_b32_e32 v10, 16, v3
+; GFX10-NEXT:    v_lshlrev_b32_e32 v11, 16, v1
+; GFX10-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
+; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX10-NEXT:    v_fmac_f32_e32 v6, v8, v7
 ; GFX10-NEXT:    v_fmac_f32_e32 v4, v0, v2
+; GFX10-NEXT:    v_fmac_f32_e32 v9, v11, v10
 ; GFX10-NEXT:    v_fmac_f32_e32 v5, v1, v3
-; GFX10-NEXT:    v_perm_b32 v0, v4, v6, 0x3020706
-; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v5
+; GFX10-NEXT:    v_perm_b32 v0, v4, v6, 0x7060302
+; GFX10-NEXT:    v_perm_b32 v1, v5, v9, 0x7060302
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fma_v3bf16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_and_b32_e32 v7, 0xffff0000, v2
-; GFX11-NEXT:    v_and_b32_e32 v8, 0xffff0000, v0
-; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xffff0000, v4
-; GFX11-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_fmac_f32_e32 v6, v8, v7
-; GFX11-NEXT:    v_dual_fmac_f32 v4, v0, v2 :: v_dual_fmac_f32 v5, v1, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_perm_b32 v0, v4, v6, 0x3020706
-; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v5
+; GFX11-NEXT:    v_lshlrev_b32_e32 v7, 16, v2
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX11-NEXT:    v_lshlrev_b32_e32 v9, 16, v5
+; GFX11-NEXT:    v_lshlrev_b32_e32 v10, 16, v3
+; GFX11-NEXT:    v_lshlrev_b32_e32 v11, 16, v1
+; GFX11-NEXT:    v_lshlrev_b32_e32 v6, 16, v4
+; GFX11-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX11-NEXT:    v_lshlrev_b32_e32 v8, 16, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_dual_fmac_f32 v5, v1, v3 :: v_dual_and_b32 v0, 0xffff0000, v0
+; GFX11-NEXT:    v_fmac_f32_e32 v4, v0, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_dual_fmac_f32 v9, v11, v10 :: v_dual_fmac_f32 v6, v8, v7
+; GFX11-NEXT:    v_perm_b32 v1, v5, v9, 0x7060302
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT:    v_perm_b32 v0, v4, v6, 0x7060302
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = call <3 x bfloat> @llvm.fma.v3bf16(<3 x bfloat> %a, <3 x bfloat> %b, <3 x bfloat> %c)
   ret <3 x bfloat> %op
@@ -32056,100 +30596,97 @@ define <4 x bfloat> @v_fma_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfloat>
 ; GFX8-LABEL: v_fma_v4bf16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_and_b32_e32 v6, 0xffff0000, v5
-; GFX8-NEXT:    v_and_b32_e32 v7, 0xffff0000, v3
-; GFX8-NEXT:    v_and_b32_e32 v8, 0xffff0000, v1
-; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX8-NEXT:    v_fma_f32 v6, v8, v7, v6
-; GFX8-NEXT:    v_fma_f32 v1, v1, v3, v5
-; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v4
-; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
-; GFX8-NEXT:    v_and_b32_e32 v7, 0xffff0000, v0
-; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX8-NEXT:    v_fma_f32 v3, v7, v5, v3
+; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v4
+; GFX8-NEXT:    v_lshlrev_b32_e32 v7, 16, v2
+; GFX8-NEXT:    v_lshlrev_b32_e32 v8, 16, v0
+; GFX8-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GFX8-NEXT:    v_fma_f32 v0, v0, v2, v4
-; GFX8-NEXT:    s_mov_b32 s4, 0x3020706
-; GFX8-NEXT:    v_perm_b32 v0, v0, v3, s4
-; GFX8-NEXT:    v_perm_b32 v1, v1, v6, s4
+; GFX8-NEXT:    v_fma_f32 v6, v8, v7, v6
+; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX8-NEXT:    v_alignbit_b32 v0, v0, v6, 16
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v5
+; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
+; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v1
+; GFX8-NEXT:    v_fma_f32 v2, v6, v4, v2
+; GFX8-NEXT:    v_and_b32_e32 v4, 0xffff0000, v5
+; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX8-NEXT:    v_fma_f32 v1, v1, v3, v4
+; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX8-NEXT:    v_alignbit_b32 v1, v1, v2, 16
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_fma_v4bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_lshlrev_b32_e32 v6, 16, v5
-; GFX9-NEXT:    v_lshlrev_b32_e32 v7, 16, v3
-; GFX9-NEXT:    v_lshlrev_b32_e32 v8, 16, v1
-; GFX9-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
-; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
-; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX9-NEXT:    v_lshlrev_b32_e32 v6, 16, v4
+; GFX9-NEXT:    v_lshlrev_b32_e32 v7, 16, v2
+; GFX9-NEXT:    v_lshlrev_b32_e32 v8, 16, v0
+; GFX9-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GFX9-NEXT:    v_fma_f32 v6, v8, v7, v6
-; GFX9-NEXT:    v_fma_f32 v1, v1, v3, v5
-; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v4
-; GFX9-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
-; GFX9-NEXT:    v_and_b32_e32 v7, 0xffff0000, v0
-; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX9-NEXT:    v_fma_f32 v3, v7, v5, v3
 ; GFX9-NEXT:    v_fma_f32 v0, v0, v2, v4
-; GFX9-NEXT:    s_mov_b32 s4, 0x3020706
+; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
+; GFX9-NEXT:    v_perm_b32 v0, v0, v6, s4
+; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v5
+; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
+; GFX9-NEXT:    v_lshlrev_b32_e32 v6, 16, v1
+; GFX9-NEXT:    v_fma_f32 v2, v6, v4, v2
+; GFX9-NEXT:    v_and_b32_e32 v4, 0xffff0000, v5
+; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
 ; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX9-NEXT:    v_perm_b32 v0, v0, v3, s4
-; GFX9-NEXT:    v_or_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_fma_f32 v1, v1, v3, v4
+; GFX9-NEXT:    v_perm_b32 v1, v1, v2, s4
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fma_v4bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 16, v5
-; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 16, v3
-; GFX10-NEXT:    v_lshlrev_b32_e32 v8, 16, v1
+; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 16, v4
+; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 16, v2
+; GFX10-NEXT:    v_lshlrev_b32_e32 v8, 16, v0
+; GFX10-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v9, 16, v5
+; GFX10-NEXT:    v_lshlrev_b32_e32 v10, 16, v3
+; GFX10-NEXT:    v_lshlrev_b32_e32 v11, 16, v1
 ; GFX10-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
 ; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
 ; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX10-NEXT:    v_and_b32_e32 v9, 0xffff0000, v4
-; GFX10-NEXT:    v_and_b32_e32 v10, 0xffff0000, v2
-; GFX10-NEXT:    v_and_b32_e32 v11, 0xffff0000, v0
-; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX10-NEXT:    v_fmac_f32_e32 v5, v1, v3
 ; GFX10-NEXT:    v_fmac_f32_e32 v6, v8, v7
-; GFX10-NEXT:    v_fmac_f32_e32 v9, v11, v10
 ; GFX10-NEXT:    v_fmac_f32_e32 v4, v0, v2
-; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v5
-; GFX10-NEXT:    v_perm_b32 v0, v4, v9, 0x3020706
-; GFX10-NEXT:    v_or_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT:    v_fmac_f32_e32 v9, v11, v10
+; GFX10-NEXT:    v_fmac_f32_e32 v5, v1, v3
+; GFX10-NEXT:    v_perm_b32 v0, v4, v6, 0x7060302
+; GFX10-NEXT:    v_perm_b32 v1, v5, v9, 0x7060302
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fma_v4bf16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_lshlrev_b32_e32 v8, 16, v1
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX11-NEXT:    v_and_b32_e32 v9, 0xffff0000, v0
-; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX11-NEXT:    v_lshlrev_b32_e32 v7, 16, v3
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
-; GFX11-NEXT:    v_lshlrev_b32_e32 v6, 16, v5
+; GFX11-NEXT:    v_lshlrev_b32_e32 v7, 16, v2
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX11-NEXT:    v_lshlrev_b32_e32 v9, 16, v5
+; GFX11-NEXT:    v_lshlrev_b32_e32 v10, 16, v3
+; GFX11-NEXT:    v_lshlrev_b32_e32 v11, 16, v1
+; GFX11-NEXT:    v_lshlrev_b32_e32 v6, 16, v4
+; GFX11-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
 ; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_fmac_f32_e32 v5, v1, v3
-; GFX11-NEXT:    v_dual_fmac_f32 v6, v8, v7 :: v_dual_and_b32 v7, 0xffff0000, v4
-; GFX11-NEXT:    v_and_b32_e32 v8, 0xffff0000, v2
-; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v6
-; GFX11-NEXT:    v_dual_fmac_f32 v7, v9, v8 :: v_dual_fmac_f32 v4, v0, v2
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v5
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_perm_b32 v0, v4, v7, 0x3020706
-; GFX11-NEXT:    v_or_b32_e32 v1, v1, v2
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX11-NEXT:    v_lshlrev_b32_e32 v8, 16, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_dual_fmac_f32 v5, v1, v3 :: v_dual_and_b32 v0, 0xffff0000, v0
+; GFX11-NEXT:    v_fmac_f32_e32 v4, v0, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_dual_fmac_f32 v9, v11, v10 :: v_dual_fmac_f32 v6, v8, v7
+; GFX11-NEXT:    v_perm_b32 v1, v5, v9, 0x7060302
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT:    v_perm_b32 v0, v4, v6, 0x7060302
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = call <4 x bfloat> @llvm.fma.v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfloat> %c)
   ret <4 x bfloat> %op
@@ -32188,50 +30725,50 @@ define bfloat @v_fmuladd_bf16(bfloat %a, bfloat %b, bfloat %c) {
 ; GFX8-LABEL: v_fmuladd_bf16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX8-NEXT:    v_mul_f32_e32 v0, v0, v1
 ; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v2
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
 ; GFX8-NEXT:    v_add_f32_e32 v0, v0, v1
-; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_fmuladd_bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v1
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v2
+; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
 ; GFX9-NEXT:    v_add_f32_e32 v0, v0, v1
-; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fmuladd_bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX10-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v2
+; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
 ; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GFX10-NEXT:    v_add_f32_e32 v0, v0, v1
-; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fmuladd_bf16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_dual_mul_f32 v0, v0, v1 :: v_dual_and_b32 v1, 0xffff0000, v2
+; GFX11-NEXT:    v_dual_mul_f32 v0, v0, v1 :: v_dual_lshlrev_b32 v1, 16, v2
 ; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_add_f32_e32 v0, v0, v1
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = call bfloat @llvm.fmuladd.bf16(bfloat %a, bfloat %b, bfloat %c)
   ret bfloat %op
@@ -32275,73 +30812,73 @@ define <2 x bfloat> @v_fmuladd_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfl
 ; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: v_fmuladd_v2bf16:
-; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v1
-; GFX8-NEXT:    v_and_b32_e32 v4, 0xffff0000, v0
-; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX8-NEXT:    v_mul_f32_e32 v3, v4, v3
+;
+; GFX8-LABEL: v_fmuladd_v2bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v1
+; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v0
+; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GFX8-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
-; GFX8-NEXT:    v_and_b32_e32 v4, 0xffff0000, v2
+; GFX8-NEXT:    v_mul_f32_e32 v3, v4, v3
 ; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
-; GFX8-NEXT:    v_add_f32_e32 v3, v3, v4
+; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v2
+; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
 ; GFX8-NEXT:    v_add_f32_e32 v0, v0, v1
-; GFX8-NEXT:    s_mov_b32 s4, 0x3020706
-; GFX8-NEXT:    v_perm_b32 v0, v0, v3, s4
+; GFX8-NEXT:    v_add_f32_e32 v3, v3, v4
+; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX8-NEXT:    v_alignbit_b32 v0, v0, v3, 16
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_fmuladd_v2bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v1
-; GFX9-NEXT:    v_and_b32_e32 v4, 0xffff0000, v0
-; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 16, v1
+; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 16, v0
+; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GFX9-NEXT:    v_mul_f32_e32 v3, v4, v3
 ; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v1
 ; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
-; GFX9-NEXT:    v_and_b32_e32 v4, 0xffff0000, v2
+; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
+; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v2
 ; GFX9-NEXT:    v_add_f32_e32 v3, v3, v4
 ; GFX9-NEXT:    v_add_f32_e32 v0, v0, v1
-; GFX9-NEXT:    s_mov_b32 s4, 0x3020706
+; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
 ; GFX9-NEXT:    v_perm_b32 v0, v0, v3, s4
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fmuladd_v2bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v1
-; GFX10-NEXT:    v_and_b32_e32 v4, 0xffff0000, v0
-; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v1
+; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v0
+; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GFX10-NEXT:    v_mul_f32_e32 v3, v4, v3
 ; GFX10-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v2
-; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
+; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
 ; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GFX10-NEXT:    v_add_f32_e32 v1, v3, v1
 ; GFX10-NEXT:    v_add_f32_e32 v0, v0, v2
-; GFX10-NEXT:    v_perm_b32 v0, v0, v1, 0x3020706
+; GFX10-NEXT:    v_perm_b32 v0, v0, v1, 0x7060302
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fmuladd_v2bf16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v1
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xffff0000, v0
-; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v1
+; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 16, v0
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_dual_mul_f32 v0, v0, v1 :: v_dual_and_b32 v1, 0xffff0000, v2
-; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-NEXT:    v_dual_mul_f32 v0, v0, v1 :: v_dual_lshlrev_b32 v1, 16, v2
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_add_f32_e32 v0, v0, v2
@@ -32349,7 +30886,7 @@ define <2 x bfloat> @v_fmuladd_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfl
 ; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_add_f32_e32 v1, v3, v1
-; GFX11-NEXT:    v_perm_b32 v0, v0, v1, 0x3020706
+; GFX11-NEXT:    v_perm_b32 v0, v0, v1, 0x7060302
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = call <2 x bfloat> @llvm.fmuladd.v2bf16(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat> %c)
   ret <2 x bfloat> %op
@@ -32411,108 +30948,132 @@ define <3 x bfloat> @v_fmuladd_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b, <3 x bfl
 ; GFX8-LABEL: v_fmuladd_v3bf16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX8-NEXT:    v_mul_f32_e32 v1, v1, v3
-; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v5
-; GFX8-NEXT:    v_add_f32_e32 v1, v1, v3
-; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v2
-; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v0
-; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX8-NEXT:    v_mul_f32_e32 v3, v5, v3
+; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v2
+; GFX8-NEXT:    v_lshlrev_b32_e32 v7, 16, v0
+; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GFX8-NEXT:    v_mul_f32_e32 v0, v0, v2
-; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
-; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v4
 ; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v4
-; GFX8-NEXT:    v_add_f32_e32 v3, v3, v5
+; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v4
+; GFX8-NEXT:    v_mul_f32_e32 v6, v7, v6
+; GFX8-NEXT:    v_lshlrev_b32_e32 v7, 16, v4
 ; GFX8-NEXT:    v_add_f32_e32 v0, v0, v2
-; GFX8-NEXT:    s_mov_b32 s4, 0x3020706
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
+; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v1
+; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX8-NEXT:    v_mul_f32_e32 v1, v1, v3
+; GFX8-NEXT:    v_mul_f32_e32 v2, v4, v2
+; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v5
+; GFX8-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
+; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v5
+; GFX8-NEXT:    v_add_f32_e32 v1, v1, v3
+; GFX8-NEXT:    v_add_f32_e32 v6, v6, v7
+; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX8-NEXT:    v_add_f32_e32 v2, v2, v4
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX8-NEXT:    v_perm_b32 v0, v0, v3, s4
+; GFX8-NEXT:    v_alignbit_b32 v0, v0, v6, 16
+; GFX8-NEXT:    v_alignbit_b32 v1, v1, v2, 16
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_fmuladd_v3bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v6, 0xffff0000, v2
-; GFX9-NEXT:    v_and_b32_e32 v7, 0xffff0000, v0
-; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX9-NEXT:    v_lshlrev_b32_e32 v6, 16, v2
+; GFX9-NEXT:    v_lshlrev_b32_e32 v7, 16, v0
+; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v2
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff0000, v4
 ; GFX9-NEXT:    v_mul_f32_e32 v6, v7, v6
+; GFX9-NEXT:    v_lshlrev_b32_e32 v7, 16, v4
+; GFX9-NEXT:    v_add_f32_e32 v0, v0, v2
+; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
+; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 16, v1
+; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX9-NEXT:    v_mul_f32_e32 v2, v4, v2
 ; GFX9-NEXT:    v_mul_f32_e32 v1, v1, v3
-; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v2
 ; GFX9-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
-; GFX9-NEXT:    v_and_b32_e32 v7, 0xffff0000, v4
+; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 16, v5
 ; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 16, v5
-; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v4
+; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v5
 ; GFX9-NEXT:    v_add_f32_e32 v6, v6, v7
+; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
+; GFX9-NEXT:    v_add_f32_e32 v2, v2, v4
 ; GFX9-NEXT:    v_add_f32_e32 v1, v1, v3
-; GFX9-NEXT:    v_add_f32_e32 v0, v0, v2
-; GFX9-NEXT:    s_mov_b32 s4, 0x3020706
-; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
 ; GFX9-NEXT:    v_perm_b32 v0, v0, v6, s4
+; GFX9-NEXT:    v_perm_b32 v1, v1, v2, s4
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fmuladd_v3bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_and_b32_e32 v6, 0xffff0000, v2
-; GFX10-NEXT:    v_and_b32_e32 v7, 0xffff0000, v0
-; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 16, v2
+; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 16, v0
+; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v8, 16, v1
+; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX10-NEXT:    v_mul_f32_e32 v6, v7, v6
-; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX10-NEXT:    v_mul_f32_e32 v1, v1, v3
-; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v4
+; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 16, v3
+; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
 ; GFX10-NEXT:    v_mul_f32_e32 v0, v0, v2
 ; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v6
-; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX10-NEXT:    v_mul_f32_e32 v6, v8, v7
+; GFX10-NEXT:    v_mul_f32_e32 v1, v1, v3
+; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 16, v4
 ; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX10-NEXT:    v_add_f32_e32 v2, v2, v3
+; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v4
+; GFX10-NEXT:    v_and_b32_e32 v4, 0xffff0000, v6
+; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 16, v5
+; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX10-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
+; GFX10-NEXT:    v_add_f32_e32 v2, v2, v7
+; GFX10-NEXT:    v_add_f32_e32 v0, v0, v3
+; GFX10-NEXT:    v_add_f32_e32 v3, v4, v6
 ; GFX10-NEXT:    v_add_f32_e32 v1, v1, v5
-; GFX10-NEXT:    v_add_f32_e32 v0, v0, v4
-; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX10-NEXT:    v_perm_b32 v0, v0, v2, 0x3020706
+; GFX10-NEXT:    v_perm_b32 v0, v0, v2, 0x7060302
+; GFX10-NEXT:    v_perm_b32 v1, v1, v3, 0x7060302
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fmuladd_v3bf16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xffff0000, v2
-; GFX11-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v3
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v4
-; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; GFX11-NEXT:    v_lshlrev_b32_e32 v8, 16, v1
 ; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_dual_add_f32 v1, v1, v5 :: v_dual_lshlrev_b32 v2, 16, v2
-; GFX11-NEXT:    v_and_b32_e32 v7, 0xffff0000, v0
-; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v6, 16, v2
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX11-NEXT:    v_lshlrev_b32_e32 v7, 16, v0
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX11-NEXT:    v_mul_f32_e32 v0, v0, v2
-; GFX11-NEXT:    v_mul_f32_e32 v6, v7, v6
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_dual_mul_f32 v6, v7, v6 :: v_dual_lshlrev_b32 v7, 16, v3
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v6
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_add_f32_e32 v0, v0, v4
-; GFX11-NEXT:    v_add_f32_e32 v2, v2, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_perm_b32 v0, v0, v2, 0x3020706
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_mul_f32_e32 v6, v8, v7
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v3
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v4
+; GFX11-NEXT:    v_lshlrev_b32_e32 v7, 16, v4
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_and_b32_e32 v4, 0xffff0000, v6
+; GFX11-NEXT:    v_lshlrev_b32_e32 v6, 16, v5
+; GFX11-NEXT:    v_dual_add_f32 v0, v0, v3 :: v_dual_and_b32 v1, 0xffff0000, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_dual_add_f32 v2, v2, v7 :: v_dual_and_b32 v5, 0xffff0000, v5
+; GFX11-NEXT:    v_add_f32_e32 v3, v4, v6
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_add_f32_e32 v1, v1, v5
+; GFX11-NEXT:    v_perm_b32 v0, v0, v2, 0x7060302
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT:    v_perm_b32 v1, v1, v3, 0x7060302
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = call <3 x bfloat> @llvm.fmuladd.v3bf16(<3 x bfloat> %a, <3 x bfloat> %b, <3 x bfloat> %c)
   ret <3 x bfloat> %op
@@ -32588,135 +31149,132 @@ define <4 x bfloat> @v_fmuladd_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfl
 ; GFX8-LABEL: v_fmuladd_v4bf16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_and_b32_e32 v6, 0xffff0000, v3
-; GFX8-NEXT:    v_and_b32_e32 v7, 0xffff0000, v1
-; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v2
+; GFX8-NEXT:    v_lshlrev_b32_e32 v7, 16, v0
+; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    v_mul_f32_e32 v0, v0, v2
+; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v4
+; GFX8-NEXT:    v_mul_f32_e32 v6, v7, v6
+; GFX8-NEXT:    v_lshlrev_b32_e32 v7, 16, v4
+; GFX8-NEXT:    v_add_f32_e32 v0, v0, v2
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
+; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v1
+; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX8-NEXT:    v_mul_f32_e32 v1, v1, v3
+; GFX8-NEXT:    v_mul_f32_e32 v2, v4, v2
 ; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v5
-; GFX8-NEXT:    v_mul_f32_e32 v6, v7, v6
-; GFX8-NEXT:    v_and_b32_e32 v7, 0xffff0000, v5
-; GFX8-NEXT:    v_add_f32_e32 v1, v1, v3
-; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v2
-; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v0
-; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX8-NEXT:    v_mul_f32_e32 v3, v5, v3
-; GFX8-NEXT:    v_mul_f32_e32 v0, v0, v2
+; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v5
 ; GFX8-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
-; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
-; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v4
-; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v4
+; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v5
+; GFX8-NEXT:    v_add_f32_e32 v1, v1, v3
 ; GFX8-NEXT:    v_add_f32_e32 v6, v6, v7
-; GFX8-NEXT:    v_add_f32_e32 v3, v3, v5
-; GFX8-NEXT:    v_add_f32_e32 v0, v0, v2
-; GFX8-NEXT:    s_mov_b32 s4, 0x3020706
-; GFX8-NEXT:    v_perm_b32 v0, v0, v3, s4
-; GFX8-NEXT:    v_perm_b32 v1, v1, v6, s4
+; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX8-NEXT:    v_add_f32_e32 v2, v2, v4
+; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX8-NEXT:    v_alignbit_b32 v0, v0, v6, 16
+; GFX8-NEXT:    v_alignbit_b32 v1, v1, v2, 16
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_fmuladd_v4bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_lshlrev_b32_e32 v6, 16, v3
-; GFX9-NEXT:    v_lshlrev_b32_e32 v7, 16, v1
+; GFX9-NEXT:    v_lshlrev_b32_e32 v6, 16, v2
+; GFX9-NEXT:    v_lshlrev_b32_e32 v7, 16, v0
+; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v2
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff0000, v4
+; GFX9-NEXT:    v_mul_f32_e32 v6, v7, v6
+; GFX9-NEXT:    v_lshlrev_b32_e32 v7, 16, v4
+; GFX9-NEXT:    v_add_f32_e32 v0, v0, v2
+; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
+; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 16, v1
 ; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
 ; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX9-NEXT:    v_mul_f32_e32 v2, v4, v2
 ; GFX9-NEXT:    v_mul_f32_e32 v1, v1, v3
+; GFX9-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
+; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 16, v5
 ; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v5
-; GFX9-NEXT:    v_mul_f32_e32 v6, v7, v6
-; GFX9-NEXT:    v_lshlrev_b32_e32 v7, 16, v5
-; GFX9-NEXT:    v_add_f32_e32 v1, v1, v3
-; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v2
-; GFX9-NEXT:    v_and_b32_e32 v5, 0xffff0000, v0
-; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX9-NEXT:    v_mul_f32_e32 v3, v5, v3
-; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v2
-; GFX9-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
-; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
-; GFX9-NEXT:    v_and_b32_e32 v5, 0xffff0000, v4
-; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v4
 ; GFX9-NEXT:    v_add_f32_e32 v6, v6, v7
-; GFX9-NEXT:    v_add_f32_e32 v3, v3, v5
-; GFX9-NEXT:    v_add_f32_e32 v0, v0, v2
-; GFX9-NEXT:    s_mov_b32 s4, 0x3020706
-; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX9-NEXT:    v_perm_b32 v0, v0, v3, s4
-; GFX9-NEXT:    v_or_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
+; GFX9-NEXT:    v_add_f32_e32 v2, v2, v4
+; GFX9-NEXT:    v_add_f32_e32 v1, v1, v3
+; GFX9-NEXT:    v_perm_b32 v0, v0, v6, s4
+; GFX9-NEXT:    v_perm_b32 v1, v1, v2, s4
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fmuladd_v4bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 16, v3
-; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 16, v1
-; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 16, v2
+; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 16, v0
+; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v8, 16, v1
 ; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX10-NEXT:    v_and_b32_e32 v8, 0xffff0000, v0
-; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX10-NEXT:    v_mul_f32_e32 v6, v7, v6
-; GFX10-NEXT:    v_and_b32_e32 v7, 0xffff0000, v2
-; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX10-NEXT:    v_mul_f32_e32 v1, v1, v3
-; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v6
-; GFX10-NEXT:    v_mul_f32_e32 v6, v8, v7
+; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 16, v3
+; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
 ; GFX10-NEXT:    v_mul_f32_e32 v0, v0, v2
-; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v5
-; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 16, v5
-; GFX10-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
-; GFX10-NEXT:    v_and_b32_e32 v6, 0xffff0000, v4
+; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v6
+; GFX10-NEXT:    v_mul_f32_e32 v6, v8, v7
+; GFX10-NEXT:    v_mul_f32_e32 v1, v1, v3
+; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 16, v4
 ; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX10-NEXT:    v_add_f32_e32 v1, v1, v2
-; GFX10-NEXT:    v_add_f32_e32 v2, v3, v7
-; GFX10-NEXT:    v_add_f32_e32 v3, v5, v6
-; GFX10-NEXT:    v_add_f32_e32 v0, v0, v4
+; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v4
+; GFX10-NEXT:    v_and_b32_e32 v4, 0xffff0000, v6
+; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 16, v5
 ; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX10-NEXT:    v_perm_b32 v0, v0, v3, 0x3020706
-; GFX10-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
+; GFX10-NEXT:    v_add_f32_e32 v2, v2, v7
+; GFX10-NEXT:    v_add_f32_e32 v0, v0, v3
+; GFX10-NEXT:    v_add_f32_e32 v3, v4, v6
+; GFX10-NEXT:    v_add_f32_e32 v1, v1, v5
+; GFX10-NEXT:    v_perm_b32 v0, v0, v2, 0x7060302
+; GFX10-NEXT:    v_perm_b32 v1, v1, v3, 0x7060302
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fmuladd_v4bf16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_and_b32_e32 v8, 0xffff0000, v0
-; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX11-NEXT:    v_lshlrev_b32_e32 v7, 16, v1
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX11-NEXT:    v_lshlrev_b32_e32 v9, 16, v5
-; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
-; GFX11-NEXT:    v_lshlrev_b32_e32 v6, 16, v3
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v3
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_dual_add_f32 v1, v1, v5 :: v_dual_mul_f32 v6, v7, v6
+; GFX11-NEXT:    v_lshlrev_b32_e32 v8, 16, v1
 ; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
-; GFX11-NEXT:    v_and_b32_e32 v7, 0xffff0000, v2
-; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-NEXT:    v_lshlrev_b32_e32 v6, 16, v2
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX11-NEXT:    v_lshlrev_b32_e32 v7, 16, v0
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX11-NEXT:    v_mul_f32_e32 v0, v0, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_dual_add_f32 v2, v6, v9 :: v_dual_mul_f32 v3, v8, v7
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xffff0000, v4
-; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; GFX11-NEXT:    v_dual_mul_f32 v6, v7, v6 :: v_dual_lshlrev_b32 v7, 16, v3
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
-; GFX11-NEXT:    v_dual_add_f32 v0, v0, v4 :: v_dual_and_b32 v3, 0xffff0000, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_or_b32_e32 v1, v2, v1
-; GFX11-NEXT:    v_add_f32_e32 v3, v3, v6
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_perm_b32 v0, v0, v3, 0x3020706
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v6
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_mul_f32_e32 v6, v8, v7
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v3
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v4
+; GFX11-NEXT:    v_lshlrev_b32_e32 v7, 16, v4
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_and_b32_e32 v4, 0xffff0000, v6
+; GFX11-NEXT:    v_lshlrev_b32_e32 v6, 16, v5
+; GFX11-NEXT:    v_dual_add_f32 v0, v0, v3 :: v_dual_and_b32 v1, 0xffff0000, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_dual_add_f32 v2, v2, v7 :: v_dual_and_b32 v5, 0xffff0000, v5
+; GFX11-NEXT:    v_add_f32_e32 v3, v4, v6
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_add_f32_e32 v1, v1, v5
+; GFX11-NEXT:    v_perm_b32 v0, v0, v2, 0x7060302
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT:    v_perm_b32 v1, v1, v3, 0x7060302
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = call <4 x bfloat> @llvm.fmuladd.v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfloat> %c)
   ret <4 x bfloat> %op
diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.f32.ll b/llvm/test/CodeGen/AMDGPU/fcopysign.f32.ll
index e62dfe100642bc..9d21bf0fea7164 100644
--- a/llvm/test/CodeGen/AMDGPU/fcopysign.f32.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcopysign.f32.ll
@@ -1119,10 +1119,9 @@ define amdgpu_kernel void @s_test_copysign_f32_fpext_bf16(ptr addrspace(1) %out,
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; VI-NEXT:    s_brev_b32 s4, -2
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_lshl_b32 s3, s3, 16
-; VI-NEXT:    v_mov_b32_e32 v0, s2
-; VI-NEXT:    v_mov_b32_e32 v1, s3
-; VI-NEXT:    v_bfi_b32 v2, s4, v0, v1
+; VI-NEXT:    v_lshlrev_b32_e64 v0, 16, s3
+; VI-NEXT:    v_mov_b32_e32 v1, s2
+; VI-NEXT:    v_bfi_b32 v2, s4, v1, v0
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    flat_store_dword v[0:1], v2
@@ -1133,9 +1132,8 @@ define amdgpu_kernel void @s_test_copysign_f32_fpext_bf16(ptr addrspace(1) %out,
 ; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 ; GFX11-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_lshl_b32 s3, s3, 16
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_mov_b32_e32 v0, s3
+; GFX11-NEXT:    v_lshlrev_b32_e64 v0, 16, s3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_bfi_b32 v0, 0x7fffffff, s2, v0
 ; GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
 ; GFX11-NEXT:    s_nop 0
diff --git a/llvm/test/CodeGen/AMDGPU/fmed3-cast-combine.ll b/llvm/test/CodeGen/AMDGPU/fmed3-cast-combine.ll
index e9bf515daabca9..a69fb35f8f0cb0 100644
--- a/llvm/test/CodeGen/AMDGPU/fmed3-cast-combine.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmed3-cast-combine.ll
@@ -787,7 +787,7 @@ define bfloat @fmed3_f32_fpext_f16_fptrunc_bf16(half %arg0, half %arg1, half %ar
 ; GFX8-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; GFX8-NEXT:    v_cvt_f32_f16_e32 v2, v2
 ; GFX8-NEXT:    v_med3_f32 v0, v0, v1, v2
-; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: fmed3_f32_fpext_f16_fptrunc_bf16:
@@ -797,7 +797,7 @@ define bfloat @fmed3_f32_fpext_f16_fptrunc_bf16(half %arg0, half %arg1, half %ar
 ; GFX9-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; GFX9-NEXT:    v_cvt_f32_f16_e32 v2, v2
 ; GFX9-NEXT:    v_med3_f32 v0, v0, v1, v2
-; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %arg0.ext = fpext half %arg0 to float
   %arg1.ext = fpext half %arg1 to float
@@ -1021,6 +1021,9 @@ define half @fmed3_f32_fpext_bf16(bfloat %arg0, bfloat %arg1, bfloat %arg2) #1 {
 ; GFX8-LABEL: fmed3_f32_fpext_bf16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX8-NEXT:    v_med3_f32 v0, v0, v1, v2
 ; GFX8-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
@@ -1028,6 +1031,9 @@ define half @fmed3_f32_fpext_bf16(bfloat %arg0, bfloat %arg1, bfloat %arg2) #1 {
 ; GFX9-LABEL: fmed3_f32_fpext_bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX9-NEXT:    v_med3_f32 v0, v0, v1, v2
 ; GFX9-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
@@ -1053,6 +1059,7 @@ define half @fmed3_f32_fpext_f16_bf16_0(bfloat %arg0, half %arg1, half %arg2) #1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; GFX8-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX8-NEXT:    v_med3_f32 v0, v0, v1, v2
 ; GFX8-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
@@ -1062,6 +1069,7 @@ define half @fmed3_f32_fpext_f16_bf16_0(bfloat %arg0, half %arg1, half %arg2) #1
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; GFX9-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX9-NEXT:    v_med3_f32 v0, v0, v1, v2
 ; GFX9-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
@@ -1087,6 +1095,7 @@ define half @fmed3_f32_fpext_f16_bf16_1(half %arg0, bfloat %arg1, half %arg2) #1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX8-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX8-NEXT:    v_med3_f32 v0, v0, v1, v2
 ; GFX8-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
@@ -1096,6 +1105,7 @@ define half @fmed3_f32_fpext_f16_bf16_1(half %arg0, bfloat %arg1, half %arg2) #1
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX9-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX9-NEXT:    v_med3_f32 v0, v0, v1, v2
 ; GFX9-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
@@ -1121,6 +1131,7 @@ define half @fmed3_f32_fpext_f16_bf16_2(half %arg0, half %arg1, bfloat %arg2) #1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX8-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX8-NEXT:    v_med3_f32 v0, v0, v1, v2
 ; GFX8-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
@@ -1130,6 +1141,7 @@ define half @fmed3_f32_fpext_f16_bf16_2(half %arg0, half %arg1, bfloat %arg2) #1
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX9-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX9-NEXT:    v_med3_f32 v0, v0, v1, v2
 ; GFX9-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll b/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll
index 62e7a232d2b256..9a8ddb5bd38312 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll
@@ -1243,17 +1243,17 @@ define double @fneg_f64_bitcast_build_vector_v4bf16_to_f64(bfloat %elt0, bfloat
 ; GFX9-LABEL: fneg_f64_bitcast_build_vector_v4bf16_to_f64:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_mov_b32 s4, 0x3020706
-; GFX9-NEXT:    v_perm_b32 v2, v2, v3, s4
-; GFX9-NEXT:    v_perm_b32 v0, v0, v1, s4
+; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
+; GFX9-NEXT:    v_perm_b32 v2, v3, v2, s4
+; GFX9-NEXT:    v_perm_b32 v0, v1, v0, s4
 ; GFX9-NEXT:    v_xor_b32_e32 v1, 0x80000000, v2
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: fneg_f64_bitcast_build_vector_v4bf16_to_f64:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_perm_b32 v2, v2, v3, 0x3020706
-; GFX11-NEXT:    v_perm_b32 v0, v0, v1, 0x3020706
+; GFX11-NEXT:    v_perm_b32 v2, v3, v2, 0x5040100
+; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-NEXT:    v_xor_b32_e32 v1, 0x80000000, v2
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/function-args-inreg.ll b/llvm/test/CodeGen/AMDGPU/function-args-inreg.ll
index 3a6ecc21494893..5f2a8c4ba98247 100644
--- a/llvm/test/CodeGen/AMDGPU/function-args-inreg.ll
+++ b/llvm/test/CodeGen/AMDGPU/function-args-inreg.ll
@@ -1875,7 +1875,7 @@ define void @void_func_bf16_inreg(bfloat inreg %arg0) #0 {
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s4
-; GFX9-NEXT:    global_store_short_d16_hi v[0:1], v0, off
+; GFX9-NEXT:    global_store_short v[0:1], v0, off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -1883,7 +1883,7 @@ define void @void_func_bf16_inreg(bfloat inreg %arg0) #0 {
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_e32 v0, s0
-; GFX11-NEXT:    global_store_d16_hi_b16 v[0:1], v0, off
+; GFX11-NEXT:    global_store_b16 v[0:1], v0, off
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   store bfloat %arg0, ptr addrspace(1) undef
   ret void
diff --git a/llvm/test/CodeGen/AMDGPU/function-args.ll b/llvm/test/CodeGen/AMDGPU/function-args.ll
index f24cc6f177d625..ef2be37d62d9df 100644
--- a/llvm/test/CodeGen/AMDGPU/function-args.ll
+++ b/llvm/test/CodeGen/AMDGPU/function-args.ll
@@ -2818,13 +2818,13 @@ define void @void_func_v32i32_i1_i8_i16_bf16(<32 x i32> %arg0, i1 %arg1, i8 %arg
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_store_dwordx4 v[20:23], off, s[4:7], 0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    buffer_load_ubyte v20, off, s[0:3], s32 offset:4
 ; VI-NEXT:    buffer_store_dwordx4 v[16:19], off, s[4:7], 0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:20
-; VI-NEXT:    buffer_load_ubyte v16, off, s[0:3], s32 offset:4
-; VI-NEXT:    buffer_load_ushort v17, off, s[0:3], s32 offset:8
-; VI-NEXT:    buffer_load_ushort v18, off, s[0:3], s32 offset:12
-; VI-NEXT:    buffer_load_ushort v19, off, s[0:3], s32 offset:16
+; VI-NEXT:    buffer_load_ushort v16, off, s[0:3], s32 offset:8
+; VI-NEXT:    buffer_load_ushort v17, off, s[0:3], s32 offset:12
+; VI-NEXT:    buffer_load_ushort v18, off, s[0:3], s32 offset:16
+; VI-NEXT:    buffer_load_ushort v19, off, s[0:3], s32 offset:20
 ; VI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[4:7], 0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[4:7], 0
@@ -2833,18 +2833,17 @@ define void @void_func_v32i32_i1_i8_i16_bf16(<32 x i32> %arg0, i1 %arg1, i8 %arg
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_lshrrev_b32_e32 v20, 16, v20
-; VI-NEXT:    v_and_b32_e32 v0, 1, v16
+; VI-NEXT:    v_and_b32_e32 v0, 1, v20
 ; VI-NEXT:    buffer_store_byte v0, off, s[4:7], 0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    buffer_store_byte v17, off, s[4:7], 0
+; VI-NEXT:    buffer_store_byte v16, off, s[4:7], 0
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    buffer_store_short v17, off, s[4:7], 0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_store_short v18, off, s[4:7], 0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_store_short v19, off, s[4:7], 0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    buffer_store_short v20, off, s[4:7], 0
-; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: void_func_v32i32_i1_i8_i16_bf16:
@@ -2860,13 +2859,14 @@ define void @void_func_v32i32_i1_i8_i16_bf16(<32 x i32> %arg0, i1 %arg1, i8 %arg
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_store_dwordx4 v[20:23], off, s[4:7], 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_load_ubyte v20, off, s[0:3], s32 offset:4
+; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_store_dwordx4 v[16:19], off, s[4:7], 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:20
-; GFX9-NEXT:    buffer_load_ubyte v16, off, s[0:3], s32 offset:4
-; GFX9-NEXT:    buffer_load_ushort v17, off, s[0:3], s32 offset:8
-; GFX9-NEXT:    buffer_load_ushort v18, off, s[0:3], s32 offset:12
-; GFX9-NEXT:    buffer_load_ushort v19, off, s[0:3], s32 offset:16
+; GFX9-NEXT:    buffer_load_ushort v16, off, s[0:3], s32 offset:8
+; GFX9-NEXT:    buffer_load_ushort v17, off, s[0:3], s32 offset:12
+; GFX9-NEXT:    buffer_load_ushort v18, off, s[0:3], s32 offset:16
+; GFX9-NEXT:    buffer_load_ushort v19, off, s[0:3], s32 offset:20
 ; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_store_dwordx4 v[12:15], off, s[4:7], 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
@@ -2876,33 +2876,32 @@ define void @void_func_v32i32_i1_i8_i16_bf16(<32 x i32> %arg0, i1 %arg1, i8 %arg
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_lshrrev_b32_e32 v20, 16, v20
-; GFX9-NEXT:    v_and_b32_e32 v0, 1, v16
+; GFX9-NEXT:    v_and_b32_e32 v0, 1, v20
 ; GFX9-NEXT:    buffer_store_byte v0, off, s[4:7], 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    buffer_store_byte v17, off, s[4:7], 0
+; GFX9-NEXT:    buffer_store_byte v16, off, s[4:7], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_store_short v17, off, s[4:7], 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_store_short v18, off, s[4:7], 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_store_short v19, off, s[4:7], 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    buffer_store_short v20, off, s[4:7], 0
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: void_func_v32i32_i1_i8_i16_bf16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_clause 0x5
-; GFX11-NEXT:    scratch_load_b32 v32, off, s32 offset:20
 ; GFX11-NEXT:    scratch_load_b32 v31, off, s32
-; GFX11-NEXT:    scratch_load_u8 v33, off, s32 offset:4
-; GFX11-NEXT:    scratch_load_u16 v34, off, s32 offset:8
-; GFX11-NEXT:    scratch_load_u16 v35, off, s32 offset:12
-; GFX11-NEXT:    scratch_load_u16 v36, off, s32 offset:16
+; GFX11-NEXT:    scratch_load_u8 v32, off, s32 offset:4
+; GFX11-NEXT:    scratch_load_u16 v33, off, s32 offset:8
+; GFX11-NEXT:    scratch_load_u16 v34, off, s32 offset:12
+; GFX11-NEXT:    scratch_load_u16 v35, off, s32 offset:16
+; GFX11-NEXT:    scratch_load_u16 v36, off, s32 offset:20
 ; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX11-NEXT:    s_mov_b32 s2, -1
-; GFX11-NEXT:    s_waitcnt vmcnt(4)
+; GFX11-NEXT:    s_waitcnt vmcnt(5)
 ; GFX11-NEXT:    buffer_store_b128 v[28:31], off, s[0:3], 0 dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    buffer_store_b128 v[24:27], off, s[0:3], 0 dlc
@@ -2911,9 +2910,8 @@ define void @void_func_v32i32_i1_i8_i16_bf16(<32 x i32> %arg0, i1 %arg1, i8 %arg
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    buffer_store_b128 v[16:19], off, s[0:3], 0 dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    s_waitcnt vmcnt(3)
-; GFX11-NEXT:    v_and_b32_e32 v16, 1, v33
-; GFX11-NEXT:    v_lshrrev_b32_e32 v32, 16, v32
+; GFX11-NEXT:    s_waitcnt vmcnt(4)
+; GFX11-NEXT:    v_and_b32_e32 v16, 1, v32
 ; GFX11-NEXT:    buffer_store_b128 v[12:15], off, s[0:3], 0 dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    buffer_store_b128 v[8:11], off, s[0:3], 0 dlc
@@ -2924,8 +2922,11 @@ define void @void_func_v32i32_i1_i8_i16_bf16(<32 x i32> %arg0, i1 %arg1, i8 %arg
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    buffer_store_b8 v16, off, s[0:3], 0 dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_waitcnt vmcnt(3)
+; GFX11-NEXT:    buffer_store_b8 v33, off, s[0:3], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    s_waitcnt vmcnt(2)
-; GFX11-NEXT:    buffer_store_b8 v34, off, s[0:3], 0 dlc
+; GFX11-NEXT:    buffer_store_b16 v34, off, s[0:3], 0 dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    s_waitcnt vmcnt(1)
 ; GFX11-NEXT:    buffer_store_b16 v35, off, s[0:3], 0 dlc
@@ -2933,8 +2934,6 @@ define void @void_func_v32i32_i1_i8_i16_bf16(<32 x i32> %arg0, i1 %arg1, i8 %arg
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    buffer_store_b16 v36, off, s[0:3], 0 dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    buffer_store_b16 v32, off, s[0:3], 0 dlc
-; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   store volatile <32 x i32> %arg0, ptr addrspace(1) undef
   store volatile i1 %arg1, ptr addrspace(1) undef
@@ -3166,35 +3165,29 @@ define void @void_func_v32i32_v2i16_v2f16_v2bf16_v4bf16(<32 x i32> %arg0, <2 x i
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_store_dwordx4 v[16:19], off, s[4:7], 0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:16
-; VI-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:20
-; VI-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:4
-; VI-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:8
-; VI-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:12
+; VI-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:8
+; VI-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:12
+; VI-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:20
+; VI-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:16
+; VI-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:4
 ; VI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[4:7], 0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_lshrrev_b32_e32 v13, 16, v16
-; VI-NEXT:    v_lshrrev_b32_e32 v12, 16, v20
 ; VI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[4:7], 0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[4:7], 0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    buffer_store_dword v20, off, s[4:7], 0
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    buffer_store_dword v16, off, s[4:7], 0
+; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_store_dword v17, off, s[4:7], 0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_store_dword v18, off, s[4:7], 0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_store_dword v19, off, s[4:7], 0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    buffer_store_short v13, off, s[4:7], 0
-; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    buffer_store_short v16, off, s[4:7], 0
-; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    buffer_store_short v12, off, s[4:7], 0
-; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    buffer_store_short v20, off, s[4:7], 0
-; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: void_func_v32i32_v2i16_v2f16_v2bf16_v4bf16:
@@ -3212,55 +3205,45 @@ define void @void_func_v32i32_v2i16_v2f16_v2bf16_v4bf16(<32 x i32> %arg0, <2 x i
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_store_dwordx4 v[16:19], off, s[4:7], 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:16
-; GFX9-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:20
-; GFX9-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:4
-; GFX9-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:8
-; GFX9-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:12
+; GFX9-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:8
+; GFX9-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:12
+; GFX9-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:20
+; GFX9-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:16
+; GFX9-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:4
 ; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_store_dwordx4 v[12:15], off, s[4:7], 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_lshrrev_b32_e32 v13, 16, v16
-; GFX9-NEXT:    v_lshrrev_b32_e32 v12, 16, v20
 ; GFX9-NEXT:    buffer_store_dwordx4 v[8:11], off, s[4:7], 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_store_dwordx4 v[4:7], off, s[4:7], 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_store_dword v20, off, s[4:7], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_store_dword v16, off, s[4:7], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_store_dword v17, off, s[4:7], 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_store_dword v18, off, s[4:7], 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_store_dword v19, off, s[4:7], 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    buffer_store_short v13, off, s[4:7], 0
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    buffer_store_short v16, off, s[4:7], 0
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    buffer_store_short v12, off, s[4:7], 0
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    buffer_store_short v20, off, s[4:7], 0
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: void_func_v32i32_v2i16_v2f16_v2bf16_v4bf16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_clause 0x5
-; GFX11-NEXT:    scratch_load_b32 v32, off, s32 offset:16
-; GFX11-NEXT:    scratch_load_b32 v33, off, s32 offset:20
 ; GFX11-NEXT:    scratch_load_b32 v31, off, s32
-; GFX11-NEXT:    scratch_load_b32 v34, off, s32 offset:4
-; GFX11-NEXT:    scratch_load_b32 v35, off, s32 offset:8
-; GFX11-NEXT:    scratch_load_b32 v36, off, s32 offset:12
+; GFX11-NEXT:    scratch_load_b32 v32, off, s32 offset:4
+; GFX11-NEXT:    scratch_load_b32 v33, off, s32 offset:8
+; GFX11-NEXT:    scratch_load_b32 v34, off, s32 offset:12
+; GFX11-NEXT:    scratch_load_b32 v35, off, s32 offset:20
+; GFX11-NEXT:    scratch_load_b32 v36, off, s32 offset:16
 ; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX11-NEXT:    s_mov_b32 s2, -1
 ; GFX11-NEXT:    s_waitcnt vmcnt(5)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v37, 16, v32
-; GFX11-NEXT:    s_waitcnt vmcnt(4)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v38, 16, v33
-; GFX11-NEXT:    s_waitcnt vmcnt(3)
 ; GFX11-NEXT:    buffer_store_b128 v[28:31], off, s[0:3], 0 dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    buffer_store_b128 v[24:27], off, s[0:3], 0 dlc
@@ -3277,6 +3260,12 @@ define void @void_func_v32i32_v2i16_v2f16_v2bf16_v4bf16(<32 x i32> %arg0, <2 x i
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    buffer_store_b128 v[0:3], off, s[0:3], 0 dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_waitcnt vmcnt(4)
+; GFX11-NEXT:    buffer_store_b32 v32, off, s[0:3], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_waitcnt vmcnt(3)
+; GFX11-NEXT:    buffer_store_b32 v33, off, s[0:3], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    s_waitcnt vmcnt(2)
 ; GFX11-NEXT:    buffer_store_b32 v34, off, s[0:3], 0 dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -3286,14 +3275,6 @@ define void @void_func_v32i32_v2i16_v2f16_v2bf16_v4bf16(<32 x i32> %arg0, <2 x i
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    buffer_store_b32 v36, off, s[0:3], 0 dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    buffer_store_b16 v38, off, s[0:3], 0 dlc
-; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    buffer_store_b16 v33, off, s[0:3], 0 dlc
-; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    buffer_store_b16 v37, off, s[0:3], 0 dlc
-; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    buffer_store_b16 v32, off, s[0:3], 0 dlc
-; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   store volatile <32 x i32> %arg0, ptr addrspace(1) undef
   store volatile <2 x i16> %arg1, ptr addrspace(1) undef
@@ -4656,20 +4637,28 @@ define void @void_func_v32i32_v16i8(<32 x i32> %arg0, <16 x i8> %arg1) #0 {
 
 
 define void @void_func_bf16(bfloat %arg0) #0 {
-; CIGFX89-LABEL: void_func_bf16:
-; CIGFX89:       ; %bb.0:
-; CIGFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CIGFX89-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; CIGFX89-NEXT:    s_mov_b32 s7, 0xf000
-; CIGFX89-NEXT:    s_mov_b32 s6, -1
-; CIGFX89-NEXT:    buffer_store_short v0, off, s[4:7], 0
-; CIGFX89-NEXT:    s_waitcnt vmcnt(0)
-; CIGFX89-NEXT:    s_setpc_b64 s[30:31]
+; CI-LABEL: void_func_bf16:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; CI-NEXT:    s_mov_b32 s7, 0xf000
+; CI-NEXT:    s_mov_b32 s6, -1
+; CI-NEXT:    buffer_store_short v0, off, s[4:7], 0
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX89-LABEL: void_func_bf16:
+; GFX89:       ; %bb.0:
+; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX89-NEXT:    s_mov_b32 s7, 0xf000
+; GFX89-NEXT:    s_mov_b32 s6, -1
+; GFX89-NEXT:    buffer_store_short v0, off, s[4:7], 0
+; GFX89-NEXT:    s_waitcnt vmcnt(0)
+; GFX89-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: void_func_bf16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX11-NEXT:    s_mov_b32 s2, -1
 ; GFX11-NEXT:    buffer_store_b16 v0, off, s[0:3], 0
diff --git a/llvm/test/CodeGen/AMDGPU/function-returns.ll b/llvm/test/CodeGen/AMDGPU/function-returns.ll
index d0a8e539056529..acadee27981710 100644
--- a/llvm/test/CodeGen/AMDGPU/function-returns.ll
+++ b/llvm/test/CodeGen/AMDGPU/function-returns.ll
@@ -2365,29 +2365,21 @@ define bfloat @bf16_func_void() #0 {
 ; CI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; CI-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: bf16_func_void:
-; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    s_mov_b32 s7, 0xf000
-; GFX8-NEXT:    s_mov_b32 s6, -1
-; GFX8-NEXT:    buffer_load_ushort v0, off, s[4:7], 0
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX8-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: bf16_func_void:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, 0
-; GFX9-NEXT:    global_load_short_d16_hi v0, v[0:1], off
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX89-LABEL: bf16_func_void:
+; GFX89:       ; %bb.0:
+; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX89-NEXT:    s_mov_b32 s7, 0xf000
+; GFX89-NEXT:    s_mov_b32 s6, -1
+; GFX89-NEXT:    buffer_load_ushort v0, off, s[4:7], 0
+; GFX89-NEXT:    s_waitcnt vmcnt(0)
+; GFX89-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: bf16_func_void:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_mov_b32_e32 v0, 0
-; GFX11-NEXT:    global_load_d16_hi_b16 v0, v[0:1], off
+; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-NEXT:    s_mov_b32 s2, -1
+; GFX11-NEXT:    buffer_load_u16 v0, off, s[0:3], 0
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %val = load bfloat, ptr addrspace(1) undef
@@ -2440,28 +2432,14 @@ define <3 x bfloat> @v3bf16_func_void() #0 {
 ; CI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; CI-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: v3bf16_func_void:
-; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    s_mov_b32 s7, 0xf000
-; GFX8-NEXT:    s_mov_b32 s6, -1
-; GFX8-NEXT:    buffer_load_dwordx2 v[0:1], off, s[4:7], 0
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; GFX8-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v3bf16_func_void:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_mov_b32 s7, 0xf000
-; GFX9-NEXT:    s_mov_b32 s6, -1
-; GFX9-NEXT:    buffer_load_dwordx2 v[0:1], off, s[4:7], 0
-; GFX9-NEXT:    v_mov_b32_e32 v2, 0xffff
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_bfi_b32 v2, v2, 0, v0
-; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; GFX9-NEXT:    v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX89-LABEL: v3bf16_func_void:
+; GFX89:       ; %bb.0:
+; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX89-NEXT:    s_mov_b32 s7, 0xf000
+; GFX89-NEXT:    s_mov_b32 s6, -1
+; GFX89-NEXT:    buffer_load_dwordx2 v[0:1], off, s[4:7], 0
+; GFX89-NEXT:    s_waitcnt vmcnt(0)
+; GFX89-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v3bf16_func_void:
 ; GFX11:       ; %bb.0:
@@ -2470,11 +2448,6 @@ define <3 x bfloat> @v3bf16_func_void() #0 {
 ; GFX11-NEXT:    s_mov_b32 s2, -1
 ; GFX11-NEXT:    buffer_load_b64 v[0:1], off, s[0:3], 0
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_bfi_b32 v2, 0xffff, 0, v0
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT:    v_or_b32_e32 v0, v0, v2
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %val = load <3 x bfloat>, ptr addrspace(1) undef
   ret <3 x bfloat> %val
diff --git a/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll b/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll
index bdaa224439c534..145ab4ae6378b4 100644
--- a/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll
+++ b/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll
@@ -16599,7 +16599,6 @@ define amdgpu_gfx void @test_call_external_void_func_bf16(i16 %arg) #0 {
 ; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    s_mov_b32 s35, external_void_func_bf16 at abs32@hi
 ; GFX9-NEXT:    s_mov_b32 s34, external_void_func_bf16 at abs32@lo
-; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
@@ -16624,7 +16623,6 @@ define amdgpu_gfx void @test_call_external_void_func_bf16(i16 %arg) #0 {
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
 ; GFX10-NEXT:    v_writelane_b32 v40, s34, 2
-; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX10-NEXT:    s_mov_b32 s35, external_void_func_bf16 at abs32@hi
 ; GFX10-NEXT:    s_mov_b32 s34, external_void_func_bf16 at abs32@lo
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
@@ -16652,7 +16650,6 @@ define amdgpu_gfx void @test_call_external_void_func_bf16(i16 %arg) #0 {
 ; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX11-NEXT:    v_writelane_b32 v40, s0, 2
-; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX11-NEXT:    s_mov_b32 s1, external_void_func_bf16 at abs32@hi
 ; GFX11-NEXT:    s_mov_b32 s0, external_void_func_bf16 at abs32@lo
 ; GFX11-NEXT:    s_add_i32 s32, s32, 16
@@ -16681,7 +16678,6 @@ define amdgpu_gfx void @test_call_external_void_func_bf16(i16 %arg) #0 {
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 2
-; GFX10-SCRATCH-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s1, external_void_func_bf16 at abs32@hi
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, external_void_func_bf16 at abs32@lo
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
@@ -16943,7 +16939,6 @@ define amdgpu_gfx void @test_call_external_void_func_v3bf16(<3 x i16> %arg) #0 {
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
 ; GFX9-NEXT:    v_writelane_b32 v40, s34, 2
 ; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX9-NEXT:    s_mov_b32 s35, external_void_func_v3bf16 at abs32@hi
 ; GFX9-NEXT:    s_mov_b32 s34, external_void_func_v3bf16 at abs32@lo
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
@@ -16970,7 +16965,6 @@ define amdgpu_gfx void @test_call_external_void_func_v3bf16(<3 x i16> %arg) #0 {
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
 ; GFX10-NEXT:    v_writelane_b32 v40, s34, 2
-; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX10-NEXT:    s_mov_b32 s35, external_void_func_v3bf16 at abs32@hi
 ; GFX10-NEXT:    s_mov_b32 s34, external_void_func_v3bf16 at abs32@lo
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
@@ -16998,7 +16992,6 @@ define amdgpu_gfx void @test_call_external_void_func_v3bf16(<3 x i16> %arg) #0 {
 ; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX11-NEXT:    v_writelane_b32 v40, s0, 2
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX11-NEXT:    s_mov_b32 s1, external_void_func_v3bf16 at abs32@hi
 ; GFX11-NEXT:    s_mov_b32 s0, external_void_func_v3bf16 at abs32@lo
 ; GFX11-NEXT:    s_add_i32 s32, s32, 16
@@ -17027,7 +17020,6 @@ define amdgpu_gfx void @test_call_external_void_func_v3bf16(<3 x i16> %arg) #0 {
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 2
-; GFX10-SCRATCH-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s1, external_void_func_v3bf16 at abs32@hi
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, external_void_func_v3bf16 at abs32@lo
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
@@ -17401,19 +17393,16 @@ define amdgpu_gfx void @test_call_external_void_func_bf16_inreg(i16 inreg %arg)
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
-; GFX9-NEXT:    v_writelane_b32 v40, s34, 3
-; GFX9-NEXT:    v_writelane_b32 v40, s4, 0
-; GFX9-NEXT:    v_writelane_b32 v40, s30, 1
+; GFX9-NEXT:    v_writelane_b32 v40, s34, 2
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    s_mov_b32 s35, external_void_func_bf16 at abs32@hi
 ; GFX9-NEXT:    s_mov_b32 s34, external_void_func_bf16 at abs32@lo
-; GFX9-NEXT:    s_lshl_b32 s4, s4, 16
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
-; GFX9-NEXT:    v_writelane_b32 v40, s31, 2
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 2
-; GFX9-NEXT:    v_readlane_b32 s30, v40, 1
-; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX9-NEXT:    v_readlane_b32 s34, v40, 3
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
@@ -17431,19 +17420,16 @@ define amdgpu_gfx void @test_call_external_void_func_bf16_inreg(i16 inreg %arg)
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
-; GFX10-NEXT:    v_writelane_b32 v40, s34, 3
+; GFX10-NEXT:    v_writelane_b32 v40, s34, 2
 ; GFX10-NEXT:    s_mov_b32 s35, external_void_func_bf16 at abs32@hi
 ; GFX10-NEXT:    s_mov_b32 s34, external_void_func_bf16 at abs32@lo
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
-; GFX10-NEXT:    v_writelane_b32 v40, s4, 0
-; GFX10-NEXT:    s_lshl_b32 s4, s4, 16
-; GFX10-NEXT:    v_writelane_b32 v40, s30, 1
-; GFX10-NEXT:    v_writelane_b32 v40, s31, 2
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 2
-; GFX10-NEXT:    v_readlane_b32 s30, v40, 1
-; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX10-NEXT:    v_readlane_b32 s34, v40, 3
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
@@ -17461,20 +17447,17 @@ define amdgpu_gfx void @test_call_external_void_func_bf16_inreg(i16 inreg %arg)
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
-; GFX11-NEXT:    v_writelane_b32 v40, s0, 3
+; GFX11-NEXT:    v_writelane_b32 v40, s0, 2
 ; GFX11-NEXT:    s_mov_b32 s1, external_void_func_bf16 at abs32@hi
 ; GFX11-NEXT:    s_mov_b32 s0, external_void_func_bf16 at abs32@lo
 ; GFX11-NEXT:    s_add_i32 s32, s32, 16
-; GFX11-NEXT:    v_writelane_b32 v40, s4, 0
-; GFX11-NEXT:    s_lshl_b32 s4, s4, 16
-; GFX11-NEXT:    v_writelane_b32 v40, s30, 1
-; GFX11-NEXT:    v_writelane_b32 v40, s31, 2
+; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 2
-; GFX11-NEXT:    v_readlane_b32 s30, v40, 1
-; GFX11-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX11-NEXT:    v_readlane_b32 s0, v40, 3
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
@@ -17492,19 +17475,16 @@ define amdgpu_gfx void @test_call_external_void_func_bf16_inreg(i16 inreg %arg)
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 3
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 2
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s1, external_void_func_bf16 at abs32@hi
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, external_void_func_bf16 at abs32@lo
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s4, 0
-; GFX10-SCRATCH-NEXT:    s_lshl_b32 s4, s4, 16
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 1
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 2
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 2
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 1
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 3
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
@@ -17755,19 +17735,16 @@ define amdgpu_gfx void @test_call_external_void_func_v3bf16_inreg(<3 x i16> inre
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
-; GFX9-NEXT:    v_writelane_b32 v40, s34, 3
-; GFX9-NEXT:    v_writelane_b32 v40, s5, 0
-; GFX9-NEXT:    v_writelane_b32 v40, s30, 1
-; GFX9-NEXT:    s_and_b32 s5, s5, 0xffff
+; GFX9-NEXT:    v_writelane_b32 v40, s34, 2
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    s_mov_b32 s35, external_void_func_v3bf16 at abs32@hi
 ; GFX9-NEXT:    s_mov_b32 s34, external_void_func_v3bf16 at abs32@lo
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
-; GFX9-NEXT:    v_writelane_b32 v40, s31, 2
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 2
-; GFX9-NEXT:    v_readlane_b32 s30, v40, 1
-; GFX9-NEXT:    v_readlane_b32 s5, v40, 0
-; GFX9-NEXT:    v_readlane_b32 s34, v40, 3
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
@@ -17785,19 +17762,16 @@ define amdgpu_gfx void @test_call_external_void_func_v3bf16_inreg(<3 x i16> inre
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
-; GFX10-NEXT:    v_writelane_b32 v40, s34, 3
+; GFX10-NEXT:    v_writelane_b32 v40, s34, 2
 ; GFX10-NEXT:    s_mov_b32 s35, external_void_func_v3bf16 at abs32@hi
 ; GFX10-NEXT:    s_mov_b32 s34, external_void_func_v3bf16 at abs32@lo
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
-; GFX10-NEXT:    v_writelane_b32 v40, s5, 0
-; GFX10-NEXT:    s_and_b32 s5, s5, 0xffff
-; GFX10-NEXT:    v_writelane_b32 v40, s30, 1
-; GFX10-NEXT:    v_writelane_b32 v40, s31, 2
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 2
-; GFX10-NEXT:    v_readlane_b32 s30, v40, 1
-; GFX10-NEXT:    v_readlane_b32 s5, v40, 0
-; GFX10-NEXT:    v_readlane_b32 s34, v40, 3
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
@@ -17815,20 +17789,17 @@ define amdgpu_gfx void @test_call_external_void_func_v3bf16_inreg(<3 x i16> inre
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
-; GFX11-NEXT:    v_writelane_b32 v40, s0, 3
+; GFX11-NEXT:    v_writelane_b32 v40, s0, 2
 ; GFX11-NEXT:    s_mov_b32 s1, external_void_func_v3bf16 at abs32@hi
 ; GFX11-NEXT:    s_mov_b32 s0, external_void_func_v3bf16 at abs32@lo
 ; GFX11-NEXT:    s_add_i32 s32, s32, 16
-; GFX11-NEXT:    v_writelane_b32 v40, s5, 0
-; GFX11-NEXT:    s_and_b32 s5, s5, 0xffff
-; GFX11-NEXT:    v_writelane_b32 v40, s30, 1
-; GFX11-NEXT:    v_writelane_b32 v40, s31, 2
+; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 2
-; GFX11-NEXT:    v_readlane_b32 s30, v40, 1
-; GFX11-NEXT:    v_readlane_b32 s5, v40, 0
-; GFX11-NEXT:    v_readlane_b32 s0, v40, 3
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
@@ -17846,19 +17817,16 @@ define amdgpu_gfx void @test_call_external_void_func_v3bf16_inreg(<3 x i16> inre
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 3
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 2
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s1, external_void_func_v3bf16 at abs32@hi
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, external_void_func_v3bf16 at abs32@lo
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s5, 0
-; GFX10-SCRATCH-NEXT:    s_and_b32 s5, s5, 0xffff
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 1
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 2
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 2
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 1
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s5, v40, 0
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 3
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
diff --git a/llvm/test/CodeGen/AMDGPU/isel-amdgpu-cs-chain-preserve-cc.ll b/llvm/test/CodeGen/AMDGPU/isel-amdgpu-cs-chain-preserve-cc.ll
index 3f4b039a976ed2..f3c31d562e8af0 100644
--- a/llvm/test/CodeGen/AMDGPU/isel-amdgpu-cs-chain-preserve-cc.ll
+++ b/llvm/test/CodeGen/AMDGPU/isel-amdgpu-cs-chain-preserve-cc.ll
@@ -613,11 +613,9 @@ define amdgpu_cs_chain_preserve void @amdgpu_cs_chain_cc_bfloat(bfloat inreg %a,
   ; DAGISEL-GFX11-NEXT: {{  $}}
   ; DAGISEL-GFX11-NEXT:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr8
   ; DAGISEL-GFX11-NEXT:   [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-  ; DAGISEL-GFX11-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -65536
-  ; DAGISEL-GFX11-NEXT:   [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 killed [[S_MOV_B32_]], [[COPY]], implicit $exec
-  ; DAGISEL-GFX11-NEXT:   [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 0
-  ; DAGISEL-GFX11-NEXT:   [[S_PACK_LH_B32_B16_:%[0-9]+]]:sreg_32 = S_PACK_LH_B32_B16 killed [[S_MOV_B32_1]], [[COPY1]]
-  ; DAGISEL-GFX11-NEXT:   [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, killed [[S_PACK_LH_B32_B16_]], 0, killed [[V_AND_B32_e64_]], 0, 0, implicit $mode, implicit $exec
+  ; DAGISEL-GFX11-NEXT:   [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 16, [[COPY]], implicit $exec
+  ; DAGISEL-GFX11-NEXT:   [[S_LSHL_B32_:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY1]], 16, implicit-def dead $scc
+  ; DAGISEL-GFX11-NEXT:   [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, killed [[S_LSHL_B32_]], 0, killed [[V_LSHLREV_B32_e64_]], 0, 0, implicit $mode, implicit $exec
   ; DAGISEL-GFX11-NEXT:   [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
   ; DAGISEL-GFX11-NEXT:   [[COPY2:%[0-9]+]]:vreg_64 = COPY [[DEF]]
   ; DAGISEL-GFX11-NEXT:   FLAT_STORE_SHORT_D16_HI killed [[COPY2]], killed [[V_ADD_F32_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s16) into `ptr poison`)
@@ -629,11 +627,9 @@ define amdgpu_cs_chain_preserve void @amdgpu_cs_chain_cc_bfloat(bfloat inreg %a,
   ; DAGISEL-GFX10-NEXT: {{  $}}
   ; DAGISEL-GFX10-NEXT:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr8
   ; DAGISEL-GFX10-NEXT:   [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-  ; DAGISEL-GFX10-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -65536
-  ; DAGISEL-GFX10-NEXT:   [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 killed [[S_MOV_B32_]], [[COPY]], implicit $exec
-  ; DAGISEL-GFX10-NEXT:   [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 0
-  ; DAGISEL-GFX10-NEXT:   [[S_PACK_LH_B32_B16_:%[0-9]+]]:sreg_32 = S_PACK_LH_B32_B16 killed [[S_MOV_B32_1]], [[COPY1]]
-  ; DAGISEL-GFX10-NEXT:   [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, killed [[S_PACK_LH_B32_B16_]], 0, killed [[V_AND_B32_e64_]], 0, 0, implicit $mode, implicit $exec
+  ; DAGISEL-GFX10-NEXT:   [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 16, [[COPY]], implicit $exec
+  ; DAGISEL-GFX10-NEXT:   [[S_LSHL_B32_:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY1]], 16, implicit-def dead $scc
+  ; DAGISEL-GFX10-NEXT:   [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, killed [[S_LSHL_B32_]], 0, killed [[V_LSHLREV_B32_e64_]], 0, 0, implicit $mode, implicit $exec
   ; DAGISEL-GFX10-NEXT:   [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
   ; DAGISEL-GFX10-NEXT:   [[COPY2:%[0-9]+]]:vreg_64 = COPY [[DEF]]
   ; DAGISEL-GFX10-NEXT:   FLAT_STORE_SHORT_D16_HI killed [[COPY2]], killed [[V_ADD_F32_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s16) into `ptr poison`)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.exp.ll b/llvm/test/CodeGen/AMDGPU/llvm.exp.ll
index c8570d6f279a6b..c629432c8bec0b 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.exp.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.exp.ll
@@ -5731,19 +5731,19 @@ define float @v_exp_f32_from_fpext_bf16(bfloat %src) {
 ; VI-LABEL: v_exp_f32_from_fpext_bf16:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT:    v_and_b32_e32 v1, 0xfffff000, v0
-; VI-NEXT:    v_sub_f32_e32 v4, v0, v1
-; VI-NEXT:    v_mul_f32_e32 v2, 0x3fb8a000, v1
-; VI-NEXT:    v_mul_f32_e32 v5, 0x39a3b295, v4
-; VI-NEXT:    v_mul_f32_e32 v4, 0x3fb8a000, v4
-; VI-NEXT:    v_rndne_f32_e32 v3, v2
-; VI-NEXT:    v_add_f32_e32 v4, v4, v5
-; VI-NEXT:    v_mul_f32_e32 v1, 0x39a3b295, v1
-; VI-NEXT:    v_sub_f32_e32 v2, v2, v3
-; VI-NEXT:    v_add_f32_e32 v1, v1, v4
-; VI-NEXT:    v_add_f32_e32 v1, v2, v1
+; VI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; VI-NEXT:    v_sub_f32_e32 v3, v0, v0
+; VI-NEXT:    v_mul_f32_e32 v1, 0x3fb8a000, v0
+; VI-NEXT:    v_mul_f32_e32 v4, 0x39a3b295, v3
+; VI-NEXT:    v_mul_f32_e32 v3, 0x3fb8a000, v3
+; VI-NEXT:    v_rndne_f32_e32 v2, v1
+; VI-NEXT:    v_add_f32_e32 v3, v3, v4
+; VI-NEXT:    v_mul_f32_e32 v4, 0x39a3b295, v0
+; VI-NEXT:    v_sub_f32_e32 v1, v1, v2
+; VI-NEXT:    v_add_f32_e32 v3, v4, v3
+; VI-NEXT:    v_add_f32_e32 v1, v1, v3
 ; VI-NEXT:    v_exp_f32_e32 v1, v1
-; VI-NEXT:    v_cvt_i32_f32_e32 v2, v3
+; VI-NEXT:    v_cvt_i32_f32_e32 v2, v2
 ; VI-NEXT:    s_mov_b32 s4, 0xc2ce8ed0
 ; VI-NEXT:    v_cmp_ngt_f32_e32 vcc, s4, v0
 ; VI-NEXT:    s_mov_b32 s4, 0x42b17218
@@ -5757,6 +5757,7 @@ define float @v_exp_f32_from_fpext_bf16(bfloat %src) {
 ; GFX900-LABEL: v_exp_f32_from_fpext_bf16:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX900-NEXT:    v_mul_f32_e32 v1, 0x3fb8aa3b, v0
 ; GFX900-NEXT:    s_mov_b32 s4, 0x3fb8aa3b
 ; GFX900-NEXT:    v_rndne_f32_e32 v2, v1
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll b/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll
index eee254398aefc2..09f51ee32c7151 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll
@@ -5809,19 +5809,19 @@ define float @v_exp10_f32_from_fpext_bf16(bfloat %src) {
 ; VI-LABEL: v_exp10_f32_from_fpext_bf16:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT:    v_and_b32_e32 v1, 0xfffff000, v0
-; VI-NEXT:    v_sub_f32_e32 v4, v0, v1
-; VI-NEXT:    v_mul_f32_e32 v2, 0x40549000, v1
-; VI-NEXT:    v_mul_f32_e32 v5, 0x3a2784bc, v4
-; VI-NEXT:    v_mul_f32_e32 v4, 0x40549000, v4
-; VI-NEXT:    v_rndne_f32_e32 v3, v2
-; VI-NEXT:    v_add_f32_e32 v4, v4, v5
-; VI-NEXT:    v_mul_f32_e32 v1, 0x3a2784bc, v1
-; VI-NEXT:    v_sub_f32_e32 v2, v2, v3
-; VI-NEXT:    v_add_f32_e32 v1, v1, v4
-; VI-NEXT:    v_add_f32_e32 v1, v2, v1
+; VI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; VI-NEXT:    v_sub_f32_e32 v3, v0, v0
+; VI-NEXT:    v_mul_f32_e32 v1, 0x40549000, v0
+; VI-NEXT:    v_mul_f32_e32 v4, 0x3a2784bc, v3
+; VI-NEXT:    v_mul_f32_e32 v3, 0x40549000, v3
+; VI-NEXT:    v_rndne_f32_e32 v2, v1
+; VI-NEXT:    v_add_f32_e32 v3, v3, v4
+; VI-NEXT:    v_mul_f32_e32 v4, 0x3a2784bc, v0
+; VI-NEXT:    v_sub_f32_e32 v1, v1, v2
+; VI-NEXT:    v_add_f32_e32 v3, v4, v3
+; VI-NEXT:    v_add_f32_e32 v1, v1, v3
 ; VI-NEXT:    v_exp_f32_e32 v1, v1
-; VI-NEXT:    v_cvt_i32_f32_e32 v2, v3
+; VI-NEXT:    v_cvt_i32_f32_e32 v2, v2
 ; VI-NEXT:    s_mov_b32 s4, 0xc23369f4
 ; VI-NEXT:    v_cmp_ngt_f32_e32 vcc, s4, v0
 ; VI-NEXT:    s_mov_b32 s4, 0x421a209b
@@ -5835,6 +5835,7 @@ define float @v_exp10_f32_from_fpext_bf16(bfloat %src) {
 ; GFX900-LABEL: v_exp10_f32_from_fpext_bf16:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX900-NEXT:    v_mul_f32_e32 v1, 0x40549a78, v0
 ; GFX900-NEXT:    s_mov_b32 s4, 0x40549a78
 ; GFX900-NEXT:    v_rndne_f32_e32 v2, v1
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll b/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll
index 587109b9431a1a..b6cea034b9de74 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll
@@ -1968,19 +1968,49 @@ define float @v_exp2_f32_from_fpext_math_f16(i16 %src0.i, i16 %src1.i) {
 }
 
 define float @v_exp2_f32_from_fpext_bf16(bfloat %src) {
-; GCN-LABEL: v_exp2_f32_from_fpext_bf16:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    s_mov_b32 s4, 0xc2fc0000
-; GCN-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
-; GCN-NEXT:    v_mov_b32_e32 v2, 0x42800000
-; GCN-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
-; GCN-NEXT:    v_add_f32_e32 v0, v0, v2
-; GCN-NEXT:    v_exp_f32_e32 v0, v0
-; GCN-NEXT:    v_mov_b32_e32 v1, 0x1f800000
-; GCN-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
-; GCN-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; SI-LABEL: v_exp2_f32_from_fpext_bf16:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    s_mov_b32 s4, 0xc2fc0000
+; SI-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
+; SI-NEXT:    v_mov_b32_e32 v2, 0x42800000
+; SI-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
+; SI-NEXT:    v_add_f32_e32 v0, v0, v2
+; SI-NEXT:    v_exp_f32_e32 v0, v0
+; SI-NEXT:    v_mov_b32_e32 v1, 0x1f800000
+; SI-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
+; SI-NEXT:    v_mul_f32_e32 v0, v0, v1
+; SI-NEXT:    s_setpc_b64 s[30:31]
+;
+; VI-LABEL: v_exp2_f32_from_fpext_bf16:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; VI-NEXT:    s_mov_b32 s4, 0xc2fc0000
+; VI-NEXT:    v_mov_b32_e32 v1, 0x42800000
+; VI-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
+; VI-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; VI-NEXT:    v_add_f32_e32 v0, v0, v1
+; VI-NEXT:    v_exp_f32_e32 v0, v0
+; VI-NEXT:    v_mov_b32_e32 v1, 0x1f800000
+; VI-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
+; VI-NEXT:    v_mul_f32_e32 v0, v0, v1
+; VI-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX900-LABEL: v_exp2_f32_from_fpext_bf16:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX900-NEXT:    s_mov_b32 s4, 0xc2fc0000
+; GFX900-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
+; GFX900-NEXT:    v_mov_b32_e32 v2, 0x42800000
+; GFX900-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
+; GFX900-NEXT:    v_add_f32_e32 v0, v0, v2
+; GFX900-NEXT:    v_exp_f32_e32 v0, v0
+; GFX900-NEXT:    v_mov_b32_e32 v1, 0x1f800000
+; GFX900-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
+; GFX900-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_exp2_f32_from_fpext_bf16:
 ; R600:       ; %bb.0:
@@ -2936,5 +2966,3 @@ declare <3 x half> @llvm.exp2.v3f16(<3 x half>) #2
 attributes #0 = { "denormal-fp-math-f32"="ieee,preserve-sign" }
 attributes #1 = { "denormal-fp-math-f32"="dynamic,dynamic" }
 attributes #2 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
-;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; SI: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.bf16.ll
index 7723a724c60866..ca3336beccf773 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.bf16.ll
@@ -173,8 +173,7 @@ define i1 @snan_bf16(bfloat %x) nounwind {
 ; GFX8CHECK-LABEL: snan_bf16:
 ; GFX8CHECK:       ; %bb.0:
 ; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8CHECK-NEXT:    v_mov_b32_e32 v1, 0x7fff
-; GFX8CHECK-NEXT:    v_and_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX8CHECK-NEXT:    s_movk_i32 s4, 0x7fc0
 ; GFX8CHECK-NEXT:    v_cmp_gt_i16_e32 vcc, s4, v0
 ; GFX8CHECK-NEXT:    s_movk_i32 s4, 0x7f80
@@ -186,8 +185,7 @@ define i1 @snan_bf16(bfloat %x) nounwind {
 ; GFX9CHECK-LABEL: snan_bf16:
 ; GFX9CHECK:       ; %bb.0:
 ; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9CHECK-NEXT:    s_movk_i32 s4, 0x7fff
-; GFX9CHECK-NEXT:    v_and_b32_sdwa v0, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX9CHECK-NEXT:    s_movk_i32 s4, 0x7fc0
 ; GFX9CHECK-NEXT:    v_cmp_gt_i16_e32 vcc, s4, v0
 ; GFX9CHECK-NEXT:    s_movk_i32 s4, 0x7f80
@@ -199,8 +197,7 @@ define i1 @snan_bf16(bfloat %x) nounwind {
 ; GFX10CHECK-LABEL: snan_bf16:
 ; GFX10CHECK:       ; %bb.0:
 ; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10CHECK-NEXT:    v_mov_b32_e32 v1, 0x7fff
-; GFX10CHECK-NEXT:    v_and_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX10CHECK-NEXT:    v_cmp_gt_i16_e32 vcc_lo, 0x7fc0, v0
 ; GFX10CHECK-NEXT:    v_cmp_lt_i16_e64 s4, 0x7f80, v0
 ; GFX10CHECK-NEXT:    s_and_b32 s4, s4, vcc_lo
@@ -210,7 +207,6 @@ define i1 @snan_bf16(bfloat %x) nounwind {
 ; GFX11CHECK-LABEL: snan_bf16:
 ; GFX11CHECK:       ; %bb.0:
 ; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11CHECK-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX11CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX11CHECK-NEXT:    v_cmp_gt_i16_e32 vcc_lo, 0x7fc0, v0
 ; GFX11CHECK-NEXT:    v_cmp_lt_i16_e64 s0, 0x7f80, v0
@@ -234,8 +230,7 @@ define i1 @qnan_bf16(bfloat %x) nounwind {
 ; GFX8CHECK-LABEL: qnan_bf16:
 ; GFX8CHECK:       ; %bb.0:
 ; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8CHECK-NEXT:    v_mov_b32_e32 v1, 0x7fff
-; GFX8CHECK-NEXT:    v_and_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX8CHECK-NEXT:    s_movk_i32 s4, 0x7fbf
 ; GFX8CHECK-NEXT:    v_cmp_lt_i16_e32 vcc, s4, v0
 ; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
@@ -244,8 +239,7 @@ define i1 @qnan_bf16(bfloat %x) nounwind {
 ; GFX9CHECK-LABEL: qnan_bf16:
 ; GFX9CHECK:       ; %bb.0:
 ; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9CHECK-NEXT:    s_movk_i32 s4, 0x7fff
-; GFX9CHECK-NEXT:    v_and_b32_sdwa v0, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX9CHECK-NEXT:    s_movk_i32 s4, 0x7fbf
 ; GFX9CHECK-NEXT:    v_cmp_lt_i16_e32 vcc, s4, v0
 ; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
@@ -254,8 +248,7 @@ define i1 @qnan_bf16(bfloat %x) nounwind {
 ; GFX10CHECK-LABEL: qnan_bf16:
 ; GFX10CHECK:       ; %bb.0:
 ; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10CHECK-NEXT:    v_mov_b32_e32 v1, 0x7fff
-; GFX10CHECK-NEXT:    v_and_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX10CHECK-NEXT:    v_cmp_lt_i16_e32 vcc_lo, 0x7fbf, v0
 ; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
 ; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
@@ -263,7 +256,6 @@ define i1 @qnan_bf16(bfloat %x) nounwind {
 ; GFX11CHECK-LABEL: qnan_bf16:
 ; GFX11CHECK:       ; %bb.0:
 ; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11CHECK-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX11CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX11CHECK-NEXT:    v_cmp_lt_i16_e32 vcc_lo, 0x7fbf, v0
 ; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
@@ -285,7 +277,6 @@ define i1 @posinf_bf16(bfloat %x) nounwind {
 ; GFX8CHECK-LABEL: posinf_bf16:
 ; GFX8CHECK:       ; %bb.0:
 ; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8CHECK-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX8CHECK-NEXT:    s_movk_i32 s4, 0x7f80
 ; GFX8CHECK-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v0
 ; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
@@ -295,22 +286,20 @@ define i1 @posinf_bf16(bfloat %x) nounwind {
 ; GFX9CHECK:       ; %bb.0:
 ; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9CHECK-NEXT:    s_movk_i32 s4, 0x7f80
-; GFX9CHECK-NEXT:    v_cmp_eq_u16_sdwa s[4:5], v0, s4 src0_sel:WORD_1 src1_sel:DWORD
-; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GFX9CHECK-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v0
+; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
 ; GFX9CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10CHECK-LABEL: posinf_bf16:
 ; GFX10CHECK:       ; %bb.0:
 ; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10CHECK-NEXT:    v_mov_b32_e32 v1, 0x7f80
-; GFX10CHECK-NEXT:    v_cmp_eq_u16_sdwa s4, v0, v1 src0_sel:WORD_1 src1_sel:DWORD
-; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s4
+; GFX10CHECK-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x7f80, v0
+; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
 ; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11CHECK-LABEL: posinf_bf16:
 ; GFX11CHECK:       ; %bb.0:
 ; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11CHECK-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX11CHECK-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x7f80, v0
 ; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
 ; GFX11CHECK-NEXT:    s_setpc_b64 s[30:31]
@@ -331,7 +320,6 @@ define i1 @neginf_bf16(bfloat %x) nounwind {
 ; GFX8CHECK-LABEL: neginf_bf16:
 ; GFX8CHECK:       ; %bb.0:
 ; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8CHECK-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX8CHECK-NEXT:    s_movk_i32 s4, 0xff80
 ; GFX8CHECK-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v0
 ; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
@@ -341,22 +329,20 @@ define i1 @neginf_bf16(bfloat %x) nounwind {
 ; GFX9CHECK:       ; %bb.0:
 ; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9CHECK-NEXT:    s_movk_i32 s4, 0xff80
-; GFX9CHECK-NEXT:    v_cmp_eq_u16_sdwa s[4:5], v0, s4 src0_sel:WORD_1 src1_sel:DWORD
-; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GFX9CHECK-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v0
+; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
 ; GFX9CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10CHECK-LABEL: neginf_bf16:
 ; GFX10CHECK:       ; %bb.0:
 ; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10CHECK-NEXT:    v_mov_b32_e32 v1, 0xffffff80
-; GFX10CHECK-NEXT:    v_cmp_eq_u16_sdwa s4, v0, v1 src0_sel:WORD_1 src1_sel:DWORD
-; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s4
+; GFX10CHECK-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0xff80, v0
+; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
 ; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11CHECK-LABEL: neginf_bf16:
 ; GFX11CHECK:       ; %bb.0:
 ; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11CHECK-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX11CHECK-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0xff80, v0
 ; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
 ; GFX11CHECK-NEXT:    s_setpc_b64 s[30:31]
@@ -382,7 +368,6 @@ define i1 @posnormal_bf16(bfloat %x) nounwind {
 ; GFX8CHECK-LABEL: posnormal_bf16:
 ; GFX8CHECK:       ; %bb.0:
 ; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8CHECK-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX8CHECK-NEXT:    v_cmp_lt_i16_e32 vcc, -1, v0
 ; GFX8CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX8CHECK-NEXT:    v_add_u16_e32 v0, 0xff80, v0
@@ -395,7 +380,6 @@ define i1 @posnormal_bf16(bfloat %x) nounwind {
 ; GFX9CHECK-LABEL: posnormal_bf16:
 ; GFX9CHECK:       ; %bb.0:
 ; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9CHECK-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX9CHECK-NEXT:    v_cmp_lt_i16_e32 vcc, -1, v0
 ; GFX9CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX9CHECK-NEXT:    v_add_u16_e32 v0, 0xff80, v0
@@ -408,7 +392,6 @@ define i1 @posnormal_bf16(bfloat %x) nounwind {
 ; GFX10CHECK-LABEL: posnormal_bf16:
 ; GFX10CHECK:       ; %bb.0:
 ; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10CHECK-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX10CHECK-NEXT:    v_and_b32_e32 v1, 0x7fff, v0
 ; GFX10CHECK-NEXT:    v_cmp_lt_i16_e32 vcc_lo, -1, v0
 ; GFX10CHECK-NEXT:    v_add_nc_u16 v1, v1, 0xff80
@@ -420,7 +403,6 @@ define i1 @posnormal_bf16(bfloat %x) nounwind {
 ; GFX11CHECK-LABEL: posnormal_bf16:
 ; GFX11CHECK:       ; %bb.0:
 ; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11CHECK-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX11CHECK-NEXT:    v_and_b32_e32 v1, 0x7fff, v0
 ; GFX11CHECK-NEXT:    v_cmp_lt_i16_e32 vcc_lo, -1, v0
 ; GFX11CHECK-NEXT:    v_add_nc_u16 v1, v1, 0xff80
@@ -450,7 +432,6 @@ define i1 @negnormal_bf16(bfloat %x) nounwind {
 ; GFX8CHECK-LABEL: negnormal_bf16:
 ; GFX8CHECK:       ; %bb.0:
 ; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8CHECK-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX8CHECK-NEXT:    v_cmp_gt_i16_e32 vcc, 0, v0
 ; GFX8CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX8CHECK-NEXT:    v_add_u16_e32 v0, 0xff80, v0
@@ -463,7 +444,6 @@ define i1 @negnormal_bf16(bfloat %x) nounwind {
 ; GFX9CHECK-LABEL: negnormal_bf16:
 ; GFX9CHECK:       ; %bb.0:
 ; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9CHECK-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX9CHECK-NEXT:    v_cmp_gt_i16_e32 vcc, 0, v0
 ; GFX9CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX9CHECK-NEXT:    v_add_u16_e32 v0, 0xff80, v0
@@ -476,7 +456,6 @@ define i1 @negnormal_bf16(bfloat %x) nounwind {
 ; GFX10CHECK-LABEL: negnormal_bf16:
 ; GFX10CHECK:       ; %bb.0:
 ; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10CHECK-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX10CHECK-NEXT:    v_and_b32_e32 v1, 0x7fff, v0
 ; GFX10CHECK-NEXT:    v_cmp_gt_i16_e32 vcc_lo, 0, v0
 ; GFX10CHECK-NEXT:    v_add_nc_u16 v1, v1, 0xff80
@@ -488,7 +467,6 @@ define i1 @negnormal_bf16(bfloat %x) nounwind {
 ; GFX11CHECK-LABEL: negnormal_bf16:
 ; GFX11CHECK:       ; %bb.0:
 ; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11CHECK-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX11CHECK-NEXT:    v_and_b32_e32 v1, 0x7fff, v0
 ; GFX11CHECK-NEXT:    v_cmp_gt_i16_e32 vcc_lo, 0, v0
 ; GFX11CHECK-NEXT:    v_add_nc_u16 v1, v1, 0xff80
@@ -515,8 +493,7 @@ define i1 @possubnormal_bf16(bfloat %x) nounwind {
 ; GFX8CHECK-LABEL: possubnormal_bf16:
 ; GFX8CHECK:       ; %bb.0:
 ; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8CHECK-NEXT:    v_mov_b32_e32 v1, -1
-; GFX8CHECK-NEXT:    v_add_u16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8CHECK-NEXT:    v_add_u16_e32 v0, -1, v0
 ; GFX8CHECK-NEXT:    s_movk_i32 s4, 0x7f
 ; GFX8CHECK-NEXT:    v_cmp_gt_u16_e32 vcc, s4, v0
 ; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
@@ -525,8 +502,7 @@ define i1 @possubnormal_bf16(bfloat %x) nounwind {
 ; GFX9CHECK-LABEL: possubnormal_bf16:
 ; GFX9CHECK:       ; %bb.0:
 ; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9CHECK-NEXT:    v_mov_b32_e32 v1, -1
-; GFX9CHECK-NEXT:    v_add_u16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9CHECK-NEXT:    v_add_u16_e32 v0, -1, v0
 ; GFX9CHECK-NEXT:    s_movk_i32 s4, 0x7f
 ; GFX9CHECK-NEXT:    v_cmp_gt_u16_e32 vcc, s4, v0
 ; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
@@ -535,7 +511,6 @@ define i1 @possubnormal_bf16(bfloat %x) nounwind {
 ; GFX10CHECK-LABEL: possubnormal_bf16:
 ; GFX10CHECK:       ; %bb.0:
 ; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10CHECK-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX10CHECK-NEXT:    v_add_nc_u16 v0, v0, -1
 ; GFX10CHECK-NEXT:    v_cmp_gt_u16_e32 vcc_lo, 0x7f, v0
 ; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
@@ -544,7 +519,6 @@ define i1 @possubnormal_bf16(bfloat %x) nounwind {
 ; GFX11CHECK-LABEL: possubnormal_bf16:
 ; GFX11CHECK:       ; %bb.0:
 ; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11CHECK-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX11CHECK-NEXT:    v_add_nc_u16 v0, v0, -1
 ; GFX11CHECK-NEXT:    v_cmp_gt_u16_e32 vcc_lo, 0x7f, v0
 ; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
@@ -570,7 +544,6 @@ define i1 @negsubnormal_bf16(bfloat %x) nounwind {
 ; GFX8CHECK-LABEL: negsubnormal_bf16:
 ; GFX8CHECK:       ; %bb.0:
 ; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8CHECK-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX8CHECK-NEXT:    v_cmp_gt_i16_e32 vcc, 0, v0
 ; GFX8CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX8CHECK-NEXT:    v_add_u16_e32 v0, -1, v0
@@ -583,7 +556,6 @@ define i1 @negsubnormal_bf16(bfloat %x) nounwind {
 ; GFX9CHECK-LABEL: negsubnormal_bf16:
 ; GFX9CHECK:       ; %bb.0:
 ; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9CHECK-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX9CHECK-NEXT:    v_cmp_gt_i16_e32 vcc, 0, v0
 ; GFX9CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX9CHECK-NEXT:    v_add_u16_e32 v0, -1, v0
@@ -596,7 +568,6 @@ define i1 @negsubnormal_bf16(bfloat %x) nounwind {
 ; GFX10CHECK-LABEL: negsubnormal_bf16:
 ; GFX10CHECK:       ; %bb.0:
 ; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10CHECK-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX10CHECK-NEXT:    v_and_b32_e32 v1, 0x7fff, v0
 ; GFX10CHECK-NEXT:    v_cmp_gt_i16_e32 vcc_lo, 0, v0
 ; GFX10CHECK-NEXT:    v_add_nc_u16 v1, v1, -1
@@ -608,7 +579,6 @@ define i1 @negsubnormal_bf16(bfloat %x) nounwind {
 ; GFX11CHECK-LABEL: negsubnormal_bf16:
 ; GFX11CHECK:       ; %bb.0:
 ; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11CHECK-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX11CHECK-NEXT:    v_and_b32_e32 v1, 0x7fff, v0
 ; GFX11CHECK-NEXT:    v_cmp_gt_i16_e32 vcc_lo, 0, v0
 ; GFX11CHECK-NEXT:    v_add_nc_u16 v1, v1, -1
@@ -632,7 +602,6 @@ define i1 @poszero_bf16(bfloat %x) nounwind {
 ; GFX8CHECK-LABEL: poszero_bf16:
 ; GFX8CHECK:       ; %bb.0:
 ; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8CHECK-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX8CHECK-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v0
 ; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
 ; GFX8CHECK-NEXT:    s_setpc_b64 s[30:31]
@@ -640,23 +609,20 @@ define i1 @poszero_bf16(bfloat %x) nounwind {
 ; GFX9CHECK-LABEL: poszero_bf16:
 ; GFX9CHECK:       ; %bb.0:
 ; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9CHECK-NEXT:    v_mov_b32_e32 v1, 0
-; GFX9CHECK-NEXT:    v_cmp_eq_u16_sdwa s[4:5], v0, v1 src0_sel:WORD_1 src1_sel:DWORD
-; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GFX9CHECK-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v0
+; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
 ; GFX9CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10CHECK-LABEL: poszero_bf16:
 ; GFX10CHECK:       ; %bb.0:
 ; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10CHECK-NEXT:    v_mov_b32_e32 v1, 0
-; GFX10CHECK-NEXT:    v_cmp_eq_u16_sdwa s4, v0, v1 src0_sel:WORD_1 src1_sel:DWORD
-; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s4
+; GFX10CHECK-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v0
+; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
 ; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11CHECK-LABEL: poszero_bf16:
 ; GFX11CHECK:       ; %bb.0:
 ; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11CHECK-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX11CHECK-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v0
 ; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
 ; GFX11CHECK-NEXT:    s_setpc_b64 s[30:31]
@@ -677,7 +643,6 @@ define i1 @negzero_bf16(bfloat %x) nounwind {
 ; GFX8CHECK-LABEL: negzero_bf16:
 ; GFX8CHECK:       ; %bb.0:
 ; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8CHECK-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX8CHECK-NEXT:    s_movk_i32 s4, 0x8000
 ; GFX8CHECK-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v0
 ; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
@@ -687,22 +652,20 @@ define i1 @negzero_bf16(bfloat %x) nounwind {
 ; GFX9CHECK:       ; %bb.0:
 ; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9CHECK-NEXT:    s_movk_i32 s4, 0x8000
-; GFX9CHECK-NEXT:    v_cmp_eq_u16_sdwa s[4:5], v0, s4 src0_sel:WORD_1 src1_sel:DWORD
-; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GFX9CHECK-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v0
+; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
 ; GFX9CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10CHECK-LABEL: negzero_bf16:
 ; GFX10CHECK:       ; %bb.0:
 ; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10CHECK-NEXT:    v_mov_b32_e32 v1, 0xffff8000
-; GFX10CHECK-NEXT:    v_cmp_eq_u16_sdwa s4, v0, v1 src0_sel:WORD_1 src1_sel:DWORD
-; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s4
+; GFX10CHECK-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0
+; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
 ; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11CHECK-LABEL: negzero_bf16:
 ; GFX11CHECK:       ; %bb.0:
 ; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11CHECK-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX11CHECK-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0
 ; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
 ; GFX11CHECK-NEXT:    s_setpc_b64 s[30:31]
@@ -723,7 +686,6 @@ define i1 @posfinite_bf16(bfloat %x) nounwind {
 ; GFX8CHECK-LABEL: posfinite_bf16:
 ; GFX8CHECK:       ; %bb.0:
 ; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8CHECK-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX8CHECK-NEXT:    s_movk_i32 s4, 0x7f80
 ; GFX8CHECK-NEXT:    v_cmp_gt_u16_e32 vcc, s4, v0
 ; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
@@ -733,22 +695,20 @@ define i1 @posfinite_bf16(bfloat %x) nounwind {
 ; GFX9CHECK:       ; %bb.0:
 ; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9CHECK-NEXT:    s_movk_i32 s4, 0x7f80
-; GFX9CHECK-NEXT:    v_cmp_lt_u16_sdwa s[4:5], v0, s4 src0_sel:WORD_1 src1_sel:DWORD
-; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GFX9CHECK-NEXT:    v_cmp_gt_u16_e32 vcc, s4, v0
+; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
 ; GFX9CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10CHECK-LABEL: posfinite_bf16:
 ; GFX10CHECK:       ; %bb.0:
 ; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10CHECK-NEXT:    v_mov_b32_e32 v1, 0x7f80
-; GFX10CHECK-NEXT:    v_cmp_lt_u16_sdwa s4, v0, v1 src0_sel:WORD_1 src1_sel:DWORD
-; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s4
+; GFX10CHECK-NEXT:    v_cmp_gt_u16_e32 vcc_lo, 0x7f80, v0
+; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
 ; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11CHECK-LABEL: posfinite_bf16:
 ; GFX11CHECK:       ; %bb.0:
 ; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11CHECK-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX11CHECK-NEXT:    v_cmp_gt_u16_e32 vcc_lo, 0x7f80, v0
 ; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
 ; GFX11CHECK-NEXT:    s_setpc_b64 s[30:31]
@@ -772,7 +732,6 @@ define i1 @negfinite_bf16(bfloat %x) nounwind {
 ; GFX8CHECK-LABEL: negfinite_bf16:
 ; GFX8CHECK:       ; %bb.0:
 ; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8CHECK-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX8CHECK-NEXT:    v_cmp_gt_i16_e32 vcc, 0, v0
 ; GFX8CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX8CHECK-NEXT:    s_movk_i32 s4, 0x7f80
@@ -784,7 +743,6 @@ define i1 @negfinite_bf16(bfloat %x) nounwind {
 ; GFX9CHECK-LABEL: negfinite_bf16:
 ; GFX9CHECK:       ; %bb.0:
 ; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9CHECK-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX9CHECK-NEXT:    v_cmp_gt_i16_e32 vcc, 0, v0
 ; GFX9CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX9CHECK-NEXT:    s_movk_i32 s4, 0x7f80
@@ -796,7 +754,6 @@ define i1 @negfinite_bf16(bfloat %x) nounwind {
 ; GFX10CHECK-LABEL: negfinite_bf16:
 ; GFX10CHECK:       ; %bb.0:
 ; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10CHECK-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX10CHECK-NEXT:    v_and_b32_e32 v1, 0x7fff, v0
 ; GFX10CHECK-NEXT:    v_cmp_gt_i16_e32 vcc_lo, 0, v0
 ; GFX10CHECK-NEXT:    v_cmp_gt_i16_e64 s4, 0x7f80, v1
@@ -807,7 +764,6 @@ define i1 @negfinite_bf16(bfloat %x) nounwind {
 ; GFX11CHECK-LABEL: negfinite_bf16:
 ; GFX11CHECK:       ; %bb.0:
 ; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11CHECK-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX11CHECK-NEXT:    v_and_b32_e32 v1, 0x7fff, v0
 ; GFX11CHECK-NEXT:    v_cmp_gt_i16_e32 vcc_lo, 0, v0
 ; GFX11CHECK-NEXT:    v_cmp_gt_i16_e64 s0, 0x7f80, v1
@@ -831,8 +787,7 @@ define i1 @isnan_bf16(bfloat %x) nounwind {
 ; GFX8CHECK-LABEL: isnan_bf16:
 ; GFX8CHECK:       ; %bb.0:
 ; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8CHECK-NEXT:    v_mov_b32_e32 v1, 0x7fff
-; GFX8CHECK-NEXT:    v_and_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX8CHECK-NEXT:    s_movk_i32 s4, 0x7f80
 ; GFX8CHECK-NEXT:    v_cmp_lt_i16_e32 vcc, s4, v0
 ; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
@@ -841,8 +796,7 @@ define i1 @isnan_bf16(bfloat %x) nounwind {
 ; GFX9CHECK-LABEL: isnan_bf16:
 ; GFX9CHECK:       ; %bb.0:
 ; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9CHECK-NEXT:    s_movk_i32 s4, 0x7fff
-; GFX9CHECK-NEXT:    v_and_b32_sdwa v0, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX9CHECK-NEXT:    s_movk_i32 s4, 0x7f80
 ; GFX9CHECK-NEXT:    v_cmp_lt_i16_e32 vcc, s4, v0
 ; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
@@ -851,8 +805,7 @@ define i1 @isnan_bf16(bfloat %x) nounwind {
 ; GFX10CHECK-LABEL: isnan_bf16:
 ; GFX10CHECK:       ; %bb.0:
 ; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10CHECK-NEXT:    v_mov_b32_e32 v1, 0x7fff
-; GFX10CHECK-NEXT:    v_and_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX10CHECK-NEXT:    v_cmp_lt_i16_e32 vcc_lo, 0x7f80, v0
 ; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
 ; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
@@ -860,7 +813,6 @@ define i1 @isnan_bf16(bfloat %x) nounwind {
 ; GFX11CHECK-LABEL: isnan_bf16:
 ; GFX11CHECK:       ; %bb.0:
 ; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11CHECK-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX11CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX11CHECK-NEXT:    v_cmp_lt_i16_e32 vcc_lo, 0x7f80, v0
 ; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
@@ -882,8 +834,7 @@ define i1 @not_isnan_bf16(bfloat %x) {
 ; GFX8CHECK-LABEL: not_isnan_bf16:
 ; GFX8CHECK:       ; %bb.0:
 ; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8CHECK-NEXT:    v_mov_b32_e32 v1, 0x7fff
-; GFX8CHECK-NEXT:    v_and_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX8CHECK-NEXT:    s_movk_i32 s4, 0x7f81
 ; GFX8CHECK-NEXT:    v_cmp_gt_i16_e32 vcc, s4, v0
 ; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
@@ -892,8 +843,7 @@ define i1 @not_isnan_bf16(bfloat %x) {
 ; GFX9CHECK-LABEL: not_isnan_bf16:
 ; GFX9CHECK:       ; %bb.0:
 ; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9CHECK-NEXT:    s_movk_i32 s4, 0x7fff
-; GFX9CHECK-NEXT:    v_and_b32_sdwa v0, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX9CHECK-NEXT:    s_movk_i32 s4, 0x7f81
 ; GFX9CHECK-NEXT:    v_cmp_gt_i16_e32 vcc, s4, v0
 ; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
@@ -902,8 +852,7 @@ define i1 @not_isnan_bf16(bfloat %x) {
 ; GFX10CHECK-LABEL: not_isnan_bf16:
 ; GFX10CHECK:       ; %bb.0:
 ; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10CHECK-NEXT:    v_mov_b32_e32 v1, 0x7fff
-; GFX10CHECK-NEXT:    v_and_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX10CHECK-NEXT:    v_cmp_gt_i16_e32 vcc_lo, 0x7f81, v0
 ; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
 ; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
@@ -911,7 +860,6 @@ define i1 @not_isnan_bf16(bfloat %x) {
 ; GFX11CHECK-LABEL: not_isnan_bf16:
 ; GFX11CHECK:       ; %bb.0:
 ; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11CHECK-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX11CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX11CHECK-NEXT:    v_cmp_gt_i16_e32 vcc_lo, 0x7f81, v0
 ; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
@@ -1172,8 +1120,7 @@ define i1 @isnan_bf16_strictfp(bfloat %x) strictfp nounwind {
 ; GFX8CHECK-LABEL: isnan_bf16_strictfp:
 ; GFX8CHECK:       ; %bb.0:
 ; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8CHECK-NEXT:    v_mov_b32_e32 v1, 0x7fff
-; GFX8CHECK-NEXT:    v_and_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX8CHECK-NEXT:    s_movk_i32 s4, 0x7f80
 ; GFX8CHECK-NEXT:    v_cmp_lt_i16_e32 vcc, s4, v0
 ; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
@@ -1182,8 +1129,7 @@ define i1 @isnan_bf16_strictfp(bfloat %x) strictfp nounwind {
 ; GFX9CHECK-LABEL: isnan_bf16_strictfp:
 ; GFX9CHECK:       ; %bb.0:
 ; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9CHECK-NEXT:    s_movk_i32 s4, 0x7fff
-; GFX9CHECK-NEXT:    v_and_b32_sdwa v0, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX9CHECK-NEXT:    s_movk_i32 s4, 0x7f80
 ; GFX9CHECK-NEXT:    v_cmp_lt_i16_e32 vcc, s4, v0
 ; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
@@ -1192,8 +1138,7 @@ define i1 @isnan_bf16_strictfp(bfloat %x) strictfp nounwind {
 ; GFX10CHECK-LABEL: isnan_bf16_strictfp:
 ; GFX10CHECK:       ; %bb.0:
 ; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10CHECK-NEXT:    v_mov_b32_e32 v1, 0x7fff
-; GFX10CHECK-NEXT:    v_and_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX10CHECK-NEXT:    v_cmp_lt_i16_e32 vcc_lo, 0x7f80, v0
 ; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
 ; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
@@ -1201,7 +1146,6 @@ define i1 @isnan_bf16_strictfp(bfloat %x) strictfp nounwind {
 ; GFX11CHECK-LABEL: isnan_bf16_strictfp:
 ; GFX11CHECK:       ; %bb.0:
 ; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11CHECK-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX11CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX11CHECK-NEXT:    v_cmp_lt_i16_e32 vcc_lo, 0x7f80, v0
 ; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
@@ -1223,8 +1167,7 @@ define i1 @isinf_bf16(bfloat %x) nounwind {
 ; GFX8CHECK-LABEL: isinf_bf16:
 ; GFX8CHECK:       ; %bb.0:
 ; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8CHECK-NEXT:    v_mov_b32_e32 v1, 0x7fff
-; GFX8CHECK-NEXT:    v_and_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX8CHECK-NEXT:    s_movk_i32 s4, 0x7f80
 ; GFX8CHECK-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v0
 ; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
@@ -1233,8 +1176,7 @@ define i1 @isinf_bf16(bfloat %x) nounwind {
 ; GFX9CHECK-LABEL: isinf_bf16:
 ; GFX9CHECK:       ; %bb.0:
 ; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9CHECK-NEXT:    s_movk_i32 s4, 0x7fff
-; GFX9CHECK-NEXT:    v_and_b32_sdwa v0, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX9CHECK-NEXT:    s_movk_i32 s4, 0x7f80
 ; GFX9CHECK-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v0
 ; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
@@ -1243,8 +1185,7 @@ define i1 @isinf_bf16(bfloat %x) nounwind {
 ; GFX10CHECK-LABEL: isinf_bf16:
 ; GFX10CHECK:       ; %bb.0:
 ; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10CHECK-NEXT:    v_mov_b32_e32 v1, 0x7fff
-; GFX10CHECK-NEXT:    v_and_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX10CHECK-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x7f80, v0
 ; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
 ; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
@@ -1252,7 +1193,6 @@ define i1 @isinf_bf16(bfloat %x) nounwind {
 ; GFX11CHECK-LABEL: isinf_bf16:
 ; GFX11CHECK:       ; %bb.0:
 ; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11CHECK-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX11CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX11CHECK-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x7f80, v0
 ; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
@@ -1274,8 +1214,7 @@ define i1 @isfinite_bf16(bfloat %x) nounwind {
 ; GFX8CHECK-LABEL: isfinite_bf16:
 ; GFX8CHECK:       ; %bb.0:
 ; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8CHECK-NEXT:    v_mov_b32_e32 v1, 0x7fff
-; GFX8CHECK-NEXT:    v_and_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX8CHECK-NEXT:    s_movk_i32 s4, 0x7f80
 ; GFX8CHECK-NEXT:    v_cmp_gt_i16_e32 vcc, s4, v0
 ; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
@@ -1284,8 +1223,7 @@ define i1 @isfinite_bf16(bfloat %x) nounwind {
 ; GFX9CHECK-LABEL: isfinite_bf16:
 ; GFX9CHECK:       ; %bb.0:
 ; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9CHECK-NEXT:    s_movk_i32 s4, 0x7fff
-; GFX9CHECK-NEXT:    v_and_b32_sdwa v0, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX9CHECK-NEXT:    s_movk_i32 s4, 0x7f80
 ; GFX9CHECK-NEXT:    v_cmp_gt_i16_e32 vcc, s4, v0
 ; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
@@ -1294,8 +1232,7 @@ define i1 @isfinite_bf16(bfloat %x) nounwind {
 ; GFX10CHECK-LABEL: isfinite_bf16:
 ; GFX10CHECK:       ; %bb.0:
 ; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10CHECK-NEXT:    v_mov_b32_e32 v1, 0x7fff
-; GFX10CHECK-NEXT:    v_and_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX10CHECK-NEXT:    v_cmp_gt_i16_e32 vcc_lo, 0x7f80, v0
 ; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
 ; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
@@ -1303,7 +1240,6 @@ define i1 @isfinite_bf16(bfloat %x) nounwind {
 ; GFX11CHECK-LABEL: isfinite_bf16:
 ; GFX11CHECK:       ; %bb.0:
 ; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11CHECK-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX11CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX11CHECK-NEXT:    v_cmp_gt_i16_e32 vcc_lo, 0x7f80, v0
 ; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
@@ -1325,8 +1261,7 @@ define i1 @issubnormal_or_zero_bf16(bfloat %x) {
 ; GFX8CHECK-LABEL: issubnormal_or_zero_bf16:
 ; GFX8CHECK:       ; %bb.0: ; %entry
 ; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8CHECK-NEXT:    v_mov_b32_e32 v1, 0x7f80
-; GFX8CHECK-NEXT:    v_and_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8CHECK-NEXT:    v_and_b32_e32 v0, 0x7f80, v0
 ; GFX8CHECK-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v0
 ; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
 ; GFX8CHECK-NEXT:    s_setpc_b64 s[30:31]
@@ -1334,8 +1269,7 @@ define i1 @issubnormal_or_zero_bf16(bfloat %x) {
 ; GFX9CHECK-LABEL: issubnormal_or_zero_bf16:
 ; GFX9CHECK:       ; %bb.0: ; %entry
 ; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9CHECK-NEXT:    s_movk_i32 s4, 0x7f80
-; GFX9CHECK-NEXT:    v_and_b32_sdwa v0, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9CHECK-NEXT:    v_and_b32_e32 v0, 0x7f80, v0
 ; GFX9CHECK-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v0
 ; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
 ; GFX9CHECK-NEXT:    s_setpc_b64 s[30:31]
@@ -1343,8 +1277,7 @@ define i1 @issubnormal_or_zero_bf16(bfloat %x) {
 ; GFX10CHECK-LABEL: issubnormal_or_zero_bf16:
 ; GFX10CHECK:       ; %bb.0: ; %entry
 ; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10CHECK-NEXT:    v_mov_b32_e32 v1, 0x7f80
-; GFX10CHECK-NEXT:    v_and_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10CHECK-NEXT:    v_and_b32_e32 v0, 0x7f80, v0
 ; GFX10CHECK-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v0
 ; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
 ; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
@@ -1352,7 +1285,6 @@ define i1 @issubnormal_or_zero_bf16(bfloat %x) {
 ; GFX11CHECK-LABEL: issubnormal_or_zero_bf16:
 ; GFX11CHECK:       ; %bb.0: ; %entry
 ; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11CHECK-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX11CHECK-NEXT:    v_and_b32_e32 v0, 0x7f80, v0
 ; GFX11CHECK-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v0
 ; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
@@ -1375,8 +1307,7 @@ define i1 @not_issubnormal_or_zero_bf16(bfloat %x) {
 ; GFX8CHECK-LABEL: not_issubnormal_or_zero_bf16:
 ; GFX8CHECK:       ; %bb.0: ; %entry
 ; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8CHECK-NEXT:    v_mov_b32_e32 v1, 0x7f80
-; GFX8CHECK-NEXT:    v_and_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8CHECK-NEXT:    v_and_b32_e32 v0, 0x7f80, v0
 ; GFX8CHECK-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v0
 ; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
 ; GFX8CHECK-NEXT:    s_setpc_b64 s[30:31]
@@ -1384,8 +1315,7 @@ define i1 @not_issubnormal_or_zero_bf16(bfloat %x) {
 ; GFX9CHECK-LABEL: not_issubnormal_or_zero_bf16:
 ; GFX9CHECK:       ; %bb.0: ; %entry
 ; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9CHECK-NEXT:    s_movk_i32 s4, 0x7f80
-; GFX9CHECK-NEXT:    v_and_b32_sdwa v0, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9CHECK-NEXT:    v_and_b32_e32 v0, 0x7f80, v0
 ; GFX9CHECK-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v0
 ; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
 ; GFX9CHECK-NEXT:    s_setpc_b64 s[30:31]
@@ -1393,8 +1323,7 @@ define i1 @not_issubnormal_or_zero_bf16(bfloat %x) {
 ; GFX10CHECK-LABEL: not_issubnormal_or_zero_bf16:
 ; GFX10CHECK:       ; %bb.0: ; %entry
 ; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10CHECK-NEXT:    v_mov_b32_e32 v1, 0x7f80
-; GFX10CHECK-NEXT:    v_and_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10CHECK-NEXT:    v_and_b32_e32 v0, 0x7f80, v0
 ; GFX10CHECK-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v0
 ; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
 ; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
@@ -1402,7 +1331,6 @@ define i1 @not_issubnormal_or_zero_bf16(bfloat %x) {
 ; GFX11CHECK-LABEL: not_issubnormal_or_zero_bf16:
 ; GFX11CHECK:       ; %bb.0: ; %entry
 ; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11CHECK-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX11CHECK-NEXT:    v_and_b32_e32 v0, 0x7f80, v0
 ; GFX11CHECK-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v0
 ; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
@@ -1427,8 +1355,7 @@ define i1 @isnormal_bf16(bfloat %x) {
 ; GFX8CHECK-LABEL: isnormal_bf16:
 ; GFX8CHECK:       ; %bb.0:
 ; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8CHECK-NEXT:    v_mov_b32_e32 v1, 0x7fff
-; GFX8CHECK-NEXT:    v_and_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX8CHECK-NEXT:    v_add_u16_e32 v0, 0xff80, v0
 ; GFX8CHECK-NEXT:    s_movk_i32 s4, 0x7f00
 ; GFX8CHECK-NEXT:    v_cmp_gt_u16_e32 vcc, s4, v0
@@ -1438,8 +1365,7 @@ define i1 @isnormal_bf16(bfloat %x) {
 ; GFX9CHECK-LABEL: isnormal_bf16:
 ; GFX9CHECK:       ; %bb.0:
 ; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9CHECK-NEXT:    s_movk_i32 s4, 0x7fff
-; GFX9CHECK-NEXT:    v_and_b32_sdwa v0, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX9CHECK-NEXT:    v_add_u16_e32 v0, 0xff80, v0
 ; GFX9CHECK-NEXT:    s_movk_i32 s4, 0x7f00
 ; GFX9CHECK-NEXT:    v_cmp_gt_u16_e32 vcc, s4, v0
@@ -1449,8 +1375,7 @@ define i1 @isnormal_bf16(bfloat %x) {
 ; GFX10CHECK-LABEL: isnormal_bf16:
 ; GFX10CHECK:       ; %bb.0:
 ; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10CHECK-NEXT:    v_mov_b32_e32 v1, 0x7fff
-; GFX10CHECK-NEXT:    v_and_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX10CHECK-NEXT:    v_add_nc_u16 v0, v0, 0xff80
 ; GFX10CHECK-NEXT:    v_cmp_gt_u16_e32 vcc_lo, 0x7f00, v0
 ; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
@@ -1459,7 +1384,6 @@ define i1 @isnormal_bf16(bfloat %x) {
 ; GFX11CHECK-LABEL: isnormal_bf16:
 ; GFX11CHECK:       ; %bb.0:
 ; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11CHECK-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX11CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX11CHECK-NEXT:    v_add_nc_u16 v0, v0, 0xff80
 ; GFX11CHECK-NEXT:    v_cmp_gt_u16_e32 vcc_lo, 0x7f00, v0
@@ -1484,8 +1408,7 @@ define i1 @not_isnormal_bf16(bfloat %x) {
 ; GFX8CHECK-LABEL: not_isnormal_bf16:
 ; GFX8CHECK:       ; %bb.0:
 ; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8CHECK-NEXT:    v_mov_b32_e32 v1, 0x7fff
-; GFX8CHECK-NEXT:    v_and_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX8CHECK-NEXT:    v_add_u16_e32 v0, 0xff80, v0
 ; GFX8CHECK-NEXT:    s_movk_i32 s4, 0x7eff
 ; GFX8CHECK-NEXT:    v_cmp_lt_u16_e32 vcc, s4, v0
@@ -1495,8 +1418,7 @@ define i1 @not_isnormal_bf16(bfloat %x) {
 ; GFX9CHECK-LABEL: not_isnormal_bf16:
 ; GFX9CHECK:       ; %bb.0:
 ; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9CHECK-NEXT:    s_movk_i32 s4, 0x7fff
-; GFX9CHECK-NEXT:    v_and_b32_sdwa v0, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX9CHECK-NEXT:    v_add_u16_e32 v0, 0xff80, v0
 ; GFX9CHECK-NEXT:    s_movk_i32 s4, 0x7eff
 ; GFX9CHECK-NEXT:    v_cmp_lt_u16_e32 vcc, s4, v0
@@ -1506,8 +1428,7 @@ define i1 @not_isnormal_bf16(bfloat %x) {
 ; GFX10CHECK-LABEL: not_isnormal_bf16:
 ; GFX10CHECK:       ; %bb.0:
 ; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10CHECK-NEXT:    v_mov_b32_e32 v1, 0x7fff
-; GFX10CHECK-NEXT:    v_and_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX10CHECK-NEXT:    v_add_nc_u16 v0, v0, 0xff80
 ; GFX10CHECK-NEXT:    v_cmp_lt_u16_e32 vcc_lo, 0x7eff, v0
 ; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
@@ -1516,7 +1437,6 @@ define i1 @not_isnormal_bf16(bfloat %x) {
 ; GFX11CHECK-LABEL: not_isnormal_bf16:
 ; GFX11CHECK:       ; %bb.0:
 ; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11CHECK-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX11CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX11CHECK-NEXT:    v_add_nc_u16 v0, v0, 0xff80
 ; GFX11CHECK-NEXT:    v_cmp_lt_u16_e32 vcc_lo, 0x7eff, v0
@@ -1544,7 +1464,6 @@ define i1 @not_is_plus_normal_bf16(bfloat %x) {
 ; GFX8CHECK-LABEL: not_is_plus_normal_bf16:
 ; GFX8CHECK:       ; %bb.0:
 ; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8CHECK-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX8CHECK-NEXT:    v_cmp_gt_i16_e32 vcc, 0, v0
 ; GFX8CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX8CHECK-NEXT:    v_add_u16_e32 v0, 0xff80, v0
@@ -1557,7 +1476,6 @@ define i1 @not_is_plus_normal_bf16(bfloat %x) {
 ; GFX9CHECK-LABEL: not_is_plus_normal_bf16:
 ; GFX9CHECK:       ; %bb.0:
 ; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9CHECK-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX9CHECK-NEXT:    v_cmp_gt_i16_e32 vcc, 0, v0
 ; GFX9CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX9CHECK-NEXT:    v_add_u16_e32 v0, 0xff80, v0
@@ -1570,7 +1488,6 @@ define i1 @not_is_plus_normal_bf16(bfloat %x) {
 ; GFX10CHECK-LABEL: not_is_plus_normal_bf16:
 ; GFX10CHECK:       ; %bb.0:
 ; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10CHECK-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX10CHECK-NEXT:    v_and_b32_e32 v1, 0x7fff, v0
 ; GFX10CHECK-NEXT:    v_cmp_gt_i16_e32 vcc_lo, 0, v0
 ; GFX10CHECK-NEXT:    v_add_nc_u16 v1, v1, 0xff80
@@ -1582,7 +1499,6 @@ define i1 @not_is_plus_normal_bf16(bfloat %x) {
 ; GFX11CHECK-LABEL: not_is_plus_normal_bf16:
 ; GFX11CHECK:       ; %bb.0:
 ; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11CHECK-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX11CHECK-NEXT:    v_and_b32_e32 v1, 0x7fff, v0
 ; GFX11CHECK-NEXT:    v_cmp_gt_i16_e32 vcc_lo, 0, v0
 ; GFX11CHECK-NEXT:    v_add_nc_u16 v1, v1, 0xff80
@@ -1612,7 +1528,6 @@ define i1 @not_is_neg_normal_bf16(bfloat %x) {
 ; GFX8CHECK-LABEL: not_is_neg_normal_bf16:
 ; GFX8CHECK:       ; %bb.0:
 ; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8CHECK-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX8CHECK-NEXT:    v_cmp_lt_i16_e32 vcc, -1, v0
 ; GFX8CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX8CHECK-NEXT:    v_add_u16_e32 v0, 0xff80, v0
@@ -1625,7 +1540,6 @@ define i1 @not_is_neg_normal_bf16(bfloat %x) {
 ; GFX9CHECK-LABEL: not_is_neg_normal_bf16:
 ; GFX9CHECK:       ; %bb.0:
 ; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9CHECK-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX9CHECK-NEXT:    v_cmp_lt_i16_e32 vcc, -1, v0
 ; GFX9CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX9CHECK-NEXT:    v_add_u16_e32 v0, 0xff80, v0
@@ -1638,7 +1552,6 @@ define i1 @not_is_neg_normal_bf16(bfloat %x) {
 ; GFX10CHECK-LABEL: not_is_neg_normal_bf16:
 ; GFX10CHECK:       ; %bb.0:
 ; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10CHECK-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX10CHECK-NEXT:    v_and_b32_e32 v1, 0x7fff, v0
 ; GFX10CHECK-NEXT:    v_cmp_lt_i16_e32 vcc_lo, -1, v0
 ; GFX10CHECK-NEXT:    v_add_nc_u16 v1, v1, 0xff80
@@ -1650,7 +1563,6 @@ define i1 @not_is_neg_normal_bf16(bfloat %x) {
 ; GFX11CHECK-LABEL: not_is_neg_normal_bf16:
 ; GFX11CHECK:       ; %bb.0:
 ; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11CHECK-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX11CHECK-NEXT:    v_and_b32_e32 v1, 0x7fff, v0
 ; GFX11CHECK-NEXT:    v_cmp_lt_i16_e32 vcc_lo, -1, v0
 ; GFX11CHECK-NEXT:    v_add_nc_u16 v1, v1, 0xff80
@@ -1676,8 +1588,7 @@ define i1 @issubnormal_bf16(bfloat %x) {
 ; GFX8CHECK-LABEL: issubnormal_bf16:
 ; GFX8CHECK:       ; %bb.0:
 ; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8CHECK-NEXT:    v_mov_b32_e32 v1, 0x7fff
-; GFX8CHECK-NEXT:    v_and_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX8CHECK-NEXT:    v_add_u16_e32 v0, -1, v0
 ; GFX8CHECK-NEXT:    s_movk_i32 s4, 0x7f
 ; GFX8CHECK-NEXT:    v_cmp_gt_u16_e32 vcc, s4, v0
@@ -1687,8 +1598,7 @@ define i1 @issubnormal_bf16(bfloat %x) {
 ; GFX9CHECK-LABEL: issubnormal_bf16:
 ; GFX9CHECK:       ; %bb.0:
 ; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9CHECK-NEXT:    s_movk_i32 s4, 0x7fff
-; GFX9CHECK-NEXT:    v_and_b32_sdwa v0, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX9CHECK-NEXT:    v_add_u16_e32 v0, -1, v0
 ; GFX9CHECK-NEXT:    s_movk_i32 s4, 0x7f
 ; GFX9CHECK-NEXT:    v_cmp_gt_u16_e32 vcc, s4, v0
@@ -1698,8 +1608,7 @@ define i1 @issubnormal_bf16(bfloat %x) {
 ; GFX10CHECK-LABEL: issubnormal_bf16:
 ; GFX10CHECK:       ; %bb.0:
 ; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10CHECK-NEXT:    v_mov_b32_e32 v1, 0x7fff
-; GFX10CHECK-NEXT:    v_and_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX10CHECK-NEXT:    v_add_nc_u16 v0, v0, -1
 ; GFX10CHECK-NEXT:    v_cmp_gt_u16_e32 vcc_lo, 0x7f, v0
 ; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
@@ -1708,7 +1617,6 @@ define i1 @issubnormal_bf16(bfloat %x) {
 ; GFX11CHECK-LABEL: issubnormal_bf16:
 ; GFX11CHECK:       ; %bb.0:
 ; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11CHECK-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX11CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX11CHECK-NEXT:    v_add_nc_u16 v0, v0, -1
 ; GFX11CHECK-NEXT:    v_cmp_gt_u16_e32 vcc_lo, 0x7f, v0
@@ -1732,8 +1640,7 @@ define i1 @not_issubnormal_bf16(bfloat %x) {
 ; GFX8CHECK-LABEL: not_issubnormal_bf16:
 ; GFX8CHECK:       ; %bb.0:
 ; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8CHECK-NEXT:    v_mov_b32_e32 v1, 0x7fff
-; GFX8CHECK-NEXT:    v_and_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX8CHECK-NEXT:    v_add_u16_e32 v0, -1, v0
 ; GFX8CHECK-NEXT:    s_movk_i32 s4, 0x7e
 ; GFX8CHECK-NEXT:    v_cmp_lt_u16_e32 vcc, s4, v0
@@ -1743,8 +1650,7 @@ define i1 @not_issubnormal_bf16(bfloat %x) {
 ; GFX9CHECK-LABEL: not_issubnormal_bf16:
 ; GFX9CHECK:       ; %bb.0:
 ; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9CHECK-NEXT:    s_movk_i32 s4, 0x7fff
-; GFX9CHECK-NEXT:    v_and_b32_sdwa v0, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX9CHECK-NEXT:    v_add_u16_e32 v0, -1, v0
 ; GFX9CHECK-NEXT:    s_movk_i32 s4, 0x7e
 ; GFX9CHECK-NEXT:    v_cmp_lt_u16_e32 vcc, s4, v0
@@ -1754,8 +1660,7 @@ define i1 @not_issubnormal_bf16(bfloat %x) {
 ; GFX10CHECK-LABEL: not_issubnormal_bf16:
 ; GFX10CHECK:       ; %bb.0:
 ; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10CHECK-NEXT:    v_mov_b32_e32 v1, 0x7fff
-; GFX10CHECK-NEXT:    v_and_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX10CHECK-NEXT:    v_add_nc_u16 v0, v0, -1
 ; GFX10CHECK-NEXT:    v_cmp_lt_u16_e32 vcc_lo, 0x7e, v0
 ; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
@@ -1764,7 +1669,6 @@ define i1 @not_issubnormal_bf16(bfloat %x) {
 ; GFX11CHECK-LABEL: not_issubnormal_bf16:
 ; GFX11CHECK:       ; %bb.0:
 ; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11CHECK-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX11CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX11CHECK-NEXT:    v_add_nc_u16 v0, v0, -1
 ; GFX11CHECK-NEXT:    v_cmp_lt_u16_e32 vcc_lo, 0x7e, v0
@@ -1786,8 +1690,7 @@ define i1 @iszero_bf16(bfloat %x) {
 ; GFX8CHECK-LABEL: iszero_bf16:
 ; GFX8CHECK:       ; %bb.0:
 ; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8CHECK-NEXT:    v_mov_b32_e32 v1, 0x7fff
-; GFX8CHECK-NEXT:    v_and_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX8CHECK-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v0
 ; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
 ; GFX8CHECK-NEXT:    s_setpc_b64 s[30:31]
@@ -1795,8 +1698,7 @@ define i1 @iszero_bf16(bfloat %x) {
 ; GFX9CHECK-LABEL: iszero_bf16:
 ; GFX9CHECK:       ; %bb.0:
 ; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9CHECK-NEXT:    s_movk_i32 s4, 0x7fff
-; GFX9CHECK-NEXT:    v_and_b32_sdwa v0, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX9CHECK-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v0
 ; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
 ; GFX9CHECK-NEXT:    s_setpc_b64 s[30:31]
@@ -1804,8 +1706,7 @@ define i1 @iszero_bf16(bfloat %x) {
 ; GFX10CHECK-LABEL: iszero_bf16:
 ; GFX10CHECK:       ; %bb.0:
 ; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10CHECK-NEXT:    v_mov_b32_e32 v1, 0x7fff
-; GFX10CHECK-NEXT:    v_and_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX10CHECK-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v0
 ; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
 ; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
@@ -1813,7 +1714,6 @@ define i1 @iszero_bf16(bfloat %x) {
 ; GFX11CHECK-LABEL: iszero_bf16:
 ; GFX11CHECK:       ; %bb.0:
 ; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11CHECK-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX11CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX11CHECK-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v0
 ; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
@@ -1834,8 +1734,7 @@ define i1 @not_iszero_bf16(bfloat %x) {
 ; GFX8CHECK-LABEL: not_iszero_bf16:
 ; GFX8CHECK:       ; %bb.0:
 ; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8CHECK-NEXT:    v_mov_b32_e32 v1, 0x7fff
-; GFX8CHECK-NEXT:    v_and_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX8CHECK-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v0
 ; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
 ; GFX8CHECK-NEXT:    s_setpc_b64 s[30:31]
@@ -1843,8 +1742,7 @@ define i1 @not_iszero_bf16(bfloat %x) {
 ; GFX9CHECK-LABEL: not_iszero_bf16:
 ; GFX9CHECK:       ; %bb.0:
 ; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9CHECK-NEXT:    s_movk_i32 s4, 0x7fff
-; GFX9CHECK-NEXT:    v_and_b32_sdwa v0, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX9CHECK-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v0
 ; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
 ; GFX9CHECK-NEXT:    s_setpc_b64 s[30:31]
@@ -1852,8 +1750,7 @@ define i1 @not_iszero_bf16(bfloat %x) {
 ; GFX10CHECK-LABEL: not_iszero_bf16:
 ; GFX10CHECK:       ; %bb.0:
 ; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10CHECK-NEXT:    v_mov_b32_e32 v1, 0x7fff
-; GFX10CHECK-NEXT:    v_and_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX10CHECK-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v0
 ; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
 ; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
@@ -1861,7 +1758,6 @@ define i1 @not_iszero_bf16(bfloat %x) {
 ; GFX11CHECK-LABEL: not_iszero_bf16:
 ; GFX11CHECK:       ; %bb.0:
 ; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11CHECK-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX11CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX11CHECK-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v0
 ; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
@@ -1883,7 +1779,6 @@ define i1 @ispositive_bf16(bfloat %x) {
 ; GFX8CHECK-LABEL: ispositive_bf16:
 ; GFX8CHECK:       ; %bb.0:
 ; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8CHECK-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX8CHECK-NEXT:    s_movk_i32 s4, 0x7f81
 ; GFX8CHECK-NEXT:    v_cmp_gt_u16_e32 vcc, s4, v0
 ; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
@@ -1893,22 +1788,20 @@ define i1 @ispositive_bf16(bfloat %x) {
 ; GFX9CHECK:       ; %bb.0:
 ; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9CHECK-NEXT:    s_movk_i32 s4, 0x7f81
-; GFX9CHECK-NEXT:    v_cmp_lt_u16_sdwa s[4:5], v0, s4 src0_sel:WORD_1 src1_sel:DWORD
-; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GFX9CHECK-NEXT:    v_cmp_gt_u16_e32 vcc, s4, v0
+; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
 ; GFX9CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10CHECK-LABEL: ispositive_bf16:
 ; GFX10CHECK:       ; %bb.0:
 ; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10CHECK-NEXT:    v_mov_b32_e32 v1, 0x7f81
-; GFX10CHECK-NEXT:    v_cmp_lt_u16_sdwa s4, v0, v1 src0_sel:WORD_1 src1_sel:DWORD
-; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s4
+; GFX10CHECK-NEXT:    v_cmp_gt_u16_e32 vcc_lo, 0x7f81, v0
+; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
 ; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11CHECK-LABEL: ispositive_bf16:
 ; GFX11CHECK:       ; %bb.0:
 ; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11CHECK-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX11CHECK-NEXT:    v_cmp_gt_u16_e32 vcc_lo, 0x7f81, v0
 ; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
 ; GFX11CHECK-NEXT:    s_setpc_b64 s[30:31]
@@ -1938,7 +1831,6 @@ define i1 @not_ispositive_bf16(bfloat %x) {
 ; GFX8CHECK-LABEL: not_ispositive_bf16:
 ; GFX8CHECK:       ; %bb.0:
 ; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8CHECK-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX8CHECK-NEXT:    v_and_b32_e32 v1, 0x7fff, v0
 ; GFX8CHECK-NEXT:    s_movk_i32 s6, 0x7f80
 ; GFX8CHECK-NEXT:    v_cmp_gt_i16_e32 vcc, 0, v0
@@ -1955,7 +1847,6 @@ define i1 @not_ispositive_bf16(bfloat %x) {
 ; GFX9CHECK-LABEL: not_ispositive_bf16:
 ; GFX9CHECK:       ; %bb.0:
 ; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9CHECK-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX9CHECK-NEXT:    v_and_b32_e32 v1, 0x7fff, v0
 ; GFX9CHECK-NEXT:    s_movk_i32 s6, 0x7f80
 ; GFX9CHECK-NEXT:    v_cmp_gt_i16_e32 vcc, 0, v0
@@ -1972,7 +1863,6 @@ define i1 @not_ispositive_bf16(bfloat %x) {
 ; GFX10CHECK-LABEL: not_ispositive_bf16:
 ; GFX10CHECK:       ; %bb.0:
 ; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10CHECK-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX10CHECK-NEXT:    v_and_b32_e32 v1, 0x7fff, v0
 ; GFX10CHECK-NEXT:    v_cmp_gt_i16_e32 vcc_lo, 0, v0
 ; GFX10CHECK-NEXT:    v_cmp_eq_u16_e64 s5, 0xff80, v0
@@ -1987,7 +1877,6 @@ define i1 @not_ispositive_bf16(bfloat %x) {
 ; GFX11CHECK-LABEL: not_ispositive_bf16:
 ; GFX11CHECK:       ; %bb.0:
 ; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11CHECK-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX11CHECK-NEXT:    v_and_b32_e32 v1, 0x7fff, v0
 ; GFX11CHECK-NEXT:    v_cmp_gt_i16_e32 vcc_lo, 0, v0
 ; GFX11CHECK-NEXT:    v_cmp_eq_u16_e64 s1, 0xff80, v0
@@ -2022,7 +1911,6 @@ define i1 @isnegative_bf16(bfloat %x) {
 ; GFX8CHECK-LABEL: isnegative_bf16:
 ; GFX8CHECK:       ; %bb.0:
 ; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8CHECK-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX8CHECK-NEXT:    v_and_b32_e32 v1, 0x7fff, v0
 ; GFX8CHECK-NEXT:    s_movk_i32 s4, 0x7f80
 ; GFX8CHECK-NEXT:    v_cmp_gt_i16_e32 vcc, 0, v0
@@ -2037,7 +1925,6 @@ define i1 @isnegative_bf16(bfloat %x) {
 ; GFX9CHECK-LABEL: isnegative_bf16:
 ; GFX9CHECK:       ; %bb.0:
 ; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9CHECK-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX9CHECK-NEXT:    v_and_b32_e32 v1, 0x7fff, v0
 ; GFX9CHECK-NEXT:    s_movk_i32 s4, 0x7f80
 ; GFX9CHECK-NEXT:    v_cmp_gt_i16_e32 vcc, 0, v0
@@ -2052,7 +1939,6 @@ define i1 @isnegative_bf16(bfloat %x) {
 ; GFX10CHECK-LABEL: isnegative_bf16:
 ; GFX10CHECK:       ; %bb.0:
 ; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10CHECK-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX10CHECK-NEXT:    v_and_b32_e32 v1, 0x7fff, v0
 ; GFX10CHECK-NEXT:    v_cmp_gt_i16_e32 vcc_lo, 0, v0
 ; GFX10CHECK-NEXT:    v_cmp_eq_u16_e64 s5, 0xff80, v0
@@ -2065,7 +1951,6 @@ define i1 @isnegative_bf16(bfloat %x) {
 ; GFX11CHECK-LABEL: isnegative_bf16:
 ; GFX11CHECK:       ; %bb.0:
 ; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11CHECK-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX11CHECK-NEXT:    v_and_b32_e32 v1, 0x7fff, v0
 ; GFX11CHECK-NEXT:    v_cmp_gt_i16_e32 vcc_lo, 0, v0
 ; GFX11CHECK-NEXT:    v_cmp_eq_u16_e64 s1, 0xff80, v0
@@ -2095,7 +1980,6 @@ define i1 @not_isnegative_bf16(bfloat %x) {
 ; GFX8CHECK-LABEL: not_isnegative_bf16:
 ; GFX8CHECK:       ; %bb.0:
 ; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8CHECK-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX8CHECK-NEXT:    s_movk_i32 s4, 0x7f81
 ; GFX8CHECK-NEXT:    v_cmp_gt_u16_e32 vcc, s4, v0
 ; GFX8CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
@@ -2108,7 +1992,6 @@ define i1 @not_isnegative_bf16(bfloat %x) {
 ; GFX9CHECK-LABEL: not_isnegative_bf16:
 ; GFX9CHECK:       ; %bb.0:
 ; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9CHECK-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX9CHECK-NEXT:    s_movk_i32 s4, 0x7f81
 ; GFX9CHECK-NEXT:    v_cmp_gt_u16_e32 vcc, s4, v0
 ; GFX9CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
@@ -2121,7 +2004,6 @@ define i1 @not_isnegative_bf16(bfloat %x) {
 ; GFX10CHECK-LABEL: not_isnegative_bf16:
 ; GFX10CHECK:       ; %bb.0:
 ; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10CHECK-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX10CHECK-NEXT:    v_and_b32_e32 v1, 0x7fff, v0
 ; GFX10CHECK-NEXT:    v_cmp_gt_u16_e32 vcc_lo, 0x7f81, v0
 ; GFX10CHECK-NEXT:    v_cmp_lt_i16_e64 s4, 0x7f80, v1
@@ -2132,7 +2014,6 @@ define i1 @not_isnegative_bf16(bfloat %x) {
 ; GFX11CHECK-LABEL: not_isnegative_bf16:
 ; GFX11CHECK:       ; %bb.0:
 ; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11CHECK-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX11CHECK-NEXT:    v_and_b32_e32 v1, 0x7fff, v0
 ; GFX11CHECK-NEXT:    v_cmp_gt_u16_e32 vcc_lo, 0x7f81, v0
 ; GFX11CHECK-NEXT:    v_cmp_lt_i16_e64 s0, 0x7f80, v1
@@ -2158,8 +2039,7 @@ define i1 @iszero_or_nan_bf16(bfloat %x) {
 ; GFX8CHECK-LABEL: iszero_or_nan_bf16:
 ; GFX8CHECK:       ; %bb.0: ; %entry
 ; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8CHECK-NEXT:    v_mov_b32_e32 v1, 0x7fff
-; GFX8CHECK-NEXT:    v_and_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX8CHECK-NEXT:    s_movk_i32 s4, 0x7f80
 ; GFX8CHECK-NEXT:    v_cmp_lt_i16_e32 vcc, s4, v0
 ; GFX8CHECK-NEXT:    v_cmp_eq_u16_e64 s[4:5], 0, v0
@@ -2170,8 +2050,7 @@ define i1 @iszero_or_nan_bf16(bfloat %x) {
 ; GFX9CHECK-LABEL: iszero_or_nan_bf16:
 ; GFX9CHECK:       ; %bb.0: ; %entry
 ; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9CHECK-NEXT:    s_movk_i32 s4, 0x7fff
-; GFX9CHECK-NEXT:    v_and_b32_sdwa v0, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX9CHECK-NEXT:    s_movk_i32 s4, 0x7f80
 ; GFX9CHECK-NEXT:    v_cmp_lt_i16_e32 vcc, s4, v0
 ; GFX9CHECK-NEXT:    v_cmp_eq_u16_e64 s[4:5], 0, v0
@@ -2182,8 +2061,7 @@ define i1 @iszero_or_nan_bf16(bfloat %x) {
 ; GFX10CHECK-LABEL: iszero_or_nan_bf16:
 ; GFX10CHECK:       ; %bb.0: ; %entry
 ; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10CHECK-NEXT:    v_mov_b32_e32 v1, 0x7fff
-; GFX10CHECK-NEXT:    v_and_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX10CHECK-NEXT:    v_cmp_lt_i16_e32 vcc_lo, 0x7f80, v0
 ; GFX10CHECK-NEXT:    v_cmp_eq_u16_e64 s4, 0, v0
 ; GFX10CHECK-NEXT:    s_or_b32 s4, s4, vcc_lo
@@ -2193,7 +2071,6 @@ define i1 @iszero_or_nan_bf16(bfloat %x) {
 ; GFX11CHECK-LABEL: iszero_or_nan_bf16:
 ; GFX11CHECK:       ; %bb.0: ; %entry
 ; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11CHECK-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX11CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX11CHECK-NEXT:    v_cmp_lt_i16_e32 vcc_lo, 0x7f80, v0
 ; GFX11CHECK-NEXT:    v_cmp_eq_u16_e64 s0, 0, v0
@@ -2220,8 +2097,7 @@ define i1 @iszero_or_nan_f_daz(bfloat %x) #0 {
 ; GFX8CHECK-LABEL: iszero_or_nan_f_daz:
 ; GFX8CHECK:       ; %bb.0: ; %entry
 ; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8CHECK-NEXT:    v_mov_b32_e32 v1, 0x7fff
-; GFX8CHECK-NEXT:    v_and_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX8CHECK-NEXT:    s_movk_i32 s4, 0x7f80
 ; GFX8CHECK-NEXT:    v_cmp_lt_i16_e32 vcc, s4, v0
 ; GFX8CHECK-NEXT:    v_cmp_eq_u16_e64 s[4:5], 0, v0
@@ -2232,8 +2108,7 @@ define i1 @iszero_or_nan_f_daz(bfloat %x) #0 {
 ; GFX9CHECK-LABEL: iszero_or_nan_f_daz:
 ; GFX9CHECK:       ; %bb.0: ; %entry
 ; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9CHECK-NEXT:    s_movk_i32 s4, 0x7fff
-; GFX9CHECK-NEXT:    v_and_b32_sdwa v0, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX9CHECK-NEXT:    s_movk_i32 s4, 0x7f80
 ; GFX9CHECK-NEXT:    v_cmp_lt_i16_e32 vcc, s4, v0
 ; GFX9CHECK-NEXT:    v_cmp_eq_u16_e64 s[4:5], 0, v0
@@ -2244,8 +2119,7 @@ define i1 @iszero_or_nan_f_daz(bfloat %x) #0 {
 ; GFX10CHECK-LABEL: iszero_or_nan_f_daz:
 ; GFX10CHECK:       ; %bb.0: ; %entry
 ; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10CHECK-NEXT:    v_mov_b32_e32 v1, 0x7fff
-; GFX10CHECK-NEXT:    v_and_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX10CHECK-NEXT:    v_cmp_lt_i16_e32 vcc_lo, 0x7f80, v0
 ; GFX10CHECK-NEXT:    v_cmp_eq_u16_e64 s4, 0, v0
 ; GFX10CHECK-NEXT:    s_or_b32 s4, s4, vcc_lo
@@ -2255,7 +2129,6 @@ define i1 @iszero_or_nan_f_daz(bfloat %x) #0 {
 ; GFX11CHECK-LABEL: iszero_or_nan_f_daz:
 ; GFX11CHECK:       ; %bb.0: ; %entry
 ; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11CHECK-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX11CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX11CHECK-NEXT:    v_cmp_lt_i16_e32 vcc_lo, 0x7f80, v0
 ; GFX11CHECK-NEXT:    v_cmp_eq_u16_e64 s0, 0, v0
@@ -2282,8 +2155,7 @@ define i1 @iszero_or_nan_f_maybe_daz(bfloat %x) #1 {
 ; GFX8CHECK-LABEL: iszero_or_nan_f_maybe_daz:
 ; GFX8CHECK:       ; %bb.0: ; %entry
 ; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8CHECK-NEXT:    v_mov_b32_e32 v1, 0x7fff
-; GFX8CHECK-NEXT:    v_and_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX8CHECK-NEXT:    s_movk_i32 s4, 0x7f80
 ; GFX8CHECK-NEXT:    v_cmp_lt_i16_e32 vcc, s4, v0
 ; GFX8CHECK-NEXT:    v_cmp_eq_u16_e64 s[4:5], 0, v0
@@ -2294,8 +2166,7 @@ define i1 @iszero_or_nan_f_maybe_daz(bfloat %x) #1 {
 ; GFX9CHECK-LABEL: iszero_or_nan_f_maybe_daz:
 ; GFX9CHECK:       ; %bb.0: ; %entry
 ; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9CHECK-NEXT:    s_movk_i32 s4, 0x7fff
-; GFX9CHECK-NEXT:    v_and_b32_sdwa v0, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX9CHECK-NEXT:    s_movk_i32 s4, 0x7f80
 ; GFX9CHECK-NEXT:    v_cmp_lt_i16_e32 vcc, s4, v0
 ; GFX9CHECK-NEXT:    v_cmp_eq_u16_e64 s[4:5], 0, v0
@@ -2306,8 +2177,7 @@ define i1 @iszero_or_nan_f_maybe_daz(bfloat %x) #1 {
 ; GFX10CHECK-LABEL: iszero_or_nan_f_maybe_daz:
 ; GFX10CHECK:       ; %bb.0: ; %entry
 ; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10CHECK-NEXT:    v_mov_b32_e32 v1, 0x7fff
-; GFX10CHECK-NEXT:    v_and_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX10CHECK-NEXT:    v_cmp_lt_i16_e32 vcc_lo, 0x7f80, v0
 ; GFX10CHECK-NEXT:    v_cmp_eq_u16_e64 s4, 0, v0
 ; GFX10CHECK-NEXT:    s_or_b32 s4, s4, vcc_lo
@@ -2317,7 +2187,6 @@ define i1 @iszero_or_nan_f_maybe_daz(bfloat %x) #1 {
 ; GFX11CHECK-LABEL: iszero_or_nan_f_maybe_daz:
 ; GFX11CHECK:       ; %bb.0: ; %entry
 ; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11CHECK-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX11CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX11CHECK-NEXT:    v_cmp_lt_i16_e32 vcc_lo, 0x7f80, v0
 ; GFX11CHECK-NEXT:    v_cmp_eq_u16_e64 s0, 0, v0
@@ -2344,8 +2213,7 @@ define i1 @not_iszero_or_nan_bf16(bfloat %x) {
 ; GFX8CHECK-LABEL: not_iszero_or_nan_bf16:
 ; GFX8CHECK:       ; %bb.0: ; %entry
 ; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8CHECK-NEXT:    v_mov_b32_e32 v1, 0x7fff
-; GFX8CHECK-NEXT:    v_and_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX8CHECK-NEXT:    s_movk_i32 s4, 0x7f81
 ; GFX8CHECK-NEXT:    v_cmp_gt_i16_e32 vcc, s4, v0
 ; GFX8CHECK-NEXT:    v_cmp_ne_u16_e64 s[4:5], 0, v0
@@ -2356,8 +2224,7 @@ define i1 @not_iszero_or_nan_bf16(bfloat %x) {
 ; GFX9CHECK-LABEL: not_iszero_or_nan_bf16:
 ; GFX9CHECK:       ; %bb.0: ; %entry
 ; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9CHECK-NEXT:    s_movk_i32 s4, 0x7fff
-; GFX9CHECK-NEXT:    v_and_b32_sdwa v0, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX9CHECK-NEXT:    s_movk_i32 s4, 0x7f81
 ; GFX9CHECK-NEXT:    v_cmp_gt_i16_e32 vcc, s4, v0
 ; GFX9CHECK-NEXT:    v_cmp_ne_u16_e64 s[4:5], 0, v0
@@ -2368,8 +2235,7 @@ define i1 @not_iszero_or_nan_bf16(bfloat %x) {
 ; GFX10CHECK-LABEL: not_iszero_or_nan_bf16:
 ; GFX10CHECK:       ; %bb.0: ; %entry
 ; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10CHECK-NEXT:    v_mov_b32_e32 v1, 0x7fff
-; GFX10CHECK-NEXT:    v_and_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX10CHECK-NEXT:    v_cmp_gt_i16_e32 vcc_lo, 0x7f81, v0
 ; GFX10CHECK-NEXT:    v_cmp_ne_u16_e64 s4, 0, v0
 ; GFX10CHECK-NEXT:    s_and_b32 s4, s4, vcc_lo
@@ -2379,7 +2245,6 @@ define i1 @not_iszero_or_nan_bf16(bfloat %x) {
 ; GFX11CHECK-LABEL: not_iszero_or_nan_bf16:
 ; GFX11CHECK:       ; %bb.0: ; %entry
 ; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11CHECK-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX11CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX11CHECK-NEXT:    v_cmp_gt_i16_e32 vcc_lo, 0x7f81, v0
 ; GFX11CHECK-NEXT:    v_cmp_ne_u16_e64 s0, 0, v0
@@ -2406,8 +2271,7 @@ define i1 @not_iszero_or_nan_f_daz(bfloat %x) #0 {
 ; GFX8CHECK-LABEL: not_iszero_or_nan_f_daz:
 ; GFX8CHECK:       ; %bb.0: ; %entry
 ; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8CHECK-NEXT:    v_mov_b32_e32 v1, 0x7fff
-; GFX8CHECK-NEXT:    v_and_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX8CHECK-NEXT:    s_movk_i32 s4, 0x7f81
 ; GFX8CHECK-NEXT:    v_cmp_gt_i16_e32 vcc, s4, v0
 ; GFX8CHECK-NEXT:    v_cmp_ne_u16_e64 s[4:5], 0, v0
@@ -2418,8 +2282,7 @@ define i1 @not_iszero_or_nan_f_daz(bfloat %x) #0 {
 ; GFX9CHECK-LABEL: not_iszero_or_nan_f_daz:
 ; GFX9CHECK:       ; %bb.0: ; %entry
 ; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9CHECK-NEXT:    s_movk_i32 s4, 0x7fff
-; GFX9CHECK-NEXT:    v_and_b32_sdwa v0, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX9CHECK-NEXT:    s_movk_i32 s4, 0x7f81
 ; GFX9CHECK-NEXT:    v_cmp_gt_i16_e32 vcc, s4, v0
 ; GFX9CHECK-NEXT:    v_cmp_ne_u16_e64 s[4:5], 0, v0
@@ -2430,8 +2293,7 @@ define i1 @not_iszero_or_nan_f_daz(bfloat %x) #0 {
 ; GFX10CHECK-LABEL: not_iszero_or_nan_f_daz:
 ; GFX10CHECK:       ; %bb.0: ; %entry
 ; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10CHECK-NEXT:    v_mov_b32_e32 v1, 0x7fff
-; GFX10CHECK-NEXT:    v_and_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX10CHECK-NEXT:    v_cmp_gt_i16_e32 vcc_lo, 0x7f81, v0
 ; GFX10CHECK-NEXT:    v_cmp_ne_u16_e64 s4, 0, v0
 ; GFX10CHECK-NEXT:    s_and_b32 s4, s4, vcc_lo
@@ -2441,7 +2303,6 @@ define i1 @not_iszero_or_nan_f_daz(bfloat %x) #0 {
 ; GFX11CHECK-LABEL: not_iszero_or_nan_f_daz:
 ; GFX11CHECK:       ; %bb.0: ; %entry
 ; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11CHECK-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX11CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX11CHECK-NEXT:    v_cmp_gt_i16_e32 vcc_lo, 0x7f81, v0
 ; GFX11CHECK-NEXT:    v_cmp_ne_u16_e64 s0, 0, v0
@@ -2468,8 +2329,7 @@ define i1 @not_iszero_or_nan_f_maybe_daz(bfloat %x) #1 {
 ; GFX8CHECK-LABEL: not_iszero_or_nan_f_maybe_daz:
 ; GFX8CHECK:       ; %bb.0: ; %entry
 ; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8CHECK-NEXT:    v_mov_b32_e32 v1, 0x7fff
-; GFX8CHECK-NEXT:    v_and_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX8CHECK-NEXT:    s_movk_i32 s4, 0x7f81
 ; GFX8CHECK-NEXT:    v_cmp_gt_i16_e32 vcc, s4, v0
 ; GFX8CHECK-NEXT:    v_cmp_ne_u16_e64 s[4:5], 0, v0
@@ -2480,8 +2340,7 @@ define i1 @not_iszero_or_nan_f_maybe_daz(bfloat %x) #1 {
 ; GFX9CHECK-LABEL: not_iszero_or_nan_f_maybe_daz:
 ; GFX9CHECK:       ; %bb.0: ; %entry
 ; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9CHECK-NEXT:    s_movk_i32 s4, 0x7fff
-; GFX9CHECK-NEXT:    v_and_b32_sdwa v0, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX9CHECK-NEXT:    s_movk_i32 s4, 0x7f81
 ; GFX9CHECK-NEXT:    v_cmp_gt_i16_e32 vcc, s4, v0
 ; GFX9CHECK-NEXT:    v_cmp_ne_u16_e64 s[4:5], 0, v0
@@ -2492,8 +2351,7 @@ define i1 @not_iszero_or_nan_f_maybe_daz(bfloat %x) #1 {
 ; GFX10CHECK-LABEL: not_iszero_or_nan_f_maybe_daz:
 ; GFX10CHECK:       ; %bb.0: ; %entry
 ; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10CHECK-NEXT:    v_mov_b32_e32 v1, 0x7fff
-; GFX10CHECK-NEXT:    v_and_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX10CHECK-NEXT:    v_cmp_gt_i16_e32 vcc_lo, 0x7f81, v0
 ; GFX10CHECK-NEXT:    v_cmp_ne_u16_e64 s4, 0, v0
 ; GFX10CHECK-NEXT:    s_and_b32 s4, s4, vcc_lo
@@ -2503,7 +2361,6 @@ define i1 @not_iszero_or_nan_f_maybe_daz(bfloat %x) #1 {
 ; GFX11CHECK-LABEL: not_iszero_or_nan_f_maybe_daz:
 ; GFX11CHECK:       ; %bb.0: ; %entry
 ; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11CHECK-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX11CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX11CHECK-NEXT:    v_cmp_gt_i16_e32 vcc_lo, 0x7f81, v0
 ; GFX11CHECK-NEXT:    v_cmp_ne_u16_e64 s0, 0, v0
@@ -2530,8 +2387,7 @@ define i1 @iszero_or_qnan_bf16(bfloat %x) {
 ; GFX8CHECK-LABEL: iszero_or_qnan_bf16:
 ; GFX8CHECK:       ; %bb.0: ; %entry
 ; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8CHECK-NEXT:    v_mov_b32_e32 v1, 0x7fff
-; GFX8CHECK-NEXT:    v_and_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX8CHECK-NEXT:    s_movk_i32 s4, 0x7fbf
 ; GFX8CHECK-NEXT:    v_cmp_lt_i16_e32 vcc, s4, v0
 ; GFX8CHECK-NEXT:    v_cmp_eq_u16_e64 s[4:5], 0, v0
@@ -2542,8 +2398,7 @@ define i1 @iszero_or_qnan_bf16(bfloat %x) {
 ; GFX9CHECK-LABEL: iszero_or_qnan_bf16:
 ; GFX9CHECK:       ; %bb.0: ; %entry
 ; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9CHECK-NEXT:    s_movk_i32 s4, 0x7fff
-; GFX9CHECK-NEXT:    v_and_b32_sdwa v0, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX9CHECK-NEXT:    s_movk_i32 s4, 0x7fbf
 ; GFX9CHECK-NEXT:    v_cmp_lt_i16_e32 vcc, s4, v0
 ; GFX9CHECK-NEXT:    v_cmp_eq_u16_e64 s[4:5], 0, v0
@@ -2554,8 +2409,7 @@ define i1 @iszero_or_qnan_bf16(bfloat %x) {
 ; GFX10CHECK-LABEL: iszero_or_qnan_bf16:
 ; GFX10CHECK:       ; %bb.0: ; %entry
 ; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10CHECK-NEXT:    v_mov_b32_e32 v1, 0x7fff
-; GFX10CHECK-NEXT:    v_and_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX10CHECK-NEXT:    v_cmp_lt_i16_e32 vcc_lo, 0x7fbf, v0
 ; GFX10CHECK-NEXT:    v_cmp_eq_u16_e64 s4, 0, v0
 ; GFX10CHECK-NEXT:    s_or_b32 s4, s4, vcc_lo
@@ -2565,7 +2419,6 @@ define i1 @iszero_or_qnan_bf16(bfloat %x) {
 ; GFX11CHECK-LABEL: iszero_or_qnan_bf16:
 ; GFX11CHECK:       ; %bb.0: ; %entry
 ; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11CHECK-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX11CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX11CHECK-NEXT:    v_cmp_lt_i16_e32 vcc_lo, 0x7fbf, v0
 ; GFX11CHECK-NEXT:    v_cmp_eq_u16_e64 s0, 0, v0
@@ -2595,8 +2448,7 @@ define i1 @iszero_or_snan_bf16(bfloat %x) {
 ; GFX8CHECK-LABEL: iszero_or_snan_bf16:
 ; GFX8CHECK:       ; %bb.0: ; %entry
 ; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8CHECK-NEXT:    v_mov_b32_e32 v1, 0x7fff
-; GFX8CHECK-NEXT:    v_and_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX8CHECK-NEXT:    s_movk_i32 s4, 0x7fc0
 ; GFX8CHECK-NEXT:    v_cmp_gt_i16_e32 vcc, s4, v0
 ; GFX8CHECK-NEXT:    s_movk_i32 s4, 0x7f80
@@ -2610,8 +2462,7 @@ define i1 @iszero_or_snan_bf16(bfloat %x) {
 ; GFX9CHECK-LABEL: iszero_or_snan_bf16:
 ; GFX9CHECK:       ; %bb.0: ; %entry
 ; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9CHECK-NEXT:    s_movk_i32 s4, 0x7fff
-; GFX9CHECK-NEXT:    v_and_b32_sdwa v0, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX9CHECK-NEXT:    s_movk_i32 s4, 0x7fc0
 ; GFX9CHECK-NEXT:    v_cmp_gt_i16_e32 vcc, s4, v0
 ; GFX9CHECK-NEXT:    s_movk_i32 s4, 0x7f80
@@ -2625,8 +2476,7 @@ define i1 @iszero_or_snan_bf16(bfloat %x) {
 ; GFX10CHECK-LABEL: iszero_or_snan_bf16:
 ; GFX10CHECK:       ; %bb.0: ; %entry
 ; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10CHECK-NEXT:    v_mov_b32_e32 v1, 0x7fff
-; GFX10CHECK-NEXT:    v_and_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX10CHECK-NEXT:    v_cmp_gt_i16_e32 vcc_lo, 0x7fc0, v0
 ; GFX10CHECK-NEXT:    v_cmp_lt_i16_e64 s4, 0x7f80, v0
 ; GFX10CHECK-NEXT:    v_cmp_eq_u16_e64 s5, 0, v0
@@ -2638,7 +2488,6 @@ define i1 @iszero_or_snan_bf16(bfloat %x) {
 ; GFX11CHECK-LABEL: iszero_or_snan_bf16:
 ; GFX11CHECK:       ; %bb.0: ; %entry
 ; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11CHECK-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX11CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX11CHECK-NEXT:    v_cmp_gt_i16_e32 vcc_lo, 0x7fc0, v0
 ; GFX11CHECK-NEXT:    v_cmp_lt_i16_e64 s0, 0x7f80, v0
@@ -2679,8 +2528,7 @@ define i1 @not_iszero_or_qnan_bf16(bfloat %x) {
 ; GFX8CHECK-LABEL: not_iszero_or_qnan_bf16:
 ; GFX8CHECK:       ; %bb.0: ; %entry
 ; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8CHECK-NEXT:    v_mov_b32_e32 v1, 0x7fff
-; GFX8CHECK-NEXT:    v_and_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX8CHECK-NEXT:    s_movk_i32 s4, 0x7fc0
 ; GFX8CHECK-NEXT:    s_movk_i32 s8, 0x7f80
 ; GFX8CHECK-NEXT:    v_cmp_gt_i16_e32 vcc, s4, v0
@@ -2702,8 +2550,7 @@ define i1 @not_iszero_or_qnan_bf16(bfloat %x) {
 ; GFX9CHECK-LABEL: not_iszero_or_qnan_bf16:
 ; GFX9CHECK:       ; %bb.0: ; %entry
 ; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9CHECK-NEXT:    s_movk_i32 s4, 0x7fff
-; GFX9CHECK-NEXT:    v_and_b32_sdwa v0, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX9CHECK-NEXT:    s_movk_i32 s4, 0x7fc0
 ; GFX9CHECK-NEXT:    s_movk_i32 s8, 0x7f80
 ; GFX9CHECK-NEXT:    v_cmp_gt_i16_e32 vcc, s4, v0
@@ -2725,8 +2572,7 @@ define i1 @not_iszero_or_qnan_bf16(bfloat %x) {
 ; GFX10CHECK-LABEL: not_iszero_or_qnan_bf16:
 ; GFX10CHECK:       ; %bb.0: ; %entry
 ; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10CHECK-NEXT:    v_mov_b32_e32 v1, 0x7fff
-; GFX10CHECK-NEXT:    v_and_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX10CHECK-NEXT:    v_add_nc_u16 v1, v0, -1
 ; GFX10CHECK-NEXT:    v_cmp_gt_i16_e32 vcc_lo, 0x7fc0, v0
 ; GFX10CHECK-NEXT:    v_cmp_lt_i16_e64 s4, 0x7f80, v0
@@ -2744,7 +2590,6 @@ define i1 @not_iszero_or_qnan_bf16(bfloat %x) {
 ; GFX11CHECK-LABEL: not_iszero_or_qnan_bf16:
 ; GFX11CHECK:       ; %bb.0: ; %entry
 ; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11CHECK-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX11CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX11CHECK-NEXT:    v_add_nc_u16 v1, v0, -1
 ; GFX11CHECK-NEXT:    v_cmp_gt_i16_e32 vcc_lo, 0x7fc0, v0
@@ -2789,8 +2634,7 @@ define i1 @not_iszero_or_snan_bf16(bfloat %x) {
 ; GFX8CHECK-LABEL: not_iszero_or_snan_bf16:
 ; GFX8CHECK:       ; %bb.0: ; %entry
 ; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8CHECK-NEXT:    v_mov_b32_e32 v1, 0x7fff
-; GFX8CHECK-NEXT:    v_and_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX8CHECK-NEXT:    s_movk_i32 s4, 0x7f80
 ; GFX8CHECK-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v0
 ; GFX8CHECK-NEXT:    v_add_u16_e32 v1, -1, v0
@@ -2810,8 +2654,7 @@ define i1 @not_iszero_or_snan_bf16(bfloat %x) {
 ; GFX9CHECK-LABEL: not_iszero_or_snan_bf16:
 ; GFX9CHECK:       ; %bb.0: ; %entry
 ; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9CHECK-NEXT:    s_movk_i32 s4, 0x7fff
-; GFX9CHECK-NEXT:    v_and_b32_sdwa v0, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX9CHECK-NEXT:    s_movk_i32 s4, 0x7f80
 ; GFX9CHECK-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v0
 ; GFX9CHECK-NEXT:    v_add_u16_e32 v1, -1, v0
@@ -2831,8 +2674,7 @@ define i1 @not_iszero_or_snan_bf16(bfloat %x) {
 ; GFX10CHECK-LABEL: not_iszero_or_snan_bf16:
 ; GFX10CHECK:       ; %bb.0: ; %entry
 ; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10CHECK-NEXT:    v_mov_b32_e32 v1, 0x7fff
-; GFX10CHECK-NEXT:    v_and_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX10CHECK-NEXT:    v_add_nc_u16 v1, v0, -1
 ; GFX10CHECK-NEXT:    v_add_nc_u16 v2, v0, 0xff80
 ; GFX10CHECK-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x7f80, v0
@@ -2848,7 +2690,6 @@ define i1 @not_iszero_or_snan_bf16(bfloat %x) {
 ; GFX11CHECK-LABEL: not_iszero_or_snan_bf16:
 ; GFX11CHECK:       ; %bb.0: ; %entry
 ; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11CHECK-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX11CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX11CHECK-NEXT:    v_add_nc_u16 v1, v0, -1
 ; GFX11CHECK-NEXT:    v_add_nc_u16 v2, v0, 0xff80
@@ -2879,8 +2720,7 @@ define i1 @isinf_or_nan_bf16(bfloat %x) {
 ; GFX8CHECK-LABEL: isinf_or_nan_bf16:
 ; GFX8CHECK:       ; %bb.0: ; %entry
 ; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8CHECK-NEXT:    v_mov_b32_e32 v1, 0x7fff
-; GFX8CHECK-NEXT:    v_and_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX8CHECK-NEXT:    s_movk_i32 s4, 0x7f7f
 ; GFX8CHECK-NEXT:    v_cmp_lt_i16_e32 vcc, s4, v0
 ; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
@@ -2889,8 +2729,7 @@ define i1 @isinf_or_nan_bf16(bfloat %x) {
 ; GFX9CHECK-LABEL: isinf_or_nan_bf16:
 ; GFX9CHECK:       ; %bb.0: ; %entry
 ; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9CHECK-NEXT:    s_movk_i32 s4, 0x7fff
-; GFX9CHECK-NEXT:    v_and_b32_sdwa v0, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX9CHECK-NEXT:    s_movk_i32 s4, 0x7f7f
 ; GFX9CHECK-NEXT:    v_cmp_lt_i16_e32 vcc, s4, v0
 ; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
@@ -2899,8 +2738,7 @@ define i1 @isinf_or_nan_bf16(bfloat %x) {
 ; GFX10CHECK-LABEL: isinf_or_nan_bf16:
 ; GFX10CHECK:       ; %bb.0: ; %entry
 ; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10CHECK-NEXT:    v_mov_b32_e32 v1, 0x7fff
-; GFX10CHECK-NEXT:    v_and_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX10CHECK-NEXT:    v_cmp_lt_i16_e32 vcc_lo, 0x7f7f, v0
 ; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
 ; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
@@ -2908,7 +2746,6 @@ define i1 @isinf_or_nan_bf16(bfloat %x) {
 ; GFX11CHECK-LABEL: isinf_or_nan_bf16:
 ; GFX11CHECK:       ; %bb.0: ; %entry
 ; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11CHECK-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX11CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX11CHECK-NEXT:    v_cmp_lt_i16_e32 vcc_lo, 0x7f7f, v0
 ; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
@@ -2931,8 +2768,7 @@ define i1 @not_isinf_or_nan_bf16(bfloat %x) {
 ; GFX8CHECK-LABEL: not_isinf_or_nan_bf16:
 ; GFX8CHECK:       ; %bb.0: ; %entry
 ; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8CHECK-NEXT:    v_mov_b32_e32 v1, 0x7fff
-; GFX8CHECK-NEXT:    v_and_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX8CHECK-NEXT:    s_movk_i32 s4, 0x7f80
 ; GFX8CHECK-NEXT:    v_cmp_gt_i16_e32 vcc, s4, v0
 ; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
@@ -2941,8 +2777,7 @@ define i1 @not_isinf_or_nan_bf16(bfloat %x) {
 ; GFX9CHECK-LABEL: not_isinf_or_nan_bf16:
 ; GFX9CHECK:       ; %bb.0: ; %entry
 ; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9CHECK-NEXT:    s_movk_i32 s4, 0x7fff
-; GFX9CHECK-NEXT:    v_and_b32_sdwa v0, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX9CHECK-NEXT:    s_movk_i32 s4, 0x7f80
 ; GFX9CHECK-NEXT:    v_cmp_gt_i16_e32 vcc, s4, v0
 ; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
@@ -2951,8 +2786,7 @@ define i1 @not_isinf_or_nan_bf16(bfloat %x) {
 ; GFX10CHECK-LABEL: not_isinf_or_nan_bf16:
 ; GFX10CHECK:       ; %bb.0: ; %entry
 ; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10CHECK-NEXT:    v_mov_b32_e32 v1, 0x7fff
-; GFX10CHECK-NEXT:    v_and_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX10CHECK-NEXT:    v_cmp_gt_i16_e32 vcc_lo, 0x7f80, v0
 ; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
 ; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
@@ -2960,7 +2794,6 @@ define i1 @not_isinf_or_nan_bf16(bfloat %x) {
 ; GFX11CHECK-LABEL: not_isinf_or_nan_bf16:
 ; GFX11CHECK:       ; %bb.0: ; %entry
 ; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11CHECK-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX11CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX11CHECK-NEXT:    v_cmp_gt_i16_e32 vcc_lo, 0x7f80, v0
 ; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
@@ -2983,8 +2816,7 @@ define i1 @isfinite_or_nan_f(bfloat %x) {
 ; GFX8CHECK-LABEL: isfinite_or_nan_f:
 ; GFX8CHECK:       ; %bb.0: ; %entry
 ; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8CHECK-NEXT:    v_mov_b32_e32 v1, 0x7fff
-; GFX8CHECK-NEXT:    v_and_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX8CHECK-NEXT:    s_movk_i32 s4, 0x7f80
 ; GFX8CHECK-NEXT:    v_cmp_ne_u16_e32 vcc, s4, v0
 ; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
@@ -2993,8 +2825,7 @@ define i1 @isfinite_or_nan_f(bfloat %x) {
 ; GFX9CHECK-LABEL: isfinite_or_nan_f:
 ; GFX9CHECK:       ; %bb.0: ; %entry
 ; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9CHECK-NEXT:    s_movk_i32 s4, 0x7fff
-; GFX9CHECK-NEXT:    v_and_b32_sdwa v0, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX9CHECK-NEXT:    s_movk_i32 s4, 0x7f80
 ; GFX9CHECK-NEXT:    v_cmp_ne_u16_e32 vcc, s4, v0
 ; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
@@ -3003,8 +2834,7 @@ define i1 @isfinite_or_nan_f(bfloat %x) {
 ; GFX10CHECK-LABEL: isfinite_or_nan_f:
 ; GFX10CHECK:       ; %bb.0: ; %entry
 ; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10CHECK-NEXT:    v_mov_b32_e32 v1, 0x7fff
-; GFX10CHECK-NEXT:    v_and_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX10CHECK-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0x7f80, v0
 ; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
 ; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
@@ -3012,7 +2842,6 @@ define i1 @isfinite_or_nan_f(bfloat %x) {
 ; GFX11CHECK-LABEL: isfinite_or_nan_f:
 ; GFX11CHECK:       ; %bb.0: ; %entry
 ; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11CHECK-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX11CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX11CHECK-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0x7f80, v0
 ; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
@@ -3035,8 +2864,7 @@ define i1 @not_isfinite_or_nan_f(bfloat %x) {
 ; GFX8CHECK-LABEL: not_isfinite_or_nan_f:
 ; GFX8CHECK:       ; %bb.0: ; %entry
 ; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8CHECK-NEXT:    v_mov_b32_e32 v1, 0x7fff
-; GFX8CHECK-NEXT:    v_and_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX8CHECK-NEXT:    s_movk_i32 s4, 0x7f80
 ; GFX8CHECK-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v0
 ; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
@@ -3045,8 +2873,7 @@ define i1 @not_isfinite_or_nan_f(bfloat %x) {
 ; GFX9CHECK-LABEL: not_isfinite_or_nan_f:
 ; GFX9CHECK:       ; %bb.0: ; %entry
 ; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9CHECK-NEXT:    s_movk_i32 s4, 0x7fff
-; GFX9CHECK-NEXT:    v_and_b32_sdwa v0, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX9CHECK-NEXT:    s_movk_i32 s4, 0x7f80
 ; GFX9CHECK-NEXT:    v_cmp_eq_u16_e32 vcc, s4, v0
 ; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
@@ -3055,8 +2882,7 @@ define i1 @not_isfinite_or_nan_f(bfloat %x) {
 ; GFX10CHECK-LABEL: not_isfinite_or_nan_f:
 ; GFX10CHECK:       ; %bb.0: ; %entry
 ; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10CHECK-NEXT:    v_mov_b32_e32 v1, 0x7fff
-; GFX10CHECK-NEXT:    v_and_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX10CHECK-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x7f80, v0
 ; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
 ; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
@@ -3064,7 +2890,6 @@ define i1 @not_isfinite_or_nan_f(bfloat %x) {
 ; GFX11CHECK-LABEL: not_isfinite_or_nan_f:
 ; GFX11CHECK:       ; %bb.0: ; %entry
 ; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11CHECK-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX11CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX11CHECK-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x7f80, v0
 ; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
@@ -3090,4 +2915,3 @@ attributes #1 = { "denormal-fp-math"="ieee,dynamic" }
 ; GFX7SELDAG: {{.*}}
 ; GFX8SELDAG: {{.*}}
 ; GFX9SELDAG: {{.*}}
-; GFX7GLISEL: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log.ll b/llvm/test/CodeGen/AMDGPU/llvm.log.ll
index a0b2d3b32b7957..1d5d39c6c0ba96 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.log.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.log.ll
@@ -6234,6 +6234,7 @@ define float @v_log_f32_from_fpext_bf16(bfloat %src) {
 ; VI-LABEL: v_log_f32_from_fpext_bf16:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; VI-NEXT:    s_mov_b32 s4, 0x800000
 ; VI-NEXT:    v_mov_b32_e32 v1, 0x4f800000
 ; VI-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
@@ -6260,6 +6261,7 @@ define float @v_log_f32_from_fpext_bf16(bfloat %src) {
 ; GFX900-LABEL: v_log_f32_from_fpext_bf16:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX900-NEXT:    s_mov_b32 s4, 0x800000
 ; GFX900-NEXT:    v_mov_b32_e32 v1, 0x4f800000
 ; GFX900-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
@@ -6283,22 +6285,23 @@ define float @v_log_f32_from_fpext_bf16(bfloat %src) {
 ; GFX1100-LABEL: v_log_f32_from_fpext_bf16:
 ; GFX1100:       ; %bb.0:
 ; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX1100-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
 ; GFX1100-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
 ; GFX1100-NEXT:    v_log_f32_e32 v0, v0
 ; GFX1100-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX1100-NEXT:    v_mul_f32_e32 v1, 0x3f317217, v0
 ; GFX1100-NEXT:    v_cmp_gt_f32_e64 s0, 0x7f800000, |v0|
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-NEXT:    v_fma_f32 v2, 0x3f317217, v0, -v1
-; GFX1100-NEXT:    v_fmamk_f32 v2, v0, 0x3377d1cf, v2
 ; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-NEXT:    v_fmamk_f32 v2, v0, 0x3377d1cf, v2
 ; GFX1100-NEXT:    v_add_f32_e32 v1, v1, v2
+; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX1100-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s0
 ; GFX1100-NEXT:    v_cndmask_b32_e64 v1, 0, 0x41b17218, vcc_lo
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1100-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; GFX1100-NEXT:    s_setpc_b64 s[30:31]
 ;
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log10.ll b/llvm/test/CodeGen/AMDGPU/llvm.log10.ll
index 5ba72612321a6a..dc78886a8c1087 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.log10.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.log10.ll
@@ -6234,6 +6234,7 @@ define float @v_log10_f32_from_fpext_bf16(bfloat %src) {
 ; VI-LABEL: v_log10_f32_from_fpext_bf16:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; VI-NEXT:    s_mov_b32 s4, 0x800000
 ; VI-NEXT:    v_mov_b32_e32 v1, 0x4f800000
 ; VI-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
@@ -6260,6 +6261,7 @@ define float @v_log10_f32_from_fpext_bf16(bfloat %src) {
 ; GFX900-LABEL: v_log10_f32_from_fpext_bf16:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX900-NEXT:    s_mov_b32 s4, 0x800000
 ; GFX900-NEXT:    v_mov_b32_e32 v1, 0x4f800000
 ; GFX900-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
@@ -6283,22 +6285,23 @@ define float @v_log10_f32_from_fpext_bf16(bfloat %src) {
 ; GFX1100-LABEL: v_log10_f32_from_fpext_bf16:
 ; GFX1100:       ; %bb.0:
 ; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX1100-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
 ; GFX1100-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
 ; GFX1100-NEXT:    v_log_f32_e32 v0, v0
 ; GFX1100-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX1100-NEXT:    v_mul_f32_e32 v1, 0x3e9a209a, v0
 ; GFX1100-NEXT:    v_cmp_gt_f32_e64 s0, 0x7f800000, |v0|
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-NEXT:    v_fma_f32 v2, 0x3e9a209a, v0, -v1
-; GFX1100-NEXT:    v_fmamk_f32 v2, v0, 0x3284fbcf, v2
 ; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-NEXT:    v_fmamk_f32 v2, v0, 0x3284fbcf, v2
 ; GFX1100-NEXT:    v_add_f32_e32 v1, v1, v2
+; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX1100-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s0
 ; GFX1100-NEXT:    v_cndmask_b32_e64 v1, 0, 0x411a209b, vcc_lo
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1100-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; GFX1100-NEXT:    s_setpc_b64 s[30:31]
 ;
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log2.ll b/llvm/test/CodeGen/AMDGPU/llvm.log2.ll
index 6ccef4c02ab3b1..694acb42e6b030 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.log2.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.log2.ll
@@ -2727,28 +2727,60 @@ define float @v_log2_f32_from_fpext_math_f16(i16 %src0.i, i16 %src1.i) {
 }
 
 define float @v_log2_f32_from_fpext_bf16(bfloat %src) {
-; GFX689-LABEL: v_log2_f32_from_fpext_bf16:
-; GFX689:       ; %bb.0:
-; GFX689-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX689-NEXT:    s_mov_b32 s4, 0x800000
-; GFX689-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
-; GFX689-NEXT:    v_mov_b32_e32 v2, 0x4f800000
-; GFX689-NEXT:    v_cndmask_b32_e32 v2, 1.0, v2, vcc
-; GFX689-NEXT:    v_mul_f32_e32 v0, v0, v2
-; GFX689-NEXT:    v_log_f32_e32 v0, v0
-; GFX689-NEXT:    v_mov_b32_e32 v1, 0x42000000
-; GFX689-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
-; GFX689-NEXT:    v_sub_f32_e32 v0, v0, v1
-; GFX689-NEXT:    s_setpc_b64 s[30:31]
+; SI-LABEL: v_log2_f32_from_fpext_bf16:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    s_mov_b32 s4, 0x800000
+; SI-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
+; SI-NEXT:    v_mov_b32_e32 v2, 0x4f800000
+; SI-NEXT:    v_cndmask_b32_e32 v2, 1.0, v2, vcc
+; SI-NEXT:    v_mul_f32_e32 v0, v0, v2
+; SI-NEXT:    v_log_f32_e32 v0, v0
+; SI-NEXT:    v_mov_b32_e32 v1, 0x42000000
+; SI-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; SI-NEXT:    v_sub_f32_e32 v0, v0, v1
+; SI-NEXT:    s_setpc_b64 s[30:31]
+;
+; VI-LABEL: v_log2_f32_from_fpext_bf16:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; VI-NEXT:    s_mov_b32 s4, 0x800000
+; VI-NEXT:    v_mov_b32_e32 v1, 0x4f800000
+; VI-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
+; VI-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
+; VI-NEXT:    v_mul_f32_e32 v0, v0, v1
+; VI-NEXT:    v_log_f32_e32 v0, v0
+; VI-NEXT:    v_mov_b32_e32 v1, 0x42000000
+; VI-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; VI-NEXT:    v_sub_f32_e32 v0, v0, v1
+; VI-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX900-LABEL: v_log2_f32_from_fpext_bf16:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX900-NEXT:    s_mov_b32 s4, 0x800000
+; GFX900-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
+; GFX900-NEXT:    v_mov_b32_e32 v2, 0x4f800000
+; GFX900-NEXT:    v_cndmask_b32_e32 v2, 1.0, v2, vcc
+; GFX900-NEXT:    v_mul_f32_e32 v0, v0, v2
+; GFX900-NEXT:    v_log_f32_e32 v0, v0
+; GFX900-NEXT:    v_mov_b32_e32 v1, 0x42000000
+; GFX900-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX900-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-LABEL: v_log2_f32_from_fpext_bf16:
 ; GFX1100:       ; %bb.0:
 ; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
 ; GFX1100-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
 ; GFX1100-NEXT:    v_cndmask_b32_e64 v2, 1.0, 0x4f800000, vcc_lo
 ; GFX1100-NEXT:    v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-NEXT:    v_mul_f32_e32 v0, v0, v2
+; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1100-NEXT:    v_log_f32_e32 v0, v0
 ; GFX1100-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX1100-NEXT:    v_sub_f32_e32 v0, v0, v1
@@ -3973,5 +4005,3 @@ declare <2 x half> @llvm.fabs.v2f16(<2 x half>) #2
 attributes #0 = { "denormal-fp-math-f32"="ieee,preserve-sign" }
 attributes #1 = { "denormal-fp-math-f32"="dynamic,dynamic" }
 attributes #2 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
-;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; SI: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/local-atomics-fp.ll b/llvm/test/CodeGen/AMDGPU/local-atomics-fp.ll
index 5296ef1f886789..7367385351b99d 100644
--- a/llvm/test/CodeGen/AMDGPU/local-atomics-fp.ll
+++ b/llvm/test/CodeGen/AMDGPU/local-atomics-fp.ll
@@ -1422,7 +1422,7 @@ define bfloat @lds_atomic_fadd_ret_bf16(ptr addrspace(3) %ptr) nounwind {
 ; VI-NEXT:    s_cbranch_execnz .LBB10_1
 ; VI-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; VI-NEXT:    s_or_b64 exec, exec, s[4:5]
-; VI-NEXT:    v_lshrrev_b32_sdwa v0, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT:    v_lshrrev_b32_e32 v0, v0, v3
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: lds_atomic_fadd_ret_bf16:
@@ -1452,7 +1452,7 @@ define bfloat @lds_atomic_fadd_ret_bf16(ptr addrspace(3) %ptr) nounwind {
 ; GFX9-NEXT:    s_cbranch_execnz .LBB10_1
 ; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
-; GFX9-NEXT:    v_lshrrev_b32_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX9-NEXT:    v_lshrrev_b32_e32 v0, v0, v2
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: lds_atomic_fadd_ret_bf16:
diff --git a/llvm/test/CodeGen/AMDGPU/select-undef.ll b/llvm/test/CodeGen/AMDGPU/select-undef.ll
index 5b9866a3c91571..8ca33dce73f880 100644
--- a/llvm/test/CodeGen/AMDGPU/select-undef.ll
+++ b/llvm/test/CodeGen/AMDGPU/select-undef.ll
@@ -303,8 +303,9 @@ ret:
   ret void
 }
 
+; FIXME: This shouldn't have the 0 initialization
 ; GCN-LABEL: {{^}}undef_v3bf16:
-; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0{{$}}
+; GCN: v_mov_b32_e32 v{{[0-9]+}}, 0{{$}}
 ; GCN-NOT: s_mov_b32 s{{[0-9]+}}, 0{{$}}
 ; GCN: s_cbranch_vccnz
 define amdgpu_kernel void @undef_v3bf16(ptr addrspace(3) %ptr, i1 %cond) {
diff --git a/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll b/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll
index 844aa57de05ce3..dfe98bbbaddf9e 100644
--- a/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll
+++ b/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll
@@ -2677,10 +2677,7 @@ define <4 x bfloat> @shuffle_v4bf16_23uu(ptr addrspace(1) %arg0, ptr addrspace(1
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dword v0, v[0:1], off offset:4
-; GFX9-NEXT:    s_mov_b32 s4, 0xffff
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v0
-; GFX9-NEXT:    v_and_or_b32 v0, v0, s4, v1
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v4bf16_23uu:
@@ -2688,8 +2685,6 @@ define <4 x bfloat> @shuffle_v4bf16_23uu(ptr addrspace(1) %arg0, ptr addrspace(1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dword v0, v[0:1], off offset:4
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v0
-; GFX10-NEXT:    v_and_or_b32 v0, 0xffff, v0, v1
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: shuffle_v4bf16_23uu:
@@ -2697,9 +2692,6 @@ define <4 x bfloat> @shuffle_v4bf16_23uu(ptr addrspace(1) %arg0, ptr addrspace(1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b32 v0, v[0:1], off offset:4
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_and_or_b32 v0, 0xffff, v0, v1
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0
   %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1
@@ -2713,13 +2705,10 @@ define <4 x bfloat> @shuffle_v4bf16_234u(ptr addrspace(1) %arg0, ptr addrspace(1
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dword v4, v[0:1], off offset:4
 ; GFX9-NEXT:    global_load_dword v5, v[2:3], off
-; GFX9-NEXT:    s_mov_b32 s4, 0xffff
 ; GFX9-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v4
+; GFX9-NEXT:    v_mov_b32_e32 v0, v4
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v5
-; GFX9-NEXT:    v_and_or_b32 v0, v4, s4, v0
-; GFX9-NEXT:    v_and_or_b32 v1, v5, s4, v1
+; GFX9-NEXT:    v_mov_b32_e32 v1, v5
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v4bf16_234u:
@@ -2728,11 +2717,9 @@ define <4 x bfloat> @shuffle_v4bf16_234u(ptr addrspace(1) %arg0, ptr addrspace(1
 ; GFX10-NEXT:    global_load_dword v4, v[0:1], off offset:4
 ; GFX10-NEXT:    global_load_dword v5, v[2:3], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(1)
-; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v4
+; GFX10-NEXT:    v_mov_b32_e32 v0, v4
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v5
-; GFX10-NEXT:    v_and_or_b32 v0, 0xffff, v4, v0
-; GFX10-NEXT:    v_and_or_b32 v1, 0xffff, v5, v1
+; GFX10-NEXT:    v_mov_b32_e32 v1, v5
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: shuffle_v4bf16_234u:
@@ -2740,13 +2727,7 @@ define <4 x bfloat> @shuffle_v4bf16_234u(ptr addrspace(1) %arg0, ptr addrspace(1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b32 v0, v[0:1], off offset:4
 ; GFX11-NEXT:    global_load_b32 v1, v[2:3], off
-; GFX11-NEXT:    s_waitcnt vmcnt(1)
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v0
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_and_or_b32 v0, 0xffff, v0, v2
-; GFX11-NEXT:    v_and_or_b32 v1, 0xffff, v1, v3
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0
   %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1
@@ -2786,12 +2767,8 @@ define <4 x bfloat> @shuffle_v4bf16_u3u1(ptr addrspace(1) %arg0, ptr addrspace(1
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dwordx2 v[1:2], v[0:1], off
-; GFX9-NEXT:    s_mov_b32 s4, 0xffff
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v2
-; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v1
-; GFX9-NEXT:    v_and_or_b32 v0, v2, s4, v0
-; GFX9-NEXT:    v_and_or_b32 v1, v1, s4, v3
+; GFX9-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v4bf16_u3u1:
@@ -2799,10 +2776,7 @@ define <4 x bfloat> @shuffle_v4bf16_u3u1(ptr addrspace(1) %arg0, ptr addrspace(1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dwordx2 v[1:2], v[0:1], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v2
-; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v1
-; GFX10-NEXT:    v_and_or_b32 v0, 0xffff, v2, v0
-; GFX10-NEXT:    v_and_or_b32 v1, 0xffff, v1, v3
+; GFX10-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: shuffle_v4bf16_u3u1:
@@ -2810,11 +2784,7 @@ define <4 x bfloat> @shuffle_v4bf16_u3u1(ptr addrspace(1) %arg0, ptr addrspace(1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b64 v[1:2], v[0:1], off
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v2
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_and_or_b32 v0, 0xffff, v2, v0
-; GFX11-NEXT:    v_and_or_b32 v1, 0xffff, v1, v3
+; GFX11-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0
   %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1
@@ -2827,10 +2797,7 @@ define <4 x bfloat> @shuffle_v4bf16_u3uu(ptr addrspace(1) %arg0, ptr addrspace(1
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dword v0, v[0:1], off offset:4
-; GFX9-NEXT:    s_mov_b32 s4, 0xffff
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v0
-; GFX9-NEXT:    v_and_or_b32 v0, v0, s4, v1
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v4bf16_u3uu:
@@ -2838,8 +2805,6 @@ define <4 x bfloat> @shuffle_v4bf16_u3uu(ptr addrspace(1) %arg0, ptr addrspace(1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dword v0, v[0:1], off offset:4
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v0
-; GFX10-NEXT:    v_and_or_b32 v0, 0xffff, v0, v1
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: shuffle_v4bf16_u3uu:
@@ -2847,9 +2812,6 @@ define <4 x bfloat> @shuffle_v4bf16_u3uu(ptr addrspace(1) %arg0, ptr addrspace(1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b32 v0, v[0:1], off offset:4
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_and_or_b32 v0, 0xffff, v0, v1
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0
   %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1
@@ -2864,7 +2826,7 @@ define <4 x bfloat> @shuffle_v4bf16_3u6u(ptr addrspace(1) %arg0, ptr addrspace(1
 ; GFX9-NEXT:    global_load_dword v5, v[0:1], off offset:4
 ; GFX9-NEXT:    global_load_dword v4, v[2:3], off offset:4
 ; GFX9-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v5
+; GFX9-NEXT:    v_alignbit_b32 v0, s4, v5, 16
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
@@ -2875,7 +2837,7 @@ define <4 x bfloat> @shuffle_v4bf16_3u6u(ptr addrspace(1) %arg0, ptr addrspace(1
 ; GFX10-NEXT:    global_load_dword v5, v[0:1], off offset:4
 ; GFX10-NEXT:    global_load_dword v4, v[2:3], off offset:4
 ; GFX10-NEXT:    s_waitcnt vmcnt(1)
-; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 16, v5
+; GFX10-NEXT:    v_alignbit_b32 v0, s4, v5, 16
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
@@ -2886,7 +2848,7 @@ define <4 x bfloat> @shuffle_v4bf16_3u6u(ptr addrspace(1) %arg0, ptr addrspace(1
 ; GFX11-NEXT:    global_load_b32 v0, v[0:1], off offset:4
 ; GFX11-NEXT:    global_load_b32 v1, v[2:3], off offset:4
 ; GFX11-NEXT:    s_waitcnt vmcnt(1)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-NEXT:    v_alignbit_b32 v0, s0, v0, 16
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0
@@ -2902,7 +2864,7 @@ define <4 x bfloat> @shuffle_v4bf16_3uu7(ptr addrspace(1) %arg0, ptr addrspace(1
 ; GFX9-NEXT:    global_load_dword v5, v[0:1], off offset:4
 ; GFX9-NEXT:    global_load_dword v4, v[2:3], off offset:4
 ; GFX9-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v5
+; GFX9-NEXT:    v_alignbit_b32 v0, s4, v5, 16
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
@@ -2913,7 +2875,7 @@ define <4 x bfloat> @shuffle_v4bf16_3uu7(ptr addrspace(1) %arg0, ptr addrspace(1
 ; GFX10-NEXT:    global_load_dword v5, v[0:1], off offset:4
 ; GFX10-NEXT:    global_load_dword v4, v[2:3], off offset:4
 ; GFX10-NEXT:    s_waitcnt vmcnt(1)
-; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 16, v5
+; GFX10-NEXT:    v_alignbit_b32 v0, s4, v5, 16
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
@@ -2924,7 +2886,7 @@ define <4 x bfloat> @shuffle_v4bf16_3uu7(ptr addrspace(1) %arg0, ptr addrspace(1
 ; GFX11-NEXT:    global_load_b32 v0, v[0:1], off offset:4
 ; GFX11-NEXT:    global_load_b32 v1, v[2:3], off offset:4
 ; GFX11-NEXT:    s_waitcnt vmcnt(1)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-NEXT:    v_alignbit_b32 v0, s0, v0, 16
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0
@@ -2937,29 +2899,22 @@ define <4 x bfloat> @shuffle_v4bf16_35u5(ptr addrspace(1) %arg0, ptr addrspace(1
 ; GFX9-LABEL: shuffle_v4bf16_35u5:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v4, v[0:1], off offset:4
-; GFX9-NEXT:    global_load_dword v5, v[2:3], off
-; GFX9-NEXT:    s_mov_b32 s4, 0x3020706
-; GFX9-NEXT:    s_mov_b32 s5, 0xffff
+; GFX9-NEXT:    global_load_dword v5, v[0:1], off offset:4
+; GFX9-NEXT:    global_load_dword v4, v[2:3], off
+; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_perm_b32 v0, v4, v5, s4
-; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v5
-; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff0000, v0
-; GFX9-NEXT:    v_and_or_b32 v1, v5, s5, v1
-; GFX9-NEXT:    v_and_or_b32 v0, v0, s5, v2
+; GFX9-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v4bf16_35u5:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v4, v[0:1], off offset:4
-; GFX10-NEXT:    global_load_dword v5, v[2:3], off
+; GFX10-NEXT:    global_load_dword v5, v[0:1], off offset:4
+; GFX10-NEXT:    global_load_dword v4, v[2:3], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_perm_b32 v0, v4, v5, 0x3020706
-; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v5
-; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v0
-; GFX10-NEXT:    v_and_or_b32 v0, 0xffff, v0, v1
-; GFX10-NEXT:    v_and_or_b32 v1, 0xffff, v5, v2
+; GFX10-NEXT:    v_perm_b32 v0, v4, v5, 0x7060302
+; GFX10-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: shuffle_v4bf16_35u5:
@@ -2968,13 +2923,7 @@ define <4 x bfloat> @shuffle_v4bf16_35u5(ptr addrspace(1) %arg0, ptr addrspace(1
 ; GFX11-NEXT:    global_load_b32 v0, v[0:1], off offset:4
 ; GFX11-NEXT:    global_load_b32 v1, v[2:3], off
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_perm_b32 v0, v0, v1, 0x3020706
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v0
-; GFX11-NEXT:    v_and_or_b32 v1, 0xffff, v1, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT:    v_and_or_b32 v0, 0xffff, v0, v2
+; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x7060302
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0
   %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1
@@ -2988,13 +2937,10 @@ define <4 x bfloat> @shuffle_v4bf16_357u(ptr addrspace(1) %arg0, ptr addrspace(1
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dword v6, v[0:1], off offset:4
 ; GFX9-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off
-; GFX9-NEXT:    s_mov_b32 s4, 0x3020706
-; GFX9-NEXT:    s_mov_b32 s5, 0xffff
+; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_perm_b32 v0, v6, v4, s4
-; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff0000, v0
-; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v5
-; GFX9-NEXT:    v_and_or_b32 v0, v0, s5, v2
+; GFX9-NEXT:    v_perm_b32 v0, v4, v6, s4
+; GFX9-NEXT:    v_alignbit_b32 v1, s4, v5, 16
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v4bf16_357u:
@@ -3003,10 +2949,8 @@ define <4 x bfloat> @shuffle_v4bf16_357u(ptr addrspace(1) %arg0, ptr addrspace(1
 ; GFX10-NEXT:    global_load_dword v6, v[0:1], off offset:4
 ; GFX10-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_perm_b32 v0, v6, v4, 0x3020706
-; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v0
-; GFX10-NEXT:    v_and_or_b32 v0, 0xffff, v0, v1
-; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v5
+; GFX10-NEXT:    v_perm_b32 v0, v4, v6, 0x7060302
+; GFX10-NEXT:    v_alignbit_b32 v1, s4, v5, 16
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: shuffle_v4bf16_357u:
@@ -3015,11 +2959,8 @@ define <4 x bfloat> @shuffle_v4bf16_357u(ptr addrspace(1) %arg0, ptr addrspace(1
 ; GFX11-NEXT:    global_load_b32 v4, v[0:1], off offset:4
 ; GFX11-NEXT:    global_load_b64 v[0:1], v[2:3], off
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_perm_b32 v0, v4, v0, 0x3020706
-; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v0
-; GFX11-NEXT:    v_and_or_b32 v0, 0xffff, v0, v2
+; GFX11-NEXT:    v_perm_b32 v0, v0, v4, 0x7060302
+; GFX11-NEXT:    v_alignbit_b32 v1, s0, v1, 16
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0
   %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1
@@ -3032,10 +2973,7 @@ define <4 x bfloat> @shuffle_v4bf16_0101(ptr addrspace(1) %arg0, ptr addrspace(1
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dword v0, v[0:1], off
-; GFX9-NEXT:    s_mov_b32 s4, 0xffff
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v0
-; GFX9-NEXT:    v_and_or_b32 v0, v0, s4, v1
 ; GFX9-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -3044,8 +2982,6 @@ define <4 x bfloat> @shuffle_v4bf16_0101(ptr addrspace(1) %arg0, ptr addrspace(1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dword v0, v[0:1], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v0
-; GFX10-NEXT:    v_and_or_b32 v0, 0xffff, v0, v1
 ; GFX10-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -3054,9 +2990,6 @@ define <4 x bfloat> @shuffle_v4bf16_0101(ptr addrspace(1) %arg0, ptr addrspace(1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b32 v0, v[0:1], off
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_and_or_b32 v0, 0xffff, v0, v1
 ; GFX11-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0
@@ -3098,13 +3031,10 @@ define <4 x bfloat> @shuffle_v4bf16_0145(ptr addrspace(1) %arg0, ptr addrspace(1
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dword v4, v[0:1], off
 ; GFX9-NEXT:    global_load_dword v5, v[2:3], off
-; GFX9-NEXT:    s_mov_b32 s4, 0xffff
 ; GFX9-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v4
+; GFX9-NEXT:    v_mov_b32_e32 v0, v4
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v5
-; GFX9-NEXT:    v_and_or_b32 v0, v4, s4, v0
-; GFX9-NEXT:    v_and_or_b32 v1, v5, s4, v1
+; GFX9-NEXT:    v_mov_b32_e32 v1, v5
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v4bf16_0145:
@@ -3113,11 +3043,9 @@ define <4 x bfloat> @shuffle_v4bf16_0145(ptr addrspace(1) %arg0, ptr addrspace(1
 ; GFX10-NEXT:    global_load_dword v4, v[0:1], off
 ; GFX10-NEXT:    global_load_dword v5, v[2:3], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(1)
-; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v4
+; GFX10-NEXT:    v_mov_b32_e32 v0, v4
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v5
-; GFX10-NEXT:    v_and_or_b32 v0, 0xffff, v4, v0
-; GFX10-NEXT:    v_and_or_b32 v1, 0xffff, v5, v1
+; GFX10-NEXT:    v_mov_b32_e32 v1, v5
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: shuffle_v4bf16_0145:
@@ -3125,13 +3053,7 @@ define <4 x bfloat> @shuffle_v4bf16_0145(ptr addrspace(1) %arg0, ptr addrspace(1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b32 v0, v[0:1], off
 ; GFX11-NEXT:    global_load_b32 v1, v[2:3], off
-; GFX11-NEXT:    s_waitcnt vmcnt(1)
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v0
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_and_or_b32 v0, 0xffff, v0, v2
-; GFX11-NEXT:    v_and_or_b32 v1, 0xffff, v1, v3
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0
   %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1
@@ -3143,26 +3065,23 @@ define <4 x bfloat> @shuffle_v4bf16_0167(ptr addrspace(1) %arg0, ptr addrspace(1
 ; GFX9-LABEL: shuffle_v4bf16_0167:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v5, v[0:1], off
-; GFX9-NEXT:    global_load_dword v4, v[2:3], off offset:4
-; GFX9-NEXT:    s_mov_b32 s4, 0xffff
+; GFX9-NEXT:    global_load_dword v4, v[0:1], off
+; GFX9-NEXT:    global_load_dword v5, v[2:3], off offset:4
 ; GFX9-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v5
-; GFX9-NEXT:    v_and_or_b32 v0, v5, s4, v0
+; GFX9-NEXT:    v_mov_b32_e32 v0, v4
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v1, v4
+; GFX9-NEXT:    v_mov_b32_e32 v1, v5
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v4bf16_0167:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v5, v[0:1], off
-; GFX10-NEXT:    global_load_dword v4, v[2:3], off offset:4
+; GFX10-NEXT:    global_load_dword v4, v[0:1], off
+; GFX10-NEXT:    global_load_dword v5, v[2:3], off offset:4
 ; GFX10-NEXT:    s_waitcnt vmcnt(1)
-; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v5
+; GFX10-NEXT:    v_mov_b32_e32 v0, v4
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v1, v4
-; GFX10-NEXT:    v_and_or_b32 v0, 0xffff, v5, v0
+; GFX10-NEXT:    v_mov_b32_e32 v1, v5
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: shuffle_v4bf16_0167:
@@ -3170,10 +3089,6 @@ define <4 x bfloat> @shuffle_v4bf16_0167(ptr addrspace(1) %arg0, ptr addrspace(1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b32 v0, v[0:1], off
 ; GFX11-NEXT:    global_load_b32 v1, v[2:3], off offset:4
-; GFX11-NEXT:    s_waitcnt vmcnt(1)
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_and_or_b32 v0, 0xffff, v0, v2
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0
@@ -3187,12 +3102,8 @@ define <4 x bfloat> @shuffle_v4bf16_2301(ptr addrspace(1) %arg0, ptr addrspace(1
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dwordx2 v[1:2], v[0:1], off
-; GFX9-NEXT:    s_mov_b32 s4, 0xffff
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v2
-; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v1
-; GFX9-NEXT:    v_and_or_b32 v0, v2, s4, v0
-; GFX9-NEXT:    v_and_or_b32 v1, v1, s4, v3
+; GFX9-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v4bf16_2301:
@@ -3200,10 +3111,7 @@ define <4 x bfloat> @shuffle_v4bf16_2301(ptr addrspace(1) %arg0, ptr addrspace(1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dwordx2 v[1:2], v[0:1], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v2
-; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v1
-; GFX10-NEXT:    v_and_or_b32 v0, 0xffff, v2, v0
-; GFX10-NEXT:    v_and_or_b32 v1, 0xffff, v1, v3
+; GFX10-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: shuffle_v4bf16_2301:
@@ -3211,11 +3119,7 @@ define <4 x bfloat> @shuffle_v4bf16_2301(ptr addrspace(1) %arg0, ptr addrspace(1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b64 v[1:2], v[0:1], off
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v2
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_and_or_b32 v0, 0xffff, v2, v0
-; GFX11-NEXT:    v_and_or_b32 v1, 0xffff, v1, v3
+; GFX11-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0
   %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1
@@ -3228,10 +3132,7 @@ define <4 x bfloat> @shuffle_v4bf16_2323(ptr addrspace(1) %arg0, ptr addrspace(1
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dword v0, v[0:1], off offset:4
-; GFX9-NEXT:    s_mov_b32 s4, 0xffff
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v0
-; GFX9-NEXT:    v_and_or_b32 v0, v0, s4, v1
 ; GFX9-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -3240,8 +3141,6 @@ define <4 x bfloat> @shuffle_v4bf16_2323(ptr addrspace(1) %arg0, ptr addrspace(1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dword v0, v[0:1], off offset:4
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v0
-; GFX10-NEXT:    v_and_or_b32 v0, 0xffff, v0, v1
 ; GFX10-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -3250,9 +3149,6 @@ define <4 x bfloat> @shuffle_v4bf16_2323(ptr addrspace(1) %arg0, ptr addrspace(1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b32 v0, v[0:1], off offset:4
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_and_or_b32 v0, 0xffff, v0, v1
 ; GFX11-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0
@@ -3267,13 +3163,10 @@ define <4 x bfloat> @shuffle_v4bf16_2345(ptr addrspace(1) %arg0, ptr addrspace(1
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dword v4, v[0:1], off offset:4
 ; GFX9-NEXT:    global_load_dword v5, v[2:3], off
-; GFX9-NEXT:    s_mov_b32 s4, 0xffff
 ; GFX9-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v4
+; GFX9-NEXT:    v_mov_b32_e32 v0, v4
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v5
-; GFX9-NEXT:    v_and_or_b32 v0, v4, s4, v0
-; GFX9-NEXT:    v_and_or_b32 v1, v5, s4, v1
+; GFX9-NEXT:    v_mov_b32_e32 v1, v5
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v4bf16_2345:
@@ -3282,11 +3175,9 @@ define <4 x bfloat> @shuffle_v4bf16_2345(ptr addrspace(1) %arg0, ptr addrspace(1
 ; GFX10-NEXT:    global_load_dword v4, v[0:1], off offset:4
 ; GFX10-NEXT:    global_load_dword v5, v[2:3], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(1)
-; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v4
+; GFX10-NEXT:    v_mov_b32_e32 v0, v4
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v5
-; GFX10-NEXT:    v_and_or_b32 v0, 0xffff, v4, v0
-; GFX10-NEXT:    v_and_or_b32 v1, 0xffff, v5, v1
+; GFX10-NEXT:    v_mov_b32_e32 v1, v5
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: shuffle_v4bf16_2345:
@@ -3294,13 +3185,7 @@ define <4 x bfloat> @shuffle_v4bf16_2345(ptr addrspace(1) %arg0, ptr addrspace(1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b32 v0, v[0:1], off offset:4
 ; GFX11-NEXT:    global_load_b32 v1, v[2:3], off
-; GFX11-NEXT:    s_waitcnt vmcnt(1)
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v0
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_and_or_b32 v0, 0xffff, v0, v2
-; GFX11-NEXT:    v_and_or_b32 v1, 0xffff, v1, v3
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0
   %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1
@@ -3312,26 +3197,23 @@ define <4 x bfloat> @shuffle_v4bf16_2367(ptr addrspace(1) %arg0, ptr addrspace(1
 ; GFX9-LABEL: shuffle_v4bf16_2367:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v5, v[0:1], off offset:4
-; GFX9-NEXT:    global_load_dword v4, v[2:3], off offset:4
-; GFX9-NEXT:    s_mov_b32 s4, 0xffff
+; GFX9-NEXT:    global_load_dword v4, v[0:1], off offset:4
+; GFX9-NEXT:    global_load_dword v5, v[2:3], off offset:4
 ; GFX9-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v5
-; GFX9-NEXT:    v_and_or_b32 v0, v5, s4, v0
+; GFX9-NEXT:    v_mov_b32_e32 v0, v4
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v1, v4
+; GFX9-NEXT:    v_mov_b32_e32 v1, v5
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v4bf16_2367:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v5, v[0:1], off offset:4
-; GFX10-NEXT:    global_load_dword v4, v[2:3], off offset:4
+; GFX10-NEXT:    global_load_dword v4, v[0:1], off offset:4
+; GFX10-NEXT:    global_load_dword v5, v[2:3], off offset:4
 ; GFX10-NEXT:    s_waitcnt vmcnt(1)
-; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v5
+; GFX10-NEXT:    v_mov_b32_e32 v0, v4
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v1, v4
-; GFX10-NEXT:    v_and_or_b32 v0, 0xffff, v5, v0
+; GFX10-NEXT:    v_mov_b32_e32 v1, v5
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: shuffle_v4bf16_2367:
@@ -3339,10 +3221,6 @@ define <4 x bfloat> @shuffle_v4bf16_2367(ptr addrspace(1) %arg0, ptr addrspace(1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b32 v0, v[0:1], off offset:4
 ; GFX11-NEXT:    global_load_b32 v1, v[2:3], off offset:4
-; GFX11-NEXT:    s_waitcnt vmcnt(1)
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_and_or_b32 v0, 0xffff, v0, v2
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0
@@ -3357,13 +3235,10 @@ define <4 x bfloat> @shuffle_v4bf16_4501(ptr addrspace(1) %arg0, ptr addrspace(1
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dword v4, v[2:3], off
 ; GFX9-NEXT:    global_load_dword v5, v[0:1], off
-; GFX9-NEXT:    s_mov_b32 s4, 0xffff
 ; GFX9-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v4
+; GFX9-NEXT:    v_mov_b32_e32 v0, v4
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v5
-; GFX9-NEXT:    v_and_or_b32 v0, v4, s4, v0
-; GFX9-NEXT:    v_and_or_b32 v1, v5, s4, v1
+; GFX9-NEXT:    v_mov_b32_e32 v1, v5
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v4bf16_4501:
@@ -3372,11 +3247,9 @@ define <4 x bfloat> @shuffle_v4bf16_4501(ptr addrspace(1) %arg0, ptr addrspace(1
 ; GFX10-NEXT:    global_load_dword v4, v[2:3], off
 ; GFX10-NEXT:    global_load_dword v5, v[0:1], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(1)
-; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v4
+; GFX10-NEXT:    v_mov_b32_e32 v0, v4
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v5
-; GFX10-NEXT:    v_and_or_b32 v0, 0xffff, v4, v0
-; GFX10-NEXT:    v_and_or_b32 v1, 0xffff, v5, v1
+; GFX10-NEXT:    v_mov_b32_e32 v1, v5
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: shuffle_v4bf16_4501:
@@ -3385,12 +3258,8 @@ define <4 x bfloat> @shuffle_v4bf16_4501(ptr addrspace(1) %arg0, ptr addrspace(1
 ; GFX11-NEXT:    global_load_b32 v2, v[2:3], off
 ; GFX11-NEXT:    global_load_b32 v1, v[0:1], off
 ; GFX11-NEXT:    s_waitcnt vmcnt(1)
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v2
+; GFX11-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_and_or_b32 v0, 0xffff, v2, v0
-; GFX11-NEXT:    v_and_or_b32 v1, 0xffff, v1, v3
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0
   %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1
@@ -3402,26 +3271,23 @@ define <4 x bfloat> @shuffle_v4bf16_4523(ptr addrspace(1) %arg0, ptr addrspace(1
 ; GFX9-LABEL: shuffle_v4bf16_4523:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v5, v[2:3], off
-; GFX9-NEXT:    global_load_dword v4, v[0:1], off offset:4
-; GFX9-NEXT:    s_mov_b32 s4, 0xffff
+; GFX9-NEXT:    global_load_dword v4, v[2:3], off
+; GFX9-NEXT:    global_load_dword v5, v[0:1], off offset:4
 ; GFX9-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v5
-; GFX9-NEXT:    v_and_or_b32 v0, v5, s4, v0
+; GFX9-NEXT:    v_mov_b32_e32 v0, v4
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v1, v4
+; GFX9-NEXT:    v_mov_b32_e32 v1, v5
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v4bf16_4523:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v5, v[2:3], off
-; GFX10-NEXT:    global_load_dword v4, v[0:1], off offset:4
+; GFX10-NEXT:    global_load_dword v4, v[2:3], off
+; GFX10-NEXT:    global_load_dword v5, v[0:1], off offset:4
 ; GFX10-NEXT:    s_waitcnt vmcnt(1)
-; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v5
+; GFX10-NEXT:    v_mov_b32_e32 v0, v4
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v1, v4
-; GFX10-NEXT:    v_and_or_b32 v0, 0xffff, v5, v0
+; GFX10-NEXT:    v_mov_b32_e32 v1, v5
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: shuffle_v4bf16_4523:
@@ -3430,9 +3296,7 @@ define <4 x bfloat> @shuffle_v4bf16_4523(ptr addrspace(1) %arg0, ptr addrspace(1
 ; GFX11-NEXT:    global_load_b32 v2, v[2:3], off
 ; GFX11-NEXT:    global_load_b32 v1, v[0:1], off offset:4
 ; GFX11-NEXT:    s_waitcnt vmcnt(1)
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_and_or_b32 v0, 0xffff, v2, v0
+; GFX11-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0
@@ -3446,10 +3310,7 @@ define <4 x bfloat> @shuffle_v4bf16_4545(ptr addrspace(1) %arg0, ptr addrspace(1
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dword v0, v[2:3], off
-; GFX9-NEXT:    s_mov_b32 s4, 0xffff
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v0
-; GFX9-NEXT:    v_and_or_b32 v0, v0, s4, v1
 ; GFX9-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -3458,8 +3319,6 @@ define <4 x bfloat> @shuffle_v4bf16_4545(ptr addrspace(1) %arg0, ptr addrspace(1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dword v0, v[2:3], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v0
-; GFX10-NEXT:    v_and_or_b32 v0, 0xffff, v0, v1
 ; GFX10-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -3468,9 +3327,6 @@ define <4 x bfloat> @shuffle_v4bf16_4545(ptr addrspace(1) %arg0, ptr addrspace(1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b32 v0, v[2:3], off
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_and_or_b32 v0, 0xffff, v0, v1
 ; GFX11-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0
@@ -3512,13 +3368,10 @@ define <4 x bfloat> @shuffle_v4bf16_6701(ptr addrspace(1) %arg0, ptr addrspace(1
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dword v4, v[2:3], off offset:4
 ; GFX9-NEXT:    global_load_dword v5, v[0:1], off
-; GFX9-NEXT:    s_mov_b32 s4, 0xffff
 ; GFX9-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v4
+; GFX9-NEXT:    v_mov_b32_e32 v0, v4
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v5
-; GFX9-NEXT:    v_and_or_b32 v0, v4, s4, v0
-; GFX9-NEXT:    v_and_or_b32 v1, v5, s4, v1
+; GFX9-NEXT:    v_mov_b32_e32 v1, v5
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v4bf16_6701:
@@ -3527,11 +3380,9 @@ define <4 x bfloat> @shuffle_v4bf16_6701(ptr addrspace(1) %arg0, ptr addrspace(1
 ; GFX10-NEXT:    global_load_dword v4, v[2:3], off offset:4
 ; GFX10-NEXT:    global_load_dword v5, v[0:1], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(1)
-; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v4
+; GFX10-NEXT:    v_mov_b32_e32 v0, v4
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v5
-; GFX10-NEXT:    v_and_or_b32 v0, 0xffff, v4, v0
-; GFX10-NEXT:    v_and_or_b32 v1, 0xffff, v5, v1
+; GFX10-NEXT:    v_mov_b32_e32 v1, v5
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: shuffle_v4bf16_6701:
@@ -3540,12 +3391,8 @@ define <4 x bfloat> @shuffle_v4bf16_6701(ptr addrspace(1) %arg0, ptr addrspace(1
 ; GFX11-NEXT:    global_load_b32 v2, v[2:3], off offset:4
 ; GFX11-NEXT:    global_load_b32 v1, v[0:1], off
 ; GFX11-NEXT:    s_waitcnt vmcnt(1)
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v2
+; GFX11-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_and_or_b32 v0, 0xffff, v2, v0
-; GFX11-NEXT:    v_and_or_b32 v1, 0xffff, v1, v3
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0
   %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1
@@ -3557,26 +3404,23 @@ define <4 x bfloat> @shuffle_v4bf16_6723(ptr addrspace(1) %arg0, ptr addrspace(1
 ; GFX9-LABEL: shuffle_v4bf16_6723:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v5, v[2:3], off offset:4
-; GFX9-NEXT:    global_load_dword v4, v[0:1], off offset:4
-; GFX9-NEXT:    s_mov_b32 s4, 0xffff
+; GFX9-NEXT:    global_load_dword v4, v[2:3], off offset:4
+; GFX9-NEXT:    global_load_dword v5, v[0:1], off offset:4
 ; GFX9-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v5
-; GFX9-NEXT:    v_and_or_b32 v0, v5, s4, v0
+; GFX9-NEXT:    v_mov_b32_e32 v0, v4
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v1, v4
+; GFX9-NEXT:    v_mov_b32_e32 v1, v5
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v4bf16_6723:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v5, v[2:3], off offset:4
-; GFX10-NEXT:    global_load_dword v4, v[0:1], off offset:4
+; GFX10-NEXT:    global_load_dword v4, v[2:3], off offset:4
+; GFX10-NEXT:    global_load_dword v5, v[0:1], off offset:4
 ; GFX10-NEXT:    s_waitcnt vmcnt(1)
-; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v5
+; GFX10-NEXT:    v_mov_b32_e32 v0, v4
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v1, v4
-; GFX10-NEXT:    v_and_or_b32 v0, 0xffff, v5, v0
+; GFX10-NEXT:    v_mov_b32_e32 v1, v5
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: shuffle_v4bf16_6723:
@@ -3585,9 +3429,7 @@ define <4 x bfloat> @shuffle_v4bf16_6723(ptr addrspace(1) %arg0, ptr addrspace(1
 ; GFX11-NEXT:    global_load_b32 v2, v[2:3], off offset:4
 ; GFX11-NEXT:    global_load_b32 v1, v[0:1], off offset:4
 ; GFX11-NEXT:    s_waitcnt vmcnt(1)
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_and_or_b32 v0, 0xffff, v2, v0
+; GFX11-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0
@@ -3601,12 +3443,8 @@ define <4 x bfloat> @shuffle_v4bf16_6745(ptr addrspace(1) %arg0, ptr addrspace(1
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dwordx2 v[1:2], v[2:3], off
-; GFX9-NEXT:    s_mov_b32 s4, 0xffff
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v2
-; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v1
-; GFX9-NEXT:    v_and_or_b32 v0, v2, s4, v0
-; GFX9-NEXT:    v_and_or_b32 v1, v1, s4, v3
+; GFX9-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v4bf16_6745:
@@ -3614,10 +3452,7 @@ define <4 x bfloat> @shuffle_v4bf16_6745(ptr addrspace(1) %arg0, ptr addrspace(1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dwordx2 v[1:2], v[2:3], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v2
-; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v1
-; GFX10-NEXT:    v_and_or_b32 v0, 0xffff, v2, v0
-; GFX10-NEXT:    v_and_or_b32 v1, 0xffff, v1, v3
+; GFX10-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: shuffle_v4bf16_6745:
@@ -3625,11 +3460,7 @@ define <4 x bfloat> @shuffle_v4bf16_6745(ptr addrspace(1) %arg0, ptr addrspace(1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b64 v[1:2], v[2:3], off
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v2
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_and_or_b32 v0, 0xffff, v2, v0
-; GFX11-NEXT:    v_and_or_b32 v1, 0xffff, v1, v3
+; GFX11-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0
   %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1
@@ -3642,10 +3473,7 @@ define <4 x bfloat> @shuffle_v4bf16_6767(ptr addrspace(1) %arg0, ptr addrspace(1
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dword v0, v[2:3], off offset:4
-; GFX9-NEXT:    s_mov_b32 s4, 0xffff
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v0
-; GFX9-NEXT:    v_and_or_b32 v0, v0, s4, v1
 ; GFX9-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -3654,8 +3482,6 @@ define <4 x bfloat> @shuffle_v4bf16_6767(ptr addrspace(1) %arg0, ptr addrspace(1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dword v0, v[2:3], off offset:4
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v0
-; GFX10-NEXT:    v_and_or_b32 v0, 0xffff, v0, v1
 ; GFX10-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -3664,9 +3490,6 @@ define <4 x bfloat> @shuffle_v4bf16_6767(ptr addrspace(1) %arg0, ptr addrspace(1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b32 v0, v[2:3], off offset:4
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_and_or_b32 v0, 0xffff, v0, v1
 ; GFX11-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0
@@ -3679,43 +3502,33 @@ define <4 x bfloat> @shuffle_v4bf16_2356(ptr addrspace(1) %arg0, ptr addrspace(1
 ; GFX9-LABEL: shuffle_v4bf16_2356:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off
-; GFX9-NEXT:    global_load_dword v6, v[0:1], off offset:4
-; GFX9-NEXT:    s_mov_b32 s4, 0xffff
-; GFX9-NEXT:    s_mov_b32 s5, 0x3020706
+; GFX9-NEXT:    global_load_dwordx2 v[5:6], v[2:3], off
+; GFX9-NEXT:    global_load_dword v4, v[0:1], off offset:4
 ; GFX9-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-NEXT:    v_alignbit_b32 v1, v5, v4, 16
+; GFX9-NEXT:    v_alignbit_b32 v1, v6, v5, 16
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v6
-; GFX9-NEXT:    v_and_or_b32 v0, v6, s4, v0
-; GFX9-NEXT:    v_perm_b32 v1, v4, v1, s5
+; GFX9-NEXT:    v_mov_b32_e32 v0, v4
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v4bf16_2356:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v6, v[0:1], off offset:4
-; GFX10-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off
+; GFX10-NEXT:    global_load_dwordx2 v[5:6], v[2:3], off
+; GFX10-NEXT:    global_load_dword v4, v[0:1], off offset:4
 ; GFX10-NEXT:    s_waitcnt vmcnt(1)
-; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v6
+; GFX10-NEXT:    v_alignbit_b32 v1, v6, v5, 16
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_alignbit_b32 v1, v5, v4, 16
-; GFX10-NEXT:    v_and_or_b32 v0, 0xffff, v6, v0
-; GFX10-NEXT:    v_perm_b32 v1, v4, v1, 0x3020706
+; GFX10-NEXT:    v_mov_b32_e32 v0, v4
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: shuffle_v4bf16_2356:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    global_load_b64 v[2:3], v[2:3], off
 ; GFX11-NEXT:    global_load_b32 v0, v[0:1], off offset:4
-; GFX11-NEXT:    global_load_b64 v[1:2], v[2:3], off
 ; GFX11-NEXT:    s_waitcnt vmcnt(1)
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v0
+; GFX11-NEXT:    v_alignbit_b32 v1, v3, v2, 16
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_alignbit_b32 v2, v2, v1, 16
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_and_or_b32 v0, 0xffff, v0, v3
-; GFX11-NEXT:    v_perm_b32 v1, v1, v2, 0x3020706
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0
   %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1
@@ -3729,10 +3542,8 @@ define <4 x bfloat> @shuffle_v4bf16_5623(ptr addrspace(1) %arg0, ptr addrspace(1
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dwordx2 v[5:6], v[2:3], off
 ; GFX9-NEXT:    global_load_dword v4, v[0:1], off offset:4
-; GFX9-NEXT:    s_mov_b32 s4, 0x3020706
 ; GFX9-NEXT:    s_waitcnt vmcnt(1)
 ; GFX9-NEXT:    v_alignbit_b32 v0, v6, v5, 16
-; GFX9-NEXT:    v_perm_b32 v0, v5, v0, s4
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
@@ -3746,7 +3557,6 @@ define <4 x bfloat> @shuffle_v4bf16_5623(ptr addrspace(1) %arg0, ptr addrspace(1
 ; GFX10-NEXT:    v_alignbit_b32 v0, v6, v5, 16
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v1, v4
-; GFX10-NEXT:    v_perm_b32 v0, v5, v0, 0x3020706
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: shuffle_v4bf16_5623:
@@ -3756,8 +3566,6 @@ define <4 x bfloat> @shuffle_v4bf16_5623(ptr addrspace(1) %arg0, ptr addrspace(1
 ; GFX11-NEXT:    global_load_b32 v1, v[0:1], off offset:4
 ; GFX11-NEXT:    s_waitcnt vmcnt(1)
 ; GFX11-NEXT:    v_alignbit_b32 v0, v3, v2, 16
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_perm_b32 v0, v2, v0, 0x3020706
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0
@@ -3770,17 +3578,11 @@ define <4 x bfloat> @shuffle_v4bf16_3456(ptr addrspace(1) %arg0, ptr addrspace(1
 ; GFX9-LABEL: shuffle_v4bf16_3456:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off
 ; GFX9-NEXT:    global_load_dword v6, v[0:1], off offset:4
-; GFX9-NEXT:    s_mov_b32 s4, 0xffff
-; GFX9-NEXT:    s_mov_b32 s5, 0x3020706
-; GFX9-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-NEXT:    v_alignbit_b32 v0, v5, v4, 16
+; GFX9-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_alignbit_b32 v2, v4, v6, 16
-; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v2
-; GFX9-NEXT:    v_perm_b32 v1, v4, v0, s5
-; GFX9-NEXT:    v_and_or_b32 v0, v2, s4, v3
+; GFX9-NEXT:    v_alignbit_b32 v0, v4, v6, 16
+; GFX9-NEXT:    v_alignbit_b32 v1, v5, v4, 16
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v4bf16_3456:
@@ -3790,10 +3592,7 @@ define <4 x bfloat> @shuffle_v4bf16_3456(ptr addrspace(1) %arg0, ptr addrspace(1
 ; GFX10-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_alignbit_b32 v0, v4, v6, 16
-; GFX10-NEXT:    v_alignbit_b32 v2, v5, v4, 16
-; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v0
-; GFX10-NEXT:    v_and_or_b32 v0, 0xffff, v0, v1
-; GFX10-NEXT:    v_perm_b32 v1, v4, v2, 0x3020706
+; GFX10-NEXT:    v_alignbit_b32 v1, v5, v4, 16
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: shuffle_v4bf16_3456:
@@ -3803,12 +3602,7 @@ define <4 x bfloat> @shuffle_v4bf16_3456(ptr addrspace(1) %arg0, ptr addrspace(1
 ; GFX11-NEXT:    global_load_b64 v[1:2], v[2:3], off
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_alignbit_b32 v0, v1, v0, 16
-; GFX11-NEXT:    v_alignbit_b32 v2, v2, v1, 16
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v0
-; GFX11-NEXT:    v_perm_b32 v1, v1, v2, 0x3020706
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT:    v_and_or_b32 v0, 0xffff, v0, v3
+; GFX11-NEXT:    v_alignbit_b32 v1, v2, v1, 16
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0
   %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1
@@ -3822,43 +3616,32 @@ define <4 x bfloat> @shuffle_v4bf16_5634(ptr addrspace(1) %arg0, ptr addrspace(1
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off
 ; GFX9-NEXT:    global_load_dword v6, v[0:1], off offset:4
-; GFX9-NEXT:    s_mov_b32 s4, 0xffff
-; GFX9-NEXT:    s_mov_b32 s5, 0x3020706
 ; GFX9-NEXT:    s_waitcnt vmcnt(1)
 ; GFX9-NEXT:    v_alignbit_b32 v0, v5, v4, 16
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_alignbit_b32 v1, v4, v6, 16
-; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff0000, v1
-; GFX9-NEXT:    v_perm_b32 v0, v4, v0, s5
-; GFX9-NEXT:    v_and_or_b32 v1, v1, s4, v2
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v4bf16_5634:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v6, v[0:1], off offset:4
 ; GFX10-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off
+; GFX10-NEXT:    global_load_dword v6, v[0:1], off offset:4
+; GFX10-NEXT:    s_waitcnt vmcnt(1)
+; GFX10-NEXT:    v_alignbit_b32 v0, v5, v4, 16
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_alignbit_b32 v1, v4, v6, 16
-; GFX10-NEXT:    v_alignbit_b32 v0, v5, v4, 16
-; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v1
-; GFX10-NEXT:    v_perm_b32 v0, v4, v0, 0x3020706
-; GFX10-NEXT:    v_and_or_b32 v1, 0xffff, v1, v2
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: shuffle_v4bf16_5634:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    global_load_b32 v4, v[0:1], off offset:4
-; GFX11-NEXT:    global_load_b64 v[0:1], v[2:3], off
+; GFX11-NEXT:    global_load_b64 v[2:3], v[2:3], off
+; GFX11-NEXT:    global_load_b32 v1, v[0:1], off offset:4
+; GFX11-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-NEXT:    v_alignbit_b32 v0, v3, v2, 16
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_alignbit_b32 v2, v0, v4, 16
-; GFX11-NEXT:    v_alignbit_b32 v1, v1, v0, 16
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v2
-; GFX11-NEXT:    v_perm_b32 v0, v0, v1, 0x3020706
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT:    v_and_or_b32 v1, 0xffff, v2, v3
+; GFX11-NEXT:    v_alignbit_b32 v1, v2, v1, 16
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0
   %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1
@@ -3872,43 +3655,33 @@ define <4 x bfloat> @shuffle_v4bf16_5734(ptr addrspace(1) %arg0, ptr addrspace(1
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off
 ; GFX9-NEXT:    global_load_dword v6, v[0:1], off offset:4
-; GFX9-NEXT:    s_mov_b32 s4, 0x3020706
-; GFX9-NEXT:    s_mov_b32 s5, 0xffff
+; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
 ; GFX9-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-NEXT:    v_perm_b32 v0, v4, v5, s4
+; GFX9-NEXT:    v_perm_b32 v0, v5, v4, s4
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_alignbit_b32 v1, v4, v6, 16
-; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff0000, v1
-; GFX9-NEXT:    v_perm_b32 v0, v4, v0, s4
-; GFX9-NEXT:    v_and_or_b32 v1, v1, s5, v2
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v4bf16_5734:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v6, v[0:1], off offset:4
 ; GFX10-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off
+; GFX10-NEXT:    global_load_dword v6, v[0:1], off offset:4
+; GFX10-NEXT:    s_waitcnt vmcnt(1)
+; GFX10-NEXT:    v_perm_b32 v0, v5, v4, 0x7060302
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_alignbit_b32 v1, v4, v6, 16
-; GFX10-NEXT:    v_perm_b32 v0, v4, v5, 0x3020706
-; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v1
-; GFX10-NEXT:    v_perm_b32 v0, v4, v0, 0x3020706
-; GFX10-NEXT:    v_and_or_b32 v1, 0xffff, v1, v2
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: shuffle_v4bf16_5734:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    global_load_b32 v4, v[0:1], off offset:4
-; GFX11-NEXT:    global_load_b64 v[0:1], v[2:3], off
+; GFX11-NEXT:    global_load_b64 v[2:3], v[2:3], off
+; GFX11-NEXT:    global_load_b32 v1, v[0:1], off offset:4
+; GFX11-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-NEXT:    v_perm_b32 v0, v3, v2, 0x7060302
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_alignbit_b32 v2, v0, v4, 16
-; GFX11-NEXT:    v_perm_b32 v1, v0, v1, 0x3020706
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v2
-; GFX11-NEXT:    v_perm_b32 v0, v0, v1, 0x3020706
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT:    v_and_or_b32 v1, 0xffff, v2, v3
+; GFX11-NEXT:    v_alignbit_b32 v1, v2, v1, 16
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0
   %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1
@@ -3921,10 +3694,9 @@ define <4 x bfloat> @shuffle_v4bf16_0000(ptr addrspace(1) %arg0, ptr addrspace(1
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dword v0, v[0:1], off
-; GFX9-NEXT:    s_mov_b32 s4, 0xffff
+; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v0
-; GFX9-NEXT:    v_and_or_b32 v0, v0, s4, v1
+; GFX9-NEXT:    v_perm_b32 v0, v0, v0, s4
 ; GFX9-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -3933,8 +3705,7 @@ define <4 x bfloat> @shuffle_v4bf16_0000(ptr addrspace(1) %arg0, ptr addrspace(1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dword v0, v[0:1], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v0
-; GFX10-NEXT:    v_and_or_b32 v0, 0xffff, v0, v1
+; GFX10-NEXT:    v_perm_b32 v0, v0, v0, 0x5040100
 ; GFX10-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -3943,9 +3714,8 @@ define <4 x bfloat> @shuffle_v4bf16_0000(ptr addrspace(1) %arg0, ptr addrspace(1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b32 v0, v[0:1], off
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_and_or_b32 v0, 0xffff, v0, v1
+; GFX11-NEXT:    v_perm_b32 v0, v0, v0, 0x5040100
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0
@@ -3959,11 +3729,8 @@ define <4 x bfloat> @shuffle_v4bf16_1010(ptr addrspace(1) %arg0, ptr addrspace(1
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dword v0, v[0:1], off
-; GFX9-NEXT:    s_mov_b32 s4, 0xffff
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_alignbit_b32 v0, v0, v0, 16
-; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v0
-; GFX9-NEXT:    v_and_or_b32 v0, v0, s4, v1
 ; GFX9-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -3973,8 +3740,6 @@ define <4 x bfloat> @shuffle_v4bf16_1010(ptr addrspace(1) %arg0, ptr addrspace(1
 ; GFX10-NEXT:    global_load_dword v0, v[0:1], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_alignbit_b32 v0, v0, v0, 16
-; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v0
-; GFX10-NEXT:    v_and_or_b32 v0, 0xffff, v0, v1
 ; GFX10-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -3984,9 +3749,6 @@ define <4 x bfloat> @shuffle_v4bf16_1010(ptr addrspace(1) %arg0, ptr addrspace(1
 ; GFX11-NEXT:    global_load_b32 v0, v[0:1], off
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_alignbit_b32 v0, v0, v0, 16
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v0
-; GFX11-NEXT:    v_and_or_b32 v0, 0xffff, v0, v1
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
@@ -4001,16 +3763,11 @@ define <4 x bfloat> @shuffle_v4bf16_1100(ptr addrspace(1) %arg0, ptr addrspace(1
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dword v1, v[0:1], off
-; GFX9-NEXT:    s_mov_b32 s4, 0xffff
-; GFX9-NEXT:    s_mov_b32 s5, 0x7060706
-; GFX9-NEXT:    s_mov_b32 s6, 0x3020706
-; GFX9-NEXT:    s_mov_b32 s7, 0x3020504
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
-; GFX9-NEXT:    v_perm_b32 v2, v1, v1, s5
-; GFX9-NEXT:    v_and_or_b32 v3, v1, s4, v0
-; GFX9-NEXT:    v_perm_b32 v0, v1, v2, s6
-; GFX9-NEXT:    v_perm_b32 v1, v1, v3, s7
+; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
+; GFX9-NEXT:    s_mov_b32 s5, 0x5040100
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_perm_b32 v0, v1, v1, s4
+; GFX9-NEXT:    v_perm_b32 v1, v1, v1, s5
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v4bf16_1100:
@@ -4018,11 +3775,8 @@ define <4 x bfloat> @shuffle_v4bf16_1100(ptr addrspace(1) %arg0, ptr addrspace(1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dword v1, v[0:1], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
-; GFX10-NEXT:    v_perm_b32 v2, v1, v1, 0x7060706
-; GFX10-NEXT:    v_and_or_b32 v3, 0xffff, v1, v0
-; GFX10-NEXT:    v_perm_b32 v0, v1, v2, 0x3020706
-; GFX10-NEXT:    v_perm_b32 v1, v1, v3, 0x3020504
+; GFX10-NEXT:    v_perm_b32 v0, v1, v1, 0x7060302
+; GFX10-NEXT:    v_perm_b32 v1, v1, v1, 0x5040100
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: shuffle_v4bf16_1100:
@@ -4030,13 +3784,8 @@ define <4 x bfloat> @shuffle_v4bf16_1100(ptr addrspace(1) %arg0, ptr addrspace(1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b32 v1, v[0:1], off
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
-; GFX11-NEXT:    v_perm_b32 v2, v1, v1, 0x7060706
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_and_or_b32 v3, 0xffff, v1, v0
-; GFX11-NEXT:    v_perm_b32 v0, v1, v2, 0x3020706
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT:    v_perm_b32 v1, v1, v3, 0x3020504
+; GFX11-NEXT:    v_perm_b32 v0, v1, v1, 0x7060302
+; GFX11-NEXT:    v_perm_b32 v1, v1, v1, 0x5040100
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0
   %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1
@@ -4051,10 +3800,8 @@ define <4 x bfloat> @shuffle_v4bf16_6161(ptr addrspace(1) %arg0, ptr addrspace(1
 ; GFX9-NEXT:    global_load_dword v4, v[0:1], off
 ; GFX9-NEXT:    global_load_dword v5, v[2:3], off offset:4
 ; GFX9-NEXT:    s_mov_b32 s4, 0xffff
-; GFX9-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v4
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_and_or_b32 v0, v5, s4, v0
+; GFX9-NEXT:    v_bfi_b32 v0, s4, v5, v4
 ; GFX9-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -4063,10 +3810,8 @@ define <4 x bfloat> @shuffle_v4bf16_6161(ptr addrspace(1) %arg0, ptr addrspace(1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dword v4, v[0:1], off
 ; GFX10-NEXT:    global_load_dword v5, v[2:3], off offset:4
-; GFX10-NEXT:    s_waitcnt vmcnt(1)
-; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v4
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_and_or_b32 v0, 0xffff, v5, v0
+; GFX10-NEXT:    v_bfi_b32 v0, 0xffff, v5, v4
 ; GFX10-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -4075,11 +3820,9 @@ define <4 x bfloat> @shuffle_v4bf16_6161(ptr addrspace(1) %arg0, ptr addrspace(1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b32 v0, v[0:1], off
 ; GFX11-NEXT:    global_load_b32 v1, v[2:3], off offset:4
-; GFX11-NEXT:    s_waitcnt vmcnt(1)
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_and_or_b32 v0, 0xffff, v1, v0
+; GFX11-NEXT:    v_bfi_b32 v0, 0xffff, v1, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0
@@ -4093,14 +3836,9 @@ define <4 x bfloat> @shuffle_v4bf16_2333(ptr addrspace(1) %arg0, ptr addrspace(1
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dword v0, v[0:1], off offset:4
-; GFX9-NEXT:    s_mov_b32 s4, 0x7060706
-; GFX9-NEXT:    s_mov_b32 s5, 0xffff
+; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_perm_b32 v1, v0, v0, s4
-; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff0000, v0
-; GFX9-NEXT:    v_and_or_b32 v0, v0, s5, v2
-; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff0000, v1
-; GFX9-NEXT:    v_and_or_b32 v1, v1, s5, v2
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v4bf16_2333:
@@ -4108,11 +3846,7 @@ define <4 x bfloat> @shuffle_v4bf16_2333(ptr addrspace(1) %arg0, ptr addrspace(1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dword v0, v[0:1], off offset:4
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_perm_b32 v1, v0, v0, 0x7060706
-; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v0
-; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v1
-; GFX10-NEXT:    v_and_or_b32 v0, 0xffff, v0, v2
-; GFX10-NEXT:    v_and_or_b32 v1, 0xffff, v1, v3
+; GFX10-NEXT:    v_perm_b32 v1, v0, v0, 0x7060302
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: shuffle_v4bf16_2333:
@@ -4120,13 +3854,7 @@ define <4 x bfloat> @shuffle_v4bf16_2333(ptr addrspace(1) %arg0, ptr addrspace(1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b32 v0, v[0:1], off offset:4
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_perm_b32 v1, v0, v0, 0x7060706
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v1
-; GFX11-NEXT:    v_and_or_b32 v0, 0xffff, v0, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT:    v_and_or_b32 v1, 0xffff, v1, v3
+; GFX11-NEXT:    v_perm_b32 v1, v0, v0, 0x7060302
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0
   %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1
@@ -4139,14 +3867,9 @@ define <4 x bfloat> @shuffle_v4bf16_6667(ptr addrspace(1) %arg0, ptr addrspace(1
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dword v0, v[0:1], off offset:4
-; GFX9-NEXT:    s_mov_b32 s4, 0x7060706
-; GFX9-NEXT:    s_mov_b32 s5, 0xffff
+; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_perm_b32 v1, v0, v0, s4
-; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff0000, v0
-; GFX9-NEXT:    v_and_or_b32 v0, v0, s5, v2
-; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff0000, v1
-; GFX9-NEXT:    v_and_or_b32 v1, v1, s5, v2
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v4bf16_6667:
@@ -4154,11 +3877,7 @@ define <4 x bfloat> @shuffle_v4bf16_6667(ptr addrspace(1) %arg0, ptr addrspace(1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dword v0, v[0:1], off offset:4
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_perm_b32 v1, v0, v0, 0x7060706
-; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v0
-; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v1
-; GFX10-NEXT:    v_and_or_b32 v0, 0xffff, v0, v2
-; GFX10-NEXT:    v_and_or_b32 v1, 0xffff, v1, v3
+; GFX10-NEXT:    v_perm_b32 v1, v0, v0, 0x7060302
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: shuffle_v4bf16_6667:
@@ -4166,13 +3885,7 @@ define <4 x bfloat> @shuffle_v4bf16_6667(ptr addrspace(1) %arg0, ptr addrspace(1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b32 v0, v[0:1], off offset:4
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_perm_b32 v1, v0, v0, 0x7060706
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v1
-; GFX11-NEXT:    v_and_or_b32 v0, 0xffff, v0, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT:    v_and_or_b32 v1, 0xffff, v1, v3
+; GFX11-NEXT:    v_perm_b32 v1, v0, v0, 0x7060302
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0
   %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1
@@ -4185,10 +3898,7 @@ define <4 x bfloat> @shuffle_v8bf16_0101(ptr addrspace(1) %arg0, ptr addrspace(1
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dword v0, v[0:1], off
-; GFX9-NEXT:    s_mov_b32 s4, 0xffff
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v0
-; GFX9-NEXT:    v_and_or_b32 v0, v0, s4, v1
 ; GFX9-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -4197,8 +3907,6 @@ define <4 x bfloat> @shuffle_v8bf16_0101(ptr addrspace(1) %arg0, ptr addrspace(1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dword v0, v[0:1], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v0
-; GFX10-NEXT:    v_and_or_b32 v0, 0xffff, v0, v1
 ; GFX10-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -4207,9 +3915,6 @@ define <4 x bfloat> @shuffle_v8bf16_0101(ptr addrspace(1) %arg0, ptr addrspace(1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b32 v0, v[0:1], off
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_and_or_b32 v0, 0xffff, v0, v1
 ; GFX11-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %val0 = load <8 x bfloat>, ptr addrspace(1) %arg0
@@ -4251,13 +3956,10 @@ define <4 x bfloat> @shuffle_v8bf16_4589(ptr addrspace(1) %arg0, ptr addrspace(1
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dword v4, v[0:1], off offset:8
 ; GFX9-NEXT:    global_load_dword v5, v[2:3], off
-; GFX9-NEXT:    s_mov_b32 s4, 0xffff
 ; GFX9-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v4
+; GFX9-NEXT:    v_mov_b32_e32 v0, v4
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v5
-; GFX9-NEXT:    v_and_or_b32 v0, v4, s4, v0
-; GFX9-NEXT:    v_and_or_b32 v1, v5, s4, v1
+; GFX9-NEXT:    v_mov_b32_e32 v1, v5
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v8bf16_4589:
@@ -4266,11 +3968,9 @@ define <4 x bfloat> @shuffle_v8bf16_4589(ptr addrspace(1) %arg0, ptr addrspace(1
 ; GFX10-NEXT:    global_load_dword v4, v[0:1], off offset:8
 ; GFX10-NEXT:    global_load_dword v5, v[2:3], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(1)
-; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v4
+; GFX10-NEXT:    v_mov_b32_e32 v0, v4
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v5
-; GFX10-NEXT:    v_and_or_b32 v0, 0xffff, v4, v0
-; GFX10-NEXT:    v_and_or_b32 v1, 0xffff, v5, v1
+; GFX10-NEXT:    v_mov_b32_e32 v1, v5
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: shuffle_v8bf16_4589:
@@ -4278,13 +3978,7 @@ define <4 x bfloat> @shuffle_v8bf16_4589(ptr addrspace(1) %arg0, ptr addrspace(1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b32 v0, v[0:1], off offset:8
 ; GFX11-NEXT:    global_load_b32 v1, v[2:3], off
-; GFX11-NEXT:    s_waitcnt vmcnt(1)
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v0
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_and_or_b32 v0, 0xffff, v0, v2
-; GFX11-NEXT:    v_and_or_b32 v1, 0xffff, v1, v3
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %val0 = load <8 x bfloat>, ptr addrspace(1) %arg0
   %val1 = load <8 x bfloat>, ptr addrspace(1) %arg1
@@ -4296,26 +3990,23 @@ define <4 x bfloat> @shuffle_v8bf16_10_11_2_3(ptr addrspace(1) %arg0, ptr addrsp
 ; GFX9-LABEL: shuffle_v8bf16_10_11_2_3:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v5, v[2:3], off offset:4
-; GFX9-NEXT:    global_load_dword v4, v[0:1], off offset:4
-; GFX9-NEXT:    s_mov_b32 s4, 0xffff
+; GFX9-NEXT:    global_load_dword v4, v[2:3], off offset:4
+; GFX9-NEXT:    global_load_dword v5, v[0:1], off offset:4
 ; GFX9-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v5
-; GFX9-NEXT:    v_and_or_b32 v0, v5, s4, v0
+; GFX9-NEXT:    v_mov_b32_e32 v0, v4
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v1, v4
+; GFX9-NEXT:    v_mov_b32_e32 v1, v5
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v8bf16_10_11_2_3:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v5, v[2:3], off offset:4
-; GFX10-NEXT:    global_load_dword v4, v[0:1], off offset:4
+; GFX10-NEXT:    global_load_dword v4, v[2:3], off offset:4
+; GFX10-NEXT:    global_load_dword v5, v[0:1], off offset:4
 ; GFX10-NEXT:    s_waitcnt vmcnt(1)
-; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v5
+; GFX10-NEXT:    v_mov_b32_e32 v0, v4
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v1, v4
-; GFX10-NEXT:    v_and_or_b32 v0, 0xffff, v5, v0
+; GFX10-NEXT:    v_mov_b32_e32 v1, v5
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: shuffle_v8bf16_10_11_2_3:
@@ -4324,9 +4015,7 @@ define <4 x bfloat> @shuffle_v8bf16_10_11_2_3(ptr addrspace(1) %arg0, ptr addrsp
 ; GFX11-NEXT:    global_load_b32 v2, v[2:3], off offset:4
 ; GFX11-NEXT:    global_load_b32 v1, v[0:1], off offset:4
 ; GFX11-NEXT:    s_waitcnt vmcnt(1)
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_and_or_b32 v0, 0xffff, v2, v0
+; GFX11-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %val0 = load <8 x bfloat>, ptr addrspace(1) %arg0
@@ -4341,10 +4030,8 @@ define <4 x bfloat> @shuffle_v8bf16_13_14_2_3(ptr addrspace(1) %arg0, ptr addrsp
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dwordx2 v[5:6], v[2:3], off offset:8
 ; GFX9-NEXT:    global_load_dword v4, v[0:1], off offset:4
-; GFX9-NEXT:    s_mov_b32 s4, 0x3020706
 ; GFX9-NEXT:    s_waitcnt vmcnt(1)
 ; GFX9-NEXT:    v_alignbit_b32 v0, v6, v5, 16
-; GFX9-NEXT:    v_perm_b32 v0, v5, v0, s4
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
@@ -4358,7 +4045,6 @@ define <4 x bfloat> @shuffle_v8bf16_13_14_2_3(ptr addrspace(1) %arg0, ptr addrsp
 ; GFX10-NEXT:    v_alignbit_b32 v0, v6, v5, 16
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v1, v4
-; GFX10-NEXT:    v_perm_b32 v0, v5, v0, 0x3020706
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: shuffle_v8bf16_13_14_2_3:
@@ -4368,8 +4054,6 @@ define <4 x bfloat> @shuffle_v8bf16_13_14_2_3(ptr addrspace(1) %arg0, ptr addrsp
 ; GFX11-NEXT:    global_load_b32 v1, v[0:1], off offset:4
 ; GFX11-NEXT:    s_waitcnt vmcnt(1)
 ; GFX11-NEXT:    v_alignbit_b32 v0, v3, v2, 16
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_perm_b32 v0, v2, v0, 0x3020706
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %val0 = load <8 x bfloat>, ptr addrspace(1) %arg0
@@ -4383,10 +4067,9 @@ define <4 x bfloat> @shuffle_v3bf16_0122(ptr addrspace(1) %arg0, ptr addrspace(1
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
-; GFX9-NEXT:    s_mov_b32 s4, 0xffff
+; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
-; GFX9-NEXT:    v_and_or_b32 v1, v1, s4, v2
+; GFX9-NEXT:    v_perm_b32 v1, v1, v1, s4
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v3bf16_0122:
@@ -4394,8 +4077,7 @@ define <4 x bfloat> @shuffle_v3bf16_0122(ptr addrspace(1) %arg0, ptr addrspace(1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
-; GFX10-NEXT:    v_and_or_b32 v1, 0xffff, v1, v2
+; GFX10-NEXT:    v_perm_b32 v1, v1, v1, 0x5040100
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: shuffle_v3bf16_0122:
@@ -4403,9 +4085,7 @@ define <4 x bfloat> @shuffle_v3bf16_0122(ptr addrspace(1) %arg0, ptr addrspace(1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b64 v[0:1], v[0:1], off
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_and_or_b32 v1, 0xffff, v1, v2
+; GFX11-NEXT:    v_perm_b32 v1, v1, v1, 0x5040100
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %val0 = load <3 x bfloat>, ptr addrspace(1) %arg0
   %val1 = load <3 x bfloat>, ptr addrspace(1) %arg1
@@ -4418,11 +4098,8 @@ define <4 x bfloat> @shuffle_v2bf16_0122(ptr addrspace(1) %arg0, ptr addrspace(1
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dword v0, v[0:1], off
-; GFX9-NEXT:    s_mov_b32 s4, 0xffff
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_alignbit_b32 v1, v0, v0, 16
-; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff0000, v1
-; GFX9-NEXT:    v_and_or_b32 v1, v1, s4, v2
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v2bf16_0122:
@@ -4431,8 +4108,6 @@ define <4 x bfloat> @shuffle_v2bf16_0122(ptr addrspace(1) %arg0, ptr addrspace(1
 ; GFX10-NEXT:    global_load_dword v0, v[0:1], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_alignbit_b32 v1, v0, v0, 16
-; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v1
-; GFX10-NEXT:    v_and_or_b32 v1, 0xffff, v1, v2
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: shuffle_v2bf16_0122:
@@ -4441,9 +4116,6 @@ define <4 x bfloat> @shuffle_v2bf16_0122(ptr addrspace(1) %arg0, ptr addrspace(1
 ; GFX11-NEXT:    global_load_b32 v0, v[0:1], off
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_alignbit_b32 v1, v0, v0, 16
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v1
-; GFX11-NEXT:    v_and_or_b32 v1, 0xffff, v1, v2
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %val0 = load <2 x bfloat>, ptr addrspace(1) %arg0
   %val1 = load <2 x bfloat>, ptr addrspace(1) %arg1
@@ -4507,38 +4179,38 @@ define amdgpu_kernel void @fma_shuffle_v2bf16(ptr addrspace(1) nocapture readonl
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v6, 3, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dwordx2 v[0:1], v6, s[6:7]
-; GFX9-NEXT:    global_load_dwordx2 v[2:3], v6, s[2:3]
-; GFX9-NEXT:    global_load_dwordx2 v[4:5], v6, s[0:1]
-; GFX9-NEXT:    s_mov_b32 s0, 0x3020706
+; GFX9-NEXT:    global_load_dwordx2 v[2:3], v6, s[0:1]
+; GFX9-NEXT:    global_load_dwordx2 v[4:5], v6, s[2:3]
+; GFX9-NEXT:    s_mov_b32 s0, 0x7060302
 ; GFX9-NEXT:    s_waitcnt vmcnt(2)
-; GFX9-NEXT:    v_lshlrev_b32_e32 v7, 16, v0
+; GFX9-NEXT:    v_and_b32_e32 v7, 0xffff0000, v0
 ; GFX9-NEXT:    s_waitcnt vmcnt(1)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v8, 16, v2
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_lshlrev_b32_e32 v9, 16, v4
-; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
-; GFX9-NEXT:    v_lshlrev_b32_e32 v11, 16, v5
+; GFX9-NEXT:    v_and_b32_e32 v9, 0xffff0000, v4
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; GFX9-NEXT:    v_lshlrev_b32_e32 v11, 16, v3
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v12, 16, v1
 ; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX9-NEXT:    v_fma_f32 v7, v9, v8, v7
-; GFX9-NEXT:    v_fma_f32 v0, v9, v2, v0
-; GFX9-NEXT:    v_fma_f32 v8, v11, v8, v12
-; GFX9-NEXT:    v_fma_f32 v1, v11, v2, v1
-; GFX9-NEXT:    v_and_b32_e32 v10, 0xffff0000, v3
-; GFX9-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
-; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX9-NEXT:    v_fma_f32 v7, v8, v9, v7
+; GFX9-NEXT:    v_fma_f32 v0, v8, v4, v0
+; GFX9-NEXT:    v_fma_f32 v4, v11, v4, v12
+; GFX9-NEXT:    v_fma_f32 v1, v11, v9, v1
+; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX9-NEXT:    v_lshlrev_b32_e32 v10, 16, v5
 ; GFX9-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
+; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff0000, v7
+; GFX9-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
 ; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX9-NEXT:    v_and_b32_e32 v7, 0xffff0000, v8
-; GFX9-NEXT:    v_fma_f32 v0, v4, v10, v0
-; GFX9-NEXT:    v_fma_f32 v2, v4, v3, v2
-; GFX9-NEXT:    v_fma_f32 v1, v5, v10, v1
-; GFX9-NEXT:    v_fma_f32 v3, v5, v3, v7
+; GFX9-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GFX9-NEXT:    v_fma_f32 v0, v2, v10, v0
+; GFX9-NEXT:    v_fma_f32 v2, v2, v5, v7
+; GFX9-NEXT:    v_fma_f32 v1, v3, v5, v1
+; GFX9-NEXT:    v_fma_f32 v3, v3, v10, v4
 ; GFX9-NEXT:    v_perm_b32 v0, v2, v0, s0
-; GFX9-NEXT:    v_perm_b32 v1, v3, v1, s0
+; GFX9-NEXT:    v_perm_b32 v1, v1, v3, s0
 ; GFX9-NEXT:    global_store_dwordx2 v6, v[0:1], s[6:7]
 ; GFX9-NEXT:    s_endpgm
 ;
@@ -4551,37 +4223,37 @@ define amdgpu_kernel void @fma_shuffle_v2bf16(ptr addrspace(1) nocapture readonl
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_clause 0x2
 ; GFX10-NEXT:    global_load_dwordx2 v[0:1], v6, s[6:7]
-; GFX10-NEXT:    global_load_dwordx2 v[2:3], v6, s[2:3]
-; GFX10-NEXT:    global_load_dwordx2 v[4:5], v6, s[0:1]
+; GFX10-NEXT:    global_load_dwordx2 v[2:3], v6, s[0:1]
+; GFX10-NEXT:    global_load_dwordx2 v[4:5], v6, s[2:3]
 ; GFX10-NEXT:    s_waitcnt vmcnt(2)
-; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 16, v0
+; GFX10-NEXT:    v_and_b32_e32 v7, 0xffff0000, v0
 ; GFX10-NEXT:    s_waitcnt vmcnt(1)
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v8, 16, v2
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_lshlrev_b32_e32 v9, 16, v4
-; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
-; GFX10-NEXT:    v_lshlrev_b32_e32 v11, 16, v5
+; GFX10-NEXT:    v_and_b32_e32 v9, 0xffff0000, v4
+; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; GFX10-NEXT:    v_lshlrev_b32_e32 v11, 16, v3
 ; GFX10-NEXT:    v_and_b32_e32 v12, 0xffff0000, v1
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX10-NEXT:    v_fmac_f32_e32 v7, v9, v8
-; GFX10-NEXT:    v_fmac_f32_e32 v0, v9, v2
-; GFX10-NEXT:    v_and_b32_e32 v10, 0xffff0000, v3
-; GFX10-NEXT:    v_fmac_f32_e32 v12, v11, v2
-; GFX10-NEXT:    v_fmac_f32_e32 v1, v11, v8
-; GFX10-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
-; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v5
+; GFX10-NEXT:    v_fmac_f32_e32 v7, v8, v9
+; GFX10-NEXT:    v_fmac_f32_e32 v0, v8, v4
+; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX10-NEXT:    v_fmac_f32_e32 v12, v11, v9
+; GFX10-NEXT:    v_fmac_f32_e32 v1, v11, v4
+; GFX10-NEXT:    v_lshlrev_b32_e32 v10, 16, v5
+; GFX10-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
+; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
 ; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX10-NEXT:    v_and_b32_e32 v5, 0xffff0000, v7
+; GFX10-NEXT:    v_and_b32_e32 v4, 0xffff0000, v7
 ; GFX10-NEXT:    v_and_b32_e32 v7, 0xffff0000, v12
 ; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX10-NEXT:    v_fmac_f32_e32 v0, v4, v10
-; GFX10-NEXT:    v_fmac_f32_e32 v5, v4, v3
-; GFX10-NEXT:    v_fmac_f32_e32 v7, v2, v10
-; GFX10-NEXT:    v_fmac_f32_e32 v1, v2, v3
-; GFX10-NEXT:    v_perm_b32 v0, v5, v0, 0x3020706
-; GFX10-NEXT:    v_perm_b32 v1, v1, v7, 0x3020706
+; GFX10-NEXT:    v_fmac_f32_e32 v0, v2, v10
+; GFX10-NEXT:    v_fmac_f32_e32 v4, v2, v5
+; GFX10-NEXT:    v_fmac_f32_e32 v7, v3, v5
+; GFX10-NEXT:    v_fmac_f32_e32 v1, v3, v10
+; GFX10-NEXT:    v_perm_b32 v0, v4, v0, 0x7060302
+; GFX10-NEXT:    v_perm_b32 v1, v7, v1, 0x7060302
 ; GFX10-NEXT:    global_store_dwordx2 v6, v[0:1], s[6:7]
 ; GFX10-NEXT:    s_endpgm
 ;
@@ -4594,37 +4266,39 @@ define amdgpu_kernel void @fma_shuffle_v2bf16(ptr addrspace(1) nocapture readonl
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_clause 0x2
 ; GFX11-NEXT:    global_load_b64 v[0:1], v6, s[4:5]
-; GFX11-NEXT:    global_load_b64 v[2:3], v6, s[2:3]
-; GFX11-NEXT:    global_load_b64 v[4:5], v6, s[0:1]
+; GFX11-NEXT:    global_load_b64 v[2:3], v6, s[0:1]
+; GFX11-NEXT:    global_load_b64 v[4:5], v6, s[2:3]
 ; GFX11-NEXT:    s_waitcnt vmcnt(2)
 ; GFX11-NEXT:    v_and_b32_e32 v12, 0xffff0000, v1
-; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v11, 16, v3
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_lshlrev_b32_e32 v11, 16, v5
-; GFX11-NEXT:    v_and_b32_e32 v10, 0xffff0000, v3
-; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-NEXT:    v_lshlrev_b32_e32 v10, 16, v5
+; GFX11-NEXT:    v_and_b32_e32 v9, 0xffff0000, v4
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v8, 16, v2
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
-; GFX11-NEXT:    v_lshlrev_b32_e32 v9, 16, v4
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_fmac_f32_e32 v1, v11, v8
-; GFX11-NEXT:    v_dual_fmac_f32 v12, v11, v2 :: v_dual_lshlrev_b32 v7, 16, v0
+; GFX11-NEXT:    v_and_b32_e32 v7, 0xffff0000, v0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_dual_fmac_f32 v1, v11, v4 :: v_dual_and_b32 v2, 0xffff0000, v2
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_dual_fmac_f32 v1, v3, v10 :: v_dual_fmac_f32 v0, v8, v4
 ; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_dual_fmac_f32 v0, v9, v2 :: v_dual_and_b32 v1, 0xffff0000, v1
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v5
-; GFX11-NEXT:    v_dual_fmac_f32 v1, v2, v3 :: v_dual_and_b32 v0, 0xffff0000, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_fmac_f32_e32 v0, v2, v10
+; GFX11-NEXT:    v_fmac_f32_e32 v12, v11, v9
+; GFX11-NEXT:    v_fmac_f32_e32 v7, v8, v9
+; GFX11-NEXT:    v_and_b32_e32 v4, 0xffff0000, v7
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_dual_fmac_f32 v7, v9, v8 :: v_dual_fmac_f32 v0, v4, v10
-; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff0000, v7
-; GFX11-NEXT:    v_and_b32_e32 v7, 0xffff0000, v12
+; GFX11-NEXT:    v_dual_fmac_f32 v4, v2, v5 :: v_dual_and_b32 v7, 0xffff0000, v12
+; GFX11-NEXT:    v_fmac_f32_e32 v7, v3, v5
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_fmac_f32_e32 v5, v4, v3
-; GFX11-NEXT:    v_fmac_f32_e32 v7, v2, v10
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_perm_b32 v0, v5, v0, 0x3020706
-; GFX11-NEXT:    v_perm_b32 v1, v1, v7, 0x3020706
+; GFX11-NEXT:    v_perm_b32 v0, v4, v0, 0x7060302
+; GFX11-NEXT:    v_perm_b32 v1, v7, v1, 0x7060302
 ; GFX11-NEXT:    global_store_b64 v6, v[0:1], s[4:5]
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -4662,40 +4336,32 @@ define <4 x bfloat> @shuffle_v4bf16_0456(ptr addrspace(1) %arg0, ptr addrspace(1
 ; GFX9-LABEL: shuffle_v4bf16_0456:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off
 ; GFX9-NEXT:    global_load_dword v6, v[0:1], off
-; GFX9-NEXT:    s_mov_b32 s4, 0x1000504
-; GFX9-NEXT:    s_mov_b32 s5, 0x3020706
-; GFX9-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-NEXT:    v_alignbit_b32 v1, v5, v4, 16
+; GFX9-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off
+; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_perm_b32 v0, v6, v4, s4
-; GFX9-NEXT:    v_perm_b32 v1, v4, v1, s5
+; GFX9-NEXT:    v_perm_b32 v0, v4, v6, s4
+; GFX9-NEXT:    v_alignbit_b32 v1, v5, v4, 16
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v4bf16_0456:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off
 ; GFX10-NEXT:    global_load_dword v6, v[0:1], off
-; GFX10-NEXT:    s_waitcnt vmcnt(1)
-; GFX10-NEXT:    v_alignbit_b32 v1, v5, v4, 16
+; GFX10-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_perm_b32 v0, v6, v4, 0x1000504
-; GFX10-NEXT:    v_perm_b32 v1, v4, v1, 0x3020706
+; GFX10-NEXT:    v_perm_b32 v0, v4, v6, 0x5040100
+; GFX10-NEXT:    v_alignbit_b32 v1, v5, v4, 16
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: shuffle_v4bf16_0456:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    global_load_b64 v[2:3], v[2:3], off
 ; GFX11-NEXT:    global_load_b32 v0, v[0:1], off
-; GFX11-NEXT:    s_waitcnt vmcnt(1)
-; GFX11-NEXT:    v_alignbit_b32 v1, v3, v2, 16
+; GFX11-NEXT:    global_load_b64 v[1:2], v[2:3], off
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_perm_b32 v0, v0, v2, 0x1000504
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT:    v_perm_b32 v1, v2, v1, 0x3020706
+; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
+; GFX11-NEXT:    v_alignbit_b32 v1, v2, v1, 16
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0
   %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1
@@ -4709,9 +4375,9 @@ define <2 x bfloat> @low16bits(ptr addrspace(1) %x0, ptr addrspace(1) %x1) {
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dword v4, v[0:1], off
 ; GFX9-NEXT:    global_load_dword v5, v[2:3], off
-; GFX9-NEXT:    s_mov_b32 s4, 0x1000504
+; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_perm_b32 v0, v4, v5, s4
+; GFX9-NEXT:    v_perm_b32 v0, v5, v4, s4
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: low16bits:
@@ -4720,7 +4386,7 @@ define <2 x bfloat> @low16bits(ptr addrspace(1) %x0, ptr addrspace(1) %x1) {
 ; GFX10-NEXT:    global_load_dword v4, v[0:1], off
 ; GFX10-NEXT:    global_load_dword v5, v[2:3], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_perm_b32 v0, v4, v5, 0x1000504
+; GFX10-NEXT:    v_perm_b32 v0, v5, v4, 0x5040100
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: low16bits:
@@ -4729,7 +4395,7 @@ define <2 x bfloat> @low16bits(ptr addrspace(1) %x0, ptr addrspace(1) %x1) {
 ; GFX11-NEXT:    global_load_b32 v0, v[0:1], off
 ; GFX11-NEXT:    global_load_b32 v1, v[2:3], off
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_perm_b32 v0, v0, v1, 0x1000504
+; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %0 = load <2 x bfloat>, ptr addrspace(1) %x0, align 4
@@ -4745,9 +4411,9 @@ define <2 x bfloat> @hi16bits_v2bf16(ptr addrspace(1) %x0, ptr addrspace(1) %x1)
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dword v4, v[0:1], off
 ; GFX9-NEXT:    global_load_dword v5, v[2:3], off
-; GFX9-NEXT:    s_mov_b32 s4, 0x3020706
+; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_perm_b32 v0, v4, v5, s4
+; GFX9-NEXT:    v_perm_b32 v0, v5, v4, s4
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: hi16bits_v2bf16:
@@ -4756,7 +4422,7 @@ define <2 x bfloat> @hi16bits_v2bf16(ptr addrspace(1) %x0, ptr addrspace(1) %x1)
 ; GFX10-NEXT:    global_load_dword v4, v[0:1], off
 ; GFX10-NEXT:    global_load_dword v5, v[2:3], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_perm_b32 v0, v4, v5, 0x3020706
+; GFX10-NEXT:    v_perm_b32 v0, v5, v4, 0x7060302
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: hi16bits_v2bf16:
@@ -4765,7 +4431,7 @@ define <2 x bfloat> @hi16bits_v2bf16(ptr addrspace(1) %x0, ptr addrspace(1) %x1)
 ; GFX11-NEXT:    global_load_b32 v0, v[0:1], off
 ; GFX11-NEXT:    global_load_b32 v1, v[2:3], off
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_perm_b32 v0, v0, v1, 0x3020706
+; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x7060302
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %0 = load <2 x bfloat>, ptr addrspace(1) %x0, align 4
@@ -4779,29 +4445,29 @@ define <2 x bfloat> @low16hi16bits_v2bf16(ptr addrspace(1) %x0, ptr addrspace(1)
 ; GFX9-LABEL: low16hi16bits_v2bf16:
 ; GFX9:       ; %bb.0: ; %entry
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v4, v[0:1], off
-; GFX9-NEXT:    global_load_dword v5, v[2:3], off
-; GFX9-NEXT:    s_mov_b32 s4, 0x3020504
+; GFX9-NEXT:    global_load_dword v4, v[2:3], off
+; GFX9-NEXT:    global_load_dword v5, v[0:1], off
+; GFX9-NEXT:    s_mov_b32 s4, 0xffff
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_perm_b32 v0, v4, v5, s4
+; GFX9-NEXT:    v_bfi_b32 v0, s4, v5, v4
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: low16hi16bits_v2bf16:
 ; GFX10:       ; %bb.0: ; %entry
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v4, v[0:1], off
-; GFX10-NEXT:    global_load_dword v5, v[2:3], off
+; GFX10-NEXT:    global_load_dword v4, v[2:3], off
+; GFX10-NEXT:    global_load_dword v5, v[0:1], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_perm_b32 v0, v4, v5, 0x3020504
+; GFX10-NEXT:    v_bfi_b32 v0, 0xffff, v5, v4
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: low16hi16bits_v2bf16:
 ; GFX11:       ; %bb.0: ; %entry
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    global_load_b32 v2, v[2:3], off
 ; GFX11-NEXT:    global_load_b32 v0, v[0:1], off
-; GFX11-NEXT:    global_load_b32 v1, v[2:3], off
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_perm_b32 v0, v0, v1, 0x3020504
+; GFX11-NEXT:    v_bfi_b32 v0, 0xffff, v0, v2
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %0 = load <2 x bfloat>, ptr addrspace(1) %x0, align 4

>From 33531128c11f94c64ff872d5133eb21933cf24c7 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault at amd.com>
Date: Fri, 22 Dec 2023 10:46:19 +0700
Subject: [PATCH 3/4] AMDGPU: Make v4bf16 a legal type

Gets a few code quality improvements. A few cases are worse
from losing load narrowing.
---
 llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp |    19 +-
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp     |    25 +-
 llvm/lib/Target/AMDGPU/SIInstructions.td      |    13 +
 llvm/test/CodeGen/AMDGPU/bf16.ll              | 11487 +++++++---------
 llvm/test/CodeGen/AMDGPU/function-args.ll     |    61 +-
 .../CodeGen/AMDGPU/llvm.is.fpclass.bf16.ll    |   101 +-
 llvm/test/CodeGen/AMDGPU/select-undef.ll      |     3 +-
 .../CodeGen/AMDGPU/vector_shuffle.packed.ll   |   399 +-
 8 files changed, 5678 insertions(+), 6430 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 2225bce25edf5c..5d39e7366f7cde 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -389,15 +389,16 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
                      Custom);
   setOperationAction(
       ISD::EXTRACT_SUBVECTOR,
-      {MVT::v2f16,  MVT::v2i16,  MVT::v4f16,  MVT::v4i16,  MVT::v2f32,
-       MVT::v2i32,  MVT::v3f32,  MVT::v3i32,  MVT::v4f32,  MVT::v4i32,
-       MVT::v5f32,  MVT::v5i32,  MVT::v6f32,  MVT::v6i32,  MVT::v7f32,
-       MVT::v7i32,  MVT::v8f32,  MVT::v8i32,  MVT::v9f32,  MVT::v9i32,
-       MVT::v10i32, MVT::v10f32, MVT::v11i32, MVT::v11f32, MVT::v12i32,
-       MVT::v12f32, MVT::v16f16, MVT::v16i16, MVT::v16f32, MVT::v16i32,
-       MVT::v32f32, MVT::v32i32, MVT::v2f64,  MVT::v2i64,  MVT::v3f64,
-       MVT::v3i64,  MVT::v4f64,  MVT::v4i64,  MVT::v8f64,  MVT::v8i64,
-       MVT::v16f64, MVT::v16i64, MVT::v32i16, MVT::v32f16},
+      {MVT::v2f16,  MVT::v2i16,  MVT::v2bf16, MVT::v4f16,  MVT::v4i16,
+       MVT::v4bf16, MVT::v2f32,  MVT::v2i32,  MVT::v3f32,  MVT::v3i32,
+       MVT::v4f32,  MVT::v4i32,  MVT::v5f32,  MVT::v5i32,  MVT::v6f32,
+       MVT::v6i32,  MVT::v7f32,  MVT::v7i32,  MVT::v8f32,  MVT::v8i32,
+       MVT::v9f32,  MVT::v9i32,  MVT::v10i32, MVT::v10f32, MVT::v11i32,
+       MVT::v11f32, MVT::v12i32, MVT::v12f32, MVT::v16f16, MVT::v16i16,
+       MVT::v16f32, MVT::v16i32, MVT::v32f32, MVT::v32i32, MVT::v2f64,
+       MVT::v2i64,  MVT::v3f64,  MVT::v3i64,  MVT::v4f64,  MVT::v4i64,
+       MVT::v8f64,  MVT::v8i64,  MVT::v16f64, MVT::v16i64, MVT::v32i16,
+       MVT::v32f16},
       Custom);
 
   setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 91f347be68fa96..684269e3a4a72d 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -164,6 +164,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
     addRegisterClass(MVT::v2bf16, &AMDGPU::SReg_32RegClass);
     addRegisterClass(MVT::v4i16, &AMDGPU::SReg_64RegClass);
     addRegisterClass(MVT::v4f16, &AMDGPU::SReg_64RegClass);
+    addRegisterClass(MVT::v4bf16, &AMDGPU::SReg_64RegClass);
     addRegisterClass(MVT::v8i16, &AMDGPU::SGPR_128RegClass);
     addRegisterClass(MVT::v8f16, &AMDGPU::SGPR_128RegClass);
     addRegisterClass(MVT::v16i16, &AMDGPU::SGPR_256RegClass);
@@ -312,10 +313,10 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
        {MVT::v8i32,  MVT::v8f32,  MVT::v9i32,  MVT::v9f32,  MVT::v10i32,
         MVT::v10f32, MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32,
         MVT::v16i32, MVT::v16f32, MVT::v2i64,  MVT::v2f64,  MVT::v4i16,
-        MVT::v4f16,  MVT::v3i64,  MVT::v3f64,  MVT::v6i32,  MVT::v6f32,
-        MVT::v4i64,  MVT::v4f64,  MVT::v8i64,  MVT::v8f64,  MVT::v8i16,
-        MVT::v8f16,  MVT::v16i16, MVT::v16f16, MVT::v16i64, MVT::v16f64,
-        MVT::v32i32, MVT::v32f32, MVT::v32i16, MVT::v32f16}) {
+        MVT::v4f16,  MVT::v4bf16, MVT::v3i64,  MVT::v3f64,  MVT::v6i32,
+        MVT::v6f32,  MVT::v4i64,  MVT::v4f64,  MVT::v8i64,  MVT::v8f64,
+        MVT::v8i16,  MVT::v8f16,  MVT::v16i16, MVT::v16f16, MVT::v16i64,
+        MVT::v16f64, MVT::v32i32, MVT::v32f32, MVT::v32i16, MVT::v32f16}) {
     for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
       switch (Op) {
       case ISD::LOAD:
@@ -421,13 +422,14 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
                      {MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32},
                      Expand);
 
-  setOperationAction(ISD::BUILD_VECTOR, {MVT::v4f16, MVT::v4i16}, Custom);
+  setOperationAction(ISD::BUILD_VECTOR, {MVT::v4f16, MVT::v4i16, MVT::v4bf16},
+                     Custom);
 
   // Avoid stack access for these.
   // TODO: Generalize to more vector types.
   setOperationAction({ISD::EXTRACT_VECTOR_ELT, ISD::INSERT_VECTOR_ELT},
                      {MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v2i8, MVT::v4i8,
-                      MVT::v8i8, MVT::v4i16, MVT::v4f16},
+                      MVT::v8i8, MVT::v4i16, MVT::v4f16, MVT::v4bf16},
                      Custom);
 
   // Deal with vec3 vector operations when widened to vec4.
@@ -667,11 +669,15 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
     AddPromotedToType(ISD::LOAD, MVT::v4i16, MVT::v2i32);
     setOperationAction(ISD::LOAD, MVT::v4f16, Promote);
     AddPromotedToType(ISD::LOAD, MVT::v4f16, MVT::v2i32);
+    setOperationAction(ISD::LOAD, MVT::v4bf16, Promote);
+    AddPromotedToType(ISD::LOAD, MVT::v4bf16, MVT::v2i32);
 
     setOperationAction(ISD::STORE, MVT::v4i16, Promote);
     AddPromotedToType(ISD::STORE, MVT::v4i16, MVT::v2i32);
     setOperationAction(ISD::STORE, MVT::v4f16, Promote);
     AddPromotedToType(ISD::STORE, MVT::v4f16, MVT::v2i32);
+    setOperationAction(ISD::STORE, MVT::v4bf16, Promote);
+    AddPromotedToType(ISD::STORE, MVT::v4bf16, MVT::v2i32);
 
     setOperationAction(ISD::LOAD, MVT::v8i16, Promote);
     AddPromotedToType(ISD::LOAD, MVT::v8i16, MVT::v4i32);
@@ -781,7 +787,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
                        Custom);
 
     setOperationAction(ISD::FEXP, MVT::v2f16, Custom);
-    setOperationAction(ISD::SELECT, {MVT::v4i16, MVT::v4f16}, Custom);
+    setOperationAction(ISD::SELECT, {MVT::v4i16, MVT::v4f16, MVT::v4bf16},
+                       Custom);
 
     if (Subtarget->hasPackedFP32Ops()) {
       setOperationAction({ISD::FADD, ISD::FMUL, ISD::FMA, ISD::FNEG},
@@ -6804,7 +6811,7 @@ SDValue SITargetLowering::lowerBUILD_VECTOR(SDValue Op,
   SDLoc SL(Op);
   EVT VT = Op.getValueType();
 
-  if (VT == MVT::v4i16 || VT == MVT::v4f16 ||
+  if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16 ||
       VT == MVT::v8i16 || VT == MVT::v8f16) {
     EVT HalfVT = MVT::getVectorVT(VT.getVectorElementType().getSimpleVT(),
                                   VT.getVectorNumElements() / 2);
@@ -6870,7 +6877,7 @@ SDValue SITargetLowering::lowerBUILD_VECTOR(SDValue Op,
     return DAG.getNode(ISD::BITCAST, SL, VT, Blend);
   }
 
-  assert(VT == MVT::v2f16 || VT == MVT::v2i16);
+  assert(VT == MVT::v2f16 || VT == MVT::v2i16 || VT == MVT::v2bf16);
   assert(!Subtarget->hasVOP3PInsts() && "this should be legal");
 
   SDValue Lo = Op.getOperand(0);
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 5d84320c0de0e3..bf1f0d6abe477d 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -1548,6 +1548,19 @@ def : BitConvert <f64, v2i32, VReg_64>;
 def : BitConvert <v2i32, f64, VReg_64>;
 def : BitConvert <v4i16, v4f16, VReg_64>;
 def : BitConvert <v4f16, v4i16, VReg_64>;
+def : BitConvert <v4bf16, v2i32, VReg_64>;
+def : BitConvert <v2i32, v4bf16, VReg_64>;
+def : BitConvert <v4bf16, i64, VReg_64>;
+def : BitConvert <i64, v4bf16, VReg_64>;
+def : BitConvert <v4bf16, v4i16, VReg_64>;
+def : BitConvert <v4i16, v4bf16, VReg_64>;
+def : BitConvert <v4bf16, v4f16, VReg_64>;
+def : BitConvert <v4f16, v4bf16, VReg_64>;
+def : BitConvert <v4bf16, v2f32, VReg_64>;
+def : BitConvert <v2f32, v4bf16, VReg_64>;
+def : BitConvert <v4bf16, f64, VReg_64>;
+def : BitConvert <f64, v4bf16, VReg_64>;
+
 
 // FIXME: Make SGPR
 def : BitConvert <v2i32, v4f16, VReg_64>;
diff --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll
index 5243262117e728..2a3417e2418552 100644
--- a/llvm/test/CodeGen/AMDGPU/bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/bf16.ll
@@ -4133,9 +4133,7 @@ define void @test_call_v4bf16(<4 x bfloat> %in, ptr addrspace(5) %out) {
 ; GFX11-NEXT:    v_writelane_b32 v3, s31, 1
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT:    scratch_store_b32 v2, v1, off offset:4 dlc
-; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    scratch_store_b32 v2, v0, off dlc
+; GFX11-NEXT:    scratch_store_b64 v2, v[0:1], off dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    v_readlane_b32 s31, v3, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v3, 0
@@ -4397,18 +4395,12 @@ define void @test_call_v8bf16(<8 x bfloat> %in, ptr addrspace(5) %out) {
 ; GFX11-NEXT:    v_writelane_b32 v5, s31, 1
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT:    v_add_nc_u32_e32 v6, 12, v4
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT:    v_readlane_b32 s31, v5, 1
-; GFX11-NEXT:    v_readlane_b32 s30, v5, 0
-; GFX11-NEXT:    scratch_store_b32 v6, v3, off dlc
-; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    scratch_store_b32 v4, v2, off offset:8 dlc
+; GFX11-NEXT:    scratch_store_b64 v4, v[2:3], off offset:8 dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    scratch_store_b32 v4, v1, off offset:4 dlc
-; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    scratch_store_b32 v4, v0, off dlc
+; GFX11-NEXT:    scratch_store_b64 v4, v[0:1], off dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    v_readlane_b32 s31, v5, 1
+; GFX11-NEXT:    v_readlane_b32 s30, v5, 0
 ; GFX11-NEXT:    s_xor_saveexec_b32 s0, -1
 ; GFX11-NEXT:    scratch_load_b32 v5, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s0
@@ -4759,28 +4751,18 @@ define void @test_call_v16bf16(<16 x bfloat> %in, ptr addrspace(5) %out) {
 ; GFX11-NEXT:    v_writelane_b32 v9, s31, 1
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT:    v_add_nc_u32_e32 v10, 28, v8
-; GFX11-NEXT:    v_add_nc_u32_e32 v11, 24, v8
-; GFX11-NEXT:    v_add_nc_u32_e32 v12, 20, v8
-; GFX11-NEXT:    v_add_nc_u32_e32 v13, 12, v8
+; GFX11-NEXT:    v_add_nc_u32_e32 v10, 24, v8
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-NEXT:    v_readlane_b32 s31, v9, 1
-; GFX11-NEXT:    scratch_store_b32 v10, v7, off dlc
-; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    scratch_store_b32 v11, v6, off dlc
-; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    scratch_store_b32 v12, v5, off dlc
-; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    scratch_store_b32 v8, v4, off offset:16 dlc
-; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    scratch_store_b32 v13, v3, off dlc
+; GFX11-NEXT:    v_readlane_b32 s30, v9, 0
+; GFX11-NEXT:    scratch_store_b64 v10, v[6:7], off dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    scratch_store_b32 v8, v2, off offset:8 dlc
+; GFX11-NEXT:    scratch_store_b64 v8, v[4:5], off offset:16 dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    scratch_store_b32 v8, v1, off offset:4 dlc
+; GFX11-NEXT:    scratch_store_b64 v8, v[2:3], off offset:8 dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    scratch_store_b32 v8, v0, off dlc
+; GFX11-NEXT:    scratch_store_b64 v8, v[0:1], off dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    v_readlane_b32 s30, v9, 0
 ; GFX11-NEXT:    s_xor_saveexec_b32 s0, -1
 ; GFX11-NEXT:    scratch_load_b32 v9, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s0
@@ -8691,83 +8673,52 @@ define <3 x bfloat> @v_fadd_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) {
 ; GFX8-LABEL: v_fadd_v3bf16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
-; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-NEXT:    v_add_f32_e32 v1, v1, v3
+; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
+; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v0
 ; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GFX8-NEXT:    v_add_f32_e32 v0, v0, v2
-; GFX8-NEXT:    v_add_f32_e32 v4, v5, v4
+; GFX8-NEXT:    v_add_f32_e32 v3, v4, v3
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX8-NEXT:    v_alignbit_b32 v0, v0, v4, 16
-; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
-; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v1
-; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
-; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX8-NEXT:    v_add_f32_e32 v1, v1, v3
-; GFX8-NEXT:    v_add_f32_e32 v2, v4, v2
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX8-NEXT:    v_alignbit_b32 v1, v1, v2, 16
+; GFX8-NEXT:    v_alignbit_b32 v0, v0, v3, 16
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_fadd_v3bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
-; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
+; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX9-NEXT:    v_add_f32_e32 v1, v1, v3
+; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
+; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 16, v0
 ; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX9-NEXT:    v_add_f32_e32 v4, v5, v4
+; GFX9-NEXT:    v_add_f32_e32 v3, v4, v3
 ; GFX9-NEXT:    v_add_f32_e32 v0, v0, v2
 ; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
-; GFX9-NEXT:    v_perm_b32 v0, v0, v4, s4
-; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
-; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 16, v1
-; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
-; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX9-NEXT:    v_add_f32_e32 v2, v4, v2
-; GFX9-NEXT:    v_add_f32_e32 v1, v1, v3
-; GFX9-NEXT:    v_perm_b32 v1, v1, v2, s4
+; GFX9-NEXT:    v_perm_b32 v0, v0, v3, s4
+; GFX9-NEXT:    v_alignbit_b32 v1, s4, v1, 16
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fadd_v3bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
 ; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 16, v3
-; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 16, v1
-; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
-; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX10-NEXT:    v_add_f32_e32 v4, v5, v4
 ; GFX10-NEXT:    v_add_f32_e32 v0, v0, v2
-; GFX10-NEXT:    v_add_f32_e32 v2, v7, v6
 ; GFX10-NEXT:    v_add_f32_e32 v1, v1, v3
 ; GFX10-NEXT:    v_perm_b32 v0, v0, v4, 0x7060302
-; GFX10-NEXT:    v_perm_b32 v1, v1, v2, 0x7060302
+; GFX10-NEXT:    v_alignbit_b32 v1, s4, v1, 16
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: v_fadd_v3bf16:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_lshlrev_b32_e32 v6, 16, v3
-; GFX11-NEXT:    v_lshlrev_b32_e32 v7, 16, v1
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
-; GFX11-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_dual_add_f32 v1, v1, v3 :: v_dual_and_b32 v0, 0xffff0000, v0
-; GFX11-NEXT:    v_add_f32_e32 v0, v0, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_add_f32_e32 v4, v5, v4
-; GFX11-NEXT:    v_add_f32_e32 v2, v7, v6
-; GFX11-NEXT:    v_perm_b32 v0, v0, v4, 0x7060302
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT:    v_perm_b32 v1, v1, v2, 0x7060302
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = fadd <3 x bfloat> %a, %b
   ret <3 x bfloat> %op
 }
@@ -8818,82 +8769,81 @@ define <4 x bfloat> @v_fadd_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
 ; GFX8-LABEL: v_fadd_v4bf16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
+; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
+; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX8-NEXT:    v_add_f32_e32 v4, v5, v4
+; GFX8-NEXT:    v_add_f32_e32 v1, v1, v3
+; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
 ; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GFX8-NEXT:    v_add_f32_e32 v0, v0, v2
-; GFX8-NEXT:    v_add_f32_e32 v4, v5, v4
-; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX8-NEXT:    v_alignbit_b32 v0, v0, v4, 16
-; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
-; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v1
-; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
-; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX8-NEXT:    v_add_f32_e32 v1, v1, v3
-; GFX8-NEXT:    v_add_f32_e32 v2, v4, v2
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX8-NEXT:    v_alignbit_b32 v1, v1, v2, 16
+; GFX8-NEXT:    v_add_f32_e32 v3, v5, v3
+; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX8-NEXT:    v_alignbit_b32 v0, v0, v3, 16
+; GFX8-NEXT:    v_alignbit_b32 v1, v1, v4, 16
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_fadd_v4bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
+; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
+; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX9-NEXT:    v_add_f32_e32 v4, v5, v4
+; GFX9-NEXT:    v_add_f32_e32 v1, v1, v3
+; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
 ; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX9-NEXT:    v_add_f32_e32 v4, v5, v4
+; GFX9-NEXT:    v_add_f32_e32 v3, v5, v3
 ; GFX9-NEXT:    v_add_f32_e32 v0, v0, v2
 ; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
-; GFX9-NEXT:    v_perm_b32 v0, v0, v4, s4
-; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
-; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 16, v1
-; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
-; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX9-NEXT:    v_add_f32_e32 v2, v4, v2
-; GFX9-NEXT:    v_add_f32_e32 v1, v1, v3
-; GFX9-NEXT:    v_perm_b32 v1, v1, v2, s4
+; GFX9-NEXT:    v_perm_b32 v0, v0, v3, s4
+; GFX9-NEXT:    v_perm_b32 v1, v1, v4, s4
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fadd_v4bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
-; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
+; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
+; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 16, v2
+; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 16, v0
 ; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 16, v3
-; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 16, v1
-; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
 ; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX10-NEXT:    v_add_f32_e32 v4, v5, v4
+; GFX10-NEXT:    v_add_f32_e32 v5, v7, v6
 ; GFX10-NEXT:    v_add_f32_e32 v0, v0, v2
-; GFX10-NEXT:    v_add_f32_e32 v2, v7, v6
 ; GFX10-NEXT:    v_add_f32_e32 v1, v1, v3
-; GFX10-NEXT:    v_perm_b32 v0, v0, v4, 0x7060302
-; GFX10-NEXT:    v_perm_b32 v1, v1, v2, 0x7060302
+; GFX10-NEXT:    v_perm_b32 v0, v0, v5, 0x7060302
+; GFX10-NEXT:    v_perm_b32 v1, v1, v4, 0x7060302
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fadd_v4bf16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_lshlrev_b32_e32 v6, 16, v3
-; GFX11-NEXT:    v_lshlrev_b32_e32 v7, 16, v1
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX11-NEXT:    v_lshlrev_b32_e32 v6, 16, v2
+; GFX11-NEXT:    v_lshlrev_b32_e32 v7, 16, v0
 ; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
-; GFX11-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_dual_add_f32 v1, v1, v3 :: v_dual_and_b32 v0, 0xffff0000, v0
-; GFX11-NEXT:    v_add_f32_e32 v0, v0, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_add_f32_e32 v4, v5, v4
-; GFX11-NEXT:    v_add_f32_e32 v2, v7, v6
-; GFX11-NEXT:    v_perm_b32 v0, v0, v4, 0x7060302
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
+; GFX11-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_dual_add_f32 v0, v0, v2 :: v_dual_and_b32 v1, 0xffff0000, v1
+; GFX11-NEXT:    v_dual_add_f32 v4, v5, v4 :: v_dual_and_b32 v3, 0xffff0000, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_add_f32_e32 v1, v1, v3
+; GFX11-NEXT:    v_add_f32_e32 v5, v7, v6
+; GFX11-NEXT:    v_perm_b32 v1, v1, v4, 0x7060302
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT:    v_perm_b32 v1, v1, v2, 0x7060302
+; GFX11-NEXT:    v_perm_b32 v0, v0, v5, 0x7060302
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = fadd <4 x bfloat> %a, %b
   ret <4 x bfloat> %op
@@ -8977,138 +8927,138 @@ define <8 x bfloat> @v_fadd_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
 ; GFX8-LABEL: v_fadd_v8bf16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_lshlrev_b32_e32 v8, 16, v4
-; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 16, v0
-; GFX8-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
-; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX8-NEXT:    v_add_f32_e32 v0, v0, v4
+; GFX8-NEXT:    v_lshlrev_b32_e32 v8, 16, v7
+; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 16, v3
+; GFX8-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
+; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
 ; GFX8-NEXT:    v_add_f32_e32 v8, v9, v8
-; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX8-NEXT:    v_alignbit_b32 v0, v0, v8, 16
-; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v5
-; GFX8-NEXT:    v_lshlrev_b32_e32 v8, 16, v1
+; GFX8-NEXT:    v_add_f32_e32 v3, v3, v7
+; GFX8-NEXT:    v_lshlrev_b32_e32 v7, 16, v6
+; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 16, v2
+; GFX8-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
+; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX8-NEXT:    v_add_f32_e32 v7, v9, v7
+; GFX8-NEXT:    v_add_f32_e32 v2, v2, v6
+; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v5
+; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 16, v1
 ; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
 ; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX8-NEXT:    v_add_f32_e32 v6, v9, v6
 ; GFX8-NEXT:    v_add_f32_e32 v1, v1, v5
-; GFX8-NEXT:    v_add_f32_e32 v4, v8, v4
-; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX8-NEXT:    v_alignbit_b32 v1, v1, v4, 16
-; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v6
-; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v2
-; GFX8-NEXT:    v_add_f32_e32 v4, v5, v4
-; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
-; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
-; GFX8-NEXT:    v_add_f32_e32 v2, v2, v5
-; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
-; GFX8-NEXT:    v_alignbit_b32 v2, v2, v4, 16
-; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v7
-; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v3
-; GFX8-NEXT:    v_add_f32_e32 v4, v5, v4
-; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v7
-; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
-; GFX8-NEXT:    v_add_f32_e32 v3, v3, v5
+; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v4
+; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 16, v0
+; GFX8-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    v_add_f32_e32 v0, v0, v4
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
-; GFX8-NEXT:    v_alignbit_b32 v3, v3, v4, 16
+; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX8-NEXT:    v_add_f32_e32 v5, v9, v5
+; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX8-NEXT:    v_alignbit_b32 v0, v0, v5, 16
+; GFX8-NEXT:    v_alignbit_b32 v1, v1, v6, 16
+; GFX8-NEXT:    v_alignbit_b32 v2, v2, v7, 16
+; GFX8-NEXT:    v_alignbit_b32 v3, v3, v8, 16
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_fadd_v8bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_lshlrev_b32_e32 v8, 16, v4
+; GFX9-NEXT:    v_lshlrev_b32_e32 v8, 16, v7
+; GFX9-NEXT:    v_lshlrev_b32_e32 v9, 16, v3
+; GFX9-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
+; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX9-NEXT:    v_add_f32_e32 v8, v9, v8
+; GFX9-NEXT:    v_add_f32_e32 v3, v3, v7
+; GFX9-NEXT:    v_lshlrev_b32_e32 v7, 16, v6
+; GFX9-NEXT:    v_lshlrev_b32_e32 v9, 16, v2
+; GFX9-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
+; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX9-NEXT:    v_add_f32_e32 v7, v9, v7
+; GFX9-NEXT:    v_add_f32_e32 v2, v2, v6
+; GFX9-NEXT:    v_lshlrev_b32_e32 v6, 16, v5
+; GFX9-NEXT:    v_lshlrev_b32_e32 v9, 16, v1
+; GFX9-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
+; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX9-NEXT:    v_add_f32_e32 v6, v9, v6
+; GFX9-NEXT:    v_add_f32_e32 v1, v1, v5
+; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 16, v4
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v9, 16, v0
 ; GFX9-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX9-NEXT:    v_add_f32_e32 v8, v9, v8
+; GFX9-NEXT:    v_add_f32_e32 v5, v9, v5
 ; GFX9-NEXT:    v_add_f32_e32 v0, v0, v4
 ; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
-; GFX9-NEXT:    v_perm_b32 v0, v0, v8, s4
-; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 16, v5
-; GFX9-NEXT:    v_lshlrev_b32_e32 v8, 16, v1
-; GFX9-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
-; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX9-NEXT:    v_add_f32_e32 v4, v8, v4
-; GFX9-NEXT:    v_add_f32_e32 v1, v1, v5
-; GFX9-NEXT:    v_perm_b32 v1, v1, v4, s4
-; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 16, v6
-; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 16, v2
-; GFX9-NEXT:    v_add_f32_e32 v4, v5, v4
-; GFX9-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
-; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
-; GFX9-NEXT:    v_add_f32_e32 v2, v2, v5
-; GFX9-NEXT:    v_perm_b32 v2, v2, v4, s4
-; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 16, v7
-; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 16, v3
-; GFX9-NEXT:    v_add_f32_e32 v4, v5, v4
-; GFX9-NEXT:    v_and_b32_e32 v5, 0xffff0000, v7
-; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
-; GFX9-NEXT:    v_add_f32_e32 v3, v3, v5
-; GFX9-NEXT:    v_perm_b32 v3, v3, v4, s4
+; GFX9-NEXT:    v_perm_b32 v0, v0, v5, s4
+; GFX9-NEXT:    v_perm_b32 v1, v1, v6, s4
+; GFX9-NEXT:    v_perm_b32 v2, v2, v7, s4
+; GFX9-NEXT:    v_perm_b32 v3, v3, v8, s4
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fadd_v8bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_lshlrev_b32_e32 v8, 16, v4
-; GFX10-NEXT:    v_lshlrev_b32_e32 v9, 16, v0
-; GFX10-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
-; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v8, 16, v7
+; GFX10-NEXT:    v_lshlrev_b32_e32 v9, 16, v3
+; GFX10-NEXT:    v_lshlrev_b32_e32 v10, 16, v6
+; GFX10-NEXT:    v_lshlrev_b32_e32 v11, 16, v2
+; GFX10-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
+; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX10-NEXT:    v_add_f32_e32 v8, v9, v8
+; GFX10-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
+; GFX10-NEXT:    v_add_f32_e32 v9, v11, v10
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v10, 16, v5
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v11, 16, v1
-; GFX10-NEXT:    v_add_f32_e32 v8, v9, v8
 ; GFX10-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
-; GFX10-NEXT:    v_add_f32_e32 v0, v0, v4
+; GFX10-NEXT:    v_lshlrev_b32_e32 v12, 16, v4
+; GFX10-NEXT:    v_lshlrev_b32_e32 v13, 16, v0
+; GFX10-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX10-NEXT:    v_add_f32_e32 v4, v11, v10
-; GFX10-NEXT:    v_lshlrev_b32_e32 v9, 16, v6
-; GFX10-NEXT:    v_lshlrev_b32_e32 v10, 16, v2
-; GFX10-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
 ; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
-; GFX10-NEXT:    v_lshlrev_b32_e32 v11, 16, v7
-; GFX10-NEXT:    v_lshlrev_b32_e32 v12, 16, v3
-; GFX10-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
-; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX10-NEXT:    v_add_f32_e32 v10, v11, v10
+; GFX10-NEXT:    v_add_f32_e32 v11, v13, v12
+; GFX10-NEXT:    v_add_f32_e32 v0, v0, v4
 ; GFX10-NEXT:    v_add_f32_e32 v1, v1, v5
-; GFX10-NEXT:    v_add_f32_e32 v5, v10, v9
 ; GFX10-NEXT:    v_add_f32_e32 v2, v2, v6
-; GFX10-NEXT:    v_add_f32_e32 v6, v12, v11
 ; GFX10-NEXT:    v_add_f32_e32 v3, v3, v7
-; GFX10-NEXT:    v_perm_b32 v0, v0, v8, 0x7060302
-; GFX10-NEXT:    v_perm_b32 v1, v1, v4, 0x7060302
-; GFX10-NEXT:    v_perm_b32 v2, v2, v5, 0x7060302
-; GFX10-NEXT:    v_perm_b32 v3, v3, v6, 0x7060302
+; GFX10-NEXT:    v_perm_b32 v0, v0, v11, 0x7060302
+; GFX10-NEXT:    v_perm_b32 v1, v1, v10, 0x7060302
+; GFX10-NEXT:    v_perm_b32 v2, v2, v9, 0x7060302
+; GFX10-NEXT:    v_perm_b32 v3, v3, v8, 0x7060302
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fadd_v8bf16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v9, 16, v3
+; GFX11-NEXT:    v_lshlrev_b32_e32 v11, 16, v2
+; GFX11-NEXT:    v_lshlrev_b32_e32 v12, 16, v4
+; GFX11-NEXT:    v_lshlrev_b32_e32 v13, 16, v0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v10, 16, v6
+; GFX11-NEXT:    v_lshlrev_b32_e32 v8, 16, v7
+; GFX11-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX11-NEXT:    v_dual_add_f32 v8, v9, v8 :: v_dual_add_f32 v9, v11, v10
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_add_f32_e32 v0, v0, v4
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v10, 16, v5
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v11, 16, v1
-; GFX11-NEXT:    v_lshlrev_b32_e32 v9, 16, v0
 ; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
-; GFX11-NEXT:    v_lshlrev_b32_e32 v12, 16, v3
 ; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX11-NEXT:    v_lshlrev_b32_e32 v8, 16, v4
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_dual_add_f32 v1, v1, v5 :: v_dual_and_b32 v0, 0xffff0000, v0
-; GFX11-NEXT:    v_add_f32_e32 v0, v0, v4
-; GFX11-NEXT:    v_add_f32_e32 v4, v11, v10
-; GFX11-NEXT:    v_dual_add_f32 v8, v9, v8 :: v_dual_lshlrev_b32 v9, 16, v6
-; GFX11-NEXT:    v_lshlrev_b32_e32 v10, 16, v2
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
-; GFX11-NEXT:    v_lshlrev_b32_e32 v11, 16, v7
-; GFX11-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
-; GFX11-NEXT:    v_add_f32_e32 v5, v10, v9
-; GFX11-NEXT:    v_perm_b32 v0, v0, v8, 0x7060302
-; GFX11-NEXT:    v_add_f32_e32 v2, v2, v6
-; GFX11-NEXT:    v_add_f32_e32 v6, v12, v11
+; GFX11-NEXT:    v_dual_add_f32 v1, v1, v5 :: v_dual_and_b32 v6, 0xffff0000, v6
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_dual_add_f32 v2, v2, v6 :: v_dual_and_b32 v3, 0xffff0000, v3
 ; GFX11-NEXT:    v_add_f32_e32 v3, v3, v7
-; GFX11-NEXT:    v_perm_b32 v1, v1, v4, 0x7060302
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_perm_b32 v2, v2, v5, 0x7060302
-; GFX11-NEXT:    v_perm_b32 v3, v3, v6, 0x7060302
+; GFX11-NEXT:    v_dual_add_f32 v10, v11, v10 :: v_dual_add_f32 v11, v13, v12
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_perm_b32 v2, v2, v9, 0x7060302
+; GFX11-NEXT:    v_perm_b32 v3, v3, v8, 0x7060302
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_perm_b32 v1, v1, v10, 0x7060302
+; GFX11-NEXT:    v_perm_b32 v0, v0, v11, 0x7060302
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = fadd <8 x bfloat> %a, %b
   ret <8 x bfloat> %op
@@ -9260,252 +9210,254 @@ define <16 x bfloat> @v_fadd_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
 ; GFX8-LABEL: v_fadd_v16bf16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_lshlrev_b32_e32 v16, 16, v8
-; GFX8-NEXT:    v_lshlrev_b32_e32 v17, 16, v0
-; GFX8-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
-; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX8-NEXT:    v_add_f32_e32 v0, v0, v8
+; GFX8-NEXT:    v_lshlrev_b32_e32 v16, 16, v15
+; GFX8-NEXT:    v_lshlrev_b32_e32 v17, 16, v7
+; GFX8-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
+; GFX8-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
 ; GFX8-NEXT:    v_add_f32_e32 v16, v17, v16
-; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX8-NEXT:    v_alignbit_b32 v0, v0, v16, 16
-; GFX8-NEXT:    v_lshlrev_b32_e32 v8, 16, v9
-; GFX8-NEXT:    v_lshlrev_b32_e32 v16, 16, v1
+; GFX8-NEXT:    v_add_f32_e32 v7, v7, v15
+; GFX8-NEXT:    v_lshlrev_b32_e32 v15, 16, v14
+; GFX8-NEXT:    v_lshlrev_b32_e32 v17, 16, v6
+; GFX8-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
+; GFX8-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
+; GFX8-NEXT:    v_add_f32_e32 v15, v17, v15
+; GFX8-NEXT:    v_add_f32_e32 v6, v6, v14
+; GFX8-NEXT:    v_lshlrev_b32_e32 v14, 16, v13
+; GFX8-NEXT:    v_lshlrev_b32_e32 v17, 16, v5
+; GFX8-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
+; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
+; GFX8-NEXT:    v_add_f32_e32 v14, v17, v14
+; GFX8-NEXT:    v_add_f32_e32 v5, v5, v13
+; GFX8-NEXT:    v_lshlrev_b32_e32 v13, 16, v12
+; GFX8-NEXT:    v_lshlrev_b32_e32 v17, 16, v4
+; GFX8-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
+; GFX8-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GFX8-NEXT:    v_add_f32_e32 v13, v17, v13
+; GFX8-NEXT:    v_add_f32_e32 v4, v4, v12
+; GFX8-NEXT:    v_lshlrev_b32_e32 v12, 16, v11
+; GFX8-NEXT:    v_lshlrev_b32_e32 v17, 16, v3
+; GFX8-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
+; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX8-NEXT:    v_add_f32_e32 v12, v17, v12
+; GFX8-NEXT:    v_add_f32_e32 v3, v3, v11
+; GFX8-NEXT:    v_lshlrev_b32_e32 v11, 16, v10
+; GFX8-NEXT:    v_lshlrev_b32_e32 v17, 16, v2
+; GFX8-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
+; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX8-NEXT:    v_add_f32_e32 v11, v17, v11
+; GFX8-NEXT:    v_add_f32_e32 v2, v2, v10
+; GFX8-NEXT:    v_lshlrev_b32_e32 v10, 16, v9
+; GFX8-NEXT:    v_lshlrev_b32_e32 v17, 16, v1
 ; GFX8-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
 ; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX8-NEXT:    v_add_f32_e32 v10, v17, v10
 ; GFX8-NEXT:    v_add_f32_e32 v1, v1, v9
-; GFX8-NEXT:    v_add_f32_e32 v8, v16, v8
-; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX8-NEXT:    v_alignbit_b32 v1, v1, v8, 16
-; GFX8-NEXT:    v_lshlrev_b32_e32 v8, 16, v10
-; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 16, v2
-; GFX8-NEXT:    v_add_f32_e32 v8, v9, v8
-; GFX8-NEXT:    v_and_b32_e32 v9, 0xffff0000, v10
-; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
-; GFX8-NEXT:    v_add_f32_e32 v2, v2, v9
-; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
-; GFX8-NEXT:    v_alignbit_b32 v2, v2, v8, 16
-; GFX8-NEXT:    v_lshlrev_b32_e32 v8, 16, v11
-; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 16, v3
-; GFX8-NEXT:    v_add_f32_e32 v8, v9, v8
-; GFX8-NEXT:    v_and_b32_e32 v9, 0xffff0000, v11
-; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
-; GFX8-NEXT:    v_add_f32_e32 v3, v3, v9
-; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
-; GFX8-NEXT:    v_alignbit_b32 v3, v3, v8, 16
-; GFX8-NEXT:    v_lshlrev_b32_e32 v8, 16, v12
-; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 16, v4
-; GFX8-NEXT:    v_add_f32_e32 v8, v9, v8
-; GFX8-NEXT:    v_and_b32_e32 v9, 0xffff0000, v12
-; GFX8-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
-; GFX8-NEXT:    v_add_f32_e32 v4, v4, v9
-; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
-; GFX8-NEXT:    v_alignbit_b32 v4, v4, v8, 16
-; GFX8-NEXT:    v_lshlrev_b32_e32 v8, 16, v13
-; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 16, v5
-; GFX8-NEXT:    v_add_f32_e32 v8, v9, v8
-; GFX8-NEXT:    v_and_b32_e32 v9, 0xffff0000, v13
-; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
-; GFX8-NEXT:    v_add_f32_e32 v5, v5, v9
-; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
-; GFX8-NEXT:    v_alignbit_b32 v5, v5, v8, 16
-; GFX8-NEXT:    v_lshlrev_b32_e32 v8, 16, v14
-; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 16, v6
-; GFX8-NEXT:    v_add_f32_e32 v8, v9, v8
-; GFX8-NEXT:    v_and_b32_e32 v9, 0xffff0000, v14
-; GFX8-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
-; GFX8-NEXT:    v_add_f32_e32 v6, v6, v9
-; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
-; GFX8-NEXT:    v_alignbit_b32 v6, v6, v8, 16
-; GFX8-NEXT:    v_lshlrev_b32_e32 v8, 16, v15
-; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 16, v7
-; GFX8-NEXT:    v_add_f32_e32 v8, v9, v8
-; GFX8-NEXT:    v_and_b32_e32 v9, 0xffff0000, v15
-; GFX8-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
-; GFX8-NEXT:    v_add_f32_e32 v7, v7, v9
+; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 16, v8
+; GFX8-NEXT:    v_lshlrev_b32_e32 v17, 16, v0
+; GFX8-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
+; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    v_add_f32_e32 v0, v0, v8
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
-; GFX8-NEXT:    v_alignbit_b32 v7, v7, v8, 16
+; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
+; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
+; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
+; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX8-NEXT:    v_add_f32_e32 v9, v17, v9
+; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX8-NEXT:    v_alignbit_b32 v0, v0, v9, 16
+; GFX8-NEXT:    v_alignbit_b32 v1, v1, v10, 16
+; GFX8-NEXT:    v_alignbit_b32 v2, v2, v11, 16
+; GFX8-NEXT:    v_alignbit_b32 v3, v3, v12, 16
+; GFX8-NEXT:    v_alignbit_b32 v4, v4, v13, 16
+; GFX8-NEXT:    v_alignbit_b32 v5, v5, v14, 16
+; GFX8-NEXT:    v_alignbit_b32 v6, v6, v15, 16
+; GFX8-NEXT:    v_alignbit_b32 v7, v7, v16, 16
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_fadd_v16bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_lshlrev_b32_e32 v16, 16, v8
+; GFX9-NEXT:    v_lshlrev_b32_e32 v16, 16, v15
+; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v7
+; GFX9-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
+; GFX9-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
+; GFX9-NEXT:    v_add_f32_e32 v16, v17, v16
+; GFX9-NEXT:    v_add_f32_e32 v7, v7, v15
+; GFX9-NEXT:    v_lshlrev_b32_e32 v15, 16, v14
+; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v6
+; GFX9-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
+; GFX9-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
+; GFX9-NEXT:    v_add_f32_e32 v15, v17, v15
+; GFX9-NEXT:    v_add_f32_e32 v6, v6, v14
+; GFX9-NEXT:    v_lshlrev_b32_e32 v14, 16, v13
+; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v5
+; GFX9-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
+; GFX9-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
+; GFX9-NEXT:    v_add_f32_e32 v14, v17, v14
+; GFX9-NEXT:    v_add_f32_e32 v5, v5, v13
+; GFX9-NEXT:    v_lshlrev_b32_e32 v13, 16, v12
+; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v4
+; GFX9-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
+; GFX9-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GFX9-NEXT:    v_add_f32_e32 v13, v17, v13
+; GFX9-NEXT:    v_add_f32_e32 v4, v4, v12
+; GFX9-NEXT:    v_lshlrev_b32_e32 v12, 16, v11
+; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v3
+; GFX9-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
+; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX9-NEXT:    v_add_f32_e32 v12, v17, v12
+; GFX9-NEXT:    v_add_f32_e32 v3, v3, v11
+; GFX9-NEXT:    v_lshlrev_b32_e32 v11, 16, v10
+; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v2
+; GFX9-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
+; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX9-NEXT:    v_add_f32_e32 v11, v17, v11
+; GFX9-NEXT:    v_add_f32_e32 v2, v2, v10
+; GFX9-NEXT:    v_lshlrev_b32_e32 v10, 16, v9
+; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v1
+; GFX9-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
+; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX9-NEXT:    v_add_f32_e32 v10, v17, v10
+; GFX9-NEXT:    v_add_f32_e32 v1, v1, v9
+; GFX9-NEXT:    v_lshlrev_b32_e32 v9, 16, v8
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v0
 ; GFX9-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX9-NEXT:    v_add_f32_e32 v16, v17, v16
+; GFX9-NEXT:    v_add_f32_e32 v9, v17, v9
 ; GFX9-NEXT:    v_add_f32_e32 v0, v0, v8
 ; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
-; GFX9-NEXT:    v_perm_b32 v0, v0, v16, s4
-; GFX9-NEXT:    v_lshlrev_b32_e32 v8, 16, v9
-; GFX9-NEXT:    v_lshlrev_b32_e32 v16, 16, v1
-; GFX9-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
-; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX9-NEXT:    v_add_f32_e32 v8, v16, v8
-; GFX9-NEXT:    v_add_f32_e32 v1, v1, v9
-; GFX9-NEXT:    v_perm_b32 v1, v1, v8, s4
-; GFX9-NEXT:    v_lshlrev_b32_e32 v8, 16, v10
-; GFX9-NEXT:    v_lshlrev_b32_e32 v9, 16, v2
-; GFX9-NEXT:    v_add_f32_e32 v8, v9, v8
-; GFX9-NEXT:    v_and_b32_e32 v9, 0xffff0000, v10
-; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
-; GFX9-NEXT:    v_add_f32_e32 v2, v2, v9
-; GFX9-NEXT:    v_perm_b32 v2, v2, v8, s4
-; GFX9-NEXT:    v_lshlrev_b32_e32 v8, 16, v11
-; GFX9-NEXT:    v_lshlrev_b32_e32 v9, 16, v3
-; GFX9-NEXT:    v_add_f32_e32 v8, v9, v8
-; GFX9-NEXT:    v_and_b32_e32 v9, 0xffff0000, v11
-; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
-; GFX9-NEXT:    v_add_f32_e32 v3, v3, v9
-; GFX9-NEXT:    v_perm_b32 v3, v3, v8, s4
-; GFX9-NEXT:    v_lshlrev_b32_e32 v8, 16, v12
-; GFX9-NEXT:    v_lshlrev_b32_e32 v9, 16, v4
-; GFX9-NEXT:    v_add_f32_e32 v8, v9, v8
-; GFX9-NEXT:    v_and_b32_e32 v9, 0xffff0000, v12
-; GFX9-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
-; GFX9-NEXT:    v_add_f32_e32 v4, v4, v9
-; GFX9-NEXT:    v_perm_b32 v4, v4, v8, s4
-; GFX9-NEXT:    v_lshlrev_b32_e32 v8, 16, v13
-; GFX9-NEXT:    v_lshlrev_b32_e32 v9, 16, v5
-; GFX9-NEXT:    v_add_f32_e32 v8, v9, v8
-; GFX9-NEXT:    v_and_b32_e32 v9, 0xffff0000, v13
-; GFX9-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
-; GFX9-NEXT:    v_add_f32_e32 v5, v5, v9
-; GFX9-NEXT:    v_perm_b32 v5, v5, v8, s4
-; GFX9-NEXT:    v_lshlrev_b32_e32 v8, 16, v14
-; GFX9-NEXT:    v_lshlrev_b32_e32 v9, 16, v6
-; GFX9-NEXT:    v_add_f32_e32 v8, v9, v8
-; GFX9-NEXT:    v_and_b32_e32 v9, 0xffff0000, v14
-; GFX9-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
-; GFX9-NEXT:    v_add_f32_e32 v6, v6, v9
-; GFX9-NEXT:    v_perm_b32 v6, v6, v8, s4
-; GFX9-NEXT:    v_lshlrev_b32_e32 v8, 16, v15
-; GFX9-NEXT:    v_lshlrev_b32_e32 v9, 16, v7
-; GFX9-NEXT:    v_add_f32_e32 v8, v9, v8
-; GFX9-NEXT:    v_and_b32_e32 v9, 0xffff0000, v15
-; GFX9-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
-; GFX9-NEXT:    v_add_f32_e32 v7, v7, v9
-; GFX9-NEXT:    v_perm_b32 v7, v7, v8, s4
+; GFX9-NEXT:    v_perm_b32 v0, v0, v9, s4
+; GFX9-NEXT:    v_perm_b32 v1, v1, v10, s4
+; GFX9-NEXT:    v_perm_b32 v2, v2, v11, s4
+; GFX9-NEXT:    v_perm_b32 v3, v3, v12, s4
+; GFX9-NEXT:    v_perm_b32 v4, v4, v13, s4
+; GFX9-NEXT:    v_perm_b32 v5, v5, v14, s4
+; GFX9-NEXT:    v_perm_b32 v6, v6, v15, s4
+; GFX9-NEXT:    v_perm_b32 v7, v7, v16, s4
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fadd_v16bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_lshlrev_b32_e32 v16, 16, v8
-; GFX10-NEXT:    v_lshlrev_b32_e32 v17, 16, v0
-; GFX10-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
-; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX10-NEXT:    v_lshlrev_b32_e32 v18, 16, v1
-; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX10-NEXT:    v_lshlrev_b32_e32 v16, 16, v15
+; GFX10-NEXT:    v_lshlrev_b32_e32 v17, 16, v7
+; GFX10-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
+; GFX10-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
+; GFX10-NEXT:    v_lshlrev_b32_e32 v18, 16, v13
+; GFX10-NEXT:    v_lshlrev_b32_e32 v19, 16, v5
 ; GFX10-NEXT:    v_add_f32_e32 v16, v17, v16
-; GFX10-NEXT:    v_lshlrev_b32_e32 v17, 16, v9
-; GFX10-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
+; GFX10-NEXT:    v_lshlrev_b32_e32 v17, 16, v6
+; GFX10-NEXT:    v_add_f32_e32 v7, v7, v15
+; GFX10-NEXT:    v_lshlrev_b32_e32 v15, 16, v14
+; GFX10-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
+; GFX10-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
+; GFX10-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
+; GFX10-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
+; GFX10-NEXT:    v_lshlrev_b32_e32 v20, 16, v12
+; GFX10-NEXT:    v_lshlrev_b32_e32 v21, 16, v4
+; GFX10-NEXT:    v_add_f32_e32 v15, v17, v15
+; GFX10-NEXT:    v_add_f32_e32 v6, v6, v14
+; GFX10-NEXT:    v_add_f32_e32 v14, v19, v18
+; GFX10-NEXT:    v_add_f32_e32 v5, v5, v13
+; GFX10-NEXT:    v_add_f32_e32 v13, v21, v20
+; GFX10-NEXT:    v_lshlrev_b32_e32 v17, 16, v11
+; GFX10-NEXT:    v_lshlrev_b32_e32 v18, 16, v3
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v19, 16, v10
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v20, 16, v2
+; GFX10-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
+; GFX10-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GFX10-NEXT:    v_add_f32_e32 v17, v18, v17
+; GFX10-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
+; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX10-NEXT:    v_add_f32_e32 v18, v20, v19
 ; GFX10-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
+; GFX10-NEXT:    v_lshlrev_b32_e32 v19, 16, v9
+; GFX10-NEXT:    v_lshlrev_b32_e32 v20, 16, v1
+; GFX10-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
+; GFX10-NEXT:    v_lshlrev_b32_e32 v21, 16, v8
+; GFX10-NEXT:    v_lshlrev_b32_e32 v22, 16, v0
+; GFX10-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX10-NEXT:    v_add_f32_e32 v19, v20, v19
+; GFX10-NEXT:    v_add_f32_e32 v20, v22, v21
 ; GFX10-NEXT:    v_add_f32_e32 v0, v0, v8
-; GFX10-NEXT:    v_add_f32_e32 v8, v18, v17
 ; GFX10-NEXT:    v_add_f32_e32 v1, v1, v9
-; GFX10-NEXT:    v_add_f32_e32 v9, v20, v19
 ; GFX10-NEXT:    v_add_f32_e32 v2, v2, v10
-; GFX10-NEXT:    v_perm_b32 v0, v0, v16, 0x7060302
-; GFX10-NEXT:    v_and_b32_e32 v10, 0xffff0000, v11
-; GFX10-NEXT:    v_perm_b32 v1, v1, v8, 0x7060302
-; GFX10-NEXT:    v_lshlrev_b32_e32 v8, 16, v11
-; GFX10-NEXT:    v_perm_b32 v2, v2, v9, 0x7060302
-; GFX10-NEXT:    v_lshlrev_b32_e32 v9, 16, v3
-; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
-; GFX10-NEXT:    v_lshlrev_b32_e32 v11, 16, v12
-; GFX10-NEXT:    v_lshlrev_b32_e32 v16, 16, v4
-; GFX10-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
-; GFX10-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
-; GFX10-NEXT:    v_lshlrev_b32_e32 v17, 16, v13
-; GFX10-NEXT:    v_lshlrev_b32_e32 v18, 16, v5
-; GFX10-NEXT:    v_add_f32_e32 v8, v9, v8
-; GFX10-NEXT:    v_add_f32_e32 v3, v3, v10
-; GFX10-NEXT:    v_add_f32_e32 v9, v16, v11
+; GFX10-NEXT:    v_add_f32_e32 v3, v3, v11
 ; GFX10-NEXT:    v_add_f32_e32 v4, v4, v12
-; GFX10-NEXT:    v_add_f32_e32 v10, v18, v17
-; GFX10-NEXT:    v_and_b32_e32 v11, 0xffff0000, v13
-; GFX10-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
-; GFX10-NEXT:    v_lshlrev_b32_e32 v12, 16, v14
-; GFX10-NEXT:    v_lshlrev_b32_e32 v13, 16, v6
-; GFX10-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
-; GFX10-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
-; GFX10-NEXT:    v_lshlrev_b32_e32 v16, 16, v15
-; GFX10-NEXT:    v_lshlrev_b32_e32 v17, 16, v7
-; GFX10-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
-; GFX10-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
-; GFX10-NEXT:    v_add_f32_e32 v5, v5, v11
-; GFX10-NEXT:    v_add_f32_e32 v11, v13, v12
-; GFX10-NEXT:    v_add_f32_e32 v6, v6, v14
-; GFX10-NEXT:    v_add_f32_e32 v12, v17, v16
-; GFX10-NEXT:    v_add_f32_e32 v7, v7, v15
-; GFX10-NEXT:    v_perm_b32 v3, v3, v8, 0x7060302
-; GFX10-NEXT:    v_perm_b32 v4, v4, v9, 0x7060302
-; GFX10-NEXT:    v_perm_b32 v5, v5, v10, 0x7060302
-; GFX10-NEXT:    v_perm_b32 v6, v6, v11, 0x7060302
-; GFX10-NEXT:    v_perm_b32 v7, v7, v12, 0x7060302
+; GFX10-NEXT:    v_perm_b32 v0, v0, v20, 0x7060302
+; GFX10-NEXT:    v_perm_b32 v1, v1, v19, 0x7060302
+; GFX10-NEXT:    v_perm_b32 v2, v2, v18, 0x7060302
+; GFX10-NEXT:    v_perm_b32 v3, v3, v17, 0x7060302
+; GFX10-NEXT:    v_perm_b32 v4, v4, v13, 0x7060302
+; GFX10-NEXT:    v_perm_b32 v5, v5, v14, 0x7060302
+; GFX10-NEXT:    v_perm_b32 v6, v6, v15, 0x7060302
+; GFX10-NEXT:    v_perm_b32 v7, v7, v16, 0x7060302
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fadd_v16bf16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_lshlrev_b32_e32 v18, 16, v1
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX11-NEXT:    v_lshlrev_b32_e32 v16, 16, v8
-; GFX11-NEXT:    v_lshlrev_b32_e32 v19, 16, v10
-; GFX11-NEXT:    v_lshlrev_b32_e32 v20, 16, v2
-; GFX11-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
-; GFX11-NEXT:    v_lshlrev_b32_e32 v17, 16, v0
-; GFX11-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_add_f32_e32 v2, v2, v10
-; GFX11-NEXT:    v_dual_add_f32 v16, v17, v16 :: v_dual_lshlrev_b32 v17, 16, v9
-; GFX11-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_dual_add_f32 v1, v1, v9 :: v_dual_and_b32 v10, 0xffff0000, v11
-; GFX11-NEXT:    v_add_f32_e32 v9, v20, v19
-; GFX11-NEXT:    v_perm_b32 v2, v2, v9, 0x7060302
-; GFX11-NEXT:    v_lshlrev_b32_e32 v9, 16, v3
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_dual_add_f32 v3, v3, v10 :: v_dual_and_b32 v0, 0xffff0000, v0
-; GFX11-NEXT:    v_add_f32_e32 v0, v0, v8
-; GFX11-NEXT:    v_add_f32_e32 v8, v18, v17
-; GFX11-NEXT:    v_lshlrev_b32_e32 v17, 16, v13
-; GFX11-NEXT:    v_lshlrev_b32_e32 v18, 16, v5
-; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
-; GFX11-NEXT:    v_perm_b32 v0, v0, v16, 0x7060302
-; GFX11-NEXT:    v_perm_b32 v1, v1, v8, 0x7060302
-; GFX11-NEXT:    v_lshlrev_b32_e32 v8, 16, v11
-; GFX11-NEXT:    v_lshlrev_b32_e32 v11, 16, v12
-; GFX11-NEXT:    v_lshlrev_b32_e32 v16, 16, v4
+; GFX11-NEXT:    v_lshlrev_b32_e32 v20, 16, v12
+; GFX11-NEXT:    v_lshlrev_b32_e32 v21, 16, v4
+; GFX11-NEXT:    v_lshlrev_b32_e32 v18, 16, v13
+; GFX11-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
 ; GFX11-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
 ; GFX11-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
-; GFX11-NEXT:    v_dual_add_f32 v10, v18, v17 :: v_dual_lshlrev_b32 v17, 16, v7
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT:    v_dual_add_f32 v4, v4, v12 :: v_dual_and_b32 v7, 0xffff0000, v7
-; GFX11-NEXT:    v_dual_add_f32 v8, v9, v8 :: v_dual_add_f32 v9, v16, v11
-; GFX11-NEXT:    v_lshlrev_b32_e32 v12, 16, v14
-; GFX11-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
+; GFX11-NEXT:    v_lshlrev_b32_e32 v22, 16, v0
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v19, 16, v5
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_dual_add_f32 v4, v4, v12 :: v_dual_and_b32 v5, 0xffff0000, v5
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v16, 16, v15
-; GFX11-NEXT:    v_and_b32_e32 v11, 0xffff0000, v13
-; GFX11-NEXT:    v_lshlrev_b32_e32 v13, 16, v6
+; GFX11-NEXT:    v_lshlrev_b32_e32 v17, 16, v7
+; GFX11-NEXT:    v_add_f32_e32 v5, v5, v13
+; GFX11-NEXT:    v_add_f32_e32 v13, v21, v20
+; GFX11-NEXT:    v_lshlrev_b32_e32 v21, 16, v8
+; GFX11-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
+; GFX11-NEXT:    v_dual_add_f32 v16, v17, v16 :: v_dual_and_b32 v15, 0xffff0000, v15
+; GFX11-NEXT:    v_lshlrev_b32_e32 v17, 16, v6
+; GFX11-NEXT:    v_lshlrev_b32_e32 v20, 16, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_dual_add_f32 v0, v0, v8 :: v_dual_and_b32 v7, 0xffff0000, v7
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX11-NEXT:    v_perm_b32 v4, v4, v13, 0x7060302
+; GFX11-NEXT:    v_add_f32_e32 v7, v7, v15
+; GFX11-NEXT:    v_lshlrev_b32_e32 v15, 16, v14
+; GFX11-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
 ; GFX11-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
-; GFX11-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
-; GFX11-NEXT:    v_perm_b32 v3, v3, v8, 0x7060302
-; GFX11-NEXT:    v_add_f32_e32 v5, v5, v11
-; GFX11-NEXT:    v_add_f32_e32 v11, v13, v12
-; GFX11-NEXT:    v_add_f32_e32 v12, v17, v16
-; GFX11-NEXT:    v_dual_add_f32 v6, v6, v14 :: v_dual_add_f32 v7, v7, v15
-; GFX11-NEXT:    v_perm_b32 v4, v4, v9, 0x7060302
-; GFX11-NEXT:    v_perm_b32 v5, v5, v10, 0x7060302
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_perm_b32 v6, v6, v11, 0x7060302
-; GFX11-NEXT:    v_perm_b32 v7, v7, v12, 0x7060302
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_perm_b32 v7, v7, v16, 0x7060302
+; GFX11-NEXT:    v_add_f32_e32 v15, v17, v15
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX11-NEXT:    v_dual_add_f32 v6, v6, v14 :: v_dual_lshlrev_b32 v17, 16, v11
+; GFX11-NEXT:    v_add_f32_e32 v14, v19, v18
+; GFX11-NEXT:    v_lshlrev_b32_e32 v18, 16, v3
+; GFX11-NEXT:    v_lshlrev_b32_e32 v19, 16, v10
+; GFX11-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_dual_add_f32 v17, v18, v17 :: v_dual_and_b32 v10, 0xffff0000, v10
+; GFX11-NEXT:    v_perm_b32 v5, v5, v14, 0x7060302
+; GFX11-NEXT:    v_perm_b32 v6, v6, v15, 0x7060302
+; GFX11-NEXT:    v_add_f32_e32 v3, v3, v11
+; GFX11-NEXT:    v_dual_add_f32 v18, v20, v19 :: v_dual_lshlrev_b32 v19, 16, v9
+; GFX11-NEXT:    v_lshlrev_b32_e32 v20, 16, v1
+; GFX11-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
+; GFX11-NEXT:    v_dual_add_f32 v2, v2, v10 :: v_dual_and_b32 v1, 0xffff0000, v1
+; GFX11-NEXT:    v_perm_b32 v3, v3, v17, 0x7060302
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_dual_add_f32 v19, v20, v19 :: v_dual_add_f32 v20, v22, v21
+; GFX11-NEXT:    v_add_f32_e32 v1, v1, v9
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_perm_b32 v2, v2, v18, 0x7060302
+; GFX11-NEXT:    v_perm_b32 v0, v0, v20, 0x7060302
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX11-NEXT:    v_perm_b32 v1, v1, v19, 0x7060302
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = fadd <16 x bfloat> %a, %b
   ret <16 x bfloat> %op
@@ -9913,483 +9865,480 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
 ; GFX8-LABEL: v_fadd_v32bf16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_lshlrev_b32_e32 v31, 16, v16
-; GFX8-NEXT:    v_lshlrev_b32_e32 v32, 16, v0
-; GFX8-NEXT:    v_and_b32_e32 v16, 0xffff0000, v16
-; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX8-NEXT:    v_add_f32_e32 v0, v0, v16
+; GFX8-NEXT:    v_lshlrev_b32_e32 v31, 16, v30
+; GFX8-NEXT:    v_lshlrev_b32_e32 v32, 16, v14
+; GFX8-NEXT:    v_and_b32_e32 v30, 0xffff0000, v30
+; GFX8-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
 ; GFX8-NEXT:    v_add_f32_e32 v31, v32, v31
-; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX8-NEXT:    v_alignbit_b32 v0, v0, v31, 16
-; GFX8-NEXT:    v_lshlrev_b32_e32 v16, 16, v17
-; GFX8-NEXT:    v_lshlrev_b32_e32 v31, 16, v1
+; GFX8-NEXT:    v_add_f32_e32 v30, v14, v30
+; GFX8-NEXT:    v_lshlrev_b32_e32 v14, 16, v29
+; GFX8-NEXT:    v_lshlrev_b32_e32 v32, 16, v13
+; GFX8-NEXT:    v_and_b32_e32 v29, 0xffff0000, v29
+; GFX8-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
+; GFX8-NEXT:    v_add_f32_e32 v14, v32, v14
+; GFX8-NEXT:    v_add_f32_e32 v13, v13, v29
+; GFX8-NEXT:    v_lshlrev_b32_e32 v29, 16, v28
+; GFX8-NEXT:    v_lshlrev_b32_e32 v32, 16, v12
+; GFX8-NEXT:    v_and_b32_e32 v28, 0xffff0000, v28
+; GFX8-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
+; GFX8-NEXT:    v_add_f32_e32 v29, v32, v29
+; GFX8-NEXT:    v_add_f32_e32 v12, v12, v28
+; GFX8-NEXT:    v_lshlrev_b32_e32 v28, 16, v27
+; GFX8-NEXT:    v_lshlrev_b32_e32 v32, 16, v11
+; GFX8-NEXT:    v_and_b32_e32 v27, 0xffff0000, v27
+; GFX8-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
+; GFX8-NEXT:    v_add_f32_e32 v28, v32, v28
+; GFX8-NEXT:    v_add_f32_e32 v11, v11, v27
+; GFX8-NEXT:    v_lshlrev_b32_e32 v27, 16, v26
+; GFX8-NEXT:    v_lshlrev_b32_e32 v32, 16, v10
+; GFX8-NEXT:    v_and_b32_e32 v26, 0xffff0000, v26
+; GFX8-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
+; GFX8-NEXT:    v_add_f32_e32 v27, v32, v27
+; GFX8-NEXT:    v_add_f32_e32 v10, v10, v26
+; GFX8-NEXT:    v_lshlrev_b32_e32 v26, 16, v25
+; GFX8-NEXT:    v_lshlrev_b32_e32 v32, 16, v9
+; GFX8-NEXT:    v_and_b32_e32 v25, 0xffff0000, v25
+; GFX8-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
+; GFX8-NEXT:    v_add_f32_e32 v26, v32, v26
+; GFX8-NEXT:    v_add_f32_e32 v9, v9, v25
+; GFX8-NEXT:    v_lshlrev_b32_e32 v25, 16, v24
+; GFX8-NEXT:    v_lshlrev_b32_e32 v32, 16, v8
+; GFX8-NEXT:    v_and_b32_e32 v24, 0xffff0000, v24
+; GFX8-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
+; GFX8-NEXT:    v_add_f32_e32 v8, v8, v24
+; GFX8-NEXT:    buffer_load_dword v24, off, s[0:3], s32
+; GFX8-NEXT:    v_add_f32_e32 v25, v32, v25
+; GFX8-NEXT:    v_lshlrev_b32_e32 v32, 16, v15
+; GFX8-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
+; GFX8-NEXT:    v_lshrrev_b32_e32 v8, 16, v8
+; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 16, v9
+; GFX8-NEXT:    v_lshrrev_b32_e32 v10, 16, v10
+; GFX8-NEXT:    v_lshrrev_b32_e32 v13, 16, v13
+; GFX8-NEXT:    v_lshrrev_b32_e32 v12, 16, v12
+; GFX8-NEXT:    v_lshrrev_b32_e32 v11, 16, v11
+; GFX8-NEXT:    v_alignbit_b32 v8, v8, v25, 16
+; GFX8-NEXT:    v_alignbit_b32 v9, v9, v26, 16
+; GFX8-NEXT:    v_alignbit_b32 v10, v10, v27, 16
+; GFX8-NEXT:    v_alignbit_b32 v11, v11, v28, 16
+; GFX8-NEXT:    v_alignbit_b32 v12, v12, v29, 16
+; GFX8-NEXT:    v_alignbit_b32 v13, v13, v14, 16
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_lshlrev_b32_e32 v33, 16, v24
+; GFX8-NEXT:    v_and_b32_e32 v24, 0xffff0000, v24
+; GFX8-NEXT:    v_add_f32_e32 v32, v32, v33
+; GFX8-NEXT:    v_add_f32_e32 v15, v15, v24
+; GFX8-NEXT:    v_lshlrev_b32_e32 v24, 16, v23
+; GFX8-NEXT:    v_lshlrev_b32_e32 v33, 16, v7
+; GFX8-NEXT:    v_and_b32_e32 v23, 0xffff0000, v23
+; GFX8-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
+; GFX8-NEXT:    v_add_f32_e32 v24, v33, v24
+; GFX8-NEXT:    v_add_f32_e32 v7, v7, v23
+; GFX8-NEXT:    v_lshlrev_b32_e32 v23, 16, v22
+; GFX8-NEXT:    v_lshlrev_b32_e32 v33, 16, v6
+; GFX8-NEXT:    v_and_b32_e32 v22, 0xffff0000, v22
+; GFX8-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
+; GFX8-NEXT:    v_add_f32_e32 v23, v33, v23
+; GFX8-NEXT:    v_add_f32_e32 v6, v6, v22
+; GFX8-NEXT:    v_lshlrev_b32_e32 v22, 16, v21
+; GFX8-NEXT:    v_lshlrev_b32_e32 v33, 16, v5
+; GFX8-NEXT:    v_and_b32_e32 v21, 0xffff0000, v21
+; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
+; GFX8-NEXT:    v_add_f32_e32 v22, v33, v22
+; GFX8-NEXT:    v_add_f32_e32 v5, v5, v21
+; GFX8-NEXT:    v_lshlrev_b32_e32 v21, 16, v20
+; GFX8-NEXT:    v_lshlrev_b32_e32 v33, 16, v4
+; GFX8-NEXT:    v_and_b32_e32 v20, 0xffff0000, v20
+; GFX8-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GFX8-NEXT:    v_add_f32_e32 v21, v33, v21
+; GFX8-NEXT:    v_add_f32_e32 v4, v4, v20
+; GFX8-NEXT:    v_lshlrev_b32_e32 v20, 16, v19
+; GFX8-NEXT:    v_lshlrev_b32_e32 v33, 16, v3
+; GFX8-NEXT:    v_and_b32_e32 v19, 0xffff0000, v19
+; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX8-NEXT:    v_add_f32_e32 v20, v33, v20
+; GFX8-NEXT:    v_add_f32_e32 v3, v3, v19
+; GFX8-NEXT:    v_lshlrev_b32_e32 v19, 16, v18
+; GFX8-NEXT:    v_lshlrev_b32_e32 v33, 16, v2
+; GFX8-NEXT:    v_and_b32_e32 v18, 0xffff0000, v18
+; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX8-NEXT:    v_add_f32_e32 v19, v33, v19
+; GFX8-NEXT:    v_add_f32_e32 v2, v2, v18
+; GFX8-NEXT:    v_lshlrev_b32_e32 v18, 16, v17
+; GFX8-NEXT:    v_lshlrev_b32_e32 v33, 16, v1
 ; GFX8-NEXT:    v_and_b32_e32 v17, 0xffff0000, v17
 ; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX8-NEXT:    v_add_f32_e32 v18, v33, v18
 ; GFX8-NEXT:    v_add_f32_e32 v1, v1, v17
-; GFX8-NEXT:    v_add_f32_e32 v16, v31, v16
+; GFX8-NEXT:    v_lshlrev_b32_e32 v17, 16, v16
+; GFX8-NEXT:    v_lshlrev_b32_e32 v33, 16, v0
+; GFX8-NEXT:    v_and_b32_e32 v16, 0xffff0000, v16
+; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    v_add_f32_e32 v0, v0, v16
+; GFX8-NEXT:    v_add_f32_e32 v17, v33, v17
+; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX8-NEXT:    v_alignbit_b32 v1, v1, v16, 16
-; GFX8-NEXT:    v_lshlrev_b32_e32 v16, 16, v18
-; GFX8-NEXT:    v_lshlrev_b32_e32 v17, 16, v2
-; GFX8-NEXT:    v_add_f32_e32 v16, v17, v16
-; GFX8-NEXT:    v_and_b32_e32 v17, 0xffff0000, v18
-; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
-; GFX8-NEXT:    v_add_f32_e32 v2, v2, v17
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
-; GFX8-NEXT:    v_alignbit_b32 v2, v2, v16, 16
-; GFX8-NEXT:    v_lshlrev_b32_e32 v16, 16, v19
-; GFX8-NEXT:    v_lshlrev_b32_e32 v17, 16, v3
-; GFX8-NEXT:    v_add_f32_e32 v16, v17, v16
-; GFX8-NEXT:    v_and_b32_e32 v17, 0xffff0000, v19
-; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
-; GFX8-NEXT:    v_add_f32_e32 v3, v3, v17
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
-; GFX8-NEXT:    v_alignbit_b32 v3, v3, v16, 16
-; GFX8-NEXT:    v_lshlrev_b32_e32 v16, 16, v20
-; GFX8-NEXT:    v_lshlrev_b32_e32 v17, 16, v4
-; GFX8-NEXT:    v_add_f32_e32 v16, v17, v16
-; GFX8-NEXT:    v_and_b32_e32 v17, 0xffff0000, v20
-; GFX8-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
-; GFX8-NEXT:    v_add_f32_e32 v4, v4, v17
-; GFX8-NEXT:    buffer_load_dword v17, off, s[0:3], s32
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
-; GFX8-NEXT:    v_alignbit_b32 v4, v4, v16, 16
-; GFX8-NEXT:    v_lshlrev_b32_e32 v16, 16, v21
-; GFX8-NEXT:    v_lshlrev_b32_e32 v18, 16, v5
-; GFX8-NEXT:    v_add_f32_e32 v16, v18, v16
-; GFX8-NEXT:    v_and_b32_e32 v18, 0xffff0000, v21
-; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
-; GFX8-NEXT:    v_add_f32_e32 v5, v5, v18
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
-; GFX8-NEXT:    v_alignbit_b32 v5, v5, v16, 16
-; GFX8-NEXT:    v_lshlrev_b32_e32 v16, 16, v22
-; GFX8-NEXT:    v_lshlrev_b32_e32 v18, 16, v6
-; GFX8-NEXT:    v_add_f32_e32 v16, v18, v16
-; GFX8-NEXT:    v_and_b32_e32 v18, 0xffff0000, v22
-; GFX8-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
-; GFX8-NEXT:    v_add_f32_e32 v6, v6, v18
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
-; GFX8-NEXT:    v_alignbit_b32 v6, v6, v16, 16
-; GFX8-NEXT:    v_lshlrev_b32_e32 v16, 16, v23
-; GFX8-NEXT:    v_lshlrev_b32_e32 v18, 16, v7
-; GFX8-NEXT:    v_add_f32_e32 v16, v18, v16
-; GFX8-NEXT:    v_and_b32_e32 v18, 0xffff0000, v23
-; GFX8-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
-; GFX8-NEXT:    v_add_f32_e32 v7, v7, v18
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
-; GFX8-NEXT:    v_alignbit_b32 v7, v7, v16, 16
-; GFX8-NEXT:    v_lshlrev_b32_e32 v16, 16, v24
-; GFX8-NEXT:    v_lshlrev_b32_e32 v18, 16, v8
-; GFX8-NEXT:    v_add_f32_e32 v16, v18, v16
-; GFX8-NEXT:    v_and_b32_e32 v18, 0xffff0000, v24
-; GFX8-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
-; GFX8-NEXT:    v_add_f32_e32 v8, v8, v18
-; GFX8-NEXT:    v_lshrrev_b32_e32 v8, 16, v8
-; GFX8-NEXT:    v_alignbit_b32 v8, v8, v16, 16
-; GFX8-NEXT:    v_lshlrev_b32_e32 v16, 16, v25
-; GFX8-NEXT:    v_lshlrev_b32_e32 v18, 16, v9
-; GFX8-NEXT:    v_add_f32_e32 v16, v18, v16
-; GFX8-NEXT:    v_and_b32_e32 v18, 0xffff0000, v25
-; GFX8-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
-; GFX8-NEXT:    v_add_f32_e32 v9, v9, v18
-; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 16, v9
-; GFX8-NEXT:    v_alignbit_b32 v9, v9, v16, 16
-; GFX8-NEXT:    v_lshlrev_b32_e32 v16, 16, v26
-; GFX8-NEXT:    v_lshlrev_b32_e32 v18, 16, v10
-; GFX8-NEXT:    v_add_f32_e32 v16, v18, v16
-; GFX8-NEXT:    v_and_b32_e32 v18, 0xffff0000, v26
-; GFX8-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
-; GFX8-NEXT:    v_add_f32_e32 v10, v10, v18
-; GFX8-NEXT:    v_lshrrev_b32_e32 v10, 16, v10
-; GFX8-NEXT:    v_alignbit_b32 v10, v10, v16, 16
-; GFX8-NEXT:    v_lshlrev_b32_e32 v16, 16, v27
-; GFX8-NEXT:    v_lshlrev_b32_e32 v18, 16, v11
-; GFX8-NEXT:    v_add_f32_e32 v16, v18, v16
-; GFX8-NEXT:    v_and_b32_e32 v18, 0xffff0000, v27
-; GFX8-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
-; GFX8-NEXT:    v_add_f32_e32 v11, v11, v18
-; GFX8-NEXT:    v_lshrrev_b32_e32 v11, 16, v11
-; GFX8-NEXT:    v_alignbit_b32 v11, v11, v16, 16
-; GFX8-NEXT:    v_lshlrev_b32_e32 v16, 16, v28
-; GFX8-NEXT:    v_lshlrev_b32_e32 v18, 16, v12
-; GFX8-NEXT:    v_add_f32_e32 v16, v18, v16
-; GFX8-NEXT:    v_and_b32_e32 v18, 0xffff0000, v28
-; GFX8-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
-; GFX8-NEXT:    v_add_f32_e32 v12, v12, v18
-; GFX8-NEXT:    v_lshrrev_b32_e32 v12, 16, v12
-; GFX8-NEXT:    v_alignbit_b32 v12, v12, v16, 16
-; GFX8-NEXT:    v_lshlrev_b32_e32 v16, 16, v29
-; GFX8-NEXT:    v_lshlrev_b32_e32 v18, 16, v13
-; GFX8-NEXT:    v_add_f32_e32 v16, v18, v16
-; GFX8-NEXT:    v_and_b32_e32 v18, 0xffff0000, v29
-; GFX8-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
-; GFX8-NEXT:    v_add_f32_e32 v13, v13, v18
-; GFX8-NEXT:    v_lshrrev_b32_e32 v13, 16, v13
-; GFX8-NEXT:    v_alignbit_b32 v13, v13, v16, 16
-; GFX8-NEXT:    v_lshlrev_b32_e32 v16, 16, v30
-; GFX8-NEXT:    v_lshlrev_b32_e32 v18, 16, v14
-; GFX8-NEXT:    v_add_f32_e32 v16, v18, v16
-; GFX8-NEXT:    v_and_b32_e32 v18, 0xffff0000, v30
-; GFX8-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
-; GFX8-NEXT:    v_add_f32_e32 v14, v14, v18
-; GFX8-NEXT:    v_lshrrev_b32_e32 v14, 16, v14
-; GFX8-NEXT:    v_alignbit_b32 v14, v14, v16, 16
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_lshlrev_b32_e32 v16, 16, v17
-; GFX8-NEXT:    v_lshlrev_b32_e32 v18, 16, v15
-; GFX8-NEXT:    v_and_b32_e32 v17, 0xffff0000, v17
-; GFX8-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
-; GFX8-NEXT:    v_add_f32_e32 v15, v15, v17
-; GFX8-NEXT:    v_add_f32_e32 v16, v18, v16
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v15, 16, v15
-; GFX8-NEXT:    v_alignbit_b32 v15, v15, v16, 16
+; GFX8-NEXT:    v_lshrrev_b32_e32 v16, 16, v30
+; GFX8-NEXT:    v_alignbit_b32 v0, v0, v17, 16
+; GFX8-NEXT:    v_alignbit_b32 v1, v1, v18, 16
+; GFX8-NEXT:    v_alignbit_b32 v2, v2, v19, 16
+; GFX8-NEXT:    v_alignbit_b32 v3, v3, v20, 16
+; GFX8-NEXT:    v_alignbit_b32 v4, v4, v21, 16
+; GFX8-NEXT:    v_alignbit_b32 v5, v5, v22, 16
+; GFX8-NEXT:    v_alignbit_b32 v6, v6, v23, 16
+; GFX8-NEXT:    v_alignbit_b32 v7, v7, v24, 16
+; GFX8-NEXT:    v_alignbit_b32 v14, v16, v31, 16
+; GFX8-NEXT:    v_alignbit_b32 v15, v15, v32, 16
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_fadd_v32bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_lshlrev_b32_e32 v31, 16, v16
-; GFX9-NEXT:    v_lshlrev_b32_e32 v32, 16, v0
-; GFX9-NEXT:    v_and_b32_e32 v16, 0xffff0000, v16
-; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    v_lshlrev_b32_e32 v31, 16, v30
+; GFX9-NEXT:    v_lshlrev_b32_e32 v32, 16, v14
+; GFX9-NEXT:    v_and_b32_e32 v30, 0xffff0000, v30
+; GFX9-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
 ; GFX9-NEXT:    v_add_f32_e32 v31, v32, v31
-; GFX9-NEXT:    v_add_f32_e32 v0, v0, v16
+; GFX9-NEXT:    v_add_f32_e32 v14, v14, v30
+; GFX9-NEXT:    v_lshlrev_b32_e32 v30, 16, v29
+; GFX9-NEXT:    v_lshlrev_b32_e32 v32, 16, v13
+; GFX9-NEXT:    v_and_b32_e32 v29, 0xffff0000, v29
+; GFX9-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
+; GFX9-NEXT:    v_add_f32_e32 v30, v32, v30
+; GFX9-NEXT:    v_add_f32_e32 v13, v13, v29
+; GFX9-NEXT:    v_lshlrev_b32_e32 v29, 16, v28
+; GFX9-NEXT:    v_lshlrev_b32_e32 v32, 16, v12
+; GFX9-NEXT:    v_and_b32_e32 v28, 0xffff0000, v28
+; GFX9-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
+; GFX9-NEXT:    v_add_f32_e32 v29, v32, v29
+; GFX9-NEXT:    v_add_f32_e32 v12, v12, v28
+; GFX9-NEXT:    v_lshlrev_b32_e32 v28, 16, v27
+; GFX9-NEXT:    v_lshlrev_b32_e32 v32, 16, v11
+; GFX9-NEXT:    v_and_b32_e32 v27, 0xffff0000, v27
+; GFX9-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
+; GFX9-NEXT:    v_add_f32_e32 v28, v32, v28
+; GFX9-NEXT:    v_add_f32_e32 v11, v11, v27
+; GFX9-NEXT:    v_lshlrev_b32_e32 v27, 16, v26
+; GFX9-NEXT:    v_lshlrev_b32_e32 v32, 16, v10
+; GFX9-NEXT:    v_and_b32_e32 v26, 0xffff0000, v26
+; GFX9-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
+; GFX9-NEXT:    v_add_f32_e32 v27, v32, v27
+; GFX9-NEXT:    v_add_f32_e32 v10, v10, v26
+; GFX9-NEXT:    v_lshlrev_b32_e32 v26, 16, v25
+; GFX9-NEXT:    v_lshlrev_b32_e32 v32, 16, v9
+; GFX9-NEXT:    v_and_b32_e32 v25, 0xffff0000, v25
+; GFX9-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
+; GFX9-NEXT:    v_add_f32_e32 v26, v32, v26
+; GFX9-NEXT:    v_add_f32_e32 v9, v9, v25
+; GFX9-NEXT:    v_lshlrev_b32_e32 v25, 16, v24
+; GFX9-NEXT:    v_lshlrev_b32_e32 v32, 16, v8
+; GFX9-NEXT:    v_and_b32_e32 v24, 0xffff0000, v24
+; GFX9-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
+; GFX9-NEXT:    v_add_f32_e32 v8, v8, v24
+; GFX9-NEXT:    buffer_load_dword v24, off, s[0:3], s32
+; GFX9-NEXT:    v_add_f32_e32 v25, v32, v25
+; GFX9-NEXT:    v_lshlrev_b32_e32 v32, 16, v15
+; GFX9-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
 ; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
-; GFX9-NEXT:    v_perm_b32 v0, v0, v31, s4
-; GFX9-NEXT:    v_lshlrev_b32_e32 v16, 16, v17
-; GFX9-NEXT:    v_lshlrev_b32_e32 v31, 16, v1
+; GFX9-NEXT:    v_perm_b32 v8, v8, v25, s4
+; GFX9-NEXT:    v_perm_b32 v9, v9, v26, s4
+; GFX9-NEXT:    v_perm_b32 v10, v10, v27, s4
+; GFX9-NEXT:    v_perm_b32 v11, v11, v28, s4
+; GFX9-NEXT:    v_perm_b32 v12, v12, v29, s4
+; GFX9-NEXT:    v_perm_b32 v13, v13, v30, s4
+; GFX9-NEXT:    v_perm_b32 v14, v14, v31, s4
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_lshlrev_b32_e32 v33, 16, v24
+; GFX9-NEXT:    v_and_b32_e32 v24, 0xffff0000, v24
+; GFX9-NEXT:    v_add_f32_e32 v32, v32, v33
+; GFX9-NEXT:    v_add_f32_e32 v15, v15, v24
+; GFX9-NEXT:    v_lshlrev_b32_e32 v24, 16, v23
+; GFX9-NEXT:    v_lshlrev_b32_e32 v33, 16, v7
+; GFX9-NEXT:    v_and_b32_e32 v23, 0xffff0000, v23
+; GFX9-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
+; GFX9-NEXT:    v_add_f32_e32 v24, v33, v24
+; GFX9-NEXT:    v_add_f32_e32 v7, v7, v23
+; GFX9-NEXT:    v_lshlrev_b32_e32 v23, 16, v22
+; GFX9-NEXT:    v_lshlrev_b32_e32 v33, 16, v6
+; GFX9-NEXT:    v_and_b32_e32 v22, 0xffff0000, v22
+; GFX9-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
+; GFX9-NEXT:    v_add_f32_e32 v23, v33, v23
+; GFX9-NEXT:    v_add_f32_e32 v6, v6, v22
+; GFX9-NEXT:    v_lshlrev_b32_e32 v22, 16, v21
+; GFX9-NEXT:    v_lshlrev_b32_e32 v33, 16, v5
+; GFX9-NEXT:    v_and_b32_e32 v21, 0xffff0000, v21
+; GFX9-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
+; GFX9-NEXT:    v_add_f32_e32 v22, v33, v22
+; GFX9-NEXT:    v_add_f32_e32 v5, v5, v21
+; GFX9-NEXT:    v_lshlrev_b32_e32 v21, 16, v20
+; GFX9-NEXT:    v_lshlrev_b32_e32 v33, 16, v4
+; GFX9-NEXT:    v_and_b32_e32 v20, 0xffff0000, v20
+; GFX9-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GFX9-NEXT:    v_add_f32_e32 v21, v33, v21
+; GFX9-NEXT:    v_add_f32_e32 v4, v4, v20
+; GFX9-NEXT:    v_lshlrev_b32_e32 v20, 16, v19
+; GFX9-NEXT:    v_lshlrev_b32_e32 v33, 16, v3
+; GFX9-NEXT:    v_and_b32_e32 v19, 0xffff0000, v19
+; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX9-NEXT:    v_add_f32_e32 v20, v33, v20
+; GFX9-NEXT:    v_add_f32_e32 v3, v3, v19
+; GFX9-NEXT:    v_lshlrev_b32_e32 v19, 16, v18
+; GFX9-NEXT:    v_lshlrev_b32_e32 v33, 16, v2
+; GFX9-NEXT:    v_and_b32_e32 v18, 0xffff0000, v18
+; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX9-NEXT:    v_add_f32_e32 v19, v33, v19
+; GFX9-NEXT:    v_add_f32_e32 v2, v2, v18
+; GFX9-NEXT:    v_lshlrev_b32_e32 v18, 16, v17
+; GFX9-NEXT:    v_lshlrev_b32_e32 v33, 16, v1
 ; GFX9-NEXT:    v_and_b32_e32 v17, 0xffff0000, v17
 ; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX9-NEXT:    v_add_f32_e32 v16, v31, v16
+; GFX9-NEXT:    v_add_f32_e32 v18, v33, v18
 ; GFX9-NEXT:    v_add_f32_e32 v1, v1, v17
-; GFX9-NEXT:    v_perm_b32 v1, v1, v16, s4
-; GFX9-NEXT:    v_lshlrev_b32_e32 v16, 16, v18
-; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v2
-; GFX9-NEXT:    v_add_f32_e32 v16, v17, v16
-; GFX9-NEXT:    v_and_b32_e32 v17, 0xffff0000, v18
-; GFX9-NEXT:    buffer_load_dword v18, off, s[0:3], s32
-; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
-; GFX9-NEXT:    v_add_f32_e32 v2, v2, v17
-; GFX9-NEXT:    v_perm_b32 v2, v2, v16, s4
-; GFX9-NEXT:    v_lshlrev_b32_e32 v16, 16, v19
-; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v3
-; GFX9-NEXT:    v_add_f32_e32 v16, v17, v16
-; GFX9-NEXT:    v_and_b32_e32 v17, 0xffff0000, v19
-; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
-; GFX9-NEXT:    v_add_f32_e32 v3, v3, v17
-; GFX9-NEXT:    v_perm_b32 v3, v3, v16, s4
-; GFX9-NEXT:    v_lshlrev_b32_e32 v16, 16, v20
-; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v4
-; GFX9-NEXT:    v_add_f32_e32 v16, v17, v16
-; GFX9-NEXT:    v_and_b32_e32 v17, 0xffff0000, v20
-; GFX9-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
-; GFX9-NEXT:    v_add_f32_e32 v4, v4, v17
-; GFX9-NEXT:    v_perm_b32 v4, v4, v16, s4
-; GFX9-NEXT:    v_lshlrev_b32_e32 v16, 16, v21
-; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v5
-; GFX9-NEXT:    v_add_f32_e32 v16, v17, v16
-; GFX9-NEXT:    v_and_b32_e32 v17, 0xffff0000, v21
-; GFX9-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
-; GFX9-NEXT:    v_add_f32_e32 v5, v5, v17
-; GFX9-NEXT:    v_perm_b32 v5, v5, v16, s4
-; GFX9-NEXT:    v_lshlrev_b32_e32 v16, 16, v22
-; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v6
-; GFX9-NEXT:    v_add_f32_e32 v16, v17, v16
-; GFX9-NEXT:    v_and_b32_e32 v17, 0xffff0000, v22
-; GFX9-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
-; GFX9-NEXT:    v_add_f32_e32 v6, v6, v17
-; GFX9-NEXT:    v_perm_b32 v6, v6, v16, s4
-; GFX9-NEXT:    v_lshlrev_b32_e32 v16, 16, v23
-; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v7
-; GFX9-NEXT:    v_add_f32_e32 v16, v17, v16
-; GFX9-NEXT:    v_and_b32_e32 v17, 0xffff0000, v23
-; GFX9-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
-; GFX9-NEXT:    v_add_f32_e32 v7, v7, v17
-; GFX9-NEXT:    v_perm_b32 v7, v7, v16, s4
-; GFX9-NEXT:    v_lshlrev_b32_e32 v16, 16, v24
-; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v8
-; GFX9-NEXT:    v_add_f32_e32 v16, v17, v16
-; GFX9-NEXT:    v_and_b32_e32 v17, 0xffff0000, v24
-; GFX9-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
-; GFX9-NEXT:    v_add_f32_e32 v8, v8, v17
-; GFX9-NEXT:    v_perm_b32 v8, v8, v16, s4
-; GFX9-NEXT:    v_lshlrev_b32_e32 v16, 16, v25
-; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v9
-; GFX9-NEXT:    v_add_f32_e32 v16, v17, v16
-; GFX9-NEXT:    v_and_b32_e32 v17, 0xffff0000, v25
-; GFX9-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
-; GFX9-NEXT:    v_add_f32_e32 v9, v9, v17
-; GFX9-NEXT:    v_perm_b32 v9, v9, v16, s4
-; GFX9-NEXT:    v_lshlrev_b32_e32 v16, 16, v26
-; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v10
-; GFX9-NEXT:    v_add_f32_e32 v16, v17, v16
-; GFX9-NEXT:    v_and_b32_e32 v17, 0xffff0000, v26
-; GFX9-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
-; GFX9-NEXT:    v_add_f32_e32 v10, v10, v17
-; GFX9-NEXT:    v_perm_b32 v10, v10, v16, s4
-; GFX9-NEXT:    v_lshlrev_b32_e32 v16, 16, v27
-; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v11
-; GFX9-NEXT:    v_add_f32_e32 v16, v17, v16
-; GFX9-NEXT:    v_and_b32_e32 v17, 0xffff0000, v27
-; GFX9-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
-; GFX9-NEXT:    v_add_f32_e32 v11, v11, v17
-; GFX9-NEXT:    v_perm_b32 v11, v11, v16, s4
-; GFX9-NEXT:    v_lshlrev_b32_e32 v16, 16, v28
-; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v12
-; GFX9-NEXT:    v_add_f32_e32 v16, v17, v16
-; GFX9-NEXT:    v_and_b32_e32 v17, 0xffff0000, v28
-; GFX9-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
-; GFX9-NEXT:    v_add_f32_e32 v12, v12, v17
-; GFX9-NEXT:    v_perm_b32 v12, v12, v16, s4
-; GFX9-NEXT:    v_lshlrev_b32_e32 v16, 16, v29
-; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v13
-; GFX9-NEXT:    v_add_f32_e32 v16, v17, v16
-; GFX9-NEXT:    v_and_b32_e32 v17, 0xffff0000, v29
-; GFX9-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
-; GFX9-NEXT:    v_add_f32_e32 v13, v13, v17
-; GFX9-NEXT:    v_perm_b32 v13, v13, v16, s4
-; GFX9-NEXT:    v_lshlrev_b32_e32 v16, 16, v30
-; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v14
-; GFX9-NEXT:    v_add_f32_e32 v16, v17, v16
-; GFX9-NEXT:    v_and_b32_e32 v17, 0xffff0000, v30
-; GFX9-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
-; GFX9-NEXT:    v_add_f32_e32 v14, v14, v17
-; GFX9-NEXT:    v_perm_b32 v14, v14, v16, s4
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_lshlrev_b32_e32 v16, 16, v18
-; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v15
-; GFX9-NEXT:    v_add_f32_e32 v16, v17, v16
-; GFX9-NEXT:    v_and_b32_e32 v17, 0xffff0000, v18
-; GFX9-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
-; GFX9-NEXT:    v_add_f32_e32 v15, v15, v17
-; GFX9-NEXT:    v_perm_b32 v15, v15, v16, s4
+; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v16
+; GFX9-NEXT:    v_lshlrev_b32_e32 v33, 16, v0
+; GFX9-NEXT:    v_and_b32_e32 v16, 0xffff0000, v16
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    v_add_f32_e32 v17, v33, v17
+; GFX9-NEXT:    v_add_f32_e32 v0, v0, v16
+; GFX9-NEXT:    v_perm_b32 v0, v0, v17, s4
+; GFX9-NEXT:    v_perm_b32 v1, v1, v18, s4
+; GFX9-NEXT:    v_perm_b32 v2, v2, v19, s4
+; GFX9-NEXT:    v_perm_b32 v3, v3, v20, s4
+; GFX9-NEXT:    v_perm_b32 v4, v4, v21, s4
+; GFX9-NEXT:    v_perm_b32 v5, v5, v22, s4
+; GFX9-NEXT:    v_perm_b32 v6, v6, v23, s4
+; GFX9-NEXT:    v_perm_b32 v7, v7, v24, s4
+; GFX9-NEXT:    v_perm_b32 v15, v15, v32, s4
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fadd_v32bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    buffer_load_dword v31, off, s[0:3], s32
-; GFX10-NEXT:    v_lshlrev_b32_e32 v50, 16, v21
-; GFX10-NEXT:    v_lshlrev_b32_e32 v51, 16, v5
-; GFX10-NEXT:    v_and_b32_e32 v21, 0xffff0000, v21
-; GFX10-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
-; GFX10-NEXT:    v_lshlrev_b32_e32 v52, 16, v22
-; GFX10-NEXT:    v_lshlrev_b32_e32 v53, 16, v6
-; GFX10-NEXT:    v_and_b32_e32 v22, 0xffff0000, v22
-; GFX10-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
-; GFX10-NEXT:    v_lshlrev_b32_e32 v54, 16, v23
-; GFX10-NEXT:    v_lshlrev_b32_e32 v55, 16, v7
-; GFX10-NEXT:    v_and_b32_e32 v23, 0xffff0000, v23
-; GFX10-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
-; GFX10-NEXT:    v_lshlrev_b32_e32 v32, 16, v16
-; GFX10-NEXT:    v_lshlrev_b32_e32 v33, 16, v0
-; GFX10-NEXT:    v_and_b32_e32 v16, 0xffff0000, v16
-; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX10-NEXT:    v_lshlrev_b32_e32 v34, 16, v17
-; GFX10-NEXT:    v_lshlrev_b32_e32 v35, 16, v1
+; GFX10-NEXT:    v_lshlrev_b32_e32 v39, 16, v27
+; GFX10-NEXT:    v_lshlrev_b32_e32 v48, 16, v11
+; GFX10-NEXT:    v_and_b32_e32 v27, 0xffff0000, v27
+; GFX10-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
+; GFX10-NEXT:    v_lshlrev_b32_e32 v49, 16, v26
+; GFX10-NEXT:    v_lshlrev_b32_e32 v50, 16, v10
+; GFX10-NEXT:    v_and_b32_e32 v26, 0xffff0000, v26
+; GFX10-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
+; GFX10-NEXT:    v_lshlrev_b32_e32 v33, 16, v30
+; GFX10-NEXT:    v_lshlrev_b32_e32 v34, 16, v14
+; GFX10-NEXT:    v_and_b32_e32 v30, 0xffff0000, v30
+; GFX10-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
+; GFX10-NEXT:    v_lshlrev_b32_e32 v35, 16, v29
+; GFX10-NEXT:    v_lshlrev_b32_e32 v36, 16, v13
+; GFX10-NEXT:    v_and_b32_e32 v29, 0xffff0000, v29
+; GFX10-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
+; GFX10-NEXT:    v_lshlrev_b32_e32 v37, 16, v28
+; GFX10-NEXT:    v_lshlrev_b32_e32 v38, 16, v12
+; GFX10-NEXT:    v_and_b32_e32 v28, 0xffff0000, v28
+; GFX10-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
+; GFX10-NEXT:    v_add_f32_e32 v39, v48, v39
+; GFX10-NEXT:    v_lshlrev_b32_e32 v48, 16, v17
+; GFX10-NEXT:    v_add_f32_e32 v11, v11, v27
+; GFX10-NEXT:    v_lshlrev_b32_e32 v27, 16, v1
 ; GFX10-NEXT:    v_and_b32_e32 v17, 0xffff0000, v17
 ; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX10-NEXT:    v_lshlrev_b32_e32 v36, 16, v18
-; GFX10-NEXT:    v_lshlrev_b32_e32 v37, 16, v2
-; GFX10-NEXT:    v_and_b32_e32 v18, 0xffff0000, v18
-; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
-; GFX10-NEXT:    v_lshlrev_b32_e32 v38, 16, v19
-; GFX10-NEXT:    v_lshlrev_b32_e32 v39, 16, v3
-; GFX10-NEXT:    v_and_b32_e32 v19, 0xffff0000, v19
-; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
-; GFX10-NEXT:    v_lshlrev_b32_e32 v48, 16, v20
-; GFX10-NEXT:    v_lshlrev_b32_e32 v49, 16, v4
-; GFX10-NEXT:    v_and_b32_e32 v20, 0xffff0000, v20
-; GFX10-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
-; GFX10-NEXT:    v_add_f32_e32 v5, v5, v21
-; GFX10-NEXT:    v_add_f32_e32 v21, v53, v52
-; GFX10-NEXT:    v_add_f32_e32 v6, v6, v22
-; GFX10-NEXT:    v_add_f32_e32 v22, v55, v54
-; GFX10-NEXT:    v_add_f32_e32 v7, v7, v23
-; GFX10-NEXT:    v_lshlrev_b32_e32 v64, 16, v24
-; GFX10-NEXT:    v_lshlrev_b32_e32 v65, 16, v8
-; GFX10-NEXT:    v_and_b32_e32 v24, 0xffff0000, v24
-; GFX10-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
-; GFX10-NEXT:    v_lshlrev_b32_e32 v66, 16, v25
-; GFX10-NEXT:    v_lshlrev_b32_e32 v67, 16, v9
+; GFX10-NEXT:    v_add_f32_e32 v49, v50, v49
+; GFX10-NEXT:    v_lshlrev_b32_e32 v50, 16, v16
+; GFX10-NEXT:    v_add_f32_e32 v10, v10, v26
+; GFX10-NEXT:    v_lshlrev_b32_e32 v26, 16, v0
+; GFX10-NEXT:    v_and_b32_e32 v16, 0xffff0000, v16
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v32, 16, v15
+; GFX10-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
+; GFX10-NEXT:    v_lshlrev_b32_e32 v51, 16, v25
+; GFX10-NEXT:    v_lshlrev_b32_e32 v52, 16, v9
 ; GFX10-NEXT:    v_and_b32_e32 v25, 0xffff0000, v25
 ; GFX10-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
-; GFX10-NEXT:    v_lshlrev_b32_e32 v68, 16, v26
-; GFX10-NEXT:    v_add_f32_e32 v32, v33, v32
-; GFX10-NEXT:    v_lshlrev_b32_e32 v33, 16, v10
-; GFX10-NEXT:    v_and_b32_e32 v26, 0xffff0000, v26
-; GFX10-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
+; GFX10-NEXT:    v_lshlrev_b32_e32 v53, 16, v24
+; GFX10-NEXT:    v_lshlrev_b32_e32 v54, 16, v8
+; GFX10-NEXT:    v_and_b32_e32 v24, 0xffff0000, v24
+; GFX10-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
+; GFX10-NEXT:    v_lshlrev_b32_e32 v55, 16, v23
+; GFX10-NEXT:    v_lshlrev_b32_e32 v64, 16, v7
+; GFX10-NEXT:    v_and_b32_e32 v23, 0xffff0000, v23
+; GFX10-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
+; GFX10-NEXT:    v_lshlrev_b32_e32 v65, 16, v22
+; GFX10-NEXT:    v_lshlrev_b32_e32 v66, 16, v6
+; GFX10-NEXT:    v_and_b32_e32 v22, 0xffff0000, v22
+; GFX10-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
+; GFX10-NEXT:    v_lshlrev_b32_e32 v67, 16, v21
+; GFX10-NEXT:    v_lshlrev_b32_e32 v68, 16, v5
+; GFX10-NEXT:    v_and_b32_e32 v21, 0xffff0000, v21
+; GFX10-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
+; GFX10-NEXT:    v_add_f32_e32 v33, v34, v33
+; GFX10-NEXT:    v_lshlrev_b32_e32 v34, 16, v20
+; GFX10-NEXT:    v_add_f32_e32 v14, v14, v30
+; GFX10-NEXT:    v_lshlrev_b32_e32 v30, 16, v4
+; GFX10-NEXT:    v_and_b32_e32 v20, 0xffff0000, v20
+; GFX10-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GFX10-NEXT:    v_add_f32_e32 v35, v36, v35
+; GFX10-NEXT:    v_lshlrev_b32_e32 v36, 16, v19
+; GFX10-NEXT:    v_add_f32_e32 v13, v13, v29
+; GFX10-NEXT:    v_lshlrev_b32_e32 v29, 16, v3
+; GFX10-NEXT:    v_and_b32_e32 v19, 0xffff0000, v19
+; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX10-NEXT:    v_add_f32_e32 v37, v38, v37
+; GFX10-NEXT:    v_lshlrev_b32_e32 v38, 16, v18
+; GFX10-NEXT:    v_add_f32_e32 v12, v12, v28
+; GFX10-NEXT:    v_lshlrev_b32_e32 v28, 16, v2
+; GFX10-NEXT:    v_and_b32_e32 v18, 0xffff0000, v18
+; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX10-NEXT:    v_add_f32_e32 v0, v0, v16
-; GFX10-NEXT:    v_lshlrev_b32_e32 v16, 16, v27
-; GFX10-NEXT:    v_add_f32_e32 v34, v35, v34
-; GFX10-NEXT:    v_lshlrev_b32_e32 v35, 16, v11
-; GFX10-NEXT:    v_and_b32_e32 v27, 0xffff0000, v27
-; GFX10-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
 ; GFX10-NEXT:    v_add_f32_e32 v1, v1, v17
-; GFX10-NEXT:    v_lshlrev_b32_e32 v17, 16, v28
-; GFX10-NEXT:    v_add_f32_e32 v36, v37, v36
-; GFX10-NEXT:    v_lshlrev_b32_e32 v37, 16, v12
-; GFX10-NEXT:    v_and_b32_e32 v28, 0xffff0000, v28
-; GFX10-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
+; GFX10-NEXT:    v_add_f32_e32 v51, v52, v51
+; GFX10-NEXT:    v_add_f32_e32 v9, v9, v25
+; GFX10-NEXT:    v_add_f32_e32 v25, v54, v53
+; GFX10-NEXT:    v_add_f32_e32 v8, v8, v24
+; GFX10-NEXT:    v_add_f32_e32 v24, v64, v55
+; GFX10-NEXT:    v_add_f32_e32 v7, v7, v23
+; GFX10-NEXT:    v_add_f32_e32 v23, v66, v65
+; GFX10-NEXT:    v_add_f32_e32 v6, v6, v22
+; GFX10-NEXT:    v_add_f32_e32 v22, v68, v67
+; GFX10-NEXT:    v_add_f32_e32 v5, v5, v21
+; GFX10-NEXT:    v_add_f32_e32 v21, v30, v34
+; GFX10-NEXT:    v_add_f32_e32 v29, v29, v36
+; GFX10-NEXT:    v_add_f32_e32 v28, v28, v38
+; GFX10-NEXT:    v_add_f32_e32 v27, v27, v48
+; GFX10-NEXT:    v_add_f32_e32 v26, v26, v50
 ; GFX10-NEXT:    v_add_f32_e32 v2, v2, v18
-; GFX10-NEXT:    v_lshlrev_b32_e32 v18, 16, v29
-; GFX10-NEXT:    v_add_f32_e32 v38, v39, v38
-; GFX10-NEXT:    v_lshlrev_b32_e32 v39, 16, v13
-; GFX10-NEXT:    v_and_b32_e32 v29, 0xffff0000, v29
-; GFX10-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
 ; GFX10-NEXT:    v_add_f32_e32 v3, v3, v19
-; GFX10-NEXT:    v_lshlrev_b32_e32 v19, 16, v30
-; GFX10-NEXT:    v_add_f32_e32 v48, v49, v48
-; GFX10-NEXT:    v_lshlrev_b32_e32 v49, 16, v14
-; GFX10-NEXT:    v_and_b32_e32 v30, 0xffff0000, v30
-; GFX10-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
 ; GFX10-NEXT:    v_add_f32_e32 v4, v4, v20
-; GFX10-NEXT:    v_lshlrev_b32_e32 v20, 16, v15
-; GFX10-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
-; GFX10-NEXT:    v_perm_b32 v6, v6, v21, 0x7060302
-; GFX10-NEXT:    v_perm_b32 v7, v7, v22, 0x7060302
-; GFX10-NEXT:    v_add_f32_e32 v50, v51, v50
-; GFX10-NEXT:    v_add_f32_e32 v23, v65, v64
-; GFX10-NEXT:    v_add_f32_e32 v8, v8, v24
-; GFX10-NEXT:    v_add_f32_e32 v24, v67, v66
-; GFX10-NEXT:    v_add_f32_e32 v9, v9, v25
-; GFX10-NEXT:    v_add_f32_e32 v25, v33, v68
-; GFX10-NEXT:    v_add_f32_e32 v10, v10, v26
-; GFX10-NEXT:    v_add_f32_e32 v16, v35, v16
-; GFX10-NEXT:    v_add_f32_e32 v11, v11, v27
-; GFX10-NEXT:    v_add_f32_e32 v17, v37, v17
-; GFX10-NEXT:    v_add_f32_e32 v12, v12, v28
-; GFX10-NEXT:    v_add_f32_e32 v18, v39, v18
-; GFX10-NEXT:    v_add_f32_e32 v13, v13, v29
-; GFX10-NEXT:    v_add_f32_e32 v19, v49, v19
-; GFX10-NEXT:    v_add_f32_e32 v14, v14, v30
-; GFX10-NEXT:    v_perm_b32 v0, v0, v32, 0x7060302
-; GFX10-NEXT:    v_perm_b32 v1, v1, v34, 0x7060302
-; GFX10-NEXT:    v_perm_b32 v2, v2, v36, 0x7060302
-; GFX10-NEXT:    v_perm_b32 v3, v3, v38, 0x7060302
-; GFX10-NEXT:    v_perm_b32 v4, v4, v48, 0x7060302
-; GFX10-NEXT:    v_perm_b32 v5, v5, v50, 0x7060302
-; GFX10-NEXT:    v_perm_b32 v8, v8, v23, 0x7060302
-; GFX10-NEXT:    v_perm_b32 v9, v9, v24, 0x7060302
-; GFX10-NEXT:    v_perm_b32 v10, v10, v25, 0x7060302
-; GFX10-NEXT:    v_perm_b32 v11, v11, v16, 0x7060302
-; GFX10-NEXT:    v_perm_b32 v12, v12, v17, 0x7060302
-; GFX10-NEXT:    v_perm_b32 v13, v13, v18, 0x7060302
-; GFX10-NEXT:    v_perm_b32 v14, v14, v19, 0x7060302
+; GFX10-NEXT:    v_perm_b32 v1, v1, v27, 0x7060302
+; GFX10-NEXT:    v_perm_b32 v0, v0, v26, 0x7060302
+; GFX10-NEXT:    v_perm_b32 v2, v2, v28, 0x7060302
+; GFX10-NEXT:    v_perm_b32 v3, v3, v29, 0x7060302
+; GFX10-NEXT:    v_perm_b32 v4, v4, v21, 0x7060302
+; GFX10-NEXT:    v_perm_b32 v5, v5, v22, 0x7060302
+; GFX10-NEXT:    v_perm_b32 v6, v6, v23, 0x7060302
+; GFX10-NEXT:    v_perm_b32 v7, v7, v24, 0x7060302
+; GFX10-NEXT:    v_perm_b32 v8, v8, v25, 0x7060302
+; GFX10-NEXT:    v_perm_b32 v9, v9, v51, 0x7060302
+; GFX10-NEXT:    v_perm_b32 v10, v10, v49, 0x7060302
+; GFX10-NEXT:    v_perm_b32 v11, v11, v39, 0x7060302
+; GFX10-NEXT:    v_perm_b32 v12, v12, v37, 0x7060302
+; GFX10-NEXT:    v_perm_b32 v13, v13, v35, 0x7060302
+; GFX10-NEXT:    v_perm_b32 v14, v14, v33, 0x7060302
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_lshlrev_b32_e32 v21, 16, v31
-; GFX10-NEXT:    v_and_b32_e32 v22, 0xffff0000, v31
-; GFX10-NEXT:    v_add_f32_e32 v20, v20, v21
-; GFX10-NEXT:    v_add_f32_e32 v15, v15, v22
-; GFX10-NEXT:    v_perm_b32 v15, v15, v20, 0x7060302
+; GFX10-NEXT:    v_lshlrev_b32_e32 v16, 16, v31
+; GFX10-NEXT:    v_and_b32_e32 v17, 0xffff0000, v31
+; GFX10-NEXT:    v_add_f32_e32 v16, v32, v16
+; GFX10-NEXT:    v_add_f32_e32 v15, v15, v17
+; GFX10-NEXT:    v_perm_b32 v15, v15, v16, 0x7060302
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fadd_v32bf16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    scratch_load_b32 v31, off, s32
-; GFX11-NEXT:    v_lshlrev_b32_e32 v68, 16, v26
-; GFX11-NEXT:    v_lshlrev_b32_e32 v69, 16, v10
-; GFX11-NEXT:    v_and_b32_e32 v26, 0xffff0000, v26
-; GFX11-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
-; GFX11-NEXT:    v_lshlrev_b32_e32 v70, 16, v27
-; GFX11-NEXT:    v_lshlrev_b32_e32 v71, 16, v11
-; GFX11-NEXT:    v_lshlrev_b32_e32 v50, 16, v21
-; GFX11-NEXT:    v_lshlrev_b32_e32 v54, 16, v23
-; GFX11-NEXT:    v_lshlrev_b32_e32 v55, 16, v7
-; GFX11-NEXT:    v_lshlrev_b32_e32 v64, 16, v24
-; GFX11-NEXT:    v_lshlrev_b32_e32 v65, 16, v8
-; GFX11-NEXT:    v_and_b32_e32 v24, 0xffff0000, v24
-; GFX11-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
-; GFX11-NEXT:    v_lshlrev_b32_e32 v51, 16, v5
-; GFX11-NEXT:    v_dual_add_f32 v10, v10, v26 :: v_dual_and_b32 v5, 0xffff0000, v5
-; GFX11-NEXT:    v_lshlrev_b32_e32 v66, 16, v25
+; GFX11-NEXT:    v_lshlrev_b32_e32 v83, 16, v17
+; GFX11-NEXT:    v_lshlrev_b32_e32 v84, 16, v1
+; GFX11-NEXT:    v_and_b32_e32 v17, 0xffff0000, v17
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX11-NEXT:    v_lshlrev_b32_e32 v85, 16, v16
+; GFX11-NEXT:    v_lshlrev_b32_e32 v86, 16, v0
+; GFX11-NEXT:    v_and_b32_e32 v16, 0xffff0000, v16
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v54, 16, v8
+; GFX11-NEXT:    v_lshlrev_b32_e32 v64, 16, v7
+; GFX11-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
+; GFX11-NEXT:    v_lshlrev_b32_e32 v65, 16, v22
+; GFX11-NEXT:    v_lshlrev_b32_e32 v66, 16, v6
+; GFX11-NEXT:    v_lshlrev_b32_e32 v48, 16, v11
+; GFX11-NEXT:    v_dual_add_f32 v0, v0, v16 :: v_dual_and_b32 v11, 0xffff0000, v11
+; GFX11-NEXT:    v_and_b32_e32 v22, 0xffff0000, v22
+; GFX11-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
+; GFX11-NEXT:    v_lshlrev_b32_e32 v67, 16, v21
+; GFX11-NEXT:    v_lshlrev_b32_e32 v68, 16, v5
+; GFX11-NEXT:    v_lshlrev_b32_e32 v51, 16, v25
+; GFX11-NEXT:    v_and_b32_e32 v21, 0xffff0000, v21
+; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
+; GFX11-NEXT:    v_lshlrev_b32_e32 v69, 16, v20
+; GFX11-NEXT:    v_lshlrev_b32_e32 v70, 16, v4
+; GFX11-NEXT:    v_and_b32_e32 v20, 0xffff0000, v20
+; GFX11-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GFX11-NEXT:    v_lshlrev_b32_e32 v55, 16, v23
+; GFX11-NEXT:    v_lshlrev_b32_e32 v71, 16, v19
+; GFX11-NEXT:    v_lshlrev_b32_e32 v80, 16, v3
 ; GFX11-NEXT:    v_and_b32_e32 v25, 0xffff0000, v25
-; GFX11-NEXT:    v_and_b32_e32 v27, 0xffff0000, v27
-; GFX11-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
-; GFX11-NEXT:    v_lshlrev_b32_e32 v80, 16, v28
-; GFX11-NEXT:    v_lshlrev_b32_e32 v81, 16, v12
-; GFX11-NEXT:    v_lshlrev_b32_e32 v52, 16, v22
+; GFX11-NEXT:    v_and_b32_e32 v19, 0xffff0000, v19
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX11-NEXT:    v_lshlrev_b32_e32 v52, 16, v9
+; GFX11-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
+; GFX11-NEXT:    v_lshlrev_b32_e32 v81, 16, v18
+; GFX11-NEXT:    v_lshlrev_b32_e32 v82, 16, v2
+; GFX11-NEXT:    v_and_b32_e32 v18, 0xffff0000, v18
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX11-NEXT:    v_lshlrev_b32_e32 v53, 16, v24
+; GFX11-NEXT:    v_dual_add_f32 v1, v1, v17 :: v_dual_and_b32 v24, 0xffff0000, v24
+; GFX11-NEXT:    v_dual_add_f32 v5, v5, v21 :: v_dual_lshlrev_b32 v50, 16, v10
+; GFX11-NEXT:    v_dual_add_f32 v21, v70, v69 :: v_dual_and_b32 v10, 0xffff0000, v10
+; GFX11-NEXT:    v_dual_add_f32 v2, v2, v18 :: v_dual_add_f32 v3, v3, v19
+; GFX11-NEXT:    v_dual_add_f32 v4, v4, v20 :: v_dual_lshlrev_b32 v49, 16, v26
+; GFX11-NEXT:    v_dual_add_f32 v9, v9, v25 :: v_dual_and_b32 v26, 0xffff0000, v26
+; GFX11-NEXT:    v_add_f32_e32 v6, v6, v22
+; GFX11-NEXT:    v_dual_add_f32 v22, v68, v67 :: v_dual_lshlrev_b32 v37, 16, v28
 ; GFX11-NEXT:    v_and_b32_e32 v28, 0xffff0000, v28
-; GFX11-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
-; GFX11-NEXT:    v_lshlrev_b32_e32 v53, 16, v6
-; GFX11-NEXT:    v_lshlrev_b32_e32 v82, 16, v29
-; GFX11-NEXT:    v_lshlrev_b32_e32 v83, 16, v13
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_add_f32_e32 v10, v10, v26
+; GFX11-NEXT:    v_add_f32_e32 v26, v52, v51
+; GFX11-NEXT:    v_perm_b32 v4, v4, v21, 0x7060302
+; GFX11-NEXT:    v_add_f32_e32 v25, v54, v53
+; GFX11-NEXT:    v_perm_b32 v5, v5, v22, 0x7060302
+; GFX11-NEXT:    v_perm_b32 v9, v9, v26, 0x7060302
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v16, 16, v31
 ; GFX11-NEXT:    v_and_b32_e32 v23, 0xffff0000, v23
-; GFX11-NEXT:    v_and_b32_e32 v29, 0xffff0000, v29
+; GFX11-NEXT:    v_and_b32_e32 v17, 0xffff0000, v31
+; GFX11-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
+; GFX11-NEXT:    v_lshlrev_b32_e32 v36, 16, v13
 ; GFX11-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
-; GFX11-NEXT:    v_lshlrev_b32_e32 v84, 16, v30
-; GFX11-NEXT:    v_lshlrev_b32_e32 v85, 16, v14
-; GFX11-NEXT:    v_and_b32_e32 v22, 0xffff0000, v22
-; GFX11-NEXT:    v_and_b32_e32 v30, 0xffff0000, v30
-; GFX11-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
-; GFX11-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
-; GFX11-NEXT:    v_lshlrev_b32_e32 v86, 16, v15
-; GFX11-NEXT:    v_lshlrev_b32_e32 v67, 16, v9
-; GFX11-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
-; GFX11-NEXT:    v_lshlrev_b32_e32 v48, 16, v20
-; GFX11-NEXT:    v_dual_add_f32 v11, v11, v27 :: v_dual_and_b32 v20, 0xffff0000, v20
-; GFX11-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
-; GFX11-NEXT:    v_dual_add_f32 v26, v71, v70 :: v_dual_lshlrev_b32 v49, 16, v4
-; GFX11-NEXT:    v_dual_add_f32 v13, v13, v29 :: v_dual_and_b32 v4, 0xffff0000, v4
-; GFX11-NEXT:    v_lshlrev_b32_e32 v35, 16, v1
-; GFX11-NEXT:    v_lshlrev_b32_e32 v37, 16, v2
-; GFX11-NEXT:    v_lshlrev_b32_e32 v38, 16, v19
+; GFX11-NEXT:    v_lshlrev_b32_e32 v39, 16, v27
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT:    v_add_f32_e32 v4, v4, v20
-; GFX11-NEXT:    v_dual_add_f32 v8, v8, v24 :: v_dual_add_f32 v9, v9, v25
-; GFX11-NEXT:    v_add_f32_e32 v25, v69, v68
-; GFX11-NEXT:    v_dual_add_f32 v20, v51, v50 :: v_dual_lshlrev_b32 v39, 16, v3
-; GFX11-NEXT:    v_add_f32_e32 v27, v81, v80
-; GFX11-NEXT:    v_add_f32_e32 v12, v12, v28
-; GFX11-NEXT:    v_dual_add_f32 v28, v83, v82 :: v_dual_add_f32 v29, v85, v84
-; GFX11-NEXT:    v_dual_add_f32 v6, v6, v22 :: v_dual_and_b32 v3, 0xffff0000, v3
-; GFX11-NEXT:    v_add_f32_e32 v22, v55, v54
-; GFX11-NEXT:    v_lshlrev_b32_e32 v36, 16, v18
-; GFX11-NEXT:    v_lshlrev_b32_e32 v34, 16, v17
-; GFX11-NEXT:    v_and_b32_e32 v17, 0xffff0000, v17
-; GFX11-NEXT:    v_and_b32_e32 v18, 0xffff0000, v18
+; GFX11-NEXT:    v_dual_add_f32 v8, v8, v24 :: v_dual_and_b32 v27, 0xffff0000, v27
+; GFX11-NEXT:    v_add_f32_e32 v24, v64, v55
+; GFX11-NEXT:    v_lshlrev_b32_e32 v38, 16, v12
+; GFX11-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
+; GFX11-NEXT:    v_lshlrev_b32_e32 v35, 16, v29
+; GFX11-NEXT:    v_add_f32_e32 v7, v7, v23
+; GFX11-NEXT:    v_add_f32_e32 v23, v66, v65
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_dual_add_f32 v12, v12, v28 :: v_dual_and_b32 v29, 0xffff0000, v29
+; GFX11-NEXT:    v_dual_add_f32 v28, v48, v39 :: v_dual_lshlrev_b32 v33, 16, v30
+; GFX11-NEXT:    v_dual_add_f32 v13, v13, v29 :: v_dual_lshlrev_b32 v34, 16, v14
+; GFX11-NEXT:    v_lshlrev_b32_e32 v32, 16, v15
+; GFX11-NEXT:    v_dual_add_f32 v11, v11, v27 :: v_dual_and_b32 v14, 0xffff0000, v14
+; GFX11-NEXT:    v_dual_add_f32 v27, v50, v49 :: v_dual_and_b32 v30, 0xffff0000, v30
+; GFX11-NEXT:    v_add_f32_e32 v29, v38, v37
+; GFX11-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
+; GFX11-NEXT:    v_add_f32_e32 v37, v86, v85
+; GFX11-NEXT:    v_perm_b32 v6, v6, v23, 0x7060302
 ; GFX11-NEXT:    v_add_f32_e32 v14, v14, v30
-; GFX11-NEXT:    v_dual_add_f32 v7, v7, v23 :: v_dual_and_b32 v2, 0xffff0000, v2
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX11-NEXT:    v_add_f32_e32 v23, v65, v64
-; GFX11-NEXT:    v_and_b32_e32 v19, 0xffff0000, v19
-; GFX11-NEXT:    v_dual_add_f32 v24, v67, v66 :: v_dual_and_b32 v21, 0xffff0000, v21
-; GFX11-NEXT:    v_add_f32_e32 v2, v2, v18
-; GFX11-NEXT:    v_dual_add_f32 v1, v1, v17 :: v_dual_lshlrev_b32 v32, 16, v16
-; GFX11-NEXT:    v_add_f32_e32 v18, v39, v38
-; GFX11-NEXT:    v_dual_add_f32 v3, v3, v19 :: v_dual_and_b32 v16, 0xffff0000, v16
-; GFX11-NEXT:    v_add_f32_e32 v19, v49, v48
-; GFX11-NEXT:    v_add_f32_e32 v17, v37, v36
-; GFX11-NEXT:    v_lshlrev_b32_e32 v33, 16, v0
-; GFX11-NEXT:    v_dual_add_f32 v5, v5, v21 :: v_dual_and_b32 v0, 0xffff0000, v0
-; GFX11-NEXT:    v_add_f32_e32 v21, v53, v52
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_perm_b32 v2, v2, v17, 0x7060302
-; GFX11-NEXT:    v_perm_b32 v3, v3, v18, 0x7060302
-; GFX11-NEXT:    v_add_f32_e32 v0, v0, v16
-; GFX11-NEXT:    v_add_f32_e32 v16, v35, v34
-; GFX11-NEXT:    v_add_f32_e32 v32, v33, v32
-; GFX11-NEXT:    v_perm_b32 v4, v4, v19, 0x7060302
-; GFX11-NEXT:    v_perm_b32 v5, v5, v20, 0x7060302
-; GFX11-NEXT:    v_perm_b32 v6, v6, v21, 0x7060302
-; GFX11-NEXT:    v_perm_b32 v1, v1, v16, 0x7060302
-; GFX11-NEXT:    v_perm_b32 v0, v0, v32, 0x7060302
-; GFX11-NEXT:    v_perm_b32 v7, v7, v22, 0x7060302
-; GFX11-NEXT:    v_perm_b32 v8, v8, v23, 0x7060302
-; GFX11-NEXT:    v_perm_b32 v9, v9, v24, 0x7060302
-; GFX11-NEXT:    v_perm_b32 v10, v10, v25, 0x7060302
-; GFX11-NEXT:    v_perm_b32 v11, v11, v26, 0x7060302
-; GFX11-NEXT:    v_perm_b32 v12, v12, v27, 0x7060302
-; GFX11-NEXT:    v_perm_b32 v13, v13, v28, 0x7060302
-; GFX11-NEXT:    v_perm_b32 v14, v14, v29, 0x7060302
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_lshlrev_b32_e32 v16, 16, v31
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_dual_add_f32 v16, v86, v16 :: v_dual_and_b32 v17, 0xffff0000, v31
-; GFX11-NEXT:    v_add_f32_e32 v15, v15, v17
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_dual_add_f32 v30, v36, v35 :: v_dual_add_f32 v33, v34, v33
+; GFX11-NEXT:    v_dual_add_f32 v34, v80, v71 :: v_dual_add_f32 v35, v82, v81
+; GFX11-NEXT:    v_add_f32_e32 v36, v84, v83
+; GFX11-NEXT:    v_dual_add_f32 v16, v32, v16 :: v_dual_add_f32 v15, v15, v17
+; GFX11-NEXT:    v_perm_b32 v0, v0, v37, 0x7060302
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_perm_b32 v2, v2, v35, 0x7060302
+; GFX11-NEXT:    v_perm_b32 v1, v1, v36, 0x7060302
+; GFX11-NEXT:    v_perm_b32 v3, v3, v34, 0x7060302
+; GFX11-NEXT:    v_perm_b32 v7, v7, v24, 0x7060302
+; GFX11-NEXT:    v_perm_b32 v8, v8, v25, 0x7060302
+; GFX11-NEXT:    v_perm_b32 v10, v10, v27, 0x7060302
+; GFX11-NEXT:    v_perm_b32 v11, v11, v28, 0x7060302
+; GFX11-NEXT:    v_perm_b32 v12, v12, v29, 0x7060302
+; GFX11-NEXT:    v_perm_b32 v13, v13, v30, 0x7060302
+; GFX11-NEXT:    v_perm_b32 v14, v14, v33, 0x7060302
 ; GFX11-NEXT:    v_perm_b32 v15, v15, v16, 0x7060302
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = fadd <32 x bfloat> %a, %b
@@ -10681,83 +10630,52 @@ define <3 x bfloat> @v_fsub_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) {
 ; GFX8-LABEL: v_fsub_v3bf16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
-; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-NEXT:    v_sub_f32_e32 v1, v1, v3
+; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
+; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v0
 ; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GFX8-NEXT:    v_sub_f32_e32 v0, v0, v2
-; GFX8-NEXT:    v_sub_f32_e32 v4, v5, v4
+; GFX8-NEXT:    v_sub_f32_e32 v3, v4, v3
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX8-NEXT:    v_alignbit_b32 v0, v0, v4, 16
-; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
-; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v1
-; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
-; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX8-NEXT:    v_sub_f32_e32 v1, v1, v3
-; GFX8-NEXT:    v_sub_f32_e32 v2, v4, v2
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX8-NEXT:    v_alignbit_b32 v1, v1, v2, 16
+; GFX8-NEXT:    v_alignbit_b32 v0, v0, v3, 16
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_fsub_v3bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
-; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
+; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX9-NEXT:    v_sub_f32_e32 v1, v1, v3
+; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
+; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 16, v0
 ; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX9-NEXT:    v_sub_f32_e32 v4, v5, v4
+; GFX9-NEXT:    v_sub_f32_e32 v3, v4, v3
 ; GFX9-NEXT:    v_sub_f32_e32 v0, v0, v2
 ; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
-; GFX9-NEXT:    v_perm_b32 v0, v0, v4, s4
-; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
-; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 16, v1
-; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
-; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX9-NEXT:    v_sub_f32_e32 v2, v4, v2
-; GFX9-NEXT:    v_sub_f32_e32 v1, v1, v3
-; GFX9-NEXT:    v_perm_b32 v1, v1, v2, s4
+; GFX9-NEXT:    v_perm_b32 v0, v0, v3, s4
+; GFX9-NEXT:    v_alignbit_b32 v1, s4, v1, 16
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fsub_v3bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
 ; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 16, v3
-; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 16, v1
-; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
-; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX10-NEXT:    v_sub_f32_e32 v4, v5, v4
 ; GFX10-NEXT:    v_sub_f32_e32 v0, v0, v2
-; GFX10-NEXT:    v_sub_f32_e32 v2, v7, v6
 ; GFX10-NEXT:    v_sub_f32_e32 v1, v1, v3
 ; GFX10-NEXT:    v_perm_b32 v0, v0, v4, 0x7060302
-; GFX10-NEXT:    v_perm_b32 v1, v1, v2, 0x7060302
+; GFX10-NEXT:    v_alignbit_b32 v1, s4, v1, 16
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: v_fsub_v3bf16:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_lshlrev_b32_e32 v6, 16, v3
-; GFX11-NEXT:    v_lshlrev_b32_e32 v7, 16, v1
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
-; GFX11-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_dual_sub_f32 v1, v1, v3 :: v_dual_and_b32 v0, 0xffff0000, v0
-; GFX11-NEXT:    v_sub_f32_e32 v0, v0, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_sub_f32_e32 v4, v5, v4
-; GFX11-NEXT:    v_sub_f32_e32 v2, v7, v6
-; GFX11-NEXT:    v_perm_b32 v0, v0, v4, 0x7060302
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT:    v_perm_b32 v1, v1, v2, 0x7060302
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = fsub <3 x bfloat> %a, %b
   ret <3 x bfloat> %op
 }
@@ -10808,82 +10726,81 @@ define <4 x bfloat> @v_fsub_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
 ; GFX8-LABEL: v_fsub_v4bf16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
+; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
+; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX8-NEXT:    v_sub_f32_e32 v4, v5, v4
+; GFX8-NEXT:    v_sub_f32_e32 v1, v1, v3
+; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
 ; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GFX8-NEXT:    v_sub_f32_e32 v0, v0, v2
-; GFX8-NEXT:    v_sub_f32_e32 v4, v5, v4
-; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX8-NEXT:    v_alignbit_b32 v0, v0, v4, 16
-; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
-; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v1
-; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
-; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX8-NEXT:    v_sub_f32_e32 v1, v1, v3
-; GFX8-NEXT:    v_sub_f32_e32 v2, v4, v2
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX8-NEXT:    v_alignbit_b32 v1, v1, v2, 16
+; GFX8-NEXT:    v_sub_f32_e32 v3, v5, v3
+; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX8-NEXT:    v_alignbit_b32 v0, v0, v3, 16
+; GFX8-NEXT:    v_alignbit_b32 v1, v1, v4, 16
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_fsub_v4bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
+; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
+; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX9-NEXT:    v_sub_f32_e32 v4, v5, v4
+; GFX9-NEXT:    v_sub_f32_e32 v1, v1, v3
+; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
 ; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX9-NEXT:    v_sub_f32_e32 v4, v5, v4
+; GFX9-NEXT:    v_sub_f32_e32 v3, v5, v3
 ; GFX9-NEXT:    v_sub_f32_e32 v0, v0, v2
 ; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
-; GFX9-NEXT:    v_perm_b32 v0, v0, v4, s4
-; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
-; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 16, v1
-; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
-; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX9-NEXT:    v_sub_f32_e32 v2, v4, v2
-; GFX9-NEXT:    v_sub_f32_e32 v1, v1, v3
-; GFX9-NEXT:    v_perm_b32 v1, v1, v2, s4
+; GFX9-NEXT:    v_perm_b32 v0, v0, v3, s4
+; GFX9-NEXT:    v_perm_b32 v1, v1, v4, s4
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fsub_v4bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
-; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
+; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
+; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 16, v2
+; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 16, v0
 ; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 16, v3
-; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 16, v1
-; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
 ; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX10-NEXT:    v_sub_f32_e32 v4, v5, v4
+; GFX10-NEXT:    v_sub_f32_e32 v5, v7, v6
 ; GFX10-NEXT:    v_sub_f32_e32 v0, v0, v2
-; GFX10-NEXT:    v_sub_f32_e32 v2, v7, v6
 ; GFX10-NEXT:    v_sub_f32_e32 v1, v1, v3
-; GFX10-NEXT:    v_perm_b32 v0, v0, v4, 0x7060302
-; GFX10-NEXT:    v_perm_b32 v1, v1, v2, 0x7060302
+; GFX10-NEXT:    v_perm_b32 v0, v0, v5, 0x7060302
+; GFX10-NEXT:    v_perm_b32 v1, v1, v4, 0x7060302
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fsub_v4bf16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_lshlrev_b32_e32 v6, 16, v3
-; GFX11-NEXT:    v_lshlrev_b32_e32 v7, 16, v1
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX11-NEXT:    v_lshlrev_b32_e32 v6, 16, v2
+; GFX11-NEXT:    v_lshlrev_b32_e32 v7, 16, v0
 ; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
-; GFX11-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_dual_sub_f32 v1, v1, v3 :: v_dual_and_b32 v0, 0xffff0000, v0
-; GFX11-NEXT:    v_sub_f32_e32 v0, v0, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_sub_f32_e32 v4, v5, v4
-; GFX11-NEXT:    v_sub_f32_e32 v2, v7, v6
-; GFX11-NEXT:    v_perm_b32 v0, v0, v4, 0x7060302
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
+; GFX11-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_dual_sub_f32 v0, v0, v2 :: v_dual_and_b32 v1, 0xffff0000, v1
+; GFX11-NEXT:    v_dual_sub_f32 v4, v5, v4 :: v_dual_and_b32 v3, 0xffff0000, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_sub_f32_e32 v1, v1, v3
+; GFX11-NEXT:    v_sub_f32_e32 v5, v7, v6
+; GFX11-NEXT:    v_perm_b32 v1, v1, v4, 0x7060302
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT:    v_perm_b32 v1, v1, v2, 0x7060302
+; GFX11-NEXT:    v_perm_b32 v0, v0, v5, 0x7060302
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = fsub <4 x bfloat> %a, %b
   ret <4 x bfloat> %op
@@ -11068,83 +10985,52 @@ define <3 x bfloat> @v_fmul_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) {
 ; GFX8-LABEL: v_fmul_v3bf16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
-; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-NEXT:    v_mul_f32_e32 v1, v1, v3
+; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
+; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v0
 ; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GFX8-NEXT:    v_mul_f32_e32 v0, v0, v2
-; GFX8-NEXT:    v_mul_f32_e32 v4, v5, v4
+; GFX8-NEXT:    v_mul_f32_e32 v3, v4, v3
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX8-NEXT:    v_alignbit_b32 v0, v0, v4, 16
-; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
-; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v1
-; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
-; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX8-NEXT:    v_mul_f32_e32 v1, v1, v3
-; GFX8-NEXT:    v_mul_f32_e32 v2, v4, v2
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX8-NEXT:    v_alignbit_b32 v1, v1, v2, 16
+; GFX8-NEXT:    v_alignbit_b32 v0, v0, v3, 16
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_fmul_v3bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
-; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
+; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX9-NEXT:    v_mul_f32_e32 v1, v1, v3
+; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
+; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 16, v0
 ; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX9-NEXT:    v_mul_f32_e32 v4, v5, v4
+; GFX9-NEXT:    v_mul_f32_e32 v3, v4, v3
 ; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v2
 ; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
-; GFX9-NEXT:    v_perm_b32 v0, v0, v4, s4
-; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
-; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 16, v1
-; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
-; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX9-NEXT:    v_mul_f32_e32 v2, v4, v2
-; GFX9-NEXT:    v_mul_f32_e32 v1, v1, v3
-; GFX9-NEXT:    v_perm_b32 v1, v1, v2, s4
+; GFX9-NEXT:    v_perm_b32 v0, v0, v3, s4
+; GFX9-NEXT:    v_alignbit_b32 v1, s4, v1, 16
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fmul_v3bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
 ; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 16, v3
-; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 16, v1
-; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
-; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX10-NEXT:    v_mul_f32_e32 v4, v5, v4
 ; GFX10-NEXT:    v_mul_f32_e32 v0, v0, v2
-; GFX10-NEXT:    v_mul_f32_e32 v2, v7, v6
 ; GFX10-NEXT:    v_mul_f32_e32 v1, v1, v3
 ; GFX10-NEXT:    v_perm_b32 v0, v0, v4, 0x7060302
-; GFX10-NEXT:    v_perm_b32 v1, v1, v2, 0x7060302
+; GFX10-NEXT:    v_alignbit_b32 v1, s4, v1, 16
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: v_fmul_v3bf16:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_lshlrev_b32_e32 v6, 16, v3
-; GFX11-NEXT:    v_lshlrev_b32_e32 v7, 16, v1
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
-; GFX11-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_dual_mul_f32 v1, v1, v3 :: v_dual_and_b32 v0, 0xffff0000, v0
-; GFX11-NEXT:    v_mul_f32_e32 v0, v0, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_mul_f32_e32 v4, v5, v4
-; GFX11-NEXT:    v_mul_f32_e32 v2, v7, v6
-; GFX11-NEXT:    v_perm_b32 v0, v0, v4, 0x7060302
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT:    v_perm_b32 v1, v1, v2, 0x7060302
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = fmul <3 x bfloat> %a, %b
   ret <3 x bfloat> %op
 }
@@ -11195,82 +11081,81 @@ define <4 x bfloat> @v_fmul_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
 ; GFX8-LABEL: v_fmul_v4bf16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
+; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
+; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX8-NEXT:    v_mul_f32_e32 v4, v5, v4
+; GFX8-NEXT:    v_mul_f32_e32 v1, v1, v3
+; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
 ; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GFX8-NEXT:    v_mul_f32_e32 v0, v0, v2
-; GFX8-NEXT:    v_mul_f32_e32 v4, v5, v4
-; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX8-NEXT:    v_alignbit_b32 v0, v0, v4, 16
-; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
-; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v1
-; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
-; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX8-NEXT:    v_mul_f32_e32 v1, v1, v3
-; GFX8-NEXT:    v_mul_f32_e32 v2, v4, v2
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX8-NEXT:    v_alignbit_b32 v1, v1, v2, 16
+; GFX8-NEXT:    v_mul_f32_e32 v3, v5, v3
+; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX8-NEXT:    v_alignbit_b32 v0, v0, v3, 16
+; GFX8-NEXT:    v_alignbit_b32 v1, v1, v4, 16
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_fmul_v4bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
+; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
+; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX9-NEXT:    v_mul_f32_e32 v4, v5, v4
+; GFX9-NEXT:    v_mul_f32_e32 v1, v1, v3
+; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
 ; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX9-NEXT:    v_mul_f32_e32 v4, v5, v4
+; GFX9-NEXT:    v_mul_f32_e32 v3, v5, v3
 ; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v2
 ; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
-; GFX9-NEXT:    v_perm_b32 v0, v0, v4, s4
-; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
-; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 16, v1
-; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
-; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX9-NEXT:    v_mul_f32_e32 v2, v4, v2
-; GFX9-NEXT:    v_mul_f32_e32 v1, v1, v3
-; GFX9-NEXT:    v_perm_b32 v1, v1, v2, s4
+; GFX9-NEXT:    v_perm_b32 v0, v0, v3, s4
+; GFX9-NEXT:    v_perm_b32 v1, v1, v4, s4
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fmul_v4bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
-; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
+; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
+; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 16, v2
+; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 16, v0
 ; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 16, v3
-; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 16, v1
-; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
 ; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX10-NEXT:    v_mul_f32_e32 v4, v5, v4
+; GFX10-NEXT:    v_mul_f32_e32 v5, v7, v6
 ; GFX10-NEXT:    v_mul_f32_e32 v0, v0, v2
-; GFX10-NEXT:    v_mul_f32_e32 v2, v7, v6
 ; GFX10-NEXT:    v_mul_f32_e32 v1, v1, v3
-; GFX10-NEXT:    v_perm_b32 v0, v0, v4, 0x7060302
-; GFX10-NEXT:    v_perm_b32 v1, v1, v2, 0x7060302
+; GFX10-NEXT:    v_perm_b32 v0, v0, v5, 0x7060302
+; GFX10-NEXT:    v_perm_b32 v1, v1, v4, 0x7060302
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fmul_v4bf16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_lshlrev_b32_e32 v6, 16, v3
-; GFX11-NEXT:    v_lshlrev_b32_e32 v7, 16, v1
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX11-NEXT:    v_lshlrev_b32_e32 v6, 16, v2
+; GFX11-NEXT:    v_lshlrev_b32_e32 v7, 16, v0
 ; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
-; GFX11-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_dual_mul_f32 v1, v1, v3 :: v_dual_and_b32 v0, 0xffff0000, v0
-; GFX11-NEXT:    v_mul_f32_e32 v0, v0, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_mul_f32_e32 v4, v5, v4
-; GFX11-NEXT:    v_mul_f32_e32 v2, v7, v6
-; GFX11-NEXT:    v_perm_b32 v0, v0, v4, 0x7060302
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
+; GFX11-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_dual_mul_f32 v0, v0, v2 :: v_dual_and_b32 v1, 0xffff0000, v1
+; GFX11-NEXT:    v_dual_mul_f32 v4, v5, v4 :: v_dual_and_b32 v3, 0xffff0000, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v3
+; GFX11-NEXT:    v_mul_f32_e32 v5, v7, v6
+; GFX11-NEXT:    v_perm_b32 v1, v1, v4, 0x7060302
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT:    v_perm_b32 v1, v1, v2, 0x7060302
+; GFX11-NEXT:    v_perm_b32 v0, v0, v5, 0x7060302
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = fmul <4 x bfloat> %a, %b
   ret <4 x bfloat> %op
@@ -11354,138 +11239,138 @@ define <8 x bfloat> @v_fmul_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
 ; GFX8-LABEL: v_fmul_v8bf16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_lshlrev_b32_e32 v8, 16, v4
-; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 16, v0
-; GFX8-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
-; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX8-NEXT:    v_mul_f32_e32 v0, v0, v4
+; GFX8-NEXT:    v_lshlrev_b32_e32 v8, 16, v7
+; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 16, v3
+; GFX8-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
+; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
 ; GFX8-NEXT:    v_mul_f32_e32 v8, v9, v8
-; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX8-NEXT:    v_alignbit_b32 v0, v0, v8, 16
-; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v5
-; GFX8-NEXT:    v_lshlrev_b32_e32 v8, 16, v1
+; GFX8-NEXT:    v_mul_f32_e32 v3, v3, v7
+; GFX8-NEXT:    v_lshlrev_b32_e32 v7, 16, v6
+; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 16, v2
+; GFX8-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
+; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX8-NEXT:    v_mul_f32_e32 v7, v9, v7
+; GFX8-NEXT:    v_mul_f32_e32 v2, v2, v6
+; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v5
+; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 16, v1
 ; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
 ; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX8-NEXT:    v_mul_f32_e32 v6, v9, v6
 ; GFX8-NEXT:    v_mul_f32_e32 v1, v1, v5
-; GFX8-NEXT:    v_mul_f32_e32 v4, v8, v4
-; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX8-NEXT:    v_alignbit_b32 v1, v1, v4, 16
-; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v6
-; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v2
-; GFX8-NEXT:    v_mul_f32_e32 v4, v5, v4
-; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
-; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
-; GFX8-NEXT:    v_mul_f32_e32 v2, v2, v5
-; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
-; GFX8-NEXT:    v_alignbit_b32 v2, v2, v4, 16
-; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v7
-; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v3
-; GFX8-NEXT:    v_mul_f32_e32 v4, v5, v4
-; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v7
-; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
-; GFX8-NEXT:    v_mul_f32_e32 v3, v3, v5
+; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v4
+; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 16, v0
+; GFX8-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    v_mul_f32_e32 v0, v0, v4
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
-; GFX8-NEXT:    v_alignbit_b32 v3, v3, v4, 16
+; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX8-NEXT:    v_mul_f32_e32 v5, v9, v5
+; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX8-NEXT:    v_alignbit_b32 v0, v0, v5, 16
+; GFX8-NEXT:    v_alignbit_b32 v1, v1, v6, 16
+; GFX8-NEXT:    v_alignbit_b32 v2, v2, v7, 16
+; GFX8-NEXT:    v_alignbit_b32 v3, v3, v8, 16
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_fmul_v8bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_lshlrev_b32_e32 v8, 16, v4
+; GFX9-NEXT:    v_lshlrev_b32_e32 v8, 16, v7
+; GFX9-NEXT:    v_lshlrev_b32_e32 v9, 16, v3
+; GFX9-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
+; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX9-NEXT:    v_mul_f32_e32 v8, v9, v8
+; GFX9-NEXT:    v_mul_f32_e32 v3, v3, v7
+; GFX9-NEXT:    v_lshlrev_b32_e32 v7, 16, v6
+; GFX9-NEXT:    v_lshlrev_b32_e32 v9, 16, v2
+; GFX9-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
+; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX9-NEXT:    v_mul_f32_e32 v7, v9, v7
+; GFX9-NEXT:    v_mul_f32_e32 v2, v2, v6
+; GFX9-NEXT:    v_lshlrev_b32_e32 v6, 16, v5
+; GFX9-NEXT:    v_lshlrev_b32_e32 v9, 16, v1
+; GFX9-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
+; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX9-NEXT:    v_mul_f32_e32 v6, v9, v6
+; GFX9-NEXT:    v_mul_f32_e32 v1, v1, v5
+; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 16, v4
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v9, 16, v0
 ; GFX9-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX9-NEXT:    v_mul_f32_e32 v8, v9, v8
+; GFX9-NEXT:    v_mul_f32_e32 v5, v9, v5
 ; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v4
 ; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
-; GFX9-NEXT:    v_perm_b32 v0, v0, v8, s4
-; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 16, v5
-; GFX9-NEXT:    v_lshlrev_b32_e32 v8, 16, v1
-; GFX9-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
-; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX9-NEXT:    v_mul_f32_e32 v4, v8, v4
-; GFX9-NEXT:    v_mul_f32_e32 v1, v1, v5
-; GFX9-NEXT:    v_perm_b32 v1, v1, v4, s4
-; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 16, v6
-; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 16, v2
-; GFX9-NEXT:    v_mul_f32_e32 v4, v5, v4
-; GFX9-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
-; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
-; GFX9-NEXT:    v_mul_f32_e32 v2, v2, v5
-; GFX9-NEXT:    v_perm_b32 v2, v2, v4, s4
-; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 16, v7
-; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 16, v3
-; GFX9-NEXT:    v_mul_f32_e32 v4, v5, v4
-; GFX9-NEXT:    v_and_b32_e32 v5, 0xffff0000, v7
-; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
-; GFX9-NEXT:    v_mul_f32_e32 v3, v3, v5
-; GFX9-NEXT:    v_perm_b32 v3, v3, v4, s4
+; GFX9-NEXT:    v_perm_b32 v0, v0, v5, s4
+; GFX9-NEXT:    v_perm_b32 v1, v1, v6, s4
+; GFX9-NEXT:    v_perm_b32 v2, v2, v7, s4
+; GFX9-NEXT:    v_perm_b32 v3, v3, v8, s4
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fmul_v8bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_lshlrev_b32_e32 v8, 16, v4
-; GFX10-NEXT:    v_lshlrev_b32_e32 v9, 16, v0
-; GFX10-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
-; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v8, 16, v7
+; GFX10-NEXT:    v_lshlrev_b32_e32 v9, 16, v3
+; GFX10-NEXT:    v_lshlrev_b32_e32 v10, 16, v6
+; GFX10-NEXT:    v_lshlrev_b32_e32 v11, 16, v2
+; GFX10-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
+; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX10-NEXT:    v_mul_f32_e32 v8, v9, v8
+; GFX10-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
+; GFX10-NEXT:    v_mul_f32_e32 v9, v11, v10
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v10, 16, v5
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v11, 16, v1
-; GFX10-NEXT:    v_mul_f32_e32 v8, v9, v8
 ; GFX10-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
-; GFX10-NEXT:    v_mul_f32_e32 v0, v0, v4
+; GFX10-NEXT:    v_lshlrev_b32_e32 v12, 16, v4
+; GFX10-NEXT:    v_lshlrev_b32_e32 v13, 16, v0
+; GFX10-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX10-NEXT:    v_mul_f32_e32 v4, v11, v10
-; GFX10-NEXT:    v_lshlrev_b32_e32 v9, 16, v6
-; GFX10-NEXT:    v_lshlrev_b32_e32 v10, 16, v2
-; GFX10-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
 ; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
-; GFX10-NEXT:    v_lshlrev_b32_e32 v11, 16, v7
-; GFX10-NEXT:    v_lshlrev_b32_e32 v12, 16, v3
-; GFX10-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
-; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX10-NEXT:    v_mul_f32_e32 v10, v11, v10
+; GFX10-NEXT:    v_mul_f32_e32 v11, v13, v12
+; GFX10-NEXT:    v_mul_f32_e32 v0, v0, v4
 ; GFX10-NEXT:    v_mul_f32_e32 v1, v1, v5
-; GFX10-NEXT:    v_mul_f32_e32 v5, v10, v9
 ; GFX10-NEXT:    v_mul_f32_e32 v2, v2, v6
-; GFX10-NEXT:    v_mul_f32_e32 v6, v12, v11
 ; GFX10-NEXT:    v_mul_f32_e32 v3, v3, v7
-; GFX10-NEXT:    v_perm_b32 v0, v0, v8, 0x7060302
-; GFX10-NEXT:    v_perm_b32 v1, v1, v4, 0x7060302
-; GFX10-NEXT:    v_perm_b32 v2, v2, v5, 0x7060302
-; GFX10-NEXT:    v_perm_b32 v3, v3, v6, 0x7060302
+; GFX10-NEXT:    v_perm_b32 v0, v0, v11, 0x7060302
+; GFX10-NEXT:    v_perm_b32 v1, v1, v10, 0x7060302
+; GFX10-NEXT:    v_perm_b32 v2, v2, v9, 0x7060302
+; GFX10-NEXT:    v_perm_b32 v3, v3, v8, 0x7060302
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fmul_v8bf16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v9, 16, v3
+; GFX11-NEXT:    v_lshlrev_b32_e32 v11, 16, v2
+; GFX11-NEXT:    v_lshlrev_b32_e32 v12, 16, v4
+; GFX11-NEXT:    v_lshlrev_b32_e32 v13, 16, v0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v10, 16, v6
+; GFX11-NEXT:    v_lshlrev_b32_e32 v8, 16, v7
+; GFX11-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX11-NEXT:    v_dual_mul_f32 v8, v9, v8 :: v_dual_mul_f32 v9, v11, v10
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_mul_f32_e32 v0, v0, v4
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v10, 16, v5
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v11, 16, v1
-; GFX11-NEXT:    v_lshlrev_b32_e32 v9, 16, v0
 ; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
-; GFX11-NEXT:    v_lshlrev_b32_e32 v12, 16, v3
 ; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX11-NEXT:    v_lshlrev_b32_e32 v8, 16, v4
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_dual_mul_f32 v1, v1, v5 :: v_dual_and_b32 v0, 0xffff0000, v0
-; GFX11-NEXT:    v_mul_f32_e32 v0, v0, v4
-; GFX11-NEXT:    v_mul_f32_e32 v4, v11, v10
-; GFX11-NEXT:    v_dual_mul_f32 v8, v9, v8 :: v_dual_lshlrev_b32 v9, 16, v6
-; GFX11-NEXT:    v_lshlrev_b32_e32 v10, 16, v2
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
-; GFX11-NEXT:    v_lshlrev_b32_e32 v11, 16, v7
-; GFX11-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
-; GFX11-NEXT:    v_mul_f32_e32 v5, v10, v9
-; GFX11-NEXT:    v_perm_b32 v0, v0, v8, 0x7060302
-; GFX11-NEXT:    v_mul_f32_e32 v2, v2, v6
-; GFX11-NEXT:    v_mul_f32_e32 v6, v12, v11
+; GFX11-NEXT:    v_dual_mul_f32 v1, v1, v5 :: v_dual_and_b32 v6, 0xffff0000, v6
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_dual_mul_f32 v2, v2, v6 :: v_dual_and_b32 v3, 0xffff0000, v3
 ; GFX11-NEXT:    v_mul_f32_e32 v3, v3, v7
-; GFX11-NEXT:    v_perm_b32 v1, v1, v4, 0x7060302
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_perm_b32 v2, v2, v5, 0x7060302
-; GFX11-NEXT:    v_perm_b32 v3, v3, v6, 0x7060302
+; GFX11-NEXT:    v_dual_mul_f32 v10, v11, v10 :: v_dual_mul_f32 v11, v13, v12
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_perm_b32 v2, v2, v9, 0x7060302
+; GFX11-NEXT:    v_perm_b32 v3, v3, v8, 0x7060302
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_perm_b32 v1, v1, v10, 0x7060302
+; GFX11-NEXT:    v_perm_b32 v0, v0, v11, 0x7060302
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = fmul <8 x bfloat> %a, %b
   ret <8 x bfloat> %op
@@ -11637,252 +11522,254 @@ define <16 x bfloat> @v_fmul_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
 ; GFX8-LABEL: v_fmul_v16bf16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_lshlrev_b32_e32 v16, 16, v8
-; GFX8-NEXT:    v_lshlrev_b32_e32 v17, 16, v0
-; GFX8-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
-; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX8-NEXT:    v_mul_f32_e32 v0, v0, v8
+; GFX8-NEXT:    v_lshlrev_b32_e32 v16, 16, v15
+; GFX8-NEXT:    v_lshlrev_b32_e32 v17, 16, v7
+; GFX8-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
+; GFX8-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
 ; GFX8-NEXT:    v_mul_f32_e32 v16, v17, v16
-; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX8-NEXT:    v_alignbit_b32 v0, v0, v16, 16
-; GFX8-NEXT:    v_lshlrev_b32_e32 v8, 16, v9
-; GFX8-NEXT:    v_lshlrev_b32_e32 v16, 16, v1
+; GFX8-NEXT:    v_mul_f32_e32 v7, v7, v15
+; GFX8-NEXT:    v_lshlrev_b32_e32 v15, 16, v14
+; GFX8-NEXT:    v_lshlrev_b32_e32 v17, 16, v6
+; GFX8-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
+; GFX8-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
+; GFX8-NEXT:    v_mul_f32_e32 v15, v17, v15
+; GFX8-NEXT:    v_mul_f32_e32 v6, v6, v14
+; GFX8-NEXT:    v_lshlrev_b32_e32 v14, 16, v13
+; GFX8-NEXT:    v_lshlrev_b32_e32 v17, 16, v5
+; GFX8-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
+; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
+; GFX8-NEXT:    v_mul_f32_e32 v14, v17, v14
+; GFX8-NEXT:    v_mul_f32_e32 v5, v5, v13
+; GFX8-NEXT:    v_lshlrev_b32_e32 v13, 16, v12
+; GFX8-NEXT:    v_lshlrev_b32_e32 v17, 16, v4
+; GFX8-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
+; GFX8-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GFX8-NEXT:    v_mul_f32_e32 v13, v17, v13
+; GFX8-NEXT:    v_mul_f32_e32 v4, v4, v12
+; GFX8-NEXT:    v_lshlrev_b32_e32 v12, 16, v11
+; GFX8-NEXT:    v_lshlrev_b32_e32 v17, 16, v3
+; GFX8-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
+; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX8-NEXT:    v_mul_f32_e32 v12, v17, v12
+; GFX8-NEXT:    v_mul_f32_e32 v3, v3, v11
+; GFX8-NEXT:    v_lshlrev_b32_e32 v11, 16, v10
+; GFX8-NEXT:    v_lshlrev_b32_e32 v17, 16, v2
+; GFX8-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
+; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX8-NEXT:    v_mul_f32_e32 v11, v17, v11
+; GFX8-NEXT:    v_mul_f32_e32 v2, v2, v10
+; GFX8-NEXT:    v_lshlrev_b32_e32 v10, 16, v9
+; GFX8-NEXT:    v_lshlrev_b32_e32 v17, 16, v1
 ; GFX8-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
 ; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX8-NEXT:    v_mul_f32_e32 v10, v17, v10
 ; GFX8-NEXT:    v_mul_f32_e32 v1, v1, v9
-; GFX8-NEXT:    v_mul_f32_e32 v8, v16, v8
-; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX8-NEXT:    v_alignbit_b32 v1, v1, v8, 16
-; GFX8-NEXT:    v_lshlrev_b32_e32 v8, 16, v10
-; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 16, v2
-; GFX8-NEXT:    v_mul_f32_e32 v8, v9, v8
-; GFX8-NEXT:    v_and_b32_e32 v9, 0xffff0000, v10
-; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
-; GFX8-NEXT:    v_mul_f32_e32 v2, v2, v9
-; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
-; GFX8-NEXT:    v_alignbit_b32 v2, v2, v8, 16
-; GFX8-NEXT:    v_lshlrev_b32_e32 v8, 16, v11
-; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 16, v3
-; GFX8-NEXT:    v_mul_f32_e32 v8, v9, v8
-; GFX8-NEXT:    v_and_b32_e32 v9, 0xffff0000, v11
-; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
-; GFX8-NEXT:    v_mul_f32_e32 v3, v3, v9
-; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
-; GFX8-NEXT:    v_alignbit_b32 v3, v3, v8, 16
-; GFX8-NEXT:    v_lshlrev_b32_e32 v8, 16, v12
-; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 16, v4
-; GFX8-NEXT:    v_mul_f32_e32 v8, v9, v8
-; GFX8-NEXT:    v_and_b32_e32 v9, 0xffff0000, v12
-; GFX8-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
-; GFX8-NEXT:    v_mul_f32_e32 v4, v4, v9
-; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
-; GFX8-NEXT:    v_alignbit_b32 v4, v4, v8, 16
-; GFX8-NEXT:    v_lshlrev_b32_e32 v8, 16, v13
-; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 16, v5
-; GFX8-NEXT:    v_mul_f32_e32 v8, v9, v8
-; GFX8-NEXT:    v_and_b32_e32 v9, 0xffff0000, v13
-; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
-; GFX8-NEXT:    v_mul_f32_e32 v5, v5, v9
-; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
-; GFX8-NEXT:    v_alignbit_b32 v5, v5, v8, 16
-; GFX8-NEXT:    v_lshlrev_b32_e32 v8, 16, v14
-; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 16, v6
-; GFX8-NEXT:    v_mul_f32_e32 v8, v9, v8
-; GFX8-NEXT:    v_and_b32_e32 v9, 0xffff0000, v14
-; GFX8-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
-; GFX8-NEXT:    v_mul_f32_e32 v6, v6, v9
-; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
-; GFX8-NEXT:    v_alignbit_b32 v6, v6, v8, 16
-; GFX8-NEXT:    v_lshlrev_b32_e32 v8, 16, v15
-; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 16, v7
-; GFX8-NEXT:    v_mul_f32_e32 v8, v9, v8
-; GFX8-NEXT:    v_and_b32_e32 v9, 0xffff0000, v15
-; GFX8-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
-; GFX8-NEXT:    v_mul_f32_e32 v7, v7, v9
+; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 16, v8
+; GFX8-NEXT:    v_lshlrev_b32_e32 v17, 16, v0
+; GFX8-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
+; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    v_mul_f32_e32 v0, v0, v8
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
-; GFX8-NEXT:    v_alignbit_b32 v7, v7, v8, 16
-; GFX8-NEXT:    s_setpc_b64 s[30:31]
-;
+; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
+; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
+; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
+; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX8-NEXT:    v_mul_f32_e32 v9, v17, v9
+; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX8-NEXT:    v_alignbit_b32 v0, v0, v9, 16
+; GFX8-NEXT:    v_alignbit_b32 v1, v1, v10, 16
+; GFX8-NEXT:    v_alignbit_b32 v2, v2, v11, 16
+; GFX8-NEXT:    v_alignbit_b32 v3, v3, v12, 16
+; GFX8-NEXT:    v_alignbit_b32 v4, v4, v13, 16
+; GFX8-NEXT:    v_alignbit_b32 v5, v5, v14, 16
+; GFX8-NEXT:    v_alignbit_b32 v6, v6, v15, 16
+; GFX8-NEXT:    v_alignbit_b32 v7, v7, v16, 16
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX9-LABEL: v_fmul_v16bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_lshlrev_b32_e32 v16, 16, v8
+; GFX9-NEXT:    v_lshlrev_b32_e32 v16, 16, v15
+; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v7
+; GFX9-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
+; GFX9-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
+; GFX9-NEXT:    v_mul_f32_e32 v16, v17, v16
+; GFX9-NEXT:    v_mul_f32_e32 v7, v7, v15
+; GFX9-NEXT:    v_lshlrev_b32_e32 v15, 16, v14
+; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v6
+; GFX9-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
+; GFX9-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
+; GFX9-NEXT:    v_mul_f32_e32 v15, v17, v15
+; GFX9-NEXT:    v_mul_f32_e32 v6, v6, v14
+; GFX9-NEXT:    v_lshlrev_b32_e32 v14, 16, v13
+; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v5
+; GFX9-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
+; GFX9-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
+; GFX9-NEXT:    v_mul_f32_e32 v14, v17, v14
+; GFX9-NEXT:    v_mul_f32_e32 v5, v5, v13
+; GFX9-NEXT:    v_lshlrev_b32_e32 v13, 16, v12
+; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v4
+; GFX9-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
+; GFX9-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GFX9-NEXT:    v_mul_f32_e32 v13, v17, v13
+; GFX9-NEXT:    v_mul_f32_e32 v4, v4, v12
+; GFX9-NEXT:    v_lshlrev_b32_e32 v12, 16, v11
+; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v3
+; GFX9-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
+; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX9-NEXT:    v_mul_f32_e32 v12, v17, v12
+; GFX9-NEXT:    v_mul_f32_e32 v3, v3, v11
+; GFX9-NEXT:    v_lshlrev_b32_e32 v11, 16, v10
+; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v2
+; GFX9-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
+; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX9-NEXT:    v_mul_f32_e32 v11, v17, v11
+; GFX9-NEXT:    v_mul_f32_e32 v2, v2, v10
+; GFX9-NEXT:    v_lshlrev_b32_e32 v10, 16, v9
+; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v1
+; GFX9-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
+; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX9-NEXT:    v_mul_f32_e32 v10, v17, v10
+; GFX9-NEXT:    v_mul_f32_e32 v1, v1, v9
+; GFX9-NEXT:    v_lshlrev_b32_e32 v9, 16, v8
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v0
 ; GFX9-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX9-NEXT:    v_mul_f32_e32 v16, v17, v16
+; GFX9-NEXT:    v_mul_f32_e32 v9, v17, v9
 ; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v8
 ; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
-; GFX9-NEXT:    v_perm_b32 v0, v0, v16, s4
-; GFX9-NEXT:    v_lshlrev_b32_e32 v8, 16, v9
-; GFX9-NEXT:    v_lshlrev_b32_e32 v16, 16, v1
-; GFX9-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
-; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX9-NEXT:    v_mul_f32_e32 v8, v16, v8
-; GFX9-NEXT:    v_mul_f32_e32 v1, v1, v9
-; GFX9-NEXT:    v_perm_b32 v1, v1, v8, s4
-; GFX9-NEXT:    v_lshlrev_b32_e32 v8, 16, v10
-; GFX9-NEXT:    v_lshlrev_b32_e32 v9, 16, v2
-; GFX9-NEXT:    v_mul_f32_e32 v8, v9, v8
-; GFX9-NEXT:    v_and_b32_e32 v9, 0xffff0000, v10
-; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
-; GFX9-NEXT:    v_mul_f32_e32 v2, v2, v9
-; GFX9-NEXT:    v_perm_b32 v2, v2, v8, s4
-; GFX9-NEXT:    v_lshlrev_b32_e32 v8, 16, v11
-; GFX9-NEXT:    v_lshlrev_b32_e32 v9, 16, v3
-; GFX9-NEXT:    v_mul_f32_e32 v8, v9, v8
-; GFX9-NEXT:    v_and_b32_e32 v9, 0xffff0000, v11
-; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
-; GFX9-NEXT:    v_mul_f32_e32 v3, v3, v9
-; GFX9-NEXT:    v_perm_b32 v3, v3, v8, s4
-; GFX9-NEXT:    v_lshlrev_b32_e32 v8, 16, v12
-; GFX9-NEXT:    v_lshlrev_b32_e32 v9, 16, v4
-; GFX9-NEXT:    v_mul_f32_e32 v8, v9, v8
-; GFX9-NEXT:    v_and_b32_e32 v9, 0xffff0000, v12
-; GFX9-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
-; GFX9-NEXT:    v_mul_f32_e32 v4, v4, v9
-; GFX9-NEXT:    v_perm_b32 v4, v4, v8, s4
-; GFX9-NEXT:    v_lshlrev_b32_e32 v8, 16, v13
-; GFX9-NEXT:    v_lshlrev_b32_e32 v9, 16, v5
-; GFX9-NEXT:    v_mul_f32_e32 v8, v9, v8
-; GFX9-NEXT:    v_and_b32_e32 v9, 0xffff0000, v13
-; GFX9-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
-; GFX9-NEXT:    v_mul_f32_e32 v5, v5, v9
-; GFX9-NEXT:    v_perm_b32 v5, v5, v8, s4
-; GFX9-NEXT:    v_lshlrev_b32_e32 v8, 16, v14
-; GFX9-NEXT:    v_lshlrev_b32_e32 v9, 16, v6
-; GFX9-NEXT:    v_mul_f32_e32 v8, v9, v8
-; GFX9-NEXT:    v_and_b32_e32 v9, 0xffff0000, v14
-; GFX9-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
-; GFX9-NEXT:    v_mul_f32_e32 v6, v6, v9
-; GFX9-NEXT:    v_perm_b32 v6, v6, v8, s4
-; GFX9-NEXT:    v_lshlrev_b32_e32 v8, 16, v15
-; GFX9-NEXT:    v_lshlrev_b32_e32 v9, 16, v7
-; GFX9-NEXT:    v_mul_f32_e32 v8, v9, v8
-; GFX9-NEXT:    v_and_b32_e32 v9, 0xffff0000, v15
-; GFX9-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
-; GFX9-NEXT:    v_mul_f32_e32 v7, v7, v9
-; GFX9-NEXT:    v_perm_b32 v7, v7, v8, s4
+; GFX9-NEXT:    v_perm_b32 v0, v0, v9, s4
+; GFX9-NEXT:    v_perm_b32 v1, v1, v10, s4
+; GFX9-NEXT:    v_perm_b32 v2, v2, v11, s4
+; GFX9-NEXT:    v_perm_b32 v3, v3, v12, s4
+; GFX9-NEXT:    v_perm_b32 v4, v4, v13, s4
+; GFX9-NEXT:    v_perm_b32 v5, v5, v14, s4
+; GFX9-NEXT:    v_perm_b32 v6, v6, v15, s4
+; GFX9-NEXT:    v_perm_b32 v7, v7, v16, s4
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fmul_v16bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_lshlrev_b32_e32 v16, 16, v8
-; GFX10-NEXT:    v_lshlrev_b32_e32 v17, 16, v0
-; GFX10-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
-; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX10-NEXT:    v_lshlrev_b32_e32 v18, 16, v1
-; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX10-NEXT:    v_lshlrev_b32_e32 v16, 16, v15
+; GFX10-NEXT:    v_lshlrev_b32_e32 v17, 16, v7
+; GFX10-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
+; GFX10-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
+; GFX10-NEXT:    v_lshlrev_b32_e32 v18, 16, v13
+; GFX10-NEXT:    v_lshlrev_b32_e32 v19, 16, v5
 ; GFX10-NEXT:    v_mul_f32_e32 v16, v17, v16
-; GFX10-NEXT:    v_lshlrev_b32_e32 v17, 16, v9
-; GFX10-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
+; GFX10-NEXT:    v_lshlrev_b32_e32 v17, 16, v6
+; GFX10-NEXT:    v_mul_f32_e32 v7, v7, v15
+; GFX10-NEXT:    v_lshlrev_b32_e32 v15, 16, v14
+; GFX10-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
+; GFX10-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
+; GFX10-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
+; GFX10-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
+; GFX10-NEXT:    v_lshlrev_b32_e32 v20, 16, v12
+; GFX10-NEXT:    v_lshlrev_b32_e32 v21, 16, v4
+; GFX10-NEXT:    v_mul_f32_e32 v15, v17, v15
+; GFX10-NEXT:    v_mul_f32_e32 v6, v6, v14
+; GFX10-NEXT:    v_mul_f32_e32 v14, v19, v18
+; GFX10-NEXT:    v_mul_f32_e32 v5, v5, v13
+; GFX10-NEXT:    v_mul_f32_e32 v13, v21, v20
+; GFX10-NEXT:    v_lshlrev_b32_e32 v17, 16, v11
+; GFX10-NEXT:    v_lshlrev_b32_e32 v18, 16, v3
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v19, 16, v10
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v20, 16, v2
+; GFX10-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
+; GFX10-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GFX10-NEXT:    v_mul_f32_e32 v17, v18, v17
+; GFX10-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
+; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX10-NEXT:    v_mul_f32_e32 v18, v20, v19
 ; GFX10-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
+; GFX10-NEXT:    v_lshlrev_b32_e32 v19, 16, v9
+; GFX10-NEXT:    v_lshlrev_b32_e32 v20, 16, v1
+; GFX10-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
+; GFX10-NEXT:    v_lshlrev_b32_e32 v21, 16, v8
+; GFX10-NEXT:    v_lshlrev_b32_e32 v22, 16, v0
+; GFX10-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX10-NEXT:    v_mul_f32_e32 v19, v20, v19
+; GFX10-NEXT:    v_mul_f32_e32 v20, v22, v21
 ; GFX10-NEXT:    v_mul_f32_e32 v0, v0, v8
-; GFX10-NEXT:    v_mul_f32_e32 v8, v18, v17
 ; GFX10-NEXT:    v_mul_f32_e32 v1, v1, v9
-; GFX10-NEXT:    v_mul_f32_e32 v9, v20, v19
 ; GFX10-NEXT:    v_mul_f32_e32 v2, v2, v10
-; GFX10-NEXT:    v_perm_b32 v0, v0, v16, 0x7060302
-; GFX10-NEXT:    v_and_b32_e32 v10, 0xffff0000, v11
-; GFX10-NEXT:    v_perm_b32 v1, v1, v8, 0x7060302
-; GFX10-NEXT:    v_lshlrev_b32_e32 v8, 16, v11
-; GFX10-NEXT:    v_perm_b32 v2, v2, v9, 0x7060302
-; GFX10-NEXT:    v_lshlrev_b32_e32 v9, 16, v3
-; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
-; GFX10-NEXT:    v_lshlrev_b32_e32 v11, 16, v12
-; GFX10-NEXT:    v_lshlrev_b32_e32 v16, 16, v4
-; GFX10-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
-; GFX10-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
-; GFX10-NEXT:    v_lshlrev_b32_e32 v17, 16, v13
-; GFX10-NEXT:    v_lshlrev_b32_e32 v18, 16, v5
-; GFX10-NEXT:    v_mul_f32_e32 v8, v9, v8
-; GFX10-NEXT:    v_mul_f32_e32 v3, v3, v10
-; GFX10-NEXT:    v_mul_f32_e32 v9, v16, v11
+; GFX10-NEXT:    v_mul_f32_e32 v3, v3, v11
 ; GFX10-NEXT:    v_mul_f32_e32 v4, v4, v12
-; GFX10-NEXT:    v_mul_f32_e32 v10, v18, v17
-; GFX10-NEXT:    v_and_b32_e32 v11, 0xffff0000, v13
-; GFX10-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
-; GFX10-NEXT:    v_lshlrev_b32_e32 v12, 16, v14
-; GFX10-NEXT:    v_lshlrev_b32_e32 v13, 16, v6
-; GFX10-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
-; GFX10-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
-; GFX10-NEXT:    v_lshlrev_b32_e32 v16, 16, v15
-; GFX10-NEXT:    v_lshlrev_b32_e32 v17, 16, v7
-; GFX10-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
-; GFX10-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
-; GFX10-NEXT:    v_mul_f32_e32 v5, v5, v11
-; GFX10-NEXT:    v_mul_f32_e32 v11, v13, v12
-; GFX10-NEXT:    v_mul_f32_e32 v6, v6, v14
-; GFX10-NEXT:    v_mul_f32_e32 v12, v17, v16
-; GFX10-NEXT:    v_mul_f32_e32 v7, v7, v15
-; GFX10-NEXT:    v_perm_b32 v3, v3, v8, 0x7060302
-; GFX10-NEXT:    v_perm_b32 v4, v4, v9, 0x7060302
-; GFX10-NEXT:    v_perm_b32 v5, v5, v10, 0x7060302
-; GFX10-NEXT:    v_perm_b32 v6, v6, v11, 0x7060302
-; GFX10-NEXT:    v_perm_b32 v7, v7, v12, 0x7060302
+; GFX10-NEXT:    v_perm_b32 v0, v0, v20, 0x7060302
+; GFX10-NEXT:    v_perm_b32 v1, v1, v19, 0x7060302
+; GFX10-NEXT:    v_perm_b32 v2, v2, v18, 0x7060302
+; GFX10-NEXT:    v_perm_b32 v3, v3, v17, 0x7060302
+; GFX10-NEXT:    v_perm_b32 v4, v4, v13, 0x7060302
+; GFX10-NEXT:    v_perm_b32 v5, v5, v14, 0x7060302
+; GFX10-NEXT:    v_perm_b32 v6, v6, v15, 0x7060302
+; GFX10-NEXT:    v_perm_b32 v7, v7, v16, 0x7060302
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fmul_v16bf16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_lshlrev_b32_e32 v18, 16, v1
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX11-NEXT:    v_lshlrev_b32_e32 v16, 16, v8
-; GFX11-NEXT:    v_lshlrev_b32_e32 v19, 16, v10
-; GFX11-NEXT:    v_lshlrev_b32_e32 v20, 16, v2
-; GFX11-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
-; GFX11-NEXT:    v_lshlrev_b32_e32 v17, 16, v0
-; GFX11-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_mul_f32_e32 v2, v2, v10
-; GFX11-NEXT:    v_dual_mul_f32 v16, v17, v16 :: v_dual_lshlrev_b32 v17, 16, v9
-; GFX11-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_dual_mul_f32 v1, v1, v9 :: v_dual_and_b32 v10, 0xffff0000, v11
-; GFX11-NEXT:    v_mul_f32_e32 v9, v20, v19
-; GFX11-NEXT:    v_perm_b32 v2, v2, v9, 0x7060302
-; GFX11-NEXT:    v_lshlrev_b32_e32 v9, 16, v3
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_dual_mul_f32 v3, v3, v10 :: v_dual_and_b32 v0, 0xffff0000, v0
-; GFX11-NEXT:    v_mul_f32_e32 v0, v0, v8
-; GFX11-NEXT:    v_mul_f32_e32 v8, v18, v17
-; GFX11-NEXT:    v_lshlrev_b32_e32 v17, 16, v13
-; GFX11-NEXT:    v_lshlrev_b32_e32 v18, 16, v5
-; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
-; GFX11-NEXT:    v_perm_b32 v0, v0, v16, 0x7060302
-; GFX11-NEXT:    v_perm_b32 v1, v1, v8, 0x7060302
-; GFX11-NEXT:    v_lshlrev_b32_e32 v8, 16, v11
-; GFX11-NEXT:    v_lshlrev_b32_e32 v11, 16, v12
-; GFX11-NEXT:    v_lshlrev_b32_e32 v16, 16, v4
+; GFX11-NEXT:    v_lshlrev_b32_e32 v20, 16, v12
+; GFX11-NEXT:    v_lshlrev_b32_e32 v21, 16, v4
+; GFX11-NEXT:    v_lshlrev_b32_e32 v18, 16, v13
+; GFX11-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
 ; GFX11-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
 ; GFX11-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
-; GFX11-NEXT:    v_dual_mul_f32 v10, v18, v17 :: v_dual_lshlrev_b32 v17, 16, v7
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT:    v_dual_mul_f32 v4, v4, v12 :: v_dual_and_b32 v7, 0xffff0000, v7
-; GFX11-NEXT:    v_dual_mul_f32 v8, v9, v8 :: v_dual_mul_f32 v9, v16, v11
-; GFX11-NEXT:    v_lshlrev_b32_e32 v12, 16, v14
-; GFX11-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
+; GFX11-NEXT:    v_lshlrev_b32_e32 v22, 16, v0
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v19, 16, v5
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_dual_mul_f32 v4, v4, v12 :: v_dual_and_b32 v5, 0xffff0000, v5
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v16, 16, v15
-; GFX11-NEXT:    v_and_b32_e32 v11, 0xffff0000, v13
-; GFX11-NEXT:    v_lshlrev_b32_e32 v13, 16, v6
+; GFX11-NEXT:    v_lshlrev_b32_e32 v17, 16, v7
+; GFX11-NEXT:    v_mul_f32_e32 v5, v5, v13
+; GFX11-NEXT:    v_mul_f32_e32 v13, v21, v20
+; GFX11-NEXT:    v_lshlrev_b32_e32 v21, 16, v8
+; GFX11-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
+; GFX11-NEXT:    v_dual_mul_f32 v16, v17, v16 :: v_dual_and_b32 v15, 0xffff0000, v15
+; GFX11-NEXT:    v_lshlrev_b32_e32 v17, 16, v6
+; GFX11-NEXT:    v_lshlrev_b32_e32 v20, 16, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_dual_mul_f32 v0, v0, v8 :: v_dual_and_b32 v7, 0xffff0000, v7
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX11-NEXT:    v_perm_b32 v4, v4, v13, 0x7060302
+; GFX11-NEXT:    v_mul_f32_e32 v7, v7, v15
+; GFX11-NEXT:    v_lshlrev_b32_e32 v15, 16, v14
+; GFX11-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
 ; GFX11-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
-; GFX11-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
-; GFX11-NEXT:    v_perm_b32 v3, v3, v8, 0x7060302
-; GFX11-NEXT:    v_mul_f32_e32 v5, v5, v11
-; GFX11-NEXT:    v_mul_f32_e32 v11, v13, v12
-; GFX11-NEXT:    v_mul_f32_e32 v12, v17, v16
-; GFX11-NEXT:    v_dual_mul_f32 v6, v6, v14 :: v_dual_mul_f32 v7, v7, v15
-; GFX11-NEXT:    v_perm_b32 v4, v4, v9, 0x7060302
-; GFX11-NEXT:    v_perm_b32 v5, v5, v10, 0x7060302
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_perm_b32 v6, v6, v11, 0x7060302
-; GFX11-NEXT:    v_perm_b32 v7, v7, v12, 0x7060302
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_perm_b32 v7, v7, v16, 0x7060302
+; GFX11-NEXT:    v_mul_f32_e32 v15, v17, v15
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX11-NEXT:    v_dual_mul_f32 v6, v6, v14 :: v_dual_lshlrev_b32 v17, 16, v11
+; GFX11-NEXT:    v_mul_f32_e32 v14, v19, v18
+; GFX11-NEXT:    v_lshlrev_b32_e32 v18, 16, v3
+; GFX11-NEXT:    v_lshlrev_b32_e32 v19, 16, v10
+; GFX11-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_dual_mul_f32 v17, v18, v17 :: v_dual_and_b32 v10, 0xffff0000, v10
+; GFX11-NEXT:    v_perm_b32 v5, v5, v14, 0x7060302
+; GFX11-NEXT:    v_perm_b32 v6, v6, v15, 0x7060302
+; GFX11-NEXT:    v_mul_f32_e32 v3, v3, v11
+; GFX11-NEXT:    v_dual_mul_f32 v18, v20, v19 :: v_dual_lshlrev_b32 v19, 16, v9
+; GFX11-NEXT:    v_lshlrev_b32_e32 v20, 16, v1
+; GFX11-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
+; GFX11-NEXT:    v_dual_mul_f32 v2, v2, v10 :: v_dual_and_b32 v1, 0xffff0000, v1
+; GFX11-NEXT:    v_perm_b32 v3, v3, v17, 0x7060302
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_dual_mul_f32 v19, v20, v19 :: v_dual_mul_f32 v20, v22, v21
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v9
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_perm_b32 v2, v2, v18, 0x7060302
+; GFX11-NEXT:    v_perm_b32 v0, v0, v20, 0x7060302
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX11-NEXT:    v_perm_b32 v1, v1, v19, 0x7060302
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = fmul <16 x bfloat> %a, %b
   ret <16 x bfloat> %op
@@ -12290,483 +12177,480 @@ define <32 x bfloat> @v_fmul_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
 ; GFX8-LABEL: v_fmul_v32bf16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_lshlrev_b32_e32 v31, 16, v16
-; GFX8-NEXT:    v_lshlrev_b32_e32 v32, 16, v0
-; GFX8-NEXT:    v_and_b32_e32 v16, 0xffff0000, v16
-; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX8-NEXT:    v_mul_f32_e32 v0, v0, v16
+; GFX8-NEXT:    v_lshlrev_b32_e32 v31, 16, v30
+; GFX8-NEXT:    v_lshlrev_b32_e32 v32, 16, v14
+; GFX8-NEXT:    v_and_b32_e32 v30, 0xffff0000, v30
+; GFX8-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
 ; GFX8-NEXT:    v_mul_f32_e32 v31, v32, v31
-; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX8-NEXT:    v_alignbit_b32 v0, v0, v31, 16
-; GFX8-NEXT:    v_lshlrev_b32_e32 v16, 16, v17
-; GFX8-NEXT:    v_lshlrev_b32_e32 v31, 16, v1
+; GFX8-NEXT:    v_mul_f32_e32 v30, v14, v30
+; GFX8-NEXT:    v_lshlrev_b32_e32 v14, 16, v29
+; GFX8-NEXT:    v_lshlrev_b32_e32 v32, 16, v13
+; GFX8-NEXT:    v_and_b32_e32 v29, 0xffff0000, v29
+; GFX8-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
+; GFX8-NEXT:    v_mul_f32_e32 v14, v32, v14
+; GFX8-NEXT:    v_mul_f32_e32 v13, v13, v29
+; GFX8-NEXT:    v_lshlrev_b32_e32 v29, 16, v28
+; GFX8-NEXT:    v_lshlrev_b32_e32 v32, 16, v12
+; GFX8-NEXT:    v_and_b32_e32 v28, 0xffff0000, v28
+; GFX8-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
+; GFX8-NEXT:    v_mul_f32_e32 v29, v32, v29
+; GFX8-NEXT:    v_mul_f32_e32 v12, v12, v28
+; GFX8-NEXT:    v_lshlrev_b32_e32 v28, 16, v27
+; GFX8-NEXT:    v_lshlrev_b32_e32 v32, 16, v11
+; GFX8-NEXT:    v_and_b32_e32 v27, 0xffff0000, v27
+; GFX8-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
+; GFX8-NEXT:    v_mul_f32_e32 v28, v32, v28
+; GFX8-NEXT:    v_mul_f32_e32 v11, v11, v27
+; GFX8-NEXT:    v_lshlrev_b32_e32 v27, 16, v26
+; GFX8-NEXT:    v_lshlrev_b32_e32 v32, 16, v10
+; GFX8-NEXT:    v_and_b32_e32 v26, 0xffff0000, v26
+; GFX8-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
+; GFX8-NEXT:    v_mul_f32_e32 v27, v32, v27
+; GFX8-NEXT:    v_mul_f32_e32 v10, v10, v26
+; GFX8-NEXT:    v_lshlrev_b32_e32 v26, 16, v25
+; GFX8-NEXT:    v_lshlrev_b32_e32 v32, 16, v9
+; GFX8-NEXT:    v_and_b32_e32 v25, 0xffff0000, v25
+; GFX8-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
+; GFX8-NEXT:    v_mul_f32_e32 v26, v32, v26
+; GFX8-NEXT:    v_mul_f32_e32 v9, v9, v25
+; GFX8-NEXT:    v_lshlrev_b32_e32 v25, 16, v24
+; GFX8-NEXT:    v_lshlrev_b32_e32 v32, 16, v8
+; GFX8-NEXT:    v_and_b32_e32 v24, 0xffff0000, v24
+; GFX8-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
+; GFX8-NEXT:    v_mul_f32_e32 v8, v8, v24
+; GFX8-NEXT:    buffer_load_dword v24, off, s[0:3], s32
+; GFX8-NEXT:    v_mul_f32_e32 v25, v32, v25
+; GFX8-NEXT:    v_lshlrev_b32_e32 v32, 16, v15
+; GFX8-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
+; GFX8-NEXT:    v_lshrrev_b32_e32 v8, 16, v8
+; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 16, v9
+; GFX8-NEXT:    v_lshrrev_b32_e32 v10, 16, v10
+; GFX8-NEXT:    v_lshrrev_b32_e32 v13, 16, v13
+; GFX8-NEXT:    v_lshrrev_b32_e32 v12, 16, v12
+; GFX8-NEXT:    v_lshrrev_b32_e32 v11, 16, v11
+; GFX8-NEXT:    v_alignbit_b32 v8, v8, v25, 16
+; GFX8-NEXT:    v_alignbit_b32 v9, v9, v26, 16
+; GFX8-NEXT:    v_alignbit_b32 v10, v10, v27, 16
+; GFX8-NEXT:    v_alignbit_b32 v11, v11, v28, 16
+; GFX8-NEXT:    v_alignbit_b32 v12, v12, v29, 16
+; GFX8-NEXT:    v_alignbit_b32 v13, v13, v14, 16
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_lshlrev_b32_e32 v33, 16, v24
+; GFX8-NEXT:    v_and_b32_e32 v24, 0xffff0000, v24
+; GFX8-NEXT:    v_mul_f32_e32 v32, v32, v33
+; GFX8-NEXT:    v_mul_f32_e32 v15, v15, v24
+; GFX8-NEXT:    v_lshlrev_b32_e32 v24, 16, v23
+; GFX8-NEXT:    v_lshlrev_b32_e32 v33, 16, v7
+; GFX8-NEXT:    v_and_b32_e32 v23, 0xffff0000, v23
+; GFX8-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
+; GFX8-NEXT:    v_mul_f32_e32 v24, v33, v24
+; GFX8-NEXT:    v_mul_f32_e32 v7, v7, v23
+; GFX8-NEXT:    v_lshlrev_b32_e32 v23, 16, v22
+; GFX8-NEXT:    v_lshlrev_b32_e32 v33, 16, v6
+; GFX8-NEXT:    v_and_b32_e32 v22, 0xffff0000, v22
+; GFX8-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
+; GFX8-NEXT:    v_mul_f32_e32 v23, v33, v23
+; GFX8-NEXT:    v_mul_f32_e32 v6, v6, v22
+; GFX8-NEXT:    v_lshlrev_b32_e32 v22, 16, v21
+; GFX8-NEXT:    v_lshlrev_b32_e32 v33, 16, v5
+; GFX8-NEXT:    v_and_b32_e32 v21, 0xffff0000, v21
+; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
+; GFX8-NEXT:    v_mul_f32_e32 v22, v33, v22
+; GFX8-NEXT:    v_mul_f32_e32 v5, v5, v21
+; GFX8-NEXT:    v_lshlrev_b32_e32 v21, 16, v20
+; GFX8-NEXT:    v_lshlrev_b32_e32 v33, 16, v4
+; GFX8-NEXT:    v_and_b32_e32 v20, 0xffff0000, v20
+; GFX8-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GFX8-NEXT:    v_mul_f32_e32 v21, v33, v21
+; GFX8-NEXT:    v_mul_f32_e32 v4, v4, v20
+; GFX8-NEXT:    v_lshlrev_b32_e32 v20, 16, v19
+; GFX8-NEXT:    v_lshlrev_b32_e32 v33, 16, v3
+; GFX8-NEXT:    v_and_b32_e32 v19, 0xffff0000, v19
+; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX8-NEXT:    v_mul_f32_e32 v20, v33, v20
+; GFX8-NEXT:    v_mul_f32_e32 v3, v3, v19
+; GFX8-NEXT:    v_lshlrev_b32_e32 v19, 16, v18
+; GFX8-NEXT:    v_lshlrev_b32_e32 v33, 16, v2
+; GFX8-NEXT:    v_and_b32_e32 v18, 0xffff0000, v18
+; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX8-NEXT:    v_mul_f32_e32 v19, v33, v19
+; GFX8-NEXT:    v_mul_f32_e32 v2, v2, v18
+; GFX8-NEXT:    v_lshlrev_b32_e32 v18, 16, v17
+; GFX8-NEXT:    v_lshlrev_b32_e32 v33, 16, v1
 ; GFX8-NEXT:    v_and_b32_e32 v17, 0xffff0000, v17
 ; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX8-NEXT:    v_mul_f32_e32 v18, v33, v18
 ; GFX8-NEXT:    v_mul_f32_e32 v1, v1, v17
-; GFX8-NEXT:    v_mul_f32_e32 v16, v31, v16
+; GFX8-NEXT:    v_lshlrev_b32_e32 v17, 16, v16
+; GFX8-NEXT:    v_lshlrev_b32_e32 v33, 16, v0
+; GFX8-NEXT:    v_and_b32_e32 v16, 0xffff0000, v16
+; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    v_mul_f32_e32 v0, v0, v16
+; GFX8-NEXT:    v_mul_f32_e32 v17, v33, v17
+; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX8-NEXT:    v_alignbit_b32 v1, v1, v16, 16
-; GFX8-NEXT:    v_lshlrev_b32_e32 v16, 16, v18
-; GFX8-NEXT:    v_lshlrev_b32_e32 v17, 16, v2
-; GFX8-NEXT:    v_mul_f32_e32 v16, v17, v16
-; GFX8-NEXT:    v_and_b32_e32 v17, 0xffff0000, v18
-; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
-; GFX8-NEXT:    v_mul_f32_e32 v2, v2, v17
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
-; GFX8-NEXT:    v_alignbit_b32 v2, v2, v16, 16
-; GFX8-NEXT:    v_lshlrev_b32_e32 v16, 16, v19
-; GFX8-NEXT:    v_lshlrev_b32_e32 v17, 16, v3
-; GFX8-NEXT:    v_mul_f32_e32 v16, v17, v16
-; GFX8-NEXT:    v_and_b32_e32 v17, 0xffff0000, v19
-; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
-; GFX8-NEXT:    v_mul_f32_e32 v3, v3, v17
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
-; GFX8-NEXT:    v_alignbit_b32 v3, v3, v16, 16
-; GFX8-NEXT:    v_lshlrev_b32_e32 v16, 16, v20
-; GFX8-NEXT:    v_lshlrev_b32_e32 v17, 16, v4
-; GFX8-NEXT:    v_mul_f32_e32 v16, v17, v16
-; GFX8-NEXT:    v_and_b32_e32 v17, 0xffff0000, v20
-; GFX8-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
-; GFX8-NEXT:    v_mul_f32_e32 v4, v4, v17
-; GFX8-NEXT:    buffer_load_dword v17, off, s[0:3], s32
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
-; GFX8-NEXT:    v_alignbit_b32 v4, v4, v16, 16
-; GFX8-NEXT:    v_lshlrev_b32_e32 v16, 16, v21
-; GFX8-NEXT:    v_lshlrev_b32_e32 v18, 16, v5
-; GFX8-NEXT:    v_mul_f32_e32 v16, v18, v16
-; GFX8-NEXT:    v_and_b32_e32 v18, 0xffff0000, v21
-; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
-; GFX8-NEXT:    v_mul_f32_e32 v5, v5, v18
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
-; GFX8-NEXT:    v_alignbit_b32 v5, v5, v16, 16
-; GFX8-NEXT:    v_lshlrev_b32_e32 v16, 16, v22
-; GFX8-NEXT:    v_lshlrev_b32_e32 v18, 16, v6
-; GFX8-NEXT:    v_mul_f32_e32 v16, v18, v16
-; GFX8-NEXT:    v_and_b32_e32 v18, 0xffff0000, v22
-; GFX8-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
-; GFX8-NEXT:    v_mul_f32_e32 v6, v6, v18
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
-; GFX8-NEXT:    v_alignbit_b32 v6, v6, v16, 16
-; GFX8-NEXT:    v_lshlrev_b32_e32 v16, 16, v23
-; GFX8-NEXT:    v_lshlrev_b32_e32 v18, 16, v7
-; GFX8-NEXT:    v_mul_f32_e32 v16, v18, v16
-; GFX8-NEXT:    v_and_b32_e32 v18, 0xffff0000, v23
-; GFX8-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
-; GFX8-NEXT:    v_mul_f32_e32 v7, v7, v18
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
-; GFX8-NEXT:    v_alignbit_b32 v7, v7, v16, 16
-; GFX8-NEXT:    v_lshlrev_b32_e32 v16, 16, v24
-; GFX8-NEXT:    v_lshlrev_b32_e32 v18, 16, v8
-; GFX8-NEXT:    v_mul_f32_e32 v16, v18, v16
-; GFX8-NEXT:    v_and_b32_e32 v18, 0xffff0000, v24
-; GFX8-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
-; GFX8-NEXT:    v_mul_f32_e32 v8, v8, v18
-; GFX8-NEXT:    v_lshrrev_b32_e32 v8, 16, v8
-; GFX8-NEXT:    v_alignbit_b32 v8, v8, v16, 16
-; GFX8-NEXT:    v_lshlrev_b32_e32 v16, 16, v25
-; GFX8-NEXT:    v_lshlrev_b32_e32 v18, 16, v9
-; GFX8-NEXT:    v_mul_f32_e32 v16, v18, v16
-; GFX8-NEXT:    v_and_b32_e32 v18, 0xffff0000, v25
-; GFX8-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
-; GFX8-NEXT:    v_mul_f32_e32 v9, v9, v18
-; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 16, v9
-; GFX8-NEXT:    v_alignbit_b32 v9, v9, v16, 16
-; GFX8-NEXT:    v_lshlrev_b32_e32 v16, 16, v26
-; GFX8-NEXT:    v_lshlrev_b32_e32 v18, 16, v10
-; GFX8-NEXT:    v_mul_f32_e32 v16, v18, v16
-; GFX8-NEXT:    v_and_b32_e32 v18, 0xffff0000, v26
-; GFX8-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
-; GFX8-NEXT:    v_mul_f32_e32 v10, v10, v18
-; GFX8-NEXT:    v_lshrrev_b32_e32 v10, 16, v10
-; GFX8-NEXT:    v_alignbit_b32 v10, v10, v16, 16
-; GFX8-NEXT:    v_lshlrev_b32_e32 v16, 16, v27
-; GFX8-NEXT:    v_lshlrev_b32_e32 v18, 16, v11
-; GFX8-NEXT:    v_mul_f32_e32 v16, v18, v16
-; GFX8-NEXT:    v_and_b32_e32 v18, 0xffff0000, v27
-; GFX8-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
-; GFX8-NEXT:    v_mul_f32_e32 v11, v11, v18
-; GFX8-NEXT:    v_lshrrev_b32_e32 v11, 16, v11
-; GFX8-NEXT:    v_alignbit_b32 v11, v11, v16, 16
-; GFX8-NEXT:    v_lshlrev_b32_e32 v16, 16, v28
-; GFX8-NEXT:    v_lshlrev_b32_e32 v18, 16, v12
-; GFX8-NEXT:    v_mul_f32_e32 v16, v18, v16
-; GFX8-NEXT:    v_and_b32_e32 v18, 0xffff0000, v28
-; GFX8-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
-; GFX8-NEXT:    v_mul_f32_e32 v12, v12, v18
-; GFX8-NEXT:    v_lshrrev_b32_e32 v12, 16, v12
-; GFX8-NEXT:    v_alignbit_b32 v12, v12, v16, 16
-; GFX8-NEXT:    v_lshlrev_b32_e32 v16, 16, v29
-; GFX8-NEXT:    v_lshlrev_b32_e32 v18, 16, v13
-; GFX8-NEXT:    v_mul_f32_e32 v16, v18, v16
-; GFX8-NEXT:    v_and_b32_e32 v18, 0xffff0000, v29
-; GFX8-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
-; GFX8-NEXT:    v_mul_f32_e32 v13, v13, v18
-; GFX8-NEXT:    v_lshrrev_b32_e32 v13, 16, v13
-; GFX8-NEXT:    v_alignbit_b32 v13, v13, v16, 16
-; GFX8-NEXT:    v_lshlrev_b32_e32 v16, 16, v30
-; GFX8-NEXT:    v_lshlrev_b32_e32 v18, 16, v14
-; GFX8-NEXT:    v_mul_f32_e32 v16, v18, v16
-; GFX8-NEXT:    v_and_b32_e32 v18, 0xffff0000, v30
-; GFX8-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
-; GFX8-NEXT:    v_mul_f32_e32 v14, v14, v18
-; GFX8-NEXT:    v_lshrrev_b32_e32 v14, 16, v14
-; GFX8-NEXT:    v_alignbit_b32 v14, v14, v16, 16
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_lshlrev_b32_e32 v16, 16, v17
-; GFX8-NEXT:    v_lshlrev_b32_e32 v18, 16, v15
-; GFX8-NEXT:    v_and_b32_e32 v17, 0xffff0000, v17
-; GFX8-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
-; GFX8-NEXT:    v_mul_f32_e32 v15, v15, v17
-; GFX8-NEXT:    v_mul_f32_e32 v16, v18, v16
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v15, 16, v15
-; GFX8-NEXT:    v_alignbit_b32 v15, v15, v16, 16
+; GFX8-NEXT:    v_lshrrev_b32_e32 v16, 16, v30
+; GFX8-NEXT:    v_alignbit_b32 v0, v0, v17, 16
+; GFX8-NEXT:    v_alignbit_b32 v1, v1, v18, 16
+; GFX8-NEXT:    v_alignbit_b32 v2, v2, v19, 16
+; GFX8-NEXT:    v_alignbit_b32 v3, v3, v20, 16
+; GFX8-NEXT:    v_alignbit_b32 v4, v4, v21, 16
+; GFX8-NEXT:    v_alignbit_b32 v5, v5, v22, 16
+; GFX8-NEXT:    v_alignbit_b32 v6, v6, v23, 16
+; GFX8-NEXT:    v_alignbit_b32 v7, v7, v24, 16
+; GFX8-NEXT:    v_alignbit_b32 v14, v16, v31, 16
+; GFX8-NEXT:    v_alignbit_b32 v15, v15, v32, 16
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_fmul_v32bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_lshlrev_b32_e32 v31, 16, v16
-; GFX9-NEXT:    v_lshlrev_b32_e32 v32, 16, v0
-; GFX9-NEXT:    v_and_b32_e32 v16, 0xffff0000, v16
-; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    v_lshlrev_b32_e32 v31, 16, v30
+; GFX9-NEXT:    v_lshlrev_b32_e32 v32, 16, v14
+; GFX9-NEXT:    v_and_b32_e32 v30, 0xffff0000, v30
+; GFX9-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
 ; GFX9-NEXT:    v_mul_f32_e32 v31, v32, v31
-; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v16
+; GFX9-NEXT:    v_mul_f32_e32 v14, v14, v30
+; GFX9-NEXT:    v_lshlrev_b32_e32 v30, 16, v29
+; GFX9-NEXT:    v_lshlrev_b32_e32 v32, 16, v13
+; GFX9-NEXT:    v_and_b32_e32 v29, 0xffff0000, v29
+; GFX9-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
+; GFX9-NEXT:    v_mul_f32_e32 v30, v32, v30
+; GFX9-NEXT:    v_mul_f32_e32 v13, v13, v29
+; GFX9-NEXT:    v_lshlrev_b32_e32 v29, 16, v28
+; GFX9-NEXT:    v_lshlrev_b32_e32 v32, 16, v12
+; GFX9-NEXT:    v_and_b32_e32 v28, 0xffff0000, v28
+; GFX9-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
+; GFX9-NEXT:    v_mul_f32_e32 v29, v32, v29
+; GFX9-NEXT:    v_mul_f32_e32 v12, v12, v28
+; GFX9-NEXT:    v_lshlrev_b32_e32 v28, 16, v27
+; GFX9-NEXT:    v_lshlrev_b32_e32 v32, 16, v11
+; GFX9-NEXT:    v_and_b32_e32 v27, 0xffff0000, v27
+; GFX9-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
+; GFX9-NEXT:    v_mul_f32_e32 v28, v32, v28
+; GFX9-NEXT:    v_mul_f32_e32 v11, v11, v27
+; GFX9-NEXT:    v_lshlrev_b32_e32 v27, 16, v26
+; GFX9-NEXT:    v_lshlrev_b32_e32 v32, 16, v10
+; GFX9-NEXT:    v_and_b32_e32 v26, 0xffff0000, v26
+; GFX9-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
+; GFX9-NEXT:    v_mul_f32_e32 v27, v32, v27
+; GFX9-NEXT:    v_mul_f32_e32 v10, v10, v26
+; GFX9-NEXT:    v_lshlrev_b32_e32 v26, 16, v25
+; GFX9-NEXT:    v_lshlrev_b32_e32 v32, 16, v9
+; GFX9-NEXT:    v_and_b32_e32 v25, 0xffff0000, v25
+; GFX9-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
+; GFX9-NEXT:    v_mul_f32_e32 v26, v32, v26
+; GFX9-NEXT:    v_mul_f32_e32 v9, v9, v25
+; GFX9-NEXT:    v_lshlrev_b32_e32 v25, 16, v24
+; GFX9-NEXT:    v_lshlrev_b32_e32 v32, 16, v8
+; GFX9-NEXT:    v_and_b32_e32 v24, 0xffff0000, v24
+; GFX9-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
+; GFX9-NEXT:    v_mul_f32_e32 v8, v8, v24
+; GFX9-NEXT:    buffer_load_dword v24, off, s[0:3], s32
+; GFX9-NEXT:    v_mul_f32_e32 v25, v32, v25
+; GFX9-NEXT:    v_lshlrev_b32_e32 v32, 16, v15
+; GFX9-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
 ; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
-; GFX9-NEXT:    v_perm_b32 v0, v0, v31, s4
-; GFX9-NEXT:    v_lshlrev_b32_e32 v16, 16, v17
-; GFX9-NEXT:    v_lshlrev_b32_e32 v31, 16, v1
+; GFX9-NEXT:    v_perm_b32 v8, v8, v25, s4
+; GFX9-NEXT:    v_perm_b32 v9, v9, v26, s4
+; GFX9-NEXT:    v_perm_b32 v10, v10, v27, s4
+; GFX9-NEXT:    v_perm_b32 v11, v11, v28, s4
+; GFX9-NEXT:    v_perm_b32 v12, v12, v29, s4
+; GFX9-NEXT:    v_perm_b32 v13, v13, v30, s4
+; GFX9-NEXT:    v_perm_b32 v14, v14, v31, s4
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_lshlrev_b32_e32 v33, 16, v24
+; GFX9-NEXT:    v_and_b32_e32 v24, 0xffff0000, v24
+; GFX9-NEXT:    v_mul_f32_e32 v32, v32, v33
+; GFX9-NEXT:    v_mul_f32_e32 v15, v15, v24
+; GFX9-NEXT:    v_lshlrev_b32_e32 v24, 16, v23
+; GFX9-NEXT:    v_lshlrev_b32_e32 v33, 16, v7
+; GFX9-NEXT:    v_and_b32_e32 v23, 0xffff0000, v23
+; GFX9-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
+; GFX9-NEXT:    v_mul_f32_e32 v24, v33, v24
+; GFX9-NEXT:    v_mul_f32_e32 v7, v7, v23
+; GFX9-NEXT:    v_lshlrev_b32_e32 v23, 16, v22
+; GFX9-NEXT:    v_lshlrev_b32_e32 v33, 16, v6
+; GFX9-NEXT:    v_and_b32_e32 v22, 0xffff0000, v22
+; GFX9-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
+; GFX9-NEXT:    v_mul_f32_e32 v23, v33, v23
+; GFX9-NEXT:    v_mul_f32_e32 v6, v6, v22
+; GFX9-NEXT:    v_lshlrev_b32_e32 v22, 16, v21
+; GFX9-NEXT:    v_lshlrev_b32_e32 v33, 16, v5
+; GFX9-NEXT:    v_and_b32_e32 v21, 0xffff0000, v21
+; GFX9-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
+; GFX9-NEXT:    v_mul_f32_e32 v22, v33, v22
+; GFX9-NEXT:    v_mul_f32_e32 v5, v5, v21
+; GFX9-NEXT:    v_lshlrev_b32_e32 v21, 16, v20
+; GFX9-NEXT:    v_lshlrev_b32_e32 v33, 16, v4
+; GFX9-NEXT:    v_and_b32_e32 v20, 0xffff0000, v20
+; GFX9-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GFX9-NEXT:    v_mul_f32_e32 v21, v33, v21
+; GFX9-NEXT:    v_mul_f32_e32 v4, v4, v20
+; GFX9-NEXT:    v_lshlrev_b32_e32 v20, 16, v19
+; GFX9-NEXT:    v_lshlrev_b32_e32 v33, 16, v3
+; GFX9-NEXT:    v_and_b32_e32 v19, 0xffff0000, v19
+; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX9-NEXT:    v_mul_f32_e32 v20, v33, v20
+; GFX9-NEXT:    v_mul_f32_e32 v3, v3, v19
+; GFX9-NEXT:    v_lshlrev_b32_e32 v19, 16, v18
+; GFX9-NEXT:    v_lshlrev_b32_e32 v33, 16, v2
+; GFX9-NEXT:    v_and_b32_e32 v18, 0xffff0000, v18
+; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX9-NEXT:    v_mul_f32_e32 v19, v33, v19
+; GFX9-NEXT:    v_mul_f32_e32 v2, v2, v18
+; GFX9-NEXT:    v_lshlrev_b32_e32 v18, 16, v17
+; GFX9-NEXT:    v_lshlrev_b32_e32 v33, 16, v1
 ; GFX9-NEXT:    v_and_b32_e32 v17, 0xffff0000, v17
 ; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX9-NEXT:    v_mul_f32_e32 v16, v31, v16
+; GFX9-NEXT:    v_mul_f32_e32 v18, v33, v18
 ; GFX9-NEXT:    v_mul_f32_e32 v1, v1, v17
-; GFX9-NEXT:    v_perm_b32 v1, v1, v16, s4
-; GFX9-NEXT:    v_lshlrev_b32_e32 v16, 16, v18
-; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v2
-; GFX9-NEXT:    v_mul_f32_e32 v16, v17, v16
-; GFX9-NEXT:    v_and_b32_e32 v17, 0xffff0000, v18
-; GFX9-NEXT:    buffer_load_dword v18, off, s[0:3], s32
-; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
-; GFX9-NEXT:    v_mul_f32_e32 v2, v2, v17
-; GFX9-NEXT:    v_perm_b32 v2, v2, v16, s4
-; GFX9-NEXT:    v_lshlrev_b32_e32 v16, 16, v19
-; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v3
-; GFX9-NEXT:    v_mul_f32_e32 v16, v17, v16
-; GFX9-NEXT:    v_and_b32_e32 v17, 0xffff0000, v19
-; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
-; GFX9-NEXT:    v_mul_f32_e32 v3, v3, v17
-; GFX9-NEXT:    v_perm_b32 v3, v3, v16, s4
-; GFX9-NEXT:    v_lshlrev_b32_e32 v16, 16, v20
-; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v4
-; GFX9-NEXT:    v_mul_f32_e32 v16, v17, v16
-; GFX9-NEXT:    v_and_b32_e32 v17, 0xffff0000, v20
-; GFX9-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
-; GFX9-NEXT:    v_mul_f32_e32 v4, v4, v17
-; GFX9-NEXT:    v_perm_b32 v4, v4, v16, s4
-; GFX9-NEXT:    v_lshlrev_b32_e32 v16, 16, v21
-; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v5
-; GFX9-NEXT:    v_mul_f32_e32 v16, v17, v16
-; GFX9-NEXT:    v_and_b32_e32 v17, 0xffff0000, v21
-; GFX9-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
-; GFX9-NEXT:    v_mul_f32_e32 v5, v5, v17
-; GFX9-NEXT:    v_perm_b32 v5, v5, v16, s4
-; GFX9-NEXT:    v_lshlrev_b32_e32 v16, 16, v22
-; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v6
-; GFX9-NEXT:    v_mul_f32_e32 v16, v17, v16
-; GFX9-NEXT:    v_and_b32_e32 v17, 0xffff0000, v22
-; GFX9-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
-; GFX9-NEXT:    v_mul_f32_e32 v6, v6, v17
-; GFX9-NEXT:    v_perm_b32 v6, v6, v16, s4
-; GFX9-NEXT:    v_lshlrev_b32_e32 v16, 16, v23
-; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v7
-; GFX9-NEXT:    v_mul_f32_e32 v16, v17, v16
-; GFX9-NEXT:    v_and_b32_e32 v17, 0xffff0000, v23
-; GFX9-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
-; GFX9-NEXT:    v_mul_f32_e32 v7, v7, v17
-; GFX9-NEXT:    v_perm_b32 v7, v7, v16, s4
-; GFX9-NEXT:    v_lshlrev_b32_e32 v16, 16, v24
-; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v8
-; GFX9-NEXT:    v_mul_f32_e32 v16, v17, v16
-; GFX9-NEXT:    v_and_b32_e32 v17, 0xffff0000, v24
-; GFX9-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
-; GFX9-NEXT:    v_mul_f32_e32 v8, v8, v17
-; GFX9-NEXT:    v_perm_b32 v8, v8, v16, s4
-; GFX9-NEXT:    v_lshlrev_b32_e32 v16, 16, v25
-; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v9
-; GFX9-NEXT:    v_mul_f32_e32 v16, v17, v16
-; GFX9-NEXT:    v_and_b32_e32 v17, 0xffff0000, v25
-; GFX9-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
-; GFX9-NEXT:    v_mul_f32_e32 v9, v9, v17
-; GFX9-NEXT:    v_perm_b32 v9, v9, v16, s4
-; GFX9-NEXT:    v_lshlrev_b32_e32 v16, 16, v26
-; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v10
-; GFX9-NEXT:    v_mul_f32_e32 v16, v17, v16
-; GFX9-NEXT:    v_and_b32_e32 v17, 0xffff0000, v26
-; GFX9-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
-; GFX9-NEXT:    v_mul_f32_e32 v10, v10, v17
-; GFX9-NEXT:    v_perm_b32 v10, v10, v16, s4
-; GFX9-NEXT:    v_lshlrev_b32_e32 v16, 16, v27
-; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v11
-; GFX9-NEXT:    v_mul_f32_e32 v16, v17, v16
-; GFX9-NEXT:    v_and_b32_e32 v17, 0xffff0000, v27
-; GFX9-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
-; GFX9-NEXT:    v_mul_f32_e32 v11, v11, v17
-; GFX9-NEXT:    v_perm_b32 v11, v11, v16, s4
-; GFX9-NEXT:    v_lshlrev_b32_e32 v16, 16, v28
-; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v12
-; GFX9-NEXT:    v_mul_f32_e32 v16, v17, v16
-; GFX9-NEXT:    v_and_b32_e32 v17, 0xffff0000, v28
-; GFX9-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
-; GFX9-NEXT:    v_mul_f32_e32 v12, v12, v17
-; GFX9-NEXT:    v_perm_b32 v12, v12, v16, s4
-; GFX9-NEXT:    v_lshlrev_b32_e32 v16, 16, v29
-; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v13
-; GFX9-NEXT:    v_mul_f32_e32 v16, v17, v16
-; GFX9-NEXT:    v_and_b32_e32 v17, 0xffff0000, v29
-; GFX9-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
-; GFX9-NEXT:    v_mul_f32_e32 v13, v13, v17
-; GFX9-NEXT:    v_perm_b32 v13, v13, v16, s4
-; GFX9-NEXT:    v_lshlrev_b32_e32 v16, 16, v30
-; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v14
-; GFX9-NEXT:    v_mul_f32_e32 v16, v17, v16
-; GFX9-NEXT:    v_and_b32_e32 v17, 0xffff0000, v30
-; GFX9-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
-; GFX9-NEXT:    v_mul_f32_e32 v14, v14, v17
-; GFX9-NEXT:    v_perm_b32 v14, v14, v16, s4
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_lshlrev_b32_e32 v16, 16, v18
-; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v15
-; GFX9-NEXT:    v_mul_f32_e32 v16, v17, v16
-; GFX9-NEXT:    v_and_b32_e32 v17, 0xffff0000, v18
-; GFX9-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
-; GFX9-NEXT:    v_mul_f32_e32 v15, v15, v17
-; GFX9-NEXT:    v_perm_b32 v15, v15, v16, s4
+; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v16
+; GFX9-NEXT:    v_lshlrev_b32_e32 v33, 16, v0
+; GFX9-NEXT:    v_and_b32_e32 v16, 0xffff0000, v16
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    v_mul_f32_e32 v17, v33, v17
+; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v16
+; GFX9-NEXT:    v_perm_b32 v0, v0, v17, s4
+; GFX9-NEXT:    v_perm_b32 v1, v1, v18, s4
+; GFX9-NEXT:    v_perm_b32 v2, v2, v19, s4
+; GFX9-NEXT:    v_perm_b32 v3, v3, v20, s4
+; GFX9-NEXT:    v_perm_b32 v4, v4, v21, s4
+; GFX9-NEXT:    v_perm_b32 v5, v5, v22, s4
+; GFX9-NEXT:    v_perm_b32 v6, v6, v23, s4
+; GFX9-NEXT:    v_perm_b32 v7, v7, v24, s4
+; GFX9-NEXT:    v_perm_b32 v15, v15, v32, s4
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fmul_v32bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    buffer_load_dword v31, off, s[0:3], s32
-; GFX10-NEXT:    v_lshlrev_b32_e32 v50, 16, v21
-; GFX10-NEXT:    v_lshlrev_b32_e32 v51, 16, v5
-; GFX10-NEXT:    v_and_b32_e32 v21, 0xffff0000, v21
-; GFX10-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
-; GFX10-NEXT:    v_lshlrev_b32_e32 v52, 16, v22
-; GFX10-NEXT:    v_lshlrev_b32_e32 v53, 16, v6
-; GFX10-NEXT:    v_and_b32_e32 v22, 0xffff0000, v22
-; GFX10-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
-; GFX10-NEXT:    v_lshlrev_b32_e32 v54, 16, v23
-; GFX10-NEXT:    v_lshlrev_b32_e32 v55, 16, v7
-; GFX10-NEXT:    v_and_b32_e32 v23, 0xffff0000, v23
-; GFX10-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
-; GFX10-NEXT:    v_lshlrev_b32_e32 v32, 16, v16
-; GFX10-NEXT:    v_lshlrev_b32_e32 v33, 16, v0
-; GFX10-NEXT:    v_and_b32_e32 v16, 0xffff0000, v16
-; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX10-NEXT:    v_lshlrev_b32_e32 v34, 16, v17
-; GFX10-NEXT:    v_lshlrev_b32_e32 v35, 16, v1
+; GFX10-NEXT:    v_lshlrev_b32_e32 v39, 16, v27
+; GFX10-NEXT:    v_lshlrev_b32_e32 v48, 16, v11
+; GFX10-NEXT:    v_and_b32_e32 v27, 0xffff0000, v27
+; GFX10-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
+; GFX10-NEXT:    v_lshlrev_b32_e32 v49, 16, v26
+; GFX10-NEXT:    v_lshlrev_b32_e32 v50, 16, v10
+; GFX10-NEXT:    v_and_b32_e32 v26, 0xffff0000, v26
+; GFX10-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
+; GFX10-NEXT:    v_lshlrev_b32_e32 v33, 16, v30
+; GFX10-NEXT:    v_lshlrev_b32_e32 v34, 16, v14
+; GFX10-NEXT:    v_and_b32_e32 v30, 0xffff0000, v30
+; GFX10-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
+; GFX10-NEXT:    v_lshlrev_b32_e32 v35, 16, v29
+; GFX10-NEXT:    v_lshlrev_b32_e32 v36, 16, v13
+; GFX10-NEXT:    v_and_b32_e32 v29, 0xffff0000, v29
+; GFX10-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
+; GFX10-NEXT:    v_lshlrev_b32_e32 v37, 16, v28
+; GFX10-NEXT:    v_lshlrev_b32_e32 v38, 16, v12
+; GFX10-NEXT:    v_and_b32_e32 v28, 0xffff0000, v28
+; GFX10-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
+; GFX10-NEXT:    v_mul_f32_e32 v39, v48, v39
+; GFX10-NEXT:    v_lshlrev_b32_e32 v48, 16, v17
+; GFX10-NEXT:    v_mul_f32_e32 v11, v11, v27
+; GFX10-NEXT:    v_lshlrev_b32_e32 v27, 16, v1
 ; GFX10-NEXT:    v_and_b32_e32 v17, 0xffff0000, v17
 ; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX10-NEXT:    v_lshlrev_b32_e32 v36, 16, v18
-; GFX10-NEXT:    v_lshlrev_b32_e32 v37, 16, v2
-; GFX10-NEXT:    v_and_b32_e32 v18, 0xffff0000, v18
-; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
-; GFX10-NEXT:    v_lshlrev_b32_e32 v38, 16, v19
-; GFX10-NEXT:    v_lshlrev_b32_e32 v39, 16, v3
-; GFX10-NEXT:    v_and_b32_e32 v19, 0xffff0000, v19
-; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
-; GFX10-NEXT:    v_lshlrev_b32_e32 v48, 16, v20
-; GFX10-NEXT:    v_lshlrev_b32_e32 v49, 16, v4
-; GFX10-NEXT:    v_and_b32_e32 v20, 0xffff0000, v20
-; GFX10-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
-; GFX10-NEXT:    v_mul_f32_e32 v5, v5, v21
-; GFX10-NEXT:    v_mul_f32_e32 v21, v53, v52
-; GFX10-NEXT:    v_mul_f32_e32 v6, v6, v22
-; GFX10-NEXT:    v_mul_f32_e32 v22, v55, v54
-; GFX10-NEXT:    v_mul_f32_e32 v7, v7, v23
-; GFX10-NEXT:    v_lshlrev_b32_e32 v64, 16, v24
-; GFX10-NEXT:    v_lshlrev_b32_e32 v65, 16, v8
-; GFX10-NEXT:    v_and_b32_e32 v24, 0xffff0000, v24
-; GFX10-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
-; GFX10-NEXT:    v_lshlrev_b32_e32 v66, 16, v25
-; GFX10-NEXT:    v_lshlrev_b32_e32 v67, 16, v9
+; GFX10-NEXT:    v_mul_f32_e32 v49, v50, v49
+; GFX10-NEXT:    v_lshlrev_b32_e32 v50, 16, v16
+; GFX10-NEXT:    v_mul_f32_e32 v10, v10, v26
+; GFX10-NEXT:    v_lshlrev_b32_e32 v26, 16, v0
+; GFX10-NEXT:    v_and_b32_e32 v16, 0xffff0000, v16
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v32, 16, v15
+; GFX10-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
+; GFX10-NEXT:    v_lshlrev_b32_e32 v51, 16, v25
+; GFX10-NEXT:    v_lshlrev_b32_e32 v52, 16, v9
 ; GFX10-NEXT:    v_and_b32_e32 v25, 0xffff0000, v25
 ; GFX10-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
-; GFX10-NEXT:    v_lshlrev_b32_e32 v68, 16, v26
-; GFX10-NEXT:    v_mul_f32_e32 v32, v33, v32
-; GFX10-NEXT:    v_lshlrev_b32_e32 v33, 16, v10
-; GFX10-NEXT:    v_and_b32_e32 v26, 0xffff0000, v26
-; GFX10-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
+; GFX10-NEXT:    v_lshlrev_b32_e32 v53, 16, v24
+; GFX10-NEXT:    v_lshlrev_b32_e32 v54, 16, v8
+; GFX10-NEXT:    v_and_b32_e32 v24, 0xffff0000, v24
+; GFX10-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
+; GFX10-NEXT:    v_lshlrev_b32_e32 v55, 16, v23
+; GFX10-NEXT:    v_lshlrev_b32_e32 v64, 16, v7
+; GFX10-NEXT:    v_and_b32_e32 v23, 0xffff0000, v23
+; GFX10-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
+; GFX10-NEXT:    v_lshlrev_b32_e32 v65, 16, v22
+; GFX10-NEXT:    v_lshlrev_b32_e32 v66, 16, v6
+; GFX10-NEXT:    v_and_b32_e32 v22, 0xffff0000, v22
+; GFX10-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
+; GFX10-NEXT:    v_lshlrev_b32_e32 v67, 16, v21
+; GFX10-NEXT:    v_lshlrev_b32_e32 v68, 16, v5
+; GFX10-NEXT:    v_and_b32_e32 v21, 0xffff0000, v21
+; GFX10-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
+; GFX10-NEXT:    v_mul_f32_e32 v33, v34, v33
+; GFX10-NEXT:    v_lshlrev_b32_e32 v34, 16, v20
+; GFX10-NEXT:    v_mul_f32_e32 v14, v14, v30
+; GFX10-NEXT:    v_lshlrev_b32_e32 v30, 16, v4
+; GFX10-NEXT:    v_and_b32_e32 v20, 0xffff0000, v20
+; GFX10-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GFX10-NEXT:    v_mul_f32_e32 v35, v36, v35
+; GFX10-NEXT:    v_lshlrev_b32_e32 v36, 16, v19
+; GFX10-NEXT:    v_mul_f32_e32 v13, v13, v29
+; GFX10-NEXT:    v_lshlrev_b32_e32 v29, 16, v3
+; GFX10-NEXT:    v_and_b32_e32 v19, 0xffff0000, v19
+; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX10-NEXT:    v_mul_f32_e32 v37, v38, v37
+; GFX10-NEXT:    v_lshlrev_b32_e32 v38, 16, v18
+; GFX10-NEXT:    v_mul_f32_e32 v12, v12, v28
+; GFX10-NEXT:    v_lshlrev_b32_e32 v28, 16, v2
+; GFX10-NEXT:    v_and_b32_e32 v18, 0xffff0000, v18
+; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX10-NEXT:    v_mul_f32_e32 v0, v0, v16
-; GFX10-NEXT:    v_lshlrev_b32_e32 v16, 16, v27
-; GFX10-NEXT:    v_mul_f32_e32 v34, v35, v34
-; GFX10-NEXT:    v_lshlrev_b32_e32 v35, 16, v11
-; GFX10-NEXT:    v_and_b32_e32 v27, 0xffff0000, v27
-; GFX10-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
 ; GFX10-NEXT:    v_mul_f32_e32 v1, v1, v17
-; GFX10-NEXT:    v_lshlrev_b32_e32 v17, 16, v28
-; GFX10-NEXT:    v_mul_f32_e32 v36, v37, v36
-; GFX10-NEXT:    v_lshlrev_b32_e32 v37, 16, v12
-; GFX10-NEXT:    v_and_b32_e32 v28, 0xffff0000, v28
-; GFX10-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
+; GFX10-NEXT:    v_mul_f32_e32 v51, v52, v51
+; GFX10-NEXT:    v_mul_f32_e32 v9, v9, v25
+; GFX10-NEXT:    v_mul_f32_e32 v25, v54, v53
+; GFX10-NEXT:    v_mul_f32_e32 v8, v8, v24
+; GFX10-NEXT:    v_mul_f32_e32 v24, v64, v55
+; GFX10-NEXT:    v_mul_f32_e32 v7, v7, v23
+; GFX10-NEXT:    v_mul_f32_e32 v23, v66, v65
+; GFX10-NEXT:    v_mul_f32_e32 v6, v6, v22
+; GFX10-NEXT:    v_mul_f32_e32 v22, v68, v67
+; GFX10-NEXT:    v_mul_f32_e32 v5, v5, v21
+; GFX10-NEXT:    v_mul_f32_e32 v21, v30, v34
+; GFX10-NEXT:    v_mul_f32_e32 v29, v29, v36
+; GFX10-NEXT:    v_mul_f32_e32 v28, v28, v38
+; GFX10-NEXT:    v_mul_f32_e32 v27, v27, v48
+; GFX10-NEXT:    v_mul_f32_e32 v26, v26, v50
 ; GFX10-NEXT:    v_mul_f32_e32 v2, v2, v18
-; GFX10-NEXT:    v_lshlrev_b32_e32 v18, 16, v29
-; GFX10-NEXT:    v_mul_f32_e32 v38, v39, v38
-; GFX10-NEXT:    v_lshlrev_b32_e32 v39, 16, v13
-; GFX10-NEXT:    v_and_b32_e32 v29, 0xffff0000, v29
-; GFX10-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
 ; GFX10-NEXT:    v_mul_f32_e32 v3, v3, v19
-; GFX10-NEXT:    v_lshlrev_b32_e32 v19, 16, v30
-; GFX10-NEXT:    v_mul_f32_e32 v48, v49, v48
-; GFX10-NEXT:    v_lshlrev_b32_e32 v49, 16, v14
-; GFX10-NEXT:    v_and_b32_e32 v30, 0xffff0000, v30
-; GFX10-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
 ; GFX10-NEXT:    v_mul_f32_e32 v4, v4, v20
-; GFX10-NEXT:    v_lshlrev_b32_e32 v20, 16, v15
-; GFX10-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
-; GFX10-NEXT:    v_perm_b32 v6, v6, v21, 0x7060302
-; GFX10-NEXT:    v_perm_b32 v7, v7, v22, 0x7060302
-; GFX10-NEXT:    v_mul_f32_e32 v50, v51, v50
-; GFX10-NEXT:    v_mul_f32_e32 v23, v65, v64
-; GFX10-NEXT:    v_mul_f32_e32 v8, v8, v24
-; GFX10-NEXT:    v_mul_f32_e32 v24, v67, v66
-; GFX10-NEXT:    v_mul_f32_e32 v9, v9, v25
-; GFX10-NEXT:    v_mul_f32_e32 v25, v33, v68
-; GFX10-NEXT:    v_mul_f32_e32 v10, v10, v26
-; GFX10-NEXT:    v_mul_f32_e32 v16, v35, v16
-; GFX10-NEXT:    v_mul_f32_e32 v11, v11, v27
-; GFX10-NEXT:    v_mul_f32_e32 v17, v37, v17
-; GFX10-NEXT:    v_mul_f32_e32 v12, v12, v28
-; GFX10-NEXT:    v_mul_f32_e32 v18, v39, v18
-; GFX10-NEXT:    v_mul_f32_e32 v13, v13, v29
-; GFX10-NEXT:    v_mul_f32_e32 v19, v49, v19
-; GFX10-NEXT:    v_mul_f32_e32 v14, v14, v30
-; GFX10-NEXT:    v_perm_b32 v0, v0, v32, 0x7060302
-; GFX10-NEXT:    v_perm_b32 v1, v1, v34, 0x7060302
-; GFX10-NEXT:    v_perm_b32 v2, v2, v36, 0x7060302
-; GFX10-NEXT:    v_perm_b32 v3, v3, v38, 0x7060302
-; GFX10-NEXT:    v_perm_b32 v4, v4, v48, 0x7060302
-; GFX10-NEXT:    v_perm_b32 v5, v5, v50, 0x7060302
-; GFX10-NEXT:    v_perm_b32 v8, v8, v23, 0x7060302
-; GFX10-NEXT:    v_perm_b32 v9, v9, v24, 0x7060302
-; GFX10-NEXT:    v_perm_b32 v10, v10, v25, 0x7060302
-; GFX10-NEXT:    v_perm_b32 v11, v11, v16, 0x7060302
-; GFX10-NEXT:    v_perm_b32 v12, v12, v17, 0x7060302
-; GFX10-NEXT:    v_perm_b32 v13, v13, v18, 0x7060302
-; GFX10-NEXT:    v_perm_b32 v14, v14, v19, 0x7060302
+; GFX10-NEXT:    v_perm_b32 v1, v1, v27, 0x7060302
+; GFX10-NEXT:    v_perm_b32 v0, v0, v26, 0x7060302
+; GFX10-NEXT:    v_perm_b32 v2, v2, v28, 0x7060302
+; GFX10-NEXT:    v_perm_b32 v3, v3, v29, 0x7060302
+; GFX10-NEXT:    v_perm_b32 v4, v4, v21, 0x7060302
+; GFX10-NEXT:    v_perm_b32 v5, v5, v22, 0x7060302
+; GFX10-NEXT:    v_perm_b32 v6, v6, v23, 0x7060302
+; GFX10-NEXT:    v_perm_b32 v7, v7, v24, 0x7060302
+; GFX10-NEXT:    v_perm_b32 v8, v8, v25, 0x7060302
+; GFX10-NEXT:    v_perm_b32 v9, v9, v51, 0x7060302
+; GFX10-NEXT:    v_perm_b32 v10, v10, v49, 0x7060302
+; GFX10-NEXT:    v_perm_b32 v11, v11, v39, 0x7060302
+; GFX10-NEXT:    v_perm_b32 v12, v12, v37, 0x7060302
+; GFX10-NEXT:    v_perm_b32 v13, v13, v35, 0x7060302
+; GFX10-NEXT:    v_perm_b32 v14, v14, v33, 0x7060302
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_lshlrev_b32_e32 v21, 16, v31
-; GFX10-NEXT:    v_and_b32_e32 v22, 0xffff0000, v31
-; GFX10-NEXT:    v_mul_f32_e32 v20, v20, v21
-; GFX10-NEXT:    v_mul_f32_e32 v15, v15, v22
-; GFX10-NEXT:    v_perm_b32 v15, v15, v20, 0x7060302
+; GFX10-NEXT:    v_lshlrev_b32_e32 v16, 16, v31
+; GFX10-NEXT:    v_and_b32_e32 v17, 0xffff0000, v31
+; GFX10-NEXT:    v_mul_f32_e32 v16, v32, v16
+; GFX10-NEXT:    v_mul_f32_e32 v15, v15, v17
+; GFX10-NEXT:    v_perm_b32 v15, v15, v16, 0x7060302
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fmul_v32bf16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    scratch_load_b32 v31, off, s32
-; GFX11-NEXT:    v_lshlrev_b32_e32 v68, 16, v26
-; GFX11-NEXT:    v_lshlrev_b32_e32 v69, 16, v10
-; GFX11-NEXT:    v_and_b32_e32 v26, 0xffff0000, v26
-; GFX11-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
-; GFX11-NEXT:    v_lshlrev_b32_e32 v70, 16, v27
-; GFX11-NEXT:    v_lshlrev_b32_e32 v71, 16, v11
-; GFX11-NEXT:    v_lshlrev_b32_e32 v50, 16, v21
-; GFX11-NEXT:    v_lshlrev_b32_e32 v54, 16, v23
-; GFX11-NEXT:    v_lshlrev_b32_e32 v55, 16, v7
-; GFX11-NEXT:    v_lshlrev_b32_e32 v64, 16, v24
-; GFX11-NEXT:    v_lshlrev_b32_e32 v65, 16, v8
-; GFX11-NEXT:    v_and_b32_e32 v24, 0xffff0000, v24
-; GFX11-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
-; GFX11-NEXT:    v_lshlrev_b32_e32 v51, 16, v5
-; GFX11-NEXT:    v_dual_mul_f32 v10, v10, v26 :: v_dual_and_b32 v5, 0xffff0000, v5
-; GFX11-NEXT:    v_lshlrev_b32_e32 v66, 16, v25
+; GFX11-NEXT:    v_lshlrev_b32_e32 v83, 16, v17
+; GFX11-NEXT:    v_lshlrev_b32_e32 v84, 16, v1
+; GFX11-NEXT:    v_and_b32_e32 v17, 0xffff0000, v17
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX11-NEXT:    v_lshlrev_b32_e32 v85, 16, v16
+; GFX11-NEXT:    v_lshlrev_b32_e32 v86, 16, v0
+; GFX11-NEXT:    v_and_b32_e32 v16, 0xffff0000, v16
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v54, 16, v8
+; GFX11-NEXT:    v_lshlrev_b32_e32 v64, 16, v7
+; GFX11-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
+; GFX11-NEXT:    v_lshlrev_b32_e32 v65, 16, v22
+; GFX11-NEXT:    v_lshlrev_b32_e32 v66, 16, v6
+; GFX11-NEXT:    v_lshlrev_b32_e32 v48, 16, v11
+; GFX11-NEXT:    v_dual_mul_f32 v0, v0, v16 :: v_dual_and_b32 v11, 0xffff0000, v11
+; GFX11-NEXT:    v_and_b32_e32 v22, 0xffff0000, v22
+; GFX11-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
+; GFX11-NEXT:    v_lshlrev_b32_e32 v67, 16, v21
+; GFX11-NEXT:    v_lshlrev_b32_e32 v68, 16, v5
+; GFX11-NEXT:    v_lshlrev_b32_e32 v51, 16, v25
+; GFX11-NEXT:    v_and_b32_e32 v21, 0xffff0000, v21
+; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
+; GFX11-NEXT:    v_lshlrev_b32_e32 v69, 16, v20
+; GFX11-NEXT:    v_lshlrev_b32_e32 v70, 16, v4
+; GFX11-NEXT:    v_and_b32_e32 v20, 0xffff0000, v20
+; GFX11-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GFX11-NEXT:    v_lshlrev_b32_e32 v55, 16, v23
+; GFX11-NEXT:    v_lshlrev_b32_e32 v71, 16, v19
+; GFX11-NEXT:    v_lshlrev_b32_e32 v80, 16, v3
 ; GFX11-NEXT:    v_and_b32_e32 v25, 0xffff0000, v25
-; GFX11-NEXT:    v_and_b32_e32 v27, 0xffff0000, v27
-; GFX11-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
-; GFX11-NEXT:    v_lshlrev_b32_e32 v80, 16, v28
-; GFX11-NEXT:    v_lshlrev_b32_e32 v81, 16, v12
-; GFX11-NEXT:    v_lshlrev_b32_e32 v52, 16, v22
+; GFX11-NEXT:    v_and_b32_e32 v19, 0xffff0000, v19
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX11-NEXT:    v_lshlrev_b32_e32 v52, 16, v9
+; GFX11-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
+; GFX11-NEXT:    v_lshlrev_b32_e32 v81, 16, v18
+; GFX11-NEXT:    v_lshlrev_b32_e32 v82, 16, v2
+; GFX11-NEXT:    v_and_b32_e32 v18, 0xffff0000, v18
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX11-NEXT:    v_lshlrev_b32_e32 v53, 16, v24
+; GFX11-NEXT:    v_dual_mul_f32 v1, v1, v17 :: v_dual_and_b32 v24, 0xffff0000, v24
+; GFX11-NEXT:    v_dual_mul_f32 v5, v5, v21 :: v_dual_lshlrev_b32 v50, 16, v10
+; GFX11-NEXT:    v_dual_mul_f32 v21, v70, v69 :: v_dual_and_b32 v10, 0xffff0000, v10
+; GFX11-NEXT:    v_dual_mul_f32 v2, v2, v18 :: v_dual_mul_f32 v3, v3, v19
+; GFX11-NEXT:    v_dual_mul_f32 v4, v4, v20 :: v_dual_lshlrev_b32 v49, 16, v26
+; GFX11-NEXT:    v_dual_mul_f32 v9, v9, v25 :: v_dual_and_b32 v26, 0xffff0000, v26
+; GFX11-NEXT:    v_mul_f32_e32 v6, v6, v22
+; GFX11-NEXT:    v_dual_mul_f32 v22, v68, v67 :: v_dual_lshlrev_b32 v37, 16, v28
 ; GFX11-NEXT:    v_and_b32_e32 v28, 0xffff0000, v28
-; GFX11-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
-; GFX11-NEXT:    v_lshlrev_b32_e32 v53, 16, v6
-; GFX11-NEXT:    v_lshlrev_b32_e32 v82, 16, v29
-; GFX11-NEXT:    v_lshlrev_b32_e32 v83, 16, v13
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_mul_f32_e32 v10, v10, v26
+; GFX11-NEXT:    v_mul_f32_e32 v26, v52, v51
+; GFX11-NEXT:    v_perm_b32 v4, v4, v21, 0x7060302
+; GFX11-NEXT:    v_mul_f32_e32 v25, v54, v53
+; GFX11-NEXT:    v_perm_b32 v5, v5, v22, 0x7060302
+; GFX11-NEXT:    v_perm_b32 v9, v9, v26, 0x7060302
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v16, 16, v31
 ; GFX11-NEXT:    v_and_b32_e32 v23, 0xffff0000, v23
-; GFX11-NEXT:    v_and_b32_e32 v29, 0xffff0000, v29
+; GFX11-NEXT:    v_and_b32_e32 v17, 0xffff0000, v31
+; GFX11-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
+; GFX11-NEXT:    v_lshlrev_b32_e32 v36, 16, v13
 ; GFX11-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
-; GFX11-NEXT:    v_lshlrev_b32_e32 v84, 16, v30
-; GFX11-NEXT:    v_lshlrev_b32_e32 v85, 16, v14
-; GFX11-NEXT:    v_and_b32_e32 v22, 0xffff0000, v22
-; GFX11-NEXT:    v_and_b32_e32 v30, 0xffff0000, v30
-; GFX11-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
-; GFX11-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
-; GFX11-NEXT:    v_lshlrev_b32_e32 v86, 16, v15
-; GFX11-NEXT:    v_lshlrev_b32_e32 v67, 16, v9
-; GFX11-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
-; GFX11-NEXT:    v_lshlrev_b32_e32 v48, 16, v20
-; GFX11-NEXT:    v_dual_mul_f32 v11, v11, v27 :: v_dual_and_b32 v20, 0xffff0000, v20
-; GFX11-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
-; GFX11-NEXT:    v_dual_mul_f32 v26, v71, v70 :: v_dual_lshlrev_b32 v49, 16, v4
-; GFX11-NEXT:    v_dual_mul_f32 v13, v13, v29 :: v_dual_and_b32 v4, 0xffff0000, v4
-; GFX11-NEXT:    v_lshlrev_b32_e32 v35, 16, v1
-; GFX11-NEXT:    v_lshlrev_b32_e32 v37, 16, v2
-; GFX11-NEXT:    v_lshlrev_b32_e32 v38, 16, v19
+; GFX11-NEXT:    v_lshlrev_b32_e32 v39, 16, v27
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT:    v_mul_f32_e32 v4, v4, v20
-; GFX11-NEXT:    v_dual_mul_f32 v8, v8, v24 :: v_dual_mul_f32 v9, v9, v25
-; GFX11-NEXT:    v_mul_f32_e32 v25, v69, v68
-; GFX11-NEXT:    v_dual_mul_f32 v20, v51, v50 :: v_dual_lshlrev_b32 v39, 16, v3
-; GFX11-NEXT:    v_mul_f32_e32 v27, v81, v80
-; GFX11-NEXT:    v_mul_f32_e32 v12, v12, v28
-; GFX11-NEXT:    v_dual_mul_f32 v28, v83, v82 :: v_dual_mul_f32 v29, v85, v84
-; GFX11-NEXT:    v_dual_mul_f32 v6, v6, v22 :: v_dual_and_b32 v3, 0xffff0000, v3
-; GFX11-NEXT:    v_mul_f32_e32 v22, v55, v54
-; GFX11-NEXT:    v_lshlrev_b32_e32 v36, 16, v18
-; GFX11-NEXT:    v_lshlrev_b32_e32 v34, 16, v17
-; GFX11-NEXT:    v_and_b32_e32 v17, 0xffff0000, v17
-; GFX11-NEXT:    v_and_b32_e32 v18, 0xffff0000, v18
+; GFX11-NEXT:    v_dual_mul_f32 v8, v8, v24 :: v_dual_and_b32 v27, 0xffff0000, v27
+; GFX11-NEXT:    v_mul_f32_e32 v24, v64, v55
+; GFX11-NEXT:    v_lshlrev_b32_e32 v38, 16, v12
+; GFX11-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
+; GFX11-NEXT:    v_lshlrev_b32_e32 v35, 16, v29
+; GFX11-NEXT:    v_mul_f32_e32 v7, v7, v23
+; GFX11-NEXT:    v_mul_f32_e32 v23, v66, v65
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_dual_mul_f32 v12, v12, v28 :: v_dual_and_b32 v29, 0xffff0000, v29
+; GFX11-NEXT:    v_dual_mul_f32 v28, v48, v39 :: v_dual_lshlrev_b32 v33, 16, v30
+; GFX11-NEXT:    v_dual_mul_f32 v13, v13, v29 :: v_dual_lshlrev_b32 v34, 16, v14
+; GFX11-NEXT:    v_lshlrev_b32_e32 v32, 16, v15
+; GFX11-NEXT:    v_dual_mul_f32 v11, v11, v27 :: v_dual_and_b32 v14, 0xffff0000, v14
+; GFX11-NEXT:    v_dual_mul_f32 v27, v50, v49 :: v_dual_and_b32 v30, 0xffff0000, v30
+; GFX11-NEXT:    v_mul_f32_e32 v29, v38, v37
+; GFX11-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
+; GFX11-NEXT:    v_mul_f32_e32 v37, v86, v85
+; GFX11-NEXT:    v_perm_b32 v6, v6, v23, 0x7060302
 ; GFX11-NEXT:    v_mul_f32_e32 v14, v14, v30
-; GFX11-NEXT:    v_dual_mul_f32 v7, v7, v23 :: v_dual_and_b32 v2, 0xffff0000, v2
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX11-NEXT:    v_mul_f32_e32 v23, v65, v64
-; GFX11-NEXT:    v_and_b32_e32 v19, 0xffff0000, v19
-; GFX11-NEXT:    v_dual_mul_f32 v24, v67, v66 :: v_dual_and_b32 v21, 0xffff0000, v21
-; GFX11-NEXT:    v_mul_f32_e32 v2, v2, v18
-; GFX11-NEXT:    v_dual_mul_f32 v1, v1, v17 :: v_dual_lshlrev_b32 v32, 16, v16
-; GFX11-NEXT:    v_mul_f32_e32 v18, v39, v38
-; GFX11-NEXT:    v_dual_mul_f32 v3, v3, v19 :: v_dual_and_b32 v16, 0xffff0000, v16
-; GFX11-NEXT:    v_mul_f32_e32 v19, v49, v48
-; GFX11-NEXT:    v_mul_f32_e32 v17, v37, v36
-; GFX11-NEXT:    v_lshlrev_b32_e32 v33, 16, v0
-; GFX11-NEXT:    v_dual_mul_f32 v5, v5, v21 :: v_dual_and_b32 v0, 0xffff0000, v0
-; GFX11-NEXT:    v_mul_f32_e32 v21, v53, v52
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_perm_b32 v2, v2, v17, 0x7060302
-; GFX11-NEXT:    v_perm_b32 v3, v3, v18, 0x7060302
-; GFX11-NEXT:    v_mul_f32_e32 v0, v0, v16
-; GFX11-NEXT:    v_mul_f32_e32 v16, v35, v34
-; GFX11-NEXT:    v_mul_f32_e32 v32, v33, v32
-; GFX11-NEXT:    v_perm_b32 v4, v4, v19, 0x7060302
-; GFX11-NEXT:    v_perm_b32 v5, v5, v20, 0x7060302
-; GFX11-NEXT:    v_perm_b32 v6, v6, v21, 0x7060302
-; GFX11-NEXT:    v_perm_b32 v1, v1, v16, 0x7060302
-; GFX11-NEXT:    v_perm_b32 v0, v0, v32, 0x7060302
-; GFX11-NEXT:    v_perm_b32 v7, v7, v22, 0x7060302
-; GFX11-NEXT:    v_perm_b32 v8, v8, v23, 0x7060302
-; GFX11-NEXT:    v_perm_b32 v9, v9, v24, 0x7060302
-; GFX11-NEXT:    v_perm_b32 v10, v10, v25, 0x7060302
-; GFX11-NEXT:    v_perm_b32 v11, v11, v26, 0x7060302
-; GFX11-NEXT:    v_perm_b32 v12, v12, v27, 0x7060302
-; GFX11-NEXT:    v_perm_b32 v13, v13, v28, 0x7060302
-; GFX11-NEXT:    v_perm_b32 v14, v14, v29, 0x7060302
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_lshlrev_b32_e32 v16, 16, v31
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_dual_mul_f32 v16, v86, v16 :: v_dual_and_b32 v17, 0xffff0000, v31
-; GFX11-NEXT:    v_mul_f32_e32 v15, v15, v17
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_dual_mul_f32 v30, v36, v35 :: v_dual_mul_f32 v33, v34, v33
+; GFX11-NEXT:    v_dual_mul_f32 v34, v80, v71 :: v_dual_mul_f32 v35, v82, v81
+; GFX11-NEXT:    v_mul_f32_e32 v36, v84, v83
+; GFX11-NEXT:    v_dual_mul_f32 v16, v32, v16 :: v_dual_mul_f32 v15, v15, v17
+; GFX11-NEXT:    v_perm_b32 v0, v0, v37, 0x7060302
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_perm_b32 v2, v2, v35, 0x7060302
+; GFX11-NEXT:    v_perm_b32 v1, v1, v36, 0x7060302
+; GFX11-NEXT:    v_perm_b32 v3, v3, v34, 0x7060302
+; GFX11-NEXT:    v_perm_b32 v7, v7, v24, 0x7060302
+; GFX11-NEXT:    v_perm_b32 v8, v8, v25, 0x7060302
+; GFX11-NEXT:    v_perm_b32 v10, v10, v27, 0x7060302
+; GFX11-NEXT:    v_perm_b32 v11, v11, v28, 0x7060302
+; GFX11-NEXT:    v_perm_b32 v12, v12, v29, 0x7060302
+; GFX11-NEXT:    v_perm_b32 v13, v13, v30, 0x7060302
+; GFX11-NEXT:    v_perm_b32 v14, v14, v33, 0x7060302
 ; GFX11-NEXT:    v_perm_b32 v15, v15, v16, 0x7060302
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = fmul <32 x bfloat> %a, %b
@@ -13396,83 +13280,52 @@ define <3 x bfloat> @v_minnum_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) {
 ; GFX8-LABEL: v_minnum_v3bf16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
-; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-NEXT:    v_min_f32_e32 v1, v1, v3
+; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
+; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v0
 ; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GFX8-NEXT:    v_min_f32_e32 v0, v0, v2
-; GFX8-NEXT:    v_min_f32_e32 v4, v5, v4
+; GFX8-NEXT:    v_min_f32_e32 v3, v4, v3
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX8-NEXT:    v_alignbit_b32 v0, v0, v4, 16
-; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
-; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v1
-; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
-; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX8-NEXT:    v_min_f32_e32 v1, v1, v3
-; GFX8-NEXT:    v_min_f32_e32 v2, v4, v2
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX8-NEXT:    v_alignbit_b32 v1, v1, v2, 16
+; GFX8-NEXT:    v_alignbit_b32 v0, v0, v3, 16
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_minnum_v3bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
-; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
+; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX9-NEXT:    v_min_f32_e32 v1, v1, v3
+; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
+; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 16, v0
 ; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX9-NEXT:    v_min_f32_e32 v4, v5, v4
+; GFX9-NEXT:    v_min_f32_e32 v3, v4, v3
 ; GFX9-NEXT:    v_min_f32_e32 v0, v0, v2
 ; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
-; GFX9-NEXT:    v_perm_b32 v0, v0, v4, s4
-; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
-; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 16, v1
-; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
-; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX9-NEXT:    v_min_f32_e32 v2, v4, v2
-; GFX9-NEXT:    v_min_f32_e32 v1, v1, v3
-; GFX9-NEXT:    v_perm_b32 v1, v1, v2, s4
+; GFX9-NEXT:    v_perm_b32 v0, v0, v3, s4
+; GFX9-NEXT:    v_alignbit_b32 v1, s4, v1, 16
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_minnum_v3bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
 ; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 16, v3
-; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 16, v1
-; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
-; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX10-NEXT:    v_min_f32_e32 v4, v5, v4
 ; GFX10-NEXT:    v_min_f32_e32 v0, v0, v2
-; GFX10-NEXT:    v_min_f32_e32 v2, v7, v6
 ; GFX10-NEXT:    v_min_f32_e32 v1, v1, v3
 ; GFX10-NEXT:    v_perm_b32 v0, v0, v4, 0x7060302
-; GFX10-NEXT:    v_perm_b32 v1, v1, v2, 0x7060302
+; GFX10-NEXT:    v_alignbit_b32 v1, s4, v1, 16
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: v_minnum_v3bf16:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_lshlrev_b32_e32 v6, 16, v3
-; GFX11-NEXT:    v_lshlrev_b32_e32 v7, 16, v1
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
-; GFX11-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_dual_min_f32 v1, v1, v3 :: v_dual_and_b32 v0, 0xffff0000, v0
-; GFX11-NEXT:    v_min_f32_e32 v0, v0, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_min_f32_e32 v4, v5, v4
-; GFX11-NEXT:    v_min_f32_e32 v2, v7, v6
-; GFX11-NEXT:    v_perm_b32 v0, v0, v4, 0x7060302
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT:    v_perm_b32 v1, v1, v2, 0x7060302
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = call <3 x bfloat> @llvm.minnum.v3bf16(<3 x bfloat> %a, <3 x bfloat> %b)
   ret <3 x bfloat> %op
 }
@@ -13539,82 +13392,81 @@ define <4 x bfloat> @v_minnum_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
 ; GFX8-LABEL: v_minnum_v4bf16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
+; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
+; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX8-NEXT:    v_min_f32_e32 v4, v5, v4
+; GFX8-NEXT:    v_min_f32_e32 v1, v1, v3
+; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
 ; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GFX8-NEXT:    v_min_f32_e32 v0, v0, v2
-; GFX8-NEXT:    v_min_f32_e32 v4, v5, v4
-; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX8-NEXT:    v_alignbit_b32 v0, v0, v4, 16
-; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
-; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v1
-; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
-; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX8-NEXT:    v_min_f32_e32 v1, v1, v3
-; GFX8-NEXT:    v_min_f32_e32 v2, v4, v2
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX8-NEXT:    v_alignbit_b32 v1, v1, v2, 16
+; GFX8-NEXT:    v_min_f32_e32 v3, v5, v3
+; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX8-NEXT:    v_alignbit_b32 v0, v0, v3, 16
+; GFX8-NEXT:    v_alignbit_b32 v1, v1, v4, 16
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_minnum_v4bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
+; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
+; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX9-NEXT:    v_min_f32_e32 v4, v5, v4
+; GFX9-NEXT:    v_min_f32_e32 v1, v1, v3
+; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
 ; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX9-NEXT:    v_min_f32_e32 v4, v5, v4
+; GFX9-NEXT:    v_min_f32_e32 v3, v5, v3
 ; GFX9-NEXT:    v_min_f32_e32 v0, v0, v2
 ; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
-; GFX9-NEXT:    v_perm_b32 v0, v0, v4, s4
-; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
-; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 16, v1
-; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
-; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX9-NEXT:    v_min_f32_e32 v2, v4, v2
-; GFX9-NEXT:    v_min_f32_e32 v1, v1, v3
-; GFX9-NEXT:    v_perm_b32 v1, v1, v2, s4
+; GFX9-NEXT:    v_perm_b32 v0, v0, v3, s4
+; GFX9-NEXT:    v_perm_b32 v1, v1, v4, s4
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_minnum_v4bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
-; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
+; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
+; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 16, v2
+; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 16, v0
 ; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 16, v3
-; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 16, v1
-; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
 ; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX10-NEXT:    v_min_f32_e32 v4, v5, v4
+; GFX10-NEXT:    v_min_f32_e32 v5, v7, v6
 ; GFX10-NEXT:    v_min_f32_e32 v0, v0, v2
-; GFX10-NEXT:    v_min_f32_e32 v2, v7, v6
 ; GFX10-NEXT:    v_min_f32_e32 v1, v1, v3
-; GFX10-NEXT:    v_perm_b32 v0, v0, v4, 0x7060302
-; GFX10-NEXT:    v_perm_b32 v1, v1, v2, 0x7060302
+; GFX10-NEXT:    v_perm_b32 v0, v0, v5, 0x7060302
+; GFX10-NEXT:    v_perm_b32 v1, v1, v4, 0x7060302
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_minnum_v4bf16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_lshlrev_b32_e32 v6, 16, v3
-; GFX11-NEXT:    v_lshlrev_b32_e32 v7, 16, v1
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX11-NEXT:    v_lshlrev_b32_e32 v6, 16, v2
+; GFX11-NEXT:    v_lshlrev_b32_e32 v7, 16, v0
 ; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
-; GFX11-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_dual_min_f32 v1, v1, v3 :: v_dual_and_b32 v0, 0xffff0000, v0
-; GFX11-NEXT:    v_min_f32_e32 v0, v0, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_min_f32_e32 v4, v5, v4
-; GFX11-NEXT:    v_min_f32_e32 v2, v7, v6
-; GFX11-NEXT:    v_perm_b32 v0, v0, v4, 0x7060302
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
+; GFX11-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_dual_min_f32 v0, v0, v2 :: v_dual_and_b32 v1, 0xffff0000, v1
+; GFX11-NEXT:    v_dual_min_f32 v4, v5, v4 :: v_dual_and_b32 v3, 0xffff0000, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_min_f32_e32 v1, v1, v3
+; GFX11-NEXT:    v_min_f32_e32 v5, v7, v6
+; GFX11-NEXT:    v_perm_b32 v1, v1, v4, 0x7060302
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT:    v_perm_b32 v1, v1, v2, 0x7060302
+; GFX11-NEXT:    v_perm_b32 v0, v0, v5, 0x7060302
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = call <4 x bfloat> @llvm.minnum.v4bf16(<4 x bfloat> %a, <4 x bfloat> %b)
   ret <4 x bfloat> %op
@@ -13730,138 +13582,138 @@ define <8 x bfloat> @v_minnum_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
 ; GFX8-LABEL: v_minnum_v8bf16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_lshlrev_b32_e32 v8, 16, v4
-; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 16, v0
-; GFX8-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
-; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX8-NEXT:    v_min_f32_e32 v0, v0, v4
+; GFX8-NEXT:    v_lshlrev_b32_e32 v8, 16, v7
+; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 16, v3
+; GFX8-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
+; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
 ; GFX8-NEXT:    v_min_f32_e32 v8, v9, v8
-; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX8-NEXT:    v_alignbit_b32 v0, v0, v8, 16
-; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v5
-; GFX8-NEXT:    v_lshlrev_b32_e32 v8, 16, v1
+; GFX8-NEXT:    v_min_f32_e32 v3, v3, v7
+; GFX8-NEXT:    v_lshlrev_b32_e32 v7, 16, v6
+; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 16, v2
+; GFX8-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
+; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX8-NEXT:    v_min_f32_e32 v7, v9, v7
+; GFX8-NEXT:    v_min_f32_e32 v2, v2, v6
+; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v5
+; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 16, v1
 ; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
 ; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX8-NEXT:    v_min_f32_e32 v6, v9, v6
 ; GFX8-NEXT:    v_min_f32_e32 v1, v1, v5
-; GFX8-NEXT:    v_min_f32_e32 v4, v8, v4
-; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX8-NEXT:    v_alignbit_b32 v1, v1, v4, 16
-; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v6
-; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v2
-; GFX8-NEXT:    v_min_f32_e32 v4, v5, v4
-; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
-; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
-; GFX8-NEXT:    v_min_f32_e32 v2, v2, v5
-; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
-; GFX8-NEXT:    v_alignbit_b32 v2, v2, v4, 16
-; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v7
-; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v3
-; GFX8-NEXT:    v_min_f32_e32 v4, v5, v4
-; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v7
-; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
-; GFX8-NEXT:    v_min_f32_e32 v3, v3, v5
+; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v4
+; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 16, v0
+; GFX8-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    v_min_f32_e32 v0, v0, v4
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
-; GFX8-NEXT:    v_alignbit_b32 v3, v3, v4, 16
+; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX8-NEXT:    v_min_f32_e32 v5, v9, v5
+; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX8-NEXT:    v_alignbit_b32 v0, v0, v5, 16
+; GFX8-NEXT:    v_alignbit_b32 v1, v1, v6, 16
+; GFX8-NEXT:    v_alignbit_b32 v2, v2, v7, 16
+; GFX8-NEXT:    v_alignbit_b32 v3, v3, v8, 16
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_minnum_v8bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_lshlrev_b32_e32 v8, 16, v4
-; GFX9-NEXT:    v_lshlrev_b32_e32 v9, 16, v0
-; GFX9-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
-; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX9-NEXT:    v_min_f32_e32 v8, v9, v8
-; GFX9-NEXT:    v_min_f32_e32 v0, v0, v4
-; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
-; GFX9-NEXT:    v_perm_b32 v0, v0, v8, s4
-; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 16, v5
-; GFX9-NEXT:    v_lshlrev_b32_e32 v8, 16, v1
+; GFX9-NEXT:    v_lshlrev_b32_e32 v8, 16, v7
+; GFX9-NEXT:    v_lshlrev_b32_e32 v9, 16, v3
+; GFX9-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
+; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX9-NEXT:    v_min_f32_e32 v8, v9, v8
+; GFX9-NEXT:    v_min_f32_e32 v3, v3, v7
+; GFX9-NEXT:    v_lshlrev_b32_e32 v7, 16, v6
+; GFX9-NEXT:    v_lshlrev_b32_e32 v9, 16, v2
+; GFX9-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
+; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX9-NEXT:    v_min_f32_e32 v7, v9, v7
+; GFX9-NEXT:    v_min_f32_e32 v2, v2, v6
+; GFX9-NEXT:    v_lshlrev_b32_e32 v6, 16, v5
+; GFX9-NEXT:    v_lshlrev_b32_e32 v9, 16, v1
 ; GFX9-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
 ; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX9-NEXT:    v_min_f32_e32 v4, v8, v4
+; GFX9-NEXT:    v_min_f32_e32 v6, v9, v6
 ; GFX9-NEXT:    v_min_f32_e32 v1, v1, v5
-; GFX9-NEXT:    v_perm_b32 v1, v1, v4, s4
-; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 16, v6
-; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 16, v2
-; GFX9-NEXT:    v_min_f32_e32 v4, v5, v4
-; GFX9-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
-; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
-; GFX9-NEXT:    v_min_f32_e32 v2, v2, v5
-; GFX9-NEXT:    v_perm_b32 v2, v2, v4, s4
-; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 16, v7
-; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 16, v3
-; GFX9-NEXT:    v_min_f32_e32 v4, v5, v4
-; GFX9-NEXT:    v_and_b32_e32 v5, 0xffff0000, v7
-; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
-; GFX9-NEXT:    v_min_f32_e32 v3, v3, v5
-; GFX9-NEXT:    v_perm_b32 v3, v3, v4, s4
+; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 16, v4
+; GFX9-NEXT:    v_lshlrev_b32_e32 v9, 16, v0
+; GFX9-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    v_min_f32_e32 v5, v9, v5
+; GFX9-NEXT:    v_min_f32_e32 v0, v0, v4
+; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
+; GFX9-NEXT:    v_perm_b32 v0, v0, v5, s4
+; GFX9-NEXT:    v_perm_b32 v1, v1, v6, s4
+; GFX9-NEXT:    v_perm_b32 v2, v2, v7, s4
+; GFX9-NEXT:    v_perm_b32 v3, v3, v8, s4
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_minnum_v8bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_lshlrev_b32_e32 v8, 16, v4
-; GFX10-NEXT:    v_lshlrev_b32_e32 v9, 16, v0
-; GFX10-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
-; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v8, 16, v7
+; GFX10-NEXT:    v_lshlrev_b32_e32 v9, 16, v3
+; GFX10-NEXT:    v_lshlrev_b32_e32 v10, 16, v6
+; GFX10-NEXT:    v_lshlrev_b32_e32 v11, 16, v2
+; GFX10-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
+; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX10-NEXT:    v_min_f32_e32 v8, v9, v8
+; GFX10-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
+; GFX10-NEXT:    v_min_f32_e32 v9, v11, v10
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v10, 16, v5
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v11, 16, v1
-; GFX10-NEXT:    v_min_f32_e32 v8, v9, v8
 ; GFX10-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
-; GFX10-NEXT:    v_min_f32_e32 v0, v0, v4
+; GFX10-NEXT:    v_lshlrev_b32_e32 v12, 16, v4
+; GFX10-NEXT:    v_lshlrev_b32_e32 v13, 16, v0
+; GFX10-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX10-NEXT:    v_min_f32_e32 v4, v11, v10
-; GFX10-NEXT:    v_lshlrev_b32_e32 v9, 16, v6
-; GFX10-NEXT:    v_lshlrev_b32_e32 v10, 16, v2
-; GFX10-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
 ; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
-; GFX10-NEXT:    v_lshlrev_b32_e32 v11, 16, v7
-; GFX10-NEXT:    v_lshlrev_b32_e32 v12, 16, v3
-; GFX10-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
-; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX10-NEXT:    v_min_f32_e32 v10, v11, v10
+; GFX10-NEXT:    v_min_f32_e32 v11, v13, v12
+; GFX10-NEXT:    v_min_f32_e32 v0, v0, v4
 ; GFX10-NEXT:    v_min_f32_e32 v1, v1, v5
-; GFX10-NEXT:    v_min_f32_e32 v5, v10, v9
 ; GFX10-NEXT:    v_min_f32_e32 v2, v2, v6
-; GFX10-NEXT:    v_min_f32_e32 v6, v12, v11
 ; GFX10-NEXT:    v_min_f32_e32 v3, v3, v7
-; GFX10-NEXT:    v_perm_b32 v0, v0, v8, 0x7060302
-; GFX10-NEXT:    v_perm_b32 v1, v1, v4, 0x7060302
-; GFX10-NEXT:    v_perm_b32 v2, v2, v5, 0x7060302
-; GFX10-NEXT:    v_perm_b32 v3, v3, v6, 0x7060302
+; GFX10-NEXT:    v_perm_b32 v0, v0, v11, 0x7060302
+; GFX10-NEXT:    v_perm_b32 v1, v1, v10, 0x7060302
+; GFX10-NEXT:    v_perm_b32 v2, v2, v9, 0x7060302
+; GFX10-NEXT:    v_perm_b32 v3, v3, v8, 0x7060302
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_minnum_v8bf16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v9, 16, v3
+; GFX11-NEXT:    v_lshlrev_b32_e32 v11, 16, v2
+; GFX11-NEXT:    v_lshlrev_b32_e32 v12, 16, v4
+; GFX11-NEXT:    v_lshlrev_b32_e32 v13, 16, v0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v10, 16, v6
+; GFX11-NEXT:    v_lshlrev_b32_e32 v8, 16, v7
+; GFX11-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX11-NEXT:    v_dual_min_f32 v8, v9, v8 :: v_dual_min_f32 v9, v11, v10
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_min_f32_e32 v0, v0, v4
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v10, 16, v5
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v11, 16, v1
-; GFX11-NEXT:    v_lshlrev_b32_e32 v9, 16, v0
 ; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
-; GFX11-NEXT:    v_lshlrev_b32_e32 v12, 16, v3
 ; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX11-NEXT:    v_lshlrev_b32_e32 v8, 16, v4
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_dual_min_f32 v1, v1, v5 :: v_dual_and_b32 v0, 0xffff0000, v0
-; GFX11-NEXT:    v_min_f32_e32 v0, v0, v4
-; GFX11-NEXT:    v_min_f32_e32 v4, v11, v10
-; GFX11-NEXT:    v_dual_min_f32 v8, v9, v8 :: v_dual_lshlrev_b32 v9, 16, v6
-; GFX11-NEXT:    v_lshlrev_b32_e32 v10, 16, v2
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
-; GFX11-NEXT:    v_lshlrev_b32_e32 v11, 16, v7
-; GFX11-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
-; GFX11-NEXT:    v_min_f32_e32 v5, v10, v9
-; GFX11-NEXT:    v_perm_b32 v0, v0, v8, 0x7060302
-; GFX11-NEXT:    v_min_f32_e32 v2, v2, v6
-; GFX11-NEXT:    v_min_f32_e32 v6, v12, v11
+; GFX11-NEXT:    v_dual_min_f32 v1, v1, v5 :: v_dual_and_b32 v6, 0xffff0000, v6
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_dual_min_f32 v2, v2, v6 :: v_dual_and_b32 v3, 0xffff0000, v3
 ; GFX11-NEXT:    v_min_f32_e32 v3, v3, v7
-; GFX11-NEXT:    v_perm_b32 v1, v1, v4, 0x7060302
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_perm_b32 v2, v2, v5, 0x7060302
-; GFX11-NEXT:    v_perm_b32 v3, v3, v6, 0x7060302
+; GFX11-NEXT:    v_dual_min_f32 v10, v11, v10 :: v_dual_min_f32 v11, v13, v12
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_perm_b32 v2, v2, v9, 0x7060302
+; GFX11-NEXT:    v_perm_b32 v3, v3, v8, 0x7060302
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_perm_b32 v1, v1, v10, 0x7060302
+; GFX11-NEXT:    v_perm_b32 v0, v0, v11, 0x7060302
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = call <8 x bfloat> @llvm.minnum.v8bf16(<8 x bfloat> %a, <8 x bfloat> %b)
   ret <8 x bfloat> %op
@@ -14077,252 +13929,254 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
 ; GFX8-LABEL: v_minnum_v16bf16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_lshlrev_b32_e32 v16, 16, v8
-; GFX8-NEXT:    v_lshlrev_b32_e32 v17, 16, v0
-; GFX8-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
-; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX8-NEXT:    v_min_f32_e32 v0, v0, v8
+; GFX8-NEXT:    v_lshlrev_b32_e32 v16, 16, v15
+; GFX8-NEXT:    v_lshlrev_b32_e32 v17, 16, v7
+; GFX8-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
+; GFX8-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
 ; GFX8-NEXT:    v_min_f32_e32 v16, v17, v16
-; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX8-NEXT:    v_alignbit_b32 v0, v0, v16, 16
-; GFX8-NEXT:    v_lshlrev_b32_e32 v8, 16, v9
-; GFX8-NEXT:    v_lshlrev_b32_e32 v16, 16, v1
+; GFX8-NEXT:    v_min_f32_e32 v7, v7, v15
+; GFX8-NEXT:    v_lshlrev_b32_e32 v15, 16, v14
+; GFX8-NEXT:    v_lshlrev_b32_e32 v17, 16, v6
+; GFX8-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
+; GFX8-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
+; GFX8-NEXT:    v_min_f32_e32 v15, v17, v15
+; GFX8-NEXT:    v_min_f32_e32 v6, v6, v14
+; GFX8-NEXT:    v_lshlrev_b32_e32 v14, 16, v13
+; GFX8-NEXT:    v_lshlrev_b32_e32 v17, 16, v5
+; GFX8-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
+; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
+; GFX8-NEXT:    v_min_f32_e32 v14, v17, v14
+; GFX8-NEXT:    v_min_f32_e32 v5, v5, v13
+; GFX8-NEXT:    v_lshlrev_b32_e32 v13, 16, v12
+; GFX8-NEXT:    v_lshlrev_b32_e32 v17, 16, v4
+; GFX8-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
+; GFX8-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GFX8-NEXT:    v_min_f32_e32 v13, v17, v13
+; GFX8-NEXT:    v_min_f32_e32 v4, v4, v12
+; GFX8-NEXT:    v_lshlrev_b32_e32 v12, 16, v11
+; GFX8-NEXT:    v_lshlrev_b32_e32 v17, 16, v3
+; GFX8-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
+; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX8-NEXT:    v_min_f32_e32 v12, v17, v12
+; GFX8-NEXT:    v_min_f32_e32 v3, v3, v11
+; GFX8-NEXT:    v_lshlrev_b32_e32 v11, 16, v10
+; GFX8-NEXT:    v_lshlrev_b32_e32 v17, 16, v2
+; GFX8-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
+; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX8-NEXT:    v_min_f32_e32 v11, v17, v11
+; GFX8-NEXT:    v_min_f32_e32 v2, v2, v10
+; GFX8-NEXT:    v_lshlrev_b32_e32 v10, 16, v9
+; GFX8-NEXT:    v_lshlrev_b32_e32 v17, 16, v1
 ; GFX8-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
 ; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX8-NEXT:    v_min_f32_e32 v10, v17, v10
 ; GFX8-NEXT:    v_min_f32_e32 v1, v1, v9
-; GFX8-NEXT:    v_min_f32_e32 v8, v16, v8
-; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX8-NEXT:    v_alignbit_b32 v1, v1, v8, 16
-; GFX8-NEXT:    v_lshlrev_b32_e32 v8, 16, v10
-; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 16, v2
-; GFX8-NEXT:    v_min_f32_e32 v8, v9, v8
-; GFX8-NEXT:    v_and_b32_e32 v9, 0xffff0000, v10
-; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
-; GFX8-NEXT:    v_min_f32_e32 v2, v2, v9
-; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
-; GFX8-NEXT:    v_alignbit_b32 v2, v2, v8, 16
-; GFX8-NEXT:    v_lshlrev_b32_e32 v8, 16, v11
-; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 16, v3
-; GFX8-NEXT:    v_min_f32_e32 v8, v9, v8
-; GFX8-NEXT:    v_and_b32_e32 v9, 0xffff0000, v11
-; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
-; GFX8-NEXT:    v_min_f32_e32 v3, v3, v9
-; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
-; GFX8-NEXT:    v_alignbit_b32 v3, v3, v8, 16
-; GFX8-NEXT:    v_lshlrev_b32_e32 v8, 16, v12
-; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 16, v4
-; GFX8-NEXT:    v_min_f32_e32 v8, v9, v8
-; GFX8-NEXT:    v_and_b32_e32 v9, 0xffff0000, v12
-; GFX8-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
-; GFX8-NEXT:    v_min_f32_e32 v4, v4, v9
-; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
-; GFX8-NEXT:    v_alignbit_b32 v4, v4, v8, 16
-; GFX8-NEXT:    v_lshlrev_b32_e32 v8, 16, v13
-; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 16, v5
-; GFX8-NEXT:    v_min_f32_e32 v8, v9, v8
-; GFX8-NEXT:    v_and_b32_e32 v9, 0xffff0000, v13
-; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
-; GFX8-NEXT:    v_min_f32_e32 v5, v5, v9
-; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
-; GFX8-NEXT:    v_alignbit_b32 v5, v5, v8, 16
-; GFX8-NEXT:    v_lshlrev_b32_e32 v8, 16, v14
-; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 16, v6
-; GFX8-NEXT:    v_min_f32_e32 v8, v9, v8
-; GFX8-NEXT:    v_and_b32_e32 v9, 0xffff0000, v14
-; GFX8-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
-; GFX8-NEXT:    v_min_f32_e32 v6, v6, v9
-; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
-; GFX8-NEXT:    v_alignbit_b32 v6, v6, v8, 16
-; GFX8-NEXT:    v_lshlrev_b32_e32 v8, 16, v15
-; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 16, v7
-; GFX8-NEXT:    v_min_f32_e32 v8, v9, v8
-; GFX8-NEXT:    v_and_b32_e32 v9, 0xffff0000, v15
-; GFX8-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
-; GFX8-NEXT:    v_min_f32_e32 v7, v7, v9
+; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 16, v8
+; GFX8-NEXT:    v_lshlrev_b32_e32 v17, 16, v0
+; GFX8-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
+; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    v_min_f32_e32 v0, v0, v8
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
-; GFX8-NEXT:    v_alignbit_b32 v7, v7, v8, 16
+; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
+; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
+; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
+; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX8-NEXT:    v_min_f32_e32 v9, v17, v9
+; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX8-NEXT:    v_alignbit_b32 v0, v0, v9, 16
+; GFX8-NEXT:    v_alignbit_b32 v1, v1, v10, 16
+; GFX8-NEXT:    v_alignbit_b32 v2, v2, v11, 16
+; GFX8-NEXT:    v_alignbit_b32 v3, v3, v12, 16
+; GFX8-NEXT:    v_alignbit_b32 v4, v4, v13, 16
+; GFX8-NEXT:    v_alignbit_b32 v5, v5, v14, 16
+; GFX8-NEXT:    v_alignbit_b32 v6, v6, v15, 16
+; GFX8-NEXT:    v_alignbit_b32 v7, v7, v16, 16
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_minnum_v16bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_lshlrev_b32_e32 v16, 16, v8
+; GFX9-NEXT:    v_lshlrev_b32_e32 v16, 16, v15
+; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v7
+; GFX9-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
+; GFX9-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
+; GFX9-NEXT:    v_min_f32_e32 v16, v17, v16
+; GFX9-NEXT:    v_min_f32_e32 v7, v7, v15
+; GFX9-NEXT:    v_lshlrev_b32_e32 v15, 16, v14
+; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v6
+; GFX9-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
+; GFX9-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
+; GFX9-NEXT:    v_min_f32_e32 v15, v17, v15
+; GFX9-NEXT:    v_min_f32_e32 v6, v6, v14
+; GFX9-NEXT:    v_lshlrev_b32_e32 v14, 16, v13
+; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v5
+; GFX9-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
+; GFX9-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
+; GFX9-NEXT:    v_min_f32_e32 v14, v17, v14
+; GFX9-NEXT:    v_min_f32_e32 v5, v5, v13
+; GFX9-NEXT:    v_lshlrev_b32_e32 v13, 16, v12
+; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v4
+; GFX9-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
+; GFX9-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GFX9-NEXT:    v_min_f32_e32 v13, v17, v13
+; GFX9-NEXT:    v_min_f32_e32 v4, v4, v12
+; GFX9-NEXT:    v_lshlrev_b32_e32 v12, 16, v11
+; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v3
+; GFX9-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
+; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX9-NEXT:    v_min_f32_e32 v12, v17, v12
+; GFX9-NEXT:    v_min_f32_e32 v3, v3, v11
+; GFX9-NEXT:    v_lshlrev_b32_e32 v11, 16, v10
+; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v2
+; GFX9-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
+; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX9-NEXT:    v_min_f32_e32 v11, v17, v11
+; GFX9-NEXT:    v_min_f32_e32 v2, v2, v10
+; GFX9-NEXT:    v_lshlrev_b32_e32 v10, 16, v9
+; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v1
+; GFX9-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
+; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX9-NEXT:    v_min_f32_e32 v10, v17, v10
+; GFX9-NEXT:    v_min_f32_e32 v1, v1, v9
+; GFX9-NEXT:    v_lshlrev_b32_e32 v9, 16, v8
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v0
 ; GFX9-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX9-NEXT:    v_min_f32_e32 v16, v17, v16
+; GFX9-NEXT:    v_min_f32_e32 v9, v17, v9
 ; GFX9-NEXT:    v_min_f32_e32 v0, v0, v8
 ; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
-; GFX9-NEXT:    v_perm_b32 v0, v0, v16, s4
-; GFX9-NEXT:    v_lshlrev_b32_e32 v8, 16, v9
-; GFX9-NEXT:    v_lshlrev_b32_e32 v16, 16, v1
-; GFX9-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
-; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX9-NEXT:    v_min_f32_e32 v8, v16, v8
-; GFX9-NEXT:    v_min_f32_e32 v1, v1, v9
-; GFX9-NEXT:    v_perm_b32 v1, v1, v8, s4
-; GFX9-NEXT:    v_lshlrev_b32_e32 v8, 16, v10
-; GFX9-NEXT:    v_lshlrev_b32_e32 v9, 16, v2
-; GFX9-NEXT:    v_min_f32_e32 v8, v9, v8
-; GFX9-NEXT:    v_and_b32_e32 v9, 0xffff0000, v10
-; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
-; GFX9-NEXT:    v_min_f32_e32 v2, v2, v9
-; GFX9-NEXT:    v_perm_b32 v2, v2, v8, s4
-; GFX9-NEXT:    v_lshlrev_b32_e32 v8, 16, v11
-; GFX9-NEXT:    v_lshlrev_b32_e32 v9, 16, v3
-; GFX9-NEXT:    v_min_f32_e32 v8, v9, v8
-; GFX9-NEXT:    v_and_b32_e32 v9, 0xffff0000, v11
-; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
-; GFX9-NEXT:    v_min_f32_e32 v3, v3, v9
-; GFX9-NEXT:    v_perm_b32 v3, v3, v8, s4
-; GFX9-NEXT:    v_lshlrev_b32_e32 v8, 16, v12
-; GFX9-NEXT:    v_lshlrev_b32_e32 v9, 16, v4
-; GFX9-NEXT:    v_min_f32_e32 v8, v9, v8
-; GFX9-NEXT:    v_and_b32_e32 v9, 0xffff0000, v12
-; GFX9-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
-; GFX9-NEXT:    v_min_f32_e32 v4, v4, v9
-; GFX9-NEXT:    v_perm_b32 v4, v4, v8, s4
-; GFX9-NEXT:    v_lshlrev_b32_e32 v8, 16, v13
-; GFX9-NEXT:    v_lshlrev_b32_e32 v9, 16, v5
-; GFX9-NEXT:    v_min_f32_e32 v8, v9, v8
-; GFX9-NEXT:    v_and_b32_e32 v9, 0xffff0000, v13
-; GFX9-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
-; GFX9-NEXT:    v_min_f32_e32 v5, v5, v9
-; GFX9-NEXT:    v_perm_b32 v5, v5, v8, s4
-; GFX9-NEXT:    v_lshlrev_b32_e32 v8, 16, v14
-; GFX9-NEXT:    v_lshlrev_b32_e32 v9, 16, v6
-; GFX9-NEXT:    v_min_f32_e32 v8, v9, v8
-; GFX9-NEXT:    v_and_b32_e32 v9, 0xffff0000, v14
-; GFX9-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
-; GFX9-NEXT:    v_min_f32_e32 v6, v6, v9
-; GFX9-NEXT:    v_perm_b32 v6, v6, v8, s4
-; GFX9-NEXT:    v_lshlrev_b32_e32 v8, 16, v15
-; GFX9-NEXT:    v_lshlrev_b32_e32 v9, 16, v7
-; GFX9-NEXT:    v_min_f32_e32 v8, v9, v8
-; GFX9-NEXT:    v_and_b32_e32 v9, 0xffff0000, v15
-; GFX9-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
-; GFX9-NEXT:    v_min_f32_e32 v7, v7, v9
-; GFX9-NEXT:    v_perm_b32 v7, v7, v8, s4
+; GFX9-NEXT:    v_perm_b32 v0, v0, v9, s4
+; GFX9-NEXT:    v_perm_b32 v1, v1, v10, s4
+; GFX9-NEXT:    v_perm_b32 v2, v2, v11, s4
+; GFX9-NEXT:    v_perm_b32 v3, v3, v12, s4
+; GFX9-NEXT:    v_perm_b32 v4, v4, v13, s4
+; GFX9-NEXT:    v_perm_b32 v5, v5, v14, s4
+; GFX9-NEXT:    v_perm_b32 v6, v6, v15, s4
+; GFX9-NEXT:    v_perm_b32 v7, v7, v16, s4
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_minnum_v16bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_lshlrev_b32_e32 v16, 16, v8
-; GFX10-NEXT:    v_lshlrev_b32_e32 v17, 16, v0
-; GFX10-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
-; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX10-NEXT:    v_lshlrev_b32_e32 v18, 16, v1
-; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX10-NEXT:    v_lshlrev_b32_e32 v16, 16, v15
+; GFX10-NEXT:    v_lshlrev_b32_e32 v17, 16, v7
+; GFX10-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
+; GFX10-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
+; GFX10-NEXT:    v_lshlrev_b32_e32 v18, 16, v13
+; GFX10-NEXT:    v_lshlrev_b32_e32 v19, 16, v5
 ; GFX10-NEXT:    v_min_f32_e32 v16, v17, v16
-; GFX10-NEXT:    v_lshlrev_b32_e32 v17, 16, v9
-; GFX10-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
+; GFX10-NEXT:    v_lshlrev_b32_e32 v17, 16, v6
+; GFX10-NEXT:    v_min_f32_e32 v7, v7, v15
+; GFX10-NEXT:    v_lshlrev_b32_e32 v15, 16, v14
+; GFX10-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
+; GFX10-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
+; GFX10-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
+; GFX10-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
+; GFX10-NEXT:    v_lshlrev_b32_e32 v20, 16, v12
+; GFX10-NEXT:    v_lshlrev_b32_e32 v21, 16, v4
+; GFX10-NEXT:    v_min_f32_e32 v15, v17, v15
+; GFX10-NEXT:    v_min_f32_e32 v6, v6, v14
+; GFX10-NEXT:    v_min_f32_e32 v14, v19, v18
+; GFX10-NEXT:    v_min_f32_e32 v5, v5, v13
+; GFX10-NEXT:    v_min_f32_e32 v13, v21, v20
+; GFX10-NEXT:    v_lshlrev_b32_e32 v17, 16, v11
+; GFX10-NEXT:    v_lshlrev_b32_e32 v18, 16, v3
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v19, 16, v10
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v20, 16, v2
+; GFX10-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
+; GFX10-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GFX10-NEXT:    v_min_f32_e32 v17, v18, v17
+; GFX10-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
+; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX10-NEXT:    v_min_f32_e32 v18, v20, v19
 ; GFX10-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
+; GFX10-NEXT:    v_lshlrev_b32_e32 v19, 16, v9
+; GFX10-NEXT:    v_lshlrev_b32_e32 v20, 16, v1
+; GFX10-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
+; GFX10-NEXT:    v_lshlrev_b32_e32 v21, 16, v8
+; GFX10-NEXT:    v_lshlrev_b32_e32 v22, 16, v0
+; GFX10-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX10-NEXT:    v_min_f32_e32 v19, v20, v19
+; GFX10-NEXT:    v_min_f32_e32 v20, v22, v21
 ; GFX10-NEXT:    v_min_f32_e32 v0, v0, v8
-; GFX10-NEXT:    v_min_f32_e32 v8, v18, v17
 ; GFX10-NEXT:    v_min_f32_e32 v1, v1, v9
-; GFX10-NEXT:    v_min_f32_e32 v9, v20, v19
 ; GFX10-NEXT:    v_min_f32_e32 v2, v2, v10
-; GFX10-NEXT:    v_perm_b32 v0, v0, v16, 0x7060302
-; GFX10-NEXT:    v_and_b32_e32 v10, 0xffff0000, v11
-; GFX10-NEXT:    v_perm_b32 v1, v1, v8, 0x7060302
-; GFX10-NEXT:    v_lshlrev_b32_e32 v8, 16, v11
-; GFX10-NEXT:    v_perm_b32 v2, v2, v9, 0x7060302
-; GFX10-NEXT:    v_lshlrev_b32_e32 v9, 16, v3
-; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
-; GFX10-NEXT:    v_lshlrev_b32_e32 v11, 16, v12
-; GFX10-NEXT:    v_lshlrev_b32_e32 v16, 16, v4
-; GFX10-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
-; GFX10-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
-; GFX10-NEXT:    v_lshlrev_b32_e32 v17, 16, v13
-; GFX10-NEXT:    v_lshlrev_b32_e32 v18, 16, v5
-; GFX10-NEXT:    v_min_f32_e32 v8, v9, v8
-; GFX10-NEXT:    v_min_f32_e32 v3, v3, v10
-; GFX10-NEXT:    v_min_f32_e32 v9, v16, v11
+; GFX10-NEXT:    v_min_f32_e32 v3, v3, v11
 ; GFX10-NEXT:    v_min_f32_e32 v4, v4, v12
-; GFX10-NEXT:    v_min_f32_e32 v10, v18, v17
-; GFX10-NEXT:    v_and_b32_e32 v11, 0xffff0000, v13
-; GFX10-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
-; GFX10-NEXT:    v_lshlrev_b32_e32 v12, 16, v14
-; GFX10-NEXT:    v_lshlrev_b32_e32 v13, 16, v6
-; GFX10-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
-; GFX10-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
-; GFX10-NEXT:    v_lshlrev_b32_e32 v16, 16, v15
-; GFX10-NEXT:    v_lshlrev_b32_e32 v17, 16, v7
-; GFX10-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
-; GFX10-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
-; GFX10-NEXT:    v_min_f32_e32 v5, v5, v11
-; GFX10-NEXT:    v_min_f32_e32 v11, v13, v12
-; GFX10-NEXT:    v_min_f32_e32 v6, v6, v14
-; GFX10-NEXT:    v_min_f32_e32 v12, v17, v16
-; GFX10-NEXT:    v_min_f32_e32 v7, v7, v15
-; GFX10-NEXT:    v_perm_b32 v3, v3, v8, 0x7060302
-; GFX10-NEXT:    v_perm_b32 v4, v4, v9, 0x7060302
-; GFX10-NEXT:    v_perm_b32 v5, v5, v10, 0x7060302
-; GFX10-NEXT:    v_perm_b32 v6, v6, v11, 0x7060302
-; GFX10-NEXT:    v_perm_b32 v7, v7, v12, 0x7060302
+; GFX10-NEXT:    v_perm_b32 v0, v0, v20, 0x7060302
+; GFX10-NEXT:    v_perm_b32 v1, v1, v19, 0x7060302
+; GFX10-NEXT:    v_perm_b32 v2, v2, v18, 0x7060302
+; GFX10-NEXT:    v_perm_b32 v3, v3, v17, 0x7060302
+; GFX10-NEXT:    v_perm_b32 v4, v4, v13, 0x7060302
+; GFX10-NEXT:    v_perm_b32 v5, v5, v14, 0x7060302
+; GFX10-NEXT:    v_perm_b32 v6, v6, v15, 0x7060302
+; GFX10-NEXT:    v_perm_b32 v7, v7, v16, 0x7060302
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_minnum_v16bf16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_lshlrev_b32_e32 v18, 16, v1
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX11-NEXT:    v_lshlrev_b32_e32 v16, 16, v8
-; GFX11-NEXT:    v_lshlrev_b32_e32 v19, 16, v10
-; GFX11-NEXT:    v_lshlrev_b32_e32 v20, 16, v2
-; GFX11-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
-; GFX11-NEXT:    v_lshlrev_b32_e32 v17, 16, v0
-; GFX11-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_min_f32_e32 v2, v2, v10
-; GFX11-NEXT:    v_dual_min_f32 v16, v17, v16 :: v_dual_lshlrev_b32 v17, 16, v9
-; GFX11-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_dual_min_f32 v1, v1, v9 :: v_dual_and_b32 v10, 0xffff0000, v11
-; GFX11-NEXT:    v_min_f32_e32 v9, v20, v19
-; GFX11-NEXT:    v_perm_b32 v2, v2, v9, 0x7060302
-; GFX11-NEXT:    v_lshlrev_b32_e32 v9, 16, v3
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_dual_min_f32 v3, v3, v10 :: v_dual_and_b32 v0, 0xffff0000, v0
-; GFX11-NEXT:    v_min_f32_e32 v0, v0, v8
-; GFX11-NEXT:    v_min_f32_e32 v8, v18, v17
-; GFX11-NEXT:    v_lshlrev_b32_e32 v17, 16, v13
-; GFX11-NEXT:    v_lshlrev_b32_e32 v18, 16, v5
-; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
-; GFX11-NEXT:    v_perm_b32 v0, v0, v16, 0x7060302
-; GFX11-NEXT:    v_perm_b32 v1, v1, v8, 0x7060302
-; GFX11-NEXT:    v_lshlrev_b32_e32 v8, 16, v11
-; GFX11-NEXT:    v_lshlrev_b32_e32 v11, 16, v12
-; GFX11-NEXT:    v_lshlrev_b32_e32 v16, 16, v4
+; GFX11-NEXT:    v_lshlrev_b32_e32 v20, 16, v12
+; GFX11-NEXT:    v_lshlrev_b32_e32 v21, 16, v4
+; GFX11-NEXT:    v_lshlrev_b32_e32 v18, 16, v13
+; GFX11-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
 ; GFX11-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
 ; GFX11-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
-; GFX11-NEXT:    v_dual_min_f32 v10, v18, v17 :: v_dual_lshlrev_b32 v17, 16, v7
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT:    v_dual_min_f32 v4, v4, v12 :: v_dual_and_b32 v7, 0xffff0000, v7
-; GFX11-NEXT:    v_dual_min_f32 v8, v9, v8 :: v_dual_min_f32 v9, v16, v11
-; GFX11-NEXT:    v_lshlrev_b32_e32 v12, 16, v14
-; GFX11-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
+; GFX11-NEXT:    v_lshlrev_b32_e32 v22, 16, v0
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v19, 16, v5
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_dual_min_f32 v4, v4, v12 :: v_dual_and_b32 v5, 0xffff0000, v5
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v16, 16, v15
-; GFX11-NEXT:    v_and_b32_e32 v11, 0xffff0000, v13
-; GFX11-NEXT:    v_lshlrev_b32_e32 v13, 16, v6
+; GFX11-NEXT:    v_lshlrev_b32_e32 v17, 16, v7
+; GFX11-NEXT:    v_min_f32_e32 v5, v5, v13
+; GFX11-NEXT:    v_min_f32_e32 v13, v21, v20
+; GFX11-NEXT:    v_lshlrev_b32_e32 v21, 16, v8
+; GFX11-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
+; GFX11-NEXT:    v_dual_min_f32 v16, v17, v16 :: v_dual_and_b32 v15, 0xffff0000, v15
+; GFX11-NEXT:    v_lshlrev_b32_e32 v17, 16, v6
+; GFX11-NEXT:    v_lshlrev_b32_e32 v20, 16, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_dual_min_f32 v0, v0, v8 :: v_dual_and_b32 v7, 0xffff0000, v7
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX11-NEXT:    v_perm_b32 v4, v4, v13, 0x7060302
+; GFX11-NEXT:    v_min_f32_e32 v7, v7, v15
+; GFX11-NEXT:    v_lshlrev_b32_e32 v15, 16, v14
+; GFX11-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
 ; GFX11-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
-; GFX11-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
-; GFX11-NEXT:    v_perm_b32 v3, v3, v8, 0x7060302
-; GFX11-NEXT:    v_min_f32_e32 v5, v5, v11
-; GFX11-NEXT:    v_min_f32_e32 v11, v13, v12
-; GFX11-NEXT:    v_min_f32_e32 v12, v17, v16
-; GFX11-NEXT:    v_dual_min_f32 v6, v6, v14 :: v_dual_min_f32 v7, v7, v15
-; GFX11-NEXT:    v_perm_b32 v4, v4, v9, 0x7060302
-; GFX11-NEXT:    v_perm_b32 v5, v5, v10, 0x7060302
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_perm_b32 v6, v6, v11, 0x7060302
-; GFX11-NEXT:    v_perm_b32 v7, v7, v12, 0x7060302
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_perm_b32 v7, v7, v16, 0x7060302
+; GFX11-NEXT:    v_min_f32_e32 v15, v17, v15
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX11-NEXT:    v_dual_min_f32 v6, v6, v14 :: v_dual_lshlrev_b32 v17, 16, v11
+; GFX11-NEXT:    v_min_f32_e32 v14, v19, v18
+; GFX11-NEXT:    v_lshlrev_b32_e32 v18, 16, v3
+; GFX11-NEXT:    v_lshlrev_b32_e32 v19, 16, v10
+; GFX11-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_dual_min_f32 v17, v18, v17 :: v_dual_and_b32 v10, 0xffff0000, v10
+; GFX11-NEXT:    v_perm_b32 v5, v5, v14, 0x7060302
+; GFX11-NEXT:    v_perm_b32 v6, v6, v15, 0x7060302
+; GFX11-NEXT:    v_min_f32_e32 v3, v3, v11
+; GFX11-NEXT:    v_dual_min_f32 v18, v20, v19 :: v_dual_lshlrev_b32 v19, 16, v9
+; GFX11-NEXT:    v_lshlrev_b32_e32 v20, 16, v1
+; GFX11-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
+; GFX11-NEXT:    v_dual_min_f32 v2, v2, v10 :: v_dual_and_b32 v1, 0xffff0000, v1
+; GFX11-NEXT:    v_perm_b32 v3, v3, v17, 0x7060302
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_dual_min_f32 v19, v20, v19 :: v_dual_min_f32 v20, v22, v21
+; GFX11-NEXT:    v_min_f32_e32 v1, v1, v9
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_perm_b32 v2, v2, v18, 0x7060302
+; GFX11-NEXT:    v_perm_b32 v0, v0, v20, 0x7060302
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX11-NEXT:    v_perm_b32 v1, v1, v19, 0x7060302
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = call <16 x bfloat> @llvm.minnum.v16bf16(<16 x bfloat> %a, <16 x bfloat> %b)
   ret <16 x bfloat> %op
@@ -14858,483 +14712,480 @@ define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
 ; GFX8-LABEL: v_minnum_v32bf16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_lshlrev_b32_e32 v31, 16, v16
-; GFX8-NEXT:    v_lshlrev_b32_e32 v32, 16, v0
-; GFX8-NEXT:    v_and_b32_e32 v16, 0xffff0000, v16
-; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX8-NEXT:    v_min_f32_e32 v0, v0, v16
+; GFX8-NEXT:    v_lshlrev_b32_e32 v31, 16, v30
+; GFX8-NEXT:    v_lshlrev_b32_e32 v32, 16, v14
+; GFX8-NEXT:    v_and_b32_e32 v30, 0xffff0000, v30
+; GFX8-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
 ; GFX8-NEXT:    v_min_f32_e32 v31, v32, v31
-; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX8-NEXT:    v_alignbit_b32 v0, v0, v31, 16
-; GFX8-NEXT:    v_lshlrev_b32_e32 v16, 16, v17
-; GFX8-NEXT:    v_lshlrev_b32_e32 v31, 16, v1
-; GFX8-NEXT:    v_and_b32_e32 v17, 0xffff0000, v17
-; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX8-NEXT:    v_min_f32_e32 v1, v1, v17
-; GFX8-NEXT:    v_min_f32_e32 v16, v31, v16
-; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX8-NEXT:    v_alignbit_b32 v1, v1, v16, 16
-; GFX8-NEXT:    v_lshlrev_b32_e32 v16, 16, v18
-; GFX8-NEXT:    v_lshlrev_b32_e32 v17, 16, v2
-; GFX8-NEXT:    v_min_f32_e32 v16, v17, v16
-; GFX8-NEXT:    v_and_b32_e32 v17, 0xffff0000, v18
-; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
-; GFX8-NEXT:    v_min_f32_e32 v2, v2, v17
-; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
-; GFX8-NEXT:    v_alignbit_b32 v2, v2, v16, 16
-; GFX8-NEXT:    v_lshlrev_b32_e32 v16, 16, v19
-; GFX8-NEXT:    v_lshlrev_b32_e32 v17, 16, v3
-; GFX8-NEXT:    v_min_f32_e32 v16, v17, v16
-; GFX8-NEXT:    v_and_b32_e32 v17, 0xffff0000, v19
-; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
-; GFX8-NEXT:    v_min_f32_e32 v3, v3, v17
-; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
-; GFX8-NEXT:    v_alignbit_b32 v3, v3, v16, 16
-; GFX8-NEXT:    v_lshlrev_b32_e32 v16, 16, v20
-; GFX8-NEXT:    v_lshlrev_b32_e32 v17, 16, v4
-; GFX8-NEXT:    v_min_f32_e32 v16, v17, v16
-; GFX8-NEXT:    v_and_b32_e32 v17, 0xffff0000, v20
-; GFX8-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
-; GFX8-NEXT:    v_min_f32_e32 v4, v4, v17
-; GFX8-NEXT:    buffer_load_dword v17, off, s[0:3], s32
-; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
-; GFX8-NEXT:    v_alignbit_b32 v4, v4, v16, 16
-; GFX8-NEXT:    v_lshlrev_b32_e32 v16, 16, v21
-; GFX8-NEXT:    v_lshlrev_b32_e32 v18, 16, v5
-; GFX8-NEXT:    v_min_f32_e32 v16, v18, v16
-; GFX8-NEXT:    v_and_b32_e32 v18, 0xffff0000, v21
-; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
-; GFX8-NEXT:    v_min_f32_e32 v5, v5, v18
-; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
-; GFX8-NEXT:    v_alignbit_b32 v5, v5, v16, 16
-; GFX8-NEXT:    v_lshlrev_b32_e32 v16, 16, v22
-; GFX8-NEXT:    v_lshlrev_b32_e32 v18, 16, v6
-; GFX8-NEXT:    v_min_f32_e32 v16, v18, v16
-; GFX8-NEXT:    v_and_b32_e32 v18, 0xffff0000, v22
-; GFX8-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
-; GFX8-NEXT:    v_min_f32_e32 v6, v6, v18
-; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
-; GFX8-NEXT:    v_alignbit_b32 v6, v6, v16, 16
-; GFX8-NEXT:    v_lshlrev_b32_e32 v16, 16, v23
-; GFX8-NEXT:    v_lshlrev_b32_e32 v18, 16, v7
-; GFX8-NEXT:    v_min_f32_e32 v16, v18, v16
-; GFX8-NEXT:    v_and_b32_e32 v18, 0xffff0000, v23
-; GFX8-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
-; GFX8-NEXT:    v_min_f32_e32 v7, v7, v18
-; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
-; GFX8-NEXT:    v_alignbit_b32 v7, v7, v16, 16
-; GFX8-NEXT:    v_lshlrev_b32_e32 v16, 16, v24
-; GFX8-NEXT:    v_lshlrev_b32_e32 v18, 16, v8
-; GFX8-NEXT:    v_min_f32_e32 v16, v18, v16
-; GFX8-NEXT:    v_and_b32_e32 v18, 0xffff0000, v24
+; GFX8-NEXT:    v_min_f32_e32 v30, v14, v30
+; GFX8-NEXT:    v_lshlrev_b32_e32 v14, 16, v29
+; GFX8-NEXT:    v_lshlrev_b32_e32 v32, 16, v13
+; GFX8-NEXT:    v_and_b32_e32 v29, 0xffff0000, v29
+; GFX8-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
+; GFX8-NEXT:    v_min_f32_e32 v14, v32, v14
+; GFX8-NEXT:    v_min_f32_e32 v13, v13, v29
+; GFX8-NEXT:    v_lshlrev_b32_e32 v29, 16, v28
+; GFX8-NEXT:    v_lshlrev_b32_e32 v32, 16, v12
+; GFX8-NEXT:    v_and_b32_e32 v28, 0xffff0000, v28
+; GFX8-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
+; GFX8-NEXT:    v_min_f32_e32 v29, v32, v29
+; GFX8-NEXT:    v_min_f32_e32 v12, v12, v28
+; GFX8-NEXT:    v_lshlrev_b32_e32 v28, 16, v27
+; GFX8-NEXT:    v_lshlrev_b32_e32 v32, 16, v11
+; GFX8-NEXT:    v_and_b32_e32 v27, 0xffff0000, v27
+; GFX8-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
+; GFX8-NEXT:    v_min_f32_e32 v28, v32, v28
+; GFX8-NEXT:    v_min_f32_e32 v11, v11, v27
+; GFX8-NEXT:    v_lshlrev_b32_e32 v27, 16, v26
+; GFX8-NEXT:    v_lshlrev_b32_e32 v32, 16, v10
+; GFX8-NEXT:    v_and_b32_e32 v26, 0xffff0000, v26
+; GFX8-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
+; GFX8-NEXT:    v_min_f32_e32 v27, v32, v27
+; GFX8-NEXT:    v_min_f32_e32 v10, v10, v26
+; GFX8-NEXT:    v_lshlrev_b32_e32 v26, 16, v25
+; GFX8-NEXT:    v_lshlrev_b32_e32 v32, 16, v9
+; GFX8-NEXT:    v_and_b32_e32 v25, 0xffff0000, v25
+; GFX8-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
+; GFX8-NEXT:    v_min_f32_e32 v26, v32, v26
+; GFX8-NEXT:    v_min_f32_e32 v9, v9, v25
+; GFX8-NEXT:    v_lshlrev_b32_e32 v25, 16, v24
+; GFX8-NEXT:    v_lshlrev_b32_e32 v32, 16, v8
+; GFX8-NEXT:    v_and_b32_e32 v24, 0xffff0000, v24
 ; GFX8-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
-; GFX8-NEXT:    v_min_f32_e32 v8, v8, v18
+; GFX8-NEXT:    v_min_f32_e32 v8, v8, v24
+; GFX8-NEXT:    buffer_load_dword v24, off, s[0:3], s32
+; GFX8-NEXT:    v_min_f32_e32 v25, v32, v25
+; GFX8-NEXT:    v_lshlrev_b32_e32 v32, 16, v15
+; GFX8-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v8, 16, v8
-; GFX8-NEXT:    v_alignbit_b32 v8, v8, v16, 16
-; GFX8-NEXT:    v_lshlrev_b32_e32 v16, 16, v25
-; GFX8-NEXT:    v_lshlrev_b32_e32 v18, 16, v9
-; GFX8-NEXT:    v_min_f32_e32 v16, v18, v16
-; GFX8-NEXT:    v_and_b32_e32 v18, 0xffff0000, v25
-; GFX8-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
-; GFX8-NEXT:    v_min_f32_e32 v9, v9, v18
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 16, v9
-; GFX8-NEXT:    v_alignbit_b32 v9, v9, v16, 16
-; GFX8-NEXT:    v_lshlrev_b32_e32 v16, 16, v26
-; GFX8-NEXT:    v_lshlrev_b32_e32 v18, 16, v10
-; GFX8-NEXT:    v_min_f32_e32 v16, v18, v16
-; GFX8-NEXT:    v_and_b32_e32 v18, 0xffff0000, v26
-; GFX8-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
-; GFX8-NEXT:    v_min_f32_e32 v10, v10, v18
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v10, 16, v10
-; GFX8-NEXT:    v_alignbit_b32 v10, v10, v16, 16
-; GFX8-NEXT:    v_lshlrev_b32_e32 v16, 16, v27
-; GFX8-NEXT:    v_lshlrev_b32_e32 v18, 16, v11
-; GFX8-NEXT:    v_min_f32_e32 v16, v18, v16
-; GFX8-NEXT:    v_and_b32_e32 v18, 0xffff0000, v27
-; GFX8-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
-; GFX8-NEXT:    v_min_f32_e32 v11, v11, v18
-; GFX8-NEXT:    v_lshrrev_b32_e32 v11, 16, v11
-; GFX8-NEXT:    v_alignbit_b32 v11, v11, v16, 16
-; GFX8-NEXT:    v_lshlrev_b32_e32 v16, 16, v28
-; GFX8-NEXT:    v_lshlrev_b32_e32 v18, 16, v12
-; GFX8-NEXT:    v_min_f32_e32 v16, v18, v16
-; GFX8-NEXT:    v_and_b32_e32 v18, 0xffff0000, v28
-; GFX8-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
-; GFX8-NEXT:    v_min_f32_e32 v12, v12, v18
-; GFX8-NEXT:    v_lshrrev_b32_e32 v12, 16, v12
-; GFX8-NEXT:    v_alignbit_b32 v12, v12, v16, 16
-; GFX8-NEXT:    v_lshlrev_b32_e32 v16, 16, v29
-; GFX8-NEXT:    v_lshlrev_b32_e32 v18, 16, v13
-; GFX8-NEXT:    v_min_f32_e32 v16, v18, v16
-; GFX8-NEXT:    v_and_b32_e32 v18, 0xffff0000, v29
-; GFX8-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
-; GFX8-NEXT:    v_min_f32_e32 v13, v13, v18
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v13, 16, v13
-; GFX8-NEXT:    v_alignbit_b32 v13, v13, v16, 16
-; GFX8-NEXT:    v_lshlrev_b32_e32 v16, 16, v30
-; GFX8-NEXT:    v_lshlrev_b32_e32 v18, 16, v14
-; GFX8-NEXT:    v_min_f32_e32 v16, v18, v16
-; GFX8-NEXT:    v_and_b32_e32 v18, 0xffff0000, v30
-; GFX8-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
-; GFX8-NEXT:    v_min_f32_e32 v14, v14, v18
-; GFX8-NEXT:    v_lshrrev_b32_e32 v14, 16, v14
-; GFX8-NEXT:    v_alignbit_b32 v14, v14, v16, 16
+; GFX8-NEXT:    v_lshrrev_b32_e32 v12, 16, v12
+; GFX8-NEXT:    v_lshrrev_b32_e32 v11, 16, v11
+; GFX8-NEXT:    v_alignbit_b32 v8, v8, v25, 16
+; GFX8-NEXT:    v_alignbit_b32 v9, v9, v26, 16
+; GFX8-NEXT:    v_alignbit_b32 v10, v10, v27, 16
+; GFX8-NEXT:    v_alignbit_b32 v11, v11, v28, 16
+; GFX8-NEXT:    v_alignbit_b32 v12, v12, v29, 16
+; GFX8-NEXT:    v_alignbit_b32 v13, v13, v14, 16
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_lshlrev_b32_e32 v16, 16, v17
-; GFX8-NEXT:    v_lshlrev_b32_e32 v18, 16, v15
+; GFX8-NEXT:    v_lshlrev_b32_e32 v33, 16, v24
+; GFX8-NEXT:    v_and_b32_e32 v24, 0xffff0000, v24
+; GFX8-NEXT:    v_min_f32_e32 v32, v32, v33
+; GFX8-NEXT:    v_min_f32_e32 v15, v15, v24
+; GFX8-NEXT:    v_lshlrev_b32_e32 v24, 16, v23
+; GFX8-NEXT:    v_lshlrev_b32_e32 v33, 16, v7
+; GFX8-NEXT:    v_and_b32_e32 v23, 0xffff0000, v23
+; GFX8-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
+; GFX8-NEXT:    v_min_f32_e32 v24, v33, v24
+; GFX8-NEXT:    v_min_f32_e32 v7, v7, v23
+; GFX8-NEXT:    v_lshlrev_b32_e32 v23, 16, v22
+; GFX8-NEXT:    v_lshlrev_b32_e32 v33, 16, v6
+; GFX8-NEXT:    v_and_b32_e32 v22, 0xffff0000, v22
+; GFX8-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
+; GFX8-NEXT:    v_min_f32_e32 v23, v33, v23
+; GFX8-NEXT:    v_min_f32_e32 v6, v6, v22
+; GFX8-NEXT:    v_lshlrev_b32_e32 v22, 16, v21
+; GFX8-NEXT:    v_lshlrev_b32_e32 v33, 16, v5
+; GFX8-NEXT:    v_and_b32_e32 v21, 0xffff0000, v21
+; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
+; GFX8-NEXT:    v_min_f32_e32 v22, v33, v22
+; GFX8-NEXT:    v_min_f32_e32 v5, v5, v21
+; GFX8-NEXT:    v_lshlrev_b32_e32 v21, 16, v20
+; GFX8-NEXT:    v_lshlrev_b32_e32 v33, 16, v4
+; GFX8-NEXT:    v_and_b32_e32 v20, 0xffff0000, v20
+; GFX8-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GFX8-NEXT:    v_min_f32_e32 v21, v33, v21
+; GFX8-NEXT:    v_min_f32_e32 v4, v4, v20
+; GFX8-NEXT:    v_lshlrev_b32_e32 v20, 16, v19
+; GFX8-NEXT:    v_lshlrev_b32_e32 v33, 16, v3
+; GFX8-NEXT:    v_and_b32_e32 v19, 0xffff0000, v19
+; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX8-NEXT:    v_min_f32_e32 v20, v33, v20
+; GFX8-NEXT:    v_min_f32_e32 v3, v3, v19
+; GFX8-NEXT:    v_lshlrev_b32_e32 v19, 16, v18
+; GFX8-NEXT:    v_lshlrev_b32_e32 v33, 16, v2
+; GFX8-NEXT:    v_and_b32_e32 v18, 0xffff0000, v18
+; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX8-NEXT:    v_min_f32_e32 v19, v33, v19
+; GFX8-NEXT:    v_min_f32_e32 v2, v2, v18
+; GFX8-NEXT:    v_lshlrev_b32_e32 v18, 16, v17
+; GFX8-NEXT:    v_lshlrev_b32_e32 v33, 16, v1
 ; GFX8-NEXT:    v_and_b32_e32 v17, 0xffff0000, v17
-; GFX8-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
-; GFX8-NEXT:    v_min_f32_e32 v15, v15, v17
-; GFX8-NEXT:    v_min_f32_e32 v16, v18, v16
+; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX8-NEXT:    v_min_f32_e32 v18, v33, v18
+; GFX8-NEXT:    v_min_f32_e32 v1, v1, v17
+; GFX8-NEXT:    v_lshlrev_b32_e32 v17, 16, v16
+; GFX8-NEXT:    v_lshlrev_b32_e32 v33, 16, v0
+; GFX8-NEXT:    v_and_b32_e32 v16, 0xffff0000, v16
+; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    v_min_f32_e32 v0, v0, v16
+; GFX8-NEXT:    v_min_f32_e32 v17, v33, v17
+; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
+; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
+; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
+; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v15, 16, v15
-; GFX8-NEXT:    v_alignbit_b32 v15, v15, v16, 16
+; GFX8-NEXT:    v_lshrrev_b32_e32 v16, 16, v30
+; GFX8-NEXT:    v_alignbit_b32 v0, v0, v17, 16
+; GFX8-NEXT:    v_alignbit_b32 v1, v1, v18, 16
+; GFX8-NEXT:    v_alignbit_b32 v2, v2, v19, 16
+; GFX8-NEXT:    v_alignbit_b32 v3, v3, v20, 16
+; GFX8-NEXT:    v_alignbit_b32 v4, v4, v21, 16
+; GFX8-NEXT:    v_alignbit_b32 v5, v5, v22, 16
+; GFX8-NEXT:    v_alignbit_b32 v6, v6, v23, 16
+; GFX8-NEXT:    v_alignbit_b32 v7, v7, v24, 16
+; GFX8-NEXT:    v_alignbit_b32 v14, v16, v31, 16
+; GFX8-NEXT:    v_alignbit_b32 v15, v15, v32, 16
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_minnum_v32bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_lshlrev_b32_e32 v31, 16, v16
-; GFX9-NEXT:    v_lshlrev_b32_e32 v32, 16, v0
-; GFX9-NEXT:    v_and_b32_e32 v16, 0xffff0000, v16
-; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    v_lshlrev_b32_e32 v31, 16, v30
+; GFX9-NEXT:    v_lshlrev_b32_e32 v32, 16, v14
+; GFX9-NEXT:    v_and_b32_e32 v30, 0xffff0000, v30
+; GFX9-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
 ; GFX9-NEXT:    v_min_f32_e32 v31, v32, v31
-; GFX9-NEXT:    v_min_f32_e32 v0, v0, v16
+; GFX9-NEXT:    v_min_f32_e32 v14, v14, v30
+; GFX9-NEXT:    v_lshlrev_b32_e32 v30, 16, v29
+; GFX9-NEXT:    v_lshlrev_b32_e32 v32, 16, v13
+; GFX9-NEXT:    v_and_b32_e32 v29, 0xffff0000, v29
+; GFX9-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
+; GFX9-NEXT:    v_min_f32_e32 v30, v32, v30
+; GFX9-NEXT:    v_min_f32_e32 v13, v13, v29
+; GFX9-NEXT:    v_lshlrev_b32_e32 v29, 16, v28
+; GFX9-NEXT:    v_lshlrev_b32_e32 v32, 16, v12
+; GFX9-NEXT:    v_and_b32_e32 v28, 0xffff0000, v28
+; GFX9-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
+; GFX9-NEXT:    v_min_f32_e32 v29, v32, v29
+; GFX9-NEXT:    v_min_f32_e32 v12, v12, v28
+; GFX9-NEXT:    v_lshlrev_b32_e32 v28, 16, v27
+; GFX9-NEXT:    v_lshlrev_b32_e32 v32, 16, v11
+; GFX9-NEXT:    v_and_b32_e32 v27, 0xffff0000, v27
+; GFX9-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
+; GFX9-NEXT:    v_min_f32_e32 v28, v32, v28
+; GFX9-NEXT:    v_min_f32_e32 v11, v11, v27
+; GFX9-NEXT:    v_lshlrev_b32_e32 v27, 16, v26
+; GFX9-NEXT:    v_lshlrev_b32_e32 v32, 16, v10
+; GFX9-NEXT:    v_and_b32_e32 v26, 0xffff0000, v26
+; GFX9-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
+; GFX9-NEXT:    v_min_f32_e32 v27, v32, v27
+; GFX9-NEXT:    v_min_f32_e32 v10, v10, v26
+; GFX9-NEXT:    v_lshlrev_b32_e32 v26, 16, v25
+; GFX9-NEXT:    v_lshlrev_b32_e32 v32, 16, v9
+; GFX9-NEXT:    v_and_b32_e32 v25, 0xffff0000, v25
+; GFX9-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
+; GFX9-NEXT:    v_min_f32_e32 v26, v32, v26
+; GFX9-NEXT:    v_min_f32_e32 v9, v9, v25
+; GFX9-NEXT:    v_lshlrev_b32_e32 v25, 16, v24
+; GFX9-NEXT:    v_lshlrev_b32_e32 v32, 16, v8
+; GFX9-NEXT:    v_and_b32_e32 v24, 0xffff0000, v24
+; GFX9-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
+; GFX9-NEXT:    v_min_f32_e32 v8, v8, v24
+; GFX9-NEXT:    buffer_load_dword v24, off, s[0:3], s32
+; GFX9-NEXT:    v_min_f32_e32 v25, v32, v25
+; GFX9-NEXT:    v_lshlrev_b32_e32 v32, 16, v15
+; GFX9-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
 ; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
-; GFX9-NEXT:    v_perm_b32 v0, v0, v31, s4
-; GFX9-NEXT:    v_lshlrev_b32_e32 v16, 16, v17
-; GFX9-NEXT:    v_lshlrev_b32_e32 v31, 16, v1
+; GFX9-NEXT:    v_perm_b32 v8, v8, v25, s4
+; GFX9-NEXT:    v_perm_b32 v9, v9, v26, s4
+; GFX9-NEXT:    v_perm_b32 v10, v10, v27, s4
+; GFX9-NEXT:    v_perm_b32 v11, v11, v28, s4
+; GFX9-NEXT:    v_perm_b32 v12, v12, v29, s4
+; GFX9-NEXT:    v_perm_b32 v13, v13, v30, s4
+; GFX9-NEXT:    v_perm_b32 v14, v14, v31, s4
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_lshlrev_b32_e32 v33, 16, v24
+; GFX9-NEXT:    v_and_b32_e32 v24, 0xffff0000, v24
+; GFX9-NEXT:    v_min_f32_e32 v32, v32, v33
+; GFX9-NEXT:    v_min_f32_e32 v15, v15, v24
+; GFX9-NEXT:    v_lshlrev_b32_e32 v24, 16, v23
+; GFX9-NEXT:    v_lshlrev_b32_e32 v33, 16, v7
+; GFX9-NEXT:    v_and_b32_e32 v23, 0xffff0000, v23
+; GFX9-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
+; GFX9-NEXT:    v_min_f32_e32 v24, v33, v24
+; GFX9-NEXT:    v_min_f32_e32 v7, v7, v23
+; GFX9-NEXT:    v_lshlrev_b32_e32 v23, 16, v22
+; GFX9-NEXT:    v_lshlrev_b32_e32 v33, 16, v6
+; GFX9-NEXT:    v_and_b32_e32 v22, 0xffff0000, v22
+; GFX9-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
+; GFX9-NEXT:    v_min_f32_e32 v23, v33, v23
+; GFX9-NEXT:    v_min_f32_e32 v6, v6, v22
+; GFX9-NEXT:    v_lshlrev_b32_e32 v22, 16, v21
+; GFX9-NEXT:    v_lshlrev_b32_e32 v33, 16, v5
+; GFX9-NEXT:    v_and_b32_e32 v21, 0xffff0000, v21
+; GFX9-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
+; GFX9-NEXT:    v_min_f32_e32 v22, v33, v22
+; GFX9-NEXT:    v_min_f32_e32 v5, v5, v21
+; GFX9-NEXT:    v_lshlrev_b32_e32 v21, 16, v20
+; GFX9-NEXT:    v_lshlrev_b32_e32 v33, 16, v4
+; GFX9-NEXT:    v_and_b32_e32 v20, 0xffff0000, v20
+; GFX9-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GFX9-NEXT:    v_min_f32_e32 v21, v33, v21
+; GFX9-NEXT:    v_min_f32_e32 v4, v4, v20
+; GFX9-NEXT:    v_lshlrev_b32_e32 v20, 16, v19
+; GFX9-NEXT:    v_lshlrev_b32_e32 v33, 16, v3
+; GFX9-NEXT:    v_and_b32_e32 v19, 0xffff0000, v19
+; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX9-NEXT:    v_min_f32_e32 v20, v33, v20
+; GFX9-NEXT:    v_min_f32_e32 v3, v3, v19
+; GFX9-NEXT:    v_lshlrev_b32_e32 v19, 16, v18
+; GFX9-NEXT:    v_lshlrev_b32_e32 v33, 16, v2
+; GFX9-NEXT:    v_and_b32_e32 v18, 0xffff0000, v18
+; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX9-NEXT:    v_min_f32_e32 v19, v33, v19
+; GFX9-NEXT:    v_min_f32_e32 v2, v2, v18
+; GFX9-NEXT:    v_lshlrev_b32_e32 v18, 16, v17
+; GFX9-NEXT:    v_lshlrev_b32_e32 v33, 16, v1
 ; GFX9-NEXT:    v_and_b32_e32 v17, 0xffff0000, v17
 ; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX9-NEXT:    v_min_f32_e32 v16, v31, v16
+; GFX9-NEXT:    v_min_f32_e32 v18, v33, v18
 ; GFX9-NEXT:    v_min_f32_e32 v1, v1, v17
-; GFX9-NEXT:    v_perm_b32 v1, v1, v16, s4
-; GFX9-NEXT:    v_lshlrev_b32_e32 v16, 16, v18
-; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v2
-; GFX9-NEXT:    v_min_f32_e32 v16, v17, v16
-; GFX9-NEXT:    v_and_b32_e32 v17, 0xffff0000, v18
-; GFX9-NEXT:    buffer_load_dword v18, off, s[0:3], s32
-; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
-; GFX9-NEXT:    v_min_f32_e32 v2, v2, v17
-; GFX9-NEXT:    v_perm_b32 v2, v2, v16, s4
-; GFX9-NEXT:    v_lshlrev_b32_e32 v16, 16, v19
-; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v3
-; GFX9-NEXT:    v_min_f32_e32 v16, v17, v16
-; GFX9-NEXT:    v_and_b32_e32 v17, 0xffff0000, v19
-; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
-; GFX9-NEXT:    v_min_f32_e32 v3, v3, v17
-; GFX9-NEXT:    v_perm_b32 v3, v3, v16, s4
-; GFX9-NEXT:    v_lshlrev_b32_e32 v16, 16, v20
-; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v4
-; GFX9-NEXT:    v_min_f32_e32 v16, v17, v16
-; GFX9-NEXT:    v_and_b32_e32 v17, 0xffff0000, v20
-; GFX9-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
-; GFX9-NEXT:    v_min_f32_e32 v4, v4, v17
-; GFX9-NEXT:    v_perm_b32 v4, v4, v16, s4
-; GFX9-NEXT:    v_lshlrev_b32_e32 v16, 16, v21
-; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v5
-; GFX9-NEXT:    v_min_f32_e32 v16, v17, v16
-; GFX9-NEXT:    v_and_b32_e32 v17, 0xffff0000, v21
-; GFX9-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
-; GFX9-NEXT:    v_min_f32_e32 v5, v5, v17
-; GFX9-NEXT:    v_perm_b32 v5, v5, v16, s4
-; GFX9-NEXT:    v_lshlrev_b32_e32 v16, 16, v22
-; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v6
-; GFX9-NEXT:    v_min_f32_e32 v16, v17, v16
-; GFX9-NEXT:    v_and_b32_e32 v17, 0xffff0000, v22
-; GFX9-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
-; GFX9-NEXT:    v_min_f32_e32 v6, v6, v17
-; GFX9-NEXT:    v_perm_b32 v6, v6, v16, s4
-; GFX9-NEXT:    v_lshlrev_b32_e32 v16, 16, v23
-; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v7
-; GFX9-NEXT:    v_min_f32_e32 v16, v17, v16
-; GFX9-NEXT:    v_and_b32_e32 v17, 0xffff0000, v23
-; GFX9-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
-; GFX9-NEXT:    v_min_f32_e32 v7, v7, v17
-; GFX9-NEXT:    v_perm_b32 v7, v7, v16, s4
-; GFX9-NEXT:    v_lshlrev_b32_e32 v16, 16, v24
-; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v8
-; GFX9-NEXT:    v_min_f32_e32 v16, v17, v16
-; GFX9-NEXT:    v_and_b32_e32 v17, 0xffff0000, v24
-; GFX9-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
-; GFX9-NEXT:    v_min_f32_e32 v8, v8, v17
-; GFX9-NEXT:    v_perm_b32 v8, v8, v16, s4
-; GFX9-NEXT:    v_lshlrev_b32_e32 v16, 16, v25
-; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v9
-; GFX9-NEXT:    v_min_f32_e32 v16, v17, v16
-; GFX9-NEXT:    v_and_b32_e32 v17, 0xffff0000, v25
-; GFX9-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
-; GFX9-NEXT:    v_min_f32_e32 v9, v9, v17
-; GFX9-NEXT:    v_perm_b32 v9, v9, v16, s4
-; GFX9-NEXT:    v_lshlrev_b32_e32 v16, 16, v26
-; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v10
-; GFX9-NEXT:    v_min_f32_e32 v16, v17, v16
-; GFX9-NEXT:    v_and_b32_e32 v17, 0xffff0000, v26
-; GFX9-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
-; GFX9-NEXT:    v_min_f32_e32 v10, v10, v17
-; GFX9-NEXT:    v_perm_b32 v10, v10, v16, s4
-; GFX9-NEXT:    v_lshlrev_b32_e32 v16, 16, v27
-; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v11
-; GFX9-NEXT:    v_min_f32_e32 v16, v17, v16
-; GFX9-NEXT:    v_and_b32_e32 v17, 0xffff0000, v27
-; GFX9-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
-; GFX9-NEXT:    v_min_f32_e32 v11, v11, v17
-; GFX9-NEXT:    v_perm_b32 v11, v11, v16, s4
-; GFX9-NEXT:    v_lshlrev_b32_e32 v16, 16, v28
-; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v12
-; GFX9-NEXT:    v_min_f32_e32 v16, v17, v16
-; GFX9-NEXT:    v_and_b32_e32 v17, 0xffff0000, v28
-; GFX9-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
-; GFX9-NEXT:    v_min_f32_e32 v12, v12, v17
-; GFX9-NEXT:    v_perm_b32 v12, v12, v16, s4
-; GFX9-NEXT:    v_lshlrev_b32_e32 v16, 16, v29
-; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v13
-; GFX9-NEXT:    v_min_f32_e32 v16, v17, v16
-; GFX9-NEXT:    v_and_b32_e32 v17, 0xffff0000, v29
-; GFX9-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
-; GFX9-NEXT:    v_min_f32_e32 v13, v13, v17
-; GFX9-NEXT:    v_perm_b32 v13, v13, v16, s4
-; GFX9-NEXT:    v_lshlrev_b32_e32 v16, 16, v30
-; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v14
-; GFX9-NEXT:    v_min_f32_e32 v16, v17, v16
-; GFX9-NEXT:    v_and_b32_e32 v17, 0xffff0000, v30
-; GFX9-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
-; GFX9-NEXT:    v_min_f32_e32 v14, v14, v17
-; GFX9-NEXT:    v_perm_b32 v14, v14, v16, s4
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_lshlrev_b32_e32 v16, 16, v18
-; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v15
-; GFX9-NEXT:    v_min_f32_e32 v16, v17, v16
-; GFX9-NEXT:    v_and_b32_e32 v17, 0xffff0000, v18
-; GFX9-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
-; GFX9-NEXT:    v_min_f32_e32 v15, v15, v17
-; GFX9-NEXT:    v_perm_b32 v15, v15, v16, s4
+; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v16
+; GFX9-NEXT:    v_lshlrev_b32_e32 v33, 16, v0
+; GFX9-NEXT:    v_and_b32_e32 v16, 0xffff0000, v16
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    v_min_f32_e32 v17, v33, v17
+; GFX9-NEXT:    v_min_f32_e32 v0, v0, v16
+; GFX9-NEXT:    v_perm_b32 v0, v0, v17, s4
+; GFX9-NEXT:    v_perm_b32 v1, v1, v18, s4
+; GFX9-NEXT:    v_perm_b32 v2, v2, v19, s4
+; GFX9-NEXT:    v_perm_b32 v3, v3, v20, s4
+; GFX9-NEXT:    v_perm_b32 v4, v4, v21, s4
+; GFX9-NEXT:    v_perm_b32 v5, v5, v22, s4
+; GFX9-NEXT:    v_perm_b32 v6, v6, v23, s4
+; GFX9-NEXT:    v_perm_b32 v7, v7, v24, s4
+; GFX9-NEXT:    v_perm_b32 v15, v15, v32, s4
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_minnum_v32bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    buffer_load_dword v31, off, s[0:3], s32
-; GFX10-NEXT:    v_lshlrev_b32_e32 v50, 16, v21
-; GFX10-NEXT:    v_lshlrev_b32_e32 v51, 16, v5
-; GFX10-NEXT:    v_and_b32_e32 v21, 0xffff0000, v21
-; GFX10-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
-; GFX10-NEXT:    v_lshlrev_b32_e32 v52, 16, v22
-; GFX10-NEXT:    v_lshlrev_b32_e32 v53, 16, v6
-; GFX10-NEXT:    v_and_b32_e32 v22, 0xffff0000, v22
-; GFX10-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
-; GFX10-NEXT:    v_lshlrev_b32_e32 v54, 16, v23
-; GFX10-NEXT:    v_lshlrev_b32_e32 v55, 16, v7
-; GFX10-NEXT:    v_and_b32_e32 v23, 0xffff0000, v23
-; GFX10-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
-; GFX10-NEXT:    v_lshlrev_b32_e32 v32, 16, v16
-; GFX10-NEXT:    v_lshlrev_b32_e32 v33, 16, v0
-; GFX10-NEXT:    v_and_b32_e32 v16, 0xffff0000, v16
-; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX10-NEXT:    v_lshlrev_b32_e32 v34, 16, v17
-; GFX10-NEXT:    v_lshlrev_b32_e32 v35, 16, v1
+; GFX10-NEXT:    v_lshlrev_b32_e32 v39, 16, v27
+; GFX10-NEXT:    v_lshlrev_b32_e32 v48, 16, v11
+; GFX10-NEXT:    v_and_b32_e32 v27, 0xffff0000, v27
+; GFX10-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
+; GFX10-NEXT:    v_lshlrev_b32_e32 v49, 16, v26
+; GFX10-NEXT:    v_lshlrev_b32_e32 v50, 16, v10
+; GFX10-NEXT:    v_and_b32_e32 v26, 0xffff0000, v26
+; GFX10-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
+; GFX10-NEXT:    v_lshlrev_b32_e32 v33, 16, v30
+; GFX10-NEXT:    v_lshlrev_b32_e32 v34, 16, v14
+; GFX10-NEXT:    v_and_b32_e32 v30, 0xffff0000, v30
+; GFX10-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
+; GFX10-NEXT:    v_lshlrev_b32_e32 v35, 16, v29
+; GFX10-NEXT:    v_lshlrev_b32_e32 v36, 16, v13
+; GFX10-NEXT:    v_and_b32_e32 v29, 0xffff0000, v29
+; GFX10-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
+; GFX10-NEXT:    v_lshlrev_b32_e32 v37, 16, v28
+; GFX10-NEXT:    v_lshlrev_b32_e32 v38, 16, v12
+; GFX10-NEXT:    v_and_b32_e32 v28, 0xffff0000, v28
+; GFX10-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
+; GFX10-NEXT:    v_min_f32_e32 v39, v48, v39
+; GFX10-NEXT:    v_lshlrev_b32_e32 v48, 16, v17
+; GFX10-NEXT:    v_min_f32_e32 v11, v11, v27
+; GFX10-NEXT:    v_lshlrev_b32_e32 v27, 16, v1
 ; GFX10-NEXT:    v_and_b32_e32 v17, 0xffff0000, v17
 ; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX10-NEXT:    v_lshlrev_b32_e32 v36, 16, v18
-; GFX10-NEXT:    v_lshlrev_b32_e32 v37, 16, v2
-; GFX10-NEXT:    v_and_b32_e32 v18, 0xffff0000, v18
-; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
-; GFX10-NEXT:    v_lshlrev_b32_e32 v38, 16, v19
-; GFX10-NEXT:    v_lshlrev_b32_e32 v39, 16, v3
-; GFX10-NEXT:    v_and_b32_e32 v19, 0xffff0000, v19
-; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
-; GFX10-NEXT:    v_lshlrev_b32_e32 v48, 16, v20
-; GFX10-NEXT:    v_lshlrev_b32_e32 v49, 16, v4
-; GFX10-NEXT:    v_and_b32_e32 v20, 0xffff0000, v20
-; GFX10-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
-; GFX10-NEXT:    v_min_f32_e32 v5, v5, v21
-; GFX10-NEXT:    v_min_f32_e32 v21, v53, v52
-; GFX10-NEXT:    v_min_f32_e32 v6, v6, v22
-; GFX10-NEXT:    v_min_f32_e32 v22, v55, v54
-; GFX10-NEXT:    v_min_f32_e32 v7, v7, v23
-; GFX10-NEXT:    v_lshlrev_b32_e32 v64, 16, v24
-; GFX10-NEXT:    v_lshlrev_b32_e32 v65, 16, v8
-; GFX10-NEXT:    v_and_b32_e32 v24, 0xffff0000, v24
-; GFX10-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
-; GFX10-NEXT:    v_lshlrev_b32_e32 v66, 16, v25
-; GFX10-NEXT:    v_lshlrev_b32_e32 v67, 16, v9
+; GFX10-NEXT:    v_min_f32_e32 v49, v50, v49
+; GFX10-NEXT:    v_lshlrev_b32_e32 v50, 16, v16
+; GFX10-NEXT:    v_min_f32_e32 v10, v10, v26
+; GFX10-NEXT:    v_lshlrev_b32_e32 v26, 16, v0
+; GFX10-NEXT:    v_and_b32_e32 v16, 0xffff0000, v16
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v32, 16, v15
+; GFX10-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
+; GFX10-NEXT:    v_lshlrev_b32_e32 v51, 16, v25
+; GFX10-NEXT:    v_lshlrev_b32_e32 v52, 16, v9
 ; GFX10-NEXT:    v_and_b32_e32 v25, 0xffff0000, v25
 ; GFX10-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
-; GFX10-NEXT:    v_lshlrev_b32_e32 v68, 16, v26
-; GFX10-NEXT:    v_min_f32_e32 v32, v33, v32
-; GFX10-NEXT:    v_lshlrev_b32_e32 v33, 16, v10
-; GFX10-NEXT:    v_and_b32_e32 v26, 0xffff0000, v26
-; GFX10-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
+; GFX10-NEXT:    v_lshlrev_b32_e32 v53, 16, v24
+; GFX10-NEXT:    v_lshlrev_b32_e32 v54, 16, v8
+; GFX10-NEXT:    v_and_b32_e32 v24, 0xffff0000, v24
+; GFX10-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
+; GFX10-NEXT:    v_lshlrev_b32_e32 v55, 16, v23
+; GFX10-NEXT:    v_lshlrev_b32_e32 v64, 16, v7
+; GFX10-NEXT:    v_and_b32_e32 v23, 0xffff0000, v23
+; GFX10-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
+; GFX10-NEXT:    v_lshlrev_b32_e32 v65, 16, v22
+; GFX10-NEXT:    v_lshlrev_b32_e32 v66, 16, v6
+; GFX10-NEXT:    v_and_b32_e32 v22, 0xffff0000, v22
+; GFX10-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
+; GFX10-NEXT:    v_lshlrev_b32_e32 v67, 16, v21
+; GFX10-NEXT:    v_lshlrev_b32_e32 v68, 16, v5
+; GFX10-NEXT:    v_and_b32_e32 v21, 0xffff0000, v21
+; GFX10-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
+; GFX10-NEXT:    v_min_f32_e32 v33, v34, v33
+; GFX10-NEXT:    v_lshlrev_b32_e32 v34, 16, v20
+; GFX10-NEXT:    v_min_f32_e32 v14, v14, v30
+; GFX10-NEXT:    v_lshlrev_b32_e32 v30, 16, v4
+; GFX10-NEXT:    v_and_b32_e32 v20, 0xffff0000, v20
+; GFX10-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GFX10-NEXT:    v_min_f32_e32 v35, v36, v35
+; GFX10-NEXT:    v_lshlrev_b32_e32 v36, 16, v19
+; GFX10-NEXT:    v_min_f32_e32 v13, v13, v29
+; GFX10-NEXT:    v_lshlrev_b32_e32 v29, 16, v3
+; GFX10-NEXT:    v_and_b32_e32 v19, 0xffff0000, v19
+; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX10-NEXT:    v_min_f32_e32 v37, v38, v37
+; GFX10-NEXT:    v_lshlrev_b32_e32 v38, 16, v18
+; GFX10-NEXT:    v_min_f32_e32 v12, v12, v28
+; GFX10-NEXT:    v_lshlrev_b32_e32 v28, 16, v2
+; GFX10-NEXT:    v_and_b32_e32 v18, 0xffff0000, v18
+; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX10-NEXT:    v_min_f32_e32 v0, v0, v16
-; GFX10-NEXT:    v_lshlrev_b32_e32 v16, 16, v27
-; GFX10-NEXT:    v_min_f32_e32 v34, v35, v34
-; GFX10-NEXT:    v_lshlrev_b32_e32 v35, 16, v11
-; GFX10-NEXT:    v_and_b32_e32 v27, 0xffff0000, v27
-; GFX10-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
 ; GFX10-NEXT:    v_min_f32_e32 v1, v1, v17
-; GFX10-NEXT:    v_lshlrev_b32_e32 v17, 16, v28
-; GFX10-NEXT:    v_min_f32_e32 v36, v37, v36
-; GFX10-NEXT:    v_lshlrev_b32_e32 v37, 16, v12
-; GFX10-NEXT:    v_and_b32_e32 v28, 0xffff0000, v28
-; GFX10-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
+; GFX10-NEXT:    v_min_f32_e32 v51, v52, v51
+; GFX10-NEXT:    v_min_f32_e32 v9, v9, v25
+; GFX10-NEXT:    v_min_f32_e32 v25, v54, v53
+; GFX10-NEXT:    v_min_f32_e32 v8, v8, v24
+; GFX10-NEXT:    v_min_f32_e32 v24, v64, v55
+; GFX10-NEXT:    v_min_f32_e32 v7, v7, v23
+; GFX10-NEXT:    v_min_f32_e32 v23, v66, v65
+; GFX10-NEXT:    v_min_f32_e32 v6, v6, v22
+; GFX10-NEXT:    v_min_f32_e32 v22, v68, v67
+; GFX10-NEXT:    v_min_f32_e32 v5, v5, v21
+; GFX10-NEXT:    v_min_f32_e32 v21, v30, v34
+; GFX10-NEXT:    v_min_f32_e32 v29, v29, v36
+; GFX10-NEXT:    v_min_f32_e32 v28, v28, v38
+; GFX10-NEXT:    v_min_f32_e32 v27, v27, v48
+; GFX10-NEXT:    v_min_f32_e32 v26, v26, v50
 ; GFX10-NEXT:    v_min_f32_e32 v2, v2, v18
-; GFX10-NEXT:    v_lshlrev_b32_e32 v18, 16, v29
-; GFX10-NEXT:    v_min_f32_e32 v38, v39, v38
-; GFX10-NEXT:    v_lshlrev_b32_e32 v39, 16, v13
-; GFX10-NEXT:    v_and_b32_e32 v29, 0xffff0000, v29
-; GFX10-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
 ; GFX10-NEXT:    v_min_f32_e32 v3, v3, v19
-; GFX10-NEXT:    v_lshlrev_b32_e32 v19, 16, v30
-; GFX10-NEXT:    v_min_f32_e32 v48, v49, v48
-; GFX10-NEXT:    v_lshlrev_b32_e32 v49, 16, v14
-; GFX10-NEXT:    v_and_b32_e32 v30, 0xffff0000, v30
-; GFX10-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
 ; GFX10-NEXT:    v_min_f32_e32 v4, v4, v20
-; GFX10-NEXT:    v_lshlrev_b32_e32 v20, 16, v15
-; GFX10-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
-; GFX10-NEXT:    v_perm_b32 v6, v6, v21, 0x7060302
-; GFX10-NEXT:    v_perm_b32 v7, v7, v22, 0x7060302
-; GFX10-NEXT:    v_min_f32_e32 v50, v51, v50
-; GFX10-NEXT:    v_min_f32_e32 v23, v65, v64
-; GFX10-NEXT:    v_min_f32_e32 v8, v8, v24
-; GFX10-NEXT:    v_min_f32_e32 v24, v67, v66
-; GFX10-NEXT:    v_min_f32_e32 v9, v9, v25
-; GFX10-NEXT:    v_min_f32_e32 v25, v33, v68
-; GFX10-NEXT:    v_min_f32_e32 v10, v10, v26
-; GFX10-NEXT:    v_min_f32_e32 v16, v35, v16
-; GFX10-NEXT:    v_min_f32_e32 v11, v11, v27
-; GFX10-NEXT:    v_min_f32_e32 v17, v37, v17
-; GFX10-NEXT:    v_min_f32_e32 v12, v12, v28
-; GFX10-NEXT:    v_min_f32_e32 v18, v39, v18
-; GFX10-NEXT:    v_min_f32_e32 v13, v13, v29
-; GFX10-NEXT:    v_min_f32_e32 v19, v49, v19
-; GFX10-NEXT:    v_min_f32_e32 v14, v14, v30
-; GFX10-NEXT:    v_perm_b32 v0, v0, v32, 0x7060302
-; GFX10-NEXT:    v_perm_b32 v1, v1, v34, 0x7060302
-; GFX10-NEXT:    v_perm_b32 v2, v2, v36, 0x7060302
-; GFX10-NEXT:    v_perm_b32 v3, v3, v38, 0x7060302
-; GFX10-NEXT:    v_perm_b32 v4, v4, v48, 0x7060302
-; GFX10-NEXT:    v_perm_b32 v5, v5, v50, 0x7060302
-; GFX10-NEXT:    v_perm_b32 v8, v8, v23, 0x7060302
-; GFX10-NEXT:    v_perm_b32 v9, v9, v24, 0x7060302
-; GFX10-NEXT:    v_perm_b32 v10, v10, v25, 0x7060302
-; GFX10-NEXT:    v_perm_b32 v11, v11, v16, 0x7060302
-; GFX10-NEXT:    v_perm_b32 v12, v12, v17, 0x7060302
-; GFX10-NEXT:    v_perm_b32 v13, v13, v18, 0x7060302
-; GFX10-NEXT:    v_perm_b32 v14, v14, v19, 0x7060302
+; GFX10-NEXT:    v_perm_b32 v1, v1, v27, 0x7060302
+; GFX10-NEXT:    v_perm_b32 v0, v0, v26, 0x7060302
+; GFX10-NEXT:    v_perm_b32 v2, v2, v28, 0x7060302
+; GFX10-NEXT:    v_perm_b32 v3, v3, v29, 0x7060302
+; GFX10-NEXT:    v_perm_b32 v4, v4, v21, 0x7060302
+; GFX10-NEXT:    v_perm_b32 v5, v5, v22, 0x7060302
+; GFX10-NEXT:    v_perm_b32 v6, v6, v23, 0x7060302
+; GFX10-NEXT:    v_perm_b32 v7, v7, v24, 0x7060302
+; GFX10-NEXT:    v_perm_b32 v8, v8, v25, 0x7060302
+; GFX10-NEXT:    v_perm_b32 v9, v9, v51, 0x7060302
+; GFX10-NEXT:    v_perm_b32 v10, v10, v49, 0x7060302
+; GFX10-NEXT:    v_perm_b32 v11, v11, v39, 0x7060302
+; GFX10-NEXT:    v_perm_b32 v12, v12, v37, 0x7060302
+; GFX10-NEXT:    v_perm_b32 v13, v13, v35, 0x7060302
+; GFX10-NEXT:    v_perm_b32 v14, v14, v33, 0x7060302
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_lshlrev_b32_e32 v21, 16, v31
-; GFX10-NEXT:    v_and_b32_e32 v22, 0xffff0000, v31
-; GFX10-NEXT:    v_min_f32_e32 v20, v20, v21
-; GFX10-NEXT:    v_min_f32_e32 v15, v15, v22
-; GFX10-NEXT:    v_perm_b32 v15, v15, v20, 0x7060302
+; GFX10-NEXT:    v_lshlrev_b32_e32 v16, 16, v31
+; GFX10-NEXT:    v_and_b32_e32 v17, 0xffff0000, v31
+; GFX10-NEXT:    v_min_f32_e32 v16, v32, v16
+; GFX10-NEXT:    v_min_f32_e32 v15, v15, v17
+; GFX10-NEXT:    v_perm_b32 v15, v15, v16, 0x7060302
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_minnum_v32bf16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    scratch_load_b32 v31, off, s32
-; GFX11-NEXT:    v_lshlrev_b32_e32 v68, 16, v26
-; GFX11-NEXT:    v_lshlrev_b32_e32 v69, 16, v10
-; GFX11-NEXT:    v_and_b32_e32 v26, 0xffff0000, v26
-; GFX11-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
-; GFX11-NEXT:    v_lshlrev_b32_e32 v70, 16, v27
-; GFX11-NEXT:    v_lshlrev_b32_e32 v71, 16, v11
-; GFX11-NEXT:    v_lshlrev_b32_e32 v50, 16, v21
-; GFX11-NEXT:    v_lshlrev_b32_e32 v54, 16, v23
-; GFX11-NEXT:    v_lshlrev_b32_e32 v55, 16, v7
-; GFX11-NEXT:    v_lshlrev_b32_e32 v64, 16, v24
-; GFX11-NEXT:    v_lshlrev_b32_e32 v65, 16, v8
-; GFX11-NEXT:    v_and_b32_e32 v24, 0xffff0000, v24
-; GFX11-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
-; GFX11-NEXT:    v_lshlrev_b32_e32 v51, 16, v5
-; GFX11-NEXT:    v_dual_min_f32 v10, v10, v26 :: v_dual_and_b32 v5, 0xffff0000, v5
-; GFX11-NEXT:    v_lshlrev_b32_e32 v66, 16, v25
+; GFX11-NEXT:    v_lshlrev_b32_e32 v83, 16, v17
+; GFX11-NEXT:    v_lshlrev_b32_e32 v84, 16, v1
+; GFX11-NEXT:    v_and_b32_e32 v17, 0xffff0000, v17
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX11-NEXT:    v_lshlrev_b32_e32 v85, 16, v16
+; GFX11-NEXT:    v_lshlrev_b32_e32 v86, 16, v0
+; GFX11-NEXT:    v_and_b32_e32 v16, 0xffff0000, v16
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v54, 16, v8
+; GFX11-NEXT:    v_lshlrev_b32_e32 v64, 16, v7
+; GFX11-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
+; GFX11-NEXT:    v_lshlrev_b32_e32 v65, 16, v22
+; GFX11-NEXT:    v_lshlrev_b32_e32 v66, 16, v6
+; GFX11-NEXT:    v_lshlrev_b32_e32 v48, 16, v11
+; GFX11-NEXT:    v_dual_min_f32 v0, v0, v16 :: v_dual_and_b32 v11, 0xffff0000, v11
+; GFX11-NEXT:    v_and_b32_e32 v22, 0xffff0000, v22
+; GFX11-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
+; GFX11-NEXT:    v_lshlrev_b32_e32 v67, 16, v21
+; GFX11-NEXT:    v_lshlrev_b32_e32 v68, 16, v5
+; GFX11-NEXT:    v_lshlrev_b32_e32 v51, 16, v25
+; GFX11-NEXT:    v_and_b32_e32 v21, 0xffff0000, v21
+; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
+; GFX11-NEXT:    v_lshlrev_b32_e32 v69, 16, v20
+; GFX11-NEXT:    v_lshlrev_b32_e32 v70, 16, v4
+; GFX11-NEXT:    v_and_b32_e32 v20, 0xffff0000, v20
+; GFX11-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GFX11-NEXT:    v_lshlrev_b32_e32 v55, 16, v23
+; GFX11-NEXT:    v_lshlrev_b32_e32 v71, 16, v19
+; GFX11-NEXT:    v_lshlrev_b32_e32 v80, 16, v3
 ; GFX11-NEXT:    v_and_b32_e32 v25, 0xffff0000, v25
-; GFX11-NEXT:    v_and_b32_e32 v27, 0xffff0000, v27
-; GFX11-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
-; GFX11-NEXT:    v_lshlrev_b32_e32 v80, 16, v28
-; GFX11-NEXT:    v_lshlrev_b32_e32 v81, 16, v12
-; GFX11-NEXT:    v_lshlrev_b32_e32 v52, 16, v22
+; GFX11-NEXT:    v_and_b32_e32 v19, 0xffff0000, v19
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX11-NEXT:    v_lshlrev_b32_e32 v52, 16, v9
+; GFX11-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
+; GFX11-NEXT:    v_lshlrev_b32_e32 v81, 16, v18
+; GFX11-NEXT:    v_lshlrev_b32_e32 v82, 16, v2
+; GFX11-NEXT:    v_and_b32_e32 v18, 0xffff0000, v18
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX11-NEXT:    v_lshlrev_b32_e32 v53, 16, v24
+; GFX11-NEXT:    v_dual_min_f32 v1, v1, v17 :: v_dual_and_b32 v24, 0xffff0000, v24
+; GFX11-NEXT:    v_dual_min_f32 v5, v5, v21 :: v_dual_lshlrev_b32 v50, 16, v10
+; GFX11-NEXT:    v_dual_min_f32 v21, v70, v69 :: v_dual_and_b32 v10, 0xffff0000, v10
+; GFX11-NEXT:    v_dual_min_f32 v2, v2, v18 :: v_dual_min_f32 v3, v3, v19
+; GFX11-NEXT:    v_dual_min_f32 v4, v4, v20 :: v_dual_lshlrev_b32 v49, 16, v26
+; GFX11-NEXT:    v_dual_min_f32 v9, v9, v25 :: v_dual_and_b32 v26, 0xffff0000, v26
+; GFX11-NEXT:    v_min_f32_e32 v6, v6, v22
+; GFX11-NEXT:    v_dual_min_f32 v22, v68, v67 :: v_dual_lshlrev_b32 v37, 16, v28
 ; GFX11-NEXT:    v_and_b32_e32 v28, 0xffff0000, v28
-; GFX11-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
-; GFX11-NEXT:    v_lshlrev_b32_e32 v53, 16, v6
-; GFX11-NEXT:    v_lshlrev_b32_e32 v82, 16, v29
-; GFX11-NEXT:    v_lshlrev_b32_e32 v83, 16, v13
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_min_f32_e32 v10, v10, v26
+; GFX11-NEXT:    v_min_f32_e32 v26, v52, v51
+; GFX11-NEXT:    v_perm_b32 v4, v4, v21, 0x7060302
+; GFX11-NEXT:    v_min_f32_e32 v25, v54, v53
+; GFX11-NEXT:    v_perm_b32 v5, v5, v22, 0x7060302
+; GFX11-NEXT:    v_perm_b32 v9, v9, v26, 0x7060302
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v16, 16, v31
 ; GFX11-NEXT:    v_and_b32_e32 v23, 0xffff0000, v23
-; GFX11-NEXT:    v_and_b32_e32 v29, 0xffff0000, v29
+; GFX11-NEXT:    v_and_b32_e32 v17, 0xffff0000, v31
+; GFX11-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
+; GFX11-NEXT:    v_lshlrev_b32_e32 v36, 16, v13
 ; GFX11-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
-; GFX11-NEXT:    v_lshlrev_b32_e32 v84, 16, v30
-; GFX11-NEXT:    v_lshlrev_b32_e32 v85, 16, v14
-; GFX11-NEXT:    v_and_b32_e32 v22, 0xffff0000, v22
-; GFX11-NEXT:    v_and_b32_e32 v30, 0xffff0000, v30
-; GFX11-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
-; GFX11-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
-; GFX11-NEXT:    v_lshlrev_b32_e32 v86, 16, v15
-; GFX11-NEXT:    v_lshlrev_b32_e32 v67, 16, v9
-; GFX11-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
-; GFX11-NEXT:    v_lshlrev_b32_e32 v48, 16, v20
-; GFX11-NEXT:    v_dual_min_f32 v11, v11, v27 :: v_dual_and_b32 v20, 0xffff0000, v20
-; GFX11-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
-; GFX11-NEXT:    v_dual_min_f32 v26, v71, v70 :: v_dual_lshlrev_b32 v49, 16, v4
-; GFX11-NEXT:    v_dual_min_f32 v13, v13, v29 :: v_dual_and_b32 v4, 0xffff0000, v4
-; GFX11-NEXT:    v_lshlrev_b32_e32 v35, 16, v1
-; GFX11-NEXT:    v_lshlrev_b32_e32 v37, 16, v2
-; GFX11-NEXT:    v_lshlrev_b32_e32 v38, 16, v19
+; GFX11-NEXT:    v_lshlrev_b32_e32 v39, 16, v27
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT:    v_min_f32_e32 v4, v4, v20
-; GFX11-NEXT:    v_dual_min_f32 v8, v8, v24 :: v_dual_min_f32 v9, v9, v25
-; GFX11-NEXT:    v_min_f32_e32 v25, v69, v68
-; GFX11-NEXT:    v_dual_min_f32 v20, v51, v50 :: v_dual_lshlrev_b32 v39, 16, v3
-; GFX11-NEXT:    v_min_f32_e32 v27, v81, v80
-; GFX11-NEXT:    v_min_f32_e32 v12, v12, v28
-; GFX11-NEXT:    v_dual_min_f32 v28, v83, v82 :: v_dual_min_f32 v29, v85, v84
-; GFX11-NEXT:    v_dual_min_f32 v6, v6, v22 :: v_dual_and_b32 v3, 0xffff0000, v3
-; GFX11-NEXT:    v_min_f32_e32 v22, v55, v54
-; GFX11-NEXT:    v_lshlrev_b32_e32 v36, 16, v18
-; GFX11-NEXT:    v_lshlrev_b32_e32 v34, 16, v17
-; GFX11-NEXT:    v_and_b32_e32 v17, 0xffff0000, v17
-; GFX11-NEXT:    v_and_b32_e32 v18, 0xffff0000, v18
+; GFX11-NEXT:    v_dual_min_f32 v8, v8, v24 :: v_dual_and_b32 v27, 0xffff0000, v27
+; GFX11-NEXT:    v_min_f32_e32 v24, v64, v55
+; GFX11-NEXT:    v_lshlrev_b32_e32 v38, 16, v12
+; GFX11-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
+; GFX11-NEXT:    v_lshlrev_b32_e32 v35, 16, v29
+; GFX11-NEXT:    v_min_f32_e32 v7, v7, v23
+; GFX11-NEXT:    v_min_f32_e32 v23, v66, v65
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_dual_min_f32 v12, v12, v28 :: v_dual_and_b32 v29, 0xffff0000, v29
+; GFX11-NEXT:    v_dual_min_f32 v28, v48, v39 :: v_dual_lshlrev_b32 v33, 16, v30
+; GFX11-NEXT:    v_dual_min_f32 v13, v13, v29 :: v_dual_lshlrev_b32 v34, 16, v14
+; GFX11-NEXT:    v_lshlrev_b32_e32 v32, 16, v15
+; GFX11-NEXT:    v_dual_min_f32 v11, v11, v27 :: v_dual_and_b32 v14, 0xffff0000, v14
+; GFX11-NEXT:    v_dual_min_f32 v27, v50, v49 :: v_dual_and_b32 v30, 0xffff0000, v30
+; GFX11-NEXT:    v_min_f32_e32 v29, v38, v37
+; GFX11-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
+; GFX11-NEXT:    v_min_f32_e32 v37, v86, v85
+; GFX11-NEXT:    v_perm_b32 v6, v6, v23, 0x7060302
 ; GFX11-NEXT:    v_min_f32_e32 v14, v14, v30
-; GFX11-NEXT:    v_dual_min_f32 v7, v7, v23 :: v_dual_and_b32 v2, 0xffff0000, v2
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX11-NEXT:    v_min_f32_e32 v23, v65, v64
-; GFX11-NEXT:    v_and_b32_e32 v19, 0xffff0000, v19
-; GFX11-NEXT:    v_dual_min_f32 v24, v67, v66 :: v_dual_and_b32 v21, 0xffff0000, v21
-; GFX11-NEXT:    v_min_f32_e32 v2, v2, v18
-; GFX11-NEXT:    v_dual_min_f32 v1, v1, v17 :: v_dual_lshlrev_b32 v32, 16, v16
-; GFX11-NEXT:    v_min_f32_e32 v18, v39, v38
-; GFX11-NEXT:    v_dual_min_f32 v3, v3, v19 :: v_dual_and_b32 v16, 0xffff0000, v16
-; GFX11-NEXT:    v_min_f32_e32 v19, v49, v48
-; GFX11-NEXT:    v_min_f32_e32 v17, v37, v36
-; GFX11-NEXT:    v_lshlrev_b32_e32 v33, 16, v0
-; GFX11-NEXT:    v_dual_min_f32 v5, v5, v21 :: v_dual_and_b32 v0, 0xffff0000, v0
-; GFX11-NEXT:    v_min_f32_e32 v21, v53, v52
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_perm_b32 v2, v2, v17, 0x7060302
-; GFX11-NEXT:    v_perm_b32 v3, v3, v18, 0x7060302
-; GFX11-NEXT:    v_min_f32_e32 v0, v0, v16
-; GFX11-NEXT:    v_min_f32_e32 v16, v35, v34
-; GFX11-NEXT:    v_min_f32_e32 v32, v33, v32
-; GFX11-NEXT:    v_perm_b32 v4, v4, v19, 0x7060302
-; GFX11-NEXT:    v_perm_b32 v5, v5, v20, 0x7060302
-; GFX11-NEXT:    v_perm_b32 v6, v6, v21, 0x7060302
-; GFX11-NEXT:    v_perm_b32 v1, v1, v16, 0x7060302
-; GFX11-NEXT:    v_perm_b32 v0, v0, v32, 0x7060302
-; GFX11-NEXT:    v_perm_b32 v7, v7, v22, 0x7060302
-; GFX11-NEXT:    v_perm_b32 v8, v8, v23, 0x7060302
-; GFX11-NEXT:    v_perm_b32 v9, v9, v24, 0x7060302
-; GFX11-NEXT:    v_perm_b32 v10, v10, v25, 0x7060302
-; GFX11-NEXT:    v_perm_b32 v11, v11, v26, 0x7060302
-; GFX11-NEXT:    v_perm_b32 v12, v12, v27, 0x7060302
-; GFX11-NEXT:    v_perm_b32 v13, v13, v28, 0x7060302
-; GFX11-NEXT:    v_perm_b32 v14, v14, v29, 0x7060302
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_lshlrev_b32_e32 v16, 16, v31
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_dual_min_f32 v16, v86, v16 :: v_dual_and_b32 v17, 0xffff0000, v31
-; GFX11-NEXT:    v_min_f32_e32 v15, v15, v17
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_dual_min_f32 v30, v36, v35 :: v_dual_min_f32 v33, v34, v33
+; GFX11-NEXT:    v_dual_min_f32 v34, v80, v71 :: v_dual_min_f32 v35, v82, v81
+; GFX11-NEXT:    v_min_f32_e32 v36, v84, v83
+; GFX11-NEXT:    v_dual_min_f32 v16, v32, v16 :: v_dual_min_f32 v15, v15, v17
+; GFX11-NEXT:    v_perm_b32 v0, v0, v37, 0x7060302
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_perm_b32 v2, v2, v35, 0x7060302
+; GFX11-NEXT:    v_perm_b32 v1, v1, v36, 0x7060302
+; GFX11-NEXT:    v_perm_b32 v3, v3, v34, 0x7060302
+; GFX11-NEXT:    v_perm_b32 v7, v7, v24, 0x7060302
+; GFX11-NEXT:    v_perm_b32 v8, v8, v25, 0x7060302
+; GFX11-NEXT:    v_perm_b32 v10, v10, v27, 0x7060302
+; GFX11-NEXT:    v_perm_b32 v11, v11, v28, 0x7060302
+; GFX11-NEXT:    v_perm_b32 v12, v12, v29, 0x7060302
+; GFX11-NEXT:    v_perm_b32 v13, v13, v30, 0x7060302
+; GFX11-NEXT:    v_perm_b32 v14, v14, v33, 0x7060302
 ; GFX11-NEXT:    v_perm_b32 v15, v15, v16, 0x7060302
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = call <32 x bfloat> @llvm.minnum.v32bf16(<32 x bfloat> %a, <32 x bfloat> %b)
@@ -15553,83 +15404,52 @@ define <3 x bfloat> @v_maxnum_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) {
 ; GFX8-LABEL: v_maxnum_v3bf16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
-; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-NEXT:    v_max_f32_e32 v1, v1, v3
+; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
+; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v0
 ; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GFX8-NEXT:    v_max_f32_e32 v0, v0, v2
-; GFX8-NEXT:    v_max_f32_e32 v4, v5, v4
+; GFX8-NEXT:    v_max_f32_e32 v3, v4, v3
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX8-NEXT:    v_alignbit_b32 v0, v0, v4, 16
-; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
-; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v1
-; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
-; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX8-NEXT:    v_max_f32_e32 v1, v1, v3
-; GFX8-NEXT:    v_max_f32_e32 v2, v4, v2
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX8-NEXT:    v_alignbit_b32 v1, v1, v2, 16
+; GFX8-NEXT:    v_alignbit_b32 v0, v0, v3, 16
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_maxnum_v3bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
-; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
+; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX9-NEXT:    v_max_f32_e32 v1, v1, v3
+; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
+; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 16, v0
 ; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX9-NEXT:    v_max_f32_e32 v4, v5, v4
+; GFX9-NEXT:    v_max_f32_e32 v3, v4, v3
 ; GFX9-NEXT:    v_max_f32_e32 v0, v0, v2
 ; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
-; GFX9-NEXT:    v_perm_b32 v0, v0, v4, s4
-; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
-; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 16, v1
-; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
-; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX9-NEXT:    v_max_f32_e32 v2, v4, v2
-; GFX9-NEXT:    v_max_f32_e32 v1, v1, v3
-; GFX9-NEXT:    v_perm_b32 v1, v1, v2, s4
+; GFX9-NEXT:    v_perm_b32 v0, v0, v3, s4
+; GFX9-NEXT:    v_alignbit_b32 v1, s4, v1, 16
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_maxnum_v3bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
 ; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 16, v3
-; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 16, v1
-; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
-; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX10-NEXT:    v_max_f32_e32 v4, v5, v4
 ; GFX10-NEXT:    v_max_f32_e32 v0, v0, v2
-; GFX10-NEXT:    v_max_f32_e32 v2, v7, v6
 ; GFX10-NEXT:    v_max_f32_e32 v1, v1, v3
 ; GFX10-NEXT:    v_perm_b32 v0, v0, v4, 0x7060302
-; GFX10-NEXT:    v_perm_b32 v1, v1, v2, 0x7060302
+; GFX10-NEXT:    v_alignbit_b32 v1, s4, v1, 16
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: v_maxnum_v3bf16:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_lshlrev_b32_e32 v6, 16, v3
-; GFX11-NEXT:    v_lshlrev_b32_e32 v7, 16, v1
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
-; GFX11-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_dual_max_f32 v1, v1, v3 :: v_dual_and_b32 v0, 0xffff0000, v0
-; GFX11-NEXT:    v_max_f32_e32 v0, v0, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_max_f32_e32 v4, v5, v4
-; GFX11-NEXT:    v_max_f32_e32 v2, v7, v6
-; GFX11-NEXT:    v_perm_b32 v0, v0, v4, 0x7060302
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT:    v_perm_b32 v1, v1, v2, 0x7060302
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = call <3 x bfloat> @llvm.maxnum.v3bf16(<3 x bfloat> %a, <3 x bfloat> %b)
   ret <3 x bfloat> %op
 }
@@ -15696,82 +15516,81 @@ define <4 x bfloat> @v_maxnum_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
 ; GFX8-LABEL: v_maxnum_v4bf16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
-; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
-; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
-; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX8-NEXT:    v_max_f32_e32 v0, v0, v2
-; GFX8-NEXT:    v_max_f32_e32 v4, v5, v4
-; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX8-NEXT:    v_alignbit_b32 v0, v0, v4, 16
-; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
-; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v1
+; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
+; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
 ; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
 ; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX8-NEXT:    v_max_f32_e32 v4, v5, v4
 ; GFX8-NEXT:    v_max_f32_e32 v1, v1, v3
-; GFX8-NEXT:    v_max_f32_e32 v2, v4, v2
+; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
+; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
+; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    v_max_f32_e32 v0, v0, v2
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX8-NEXT:    v_alignbit_b32 v1, v1, v2, 16
+; GFX8-NEXT:    v_max_f32_e32 v3, v5, v3
+; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX8-NEXT:    v_alignbit_b32 v0, v0, v3, 16
+; GFX8-NEXT:    v_alignbit_b32 v1, v1, v4, 16
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_maxnum_v4bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
+; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
+; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX9-NEXT:    v_max_f32_e32 v4, v5, v4
+; GFX9-NEXT:    v_max_f32_e32 v1, v1, v3
+; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
 ; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX9-NEXT:    v_max_f32_e32 v4, v5, v4
+; GFX9-NEXT:    v_max_f32_e32 v3, v5, v3
 ; GFX9-NEXT:    v_max_f32_e32 v0, v0, v2
 ; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
-; GFX9-NEXT:    v_perm_b32 v0, v0, v4, s4
-; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
-; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 16, v1
-; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
-; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX9-NEXT:    v_max_f32_e32 v2, v4, v2
-; GFX9-NEXT:    v_max_f32_e32 v1, v1, v3
-; GFX9-NEXT:    v_perm_b32 v1, v1, v2, s4
+; GFX9-NEXT:    v_perm_b32 v0, v0, v3, s4
+; GFX9-NEXT:    v_perm_b32 v1, v1, v4, s4
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_maxnum_v4bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
-; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
+; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
+; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 16, v2
+; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 16, v0
 ; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 16, v3
-; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 16, v1
-; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
 ; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX10-NEXT:    v_max_f32_e32 v4, v5, v4
+; GFX10-NEXT:    v_max_f32_e32 v5, v7, v6
 ; GFX10-NEXT:    v_max_f32_e32 v0, v0, v2
-; GFX10-NEXT:    v_max_f32_e32 v2, v7, v6
 ; GFX10-NEXT:    v_max_f32_e32 v1, v1, v3
-; GFX10-NEXT:    v_perm_b32 v0, v0, v4, 0x7060302
-; GFX10-NEXT:    v_perm_b32 v1, v1, v2, 0x7060302
+; GFX10-NEXT:    v_perm_b32 v0, v0, v5, 0x7060302
+; GFX10-NEXT:    v_perm_b32 v1, v1, v4, 0x7060302
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_maxnum_v4bf16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_lshlrev_b32_e32 v6, 16, v3
-; GFX11-NEXT:    v_lshlrev_b32_e32 v7, 16, v1
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX11-NEXT:    v_lshlrev_b32_e32 v6, 16, v2
+; GFX11-NEXT:    v_lshlrev_b32_e32 v7, 16, v0
 ; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
-; GFX11-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_dual_max_f32 v1, v1, v3 :: v_dual_and_b32 v0, 0xffff0000, v0
-; GFX11-NEXT:    v_max_f32_e32 v0, v0, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_max_f32_e32 v4, v5, v4
-; GFX11-NEXT:    v_max_f32_e32 v2, v7, v6
-; GFX11-NEXT:    v_perm_b32 v0, v0, v4, 0x7060302
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
+; GFX11-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_dual_max_f32 v0, v0, v2 :: v_dual_and_b32 v1, 0xffff0000, v1
+; GFX11-NEXT:    v_dual_max_f32 v4, v5, v4 :: v_dual_and_b32 v3, 0xffff0000, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_max_f32_e32 v1, v1, v3
+; GFX11-NEXT:    v_max_f32_e32 v5, v7, v6
+; GFX11-NEXT:    v_perm_b32 v1, v1, v4, 0x7060302
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT:    v_perm_b32 v1, v1, v2, 0x7060302
+; GFX11-NEXT:    v_perm_b32 v0, v0, v5, 0x7060302
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = call <4 x bfloat> @llvm.maxnum.v4bf16(<4 x bfloat> %a, <4 x bfloat> %b)
   ret <4 x bfloat> %op
@@ -15887,138 +15706,138 @@ define <8 x bfloat> @v_maxnum_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
 ; GFX8-LABEL: v_maxnum_v8bf16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_lshlrev_b32_e32 v8, 16, v4
-; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 16, v0
-; GFX8-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
-; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX8-NEXT:    v_max_f32_e32 v0, v0, v4
+; GFX8-NEXT:    v_lshlrev_b32_e32 v8, 16, v7
+; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 16, v3
+; GFX8-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
+; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
 ; GFX8-NEXT:    v_max_f32_e32 v8, v9, v8
-; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX8-NEXT:    v_alignbit_b32 v0, v0, v8, 16
-; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v5
-; GFX8-NEXT:    v_lshlrev_b32_e32 v8, 16, v1
+; GFX8-NEXT:    v_max_f32_e32 v3, v3, v7
+; GFX8-NEXT:    v_lshlrev_b32_e32 v7, 16, v6
+; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 16, v2
+; GFX8-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
+; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX8-NEXT:    v_max_f32_e32 v7, v9, v7
+; GFX8-NEXT:    v_max_f32_e32 v2, v2, v6
+; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v5
+; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 16, v1
 ; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
 ; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX8-NEXT:    v_max_f32_e32 v6, v9, v6
 ; GFX8-NEXT:    v_max_f32_e32 v1, v1, v5
-; GFX8-NEXT:    v_max_f32_e32 v4, v8, v4
-; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX8-NEXT:    v_alignbit_b32 v1, v1, v4, 16
-; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v6
-; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v2
-; GFX8-NEXT:    v_max_f32_e32 v4, v5, v4
-; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
-; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
-; GFX8-NEXT:    v_max_f32_e32 v2, v2, v5
-; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
-; GFX8-NEXT:    v_alignbit_b32 v2, v2, v4, 16
-; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v7
-; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v3
-; GFX8-NEXT:    v_max_f32_e32 v4, v5, v4
-; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v7
-; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
-; GFX8-NEXT:    v_max_f32_e32 v3, v3, v5
+; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v4
+; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 16, v0
+; GFX8-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    v_max_f32_e32 v0, v0, v4
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
-; GFX8-NEXT:    v_alignbit_b32 v3, v3, v4, 16
+; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX8-NEXT:    v_max_f32_e32 v5, v9, v5
+; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX8-NEXT:    v_alignbit_b32 v0, v0, v5, 16
+; GFX8-NEXT:    v_alignbit_b32 v1, v1, v6, 16
+; GFX8-NEXT:    v_alignbit_b32 v2, v2, v7, 16
+; GFX8-NEXT:    v_alignbit_b32 v3, v3, v8, 16
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_maxnum_v8bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_lshlrev_b32_e32 v8, 16, v4
+; GFX9-NEXT:    v_lshlrev_b32_e32 v8, 16, v7
+; GFX9-NEXT:    v_lshlrev_b32_e32 v9, 16, v3
+; GFX9-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
+; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX9-NEXT:    v_max_f32_e32 v8, v9, v8
+; GFX9-NEXT:    v_max_f32_e32 v3, v3, v7
+; GFX9-NEXT:    v_lshlrev_b32_e32 v7, 16, v6
+; GFX9-NEXT:    v_lshlrev_b32_e32 v9, 16, v2
+; GFX9-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
+; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX9-NEXT:    v_max_f32_e32 v7, v9, v7
+; GFX9-NEXT:    v_max_f32_e32 v2, v2, v6
+; GFX9-NEXT:    v_lshlrev_b32_e32 v6, 16, v5
+; GFX9-NEXT:    v_lshlrev_b32_e32 v9, 16, v1
+; GFX9-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
+; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX9-NEXT:    v_max_f32_e32 v6, v9, v6
+; GFX9-NEXT:    v_max_f32_e32 v1, v1, v5
+; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 16, v4
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v9, 16, v0
 ; GFX9-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX9-NEXT:    v_max_f32_e32 v8, v9, v8
+; GFX9-NEXT:    v_max_f32_e32 v5, v9, v5
 ; GFX9-NEXT:    v_max_f32_e32 v0, v0, v4
 ; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
-; GFX9-NEXT:    v_perm_b32 v0, v0, v8, s4
-; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 16, v5
-; GFX9-NEXT:    v_lshlrev_b32_e32 v8, 16, v1
-; GFX9-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
-; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX9-NEXT:    v_max_f32_e32 v4, v8, v4
-; GFX9-NEXT:    v_max_f32_e32 v1, v1, v5
-; GFX9-NEXT:    v_perm_b32 v1, v1, v4, s4
-; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 16, v6
-; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 16, v2
-; GFX9-NEXT:    v_max_f32_e32 v4, v5, v4
-; GFX9-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
-; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
-; GFX9-NEXT:    v_max_f32_e32 v2, v2, v5
-; GFX9-NEXT:    v_perm_b32 v2, v2, v4, s4
-; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 16, v7
-; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 16, v3
-; GFX9-NEXT:    v_max_f32_e32 v4, v5, v4
-; GFX9-NEXT:    v_and_b32_e32 v5, 0xffff0000, v7
-; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
-; GFX9-NEXT:    v_max_f32_e32 v3, v3, v5
-; GFX9-NEXT:    v_perm_b32 v3, v3, v4, s4
+; GFX9-NEXT:    v_perm_b32 v0, v0, v5, s4
+; GFX9-NEXT:    v_perm_b32 v1, v1, v6, s4
+; GFX9-NEXT:    v_perm_b32 v2, v2, v7, s4
+; GFX9-NEXT:    v_perm_b32 v3, v3, v8, s4
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_maxnum_v8bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_lshlrev_b32_e32 v8, 16, v4
-; GFX10-NEXT:    v_lshlrev_b32_e32 v9, 16, v0
-; GFX10-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
-; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v8, 16, v7
+; GFX10-NEXT:    v_lshlrev_b32_e32 v9, 16, v3
+; GFX10-NEXT:    v_lshlrev_b32_e32 v10, 16, v6
+; GFX10-NEXT:    v_lshlrev_b32_e32 v11, 16, v2
+; GFX10-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
+; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX10-NEXT:    v_max_f32_e32 v8, v9, v8
+; GFX10-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
+; GFX10-NEXT:    v_max_f32_e32 v9, v11, v10
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v10, 16, v5
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v11, 16, v1
-; GFX10-NEXT:    v_max_f32_e32 v8, v9, v8
 ; GFX10-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
-; GFX10-NEXT:    v_max_f32_e32 v0, v0, v4
+; GFX10-NEXT:    v_lshlrev_b32_e32 v12, 16, v4
+; GFX10-NEXT:    v_lshlrev_b32_e32 v13, 16, v0
+; GFX10-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX10-NEXT:    v_max_f32_e32 v4, v11, v10
-; GFX10-NEXT:    v_lshlrev_b32_e32 v9, 16, v6
-; GFX10-NEXT:    v_lshlrev_b32_e32 v10, 16, v2
-; GFX10-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
 ; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
-; GFX10-NEXT:    v_lshlrev_b32_e32 v11, 16, v7
-; GFX10-NEXT:    v_lshlrev_b32_e32 v12, 16, v3
-; GFX10-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
-; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX10-NEXT:    v_max_f32_e32 v10, v11, v10
+; GFX10-NEXT:    v_max_f32_e32 v11, v13, v12
+; GFX10-NEXT:    v_max_f32_e32 v0, v0, v4
 ; GFX10-NEXT:    v_max_f32_e32 v1, v1, v5
-; GFX10-NEXT:    v_max_f32_e32 v5, v10, v9
 ; GFX10-NEXT:    v_max_f32_e32 v2, v2, v6
-; GFX10-NEXT:    v_max_f32_e32 v6, v12, v11
 ; GFX10-NEXT:    v_max_f32_e32 v3, v3, v7
-; GFX10-NEXT:    v_perm_b32 v0, v0, v8, 0x7060302
-; GFX10-NEXT:    v_perm_b32 v1, v1, v4, 0x7060302
-; GFX10-NEXT:    v_perm_b32 v2, v2, v5, 0x7060302
-; GFX10-NEXT:    v_perm_b32 v3, v3, v6, 0x7060302
+; GFX10-NEXT:    v_perm_b32 v0, v0, v11, 0x7060302
+; GFX10-NEXT:    v_perm_b32 v1, v1, v10, 0x7060302
+; GFX10-NEXT:    v_perm_b32 v2, v2, v9, 0x7060302
+; GFX10-NEXT:    v_perm_b32 v3, v3, v8, 0x7060302
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_maxnum_v8bf16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v9, 16, v3
+; GFX11-NEXT:    v_lshlrev_b32_e32 v11, 16, v2
+; GFX11-NEXT:    v_lshlrev_b32_e32 v12, 16, v4
+; GFX11-NEXT:    v_lshlrev_b32_e32 v13, 16, v0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v10, 16, v6
+; GFX11-NEXT:    v_lshlrev_b32_e32 v8, 16, v7
+; GFX11-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX11-NEXT:    v_dual_max_f32 v8, v9, v8 :: v_dual_max_f32 v9, v11, v10
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_max_f32_e32 v0, v0, v4
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v10, 16, v5
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v11, 16, v1
-; GFX11-NEXT:    v_lshlrev_b32_e32 v9, 16, v0
 ; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
-; GFX11-NEXT:    v_lshlrev_b32_e32 v12, 16, v3
 ; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX11-NEXT:    v_lshlrev_b32_e32 v8, 16, v4
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_dual_max_f32 v1, v1, v5 :: v_dual_and_b32 v0, 0xffff0000, v0
-; GFX11-NEXT:    v_max_f32_e32 v0, v0, v4
-; GFX11-NEXT:    v_max_f32_e32 v4, v11, v10
-; GFX11-NEXT:    v_dual_max_f32 v8, v9, v8 :: v_dual_lshlrev_b32 v9, 16, v6
-; GFX11-NEXT:    v_lshlrev_b32_e32 v10, 16, v2
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
-; GFX11-NEXT:    v_lshlrev_b32_e32 v11, 16, v7
-; GFX11-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
-; GFX11-NEXT:    v_max_f32_e32 v5, v10, v9
-; GFX11-NEXT:    v_perm_b32 v0, v0, v8, 0x7060302
-; GFX11-NEXT:    v_max_f32_e32 v2, v2, v6
-; GFX11-NEXT:    v_max_f32_e32 v6, v12, v11
+; GFX11-NEXT:    v_dual_max_f32 v1, v1, v5 :: v_dual_and_b32 v6, 0xffff0000, v6
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_dual_max_f32 v2, v2, v6 :: v_dual_and_b32 v3, 0xffff0000, v3
 ; GFX11-NEXT:    v_max_f32_e32 v3, v3, v7
-; GFX11-NEXT:    v_perm_b32 v1, v1, v4, 0x7060302
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_perm_b32 v2, v2, v5, 0x7060302
-; GFX11-NEXT:    v_perm_b32 v3, v3, v6, 0x7060302
+; GFX11-NEXT:    v_dual_max_f32 v10, v11, v10 :: v_dual_max_f32 v11, v13, v12
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_perm_b32 v2, v2, v9, 0x7060302
+; GFX11-NEXT:    v_perm_b32 v3, v3, v8, 0x7060302
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_perm_b32 v1, v1, v10, 0x7060302
+; GFX11-NEXT:    v_perm_b32 v0, v0, v11, 0x7060302
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = call <8 x bfloat> @llvm.maxnum.v8bf16(<8 x bfloat> %a, <8 x bfloat> %b)
   ret <8 x bfloat> %op
@@ -16234,252 +16053,254 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
 ; GFX8-LABEL: v_maxnum_v16bf16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_lshlrev_b32_e32 v16, 16, v8
-; GFX8-NEXT:    v_lshlrev_b32_e32 v17, 16, v0
-; GFX8-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
-; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX8-NEXT:    v_max_f32_e32 v0, v0, v8
+; GFX8-NEXT:    v_lshlrev_b32_e32 v16, 16, v15
+; GFX8-NEXT:    v_lshlrev_b32_e32 v17, 16, v7
+; GFX8-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
+; GFX8-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
 ; GFX8-NEXT:    v_max_f32_e32 v16, v17, v16
-; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX8-NEXT:    v_alignbit_b32 v0, v0, v16, 16
-; GFX8-NEXT:    v_lshlrev_b32_e32 v8, 16, v9
-; GFX8-NEXT:    v_lshlrev_b32_e32 v16, 16, v1
+; GFX8-NEXT:    v_max_f32_e32 v7, v7, v15
+; GFX8-NEXT:    v_lshlrev_b32_e32 v15, 16, v14
+; GFX8-NEXT:    v_lshlrev_b32_e32 v17, 16, v6
+; GFX8-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
+; GFX8-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
+; GFX8-NEXT:    v_max_f32_e32 v15, v17, v15
+; GFX8-NEXT:    v_max_f32_e32 v6, v6, v14
+; GFX8-NEXT:    v_lshlrev_b32_e32 v14, 16, v13
+; GFX8-NEXT:    v_lshlrev_b32_e32 v17, 16, v5
+; GFX8-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
+; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
+; GFX8-NEXT:    v_max_f32_e32 v14, v17, v14
+; GFX8-NEXT:    v_max_f32_e32 v5, v5, v13
+; GFX8-NEXT:    v_lshlrev_b32_e32 v13, 16, v12
+; GFX8-NEXT:    v_lshlrev_b32_e32 v17, 16, v4
+; GFX8-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
+; GFX8-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GFX8-NEXT:    v_max_f32_e32 v13, v17, v13
+; GFX8-NEXT:    v_max_f32_e32 v4, v4, v12
+; GFX8-NEXT:    v_lshlrev_b32_e32 v12, 16, v11
+; GFX8-NEXT:    v_lshlrev_b32_e32 v17, 16, v3
+; GFX8-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
+; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX8-NEXT:    v_max_f32_e32 v12, v17, v12
+; GFX8-NEXT:    v_max_f32_e32 v3, v3, v11
+; GFX8-NEXT:    v_lshlrev_b32_e32 v11, 16, v10
+; GFX8-NEXT:    v_lshlrev_b32_e32 v17, 16, v2
+; GFX8-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
+; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX8-NEXT:    v_max_f32_e32 v11, v17, v11
+; GFX8-NEXT:    v_max_f32_e32 v2, v2, v10
+; GFX8-NEXT:    v_lshlrev_b32_e32 v10, 16, v9
+; GFX8-NEXT:    v_lshlrev_b32_e32 v17, 16, v1
 ; GFX8-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
 ; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX8-NEXT:    v_max_f32_e32 v10, v17, v10
 ; GFX8-NEXT:    v_max_f32_e32 v1, v1, v9
-; GFX8-NEXT:    v_max_f32_e32 v8, v16, v8
-; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX8-NEXT:    v_alignbit_b32 v1, v1, v8, 16
-; GFX8-NEXT:    v_lshlrev_b32_e32 v8, 16, v10
-; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 16, v2
-; GFX8-NEXT:    v_max_f32_e32 v8, v9, v8
-; GFX8-NEXT:    v_and_b32_e32 v9, 0xffff0000, v10
-; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
-; GFX8-NEXT:    v_max_f32_e32 v2, v2, v9
-; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
-; GFX8-NEXT:    v_alignbit_b32 v2, v2, v8, 16
-; GFX8-NEXT:    v_lshlrev_b32_e32 v8, 16, v11
-; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 16, v3
-; GFX8-NEXT:    v_max_f32_e32 v8, v9, v8
-; GFX8-NEXT:    v_and_b32_e32 v9, 0xffff0000, v11
-; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
-; GFX8-NEXT:    v_max_f32_e32 v3, v3, v9
-; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
-; GFX8-NEXT:    v_alignbit_b32 v3, v3, v8, 16
-; GFX8-NEXT:    v_lshlrev_b32_e32 v8, 16, v12
-; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 16, v4
-; GFX8-NEXT:    v_max_f32_e32 v8, v9, v8
-; GFX8-NEXT:    v_and_b32_e32 v9, 0xffff0000, v12
-; GFX8-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
-; GFX8-NEXT:    v_max_f32_e32 v4, v4, v9
-; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
-; GFX8-NEXT:    v_alignbit_b32 v4, v4, v8, 16
-; GFX8-NEXT:    v_lshlrev_b32_e32 v8, 16, v13
-; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 16, v5
-; GFX8-NEXT:    v_max_f32_e32 v8, v9, v8
-; GFX8-NEXT:    v_and_b32_e32 v9, 0xffff0000, v13
-; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
-; GFX8-NEXT:    v_max_f32_e32 v5, v5, v9
-; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
-; GFX8-NEXT:    v_alignbit_b32 v5, v5, v8, 16
-; GFX8-NEXT:    v_lshlrev_b32_e32 v8, 16, v14
-; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 16, v6
-; GFX8-NEXT:    v_max_f32_e32 v8, v9, v8
-; GFX8-NEXT:    v_and_b32_e32 v9, 0xffff0000, v14
-; GFX8-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
-; GFX8-NEXT:    v_max_f32_e32 v6, v6, v9
-; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
-; GFX8-NEXT:    v_alignbit_b32 v6, v6, v8, 16
-; GFX8-NEXT:    v_lshlrev_b32_e32 v8, 16, v15
-; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 16, v7
-; GFX8-NEXT:    v_max_f32_e32 v8, v9, v8
-; GFX8-NEXT:    v_and_b32_e32 v9, 0xffff0000, v15
-; GFX8-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
-; GFX8-NEXT:    v_max_f32_e32 v7, v7, v9
+; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 16, v8
+; GFX8-NEXT:    v_lshlrev_b32_e32 v17, 16, v0
+; GFX8-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
+; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    v_max_f32_e32 v0, v0, v8
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
-; GFX8-NEXT:    v_alignbit_b32 v7, v7, v8, 16
+; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
+; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
+; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
+; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX8-NEXT:    v_max_f32_e32 v9, v17, v9
+; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX8-NEXT:    v_alignbit_b32 v0, v0, v9, 16
+; GFX8-NEXT:    v_alignbit_b32 v1, v1, v10, 16
+; GFX8-NEXT:    v_alignbit_b32 v2, v2, v11, 16
+; GFX8-NEXT:    v_alignbit_b32 v3, v3, v12, 16
+; GFX8-NEXT:    v_alignbit_b32 v4, v4, v13, 16
+; GFX8-NEXT:    v_alignbit_b32 v5, v5, v14, 16
+; GFX8-NEXT:    v_alignbit_b32 v6, v6, v15, 16
+; GFX8-NEXT:    v_alignbit_b32 v7, v7, v16, 16
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_maxnum_v16bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_lshlrev_b32_e32 v16, 16, v8
+; GFX9-NEXT:    v_lshlrev_b32_e32 v16, 16, v15
+; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v7
+; GFX9-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
+; GFX9-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
+; GFX9-NEXT:    v_max_f32_e32 v16, v17, v16
+; GFX9-NEXT:    v_max_f32_e32 v7, v7, v15
+; GFX9-NEXT:    v_lshlrev_b32_e32 v15, 16, v14
+; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v6
+; GFX9-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
+; GFX9-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
+; GFX9-NEXT:    v_max_f32_e32 v15, v17, v15
+; GFX9-NEXT:    v_max_f32_e32 v6, v6, v14
+; GFX9-NEXT:    v_lshlrev_b32_e32 v14, 16, v13
+; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v5
+; GFX9-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
+; GFX9-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
+; GFX9-NEXT:    v_max_f32_e32 v14, v17, v14
+; GFX9-NEXT:    v_max_f32_e32 v5, v5, v13
+; GFX9-NEXT:    v_lshlrev_b32_e32 v13, 16, v12
+; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v4
+; GFX9-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
+; GFX9-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GFX9-NEXT:    v_max_f32_e32 v13, v17, v13
+; GFX9-NEXT:    v_max_f32_e32 v4, v4, v12
+; GFX9-NEXT:    v_lshlrev_b32_e32 v12, 16, v11
+; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v3
+; GFX9-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
+; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX9-NEXT:    v_max_f32_e32 v12, v17, v12
+; GFX9-NEXT:    v_max_f32_e32 v3, v3, v11
+; GFX9-NEXT:    v_lshlrev_b32_e32 v11, 16, v10
+; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v2
+; GFX9-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
+; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX9-NEXT:    v_max_f32_e32 v11, v17, v11
+; GFX9-NEXT:    v_max_f32_e32 v2, v2, v10
+; GFX9-NEXT:    v_lshlrev_b32_e32 v10, 16, v9
+; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v1
+; GFX9-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
+; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX9-NEXT:    v_max_f32_e32 v10, v17, v10
+; GFX9-NEXT:    v_max_f32_e32 v1, v1, v9
+; GFX9-NEXT:    v_lshlrev_b32_e32 v9, 16, v8
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v0
 ; GFX9-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX9-NEXT:    v_max_f32_e32 v16, v17, v16
+; GFX9-NEXT:    v_max_f32_e32 v9, v17, v9
 ; GFX9-NEXT:    v_max_f32_e32 v0, v0, v8
 ; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
-; GFX9-NEXT:    v_perm_b32 v0, v0, v16, s4
-; GFX9-NEXT:    v_lshlrev_b32_e32 v8, 16, v9
-; GFX9-NEXT:    v_lshlrev_b32_e32 v16, 16, v1
-; GFX9-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
-; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX9-NEXT:    v_max_f32_e32 v8, v16, v8
-; GFX9-NEXT:    v_max_f32_e32 v1, v1, v9
-; GFX9-NEXT:    v_perm_b32 v1, v1, v8, s4
-; GFX9-NEXT:    v_lshlrev_b32_e32 v8, 16, v10
-; GFX9-NEXT:    v_lshlrev_b32_e32 v9, 16, v2
-; GFX9-NEXT:    v_max_f32_e32 v8, v9, v8
-; GFX9-NEXT:    v_and_b32_e32 v9, 0xffff0000, v10
-; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
-; GFX9-NEXT:    v_max_f32_e32 v2, v2, v9
-; GFX9-NEXT:    v_perm_b32 v2, v2, v8, s4
-; GFX9-NEXT:    v_lshlrev_b32_e32 v8, 16, v11
-; GFX9-NEXT:    v_lshlrev_b32_e32 v9, 16, v3
-; GFX9-NEXT:    v_max_f32_e32 v8, v9, v8
-; GFX9-NEXT:    v_and_b32_e32 v9, 0xffff0000, v11
-; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
-; GFX9-NEXT:    v_max_f32_e32 v3, v3, v9
-; GFX9-NEXT:    v_perm_b32 v3, v3, v8, s4
-; GFX9-NEXT:    v_lshlrev_b32_e32 v8, 16, v12
-; GFX9-NEXT:    v_lshlrev_b32_e32 v9, 16, v4
-; GFX9-NEXT:    v_max_f32_e32 v8, v9, v8
-; GFX9-NEXT:    v_and_b32_e32 v9, 0xffff0000, v12
-; GFX9-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
-; GFX9-NEXT:    v_max_f32_e32 v4, v4, v9
-; GFX9-NEXT:    v_perm_b32 v4, v4, v8, s4
-; GFX9-NEXT:    v_lshlrev_b32_e32 v8, 16, v13
-; GFX9-NEXT:    v_lshlrev_b32_e32 v9, 16, v5
-; GFX9-NEXT:    v_max_f32_e32 v8, v9, v8
-; GFX9-NEXT:    v_and_b32_e32 v9, 0xffff0000, v13
-; GFX9-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
-; GFX9-NEXT:    v_max_f32_e32 v5, v5, v9
-; GFX9-NEXT:    v_perm_b32 v5, v5, v8, s4
-; GFX9-NEXT:    v_lshlrev_b32_e32 v8, 16, v14
-; GFX9-NEXT:    v_lshlrev_b32_e32 v9, 16, v6
-; GFX9-NEXT:    v_max_f32_e32 v8, v9, v8
-; GFX9-NEXT:    v_and_b32_e32 v9, 0xffff0000, v14
-; GFX9-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
-; GFX9-NEXT:    v_max_f32_e32 v6, v6, v9
-; GFX9-NEXT:    v_perm_b32 v6, v6, v8, s4
-; GFX9-NEXT:    v_lshlrev_b32_e32 v8, 16, v15
-; GFX9-NEXT:    v_lshlrev_b32_e32 v9, 16, v7
-; GFX9-NEXT:    v_max_f32_e32 v8, v9, v8
-; GFX9-NEXT:    v_and_b32_e32 v9, 0xffff0000, v15
-; GFX9-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
-; GFX9-NEXT:    v_max_f32_e32 v7, v7, v9
-; GFX9-NEXT:    v_perm_b32 v7, v7, v8, s4
+; GFX9-NEXT:    v_perm_b32 v0, v0, v9, s4
+; GFX9-NEXT:    v_perm_b32 v1, v1, v10, s4
+; GFX9-NEXT:    v_perm_b32 v2, v2, v11, s4
+; GFX9-NEXT:    v_perm_b32 v3, v3, v12, s4
+; GFX9-NEXT:    v_perm_b32 v4, v4, v13, s4
+; GFX9-NEXT:    v_perm_b32 v5, v5, v14, s4
+; GFX9-NEXT:    v_perm_b32 v6, v6, v15, s4
+; GFX9-NEXT:    v_perm_b32 v7, v7, v16, s4
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_maxnum_v16bf16:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_lshlrev_b32_e32 v16, 16, v8
-; GFX10-NEXT:    v_lshlrev_b32_e32 v17, 16, v0
-; GFX10-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
-; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX10-NEXT:    v_lshlrev_b32_e32 v18, 16, v1
-; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_lshlrev_b32_e32 v16, 16, v15
+; GFX10-NEXT:    v_lshlrev_b32_e32 v17, 16, v7
+; GFX10-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
+; GFX10-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
+; GFX10-NEXT:    v_lshlrev_b32_e32 v18, 16, v13
+; GFX10-NEXT:    v_lshlrev_b32_e32 v19, 16, v5
 ; GFX10-NEXT:    v_max_f32_e32 v16, v17, v16
-; GFX10-NEXT:    v_lshlrev_b32_e32 v17, 16, v9
-; GFX10-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
+; GFX10-NEXT:    v_lshlrev_b32_e32 v17, 16, v6
+; GFX10-NEXT:    v_max_f32_e32 v7, v7, v15
+; GFX10-NEXT:    v_lshlrev_b32_e32 v15, 16, v14
+; GFX10-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
+; GFX10-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
+; GFX10-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
+; GFX10-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
+; GFX10-NEXT:    v_lshlrev_b32_e32 v20, 16, v12
+; GFX10-NEXT:    v_lshlrev_b32_e32 v21, 16, v4
+; GFX10-NEXT:    v_max_f32_e32 v15, v17, v15
+; GFX10-NEXT:    v_max_f32_e32 v6, v6, v14
+; GFX10-NEXT:    v_max_f32_e32 v14, v19, v18
+; GFX10-NEXT:    v_max_f32_e32 v5, v5, v13
+; GFX10-NEXT:    v_max_f32_e32 v13, v21, v20
+; GFX10-NEXT:    v_lshlrev_b32_e32 v17, 16, v11
+; GFX10-NEXT:    v_lshlrev_b32_e32 v18, 16, v3
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v19, 16, v10
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v20, 16, v2
+; GFX10-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
+; GFX10-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GFX10-NEXT:    v_max_f32_e32 v17, v18, v17
+; GFX10-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
+; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX10-NEXT:    v_max_f32_e32 v18, v20, v19
 ; GFX10-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
+; GFX10-NEXT:    v_lshlrev_b32_e32 v19, 16, v9
+; GFX10-NEXT:    v_lshlrev_b32_e32 v20, 16, v1
+; GFX10-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
+; GFX10-NEXT:    v_lshlrev_b32_e32 v21, 16, v8
+; GFX10-NEXT:    v_lshlrev_b32_e32 v22, 16, v0
+; GFX10-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX10-NEXT:    v_max_f32_e32 v19, v20, v19
+; GFX10-NEXT:    v_max_f32_e32 v20, v22, v21
 ; GFX10-NEXT:    v_max_f32_e32 v0, v0, v8
-; GFX10-NEXT:    v_max_f32_e32 v8, v18, v17
 ; GFX10-NEXT:    v_max_f32_e32 v1, v1, v9
-; GFX10-NEXT:    v_max_f32_e32 v9, v20, v19
 ; GFX10-NEXT:    v_max_f32_e32 v2, v2, v10
-; GFX10-NEXT:    v_perm_b32 v0, v0, v16, 0x7060302
-; GFX10-NEXT:    v_and_b32_e32 v10, 0xffff0000, v11
-; GFX10-NEXT:    v_perm_b32 v1, v1, v8, 0x7060302
-; GFX10-NEXT:    v_lshlrev_b32_e32 v8, 16, v11
-; GFX10-NEXT:    v_perm_b32 v2, v2, v9, 0x7060302
-; GFX10-NEXT:    v_lshlrev_b32_e32 v9, 16, v3
-; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
-; GFX10-NEXT:    v_lshlrev_b32_e32 v11, 16, v12
-; GFX10-NEXT:    v_lshlrev_b32_e32 v16, 16, v4
-; GFX10-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
-; GFX10-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
-; GFX10-NEXT:    v_lshlrev_b32_e32 v17, 16, v13
-; GFX10-NEXT:    v_lshlrev_b32_e32 v18, 16, v5
-; GFX10-NEXT:    v_max_f32_e32 v8, v9, v8
-; GFX10-NEXT:    v_max_f32_e32 v3, v3, v10
-; GFX10-NEXT:    v_max_f32_e32 v9, v16, v11
+; GFX10-NEXT:    v_max_f32_e32 v3, v3, v11
 ; GFX10-NEXT:    v_max_f32_e32 v4, v4, v12
-; GFX10-NEXT:    v_max_f32_e32 v10, v18, v17
-; GFX10-NEXT:    v_and_b32_e32 v11, 0xffff0000, v13
-; GFX10-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
-; GFX10-NEXT:    v_lshlrev_b32_e32 v12, 16, v14
-; GFX10-NEXT:    v_lshlrev_b32_e32 v13, 16, v6
-; GFX10-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
-; GFX10-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
-; GFX10-NEXT:    v_lshlrev_b32_e32 v16, 16, v15
-; GFX10-NEXT:    v_lshlrev_b32_e32 v17, 16, v7
-; GFX10-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
-; GFX10-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
-; GFX10-NEXT:    v_max_f32_e32 v5, v5, v11
-; GFX10-NEXT:    v_max_f32_e32 v11, v13, v12
-; GFX10-NEXT:    v_max_f32_e32 v6, v6, v14
-; GFX10-NEXT:    v_max_f32_e32 v12, v17, v16
-; GFX10-NEXT:    v_max_f32_e32 v7, v7, v15
-; GFX10-NEXT:    v_perm_b32 v3, v3, v8, 0x7060302
-; GFX10-NEXT:    v_perm_b32 v4, v4, v9, 0x7060302
-; GFX10-NEXT:    v_perm_b32 v5, v5, v10, 0x7060302
-; GFX10-NEXT:    v_perm_b32 v6, v6, v11, 0x7060302
-; GFX10-NEXT:    v_perm_b32 v7, v7, v12, 0x7060302
+; GFX10-NEXT:    v_perm_b32 v0, v0, v20, 0x7060302
+; GFX10-NEXT:    v_perm_b32 v1, v1, v19, 0x7060302
+; GFX10-NEXT:    v_perm_b32 v2, v2, v18, 0x7060302
+; GFX10-NEXT:    v_perm_b32 v3, v3, v17, 0x7060302
+; GFX10-NEXT:    v_perm_b32 v4, v4, v13, 0x7060302
+; GFX10-NEXT:    v_perm_b32 v5, v5, v14, 0x7060302
+; GFX10-NEXT:    v_perm_b32 v6, v6, v15, 0x7060302
+; GFX10-NEXT:    v_perm_b32 v7, v7, v16, 0x7060302
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_maxnum_v16bf16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_lshlrev_b32_e32 v18, 16, v1
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX11-NEXT:    v_lshlrev_b32_e32 v16, 16, v8
-; GFX11-NEXT:    v_lshlrev_b32_e32 v19, 16, v10
-; GFX11-NEXT:    v_lshlrev_b32_e32 v20, 16, v2
-; GFX11-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
-; GFX11-NEXT:    v_lshlrev_b32_e32 v17, 16, v0
-; GFX11-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_max_f32_e32 v2, v2, v10
-; GFX11-NEXT:    v_dual_max_f32 v16, v17, v16 :: v_dual_lshlrev_b32 v17, 16, v9
-; GFX11-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_dual_max_f32 v1, v1, v9 :: v_dual_and_b32 v10, 0xffff0000, v11
-; GFX11-NEXT:    v_max_f32_e32 v9, v20, v19
-; GFX11-NEXT:    v_perm_b32 v2, v2, v9, 0x7060302
-; GFX11-NEXT:    v_lshlrev_b32_e32 v9, 16, v3
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_dual_max_f32 v3, v3, v10 :: v_dual_and_b32 v0, 0xffff0000, v0
-; GFX11-NEXT:    v_max_f32_e32 v0, v0, v8
-; GFX11-NEXT:    v_max_f32_e32 v8, v18, v17
-; GFX11-NEXT:    v_lshlrev_b32_e32 v17, 16, v13
-; GFX11-NEXT:    v_lshlrev_b32_e32 v18, 16, v5
-; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
-; GFX11-NEXT:    v_perm_b32 v0, v0, v16, 0x7060302
-; GFX11-NEXT:    v_perm_b32 v1, v1, v8, 0x7060302
-; GFX11-NEXT:    v_lshlrev_b32_e32 v8, 16, v11
-; GFX11-NEXT:    v_lshlrev_b32_e32 v11, 16, v12
-; GFX11-NEXT:    v_lshlrev_b32_e32 v16, 16, v4
+; GFX11-NEXT:    v_lshlrev_b32_e32 v20, 16, v12
+; GFX11-NEXT:    v_lshlrev_b32_e32 v21, 16, v4
+; GFX11-NEXT:    v_lshlrev_b32_e32 v18, 16, v13
+; GFX11-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
 ; GFX11-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
 ; GFX11-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
-; GFX11-NEXT:    v_dual_max_f32 v10, v18, v17 :: v_dual_lshlrev_b32 v17, 16, v7
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT:    v_dual_max_f32 v4, v4, v12 :: v_dual_and_b32 v7, 0xffff0000, v7
-; GFX11-NEXT:    v_dual_max_f32 v8, v9, v8 :: v_dual_max_f32 v9, v16, v11
-; GFX11-NEXT:    v_lshlrev_b32_e32 v12, 16, v14
-; GFX11-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
+; GFX11-NEXT:    v_lshlrev_b32_e32 v22, 16, v0
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v19, 16, v5
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_dual_max_f32 v4, v4, v12 :: v_dual_and_b32 v5, 0xffff0000, v5
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v16, 16, v15
-; GFX11-NEXT:    v_and_b32_e32 v11, 0xffff0000, v13
-; GFX11-NEXT:    v_lshlrev_b32_e32 v13, 16, v6
+; GFX11-NEXT:    v_lshlrev_b32_e32 v17, 16, v7
+; GFX11-NEXT:    v_max_f32_e32 v5, v5, v13
+; GFX11-NEXT:    v_max_f32_e32 v13, v21, v20
+; GFX11-NEXT:    v_lshlrev_b32_e32 v21, 16, v8
+; GFX11-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
+; GFX11-NEXT:    v_dual_max_f32 v16, v17, v16 :: v_dual_and_b32 v15, 0xffff0000, v15
+; GFX11-NEXT:    v_lshlrev_b32_e32 v17, 16, v6
+; GFX11-NEXT:    v_lshlrev_b32_e32 v20, 16, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_dual_max_f32 v0, v0, v8 :: v_dual_and_b32 v7, 0xffff0000, v7
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX11-NEXT:    v_perm_b32 v4, v4, v13, 0x7060302
+; GFX11-NEXT:    v_max_f32_e32 v7, v7, v15
+; GFX11-NEXT:    v_lshlrev_b32_e32 v15, 16, v14
+; GFX11-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
 ; GFX11-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
-; GFX11-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
-; GFX11-NEXT:    v_perm_b32 v3, v3, v8, 0x7060302
-; GFX11-NEXT:    v_max_f32_e32 v5, v5, v11
-; GFX11-NEXT:    v_max_f32_e32 v11, v13, v12
-; GFX11-NEXT:    v_max_f32_e32 v12, v17, v16
-; GFX11-NEXT:    v_dual_max_f32 v6, v6, v14 :: v_dual_max_f32 v7, v7, v15
-; GFX11-NEXT:    v_perm_b32 v4, v4, v9, 0x7060302
-; GFX11-NEXT:    v_perm_b32 v5, v5, v10, 0x7060302
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_perm_b32 v6, v6, v11, 0x7060302
-; GFX11-NEXT:    v_perm_b32 v7, v7, v12, 0x7060302
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_perm_b32 v7, v7, v16, 0x7060302
+; GFX11-NEXT:    v_max_f32_e32 v15, v17, v15
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX11-NEXT:    v_dual_max_f32 v6, v6, v14 :: v_dual_lshlrev_b32 v17, 16, v11
+; GFX11-NEXT:    v_max_f32_e32 v14, v19, v18
+; GFX11-NEXT:    v_lshlrev_b32_e32 v18, 16, v3
+; GFX11-NEXT:    v_lshlrev_b32_e32 v19, 16, v10
+; GFX11-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_dual_max_f32 v17, v18, v17 :: v_dual_and_b32 v10, 0xffff0000, v10
+; GFX11-NEXT:    v_perm_b32 v5, v5, v14, 0x7060302
+; GFX11-NEXT:    v_perm_b32 v6, v6, v15, 0x7060302
+; GFX11-NEXT:    v_max_f32_e32 v3, v3, v11
+; GFX11-NEXT:    v_dual_max_f32 v18, v20, v19 :: v_dual_lshlrev_b32 v19, 16, v9
+; GFX11-NEXT:    v_lshlrev_b32_e32 v20, 16, v1
+; GFX11-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
+; GFX11-NEXT:    v_dual_max_f32 v2, v2, v10 :: v_dual_and_b32 v1, 0xffff0000, v1
+; GFX11-NEXT:    v_perm_b32 v3, v3, v17, 0x7060302
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_dual_max_f32 v19, v20, v19 :: v_dual_max_f32 v20, v22, v21
+; GFX11-NEXT:    v_max_f32_e32 v1, v1, v9
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_perm_b32 v2, v2, v18, 0x7060302
+; GFX11-NEXT:    v_perm_b32 v0, v0, v20, 0x7060302
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX11-NEXT:    v_perm_b32 v1, v1, v19, 0x7060302
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = call <16 x bfloat> @llvm.maxnum.v16bf16(<16 x bfloat> %a, <16 x bfloat> %b)
   ret <16 x bfloat> %op
@@ -17015,483 +16836,480 @@ define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
 ; GFX8-LABEL: v_maxnum_v32bf16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_lshlrev_b32_e32 v31, 16, v16
-; GFX8-NEXT:    v_lshlrev_b32_e32 v32, 16, v0
-; GFX8-NEXT:    v_and_b32_e32 v16, 0xffff0000, v16
-; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX8-NEXT:    v_max_f32_e32 v0, v0, v16
+; GFX8-NEXT:    v_lshlrev_b32_e32 v31, 16, v30
+; GFX8-NEXT:    v_lshlrev_b32_e32 v32, 16, v14
+; GFX8-NEXT:    v_and_b32_e32 v30, 0xffff0000, v30
+; GFX8-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
 ; GFX8-NEXT:    v_max_f32_e32 v31, v32, v31
-; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX8-NEXT:    v_alignbit_b32 v0, v0, v31, 16
-; GFX8-NEXT:    v_lshlrev_b32_e32 v16, 16, v17
-; GFX8-NEXT:    v_lshlrev_b32_e32 v31, 16, v1
+; GFX8-NEXT:    v_max_f32_e32 v30, v14, v30
+; GFX8-NEXT:    v_lshlrev_b32_e32 v14, 16, v29
+; GFX8-NEXT:    v_lshlrev_b32_e32 v32, 16, v13
+; GFX8-NEXT:    v_and_b32_e32 v29, 0xffff0000, v29
+; GFX8-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
+; GFX8-NEXT:    v_max_f32_e32 v14, v32, v14
+; GFX8-NEXT:    v_max_f32_e32 v13, v13, v29
+; GFX8-NEXT:    v_lshlrev_b32_e32 v29, 16, v28
+; GFX8-NEXT:    v_lshlrev_b32_e32 v32, 16, v12
+; GFX8-NEXT:    v_and_b32_e32 v28, 0xffff0000, v28
+; GFX8-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
+; GFX8-NEXT:    v_max_f32_e32 v29, v32, v29
+; GFX8-NEXT:    v_max_f32_e32 v12, v12, v28
+; GFX8-NEXT:    v_lshlrev_b32_e32 v28, 16, v27
+; GFX8-NEXT:    v_lshlrev_b32_e32 v32, 16, v11
+; GFX8-NEXT:    v_and_b32_e32 v27, 0xffff0000, v27
+; GFX8-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
+; GFX8-NEXT:    v_max_f32_e32 v28, v32, v28
+; GFX8-NEXT:    v_max_f32_e32 v11, v11, v27
+; GFX8-NEXT:    v_lshlrev_b32_e32 v27, 16, v26
+; GFX8-NEXT:    v_lshlrev_b32_e32 v32, 16, v10
+; GFX8-NEXT:    v_and_b32_e32 v26, 0xffff0000, v26
+; GFX8-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
+; GFX8-NEXT:    v_max_f32_e32 v27, v32, v27
+; GFX8-NEXT:    v_max_f32_e32 v10, v10, v26
+; GFX8-NEXT:    v_lshlrev_b32_e32 v26, 16, v25
+; GFX8-NEXT:    v_lshlrev_b32_e32 v32, 16, v9
+; GFX8-NEXT:    v_and_b32_e32 v25, 0xffff0000, v25
+; GFX8-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
+; GFX8-NEXT:    v_max_f32_e32 v26, v32, v26
+; GFX8-NEXT:    v_max_f32_e32 v9, v9, v25
+; GFX8-NEXT:    v_lshlrev_b32_e32 v25, 16, v24
+; GFX8-NEXT:    v_lshlrev_b32_e32 v32, 16, v8
+; GFX8-NEXT:    v_and_b32_e32 v24, 0xffff0000, v24
+; GFX8-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
+; GFX8-NEXT:    v_max_f32_e32 v8, v8, v24
+; GFX8-NEXT:    buffer_load_dword v24, off, s[0:3], s32
+; GFX8-NEXT:    v_max_f32_e32 v25, v32, v25
+; GFX8-NEXT:    v_lshlrev_b32_e32 v32, 16, v15
+; GFX8-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
+; GFX8-NEXT:    v_lshrrev_b32_e32 v8, 16, v8
+; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 16, v9
+; GFX8-NEXT:    v_lshrrev_b32_e32 v10, 16, v10
+; GFX8-NEXT:    v_lshrrev_b32_e32 v13, 16, v13
+; GFX8-NEXT:    v_lshrrev_b32_e32 v12, 16, v12
+; GFX8-NEXT:    v_lshrrev_b32_e32 v11, 16, v11
+; GFX8-NEXT:    v_alignbit_b32 v8, v8, v25, 16
+; GFX8-NEXT:    v_alignbit_b32 v9, v9, v26, 16
+; GFX8-NEXT:    v_alignbit_b32 v10, v10, v27, 16
+; GFX8-NEXT:    v_alignbit_b32 v11, v11, v28, 16
+; GFX8-NEXT:    v_alignbit_b32 v12, v12, v29, 16
+; GFX8-NEXT:    v_alignbit_b32 v13, v13, v14, 16
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_lshlrev_b32_e32 v33, 16, v24
+; GFX8-NEXT:    v_and_b32_e32 v24, 0xffff0000, v24
+; GFX8-NEXT:    v_max_f32_e32 v32, v32, v33
+; GFX8-NEXT:    v_max_f32_e32 v15, v15, v24
+; GFX8-NEXT:    v_lshlrev_b32_e32 v24, 16, v23
+; GFX8-NEXT:    v_lshlrev_b32_e32 v33, 16, v7
+; GFX8-NEXT:    v_and_b32_e32 v23, 0xffff0000, v23
+; GFX8-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
+; GFX8-NEXT:    v_max_f32_e32 v24, v33, v24
+; GFX8-NEXT:    v_max_f32_e32 v7, v7, v23
+; GFX8-NEXT:    v_lshlrev_b32_e32 v23, 16, v22
+; GFX8-NEXT:    v_lshlrev_b32_e32 v33, 16, v6
+; GFX8-NEXT:    v_and_b32_e32 v22, 0xffff0000, v22
+; GFX8-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
+; GFX8-NEXT:    v_max_f32_e32 v23, v33, v23
+; GFX8-NEXT:    v_max_f32_e32 v6, v6, v22
+; GFX8-NEXT:    v_lshlrev_b32_e32 v22, 16, v21
+; GFX8-NEXT:    v_lshlrev_b32_e32 v33, 16, v5
+; GFX8-NEXT:    v_and_b32_e32 v21, 0xffff0000, v21
+; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
+; GFX8-NEXT:    v_max_f32_e32 v22, v33, v22
+; GFX8-NEXT:    v_max_f32_e32 v5, v5, v21
+; GFX8-NEXT:    v_lshlrev_b32_e32 v21, 16, v20
+; GFX8-NEXT:    v_lshlrev_b32_e32 v33, 16, v4
+; GFX8-NEXT:    v_and_b32_e32 v20, 0xffff0000, v20
+; GFX8-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GFX8-NEXT:    v_max_f32_e32 v21, v33, v21
+; GFX8-NEXT:    v_max_f32_e32 v4, v4, v20
+; GFX8-NEXT:    v_lshlrev_b32_e32 v20, 16, v19
+; GFX8-NEXT:    v_lshlrev_b32_e32 v33, 16, v3
+; GFX8-NEXT:    v_and_b32_e32 v19, 0xffff0000, v19
+; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX8-NEXT:    v_max_f32_e32 v20, v33, v20
+; GFX8-NEXT:    v_max_f32_e32 v3, v3, v19
+; GFX8-NEXT:    v_lshlrev_b32_e32 v19, 16, v18
+; GFX8-NEXT:    v_lshlrev_b32_e32 v33, 16, v2
+; GFX8-NEXT:    v_and_b32_e32 v18, 0xffff0000, v18
+; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX8-NEXT:    v_max_f32_e32 v19, v33, v19
+; GFX8-NEXT:    v_max_f32_e32 v2, v2, v18
+; GFX8-NEXT:    v_lshlrev_b32_e32 v18, 16, v17
+; GFX8-NEXT:    v_lshlrev_b32_e32 v33, 16, v1
 ; GFX8-NEXT:    v_and_b32_e32 v17, 0xffff0000, v17
 ; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX8-NEXT:    v_max_f32_e32 v18, v33, v18
 ; GFX8-NEXT:    v_max_f32_e32 v1, v1, v17
-; GFX8-NEXT:    v_max_f32_e32 v16, v31, v16
+; GFX8-NEXT:    v_lshlrev_b32_e32 v17, 16, v16
+; GFX8-NEXT:    v_lshlrev_b32_e32 v33, 16, v0
+; GFX8-NEXT:    v_and_b32_e32 v16, 0xffff0000, v16
+; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    v_max_f32_e32 v0, v0, v16
+; GFX8-NEXT:    v_max_f32_e32 v17, v33, v17
+; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX8-NEXT:    v_alignbit_b32 v1, v1, v16, 16
-; GFX8-NEXT:    v_lshlrev_b32_e32 v16, 16, v18
-; GFX8-NEXT:    v_lshlrev_b32_e32 v17, 16, v2
-; GFX8-NEXT:    v_max_f32_e32 v16, v17, v16
-; GFX8-NEXT:    v_and_b32_e32 v17, 0xffff0000, v18
-; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
-; GFX8-NEXT:    v_max_f32_e32 v2, v2, v17
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
-; GFX8-NEXT:    v_alignbit_b32 v2, v2, v16, 16
-; GFX8-NEXT:    v_lshlrev_b32_e32 v16, 16, v19
-; GFX8-NEXT:    v_lshlrev_b32_e32 v17, 16, v3
-; GFX8-NEXT:    v_max_f32_e32 v16, v17, v16
-; GFX8-NEXT:    v_and_b32_e32 v17, 0xffff0000, v19
-; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
-; GFX8-NEXT:    v_max_f32_e32 v3, v3, v17
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
-; GFX8-NEXT:    v_alignbit_b32 v3, v3, v16, 16
-; GFX8-NEXT:    v_lshlrev_b32_e32 v16, 16, v20
-; GFX8-NEXT:    v_lshlrev_b32_e32 v17, 16, v4
-; GFX8-NEXT:    v_max_f32_e32 v16, v17, v16
-; GFX8-NEXT:    v_and_b32_e32 v17, 0xffff0000, v20
-; GFX8-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
-; GFX8-NEXT:    v_max_f32_e32 v4, v4, v17
-; GFX8-NEXT:    buffer_load_dword v17, off, s[0:3], s32
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
-; GFX8-NEXT:    v_alignbit_b32 v4, v4, v16, 16
-; GFX8-NEXT:    v_lshlrev_b32_e32 v16, 16, v21
-; GFX8-NEXT:    v_lshlrev_b32_e32 v18, 16, v5
-; GFX8-NEXT:    v_max_f32_e32 v16, v18, v16
-; GFX8-NEXT:    v_and_b32_e32 v18, 0xffff0000, v21
-; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
-; GFX8-NEXT:    v_max_f32_e32 v5, v5, v18
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
-; GFX8-NEXT:    v_alignbit_b32 v5, v5, v16, 16
-; GFX8-NEXT:    v_lshlrev_b32_e32 v16, 16, v22
-; GFX8-NEXT:    v_lshlrev_b32_e32 v18, 16, v6
-; GFX8-NEXT:    v_max_f32_e32 v16, v18, v16
-; GFX8-NEXT:    v_and_b32_e32 v18, 0xffff0000, v22
-; GFX8-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
-; GFX8-NEXT:    v_max_f32_e32 v6, v6, v18
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
-; GFX8-NEXT:    v_alignbit_b32 v6, v6, v16, 16
-; GFX8-NEXT:    v_lshlrev_b32_e32 v16, 16, v23
-; GFX8-NEXT:    v_lshlrev_b32_e32 v18, 16, v7
-; GFX8-NEXT:    v_max_f32_e32 v16, v18, v16
-; GFX8-NEXT:    v_and_b32_e32 v18, 0xffff0000, v23
-; GFX8-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
-; GFX8-NEXT:    v_max_f32_e32 v7, v7, v18
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
-; GFX8-NEXT:    v_alignbit_b32 v7, v7, v16, 16
-; GFX8-NEXT:    v_lshlrev_b32_e32 v16, 16, v24
-; GFX8-NEXT:    v_lshlrev_b32_e32 v18, 16, v8
-; GFX8-NEXT:    v_max_f32_e32 v16, v18, v16
-; GFX8-NEXT:    v_and_b32_e32 v18, 0xffff0000, v24
-; GFX8-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
-; GFX8-NEXT:    v_max_f32_e32 v8, v8, v18
-; GFX8-NEXT:    v_lshrrev_b32_e32 v8, 16, v8
-; GFX8-NEXT:    v_alignbit_b32 v8, v8, v16, 16
-; GFX8-NEXT:    v_lshlrev_b32_e32 v16, 16, v25
-; GFX8-NEXT:    v_lshlrev_b32_e32 v18, 16, v9
-; GFX8-NEXT:    v_max_f32_e32 v16, v18, v16
-; GFX8-NEXT:    v_and_b32_e32 v18, 0xffff0000, v25
-; GFX8-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
-; GFX8-NEXT:    v_max_f32_e32 v9, v9, v18
-; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 16, v9
-; GFX8-NEXT:    v_alignbit_b32 v9, v9, v16, 16
-; GFX8-NEXT:    v_lshlrev_b32_e32 v16, 16, v26
-; GFX8-NEXT:    v_lshlrev_b32_e32 v18, 16, v10
-; GFX8-NEXT:    v_max_f32_e32 v16, v18, v16
-; GFX8-NEXT:    v_and_b32_e32 v18, 0xffff0000, v26
-; GFX8-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
-; GFX8-NEXT:    v_max_f32_e32 v10, v10, v18
-; GFX8-NEXT:    v_lshrrev_b32_e32 v10, 16, v10
-; GFX8-NEXT:    v_alignbit_b32 v10, v10, v16, 16
-; GFX8-NEXT:    v_lshlrev_b32_e32 v16, 16, v27
-; GFX8-NEXT:    v_lshlrev_b32_e32 v18, 16, v11
-; GFX8-NEXT:    v_max_f32_e32 v16, v18, v16
-; GFX8-NEXT:    v_and_b32_e32 v18, 0xffff0000, v27
-; GFX8-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
-; GFX8-NEXT:    v_max_f32_e32 v11, v11, v18
-; GFX8-NEXT:    v_lshrrev_b32_e32 v11, 16, v11
-; GFX8-NEXT:    v_alignbit_b32 v11, v11, v16, 16
-; GFX8-NEXT:    v_lshlrev_b32_e32 v16, 16, v28
-; GFX8-NEXT:    v_lshlrev_b32_e32 v18, 16, v12
-; GFX8-NEXT:    v_max_f32_e32 v16, v18, v16
-; GFX8-NEXT:    v_and_b32_e32 v18, 0xffff0000, v28
-; GFX8-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
-; GFX8-NEXT:    v_max_f32_e32 v12, v12, v18
-; GFX8-NEXT:    v_lshrrev_b32_e32 v12, 16, v12
-; GFX8-NEXT:    v_alignbit_b32 v12, v12, v16, 16
-; GFX8-NEXT:    v_lshlrev_b32_e32 v16, 16, v29
-; GFX8-NEXT:    v_lshlrev_b32_e32 v18, 16, v13
-; GFX8-NEXT:    v_max_f32_e32 v16, v18, v16
-; GFX8-NEXT:    v_and_b32_e32 v18, 0xffff0000, v29
-; GFX8-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
-; GFX8-NEXT:    v_max_f32_e32 v13, v13, v18
-; GFX8-NEXT:    v_lshrrev_b32_e32 v13, 16, v13
-; GFX8-NEXT:    v_alignbit_b32 v13, v13, v16, 16
-; GFX8-NEXT:    v_lshlrev_b32_e32 v16, 16, v30
-; GFX8-NEXT:    v_lshlrev_b32_e32 v18, 16, v14
-; GFX8-NEXT:    v_max_f32_e32 v16, v18, v16
-; GFX8-NEXT:    v_and_b32_e32 v18, 0xffff0000, v30
-; GFX8-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
-; GFX8-NEXT:    v_max_f32_e32 v14, v14, v18
-; GFX8-NEXT:    v_lshrrev_b32_e32 v14, 16, v14
-; GFX8-NEXT:    v_alignbit_b32 v14, v14, v16, 16
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_lshlrev_b32_e32 v16, 16, v17
-; GFX8-NEXT:    v_lshlrev_b32_e32 v18, 16, v15
-; GFX8-NEXT:    v_and_b32_e32 v17, 0xffff0000, v17
-; GFX8-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
-; GFX8-NEXT:    v_max_f32_e32 v15, v15, v17
-; GFX8-NEXT:    v_max_f32_e32 v16, v18, v16
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v15, 16, v15
-; GFX8-NEXT:    v_alignbit_b32 v15, v15, v16, 16
+; GFX8-NEXT:    v_lshrrev_b32_e32 v16, 16, v30
+; GFX8-NEXT:    v_alignbit_b32 v0, v0, v17, 16
+; GFX8-NEXT:    v_alignbit_b32 v1, v1, v18, 16
+; GFX8-NEXT:    v_alignbit_b32 v2, v2, v19, 16
+; GFX8-NEXT:    v_alignbit_b32 v3, v3, v20, 16
+; GFX8-NEXT:    v_alignbit_b32 v4, v4, v21, 16
+; GFX8-NEXT:    v_alignbit_b32 v5, v5, v22, 16
+; GFX8-NEXT:    v_alignbit_b32 v6, v6, v23, 16
+; GFX8-NEXT:    v_alignbit_b32 v7, v7, v24, 16
+; GFX8-NEXT:    v_alignbit_b32 v14, v16, v31, 16
+; GFX8-NEXT:    v_alignbit_b32 v15, v15, v32, 16
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_maxnum_v32bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_lshlrev_b32_e32 v31, 16, v16
-; GFX9-NEXT:    v_lshlrev_b32_e32 v32, 16, v0
-; GFX9-NEXT:    v_and_b32_e32 v16, 0xffff0000, v16
-; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    v_lshlrev_b32_e32 v31, 16, v30
+; GFX9-NEXT:    v_lshlrev_b32_e32 v32, 16, v14
+; GFX9-NEXT:    v_and_b32_e32 v30, 0xffff0000, v30
+; GFX9-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
 ; GFX9-NEXT:    v_max_f32_e32 v31, v32, v31
-; GFX9-NEXT:    v_max_f32_e32 v0, v0, v16
+; GFX9-NEXT:    v_max_f32_e32 v14, v14, v30
+; GFX9-NEXT:    v_lshlrev_b32_e32 v30, 16, v29
+; GFX9-NEXT:    v_lshlrev_b32_e32 v32, 16, v13
+; GFX9-NEXT:    v_and_b32_e32 v29, 0xffff0000, v29
+; GFX9-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
+; GFX9-NEXT:    v_max_f32_e32 v30, v32, v30
+; GFX9-NEXT:    v_max_f32_e32 v13, v13, v29
+; GFX9-NEXT:    v_lshlrev_b32_e32 v29, 16, v28
+; GFX9-NEXT:    v_lshlrev_b32_e32 v32, 16, v12
+; GFX9-NEXT:    v_and_b32_e32 v28, 0xffff0000, v28
+; GFX9-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
+; GFX9-NEXT:    v_max_f32_e32 v29, v32, v29
+; GFX9-NEXT:    v_max_f32_e32 v12, v12, v28
+; GFX9-NEXT:    v_lshlrev_b32_e32 v28, 16, v27
+; GFX9-NEXT:    v_lshlrev_b32_e32 v32, 16, v11
+; GFX9-NEXT:    v_and_b32_e32 v27, 0xffff0000, v27
+; GFX9-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
+; GFX9-NEXT:    v_max_f32_e32 v28, v32, v28
+; GFX9-NEXT:    v_max_f32_e32 v11, v11, v27
+; GFX9-NEXT:    v_lshlrev_b32_e32 v27, 16, v26
+; GFX9-NEXT:    v_lshlrev_b32_e32 v32, 16, v10
+; GFX9-NEXT:    v_and_b32_e32 v26, 0xffff0000, v26
+; GFX9-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
+; GFX9-NEXT:    v_max_f32_e32 v27, v32, v27
+; GFX9-NEXT:    v_max_f32_e32 v10, v10, v26
+; GFX9-NEXT:    v_lshlrev_b32_e32 v26, 16, v25
+; GFX9-NEXT:    v_lshlrev_b32_e32 v32, 16, v9
+; GFX9-NEXT:    v_and_b32_e32 v25, 0xffff0000, v25
+; GFX9-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
+; GFX9-NEXT:    v_max_f32_e32 v26, v32, v26
+; GFX9-NEXT:    v_max_f32_e32 v9, v9, v25
+; GFX9-NEXT:    v_lshlrev_b32_e32 v25, 16, v24
+; GFX9-NEXT:    v_lshlrev_b32_e32 v32, 16, v8
+; GFX9-NEXT:    v_and_b32_e32 v24, 0xffff0000, v24
+; GFX9-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
+; GFX9-NEXT:    v_max_f32_e32 v8, v8, v24
+; GFX9-NEXT:    buffer_load_dword v24, off, s[0:3], s32
+; GFX9-NEXT:    v_max_f32_e32 v25, v32, v25
+; GFX9-NEXT:    v_lshlrev_b32_e32 v32, 16, v15
+; GFX9-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
 ; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
-; GFX9-NEXT:    v_perm_b32 v0, v0, v31, s4
-; GFX9-NEXT:    v_lshlrev_b32_e32 v16, 16, v17
-; GFX9-NEXT:    v_lshlrev_b32_e32 v31, 16, v1
+; GFX9-NEXT:    v_perm_b32 v8, v8, v25, s4
+; GFX9-NEXT:    v_perm_b32 v9, v9, v26, s4
+; GFX9-NEXT:    v_perm_b32 v10, v10, v27, s4
+; GFX9-NEXT:    v_perm_b32 v11, v11, v28, s4
+; GFX9-NEXT:    v_perm_b32 v12, v12, v29, s4
+; GFX9-NEXT:    v_perm_b32 v13, v13, v30, s4
+; GFX9-NEXT:    v_perm_b32 v14, v14, v31, s4
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_lshlrev_b32_e32 v33, 16, v24
+; GFX9-NEXT:    v_and_b32_e32 v24, 0xffff0000, v24
+; GFX9-NEXT:    v_max_f32_e32 v32, v32, v33
+; GFX9-NEXT:    v_max_f32_e32 v15, v15, v24
+; GFX9-NEXT:    v_lshlrev_b32_e32 v24, 16, v23
+; GFX9-NEXT:    v_lshlrev_b32_e32 v33, 16, v7
+; GFX9-NEXT:    v_and_b32_e32 v23, 0xffff0000, v23
+; GFX9-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
+; GFX9-NEXT:    v_max_f32_e32 v24, v33, v24
+; GFX9-NEXT:    v_max_f32_e32 v7, v7, v23
+; GFX9-NEXT:    v_lshlrev_b32_e32 v23, 16, v22
+; GFX9-NEXT:    v_lshlrev_b32_e32 v33, 16, v6
+; GFX9-NEXT:    v_and_b32_e32 v22, 0xffff0000, v22
+; GFX9-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
+; GFX9-NEXT:    v_max_f32_e32 v23, v33, v23
+; GFX9-NEXT:    v_max_f32_e32 v6, v6, v22
+; GFX9-NEXT:    v_lshlrev_b32_e32 v22, 16, v21
+; GFX9-NEXT:    v_lshlrev_b32_e32 v33, 16, v5
+; GFX9-NEXT:    v_and_b32_e32 v21, 0xffff0000, v21
+; GFX9-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
+; GFX9-NEXT:    v_max_f32_e32 v22, v33, v22
+; GFX9-NEXT:    v_max_f32_e32 v5, v5, v21
+; GFX9-NEXT:    v_lshlrev_b32_e32 v21, 16, v20
+; GFX9-NEXT:    v_lshlrev_b32_e32 v33, 16, v4
+; GFX9-NEXT:    v_and_b32_e32 v20, 0xffff0000, v20
+; GFX9-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GFX9-NEXT:    v_max_f32_e32 v21, v33, v21
+; GFX9-NEXT:    v_max_f32_e32 v4, v4, v20
+; GFX9-NEXT:    v_lshlrev_b32_e32 v20, 16, v19
+; GFX9-NEXT:    v_lshlrev_b32_e32 v33, 16, v3
+; GFX9-NEXT:    v_and_b32_e32 v19, 0xffff0000, v19
+; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX9-NEXT:    v_max_f32_e32 v20, v33, v20
+; GFX9-NEXT:    v_max_f32_e32 v3, v3, v19
+; GFX9-NEXT:    v_lshlrev_b32_e32 v19, 16, v18
+; GFX9-NEXT:    v_lshlrev_b32_e32 v33, 16, v2
+; GFX9-NEXT:    v_and_b32_e32 v18, 0xffff0000, v18
+; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX9-NEXT:    v_max_f32_e32 v19, v33, v19
+; GFX9-NEXT:    v_max_f32_e32 v2, v2, v18
+; GFX9-NEXT:    v_lshlrev_b32_e32 v18, 16, v17
+; GFX9-NEXT:    v_lshlrev_b32_e32 v33, 16, v1
 ; GFX9-NEXT:    v_and_b32_e32 v17, 0xffff0000, v17
 ; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX9-NEXT:    v_max_f32_e32 v16, v31, v16
+; GFX9-NEXT:    v_max_f32_e32 v18, v33, v18
 ; GFX9-NEXT:    v_max_f32_e32 v1, v1, v17
-; GFX9-NEXT:    v_perm_b32 v1, v1, v16, s4
-; GFX9-NEXT:    v_lshlrev_b32_e32 v16, 16, v18
-; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v2
-; GFX9-NEXT:    v_max_f32_e32 v16, v17, v16
-; GFX9-NEXT:    v_and_b32_e32 v17, 0xffff0000, v18
-; GFX9-NEXT:    buffer_load_dword v18, off, s[0:3], s32
-; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
-; GFX9-NEXT:    v_max_f32_e32 v2, v2, v17
-; GFX9-NEXT:    v_perm_b32 v2, v2, v16, s4
-; GFX9-NEXT:    v_lshlrev_b32_e32 v16, 16, v19
-; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v3
-; GFX9-NEXT:    v_max_f32_e32 v16, v17, v16
-; GFX9-NEXT:    v_and_b32_e32 v17, 0xffff0000, v19
-; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
-; GFX9-NEXT:    v_max_f32_e32 v3, v3, v17
-; GFX9-NEXT:    v_perm_b32 v3, v3, v16, s4
-; GFX9-NEXT:    v_lshlrev_b32_e32 v16, 16, v20
-; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v4
-; GFX9-NEXT:    v_max_f32_e32 v16, v17, v16
-; GFX9-NEXT:    v_and_b32_e32 v17, 0xffff0000, v20
-; GFX9-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
-; GFX9-NEXT:    v_max_f32_e32 v4, v4, v17
-; GFX9-NEXT:    v_perm_b32 v4, v4, v16, s4
-; GFX9-NEXT:    v_lshlrev_b32_e32 v16, 16, v21
-; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v5
-; GFX9-NEXT:    v_max_f32_e32 v16, v17, v16
-; GFX9-NEXT:    v_and_b32_e32 v17, 0xffff0000, v21
-; GFX9-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
-; GFX9-NEXT:    v_max_f32_e32 v5, v5, v17
-; GFX9-NEXT:    v_perm_b32 v5, v5, v16, s4
-; GFX9-NEXT:    v_lshlrev_b32_e32 v16, 16, v22
-; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v6
-; GFX9-NEXT:    v_max_f32_e32 v16, v17, v16
-; GFX9-NEXT:    v_and_b32_e32 v17, 0xffff0000, v22
-; GFX9-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
-; GFX9-NEXT:    v_max_f32_e32 v6, v6, v17
-; GFX9-NEXT:    v_perm_b32 v6, v6, v16, s4
-; GFX9-NEXT:    v_lshlrev_b32_e32 v16, 16, v23
-; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v7
-; GFX9-NEXT:    v_max_f32_e32 v16, v17, v16
-; GFX9-NEXT:    v_and_b32_e32 v17, 0xffff0000, v23
-; GFX9-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
-; GFX9-NEXT:    v_max_f32_e32 v7, v7, v17
-; GFX9-NEXT:    v_perm_b32 v7, v7, v16, s4
-; GFX9-NEXT:    v_lshlrev_b32_e32 v16, 16, v24
-; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v8
-; GFX9-NEXT:    v_max_f32_e32 v16, v17, v16
-; GFX9-NEXT:    v_and_b32_e32 v17, 0xffff0000, v24
-; GFX9-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
-; GFX9-NEXT:    v_max_f32_e32 v8, v8, v17
-; GFX9-NEXT:    v_perm_b32 v8, v8, v16, s4
-; GFX9-NEXT:    v_lshlrev_b32_e32 v16, 16, v25
-; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v9
-; GFX9-NEXT:    v_max_f32_e32 v16, v17, v16
-; GFX9-NEXT:    v_and_b32_e32 v17, 0xffff0000, v25
-; GFX9-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
-; GFX9-NEXT:    v_max_f32_e32 v9, v9, v17
-; GFX9-NEXT:    v_perm_b32 v9, v9, v16, s4
-; GFX9-NEXT:    v_lshlrev_b32_e32 v16, 16, v26
-; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v10
-; GFX9-NEXT:    v_max_f32_e32 v16, v17, v16
-; GFX9-NEXT:    v_and_b32_e32 v17, 0xffff0000, v26
-; GFX9-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
-; GFX9-NEXT:    v_max_f32_e32 v10, v10, v17
-; GFX9-NEXT:    v_perm_b32 v10, v10, v16, s4
-; GFX9-NEXT:    v_lshlrev_b32_e32 v16, 16, v27
-; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v11
-; GFX9-NEXT:    v_max_f32_e32 v16, v17, v16
-; GFX9-NEXT:    v_and_b32_e32 v17, 0xffff0000, v27
-; GFX9-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
-; GFX9-NEXT:    v_max_f32_e32 v11, v11, v17
-; GFX9-NEXT:    v_perm_b32 v11, v11, v16, s4
-; GFX9-NEXT:    v_lshlrev_b32_e32 v16, 16, v28
-; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v12
-; GFX9-NEXT:    v_max_f32_e32 v16, v17, v16
-; GFX9-NEXT:    v_and_b32_e32 v17, 0xffff0000, v28
-; GFX9-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
-; GFX9-NEXT:    v_max_f32_e32 v12, v12, v17
-; GFX9-NEXT:    v_perm_b32 v12, v12, v16, s4
-; GFX9-NEXT:    v_lshlrev_b32_e32 v16, 16, v29
-; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v13
-; GFX9-NEXT:    v_max_f32_e32 v16, v17, v16
-; GFX9-NEXT:    v_and_b32_e32 v17, 0xffff0000, v29
-; GFX9-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
-; GFX9-NEXT:    v_max_f32_e32 v13, v13, v17
-; GFX9-NEXT:    v_perm_b32 v13, v13, v16, s4
-; GFX9-NEXT:    v_lshlrev_b32_e32 v16, 16, v30
-; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v14
-; GFX9-NEXT:    v_max_f32_e32 v16, v17, v16
-; GFX9-NEXT:    v_and_b32_e32 v17, 0xffff0000, v30
-; GFX9-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
-; GFX9-NEXT:    v_max_f32_e32 v14, v14, v17
-; GFX9-NEXT:    v_perm_b32 v14, v14, v16, s4
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_lshlrev_b32_e32 v16, 16, v18
-; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v15
-; GFX9-NEXT:    v_max_f32_e32 v16, v17, v16
-; GFX9-NEXT:    v_and_b32_e32 v17, 0xffff0000, v18
-; GFX9-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
-; GFX9-NEXT:    v_max_f32_e32 v15, v15, v17
-; GFX9-NEXT:    v_perm_b32 v15, v15, v16, s4
+; GFX9-NEXT:    v_lshlrev_b32_e32 v17, 16, v16
+; GFX9-NEXT:    v_lshlrev_b32_e32 v33, 16, v0
+; GFX9-NEXT:    v_and_b32_e32 v16, 0xffff0000, v16
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    v_max_f32_e32 v17, v33, v17
+; GFX9-NEXT:    v_max_f32_e32 v0, v0, v16
+; GFX9-NEXT:    v_perm_b32 v0, v0, v17, s4
+; GFX9-NEXT:    v_perm_b32 v1, v1, v18, s4
+; GFX9-NEXT:    v_perm_b32 v2, v2, v19, s4
+; GFX9-NEXT:    v_perm_b32 v3, v3, v20, s4
+; GFX9-NEXT:    v_perm_b32 v4, v4, v21, s4
+; GFX9-NEXT:    v_perm_b32 v5, v5, v22, s4
+; GFX9-NEXT:    v_perm_b32 v6, v6, v23, s4
+; GFX9-NEXT:    v_perm_b32 v7, v7, v24, s4
+; GFX9-NEXT:    v_perm_b32 v15, v15, v32, s4
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_maxnum_v32bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    buffer_load_dword v31, off, s[0:3], s32
-; GFX10-NEXT:    v_lshlrev_b32_e32 v50, 16, v21
-; GFX10-NEXT:    v_lshlrev_b32_e32 v51, 16, v5
-; GFX10-NEXT:    v_and_b32_e32 v21, 0xffff0000, v21
-; GFX10-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
-; GFX10-NEXT:    v_lshlrev_b32_e32 v52, 16, v22
-; GFX10-NEXT:    v_lshlrev_b32_e32 v53, 16, v6
-; GFX10-NEXT:    v_and_b32_e32 v22, 0xffff0000, v22
-; GFX10-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
-; GFX10-NEXT:    v_lshlrev_b32_e32 v54, 16, v23
-; GFX10-NEXT:    v_lshlrev_b32_e32 v55, 16, v7
-; GFX10-NEXT:    v_and_b32_e32 v23, 0xffff0000, v23
-; GFX10-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
-; GFX10-NEXT:    v_lshlrev_b32_e32 v32, 16, v16
-; GFX10-NEXT:    v_lshlrev_b32_e32 v33, 16, v0
-; GFX10-NEXT:    v_and_b32_e32 v16, 0xffff0000, v16
-; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX10-NEXT:    v_lshlrev_b32_e32 v34, 16, v17
-; GFX10-NEXT:    v_lshlrev_b32_e32 v35, 16, v1
+; GFX10-NEXT:    v_lshlrev_b32_e32 v39, 16, v27
+; GFX10-NEXT:    v_lshlrev_b32_e32 v48, 16, v11
+; GFX10-NEXT:    v_and_b32_e32 v27, 0xffff0000, v27
+; GFX10-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
+; GFX10-NEXT:    v_lshlrev_b32_e32 v49, 16, v26
+; GFX10-NEXT:    v_lshlrev_b32_e32 v50, 16, v10
+; GFX10-NEXT:    v_and_b32_e32 v26, 0xffff0000, v26
+; GFX10-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
+; GFX10-NEXT:    v_lshlrev_b32_e32 v33, 16, v30
+; GFX10-NEXT:    v_lshlrev_b32_e32 v34, 16, v14
+; GFX10-NEXT:    v_and_b32_e32 v30, 0xffff0000, v30
+; GFX10-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
+; GFX10-NEXT:    v_lshlrev_b32_e32 v35, 16, v29
+; GFX10-NEXT:    v_lshlrev_b32_e32 v36, 16, v13
+; GFX10-NEXT:    v_and_b32_e32 v29, 0xffff0000, v29
+; GFX10-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
+; GFX10-NEXT:    v_lshlrev_b32_e32 v37, 16, v28
+; GFX10-NEXT:    v_lshlrev_b32_e32 v38, 16, v12
+; GFX10-NEXT:    v_and_b32_e32 v28, 0xffff0000, v28
+; GFX10-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
+; GFX10-NEXT:    v_max_f32_e32 v39, v48, v39
+; GFX10-NEXT:    v_lshlrev_b32_e32 v48, 16, v17
+; GFX10-NEXT:    v_max_f32_e32 v11, v11, v27
+; GFX10-NEXT:    v_lshlrev_b32_e32 v27, 16, v1
 ; GFX10-NEXT:    v_and_b32_e32 v17, 0xffff0000, v17
 ; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX10-NEXT:    v_lshlrev_b32_e32 v36, 16, v18
-; GFX10-NEXT:    v_lshlrev_b32_e32 v37, 16, v2
-; GFX10-NEXT:    v_and_b32_e32 v18, 0xffff0000, v18
-; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
-; GFX10-NEXT:    v_lshlrev_b32_e32 v38, 16, v19
-; GFX10-NEXT:    v_lshlrev_b32_e32 v39, 16, v3
-; GFX10-NEXT:    v_and_b32_e32 v19, 0xffff0000, v19
-; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
-; GFX10-NEXT:    v_lshlrev_b32_e32 v48, 16, v20
-; GFX10-NEXT:    v_lshlrev_b32_e32 v49, 16, v4
-; GFX10-NEXT:    v_and_b32_e32 v20, 0xffff0000, v20
-; GFX10-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
-; GFX10-NEXT:    v_max_f32_e32 v5, v5, v21
-; GFX10-NEXT:    v_max_f32_e32 v21, v53, v52
-; GFX10-NEXT:    v_max_f32_e32 v6, v6, v22
-; GFX10-NEXT:    v_max_f32_e32 v22, v55, v54
-; GFX10-NEXT:    v_max_f32_e32 v7, v7, v23
-; GFX10-NEXT:    v_lshlrev_b32_e32 v64, 16, v24
-; GFX10-NEXT:    v_lshlrev_b32_e32 v65, 16, v8
-; GFX10-NEXT:    v_and_b32_e32 v24, 0xffff0000, v24
-; GFX10-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
-; GFX10-NEXT:    v_lshlrev_b32_e32 v66, 16, v25
-; GFX10-NEXT:    v_lshlrev_b32_e32 v67, 16, v9
+; GFX10-NEXT:    v_max_f32_e32 v49, v50, v49
+; GFX10-NEXT:    v_lshlrev_b32_e32 v50, 16, v16
+; GFX10-NEXT:    v_max_f32_e32 v10, v10, v26
+; GFX10-NEXT:    v_lshlrev_b32_e32 v26, 16, v0
+; GFX10-NEXT:    v_and_b32_e32 v16, 0xffff0000, v16
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v32, 16, v15
+; GFX10-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
+; GFX10-NEXT:    v_lshlrev_b32_e32 v51, 16, v25
+; GFX10-NEXT:    v_lshlrev_b32_e32 v52, 16, v9
 ; GFX10-NEXT:    v_and_b32_e32 v25, 0xffff0000, v25
 ; GFX10-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
-; GFX10-NEXT:    v_lshlrev_b32_e32 v68, 16, v26
-; GFX10-NEXT:    v_max_f32_e32 v32, v33, v32
-; GFX10-NEXT:    v_lshlrev_b32_e32 v33, 16, v10
-; GFX10-NEXT:    v_and_b32_e32 v26, 0xffff0000, v26
-; GFX10-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
+; GFX10-NEXT:    v_lshlrev_b32_e32 v53, 16, v24
+; GFX10-NEXT:    v_lshlrev_b32_e32 v54, 16, v8
+; GFX10-NEXT:    v_and_b32_e32 v24, 0xffff0000, v24
+; GFX10-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
+; GFX10-NEXT:    v_lshlrev_b32_e32 v55, 16, v23
+; GFX10-NEXT:    v_lshlrev_b32_e32 v64, 16, v7
+; GFX10-NEXT:    v_and_b32_e32 v23, 0xffff0000, v23
+; GFX10-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
+; GFX10-NEXT:    v_lshlrev_b32_e32 v65, 16, v22
+; GFX10-NEXT:    v_lshlrev_b32_e32 v66, 16, v6
+; GFX10-NEXT:    v_and_b32_e32 v22, 0xffff0000, v22
+; GFX10-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
+; GFX10-NEXT:    v_lshlrev_b32_e32 v67, 16, v21
+; GFX10-NEXT:    v_lshlrev_b32_e32 v68, 16, v5
+; GFX10-NEXT:    v_and_b32_e32 v21, 0xffff0000, v21
+; GFX10-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
+; GFX10-NEXT:    v_max_f32_e32 v33, v34, v33
+; GFX10-NEXT:    v_lshlrev_b32_e32 v34, 16, v20
+; GFX10-NEXT:    v_max_f32_e32 v14, v14, v30
+; GFX10-NEXT:    v_lshlrev_b32_e32 v30, 16, v4
+; GFX10-NEXT:    v_and_b32_e32 v20, 0xffff0000, v20
+; GFX10-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GFX10-NEXT:    v_max_f32_e32 v35, v36, v35
+; GFX10-NEXT:    v_lshlrev_b32_e32 v36, 16, v19
+; GFX10-NEXT:    v_max_f32_e32 v13, v13, v29
+; GFX10-NEXT:    v_lshlrev_b32_e32 v29, 16, v3
+; GFX10-NEXT:    v_and_b32_e32 v19, 0xffff0000, v19
+; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX10-NEXT:    v_max_f32_e32 v37, v38, v37
+; GFX10-NEXT:    v_lshlrev_b32_e32 v38, 16, v18
+; GFX10-NEXT:    v_max_f32_e32 v12, v12, v28
+; GFX10-NEXT:    v_lshlrev_b32_e32 v28, 16, v2
+; GFX10-NEXT:    v_and_b32_e32 v18, 0xffff0000, v18
+; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX10-NEXT:    v_max_f32_e32 v0, v0, v16
-; GFX10-NEXT:    v_lshlrev_b32_e32 v16, 16, v27
-; GFX10-NEXT:    v_max_f32_e32 v34, v35, v34
-; GFX10-NEXT:    v_lshlrev_b32_e32 v35, 16, v11
-; GFX10-NEXT:    v_and_b32_e32 v27, 0xffff0000, v27
-; GFX10-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
 ; GFX10-NEXT:    v_max_f32_e32 v1, v1, v17
-; GFX10-NEXT:    v_lshlrev_b32_e32 v17, 16, v28
-; GFX10-NEXT:    v_max_f32_e32 v36, v37, v36
-; GFX10-NEXT:    v_lshlrev_b32_e32 v37, 16, v12
-; GFX10-NEXT:    v_and_b32_e32 v28, 0xffff0000, v28
-; GFX10-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
+; GFX10-NEXT:    v_max_f32_e32 v51, v52, v51
+; GFX10-NEXT:    v_max_f32_e32 v9, v9, v25
+; GFX10-NEXT:    v_max_f32_e32 v25, v54, v53
+; GFX10-NEXT:    v_max_f32_e32 v8, v8, v24
+; GFX10-NEXT:    v_max_f32_e32 v24, v64, v55
+; GFX10-NEXT:    v_max_f32_e32 v7, v7, v23
+; GFX10-NEXT:    v_max_f32_e32 v23, v66, v65
+; GFX10-NEXT:    v_max_f32_e32 v6, v6, v22
+; GFX10-NEXT:    v_max_f32_e32 v22, v68, v67
+; GFX10-NEXT:    v_max_f32_e32 v5, v5, v21
+; GFX10-NEXT:    v_max_f32_e32 v21, v30, v34
+; GFX10-NEXT:    v_max_f32_e32 v29, v29, v36
+; GFX10-NEXT:    v_max_f32_e32 v28, v28, v38
+; GFX10-NEXT:    v_max_f32_e32 v27, v27, v48
+; GFX10-NEXT:    v_max_f32_e32 v26, v26, v50
 ; GFX10-NEXT:    v_max_f32_e32 v2, v2, v18
-; GFX10-NEXT:    v_lshlrev_b32_e32 v18, 16, v29
-; GFX10-NEXT:    v_max_f32_e32 v38, v39, v38
-; GFX10-NEXT:    v_lshlrev_b32_e32 v39, 16, v13
-; GFX10-NEXT:    v_and_b32_e32 v29, 0xffff0000, v29
-; GFX10-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
 ; GFX10-NEXT:    v_max_f32_e32 v3, v3, v19
-; GFX10-NEXT:    v_lshlrev_b32_e32 v19, 16, v30
-; GFX10-NEXT:    v_max_f32_e32 v48, v49, v48
-; GFX10-NEXT:    v_lshlrev_b32_e32 v49, 16, v14
-; GFX10-NEXT:    v_and_b32_e32 v30, 0xffff0000, v30
-; GFX10-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
 ; GFX10-NEXT:    v_max_f32_e32 v4, v4, v20
-; GFX10-NEXT:    v_lshlrev_b32_e32 v20, 16, v15
-; GFX10-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
-; GFX10-NEXT:    v_perm_b32 v6, v6, v21, 0x7060302
-; GFX10-NEXT:    v_perm_b32 v7, v7, v22, 0x7060302
-; GFX10-NEXT:    v_max_f32_e32 v50, v51, v50
-; GFX10-NEXT:    v_max_f32_e32 v23, v65, v64
-; GFX10-NEXT:    v_max_f32_e32 v8, v8, v24
-; GFX10-NEXT:    v_max_f32_e32 v24, v67, v66
-; GFX10-NEXT:    v_max_f32_e32 v9, v9, v25
-; GFX10-NEXT:    v_max_f32_e32 v25, v33, v68
-; GFX10-NEXT:    v_max_f32_e32 v10, v10, v26
-; GFX10-NEXT:    v_max_f32_e32 v16, v35, v16
-; GFX10-NEXT:    v_max_f32_e32 v11, v11, v27
-; GFX10-NEXT:    v_max_f32_e32 v17, v37, v17
-; GFX10-NEXT:    v_max_f32_e32 v12, v12, v28
-; GFX10-NEXT:    v_max_f32_e32 v18, v39, v18
-; GFX10-NEXT:    v_max_f32_e32 v13, v13, v29
-; GFX10-NEXT:    v_max_f32_e32 v19, v49, v19
-; GFX10-NEXT:    v_max_f32_e32 v14, v14, v30
-; GFX10-NEXT:    v_perm_b32 v0, v0, v32, 0x7060302
-; GFX10-NEXT:    v_perm_b32 v1, v1, v34, 0x7060302
-; GFX10-NEXT:    v_perm_b32 v2, v2, v36, 0x7060302
-; GFX10-NEXT:    v_perm_b32 v3, v3, v38, 0x7060302
-; GFX10-NEXT:    v_perm_b32 v4, v4, v48, 0x7060302
-; GFX10-NEXT:    v_perm_b32 v5, v5, v50, 0x7060302
-; GFX10-NEXT:    v_perm_b32 v8, v8, v23, 0x7060302
-; GFX10-NEXT:    v_perm_b32 v9, v9, v24, 0x7060302
-; GFX10-NEXT:    v_perm_b32 v10, v10, v25, 0x7060302
-; GFX10-NEXT:    v_perm_b32 v11, v11, v16, 0x7060302
-; GFX10-NEXT:    v_perm_b32 v12, v12, v17, 0x7060302
-; GFX10-NEXT:    v_perm_b32 v13, v13, v18, 0x7060302
-; GFX10-NEXT:    v_perm_b32 v14, v14, v19, 0x7060302
+; GFX10-NEXT:    v_perm_b32 v1, v1, v27, 0x7060302
+; GFX10-NEXT:    v_perm_b32 v0, v0, v26, 0x7060302
+; GFX10-NEXT:    v_perm_b32 v2, v2, v28, 0x7060302
+; GFX10-NEXT:    v_perm_b32 v3, v3, v29, 0x7060302
+; GFX10-NEXT:    v_perm_b32 v4, v4, v21, 0x7060302
+; GFX10-NEXT:    v_perm_b32 v5, v5, v22, 0x7060302
+; GFX10-NEXT:    v_perm_b32 v6, v6, v23, 0x7060302
+; GFX10-NEXT:    v_perm_b32 v7, v7, v24, 0x7060302
+; GFX10-NEXT:    v_perm_b32 v8, v8, v25, 0x7060302
+; GFX10-NEXT:    v_perm_b32 v9, v9, v51, 0x7060302
+; GFX10-NEXT:    v_perm_b32 v10, v10, v49, 0x7060302
+; GFX10-NEXT:    v_perm_b32 v11, v11, v39, 0x7060302
+; GFX10-NEXT:    v_perm_b32 v12, v12, v37, 0x7060302
+; GFX10-NEXT:    v_perm_b32 v13, v13, v35, 0x7060302
+; GFX10-NEXT:    v_perm_b32 v14, v14, v33, 0x7060302
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_lshlrev_b32_e32 v21, 16, v31
-; GFX10-NEXT:    v_and_b32_e32 v22, 0xffff0000, v31
-; GFX10-NEXT:    v_max_f32_e32 v20, v20, v21
-; GFX10-NEXT:    v_max_f32_e32 v15, v15, v22
-; GFX10-NEXT:    v_perm_b32 v15, v15, v20, 0x7060302
+; GFX10-NEXT:    v_lshlrev_b32_e32 v16, 16, v31
+; GFX10-NEXT:    v_and_b32_e32 v17, 0xffff0000, v31
+; GFX10-NEXT:    v_max_f32_e32 v16, v32, v16
+; GFX10-NEXT:    v_max_f32_e32 v15, v15, v17
+; GFX10-NEXT:    v_perm_b32 v15, v15, v16, 0x7060302
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_maxnum_v32bf16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    scratch_load_b32 v31, off, s32
-; GFX11-NEXT:    v_lshlrev_b32_e32 v68, 16, v26
-; GFX11-NEXT:    v_lshlrev_b32_e32 v69, 16, v10
-; GFX11-NEXT:    v_and_b32_e32 v26, 0xffff0000, v26
-; GFX11-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
-; GFX11-NEXT:    v_lshlrev_b32_e32 v70, 16, v27
-; GFX11-NEXT:    v_lshlrev_b32_e32 v71, 16, v11
-; GFX11-NEXT:    v_lshlrev_b32_e32 v50, 16, v21
-; GFX11-NEXT:    v_lshlrev_b32_e32 v54, 16, v23
-; GFX11-NEXT:    v_lshlrev_b32_e32 v55, 16, v7
-; GFX11-NEXT:    v_lshlrev_b32_e32 v64, 16, v24
-; GFX11-NEXT:    v_lshlrev_b32_e32 v65, 16, v8
-; GFX11-NEXT:    v_and_b32_e32 v24, 0xffff0000, v24
-; GFX11-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
-; GFX11-NEXT:    v_lshlrev_b32_e32 v51, 16, v5
-; GFX11-NEXT:    v_dual_max_f32 v10, v10, v26 :: v_dual_and_b32 v5, 0xffff0000, v5
-; GFX11-NEXT:    v_lshlrev_b32_e32 v66, 16, v25
+; GFX11-NEXT:    v_lshlrev_b32_e32 v83, 16, v17
+; GFX11-NEXT:    v_lshlrev_b32_e32 v84, 16, v1
+; GFX11-NEXT:    v_and_b32_e32 v17, 0xffff0000, v17
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX11-NEXT:    v_lshlrev_b32_e32 v85, 16, v16
+; GFX11-NEXT:    v_lshlrev_b32_e32 v86, 16, v0
+; GFX11-NEXT:    v_and_b32_e32 v16, 0xffff0000, v16
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v54, 16, v8
+; GFX11-NEXT:    v_lshlrev_b32_e32 v64, 16, v7
+; GFX11-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
+; GFX11-NEXT:    v_lshlrev_b32_e32 v65, 16, v22
+; GFX11-NEXT:    v_lshlrev_b32_e32 v66, 16, v6
+; GFX11-NEXT:    v_lshlrev_b32_e32 v48, 16, v11
+; GFX11-NEXT:    v_dual_max_f32 v0, v0, v16 :: v_dual_and_b32 v11, 0xffff0000, v11
+; GFX11-NEXT:    v_and_b32_e32 v22, 0xffff0000, v22
+; GFX11-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
+; GFX11-NEXT:    v_lshlrev_b32_e32 v67, 16, v21
+; GFX11-NEXT:    v_lshlrev_b32_e32 v68, 16, v5
+; GFX11-NEXT:    v_lshlrev_b32_e32 v51, 16, v25
+; GFX11-NEXT:    v_and_b32_e32 v21, 0xffff0000, v21
+; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
+; GFX11-NEXT:    v_lshlrev_b32_e32 v69, 16, v20
+; GFX11-NEXT:    v_lshlrev_b32_e32 v70, 16, v4
+; GFX11-NEXT:    v_and_b32_e32 v20, 0xffff0000, v20
+; GFX11-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GFX11-NEXT:    v_lshlrev_b32_e32 v55, 16, v23
+; GFX11-NEXT:    v_lshlrev_b32_e32 v71, 16, v19
+; GFX11-NEXT:    v_lshlrev_b32_e32 v80, 16, v3
 ; GFX11-NEXT:    v_and_b32_e32 v25, 0xffff0000, v25
-; GFX11-NEXT:    v_and_b32_e32 v27, 0xffff0000, v27
-; GFX11-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
-; GFX11-NEXT:    v_lshlrev_b32_e32 v80, 16, v28
-; GFX11-NEXT:    v_lshlrev_b32_e32 v81, 16, v12
-; GFX11-NEXT:    v_lshlrev_b32_e32 v52, 16, v22
+; GFX11-NEXT:    v_and_b32_e32 v19, 0xffff0000, v19
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX11-NEXT:    v_lshlrev_b32_e32 v52, 16, v9
+; GFX11-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
+; GFX11-NEXT:    v_lshlrev_b32_e32 v81, 16, v18
+; GFX11-NEXT:    v_lshlrev_b32_e32 v82, 16, v2
+; GFX11-NEXT:    v_and_b32_e32 v18, 0xffff0000, v18
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX11-NEXT:    v_lshlrev_b32_e32 v53, 16, v24
+; GFX11-NEXT:    v_dual_max_f32 v1, v1, v17 :: v_dual_and_b32 v24, 0xffff0000, v24
+; GFX11-NEXT:    v_dual_max_f32 v5, v5, v21 :: v_dual_lshlrev_b32 v50, 16, v10
+; GFX11-NEXT:    v_dual_max_f32 v21, v70, v69 :: v_dual_and_b32 v10, 0xffff0000, v10
+; GFX11-NEXT:    v_dual_max_f32 v2, v2, v18 :: v_dual_max_f32 v3, v3, v19
+; GFX11-NEXT:    v_dual_max_f32 v4, v4, v20 :: v_dual_lshlrev_b32 v49, 16, v26
+; GFX11-NEXT:    v_dual_max_f32 v9, v9, v25 :: v_dual_and_b32 v26, 0xffff0000, v26
+; GFX11-NEXT:    v_max_f32_e32 v6, v6, v22
+; GFX11-NEXT:    v_dual_max_f32 v22, v68, v67 :: v_dual_lshlrev_b32 v37, 16, v28
 ; GFX11-NEXT:    v_and_b32_e32 v28, 0xffff0000, v28
-; GFX11-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
-; GFX11-NEXT:    v_lshlrev_b32_e32 v53, 16, v6
-; GFX11-NEXT:    v_lshlrev_b32_e32 v82, 16, v29
-; GFX11-NEXT:    v_lshlrev_b32_e32 v83, 16, v13
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_max_f32_e32 v10, v10, v26
+; GFX11-NEXT:    v_max_f32_e32 v26, v52, v51
+; GFX11-NEXT:    v_perm_b32 v4, v4, v21, 0x7060302
+; GFX11-NEXT:    v_max_f32_e32 v25, v54, v53
+; GFX11-NEXT:    v_perm_b32 v5, v5, v22, 0x7060302
+; GFX11-NEXT:    v_perm_b32 v9, v9, v26, 0x7060302
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v16, 16, v31
 ; GFX11-NEXT:    v_and_b32_e32 v23, 0xffff0000, v23
-; GFX11-NEXT:    v_and_b32_e32 v29, 0xffff0000, v29
+; GFX11-NEXT:    v_and_b32_e32 v17, 0xffff0000, v31
+; GFX11-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
+; GFX11-NEXT:    v_lshlrev_b32_e32 v36, 16, v13
 ; GFX11-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
-; GFX11-NEXT:    v_lshlrev_b32_e32 v84, 16, v30
-; GFX11-NEXT:    v_lshlrev_b32_e32 v85, 16, v14
-; GFX11-NEXT:    v_and_b32_e32 v22, 0xffff0000, v22
-; GFX11-NEXT:    v_and_b32_e32 v30, 0xffff0000, v30
-; GFX11-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
-; GFX11-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
-; GFX11-NEXT:    v_lshlrev_b32_e32 v86, 16, v15
-; GFX11-NEXT:    v_lshlrev_b32_e32 v67, 16, v9
-; GFX11-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
-; GFX11-NEXT:    v_lshlrev_b32_e32 v48, 16, v20
-; GFX11-NEXT:    v_dual_max_f32 v11, v11, v27 :: v_dual_and_b32 v20, 0xffff0000, v20
-; GFX11-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
-; GFX11-NEXT:    v_dual_max_f32 v26, v71, v70 :: v_dual_lshlrev_b32 v49, 16, v4
-; GFX11-NEXT:    v_dual_max_f32 v13, v13, v29 :: v_dual_and_b32 v4, 0xffff0000, v4
-; GFX11-NEXT:    v_lshlrev_b32_e32 v35, 16, v1
-; GFX11-NEXT:    v_lshlrev_b32_e32 v37, 16, v2
-; GFX11-NEXT:    v_lshlrev_b32_e32 v38, 16, v19
+; GFX11-NEXT:    v_lshlrev_b32_e32 v39, 16, v27
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT:    v_max_f32_e32 v4, v4, v20
-; GFX11-NEXT:    v_dual_max_f32 v8, v8, v24 :: v_dual_max_f32 v9, v9, v25
-; GFX11-NEXT:    v_max_f32_e32 v25, v69, v68
-; GFX11-NEXT:    v_dual_max_f32 v20, v51, v50 :: v_dual_lshlrev_b32 v39, 16, v3
-; GFX11-NEXT:    v_max_f32_e32 v27, v81, v80
-; GFX11-NEXT:    v_max_f32_e32 v12, v12, v28
-; GFX11-NEXT:    v_dual_max_f32 v28, v83, v82 :: v_dual_max_f32 v29, v85, v84
-; GFX11-NEXT:    v_dual_max_f32 v6, v6, v22 :: v_dual_and_b32 v3, 0xffff0000, v3
-; GFX11-NEXT:    v_max_f32_e32 v22, v55, v54
-; GFX11-NEXT:    v_lshlrev_b32_e32 v36, 16, v18
-; GFX11-NEXT:    v_lshlrev_b32_e32 v34, 16, v17
-; GFX11-NEXT:    v_and_b32_e32 v17, 0xffff0000, v17
-; GFX11-NEXT:    v_and_b32_e32 v18, 0xffff0000, v18
+; GFX11-NEXT:    v_dual_max_f32 v8, v8, v24 :: v_dual_and_b32 v27, 0xffff0000, v27
+; GFX11-NEXT:    v_max_f32_e32 v24, v64, v55
+; GFX11-NEXT:    v_lshlrev_b32_e32 v38, 16, v12
+; GFX11-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
+; GFX11-NEXT:    v_lshlrev_b32_e32 v35, 16, v29
+; GFX11-NEXT:    v_max_f32_e32 v7, v7, v23
+; GFX11-NEXT:    v_max_f32_e32 v23, v66, v65
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_dual_max_f32 v12, v12, v28 :: v_dual_and_b32 v29, 0xffff0000, v29
+; GFX11-NEXT:    v_dual_max_f32 v28, v48, v39 :: v_dual_lshlrev_b32 v33, 16, v30
+; GFX11-NEXT:    v_dual_max_f32 v13, v13, v29 :: v_dual_lshlrev_b32 v34, 16, v14
+; GFX11-NEXT:    v_lshlrev_b32_e32 v32, 16, v15
+; GFX11-NEXT:    v_dual_max_f32 v11, v11, v27 :: v_dual_and_b32 v14, 0xffff0000, v14
+; GFX11-NEXT:    v_dual_max_f32 v27, v50, v49 :: v_dual_and_b32 v30, 0xffff0000, v30
+; GFX11-NEXT:    v_max_f32_e32 v29, v38, v37
+; GFX11-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
+; GFX11-NEXT:    v_max_f32_e32 v37, v86, v85
+; GFX11-NEXT:    v_perm_b32 v6, v6, v23, 0x7060302
 ; GFX11-NEXT:    v_max_f32_e32 v14, v14, v30
-; GFX11-NEXT:    v_dual_max_f32 v7, v7, v23 :: v_dual_and_b32 v2, 0xffff0000, v2
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX11-NEXT:    v_max_f32_e32 v23, v65, v64
-; GFX11-NEXT:    v_and_b32_e32 v19, 0xffff0000, v19
-; GFX11-NEXT:    v_dual_max_f32 v24, v67, v66 :: v_dual_and_b32 v21, 0xffff0000, v21
-; GFX11-NEXT:    v_max_f32_e32 v2, v2, v18
-; GFX11-NEXT:    v_dual_max_f32 v1, v1, v17 :: v_dual_lshlrev_b32 v32, 16, v16
-; GFX11-NEXT:    v_max_f32_e32 v18, v39, v38
-; GFX11-NEXT:    v_dual_max_f32 v3, v3, v19 :: v_dual_and_b32 v16, 0xffff0000, v16
-; GFX11-NEXT:    v_max_f32_e32 v19, v49, v48
-; GFX11-NEXT:    v_max_f32_e32 v17, v37, v36
-; GFX11-NEXT:    v_lshlrev_b32_e32 v33, 16, v0
-; GFX11-NEXT:    v_dual_max_f32 v5, v5, v21 :: v_dual_and_b32 v0, 0xffff0000, v0
-; GFX11-NEXT:    v_max_f32_e32 v21, v53, v52
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_perm_b32 v2, v2, v17, 0x7060302
-; GFX11-NEXT:    v_perm_b32 v3, v3, v18, 0x7060302
-; GFX11-NEXT:    v_max_f32_e32 v0, v0, v16
-; GFX11-NEXT:    v_max_f32_e32 v16, v35, v34
-; GFX11-NEXT:    v_max_f32_e32 v32, v33, v32
-; GFX11-NEXT:    v_perm_b32 v4, v4, v19, 0x7060302
-; GFX11-NEXT:    v_perm_b32 v5, v5, v20, 0x7060302
-; GFX11-NEXT:    v_perm_b32 v6, v6, v21, 0x7060302
-; GFX11-NEXT:    v_perm_b32 v1, v1, v16, 0x7060302
-; GFX11-NEXT:    v_perm_b32 v0, v0, v32, 0x7060302
-; GFX11-NEXT:    v_perm_b32 v7, v7, v22, 0x7060302
-; GFX11-NEXT:    v_perm_b32 v8, v8, v23, 0x7060302
-; GFX11-NEXT:    v_perm_b32 v9, v9, v24, 0x7060302
-; GFX11-NEXT:    v_perm_b32 v10, v10, v25, 0x7060302
-; GFX11-NEXT:    v_perm_b32 v11, v11, v26, 0x7060302
-; GFX11-NEXT:    v_perm_b32 v12, v12, v27, 0x7060302
-; GFX11-NEXT:    v_perm_b32 v13, v13, v28, 0x7060302
-; GFX11-NEXT:    v_perm_b32 v14, v14, v29, 0x7060302
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_lshlrev_b32_e32 v16, 16, v31
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_dual_max_f32 v16, v86, v16 :: v_dual_and_b32 v17, 0xffff0000, v31
-; GFX11-NEXT:    v_max_f32_e32 v15, v15, v17
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_dual_max_f32 v30, v36, v35 :: v_dual_max_f32 v33, v34, v33
+; GFX11-NEXT:    v_dual_max_f32 v34, v80, v71 :: v_dual_max_f32 v35, v82, v81
+; GFX11-NEXT:    v_max_f32_e32 v36, v84, v83
+; GFX11-NEXT:    v_dual_max_f32 v16, v32, v16 :: v_dual_max_f32 v15, v15, v17
+; GFX11-NEXT:    v_perm_b32 v0, v0, v37, 0x7060302
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_perm_b32 v2, v2, v35, 0x7060302
+; GFX11-NEXT:    v_perm_b32 v1, v1, v36, 0x7060302
+; GFX11-NEXT:    v_perm_b32 v3, v3, v34, 0x7060302
+; GFX11-NEXT:    v_perm_b32 v7, v7, v24, 0x7060302
+; GFX11-NEXT:    v_perm_b32 v8, v8, v25, 0x7060302
+; GFX11-NEXT:    v_perm_b32 v10, v10, v27, 0x7060302
+; GFX11-NEXT:    v_perm_b32 v11, v11, v28, 0x7060302
+; GFX11-NEXT:    v_perm_b32 v12, v12, v29, 0x7060302
+; GFX11-NEXT:    v_perm_b32 v13, v13, v30, 0x7060302
+; GFX11-NEXT:    v_perm_b32 v14, v14, v33, 0x7060302
 ; GFX11-NEXT:    v_perm_b32 v15, v15, v16, 0x7060302
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = call <32 x bfloat> @llvm.maxnum.v32bf16(<32 x bfloat> %a, <32 x bfloat> %b)
@@ -21043,13 +20861,13 @@ define <3 x i16> @v_fptosi_v3bf16_to_v3i16(<3 x bfloat> %x) {
 ; GFX8-LABEL: v_fptosi_v3bf16_to_v3i16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v0
-; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX8-NEXT:    v_cvt_i32_f32_sdwa v2, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
-; GFX8-NEXT:    v_cvt_i32_f32_e32 v0, v0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
+; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    v_cvt_i32_f32_e32 v2, v2
+; GFX8-NEXT:    v_cvt_i32_f32_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX8-NEXT:    v_cvt_i32_f32_e32 v1, v1
-; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_fptosi_v3bf16_to_v3i16:
@@ -21140,65 +20958,65 @@ define <4 x i16> @v_fptosi_v4bf16_to_v4i16(<4 x bfloat> %x) {
 ; GFX8-LABEL: v_fptosi_v4bf16_to_v4i16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v0
-; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v1
-; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX8-NEXT:    v_cvt_i32_f32_sdwa v2, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
-; GFX8-NEXT:    v_cvt_i32_f32_e32 v0, v0
-; GFX8-NEXT:    v_cvt_i32_f32_sdwa v3, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
-; GFX8-NEXT:    v_cvt_i32_f32_e32 v1, v1
-; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX8-NEXT:    v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
+; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
+; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    v_cvt_i32_f32_e32 v2, v2
+; GFX8-NEXT:    v_cvt_i32_f32_e32 v3, v3
+; GFX8-NEXT:    v_cvt_i32_f32_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+; GFX8-NEXT:    v_cvt_i32_f32_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+; GFX8-NEXT:    v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_fptosi_v4bf16_to_v4i16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
-; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 16, v1
+; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
 ; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GFX9-NEXT:    v_cvt_i32_f32_e32 v2, v2
-; GFX9-NEXT:    v_cvt_i32_f32_e32 v0, v0
 ; GFX9-NEXT:    v_cvt_i32_f32_e32 v3, v3
+; GFX9-NEXT:    v_cvt_i32_f32_e32 v0, v0
 ; GFX9-NEXT:    v_cvt_i32_f32_e32 v1, v1
 ; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
-; GFX9-NEXT:    v_perm_b32 v0, v0, v2, s4
-; GFX9-NEXT:    v_perm_b32 v1, v1, v3, s4
+; GFX9-NEXT:    v_perm_b32 v0, v0, v3, s4
+; GFX9-NEXT:    v_perm_b32 v1, v1, v2, s4
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fptosi_v4bf16_to_v4i16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
+; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
 ; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v1
 ; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX10-NEXT:    v_cvt_i32_f32_e32 v2, v2
-; GFX10-NEXT:    v_cvt_i32_f32_e32 v0, v0
 ; GFX10-NEXT:    v_cvt_i32_f32_e32 v3, v3
+; GFX10-NEXT:    v_cvt_i32_f32_e32 v0, v0
 ; GFX10-NEXT:    v_cvt_i32_f32_e32 v1, v1
-; GFX10-NEXT:    v_perm_b32 v0, v0, v2, 0x5040100
-; GFX10-NEXT:    v_perm_b32 v1, v1, v3, 0x5040100
+; GFX10-NEXT:    v_perm_b32 v0, v0, v3, 0x5040100
+; GFX10-NEXT:    v_perm_b32 v1, v1, v2, 0x5040100
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fptosi_v4bf16_to_v4i16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
+; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
 ; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v1
 ; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX11-NEXT:    v_cvt_i32_f32_e32 v2, v2
-; GFX11-NEXT:    v_cvt_i32_f32_e32 v0, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX11-NEXT:    v_cvt_i32_f32_e32 v3, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_cvt_i32_f32_e32 v0, v0
 ; GFX11-NEXT:    v_cvt_i32_f32_e32 v1, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_perm_b32 v0, v0, v2, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v1, v1, v3, 0x5040100
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_perm_b32 v0, v0, v3, 0x5040100
+; GFX11-NEXT:    v_perm_b32 v1, v1, v2, 0x5040100
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = fptosi <4 x bfloat> %x to <4 x i16>
   ret <4 x i16> %op
@@ -22538,13 +22356,11 @@ define <3 x bfloat> @v_sitofp_v3i16_to_v3bf16(<3 x i16> %x) {
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_cvt_f32_i32_sdwa v2, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX8-NEXT:    v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
-; GFX8-NEXT:    v_cvt_f32_i32_sdwa v3, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX8-NEXT:    v_cvt_f32_i32_sdwa v1, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+; GFX8-NEXT:    v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
 ; GFX8-NEXT:    v_alignbit_b32 v0, v2, v0, 16
-; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v3
-; GFX8-NEXT:    v_alignbit_b32 v1, v2, v1, 16
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_sitofp_v3i16_to_v3bf16:
@@ -22552,11 +22368,10 @@ define <3 x bfloat> @v_sitofp_v3i16_to_v3bf16(<3 x i16> %x) {
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_cvt_f32_i32_sdwa v2, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX9-NEXT:    v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
-; GFX9-NEXT:    v_cvt_f32_i32_sdwa v3, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX9-NEXT:    v_cvt_f32_i32_sdwa v1, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
 ; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
 ; GFX9-NEXT:    v_perm_b32 v0, v2, v0, s4
-; GFX9-NEXT:    v_perm_b32 v1, v3, v1, s4
+; GFX9-NEXT:    v_alignbit_b32 v1, s4, v1, 16
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_sitofp_v3i16_to_v3bf16:
@@ -22564,29 +22379,10 @@ define <3 x bfloat> @v_sitofp_v3i16_to_v3bf16(<3 x i16> %x) {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_cvt_f32_i32_sdwa v2, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX10-NEXT:    v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
-; GFX10-NEXT:    v_cvt_f32_i32_sdwa v3, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX10-NEXT:    v_cvt_f32_i32_sdwa v1, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
 ; GFX10-NEXT:    v_perm_b32 v0, v2, v0, 0x7060302
-; GFX10-NEXT:    v_perm_b32 v1, v3, v1, 0x7060302
+; GFX10-NEXT:    v_alignbit_b32 v1, s4, v1, 16
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: v_sitofp_v3i16_to_v3bf16:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_ashrrev_i32_e32 v2, 16, v0
-; GFX11-NEXT:    v_bfe_i32 v0, v0, 0, 16
-; GFX11-NEXT:    v_ashrrev_i32_e32 v3, 16, v1
-; GFX11-NEXT:    v_bfe_i32 v1, v1, 0, 16
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_cvt_f32_i32_e32 v2, v2
-; GFX11-NEXT:    v_cvt_f32_i32_e32 v0, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_cvt_f32_i32_e32 v3, v3
-; GFX11-NEXT:    v_cvt_f32_i32_e32 v1, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_perm_b32 v0, v2, v0, 0x7060302
-; GFX11-NEXT:    v_perm_b32 v1, v3, v1, 0x7060302
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = sitofp <3 x i16> %x to <3 x bfloat>
   ret <3 x bfloat> %op
 }
@@ -22629,55 +22425,55 @@ define <4 x bfloat> @v_sitofp_v4i16_to_v4bf16(<4 x i16> %x) {
 ; GFX8-LABEL: v_sitofp_v4i16_to_v4bf16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_cvt_f32_i32_sdwa v2, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX8-NEXT:    v_cvt_f32_i32_sdwa v2, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX8-NEXT:    v_cvt_f32_i32_sdwa v3, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX8-NEXT:    v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
-; GFX8-NEXT:    v_cvt_f32_i32_sdwa v3, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX8-NEXT:    v_cvt_f32_i32_sdwa v1, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
-; GFX8-NEXT:    v_alignbit_b32 v0, v2, v0, 16
-; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v3
+; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX8-NEXT:    v_alignbit_b32 v0, v3, v0, 16
 ; GFX8-NEXT:    v_alignbit_b32 v1, v2, v1, 16
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_sitofp_v4i16_to_v4bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_cvt_f32_i32_sdwa v2, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX9-NEXT:    v_cvt_f32_i32_sdwa v2, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX9-NEXT:    v_cvt_f32_i32_sdwa v3, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX9-NEXT:    v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
-; GFX9-NEXT:    v_cvt_f32_i32_sdwa v3, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX9-NEXT:    v_cvt_f32_i32_sdwa v1, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
 ; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
-; GFX9-NEXT:    v_perm_b32 v0, v2, v0, s4
-; GFX9-NEXT:    v_perm_b32 v1, v3, v1, s4
+; GFX9-NEXT:    v_perm_b32 v0, v3, v0, s4
+; GFX9-NEXT:    v_perm_b32 v1, v2, v1, s4
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_sitofp_v4i16_to_v4bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_cvt_f32_i32_sdwa v2, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX10-NEXT:    v_cvt_f32_i32_sdwa v2, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX10-NEXT:    v_cvt_f32_i32_sdwa v3, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX10-NEXT:    v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
-; GFX10-NEXT:    v_cvt_f32_i32_sdwa v3, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX10-NEXT:    v_cvt_f32_i32_sdwa v1, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
-; GFX10-NEXT:    v_perm_b32 v0, v2, v0, 0x7060302
-; GFX10-NEXT:    v_perm_b32 v1, v3, v1, 0x7060302
+; GFX10-NEXT:    v_perm_b32 v0, v3, v0, 0x7060302
+; GFX10-NEXT:    v_perm_b32 v1, v2, v1, 0x7060302
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_sitofp_v4i16_to_v4bf16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_ashrrev_i32_e32 v2, 16, v0
+; GFX11-NEXT:    v_ashrrev_i32_e32 v2, 16, v1
+; GFX11-NEXT:    v_ashrrev_i32_e32 v3, 16, v0
 ; GFX11-NEXT:    v_bfe_i32 v0, v0, 0, 16
-; GFX11-NEXT:    v_ashrrev_i32_e32 v3, 16, v1
 ; GFX11-NEXT:    v_bfe_i32 v1, v1, 0, 16
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX11-NEXT:    v_cvt_f32_i32_e32 v2, v2
-; GFX11-NEXT:    v_cvt_f32_i32_e32 v0, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX11-NEXT:    v_cvt_f32_i32_e32 v3, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_cvt_f32_i32_e32 v0, v0
 ; GFX11-NEXT:    v_cvt_f32_i32_e32 v1, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_perm_b32 v0, v2, v0, 0x7060302
-; GFX11-NEXT:    v_perm_b32 v1, v3, v1, 0x7060302
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_perm_b32 v0, v3, v0, 0x7060302
+; GFX11-NEXT:    v_perm_b32 v1, v2, v1, 0x7060302
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = sitofp <4 x i16> %x to <4 x bfloat>
   ret <4 x bfloat> %op
@@ -22813,12 +22609,12 @@ define <3 x bfloat> @v_sitofp_v3i32_to_v3bf16(<3 x i32> %x) {
 ; GFX8-LABEL: v_sitofp_v3i32_to_v3bf16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_cvt_f32_i32_e32 v1, v1
-; GFX8-NEXT:    v_cvt_f32_i32_e32 v0, v0
 ; GFX8-NEXT:    v_cvt_f32_i32_e32 v2, v2
-; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX8-NEXT:    v_alignbit_b32 v0, v1, v0, 16
+; GFX8-NEXT:    v_cvt_f32_i32_e32 v3, v1
+; GFX8-NEXT:    v_cvt_f32_i32_e32 v0, v0
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
+; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v3
+; GFX8-NEXT:    v_alignbit_b32 v0, v2, v0, 16
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_sitofp_v3i32_to_v3bf16:
@@ -22875,22 +22671,22 @@ define <4 x bfloat> @v_sitofp_v4i32_to_v4bf16(<4 x i32> %x) {
 ; GFX8-LABEL: v_sitofp_v4i32_to_v4bf16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_cvt_f32_i32_e32 v3, v3
 ; GFX8-NEXT:    v_cvt_f32_i32_e32 v1, v1
 ; GFX8-NEXT:    v_cvt_f32_i32_e32 v0, v0
-; GFX8-NEXT:    v_cvt_f32_i32_e32 v3, v3
 ; GFX8-NEXT:    v_cvt_f32_i32_e32 v2, v2
+; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
 ; GFX8-NEXT:    v_alignbit_b32 v0, v1, v0, 16
-; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v3
-; GFX8-NEXT:    v_alignbit_b32 v1, v1, v2, 16
+; GFX8-NEXT:    v_alignbit_b32 v1, v3, v2, 16
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_sitofp_v4i32_to_v4bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_cvt_f32_i32_e32 v2, v2
 ; GFX9-NEXT:    v_cvt_f32_i32_e32 v0, v0
 ; GFX9-NEXT:    v_cvt_f32_i32_e32 v1, v1
-; GFX9-NEXT:    v_cvt_f32_i32_e32 v2, v2
 ; GFX9-NEXT:    v_cvt_f32_i32_e32 v3, v3
 ; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
 ; GFX9-NEXT:    v_perm_b32 v0, v1, v0, s4
@@ -22900,9 +22696,9 @@ define <4 x bfloat> @v_sitofp_v4i32_to_v4bf16(<4 x i32> %x) {
 ; GFX10-LABEL: v_sitofp_v4i32_to_v4bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_cvt_f32_i32_e32 v2, v2
 ; GFX10-NEXT:    v_cvt_f32_i32_e32 v0, v0
 ; GFX10-NEXT:    v_cvt_f32_i32_e32 v1, v1
-; GFX10-NEXT:    v_cvt_f32_i32_e32 v2, v2
 ; GFX10-NEXT:    v_cvt_f32_i32_e32 v3, v3
 ; GFX10-NEXT:    v_perm_b32 v0, v1, v0, 0x7060302
 ; GFX10-NEXT:    v_perm_b32 v1, v3, v2, 0x7060302
@@ -22911,11 +22707,11 @@ define <4 x bfloat> @v_sitofp_v4i32_to_v4bf16(<4 x i32> %x) {
 ; GFX11-LABEL: v_sitofp_v4i32_to_v4bf16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_cvt_f32_i32_e32 v2, v2
 ; GFX11-NEXT:    v_cvt_f32_i32_e32 v0, v0
 ; GFX11-NEXT:    v_cvt_f32_i32_e32 v1, v1
-; GFX11-NEXT:    v_cvt_f32_i32_e32 v2, v2
 ; GFX11-NEXT:    v_cvt_f32_i32_e32 v3, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x7060302
 ; GFX11-NEXT:    v_perm_b32 v1, v3, v2, 0x7060302
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
@@ -23330,130 +23126,130 @@ define <3 x bfloat> @v_sitofp_v3i64_to_v3bf16(<3 x i64> %x) {
 ; GFX8-LABEL: v_sitofp_v3i64_to_v3bf16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_xor_b32_e32 v7, v0, v1
-; GFX8-NEXT:    v_ffbh_i32_e32 v6, v1
+; GFX8-NEXT:    v_xor_b32_e32 v7, v4, v5
+; GFX8-NEXT:    v_ffbh_i32_e32 v6, v5
 ; GFX8-NEXT:    v_ashrrev_i32_e32 v7, 31, v7
 ; GFX8-NEXT:    v_add_u32_e32 v6, vcc, -1, v6
 ; GFX8-NEXT:    v_add_u32_e32 v7, vcc, 32, v7
 ; GFX8-NEXT:    v_min_u32_e32 v6, v6, v7
-; GFX8-NEXT:    v_lshlrev_b64 v[0:1], v6, v[0:1]
-; GFX8-NEXT:    v_min_u32_e32 v0, 1, v0
-; GFX8-NEXT:    v_or_b32_e32 v0, v1, v0
-; GFX8-NEXT:    v_xor_b32_e32 v1, v2, v3
-; GFX8-NEXT:    v_cvt_f32_i32_e32 v7, v0
-; GFX8-NEXT:    v_ffbh_i32_e32 v0, v3
-; GFX8-NEXT:    v_ashrrev_i32_e32 v1, 31, v1
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, -1, v0
-; GFX8-NEXT:    v_add_u32_e32 v1, vcc, 32, v1
-; GFX8-NEXT:    v_min_u32_e32 v8, v0, v1
-; GFX8-NEXT:    v_lshlrev_b64 v[0:1], v8, v[2:3]
-; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, 32, v6
-; GFX8-NEXT:    v_min_u32_e32 v0, 1, v0
-; GFX8-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX8-NEXT:    v_lshlrev_b64 v[4:5], v6, v[4:5]
+; GFX8-NEXT:    v_min_u32_e32 v4, 1, v4
+; GFX8-NEXT:    v_or_b32_e32 v4, v5, v4
+; GFX8-NEXT:    v_cvt_f32_i32_e32 v4, v4
+; GFX8-NEXT:    v_sub_u32_e32 v5, vcc, 32, v6
+; GFX8-NEXT:    v_ldexp_f32 v6, v4, v5
+; GFX8-NEXT:    v_xor_b32_e32 v5, v0, v1
+; GFX8-NEXT:    v_ffbh_i32_e32 v4, v1
+; GFX8-NEXT:    v_ashrrev_i32_e32 v5, 31, v5
+; GFX8-NEXT:    v_add_u32_e32 v4, vcc, -1, v4
+; GFX8-NEXT:    v_add_u32_e32 v5, vcc, 32, v5
+; GFX8-NEXT:    v_min_u32_e32 v7, v4, v5
+; GFX8-NEXT:    v_lshlrev_b64 v[4:5], v7, v[0:1]
+; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v6
+; GFX8-NEXT:    v_min_u32_e32 v0, 1, v4
+; GFX8-NEXT:    v_or_b32_e32 v0, v5, v0
+; GFX8-NEXT:    v_xor_b32_e32 v5, v2, v3
+; GFX8-NEXT:    v_ffbh_i32_e32 v4, v3
+; GFX8-NEXT:    v_ashrrev_i32_e32 v5, 31, v5
+; GFX8-NEXT:    v_add_u32_e32 v4, vcc, -1, v4
+; GFX8-NEXT:    v_add_u32_e32 v5, vcc, 32, v5
+; GFX8-NEXT:    v_min_u32_e32 v4, v4, v5
+; GFX8-NEXT:    v_lshlrev_b64 v[2:3], v4, v[2:3]
 ; GFX8-NEXT:    v_cvt_f32_i32_e32 v0, v0
-; GFX8-NEXT:    v_sub_u32_e32 v1, vcc, 32, v8
-; GFX8-NEXT:    v_ldexp_f32 v2, v7, v2
-; GFX8-NEXT:    v_ldexp_f32 v3, v0, v1
-; GFX8-NEXT:    v_xor_b32_e32 v1, v4, v5
-; GFX8-NEXT:    v_ffbh_i32_e32 v0, v5
-; GFX8-NEXT:    v_ashrrev_i32_e32 v1, 31, v1
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, -1, v0
-; GFX8-NEXT:    v_add_u32_e32 v1, vcc, 32, v1
-; GFX8-NEXT:    v_min_u32_e32 v6, v0, v1
-; GFX8-NEXT:    v_lshlrev_b64 v[0:1], v6, v[4:5]
-; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
-; GFX8-NEXT:    v_min_u32_e32 v0, 1, v0
-; GFX8-NEXT:    v_or_b32_e32 v0, v1, v0
-; GFX8-NEXT:    v_cvt_f32_i32_e32 v1, v0
-; GFX8-NEXT:    v_alignbit_b32 v0, v3, v2, 16
-; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, 32, v6
-; GFX8-NEXT:    v_ldexp_f32 v1, v1, v2
-; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX8-NEXT:    v_min_u32_e32 v2, 1, v2
+; GFX8-NEXT:    v_or_b32_e32 v2, v3, v2
+; GFX8-NEXT:    v_cvt_f32_i32_e32 v2, v2
+; GFX8-NEXT:    v_sub_u32_e32 v3, vcc, 32, v4
+; GFX8-NEXT:    v_sub_u32_e32 v5, vcc, 32, v7
+; GFX8-NEXT:    v_ldexp_f32 v2, v2, v3
+; GFX8-NEXT:    v_ldexp_f32 v0, v0, v5
+; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX8-NEXT:    v_alignbit_b32 v0, v2, v0, 16
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_sitofp_v3i64_to_v3bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_xor_b32_e32 v7, v4, v5
+; GFX9-NEXT:    v_ffbh_i32_e32 v6, v5
+; GFX9-NEXT:    v_ashrrev_i32_e32 v7, 31, v7
+; GFX9-NEXT:    v_add_u32_e32 v6, -1, v6
+; GFX9-NEXT:    v_add_u32_e32 v7, 32, v7
+; GFX9-NEXT:    v_min_u32_e32 v6, v6, v7
+; GFX9-NEXT:    v_lshlrev_b64 v[4:5], v6, v[4:5]
 ; GFX9-NEXT:    v_xor_b32_e32 v7, v0, v1
+; GFX9-NEXT:    v_min_u32_e32 v4, 1, v4
+; GFX9-NEXT:    v_or_b32_e32 v4, v5, v4
+; GFX9-NEXT:    v_sub_u32_e32 v5, 32, v6
 ; GFX9-NEXT:    v_ffbh_i32_e32 v6, v1
 ; GFX9-NEXT:    v_ashrrev_i32_e32 v7, 31, v7
 ; GFX9-NEXT:    v_add_u32_e32 v6, -1, v6
 ; GFX9-NEXT:    v_add_u32_e32 v7, 32, v7
+; GFX9-NEXT:    v_cvt_f32_i32_e32 v4, v4
 ; GFX9-NEXT:    v_min_u32_e32 v6, v6, v7
 ; GFX9-NEXT:    v_lshlrev_b64 v[0:1], v6, v[0:1]
-; GFX9-NEXT:    v_sub_u32_e32 v6, 32, v6
+; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
 ; GFX9-NEXT:    v_min_u32_e32 v0, 1, v0
-; GFX9-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX9-NEXT:    v_ldexp_f32 v4, v4, v5
+; GFX9-NEXT:    v_or_b32_e32 v5, v1, v0
 ; GFX9-NEXT:    v_xor_b32_e32 v1, v2, v3
-; GFX9-NEXT:    v_cvt_f32_i32_e32 v7, v0
 ; GFX9-NEXT:    v_ffbh_i32_e32 v0, v3
 ; GFX9-NEXT:    v_ashrrev_i32_e32 v1, 31, v1
 ; GFX9-NEXT:    v_add_u32_e32 v0, -1, v0
 ; GFX9-NEXT:    v_add_u32_e32 v1, 32, v1
-; GFX9-NEXT:    v_min_u32_e32 v8, v0, v1
-; GFX9-NEXT:    v_lshlrev_b64 v[0:1], v8, v[2:3]
-; GFX9-NEXT:    v_ldexp_f32 v2, v7, v6
-; GFX9-NEXT:    v_min_u32_e32 v0, 1, v0
-; GFX9-NEXT:    v_or_b32_e32 v0, v1, v0
-; GFX9-NEXT:    v_xor_b32_e32 v1, v4, v5
-; GFX9-NEXT:    v_cvt_f32_i32_e32 v3, v0
-; GFX9-NEXT:    v_ffbh_i32_e32 v0, v5
-; GFX9-NEXT:    v_ashrrev_i32_e32 v1, 31, v1
-; GFX9-NEXT:    v_add_u32_e32 v0, -1, v0
-; GFX9-NEXT:    v_add_u32_e32 v1, 32, v1
 ; GFX9-NEXT:    v_min_u32_e32 v7, v0, v1
-; GFX9-NEXT:    v_lshlrev_b64 v[0:1], v7, v[4:5]
-; GFX9-NEXT:    v_sub_u32_e32 v6, 32, v8
+; GFX9-NEXT:    v_lshlrev_b64 v[0:1], v7, v[2:3]
+; GFX9-NEXT:    v_cvt_f32_i32_e32 v2, v5
 ; GFX9-NEXT:    v_min_u32_e32 v0, 1, v0
 ; GFX9-NEXT:    v_or_b32_e32 v0, v1, v0
-; GFX9-NEXT:    v_cvt_f32_i32_e32 v1, v0
-; GFX9-NEXT:    v_ldexp_f32 v3, v3, v6
-; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
-; GFX9-NEXT:    v_perm_b32 v0, v3, v2, s4
+; GFX9-NEXT:    v_cvt_f32_i32_e32 v0, v0
+; GFX9-NEXT:    v_sub_u32_e32 v1, 32, v6
+; GFX9-NEXT:    v_ldexp_f32 v1, v2, v1
 ; GFX9-NEXT:    v_sub_u32_e32 v2, 32, v7
-; GFX9-NEXT:    v_ldexp_f32 v1, v1, v2
-; GFX9-NEXT:    v_alignbit_b32 v1, s4, v1, 16
+; GFX9-NEXT:    v_ldexp_f32 v0, v0, v2
+; GFX9-NEXT:    v_perm_b32 v0, v0, v1, s4
+; GFX9-NEXT:    v_alignbit_b32 v1, s4, v4, 16
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_sitofp_v3i64_to_v3bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_xor_b32_e32 v7, v0, v1
-; GFX10-NEXT:    v_xor_b32_e32 v8, v2, v3
-; GFX10-NEXT:    v_xor_b32_e32 v9, v4, v5
-; GFX10-NEXT:    v_ffbh_i32_e32 v6, v1
-; GFX10-NEXT:    v_ffbh_i32_e32 v10, v3
+; GFX10-NEXT:    v_xor_b32_e32 v8, v4, v5
+; GFX10-NEXT:    v_xor_b32_e32 v9, v2, v3
+; GFX10-NEXT:    v_ffbh_i32_e32 v6, v5
+; GFX10-NEXT:    v_ffbh_i32_e32 v10, v1
 ; GFX10-NEXT:    v_ashrrev_i32_e32 v7, 31, v7
-; GFX10-NEXT:    v_ashrrev_i32_e32 v8, 31, v8
-; GFX10-NEXT:    v_ffbh_i32_e32 v11, v5
+; GFX10-NEXT:    v_ffbh_i32_e32 v11, v3
 ; GFX10-NEXT:    v_ashrrev_i32_e32 v9, 31, v9
+; GFX10-NEXT:    v_ashrrev_i32_e32 v8, 31, v8
 ; GFX10-NEXT:    v_add_nc_u32_e32 v6, -1, v6
-; GFX10-NEXT:    v_add_nc_u32_e32 v7, 32, v7
 ; GFX10-NEXT:    v_add_nc_u32_e32 v10, -1, v10
-; GFX10-NEXT:    v_add_nc_u32_e32 v8, 32, v8
+; GFX10-NEXT:    v_add_nc_u32_e32 v7, 32, v7
 ; GFX10-NEXT:    v_add_nc_u32_e32 v11, -1, v11
 ; GFX10-NEXT:    v_add_nc_u32_e32 v9, 32, v9
-; GFX10-NEXT:    v_min_u32_e32 v6, v6, v7
-; GFX10-NEXT:    v_min_u32_e32 v7, v10, v8
-; GFX10-NEXT:    v_min_u32_e32 v8, v11, v9
-; GFX10-NEXT:    v_lshlrev_b64 v[0:1], v6, v[0:1]
+; GFX10-NEXT:    v_add_nc_u32_e32 v8, 32, v8
+; GFX10-NEXT:    v_min_u32_e32 v7, v10, v7
+; GFX10-NEXT:    v_min_u32_e32 v9, v11, v9
+; GFX10-NEXT:    v_min_u32_e32 v6, v6, v8
+; GFX10-NEXT:    v_lshlrev_b64 v[0:1], v7, v[0:1]
+; GFX10-NEXT:    v_lshlrev_b64 v[2:3], v9, v[2:3]
+; GFX10-NEXT:    v_lshlrev_b64 v[4:5], v6, v[4:5]
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v6, 32, v6
-; GFX10-NEXT:    v_lshlrev_b64 v[2:3], v7, v[2:3]
-; GFX10-NEXT:    v_lshlrev_b64 v[4:5], v8, v[4:5]
 ; GFX10-NEXT:    v_min_u32_e32 v0, 1, v0
 ; GFX10-NEXT:    v_min_u32_e32 v2, 1, v2
 ; GFX10-NEXT:    v_min_u32_e32 v4, 1, v4
 ; GFX10-NEXT:    v_or_b32_e32 v0, v1, v0
 ; GFX10-NEXT:    v_or_b32_e32 v1, v3, v2
-; GFX10-NEXT:    v_sub_nc_u32_e32 v3, 32, v7
 ; GFX10-NEXT:    v_or_b32_e32 v2, v5, v4
-; GFX10-NEXT:    v_sub_nc_u32_e32 v4, 32, v8
+; GFX10-NEXT:    v_sub_nc_u32_e32 v3, 32, v7
+; GFX10-NEXT:    v_sub_nc_u32_e32 v4, 32, v9
 ; GFX10-NEXT:    v_cvt_f32_i32_e32 v0, v0
 ; GFX10-NEXT:    v_cvt_f32_i32_e32 v1, v1
 ; GFX10-NEXT:    v_cvt_f32_i32_e32 v2, v2
-; GFX10-NEXT:    v_ldexp_f32 v0, v0, v6
-; GFX10-NEXT:    v_ldexp_f32 v1, v1, v3
-; GFX10-NEXT:    v_ldexp_f32 v2, v2, v4
+; GFX10-NEXT:    v_ldexp_f32 v0, v0, v3
+; GFX10-NEXT:    v_ldexp_f32 v1, v1, v4
+; GFX10-NEXT:    v_ldexp_f32 v2, v2, v6
 ; GFX10-NEXT:    v_perm_b32 v0, v1, v0, 0x7060302
 ; GFX10-NEXT:    v_alignbit_b32 v1, s4, v2, 16
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
@@ -23579,236 +23375,231 @@ define <4 x bfloat> @v_sitofp_v4i64_to_v4bf16(<4 x i64> %x) {
 ; GFX8-LABEL: v_sitofp_v4i64_to_v4bf16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_xor_b32_e32 v9, v0, v1
-; GFX8-NEXT:    v_ffbh_i32_e32 v8, v1
+; GFX8-NEXT:    v_xor_b32_e32 v9, v4, v5
+; GFX8-NEXT:    v_ffbh_i32_e32 v8, v5
 ; GFX8-NEXT:    v_ashrrev_i32_e32 v9, 31, v9
 ; GFX8-NEXT:    v_add_u32_e32 v8, vcc, -1, v8
 ; GFX8-NEXT:    v_add_u32_e32 v9, vcc, 32, v9
 ; GFX8-NEXT:    v_min_u32_e32 v8, v8, v9
-; GFX8-NEXT:    v_lshlrev_b64 v[0:1], v8, v[0:1]
+; GFX8-NEXT:    v_lshlrev_b64 v[4:5], v8, v[4:5]
+; GFX8-NEXT:    v_min_u32_e32 v4, 1, v4
+; GFX8-NEXT:    v_or_b32_e32 v4, v5, v4
+; GFX8-NEXT:    v_xor_b32_e32 v5, v6, v7
+; GFX8-NEXT:    v_cvt_f32_i32_e32 v9, v4
+; GFX8-NEXT:    v_ffbh_i32_e32 v4, v7
+; GFX8-NEXT:    v_ashrrev_i32_e32 v5, 31, v5
+; GFX8-NEXT:    v_add_u32_e32 v4, vcc, -1, v4
+; GFX8-NEXT:    v_add_u32_e32 v5, vcc, 32, v5
+; GFX8-NEXT:    v_min_u32_e32 v10, v4, v5
+; GFX8-NEXT:    v_lshlrev_b64 v[4:5], v10, v[6:7]
+; GFX8-NEXT:    v_sub_u32_e32 v6, vcc, 32, v8
+; GFX8-NEXT:    v_min_u32_e32 v4, 1, v4
+; GFX8-NEXT:    v_or_b32_e32 v4, v5, v4
+; GFX8-NEXT:    v_cvt_f32_i32_e32 v4, v4
+; GFX8-NEXT:    v_ldexp_f32 v5, v9, v6
+; GFX8-NEXT:    v_sub_u32_e32 v6, vcc, 32, v10
+; GFX8-NEXT:    v_xor_b32_e32 v7, v0, v1
+; GFX8-NEXT:    v_ldexp_f32 v4, v4, v6
+; GFX8-NEXT:    v_ffbh_i32_e32 v6, v1
+; GFX8-NEXT:    v_ashrrev_i32_e32 v7, 31, v7
+; GFX8-NEXT:    v_add_u32_e32 v6, vcc, -1, v6
+; GFX8-NEXT:    v_add_u32_e32 v7, vcc, 32, v7
+; GFX8-NEXT:    v_min_u32_e32 v6, v6, v7
+; GFX8-NEXT:    v_lshlrev_b64 v[0:1], v6, v[0:1]
+; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
 ; GFX8-NEXT:    v_min_u32_e32 v0, 1, v0
 ; GFX8-NEXT:    v_or_b32_e32 v0, v1, v0
 ; GFX8-NEXT:    v_xor_b32_e32 v1, v2, v3
-; GFX8-NEXT:    v_cvt_f32_i32_e32 v9, v0
+; GFX8-NEXT:    v_cvt_f32_i32_e32 v7, v0
 ; GFX8-NEXT:    v_ffbh_i32_e32 v0, v3
 ; GFX8-NEXT:    v_ashrrev_i32_e32 v1, 31, v1
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, -1, v0
 ; GFX8-NEXT:    v_add_u32_e32 v1, vcc, 32, v1
-; GFX8-NEXT:    v_min_u32_e32 v10, v0, v1
-; GFX8-NEXT:    v_lshlrev_b64 v[0:1], v10, v[2:3]
-; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, 32, v8
+; GFX8-NEXT:    v_min_u32_e32 v8, v0, v1
+; GFX8-NEXT:    v_lshlrev_b64 v[0:1], v8, v[2:3]
+; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, 32, v6
 ; GFX8-NEXT:    v_min_u32_e32 v0, 1, v0
 ; GFX8-NEXT:    v_or_b32_e32 v0, v1, v0
 ; GFX8-NEXT:    v_cvt_f32_i32_e32 v0, v0
-; GFX8-NEXT:    v_ldexp_f32 v3, v9, v2
-; GFX8-NEXT:    v_sub_u32_e32 v1, vcc, 32, v10
-; GFX8-NEXT:    v_xor_b32_e32 v2, v4, v5
-; GFX8-NEXT:    v_ldexp_f32 v0, v0, v1
-; GFX8-NEXT:    v_ffbh_i32_e32 v1, v5
-; GFX8-NEXT:    v_ashrrev_i32_e32 v2, 31, v2
-; GFX8-NEXT:    v_add_u32_e32 v1, vcc, -1, v1
-; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 32, v2
-; GFX8-NEXT:    v_min_u32_e32 v8, v1, v2
-; GFX8-NEXT:    v_lshlrev_b64 v[1:2], v8, v[4:5]
+; GFX8-NEXT:    v_ldexp_f32 v1, v7, v2
+; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, 32, v8
+; GFX8-NEXT:    v_ldexp_f32 v0, v0, v2
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX8-NEXT:    v_min_u32_e32 v1, 1, v1
-; GFX8-NEXT:    v_or_b32_e32 v1, v2, v1
-; GFX8-NEXT:    v_xor_b32_e32 v2, v6, v7
-; GFX8-NEXT:    v_alignbit_b32 v0, v0, v3, 16
-; GFX8-NEXT:    v_cvt_f32_i32_e32 v3, v1
-; GFX8-NEXT:    v_ffbh_i32_e32 v1, v7
-; GFX8-NEXT:    v_ashrrev_i32_e32 v2, 31, v2
-; GFX8-NEXT:    v_add_u32_e32 v1, vcc, -1, v1
-; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 32, v2
-; GFX8-NEXT:    v_min_u32_e32 v4, v1, v2
-; GFX8-NEXT:    v_lshlrev_b64 v[1:2], v4, v[6:7]
-; GFX8-NEXT:    v_sub_u32_e32 v5, vcc, 32, v8
-; GFX8-NEXT:    v_min_u32_e32 v1, 1, v1
-; GFX8-NEXT:    v_or_b32_e32 v1, v2, v1
-; GFX8-NEXT:    v_cvt_f32_i32_e32 v1, v1
-; GFX8-NEXT:    v_ldexp_f32 v2, v3, v5
-; GFX8-NEXT:    v_sub_u32_e32 v3, vcc, 32, v4
-; GFX8-NEXT:    v_ldexp_f32 v1, v1, v3
-; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX8-NEXT:    v_alignbit_b32 v1, v1, v2, 16
+; GFX8-NEXT:    v_alignbit_b32 v0, v0, v1, 16
+; GFX8-NEXT:    v_alignbit_b32 v1, v4, v5, 16
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_sitofp_v4i64_to_v4bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_xor_b32_e32 v9, v0, v1
-; GFX9-NEXT:    v_ffbh_i32_e32 v8, v1
+; GFX9-NEXT:    v_xor_b32_e32 v9, v4, v5
+; GFX9-NEXT:    v_ffbh_i32_e32 v8, v5
 ; GFX9-NEXT:    v_ashrrev_i32_e32 v9, 31, v9
 ; GFX9-NEXT:    v_add_u32_e32 v8, -1, v8
 ; GFX9-NEXT:    v_add_u32_e32 v9, 32, v9
 ; GFX9-NEXT:    v_min_u32_e32 v8, v8, v9
-; GFX9-NEXT:    v_lshlrev_b64 v[0:1], v8, v[0:1]
-; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
+; GFX9-NEXT:    v_lshlrev_b64 v[4:5], v8, v[4:5]
+; GFX9-NEXT:    v_sub_u32_e32 v8, 32, v8
+; GFX9-NEXT:    v_min_u32_e32 v4, 1, v4
+; GFX9-NEXT:    v_or_b32_e32 v4, v5, v4
+; GFX9-NEXT:    v_xor_b32_e32 v5, v6, v7
+; GFX9-NEXT:    v_cvt_f32_i32_e32 v9, v4
+; GFX9-NEXT:    v_ffbh_i32_e32 v4, v7
+; GFX9-NEXT:    v_ashrrev_i32_e32 v5, 31, v5
+; GFX9-NEXT:    v_add_u32_e32 v4, -1, v4
+; GFX9-NEXT:    v_add_u32_e32 v5, 32, v5
+; GFX9-NEXT:    v_min_u32_e32 v10, v4, v5
+; GFX9-NEXT:    v_lshlrev_b64 v[4:5], v10, v[6:7]
+; GFX9-NEXT:    v_ldexp_f32 v6, v9, v8
+; GFX9-NEXT:    v_xor_b32_e32 v8, v0, v1
+; GFX9-NEXT:    v_min_u32_e32 v4, 1, v4
+; GFX9-NEXT:    v_ffbh_i32_e32 v7, v1
+; GFX9-NEXT:    v_ashrrev_i32_e32 v8, 31, v8
+; GFX9-NEXT:    v_or_b32_e32 v4, v5, v4
+; GFX9-NEXT:    v_add_u32_e32 v7, -1, v7
+; GFX9-NEXT:    v_add_u32_e32 v8, 32, v8
+; GFX9-NEXT:    v_cvt_f32_i32_e32 v4, v4
+; GFX9-NEXT:    v_min_u32_e32 v7, v7, v8
+; GFX9-NEXT:    v_lshlrev_b64 v[0:1], v7, v[0:1]
+; GFX9-NEXT:    v_sub_u32_e32 v5, 32, v10
 ; GFX9-NEXT:    v_min_u32_e32 v0, 1, v0
-; GFX9-NEXT:    v_or_b32_e32 v9, v1, v0
+; GFX9-NEXT:    v_ldexp_f32 v4, v4, v5
+; GFX9-NEXT:    v_or_b32_e32 v5, v1, v0
 ; GFX9-NEXT:    v_xor_b32_e32 v1, v2, v3
 ; GFX9-NEXT:    v_ffbh_i32_e32 v0, v3
 ; GFX9-NEXT:    v_ashrrev_i32_e32 v1, 31, v1
 ; GFX9-NEXT:    v_add_u32_e32 v0, -1, v0
 ; GFX9-NEXT:    v_add_u32_e32 v1, 32, v1
-; GFX9-NEXT:    v_min_u32_e32 v10, v0, v1
-; GFX9-NEXT:    v_lshlrev_b64 v[0:1], v10, v[2:3]
-; GFX9-NEXT:    v_cvt_f32_i32_e32 v2, v9
+; GFX9-NEXT:    v_min_u32_e32 v8, v0, v1
+; GFX9-NEXT:    v_lshlrev_b64 v[0:1], v8, v[2:3]
+; GFX9-NEXT:    v_cvt_f32_i32_e32 v2, v5
 ; GFX9-NEXT:    v_min_u32_e32 v0, 1, v0
 ; GFX9-NEXT:    v_or_b32_e32 v0, v1, v0
 ; GFX9-NEXT:    v_cvt_f32_i32_e32 v0, v0
-; GFX9-NEXT:    v_sub_u32_e32 v1, 32, v8
-; GFX9-NEXT:    v_ldexp_f32 v3, v2, v1
-; GFX9-NEXT:    v_sub_u32_e32 v1, 32, v10
-; GFX9-NEXT:    v_xor_b32_e32 v2, v4, v5
-; GFX9-NEXT:    v_ldexp_f32 v0, v0, v1
-; GFX9-NEXT:    v_ffbh_i32_e32 v1, v5
-; GFX9-NEXT:    v_ashrrev_i32_e32 v2, 31, v2
-; GFX9-NEXT:    v_add_u32_e32 v1, -1, v1
-; GFX9-NEXT:    v_add_u32_e32 v2, 32, v2
-; GFX9-NEXT:    v_min_u32_e32 v8, v1, v2
-; GFX9-NEXT:    v_lshlrev_b64 v[1:2], v8, v[4:5]
-; GFX9-NEXT:    v_perm_b32 v0, v0, v3, s4
-; GFX9-NEXT:    v_min_u32_e32 v1, 1, v1
-; GFX9-NEXT:    v_or_b32_e32 v3, v2, v1
-; GFX9-NEXT:    v_xor_b32_e32 v2, v6, v7
-; GFX9-NEXT:    v_ffbh_i32_e32 v1, v7
-; GFX9-NEXT:    v_ashrrev_i32_e32 v2, 31, v2
-; GFX9-NEXT:    v_add_u32_e32 v1, -1, v1
-; GFX9-NEXT:    v_add_u32_e32 v2, 32, v2
-; GFX9-NEXT:    v_min_u32_e32 v4, v1, v2
-; GFX9-NEXT:    v_lshlrev_b64 v[1:2], v4, v[6:7]
-; GFX9-NEXT:    v_cvt_f32_i32_e32 v3, v3
-; GFX9-NEXT:    v_min_u32_e32 v1, 1, v1
-; GFX9-NEXT:    v_or_b32_e32 v1, v2, v1
-; GFX9-NEXT:    v_cvt_f32_i32_e32 v1, v1
+; GFX9-NEXT:    v_sub_u32_e32 v1, 32, v7
+; GFX9-NEXT:    v_ldexp_f32 v1, v2, v1
 ; GFX9-NEXT:    v_sub_u32_e32 v2, 32, v8
-; GFX9-NEXT:    v_ldexp_f32 v2, v3, v2
-; GFX9-NEXT:    v_sub_u32_e32 v3, 32, v4
-; GFX9-NEXT:    v_ldexp_f32 v1, v1, v3
-; GFX9-NEXT:    v_perm_b32 v1, v1, v2, s4
+; GFX9-NEXT:    v_ldexp_f32 v0, v0, v2
+; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
+; GFX9-NEXT:    v_perm_b32 v0, v0, v1, s4
+; GFX9-NEXT:    v_perm_b32 v1, v4, v6, s4
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_sitofp_v4i64_to_v4bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_xor_b32_e32 v8, v0, v1
-; GFX10-NEXT:    v_ffbh_i32_e32 v9, v1
-; GFX10-NEXT:    v_ffbh_i32_e32 v10, v3
-; GFX10-NEXT:    v_xor_b32_e32 v11, v2, v3
-; GFX10-NEXT:    v_xor_b32_e32 v13, v4, v5
+; GFX10-NEXT:    v_xor_b32_e32 v8, v4, v5
+; GFX10-NEXT:    v_ffbh_i32_e32 v9, v5
+; GFX10-NEXT:    v_xor_b32_e32 v11, v6, v7
+; GFX10-NEXT:    v_xor_b32_e32 v13, v0, v1
+; GFX10-NEXT:    v_xor_b32_e32 v14, v2, v3
 ; GFX10-NEXT:    v_ashrrev_i32_e32 v8, 31, v8
 ; GFX10-NEXT:    v_add_nc_u32_e32 v9, -1, v9
-; GFX10-NEXT:    v_xor_b32_e32 v15, v6, v7
-; GFX10-NEXT:    v_ffbh_i32_e32 v12, v5
-; GFX10-NEXT:    v_ffbh_i32_e32 v14, v7
-; GFX10-NEXT:    v_add_nc_u32_e32 v8, 32, v8
+; GFX10-NEXT:    v_ffbh_i32_e32 v10, v7
+; GFX10-NEXT:    v_ffbh_i32_e32 v12, v1
 ; GFX10-NEXT:    v_ashrrev_i32_e32 v11, 31, v11
+; GFX10-NEXT:    v_add_nc_u32_e32 v8, 32, v8
+; GFX10-NEXT:    v_ashrrev_i32_e32 v14, 31, v14
+; GFX10-NEXT:    v_add_nc_u32_e32 v10, -1, v10
 ; GFX10-NEXT:    v_add_nc_u32_e32 v12, -1, v12
-; GFX10-NEXT:    v_add_nc_u32_e32 v14, -1, v14
-; GFX10-NEXT:    v_min_u32_e32 v8, v9, v8
-; GFX10-NEXT:    v_add_nc_u32_e32 v9, -1, v10
-; GFX10-NEXT:    v_ashrrev_i32_e32 v10, 31, v13
-; GFX10-NEXT:    v_ashrrev_i32_e32 v13, 31, v15
 ; GFX10-NEXT:    v_add_nc_u32_e32 v11, 32, v11
-; GFX10-NEXT:    v_lshlrev_b64 v[0:1], v8, v[0:1]
-; GFX10-NEXT:    v_add_nc_u32_e32 v10, 32, v10
-; GFX10-NEXT:    v_add_nc_u32_e32 v13, 32, v13
-; GFX10-NEXT:    v_min_u32_e32 v9, v9, v11
+; GFX10-NEXT:    v_min_u32_e32 v8, v9, v8
+; GFX10-NEXT:    v_ashrrev_i32_e32 v9, 31, v13
+; GFX10-NEXT:    v_ffbh_i32_e32 v13, v3
+; GFX10-NEXT:    v_add_nc_u32_e32 v14, 32, v14
+; GFX10-NEXT:    v_min_u32_e32 v10, v10, v11
+; GFX10-NEXT:    v_lshlrev_b64 v[4:5], v8, v[4:5]
+; GFX10-NEXT:    v_add_nc_u32_e32 v9, 32, v9
+; GFX10-NEXT:    v_add_nc_u32_e32 v13, -1, v13
+; GFX10-NEXT:    v_lshlrev_b64 v[6:7], v10, v[6:7]
+; GFX10-NEXT:    v_min_u32_e32 v9, v12, v9
+; GFX10-NEXT:    v_min_u32_e32 v11, v13, v14
+; GFX10-NEXT:    v_min_u32_e32 v4, 1, v4
+; GFX10-NEXT:    v_min_u32_e32 v6, 1, v6
+; GFX10-NEXT:    v_lshlrev_b64 v[0:1], v9, v[0:1]
+; GFX10-NEXT:    v_lshlrev_b64 v[2:3], v11, v[2:3]
+; GFX10-NEXT:    v_or_b32_e32 v4, v5, v4
+; GFX10-NEXT:    v_or_b32_e32 v5, v7, v6
+; GFX10-NEXT:    v_sub_nc_u32_e32 v6, 32, v11
 ; GFX10-NEXT:    v_min_u32_e32 v0, 1, v0
-; GFX10-NEXT:    v_min_u32_e32 v10, v12, v10
-; GFX10-NEXT:    v_min_u32_e32 v11, v14, v13
-; GFX10-NEXT:    v_lshlrev_b64 v[2:3], v9, v[2:3]
-; GFX10-NEXT:    v_or_b32_e32 v0, v1, v0
-; GFX10-NEXT:    v_lshlrev_b64 v[4:5], v10, v[4:5]
-; GFX10-NEXT:    v_lshlrev_b64 v[6:7], v11, v[6:7]
 ; GFX10-NEXT:    v_min_u32_e32 v2, 1, v2
-; GFX10-NEXT:    v_cvt_f32_i32_e32 v0, v0
-; GFX10-NEXT:    v_min_u32_e32 v1, 1, v4
-; GFX10-NEXT:    v_min_u32_e32 v4, 1, v6
+; GFX10-NEXT:    v_sub_nc_u32_e32 v7, 32, v10
+; GFX10-NEXT:    v_or_b32_e32 v0, v1, v0
 ; GFX10-NEXT:    v_or_b32_e32 v2, v3, v2
+; GFX10-NEXT:    v_cvt_f32_i32_e32 v1, v4
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v3, 32, v8
-; GFX10-NEXT:    v_sub_nc_u32_e32 v6, 32, v10
-; GFX10-NEXT:    v_or_b32_e32 v1, v5, v1
-; GFX10-NEXT:    v_or_b32_e32 v4, v7, v4
-; GFX10-NEXT:    v_cvt_f32_i32_e32 v2, v2
+; GFX10-NEXT:    v_cvt_f32_i32_e32 v4, v5
+; GFX10-NEXT:    v_cvt_f32_i32_e32 v0, v0
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v5, 32, v9
-; GFX10-NEXT:    v_sub_nc_u32_e32 v7, 32, v11
-; GFX10-NEXT:    v_cvt_f32_i32_e32 v1, v1
-; GFX10-NEXT:    v_cvt_f32_i32_e32 v4, v4
-; GFX10-NEXT:    v_ldexp_f32 v0, v0, v3
-; GFX10-NEXT:    v_ldexp_f32 v2, v2, v5
-; GFX10-NEXT:    v_ldexp_f32 v1, v1, v6
+; GFX10-NEXT:    v_cvt_f32_i32_e32 v2, v2
+; GFX10-NEXT:    v_ldexp_f32 v1, v1, v3
 ; GFX10-NEXT:    v_ldexp_f32 v3, v4, v7
-; GFX10-NEXT:    v_perm_b32 v0, v2, v0, 0x7060302
+; GFX10-NEXT:    v_ldexp_f32 v0, v0, v5
+; GFX10-NEXT:    v_ldexp_f32 v2, v2, v6
 ; GFX10-NEXT:    v_perm_b32 v1, v3, v1, 0x7060302
+; GFX10-NEXT:    v_perm_b32 v0, v2, v0, 0x7060302
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_sitofp_v4i64_to_v4bf16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_xor_b32_e32 v8, v0, v1
-; GFX11-NEXT:    v_cls_i32_e32 v9, v1
-; GFX11-NEXT:    v_cls_i32_e32 v10, v3
-; GFX11-NEXT:    v_xor_b32_e32 v11, v2, v3
-; GFX11-NEXT:    v_xor_b32_e32 v13, v4, v5
+; GFX11-NEXT:    v_xor_b32_e32 v8, v4, v5
+; GFX11-NEXT:    v_cls_i32_e32 v9, v5
+; GFX11-NEXT:    v_xor_b32_e32 v11, v6, v7
+; GFX11-NEXT:    v_xor_b32_e32 v13, v0, v1
+; GFX11-NEXT:    v_xor_b32_e32 v14, v2, v3
 ; GFX11-NEXT:    v_ashrrev_i32_e32 v8, 31, v8
 ; GFX11-NEXT:    v_add_nc_u32_e32 v9, -1, v9
-; GFX11-NEXT:    v_xor_b32_e32 v15, v6, v7
-; GFX11-NEXT:    v_cls_i32_e32 v12, v5
-; GFX11-NEXT:    v_cls_i32_e32 v14, v7
-; GFX11-NEXT:    v_add_nc_u32_e32 v8, 32, v8
+; GFX11-NEXT:    v_cls_i32_e32 v10, v7
+; GFX11-NEXT:    v_cls_i32_e32 v12, v1
 ; GFX11-NEXT:    v_ashrrev_i32_e32 v11, 31, v11
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_add_nc_u32_e32 v8, 32, v8
+; GFX11-NEXT:    v_ashrrev_i32_e32 v14, 31, v14
+; GFX11-NEXT:    v_add_nc_u32_e32 v10, -1, v10
 ; GFX11-NEXT:    v_add_nc_u32_e32 v12, -1, v12
-; GFX11-NEXT:    v_add_nc_u32_e32 v14, -1, v14
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT:    v_min_u32_e32 v8, v9, v8
-; GFX11-NEXT:    v_add_nc_u32_e32 v9, -1, v10
-; GFX11-NEXT:    v_ashrrev_i32_e32 v10, 31, v13
-; GFX11-NEXT:    v_ashrrev_i32_e32 v13, 31, v15
 ; GFX11-NEXT:    v_add_nc_u32_e32 v11, 32, v11
-; GFX11-NEXT:    v_lshlrev_b64 v[0:1], v8, v[0:1]
+; GFX11-NEXT:    v_min_u32_e32 v8, v9, v8
+; GFX11-NEXT:    v_ashrrev_i32_e32 v9, 31, v13
+; GFX11-NEXT:    v_cls_i32_e32 v13, v3
+; GFX11-NEXT:    v_add_nc_u32_e32 v14, 32, v14
+; GFX11-NEXT:    v_min_u32_e32 v10, v10, v11
+; GFX11-NEXT:    v_lshlrev_b64 v[4:5], v8, v[4:5]
+; GFX11-NEXT:    v_add_nc_u32_e32 v9, 32, v9
+; GFX11-NEXT:    v_add_nc_u32_e32 v13, -1, v13
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_lshlrev_b64 v[6:7], v10, v[6:7]
+; GFX11-NEXT:    v_min_u32_e32 v9, v12, v9
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_min_u32_e32 v11, v13, v14
+; GFX11-NEXT:    v_min_u32_e32 v4, 1, v4
+; GFX11-NEXT:    v_min_u32_e32 v6, 1, v6
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_add_nc_u32_e32 v10, 32, v10
-; GFX11-NEXT:    v_add_nc_u32_e32 v13, 32, v13
+; GFX11-NEXT:    v_lshlrev_b64 v[0:1], v9, v[0:1]
+; GFX11-NEXT:    v_lshlrev_b64 v[2:3], v11, v[2:3]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_min_u32_e32 v9, v9, v11
+; GFX11-NEXT:    v_or_b32_e32 v4, v5, v4
+; GFX11-NEXT:    v_or_b32_e32 v5, v7, v6
+; GFX11-NEXT:    v_sub_nc_u32_e32 v6, 32, v11
 ; GFX11-NEXT:    v_min_u32_e32 v0, 1, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_min_u32_e32 v10, v12, v10
-; GFX11-NEXT:    v_min_u32_e32 v11, v14, v13
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_lshlrev_b64 v[2:3], v9, v[2:3]
-; GFX11-NEXT:    v_or_b32_e32 v0, v1, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_lshlrev_b64 v[4:5], v10, v[4:5]
-; GFX11-NEXT:    v_lshlrev_b64 v[6:7], v11, v[6:7]
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX11-NEXT:    v_min_u32_e32 v2, 1, v2
-; GFX11-NEXT:    v_cvt_f32_i32_e32 v0, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_min_u32_e32 v1, 1, v4
-; GFX11-NEXT:    v_min_u32_e32 v4, 1, v6
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-NEXT:    v_sub_nc_u32_e32 v7, 32, v10
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_or_b32_e32 v0, v1, v0
 ; GFX11-NEXT:    v_or_b32_e32 v2, v3, v2
+; GFX11-NEXT:    v_cvt_f32_i32_e32 v1, v4
 ; GFX11-NEXT:    v_sub_nc_u32_e32 v3, 32, v8
-; GFX11-NEXT:    v_sub_nc_u32_e32 v6, 32, v10
-; GFX11-NEXT:    v_or_b32_e32 v1, v5, v1
-; GFX11-NEXT:    v_or_b32_e32 v4, v7, v4
-; GFX11-NEXT:    v_cvt_f32_i32_e32 v2, v2
+; GFX11-NEXT:    v_cvt_f32_i32_e32 v4, v5
+; GFX11-NEXT:    v_cvt_f32_i32_e32 v0, v0
 ; GFX11-NEXT:    v_sub_nc_u32_e32 v5, 32, v9
-; GFX11-NEXT:    v_sub_nc_u32_e32 v7, 32, v11
-; GFX11-NEXT:    v_cvt_f32_i32_e32 v1, v1
-; GFX11-NEXT:    v_cvt_f32_i32_e32 v4, v4
-; GFX11-NEXT:    v_ldexp_f32 v0, v0, v3
-; GFX11-NEXT:    v_ldexp_f32 v2, v2, v5
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_ldexp_f32 v1, v1, v6
+; GFX11-NEXT:    v_cvt_f32_i32_e32 v2, v2
+; GFX11-NEXT:    v_ldexp_f32 v1, v1, v3
 ; GFX11-NEXT:    v_ldexp_f32 v3, v4, v7
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_ldexp_f32 v0, v0, v5
+; GFX11-NEXT:    v_ldexp_f32 v2, v2, v6
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_perm_b32 v0, v2, v0, 0x7060302
 ; GFX11-NEXT:    v_perm_b32 v1, v3, v1, 0x7060302
+; GFX11-NEXT:    v_perm_b32 v0, v2, v0, 0x7060302
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = sitofp <4 x i64> %x to <4 x bfloat>
   ret <4 x bfloat> %op
@@ -23961,13 +23752,11 @@ define <3 x bfloat> @v_uitofp_v3i16_to_v3bf16(<3 x i16> %x) {
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_cvt_f32_u32_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX8-NEXT:    v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
-; GFX8-NEXT:    v_cvt_f32_u32_sdwa v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX8-NEXT:    v_cvt_f32_u32_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+; GFX8-NEXT:    v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
 ; GFX8-NEXT:    v_alignbit_b32 v0, v2, v0, 16
-; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v3
-; GFX8-NEXT:    v_alignbit_b32 v1, v2, v1, 16
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_uitofp_v3i16_to_v3bf16:
@@ -23975,11 +23764,10 @@ define <3 x bfloat> @v_uitofp_v3i16_to_v3bf16(<3 x i16> %x) {
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_cvt_f32_u32_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
 ; GFX9-NEXT:    v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-NEXT:    v_cvt_f32_u32_sdwa v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
-; GFX9-NEXT:    v_cvt_f32_u32_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX9-NEXT:    v_cvt_f32_u32_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
 ; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
 ; GFX9-NEXT:    v_perm_b32 v0, v0, v2, s4
-; GFX9-NEXT:    v_perm_b32 v1, v1, v3, s4
+; GFX9-NEXT:    v_alignbit_b32 v1, s4, v1, 16
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_uitofp_v3i16_to_v3bf16:
@@ -23987,29 +23775,10 @@ define <3 x bfloat> @v_uitofp_v3i16_to_v3bf16(<3 x i16> %x) {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_cvt_f32_u32_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
 ; GFX10-NEXT:    v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-NEXT:    v_cvt_f32_u32_sdwa v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
-; GFX10-NEXT:    v_cvt_f32_u32_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX10-NEXT:    v_cvt_f32_u32_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
 ; GFX10-NEXT:    v_perm_b32 v0, v0, v2, 0x7060302
-; GFX10-NEXT:    v_perm_b32 v1, v1, v3, 0x7060302
+; GFX10-NEXT:    v_alignbit_b32 v1, s4, v1, 16
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: v_uitofp_v3i16_to_v3bf16:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff, v0
-; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff, v1
-; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_cvt_f32_u32_e32 v2, v2
-; GFX11-NEXT:    v_cvt_f32_u32_e32 v0, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_cvt_f32_u32_e32 v3, v3
-; GFX11-NEXT:    v_cvt_f32_u32_e32 v1, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_perm_b32 v0, v0, v2, 0x7060302
-; GFX11-NEXT:    v_perm_b32 v1, v1, v3, 0x7060302
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = uitofp <3 x i16> %x to <3 x bfloat>
   ret <3 x bfloat> %op
 }
@@ -24052,55 +23821,55 @@ define <4 x bfloat> @v_uitofp_v4i16_to_v4bf16(<4 x i16> %x) {
 ; GFX8-LABEL: v_uitofp_v4i16_to_v4bf16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_cvt_f32_u32_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX8-NEXT:    v_cvt_f32_u32_sdwa v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX8-NEXT:    v_cvt_f32_u32_sdwa v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX8-NEXT:    v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
-; GFX8-NEXT:    v_cvt_f32_u32_sdwa v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX8-NEXT:    v_cvt_f32_u32_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
-; GFX8-NEXT:    v_alignbit_b32 v0, v2, v0, 16
-; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v3
+; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX8-NEXT:    v_alignbit_b32 v0, v3, v0, 16
 ; GFX8-NEXT:    v_alignbit_b32 v1, v2, v1, 16
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_uitofp_v4i16_to_v4bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_cvt_f32_u32_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+; GFX9-NEXT:    v_cvt_f32_u32_sdwa v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+; GFX9-NEXT:    v_cvt_f32_u32_sdwa v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
 ; GFX9-NEXT:    v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-NEXT:    v_cvt_f32_u32_sdwa v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
 ; GFX9-NEXT:    v_cvt_f32_u32_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
-; GFX9-NEXT:    v_perm_b32 v0, v0, v2, s4
-; GFX9-NEXT:    v_perm_b32 v1, v1, v3, s4
+; GFX9-NEXT:    v_perm_b32 v0, v0, v3, s4
+; GFX9-NEXT:    v_perm_b32 v1, v1, v2, s4
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_uitofp_v4i16_to_v4bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_cvt_f32_u32_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+; GFX10-NEXT:    v_cvt_f32_u32_sdwa v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+; GFX10-NEXT:    v_cvt_f32_u32_sdwa v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
 ; GFX10-NEXT:    v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-NEXT:    v_cvt_f32_u32_sdwa v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
 ; GFX10-NEXT:    v_cvt_f32_u32_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-NEXT:    v_perm_b32 v0, v0, v2, 0x7060302
-; GFX10-NEXT:    v_perm_b32 v1, v1, v3, 0x7060302
+; GFX10-NEXT:    v_perm_b32 v0, v0, v3, 0x7060302
+; GFX10-NEXT:    v_perm_b32 v1, v1, v2, 0x7060302
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_uitofp_v4i16_to_v4bf16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff, v0
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff, v1
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff, v0
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff, v1
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX11-NEXT:    v_cvt_f32_u32_e32 v2, v2
-; GFX11-NEXT:    v_cvt_f32_u32_e32 v0, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX11-NEXT:    v_cvt_f32_u32_e32 v3, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_cvt_f32_u32_e32 v0, v0
 ; GFX11-NEXT:    v_cvt_f32_u32_e32 v1, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_perm_b32 v0, v0, v2, 0x7060302
-; GFX11-NEXT:    v_perm_b32 v1, v1, v3, 0x7060302
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_perm_b32 v0, v0, v3, 0x7060302
+; GFX11-NEXT:    v_perm_b32 v1, v1, v2, 0x7060302
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = uitofp <4 x i16> %x to <4 x bfloat>
   ret <4 x bfloat> %op
@@ -24236,12 +24005,12 @@ define <3 x bfloat> @v_uitofp_v3i32_to_v3bf16(<3 x i32> %x) {
 ; GFX8-LABEL: v_uitofp_v3i32_to_v3bf16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_cvt_f32_u32_e32 v1, v1
-; GFX8-NEXT:    v_cvt_f32_u32_e32 v0, v0
 ; GFX8-NEXT:    v_cvt_f32_u32_e32 v2, v2
-; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX8-NEXT:    v_alignbit_b32 v0, v1, v0, 16
+; GFX8-NEXT:    v_cvt_f32_u32_e32 v3, v1
+; GFX8-NEXT:    v_cvt_f32_u32_e32 v0, v0
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
+; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v3
+; GFX8-NEXT:    v_alignbit_b32 v0, v2, v0, 16
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_uitofp_v3i32_to_v3bf16:
@@ -24298,22 +24067,22 @@ define <4 x bfloat> @v_uitofp_v4i32_to_v4bf16(<4 x i32> %x) {
 ; GFX8-LABEL: v_uitofp_v4i32_to_v4bf16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_cvt_f32_u32_e32 v3, v3
 ; GFX8-NEXT:    v_cvt_f32_u32_e32 v1, v1
 ; GFX8-NEXT:    v_cvt_f32_u32_e32 v0, v0
-; GFX8-NEXT:    v_cvt_f32_u32_e32 v3, v3
 ; GFX8-NEXT:    v_cvt_f32_u32_e32 v2, v2
+; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
 ; GFX8-NEXT:    v_alignbit_b32 v0, v1, v0, 16
-; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v3
-; GFX8-NEXT:    v_alignbit_b32 v1, v1, v2, 16
+; GFX8-NEXT:    v_alignbit_b32 v1, v3, v2, 16
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_uitofp_v4i32_to_v4bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v2, v2
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, v0
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, v1
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v2, v2
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v3, v3
 ; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
 ; GFX9-NEXT:    v_perm_b32 v0, v1, v0, s4
@@ -24323,9 +24092,9 @@ define <4 x bfloat> @v_uitofp_v4i32_to_v4bf16(<4 x i32> %x) {
 ; GFX10-LABEL: v_uitofp_v4i32_to_v4bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_cvt_f32_u32_e32 v2, v2
 ; GFX10-NEXT:    v_cvt_f32_u32_e32 v0, v0
 ; GFX10-NEXT:    v_cvt_f32_u32_e32 v1, v1
-; GFX10-NEXT:    v_cvt_f32_u32_e32 v2, v2
 ; GFX10-NEXT:    v_cvt_f32_u32_e32 v3, v3
 ; GFX10-NEXT:    v_perm_b32 v0, v1, v0, 0x7060302
 ; GFX10-NEXT:    v_perm_b32 v1, v3, v2, 0x7060302
@@ -24334,11 +24103,11 @@ define <4 x bfloat> @v_uitofp_v4i32_to_v4bf16(<4 x i32> %x) {
 ; GFX11-LABEL: v_uitofp_v4i32_to_v4bf16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_cvt_f32_u32_e32 v2, v2
 ; GFX11-NEXT:    v_cvt_f32_u32_e32 v0, v0
 ; GFX11-NEXT:    v_cvt_f32_u32_e32 v1, v1
-; GFX11-NEXT:    v_cvt_f32_u32_e32 v2, v2
 ; GFX11-NEXT:    v_cvt_f32_u32_e32 v3, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x7060302
 ; GFX11-NEXT:    v_perm_b32 v1, v3, v2, 0x7060302
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
@@ -24653,65 +24422,65 @@ define <3 x bfloat> @v_uitofp_v3i64_to_v3bf16(<3 x i64> %x) {
 ; GFX8-LABEL: v_uitofp_v3i64_to_v3bf16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_ffbh_u32_e32 v6, v1
+; GFX8-NEXT:    v_ffbh_u32_e32 v6, v5
 ; GFX8-NEXT:    v_min_u32_e32 v6, 32, v6
-; GFX8-NEXT:    v_lshlrev_b64 v[0:1], v6, v[0:1]
-; GFX8-NEXT:    v_min_u32_e32 v0, 1, v0
-; GFX8-NEXT:    v_or_b32_e32 v0, v1, v0
-; GFX8-NEXT:    v_cvt_f32_u32_e32 v7, v0
-; GFX8-NEXT:    v_ffbh_u32_e32 v0, v3
-; GFX8-NEXT:    v_min_u32_e32 v8, 32, v0
-; GFX8-NEXT:    v_lshlrev_b64 v[0:1], v8, v[2:3]
-; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, 32, v6
-; GFX8-NEXT:    v_min_u32_e32 v0, 1, v0
-; GFX8-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX8-NEXT:    v_lshlrev_b64 v[4:5], v6, v[4:5]
+; GFX8-NEXT:    v_min_u32_e32 v4, 1, v4
+; GFX8-NEXT:    v_or_b32_e32 v4, v5, v4
+; GFX8-NEXT:    v_cvt_f32_u32_e32 v4, v4
+; GFX8-NEXT:    v_sub_u32_e32 v5, vcc, 32, v6
+; GFX8-NEXT:    v_ldexp_f32 v6, v4, v5
+; GFX8-NEXT:    v_ffbh_u32_e32 v4, v1
+; GFX8-NEXT:    v_min_u32_e32 v7, 32, v4
+; GFX8-NEXT:    v_lshlrev_b64 v[4:5], v7, v[0:1]
+; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v6
+; GFX8-NEXT:    v_min_u32_e32 v0, 1, v4
+; GFX8-NEXT:    v_ffbh_u32_e32 v4, v3
+; GFX8-NEXT:    v_min_u32_e32 v4, 32, v4
+; GFX8-NEXT:    v_lshlrev_b64 v[2:3], v4, v[2:3]
+; GFX8-NEXT:    v_or_b32_e32 v0, v5, v0
+; GFX8-NEXT:    v_min_u32_e32 v2, 1, v2
+; GFX8-NEXT:    v_or_b32_e32 v2, v3, v2
+; GFX8-NEXT:    v_cvt_f32_u32_e32 v2, v2
 ; GFX8-NEXT:    v_cvt_f32_u32_e32 v0, v0
-; GFX8-NEXT:    v_sub_u32_e32 v1, vcc, 32, v8
-; GFX8-NEXT:    v_ldexp_f32 v2, v7, v2
-; GFX8-NEXT:    v_ldexp_f32 v3, v0, v1
-; GFX8-NEXT:    v_ffbh_u32_e32 v0, v5
-; GFX8-NEXT:    v_min_u32_e32 v6, 32, v0
-; GFX8-NEXT:    v_lshlrev_b64 v[0:1], v6, v[4:5]
-; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
-; GFX8-NEXT:    v_min_u32_e32 v0, 1, v0
-; GFX8-NEXT:    v_or_b32_e32 v0, v1, v0
-; GFX8-NEXT:    v_cvt_f32_u32_e32 v1, v0
-; GFX8-NEXT:    v_alignbit_b32 v0, v3, v2, 16
-; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, 32, v6
-; GFX8-NEXT:    v_ldexp_f32 v1, v1, v2
-; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX8-NEXT:    v_sub_u32_e32 v3, vcc, 32, v4
+; GFX8-NEXT:    v_sub_u32_e32 v5, vcc, 32, v7
+; GFX8-NEXT:    v_ldexp_f32 v2, v2, v3
+; GFX8-NEXT:    v_ldexp_f32 v0, v0, v5
+; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX8-NEXT:    v_alignbit_b32 v0, v2, v0, 16
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_uitofp_v3i64_to_v3bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_ffbh_u32_e32 v6, v5
+; GFX9-NEXT:    v_min_u32_e32 v6, 32, v6
+; GFX9-NEXT:    v_lshlrev_b64 v[4:5], v6, v[4:5]
+; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
+; GFX9-NEXT:    v_min_u32_e32 v4, 1, v4
+; GFX9-NEXT:    v_or_b32_e32 v4, v5, v4
+; GFX9-NEXT:    v_sub_u32_e32 v5, 32, v6
 ; GFX9-NEXT:    v_ffbh_u32_e32 v6, v1
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v4, v4
 ; GFX9-NEXT:    v_min_u32_e32 v6, 32, v6
 ; GFX9-NEXT:    v_lshlrev_b64 v[0:1], v6, v[0:1]
-; GFX9-NEXT:    v_sub_u32_e32 v6, 32, v6
 ; GFX9-NEXT:    v_min_u32_e32 v0, 1, v0
-; GFX9-NEXT:    v_or_b32_e32 v0, v1, v0
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v7, v0
+; GFX9-NEXT:    v_ldexp_f32 v4, v4, v5
+; GFX9-NEXT:    v_or_b32_e32 v5, v1, v0
 ; GFX9-NEXT:    v_ffbh_u32_e32 v0, v3
-; GFX9-NEXT:    v_min_u32_e32 v8, 32, v0
-; GFX9-NEXT:    v_lshlrev_b64 v[0:1], v8, v[2:3]
-; GFX9-NEXT:    v_ldexp_f32 v2, v7, v6
-; GFX9-NEXT:    v_min_u32_e32 v0, 1, v0
-; GFX9-NEXT:    v_or_b32_e32 v0, v1, v0
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v3, v0
-; GFX9-NEXT:    v_ffbh_u32_e32 v0, v5
 ; GFX9-NEXT:    v_min_u32_e32 v7, 32, v0
-; GFX9-NEXT:    v_lshlrev_b64 v[0:1], v7, v[4:5]
-; GFX9-NEXT:    v_sub_u32_e32 v6, 32, v8
+; GFX9-NEXT:    v_lshlrev_b64 v[0:1], v7, v[2:3]
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v2, v5
 ; GFX9-NEXT:    v_min_u32_e32 v0, 1, v0
 ; GFX9-NEXT:    v_or_b32_e32 v0, v1, v0
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, v0
-; GFX9-NEXT:    v_ldexp_f32 v3, v3, v6
-; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
-; GFX9-NEXT:    v_perm_b32 v0, v3, v2, s4
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, v0
+; GFX9-NEXT:    v_sub_u32_e32 v1, 32, v6
+; GFX9-NEXT:    v_ldexp_f32 v1, v2, v1
 ; GFX9-NEXT:    v_sub_u32_e32 v2, 32, v7
-; GFX9-NEXT:    v_ldexp_f32 v1, v1, v2
-; GFX9-NEXT:    v_alignbit_b32 v1, s4, v1, 16
+; GFX9-NEXT:    v_ldexp_f32 v0, v0, v2
+; GFX9-NEXT:    v_perm_b32 v0, v0, v1, s4
+; GFX9-NEXT:    v_alignbit_b32 v1, s4, v4, 16
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_uitofp_v3i64_to_v3bf16:
@@ -24726,21 +24495,21 @@ define <3 x bfloat> @v_uitofp_v3i64_to_v3bf16(<3 x i64> %x) {
 ; GFX10-NEXT:    v_lshlrev_b64 v[0:1], v6, v[0:1]
 ; GFX10-NEXT:    v_lshlrev_b64 v[2:3], v7, v[2:3]
 ; GFX10-NEXT:    v_lshlrev_b64 v[4:5], v8, v[4:5]
-; GFX10-NEXT:    v_sub_nc_u32_e32 v6, 32, v6
+; GFX10-NEXT:    v_sub_nc_u32_e32 v8, 32, v8
 ; GFX10-NEXT:    v_min_u32_e32 v0, 1, v0
 ; GFX10-NEXT:    v_min_u32_e32 v2, 1, v2
 ; GFX10-NEXT:    v_min_u32_e32 v4, 1, v4
 ; GFX10-NEXT:    v_or_b32_e32 v0, v1, v0
 ; GFX10-NEXT:    v_or_b32_e32 v1, v3, v2
 ; GFX10-NEXT:    v_or_b32_e32 v2, v5, v4
-; GFX10-NEXT:    v_sub_nc_u32_e32 v3, 32, v7
-; GFX10-NEXT:    v_sub_nc_u32_e32 v4, 32, v8
+; GFX10-NEXT:    v_sub_nc_u32_e32 v3, 32, v6
+; GFX10-NEXT:    v_sub_nc_u32_e32 v4, 32, v7
 ; GFX10-NEXT:    v_cvt_f32_u32_e32 v0, v0
 ; GFX10-NEXT:    v_cvt_f32_u32_e32 v1, v1
 ; GFX10-NEXT:    v_cvt_f32_u32_e32 v2, v2
-; GFX10-NEXT:    v_ldexp_f32 v0, v0, v6
-; GFX10-NEXT:    v_ldexp_f32 v1, v1, v3
-; GFX10-NEXT:    v_ldexp_f32 v2, v2, v4
+; GFX10-NEXT:    v_ldexp_f32 v0, v0, v3
+; GFX10-NEXT:    v_ldexp_f32 v1, v1, v4
+; GFX10-NEXT:    v_ldexp_f32 v2, v2, v8
 ; GFX10-NEXT:    v_perm_b32 v0, v1, v0, 0x7060302
 ; GFX10-NEXT:    v_alignbit_b32 v1, s4, v2, 16
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
@@ -24834,129 +24603,129 @@ define <4 x bfloat> @v_uitofp_v4i64_to_v4bf16(<4 x i64> %x) {
 ; GFX8-LABEL: v_uitofp_v4i64_to_v4bf16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_ffbh_u32_e32 v8, v1
+; GFX8-NEXT:    v_ffbh_u32_e32 v8, v5
 ; GFX8-NEXT:    v_min_u32_e32 v8, 32, v8
-; GFX8-NEXT:    v_lshlrev_b64 v[0:1], v8, v[0:1]
+; GFX8-NEXT:    v_lshlrev_b64 v[4:5], v8, v[4:5]
+; GFX8-NEXT:    v_min_u32_e32 v4, 1, v4
+; GFX8-NEXT:    v_or_b32_e32 v4, v5, v4
+; GFX8-NEXT:    v_cvt_f32_u32_e32 v9, v4
+; GFX8-NEXT:    v_ffbh_u32_e32 v4, v7
+; GFX8-NEXT:    v_min_u32_e32 v10, 32, v4
+; GFX8-NEXT:    v_lshlrev_b64 v[4:5], v10, v[6:7]
+; GFX8-NEXT:    v_sub_u32_e32 v6, vcc, 32, v8
+; GFX8-NEXT:    v_min_u32_e32 v4, 1, v4
+; GFX8-NEXT:    v_or_b32_e32 v4, v5, v4
+; GFX8-NEXT:    v_cvt_f32_u32_e32 v4, v4
+; GFX8-NEXT:    v_ldexp_f32 v5, v9, v6
+; GFX8-NEXT:    v_sub_u32_e32 v6, vcc, 32, v10
+; GFX8-NEXT:    v_ldexp_f32 v4, v4, v6
+; GFX8-NEXT:    v_ffbh_u32_e32 v6, v1
+; GFX8-NEXT:    v_min_u32_e32 v6, 32, v6
+; GFX8-NEXT:    v_lshlrev_b64 v[0:1], v6, v[0:1]
+; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
 ; GFX8-NEXT:    v_min_u32_e32 v0, 1, v0
 ; GFX8-NEXT:    v_or_b32_e32 v0, v1, v0
-; GFX8-NEXT:    v_cvt_f32_u32_e32 v9, v0
+; GFX8-NEXT:    v_cvt_f32_u32_e32 v7, v0
 ; GFX8-NEXT:    v_ffbh_u32_e32 v0, v3
-; GFX8-NEXT:    v_min_u32_e32 v10, 32, v0
-; GFX8-NEXT:    v_lshlrev_b64 v[0:1], v10, v[2:3]
-; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, 32, v8
+; GFX8-NEXT:    v_min_u32_e32 v8, 32, v0
+; GFX8-NEXT:    v_lshlrev_b64 v[0:1], v8, v[2:3]
+; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, 32, v6
 ; GFX8-NEXT:    v_min_u32_e32 v0, 1, v0
 ; GFX8-NEXT:    v_or_b32_e32 v0, v1, v0
 ; GFX8-NEXT:    v_cvt_f32_u32_e32 v0, v0
-; GFX8-NEXT:    v_sub_u32_e32 v1, vcc, 32, v10
-; GFX8-NEXT:    v_ldexp_f32 v3, v9, v2
-; GFX8-NEXT:    v_ldexp_f32 v0, v0, v1
-; GFX8-NEXT:    v_ffbh_u32_e32 v1, v5
-; GFX8-NEXT:    v_min_u32_e32 v8, 32, v1
-; GFX8-NEXT:    v_lshlrev_b64 v[1:2], v8, v[4:5]
+; GFX8-NEXT:    v_ldexp_f32 v1, v7, v2
+; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, 32, v8
+; GFX8-NEXT:    v_ldexp_f32 v0, v0, v2
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX8-NEXT:    v_min_u32_e32 v1, 1, v1
-; GFX8-NEXT:    v_or_b32_e32 v1, v2, v1
-; GFX8-NEXT:    v_alignbit_b32 v0, v0, v3, 16
-; GFX8-NEXT:    v_cvt_f32_u32_e32 v3, v1
-; GFX8-NEXT:    v_ffbh_u32_e32 v1, v7
-; GFX8-NEXT:    v_min_u32_e32 v4, 32, v1
-; GFX8-NEXT:    v_lshlrev_b64 v[1:2], v4, v[6:7]
-; GFX8-NEXT:    v_sub_u32_e32 v5, vcc, 32, v8
-; GFX8-NEXT:    v_min_u32_e32 v1, 1, v1
-; GFX8-NEXT:    v_or_b32_e32 v1, v2, v1
-; GFX8-NEXT:    v_cvt_f32_u32_e32 v1, v1
-; GFX8-NEXT:    v_ldexp_f32 v2, v3, v5
-; GFX8-NEXT:    v_sub_u32_e32 v3, vcc, 32, v4
-; GFX8-NEXT:    v_ldexp_f32 v1, v1, v3
-; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX8-NEXT:    v_alignbit_b32 v1, v1, v2, 16
+; GFX8-NEXT:    v_alignbit_b32 v0, v0, v1, 16
+; GFX8-NEXT:    v_alignbit_b32 v1, v4, v5, 16
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_uitofp_v4i64_to_v4bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_ffbh_u32_e32 v8, v1
+; GFX9-NEXT:    v_ffbh_u32_e32 v8, v5
 ; GFX9-NEXT:    v_min_u32_e32 v8, 32, v8
-; GFX9-NEXT:    v_lshlrev_b64 v[0:1], v8, v[0:1]
-; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
+; GFX9-NEXT:    v_lshlrev_b64 v[4:5], v8, v[4:5]
+; GFX9-NEXT:    v_sub_u32_e32 v8, 32, v8
+; GFX9-NEXT:    v_min_u32_e32 v4, 1, v4
+; GFX9-NEXT:    v_or_b32_e32 v4, v5, v4
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v9, v4
+; GFX9-NEXT:    v_ffbh_u32_e32 v4, v7
+; GFX9-NEXT:    v_min_u32_e32 v10, 32, v4
+; GFX9-NEXT:    v_lshlrev_b64 v[4:5], v10, v[6:7]
+; GFX9-NEXT:    v_ffbh_u32_e32 v7, v1
+; GFX9-NEXT:    v_min_u32_e32 v4, 1, v4
+; GFX9-NEXT:    v_or_b32_e32 v4, v5, v4
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v4, v4
+; GFX9-NEXT:    v_min_u32_e32 v7, 32, v7
+; GFX9-NEXT:    v_lshlrev_b64 v[0:1], v7, v[0:1]
+; GFX9-NEXT:    v_sub_u32_e32 v5, 32, v10
 ; GFX9-NEXT:    v_min_u32_e32 v0, 1, v0
-; GFX9-NEXT:    v_or_b32_e32 v9, v1, v0
+; GFX9-NEXT:    v_ldexp_f32 v4, v4, v5
+; GFX9-NEXT:    v_or_b32_e32 v5, v1, v0
 ; GFX9-NEXT:    v_ffbh_u32_e32 v0, v3
-; GFX9-NEXT:    v_min_u32_e32 v10, 32, v0
-; GFX9-NEXT:    v_lshlrev_b64 v[0:1], v10, v[2:3]
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v2, v9
+; GFX9-NEXT:    v_ldexp_f32 v6, v9, v8
+; GFX9-NEXT:    v_min_u32_e32 v8, 32, v0
+; GFX9-NEXT:    v_lshlrev_b64 v[0:1], v8, v[2:3]
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v2, v5
 ; GFX9-NEXT:    v_min_u32_e32 v0, 1, v0
 ; GFX9-NEXT:    v_or_b32_e32 v0, v1, v0
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, v0
-; GFX9-NEXT:    v_sub_u32_e32 v1, 32, v8
-; GFX9-NEXT:    v_ldexp_f32 v3, v2, v1
-; GFX9-NEXT:    v_sub_u32_e32 v1, 32, v10
-; GFX9-NEXT:    v_ldexp_f32 v0, v0, v1
-; GFX9-NEXT:    v_ffbh_u32_e32 v1, v5
-; GFX9-NEXT:    v_min_u32_e32 v8, 32, v1
-; GFX9-NEXT:    v_lshlrev_b64 v[1:2], v8, v[4:5]
-; GFX9-NEXT:    v_perm_b32 v0, v0, v3, s4
-; GFX9-NEXT:    v_min_u32_e32 v1, 1, v1
-; GFX9-NEXT:    v_or_b32_e32 v3, v2, v1
-; GFX9-NEXT:    v_ffbh_u32_e32 v1, v7
-; GFX9-NEXT:    v_min_u32_e32 v4, 32, v1
-; GFX9-NEXT:    v_lshlrev_b64 v[1:2], v4, v[6:7]
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v3, v3
-; GFX9-NEXT:    v_min_u32_e32 v1, 1, v1
-; GFX9-NEXT:    v_or_b32_e32 v1, v2, v1
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, v1
+; GFX9-NEXT:    v_sub_u32_e32 v1, 32, v7
+; GFX9-NEXT:    v_ldexp_f32 v1, v2, v1
 ; GFX9-NEXT:    v_sub_u32_e32 v2, 32, v8
-; GFX9-NEXT:    v_ldexp_f32 v2, v3, v2
-; GFX9-NEXT:    v_sub_u32_e32 v3, 32, v4
-; GFX9-NEXT:    v_ldexp_f32 v1, v1, v3
-; GFX9-NEXT:    v_perm_b32 v1, v1, v2, s4
+; GFX9-NEXT:    v_ldexp_f32 v0, v0, v2
+; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
+; GFX9-NEXT:    v_perm_b32 v0, v0, v1, s4
+; GFX9-NEXT:    v_perm_b32 v1, v4, v6, s4
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_uitofp_v4i64_to_v4bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_ffbh_u32_e32 v8, v1
-; GFX10-NEXT:    v_ffbh_u32_e32 v9, v3
-; GFX10-NEXT:    v_ffbh_u32_e32 v10, v5
+; GFX10-NEXT:    v_ffbh_u32_e32 v8, v5
+; GFX10-NEXT:    v_ffbh_u32_e32 v9, v1
+; GFX10-NEXT:    v_ffbh_u32_e32 v10, v3
 ; GFX10-NEXT:    v_ffbh_u32_e32 v11, v7
 ; GFX10-NEXT:    v_min_u32_e32 v8, 32, v8
 ; GFX10-NEXT:    v_min_u32_e32 v9, 32, v9
 ; GFX10-NEXT:    v_min_u32_e32 v10, 32, v10
 ; GFX10-NEXT:    v_min_u32_e32 v11, 32, v11
-; GFX10-NEXT:    v_lshlrev_b64 v[0:1], v8, v[0:1]
-; GFX10-NEXT:    v_lshlrev_b64 v[2:3], v9, v[2:3]
-; GFX10-NEXT:    v_lshlrev_b64 v[4:5], v10, v[4:5]
+; GFX10-NEXT:    v_lshlrev_b64 v[4:5], v8, v[4:5]
+; GFX10-NEXT:    v_lshlrev_b64 v[0:1], v9, v[0:1]
+; GFX10-NEXT:    v_lshlrev_b64 v[2:3], v10, v[2:3]
 ; GFX10-NEXT:    v_lshlrev_b64 v[6:7], v11, v[6:7]
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v8, 32, v8
+; GFX10-NEXT:    v_min_u32_e32 v4, 1, v4
 ; GFX10-NEXT:    v_min_u32_e32 v0, 1, v0
 ; GFX10-NEXT:    v_min_u32_e32 v2, 1, v2
-; GFX10-NEXT:    v_min_u32_e32 v4, 1, v4
 ; GFX10-NEXT:    v_min_u32_e32 v6, 1, v6
+; GFX10-NEXT:    v_or_b32_e32 v4, v5, v4
 ; GFX10-NEXT:    v_or_b32_e32 v0, v1, v0
-; GFX10-NEXT:    v_or_b32_e32 v1, v3, v2
-; GFX10-NEXT:    v_or_b32_e32 v2, v5, v4
-; GFX10-NEXT:    v_or_b32_e32 v4, v7, v6
-; GFX10-NEXT:    v_sub_nc_u32_e32 v3, 32, v9
+; GFX10-NEXT:    v_or_b32_e32 v2, v3, v2
+; GFX10-NEXT:    v_or_b32_e32 v3, v7, v6
+; GFX10-NEXT:    v_sub_nc_u32_e32 v1, 32, v11
+; GFX10-NEXT:    v_cvt_f32_u32_e32 v4, v4
 ; GFX10-NEXT:    v_cvt_f32_u32_e32 v0, v0
-; GFX10-NEXT:    v_cvt_f32_u32_e32 v1, v1
+; GFX10-NEXT:    v_sub_nc_u32_e32 v5, 32, v9
 ; GFX10-NEXT:    v_cvt_f32_u32_e32 v2, v2
-; GFX10-NEXT:    v_sub_nc_u32_e32 v5, 32, v10
-; GFX10-NEXT:    v_sub_nc_u32_e32 v6, 32, v11
-; GFX10-NEXT:    v_cvt_f32_u32_e32 v4, v4
-; GFX10-NEXT:    v_ldexp_f32 v0, v0, v8
-; GFX10-NEXT:    v_ldexp_f32 v1, v1, v3
-; GFX10-NEXT:    v_ldexp_f32 v2, v2, v5
-; GFX10-NEXT:    v_ldexp_f32 v3, v4, v6
-; GFX10-NEXT:    v_perm_b32 v0, v1, v0, 0x7060302
-; GFX10-NEXT:    v_perm_b32 v1, v3, v2, 0x7060302
+; GFX10-NEXT:    v_sub_nc_u32_e32 v6, 32, v10
+; GFX10-NEXT:    v_cvt_f32_u32_e32 v3, v3
+; GFX10-NEXT:    v_ldexp_f32 v4, v4, v8
+; GFX10-NEXT:    v_ldexp_f32 v0, v0, v5
+; GFX10-NEXT:    v_ldexp_f32 v2, v2, v6
+; GFX10-NEXT:    v_ldexp_f32 v1, v3, v1
+; GFX10-NEXT:    v_perm_b32 v0, v2, v0, 0x7060302
+; GFX10-NEXT:    v_perm_b32 v1, v1, v4, 0x7060302
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_uitofp_v4i64_to_v4bf16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_clz_i32_u32_e32 v8, v1
-; GFX11-NEXT:    v_clz_i32_u32_e32 v9, v3
-; GFX11-NEXT:    v_clz_i32_u32_e32 v10, v5
+; GFX11-NEXT:    v_clz_i32_u32_e32 v8, v5
+; GFX11-NEXT:    v_clz_i32_u32_e32 v9, v1
+; GFX11-NEXT:    v_clz_i32_u32_e32 v10, v3
 ; GFX11-NEXT:    v_clz_i32_u32_e32 v11, v7
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX11-NEXT:    v_min_u32_e32 v8, 32, v8
@@ -24965,37 +24734,37 @@ define <4 x bfloat> @v_uitofp_v4i64_to_v4bf16(<4 x i64> %x) {
 ; GFX11-NEXT:    v_min_u32_e32 v10, 32, v10
 ; GFX11-NEXT:    v_min_u32_e32 v11, 32, v11
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_lshlrev_b64 v[0:1], v8, v[0:1]
-; GFX11-NEXT:    v_lshlrev_b64 v[2:3], v9, v[2:3]
+; GFX11-NEXT:    v_lshlrev_b64 v[4:5], v8, v[4:5]
+; GFX11-NEXT:    v_lshlrev_b64 v[0:1], v9, v[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_lshlrev_b64 v[4:5], v10, v[4:5]
+; GFX11-NEXT:    v_lshlrev_b64 v[2:3], v10, v[2:3]
 ; GFX11-NEXT:    v_lshlrev_b64 v[6:7], v11, v[6:7]
 ; GFX11-NEXT:    v_sub_nc_u32_e32 v8, 32, v8
+; GFX11-NEXT:    v_min_u32_e32 v4, 1, v4
 ; GFX11-NEXT:    v_min_u32_e32 v0, 1, v0
 ; GFX11-NEXT:    v_min_u32_e32 v2, 1, v2
-; GFX11-NEXT:    v_min_u32_e32 v4, 1, v4
 ; GFX11-NEXT:    v_min_u32_e32 v6, 1, v6
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_or_b32_e32 v4, v5, v4
 ; GFX11-NEXT:    v_or_b32_e32 v0, v1, v0
-; GFX11-NEXT:    v_or_b32_e32 v1, v3, v2
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_or_b32_e32 v2, v5, v4
-; GFX11-NEXT:    v_or_b32_e32 v4, v7, v6
-; GFX11-NEXT:    v_sub_nc_u32_e32 v3, 32, v9
-; GFX11-NEXT:    v_cvt_f32_u32_e32 v0, v0
-; GFX11-NEXT:    v_cvt_f32_u32_e32 v1, v1
-; GFX11-NEXT:    v_cvt_f32_u32_e32 v2, v2
-; GFX11-NEXT:    v_sub_nc_u32_e32 v5, 32, v10
-; GFX11-NEXT:    v_sub_nc_u32_e32 v6, 32, v11
-; GFX11-NEXT:    v_cvt_f32_u32_e32 v4, v4
-; GFX11-NEXT:    v_ldexp_f32 v0, v0, v8
-; GFX11-NEXT:    v_ldexp_f32 v1, v1, v3
-; GFX11-NEXT:    v_ldexp_f32 v2, v2, v5
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_ldexp_f32 v3, v4, v6
-; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x7060302
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT:    v_perm_b32 v1, v3, v2, 0x7060302
+; GFX11-NEXT:    v_or_b32_e32 v2, v3, v2
+; GFX11-NEXT:    v_or_b32_e32 v3, v7, v6
+; GFX11-NEXT:    v_sub_nc_u32_e32 v1, 32, v11
+; GFX11-NEXT:    v_cvt_f32_u32_e32 v4, v4
+; GFX11-NEXT:    v_cvt_f32_u32_e32 v0, v0
+; GFX11-NEXT:    v_sub_nc_u32_e32 v5, 32, v9
+; GFX11-NEXT:    v_cvt_f32_u32_e32 v2, v2
+; GFX11-NEXT:    v_sub_nc_u32_e32 v6, 32, v10
+; GFX11-NEXT:    v_cvt_f32_u32_e32 v3, v3
+; GFX11-NEXT:    v_ldexp_f32 v4, v4, v8
+; GFX11-NEXT:    v_ldexp_f32 v0, v0, v5
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_ldexp_f32 v2, v2, v6
+; GFX11-NEXT:    v_ldexp_f32 v1, v3, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_perm_b32 v0, v2, v0, 0x7060302
+; GFX11-NEXT:    v_perm_b32 v1, v1, v4, 0x7060302
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = uitofp <4 x i64> %x to <4 x bfloat>
   ret <4 x bfloat> %op
@@ -25672,18 +25441,14 @@ define <3 x bfloat> @v_select_v3bf16(i1 %cond, <3 x bfloat> %a, <3 x bfloat> %b)
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v3, v1, vcc
-; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
+; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v1
+; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v3
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v4, v2, vcc
-; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
-; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v4
-; GFX8-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX8-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX8-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_select_v3bf16:
@@ -25692,51 +25457,25 @@ define <3 x bfloat> @v_select_v3bf16(i1 %cond, <3 x bfloat> %a, <3 x bfloat> %b)
 ; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v1, vcc
-; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
-; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
-; GFX9-NEXT:    v_perm_b32 v0, v1, v0, s4
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v4, v2, vcc
-; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
-; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v4
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc
-; GFX9-NEXT:    v_perm_b32 v1, v2, v1, s4
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_select_v3bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX10-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
-; GFX10-NEXT:    v_lshrrev_b32_e32 v6, 16, v3
-; GFX10-NEXT:    v_lshrrev_b32_e32 v7, 16, v4
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 16, v2
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v3, v6, v5, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v4, v7, v0, vcc_lo
-; GFX10-NEXT:    v_perm_b32 v0, v3, v1, 0x5040100
-; GFX10-NEXT:    v_perm_b32 v1, v4, v2, 0x5040100
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v3, v1, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v4, v2, vcc_lo
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_select_v3bf16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX11-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
-; GFX11-NEXT:    v_lshrrev_b32_e32 v6, 16, v3
-; GFX11-NEXT:    v_lshrrev_b32_e32 v7, 16, v4
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v2
-; GFX11-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc_lo
-; GFX11-NEXT:    v_dual_cndmask_b32 v3, v6, v5 :: v_dual_cndmask_b32 v2, v4, v2
-; GFX11-NEXT:    v_cndmask_b32_e32 v4, v7, v0, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_perm_b32 v0, v3, v1, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v1, v4, v2, 0x5040100
+; GFX11-NEXT:    v_dual_cndmask_b32 v0, v3, v1 :: v_dual_cndmask_b32 v1, v4, v2
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = select i1 %cond, <3 x bfloat> %a, <3 x bfloat> %b
   ret <3 x bfloat> %op
@@ -25794,18 +25533,18 @@ define <4 x bfloat> @v_select_v4bf16(i1 %cond, <4 x bfloat> %a, <4 x bfloat> %b)
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v3, v1, vcc
-; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
+; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 16, v4
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
+; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v1
+; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v3
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v5, v6, v5, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v4, v2, vcc
-; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
-; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v4
-; GFX8-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX8-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX8-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v5
+; GFX8-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_select_v4bf16:
@@ -25814,51 +25553,25 @@ define <4 x bfloat> @v_select_v4bf16(i1 %cond, <4 x bfloat> %a, <4 x bfloat> %b)
 ; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v1, vcc
-; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
-; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
-; GFX9-NEXT:    v_perm_b32 v0, v1, v0, s4
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v4, v2, vcc
-; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
-; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v4
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc
-; GFX9-NEXT:    v_perm_b32 v1, v2, v1, s4
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_select_v4bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX10-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
-; GFX10-NEXT:    v_lshrrev_b32_e32 v6, 16, v3
-; GFX10-NEXT:    v_lshrrev_b32_e32 v7, 16, v4
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 16, v2
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v3, v6, v5, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v4, v7, v0, vcc_lo
-; GFX10-NEXT:    v_perm_b32 v0, v3, v1, 0x5040100
-; GFX10-NEXT:    v_perm_b32 v1, v4, v2, 0x5040100
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v3, v1, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v4, v2, vcc_lo
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_select_v4bf16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX11-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
-; GFX11-NEXT:    v_lshrrev_b32_e32 v6, 16, v3
-; GFX11-NEXT:    v_lshrrev_b32_e32 v7, 16, v4
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v2
-; GFX11-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc_lo
-; GFX11-NEXT:    v_dual_cndmask_b32 v3, v6, v5 :: v_dual_cndmask_b32 v2, v4, v2
-; GFX11-NEXT:    v_cndmask_b32_e32 v4, v7, v0, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_perm_b32 v0, v3, v1, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v1, v4, v2, 0x5040100
+; GFX11-NEXT:    v_dual_cndmask_b32 v0, v3, v1 :: v_dual_cndmask_b32 v1, v4, v2
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = select i1 %cond, <4 x bfloat> %a, <4 x bfloat> %b
   ret <4 x bfloat> %op
@@ -25932,24 +25645,24 @@ define <6 x bfloat> @v_select_v6bf16(i1 %cond, <6 x bfloat> %a, <6 x bfloat> %b)
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
-; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
+; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 16, v3
+; GFX8-NEXT:    v_lshrrev_b32_e32 v8, 16, v6
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v6, v3, vcc
+; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v2
+; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 16, v5
+; GFX8-NEXT:    v_cndmask_b32_e32 v6, v6, v0, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v5, v2, vcc
+; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v1
+; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v4
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v5, v0, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v5, v2, vcc
-; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
-; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v5
-; GFX8-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX8-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX8-NEXT:    v_cndmask_b32_e32 v2, v6, v3, vcc
-; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
-; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v6
-; GFX8-NEXT:    v_cndmask_b32_e32 v3, v4, v3, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX8-NEXT:    v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX8-NEXT:    v_cndmask_b32_e32 v7, v8, v7, vcc
+; GFX8-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v6
+; GFX8-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v7
+; GFX8-NEXT:    v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_select_v6bf16:
@@ -25958,65 +25671,28 @@ define <6 x bfloat> @v_select_v6bf16(i1 %cond, <6 x bfloat> %a, <6 x bfloat> %b)
 ; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
-; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
-; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
-; GFX9-NEXT:    v_perm_b32 v0, v1, v0, s4
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v2, vcc
-; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
-; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 16, v5
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
-; GFX9-NEXT:    v_perm_b32 v1, v2, v1, s4
 ; GFX9-NEXT:    v_cndmask_b32_e32 v2, v6, v3, vcc
-; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
-; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 16, v6
-; GFX9-NEXT:    v_cndmask_b32_e32 v3, v4, v3, vcc
-; GFX9-NEXT:    v_perm_b32 v2, v3, v2, s4
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_select_v6bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX10-NEXT:    v_lshrrev_b32_e32 v7, 16, v1
-; GFX10-NEXT:    v_lshrrev_b32_e32 v8, 16, v5
-; GFX10-NEXT:    v_lshrrev_b32_e32 v9, 16, v3
-; GFX10-NEXT:    v_lshrrev_b32_e32 v10, 16, v6
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc_lo
-; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v4
-; GFX10-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
-; GFX10-NEXT:    v_cndmask_b32_e32 v2, v5, v2, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v3, v6, v3, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v5, v10, v9, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v7, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v4, v8, v4, vcc_lo
-; GFX10-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
-; GFX10-NEXT:    v_perm_b32 v1, v4, v2, 0x5040100
-; GFX10-NEXT:    v_perm_b32 v2, v5, v3, 0x5040100
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v5, v2, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, v6, v3, vcc_lo
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_select_v6bf16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v7, 16, v1
 ; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX11-NEXT:    v_lshrrev_b32_e32 v8, 16, v5
-; GFX11-NEXT:    v_lshrrev_b32_e32 v9, 16, v3
-; GFX11-NEXT:    v_lshrrev_b32_e32 v10, 16, v6
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX11-NEXT:    v_dual_cndmask_b32 v0, v4, v1 :: v_dual_cndmask_b32 v3, v6, v3
-; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v4
-; GFX11-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
-; GFX11-NEXT:    v_dual_cndmask_b32 v2, v5, v2 :: v_dual_cndmask_b32 v5, v10, v9
-; GFX11-NEXT:    v_dual_cndmask_b32 v1, v1, v7 :: v_dual_cndmask_b32 v4, v8, v4
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v1, v4, v2, 0x5040100
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT:    v_perm_b32 v2, v5, v3, 0x5040100
+; GFX11-NEXT:    v_dual_cndmask_b32 v0, v4, v1 :: v_dual_cndmask_b32 v1, v5, v2
+; GFX11-NEXT:    v_cndmask_b32_e32 v2, v6, v3, vcc_lo
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = select i1 %cond, <6 x bfloat> %a, <6 x bfloat> %b
   ret <6 x bfloat> %op
@@ -26106,30 +25782,30 @@ define <8 x bfloat> @v_select_v8bf16(i1 %cond, <8 x bfloat> %a, <8 x bfloat> %b)
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v5, v1, vcc
-; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
+; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 16, v4
+; GFX8-NEXT:    v_lshrrev_b32_e32 v10, 16, v8
+; GFX8-NEXT:    v_cndmask_b32_e32 v4, v8, v4, vcc
+; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v3
+; GFX8-NEXT:    v_lshrrev_b32_e32 v8, 16, v7
+; GFX8-NEXT:    v_cndmask_b32_e32 v8, v8, v0, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v7, v3, vcc
+; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v2
+; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 16, v6
+; GFX8-NEXT:    v_cndmask_b32_e32 v7, v7, v0, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v6, v2, vcc
+; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v1
+; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 16, v5
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v6, v0, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v6, v2, vcc
-; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
-; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v6
-; GFX8-NEXT:    v_cndmask_b32_e32 v2, v5, v2, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX8-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX8-NEXT:    v_cndmask_b32_e32 v2, v7, v3, vcc
-; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
-; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v7
-; GFX8-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX8-NEXT:    v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX8-NEXT:    v_cndmask_b32_e32 v3, v8, v4, vcc
-; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
-; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v8
-; GFX8-NEXT:    v_cndmask_b32_e32 v4, v5, v4, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX8-NEXT:    v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX8-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v7
+; GFX8-NEXT:    v_cndmask_b32_e32 v9, v10, v9, vcc
+; GFX8-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v8
+; GFX8-NEXT:    v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v9
+; GFX8-NEXT:    v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_select_v8bf16:
@@ -26138,80 +25814,30 @@ define <8 x bfloat> @v_select_v8bf16(i1 %cond, <8 x bfloat> %a, <8 x bfloat> %b)
 ; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v5, v1, vcc
-; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
-; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
-; GFX9-NEXT:    v_perm_b32 v0, v1, v0, s4
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v6, v2, vcc
-; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
-; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 16, v6
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, v5, v2, vcc
-; GFX9-NEXT:    v_perm_b32 v1, v2, v1, s4
 ; GFX9-NEXT:    v_cndmask_b32_e32 v2, v7, v3, vcc
-; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
-; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 16, v7
-; GFX9-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
-; GFX9-NEXT:    v_perm_b32 v2, v3, v2, s4
 ; GFX9-NEXT:    v_cndmask_b32_e32 v3, v8, v4, vcc
-; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
-; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 16, v8
-; GFX9-NEXT:    v_cndmask_b32_e32 v4, v5, v4, vcc
-; GFX9-NEXT:    v_perm_b32 v3, v4, v3, s4
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_select_v8bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX10-NEXT:    v_lshrrev_b32_e32 v9, 16, v1
-; GFX10-NEXT:    v_lshrrev_b32_e32 v10, 16, v5
-; GFX10-NEXT:    v_lshrrev_b32_e32 v11, 16, v4
-; GFX10-NEXT:    v_lshrrev_b32_e32 v12, 16, v8
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v5, v1, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v10, v9, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v5, v6, v2, vcc_lo
-; GFX10-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
-; GFX10-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
-; GFX10-NEXT:    v_lshrrev_b32_e32 v9, 16, v3
-; GFX10-NEXT:    v_lshrrev_b32_e32 v10, 16, v7
-; GFX10-NEXT:    v_cndmask_b32_e32 v3, v7, v3, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v4, v8, v4, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v2, v6, v2, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v7, v12, v11, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v6, v10, v9, vcc_lo
-; GFX10-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
-; GFX10-NEXT:    v_perm_b32 v1, v2, v5, 0x5040100
-; GFX10-NEXT:    v_perm_b32 v2, v6, v3, 0x5040100
-; GFX10-NEXT:    v_perm_b32 v3, v7, v4, 0x5040100
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v6, v2, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, v7, v3, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v3, v8, v4, vcc_lo
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_select_v8bf16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v9, 16, v1
-; GFX11-NEXT:    v_lshrrev_b32_e32 v10, 16, v5
 ; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX11-NEXT:    v_lshrrev_b32_e32 v11, 16, v4
-; GFX11-NEXT:    v_lshrrev_b32_e32 v12, 16, v8
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX11-NEXT:    v_dual_cndmask_b32 v0, v5, v1 :: v_dual_cndmask_b32 v5, v6, v2
-; GFX11-NEXT:    v_cndmask_b32_e32 v1, v10, v9, vcc_lo
-; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
-; GFX11-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
-; GFX11-NEXT:    v_lshrrev_b32_e32 v9, 16, v3
-; GFX11-NEXT:    v_lshrrev_b32_e32 v10, 16, v7
-; GFX11-NEXT:    v_dual_cndmask_b32 v3, v7, v3 :: v_dual_cndmask_b32 v4, v8, v4
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_dual_cndmask_b32 v2, v6, v2 :: v_dual_cndmask_b32 v7, v12, v11
-; GFX11-NEXT:    v_cndmask_b32_e32 v6, v10, v9, vcc_lo
-; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_perm_b32 v1, v2, v5, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v2, v6, v3, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v3, v7, v4, 0x5040100
+; GFX11-NEXT:    v_dual_cndmask_b32 v0, v5, v1 :: v_dual_cndmask_b32 v1, v6, v2
+; GFX11-NEXT:    v_dual_cndmask_b32 v2, v7, v3 :: v_dual_cndmask_b32 v3, v8, v4
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = select i1 %cond, <8 x bfloat> %a, <8 x bfloat> %b
   ret <8 x bfloat> %op
@@ -26373,54 +25999,54 @@ define <16 x bfloat> @v_select_v16bf16(i1 %cond, <16 x bfloat> %a, <16 x bfloat>
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v9, v1, vcc
-; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 16, v9
+; GFX8-NEXT:    v_lshrrev_b32_e32 v17, 16, v8
+; GFX8-NEXT:    v_lshrrev_b32_e32 v18, 16, v16
+; GFX8-NEXT:    v_cndmask_b32_e32 v8, v16, v8, vcc
+; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v7
+; GFX8-NEXT:    v_lshrrev_b32_e32 v16, 16, v15
+; GFX8-NEXT:    v_cndmask_b32_e32 v16, v16, v0, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v7, v15, v7, vcc
+; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v6
+; GFX8-NEXT:    v_lshrrev_b32_e32 v15, 16, v14
+; GFX8-NEXT:    v_cndmask_b32_e32 v15, v15, v0, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v6, v14, v6, vcc
+; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v5
+; GFX8-NEXT:    v_lshrrev_b32_e32 v14, 16, v13
+; GFX8-NEXT:    v_cndmask_b32_e32 v14, v14, v0, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v5, v13, v5, vcc
+; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v4
+; GFX8-NEXT:    v_lshrrev_b32_e32 v13, 16, v12
+; GFX8-NEXT:    v_cndmask_b32_e32 v13, v13, v0, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v4, v12, v4, vcc
+; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v3
+; GFX8-NEXT:    v_lshrrev_b32_e32 v12, 16, v11
+; GFX8-NEXT:    v_cndmask_b32_e32 v12, v12, v0, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v11, v3, vcc
+; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v2
+; GFX8-NEXT:    v_lshrrev_b32_e32 v11, 16, v10
+; GFX8-NEXT:    v_cndmask_b32_e32 v11, v11, v0, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v10, v2, vcc
+; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v1
+; GFX8-NEXT:    v_lshrrev_b32_e32 v10, 16, v9
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v10, v0, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v9, v1, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v10, v2, vcc
-; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
-; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 16, v10
-; GFX8-NEXT:    v_cndmask_b32_e32 v2, v9, v2, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX8-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX8-NEXT:    v_cndmask_b32_e32 v2, v11, v3, vcc
-; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
-; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 16, v11
-; GFX8-NEXT:    v_cndmask_b32_e32 v3, v9, v3, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX8-NEXT:    v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX8-NEXT:    v_cndmask_b32_e32 v3, v12, v4, vcc
-; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
-; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 16, v12
-; GFX8-NEXT:    v_cndmask_b32_e32 v4, v9, v4, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX8-NEXT:    v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX8-NEXT:    v_cndmask_b32_e32 v4, v13, v5, vcc
-; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
-; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 16, v13
-; GFX8-NEXT:    v_cndmask_b32_e32 v5, v9, v5, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX8-NEXT:    v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX8-NEXT:    v_cndmask_b32_e32 v5, v14, v6, vcc
-; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
-; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 16, v14
-; GFX8-NEXT:    v_cndmask_b32_e32 v6, v9, v6, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
-; GFX8-NEXT:    v_or_b32_sdwa v5, v5, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX8-NEXT:    v_cndmask_b32_e32 v6, v15, v7, vcc
-; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
-; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 16, v15
-; GFX8-NEXT:    v_cndmask_b32_e32 v7, v9, v7, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
-; GFX8-NEXT:    v_or_b32_sdwa v6, v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX8-NEXT:    v_cndmask_b32_e32 v7, v16, v8, vcc
-; GFX8-NEXT:    v_lshrrev_b32_e32 v8, 16, v8
-; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 16, v16
-; GFX8-NEXT:    v_cndmask_b32_e32 v8, v9, v8, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
-; GFX8-NEXT:    v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX8-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v11
+; GFX8-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v12
+; GFX8-NEXT:    v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v13
+; GFX8-NEXT:    v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v14
+; GFX8-NEXT:    v_or_b32_sdwa v4, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v15
+; GFX8-NEXT:    v_cndmask_b32_e32 v17, v18, v17, vcc
+; GFX8-NEXT:    v_or_b32_sdwa v5, v6, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v16
+; GFX8-NEXT:    v_or_b32_sdwa v6, v7, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    v_lshlrev_b32_e32 v7, 16, v17
+; GFX8-NEXT:    v_or_b32_sdwa v7, v8, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_select_v16bf16:
@@ -26429,140 +26055,40 @@ define <16 x bfloat> @v_select_v16bf16(i1 %cond, <16 x bfloat> %a, <16 x bfloat>
 ; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v9, v1, vcc
-; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX9-NEXT:    v_lshrrev_b32_e32 v9, 16, v9
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v9, v1, vcc
-; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
-; GFX9-NEXT:    v_perm_b32 v0, v1, v0, s4
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v10, v2, vcc
-; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
-; GFX9-NEXT:    v_lshrrev_b32_e32 v9, 16, v10
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, v9, v2, vcc
-; GFX9-NEXT:    v_perm_b32 v1, v2, v1, s4
 ; GFX9-NEXT:    v_cndmask_b32_e32 v2, v11, v3, vcc
-; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
-; GFX9-NEXT:    v_lshrrev_b32_e32 v9, 16, v11
-; GFX9-NEXT:    v_cndmask_b32_e32 v3, v9, v3, vcc
-; GFX9-NEXT:    v_perm_b32 v2, v3, v2, s4
 ; GFX9-NEXT:    v_cndmask_b32_e32 v3, v12, v4, vcc
-; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
-; GFX9-NEXT:    v_lshrrev_b32_e32 v9, 16, v12
-; GFX9-NEXT:    v_cndmask_b32_e32 v4, v9, v4, vcc
-; GFX9-NEXT:    v_perm_b32 v3, v4, v3, s4
 ; GFX9-NEXT:    v_cndmask_b32_e32 v4, v13, v5, vcc
-; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
-; GFX9-NEXT:    v_lshrrev_b32_e32 v9, 16, v13
-; GFX9-NEXT:    v_cndmask_b32_e32 v5, v9, v5, vcc
-; GFX9-NEXT:    v_perm_b32 v4, v5, v4, s4
 ; GFX9-NEXT:    v_cndmask_b32_e32 v5, v14, v6, vcc
-; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
-; GFX9-NEXT:    v_lshrrev_b32_e32 v9, 16, v14
-; GFX9-NEXT:    v_cndmask_b32_e32 v6, v9, v6, vcc
-; GFX9-NEXT:    v_perm_b32 v5, v6, v5, s4
 ; GFX9-NEXT:    v_cndmask_b32_e32 v6, v15, v7, vcc
-; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
-; GFX9-NEXT:    v_lshrrev_b32_e32 v9, 16, v15
-; GFX9-NEXT:    v_cndmask_b32_e32 v7, v9, v7, vcc
-; GFX9-NEXT:    v_perm_b32 v6, v7, v6, s4
 ; GFX9-NEXT:    v_cndmask_b32_e32 v7, v16, v8, vcc
-; GFX9-NEXT:    v_lshrrev_b32_e32 v8, 16, v8
-; GFX9-NEXT:    v_lshrrev_b32_e32 v9, 16, v16
-; GFX9-NEXT:    v_cndmask_b32_e32 v8, v9, v8, vcc
-; GFX9-NEXT:    v_perm_b32 v7, v8, v7, s4
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_select_v16bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX10-NEXT:    v_lshrrev_b32_e32 v17, 16, v1
-; GFX10-NEXT:    v_lshrrev_b32_e32 v18, 16, v9
-; GFX10-NEXT:    v_lshrrev_b32_e32 v19, 16, v10
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 16, v2
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v9, v1, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v9, v18, v17, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v2, v10, v2, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v10, v19, v0, vcc_lo
-; GFX10-NEXT:    v_lshrrev_b32_e32 v17, 16, v3
-; GFX10-NEXT:    v_lshrrev_b32_e32 v18, 16, v11
-; GFX10-NEXT:    v_perm_b32 v0, v9, v1, 0x5040100
-; GFX10-NEXT:    v_lshrrev_b32_e32 v9, 16, v4
-; GFX10-NEXT:    v_perm_b32 v1, v10, v2, 0x5040100
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v9, v1, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v10, v2, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e32 v2, v11, v3, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v3, v18, v17, vcc_lo
-; GFX10-NEXT:    v_lshrrev_b32_e32 v10, 16, v12
-; GFX10-NEXT:    v_lshrrev_b32_e32 v11, 16, v5
-; GFX10-NEXT:    v_lshrrev_b32_e32 v17, 16, v13
-; GFX10-NEXT:    v_cndmask_b32_e32 v5, v13, v5, vcc_lo
-; GFX10-NEXT:    v_perm_b32 v2, v3, v2, 0x5040100
 ; GFX10-NEXT:    v_cndmask_b32_e32 v3, v12, v4, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v4, v10, v9, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v9, v17, v11, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v10, v14, v6, vcc_lo
-; GFX10-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
-; GFX10-NEXT:    v_lshrrev_b32_e32 v11, 16, v14
-; GFX10-NEXT:    v_lshrrev_b32_e32 v12, 16, v7
-; GFX10-NEXT:    v_lshrrev_b32_e32 v13, 16, v15
-; GFX10-NEXT:    v_lshrrev_b32_e32 v14, 16, v8
-; GFX10-NEXT:    v_lshrrev_b32_e32 v17, 16, v16
-; GFX10-NEXT:    v_cndmask_b32_e32 v6, v11, v6, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v7, v15, v7, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v11, v13, v12, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v8, v16, v8, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v12, v17, v14, vcc_lo
-; GFX10-NEXT:    v_perm_b32 v3, v4, v3, 0x5040100
-; GFX10-NEXT:    v_perm_b32 v4, v9, v5, 0x5040100
-; GFX10-NEXT:    v_perm_b32 v5, v6, v10, 0x5040100
-; GFX10-NEXT:    v_perm_b32 v6, v11, v7, 0x5040100
-; GFX10-NEXT:    v_perm_b32 v7, v12, v8, 0x5040100
+; GFX10-NEXT:    v_cndmask_b32_e32 v4, v13, v5, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v5, v14, v6, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v6, v15, v7, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v7, v16, v8, vcc_lo
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_select_v16bf16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX11-NEXT:    v_lshrrev_b32_e32 v17, 16, v1
-; GFX11-NEXT:    v_lshrrev_b32_e32 v18, 16, v9
-; GFX11-NEXT:    v_lshrrev_b32_e32 v19, 16, v10
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v2
-; GFX11-NEXT:    v_cndmask_b32_e32 v1, v9, v1, vcc_lo
-; GFX11-NEXT:    v_cndmask_b32_e32 v9, v18, v17, vcc_lo
-; GFX11-NEXT:    v_cndmask_b32_e32 v2, v10, v2, vcc_lo
-; GFX11-NEXT:    v_cndmask_b32_e32 v10, v19, v0, vcc_lo
-; GFX11-NEXT:    v_lshrrev_b32_e32 v17, 16, v3
-; GFX11-NEXT:    v_lshrrev_b32_e32 v18, 16, v11
-; GFX11-NEXT:    v_perm_b32 v0, v9, v1, 0x5040100
-; GFX11-NEXT:    v_lshrrev_b32_e32 v9, 16, v4
-; GFX11-NEXT:    v_perm_b32 v1, v10, v2, 0x5040100
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT:    v_dual_cndmask_b32 v2, v11, v3 :: v_dual_cndmask_b32 v3, v18, v17
-; GFX11-NEXT:    v_lshrrev_b32_e32 v10, 16, v12
-; GFX11-NEXT:    v_lshrrev_b32_e32 v11, 16, v5
-; GFX11-NEXT:    v_lshrrev_b32_e32 v17, 16, v13
-; GFX11-NEXT:    v_cndmask_b32_e32 v5, v13, v5, vcc_lo
-; GFX11-NEXT:    v_perm_b32 v2, v3, v2, 0x5040100
-; GFX11-NEXT:    v_dual_cndmask_b32 v3, v12, v4 :: v_dual_cndmask_b32 v4, v10, v9
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT:    v_dual_cndmask_b32 v9, v17, v11 :: v_dual_cndmask_b32 v10, v14, v6
-; GFX11-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
-; GFX11-NEXT:    v_lshrrev_b32_e32 v11, 16, v14
-; GFX11-NEXT:    v_lshrrev_b32_e32 v12, 16, v7
-; GFX11-NEXT:    v_lshrrev_b32_e32 v13, 16, v15
-; GFX11-NEXT:    v_lshrrev_b32_e32 v14, 16, v8
-; GFX11-NEXT:    v_lshrrev_b32_e32 v17, 16, v16
-; GFX11-NEXT:    v_cndmask_b32_e32 v6, v11, v6, vcc_lo
-; GFX11-NEXT:    v_dual_cndmask_b32 v7, v15, v7 :: v_dual_cndmask_b32 v8, v16, v8
-; GFX11-NEXT:    v_cndmask_b32_e32 v11, v13, v12, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT:    v_cndmask_b32_e32 v12, v17, v14, vcc_lo
-; GFX11-NEXT:    v_perm_b32 v3, v4, v3, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v4, v9, v5, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v5, v6, v10, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v6, v11, v7, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v7, v12, v8, 0x5040100
+; GFX11-NEXT:    v_dual_cndmask_b32 v0, v9, v1 :: v_dual_cndmask_b32 v1, v10, v2
+; GFX11-NEXT:    v_dual_cndmask_b32 v2, v11, v3 :: v_dual_cndmask_b32 v3, v12, v4
+; GFX11-NEXT:    v_dual_cndmask_b32 v4, v13, v5 :: v_dual_cndmask_b32 v5, v14, v6
+; GFX11-NEXT:    v_dual_cndmask_b32 v6, v15, v7 :: v_dual_cndmask_b32 v7, v16, v8
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = select i1 %cond, <16 x bfloat> %a, <16 x bfloat> %b
   ret <16 x bfloat> %op
@@ -26980,106 +26506,106 @@ define <32 x bfloat> @v_select_v32bf16(i1 %cond, <32 x bfloat> %a, <32 x bfloat>
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v17, v1, vcc
-; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX8-NEXT:    v_lshrrev_b32_e32 v17, 16, v17
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v17, v1, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v18, v2, vcc
-; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
-; GFX8-NEXT:    v_lshrrev_b32_e32 v17, 16, v18
-; GFX8-NEXT:    v_cndmask_b32_e32 v2, v17, v2, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX8-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX8-NEXT:    v_cndmask_b32_e32 v2, v19, v3, vcc
-; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
-; GFX8-NEXT:    v_lshrrev_b32_e32 v17, 16, v19
-; GFX8-NEXT:    v_cndmask_b32_e32 v3, v17, v3, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX8-NEXT:    v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX8-NEXT:    v_cndmask_b32_e32 v3, v20, v4, vcc
-; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
-; GFX8-NEXT:    v_lshrrev_b32_e32 v17, 16, v20
-; GFX8-NEXT:    v_cndmask_b32_e32 v4, v17, v4, vcc
-; GFX8-NEXT:    buffer_load_dword v17, off, s[0:3], s32
-; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX8-NEXT:    v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX8-NEXT:    v_cndmask_b32_e32 v4, v21, v5, vcc
-; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
-; GFX8-NEXT:    v_lshrrev_b32_e32 v18, 16, v21
-; GFX8-NEXT:    v_cndmask_b32_e32 v5, v18, v5, vcc
-; GFX8-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:4
-; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX8-NEXT:    v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX8-NEXT:    v_cndmask_b32_e32 v5, v22, v6, vcc
-; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
-; GFX8-NEXT:    v_lshrrev_b32_e32 v19, 16, v22
-; GFX8-NEXT:    v_cndmask_b32_e32 v6, v19, v6, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
-; GFX8-NEXT:    v_or_b32_sdwa v5, v5, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX8-NEXT:    v_cndmask_b32_e32 v6, v23, v7, vcc
-; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
-; GFX8-NEXT:    v_lshrrev_b32_e32 v19, 16, v23
-; GFX8-NEXT:    v_cndmask_b32_e32 v7, v19, v7, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
-; GFX8-NEXT:    v_or_b32_sdwa v6, v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX8-NEXT:    v_cndmask_b32_e32 v7, v24, v8, vcc
-; GFX8-NEXT:    v_lshrrev_b32_e32 v8, 16, v8
-; GFX8-NEXT:    v_lshrrev_b32_e32 v19, 16, v24
-; GFX8-NEXT:    v_cndmask_b32_e32 v8, v19, v8, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
-; GFX8-NEXT:    v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX8-NEXT:    v_cndmask_b32_e32 v8, v25, v9, vcc
-; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 16, v9
-; GFX8-NEXT:    v_lshrrev_b32_e32 v19, 16, v25
-; GFX8-NEXT:    v_cndmask_b32_e32 v9, v19, v9, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
-; GFX8-NEXT:    v_or_b32_sdwa v8, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX8-NEXT:    v_cndmask_b32_e32 v9, v26, v10, vcc
-; GFX8-NEXT:    v_lshrrev_b32_e32 v10, 16, v10
-; GFX8-NEXT:    v_lshrrev_b32_e32 v19, 16, v26
-; GFX8-NEXT:    v_cndmask_b32_e32 v10, v19, v10, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
-; GFX8-NEXT:    v_or_b32_sdwa v9, v9, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX8-NEXT:    v_cndmask_b32_e32 v10, v27, v11, vcc
-; GFX8-NEXT:    v_lshrrev_b32_e32 v11, 16, v11
-; GFX8-NEXT:    v_lshrrev_b32_e32 v19, 16, v27
-; GFX8-NEXT:    v_cndmask_b32_e32 v11, v19, v11, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
-; GFX8-NEXT:    v_or_b32_sdwa v10, v10, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX8-NEXT:    v_cndmask_b32_e32 v11, v28, v12, vcc
-; GFX8-NEXT:    v_lshrrev_b32_e32 v12, 16, v12
-; GFX8-NEXT:    v_lshrrev_b32_e32 v19, 16, v28
-; GFX8-NEXT:    v_cndmask_b32_e32 v12, v19, v12, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
-; GFX8-NEXT:    v_or_b32_sdwa v11, v11, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX8-NEXT:    v_cndmask_b32_e32 v12, v29, v13, vcc
-; GFX8-NEXT:    v_lshrrev_b32_e32 v13, 16, v13
-; GFX8-NEXT:    v_lshrrev_b32_e32 v19, 16, v29
-; GFX8-NEXT:    v_cndmask_b32_e32 v13, v19, v13, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
-; GFX8-NEXT:    v_or_b32_sdwa v12, v12, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX8-NEXT:    v_cndmask_b32_e32 v13, v30, v14, vcc
-; GFX8-NEXT:    v_lshrrev_b32_e32 v14, 16, v14
-; GFX8-NEXT:    v_lshrrev_b32_e32 v19, 16, v30
-; GFX8-NEXT:    v_cndmask_b32_e32 v14, v19, v14, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
-; GFX8-NEXT:    v_or_b32_sdwa v13, v13, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    v_lshrrev_b32_e32 v31, 16, v14
+; GFX8-NEXT:    v_lshrrev_b32_e32 v32, 16, v30
+; GFX8-NEXT:    v_cndmask_b32_e32 v14, v30, v14, vcc
+; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v13
+; GFX8-NEXT:    v_lshrrev_b32_e32 v30, 16, v29
+; GFX8-NEXT:    v_cndmask_b32_e32 v30, v30, v0, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v13, v29, v13, vcc
+; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v12
+; GFX8-NEXT:    v_lshrrev_b32_e32 v29, 16, v28
+; GFX8-NEXT:    v_cndmask_b32_e32 v29, v29, v0, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v12, v28, v12, vcc
+; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v11
+; GFX8-NEXT:    v_lshrrev_b32_e32 v28, 16, v27
+; GFX8-NEXT:    v_cndmask_b32_e32 v28, v28, v0, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v11, v27, v11, vcc
+; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v10
+; GFX8-NEXT:    v_lshrrev_b32_e32 v27, 16, v26
+; GFX8-NEXT:    v_cndmask_b32_e32 v27, v27, v0, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v10, v26, v10, vcc
+; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v9
+; GFX8-NEXT:    v_lshrrev_b32_e32 v26, 16, v25
+; GFX8-NEXT:    v_cndmask_b32_e32 v26, v26, v0, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v9, v25, v9, vcc
+; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v8
+; GFX8-NEXT:    v_lshrrev_b32_e32 v25, 16, v24
+; GFX8-NEXT:    v_cndmask_b32_e32 v25, v25, v0, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v8, v24, v8, vcc
+; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v7
+; GFX8-NEXT:    v_lshrrev_b32_e32 v24, 16, v23
+; GFX8-NEXT:    v_cndmask_b32_e32 v24, v24, v0, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v7, v23, v7, vcc
+; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v6
+; GFX8-NEXT:    v_lshrrev_b32_e32 v23, 16, v22
+; GFX8-NEXT:    v_cndmask_b32_e32 v23, v23, v0, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v6, v22, v6, vcc
+; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v5
+; GFX8-NEXT:    v_lshrrev_b32_e32 v22, 16, v21
+; GFX8-NEXT:    v_cndmask_b32_e32 v31, v32, v31, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v22, v22, v0, vcc
+; GFX8-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:4
+; GFX8-NEXT:    buffer_load_dword v32, off, s[0:3], s32
+; GFX8-NEXT:    v_lshrrev_b32_e32 v33, 16, v16
+; GFX8-NEXT:    v_cndmask_b32_e32 v5, v21, v5, vcc
+; GFX8-NEXT:    v_lshrrev_b32_e32 v21, 16, v20
 ; GFX8-NEXT:    s_waitcnt vmcnt(1)
-; GFX8-NEXT:    v_cndmask_b32_e32 v14, v17, v15, vcc
-; GFX8-NEXT:    v_lshrrev_b32_e32 v15, 16, v15
-; GFX8-NEXT:    v_lshrrev_b32_e32 v17, 16, v17
-; GFX8-NEXT:    v_cndmask_b32_e32 v15, v17, v15, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
-; GFX8-NEXT:    v_or_b32_sdwa v14, v14, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    v_cndmask_b32_e32 v16, v0, v16, vcc
+; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX8-NEXT:    v_cndmask_b32_e32 v33, v0, v33, vcc
+; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v15
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_cndmask_b32_e32 v15, v18, v16, vcc
-; GFX8-NEXT:    v_lshrrev_b32_e32 v16, 16, v16
-; GFX8-NEXT:    v_lshrrev_b32_e32 v17, 16, v18
-; GFX8-NEXT:    v_cndmask_b32_e32 v16, v17, v16, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v16, 16, v16
-; GFX8-NEXT:    v_or_b32_sdwa v15, v15, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    v_cndmask_b32_e32 v15, v32, v15, vcc
+; GFX8-NEXT:    v_lshrrev_b32_e32 v32, 16, v32
+; GFX8-NEXT:    v_cndmask_b32_e32 v32, v32, v0, vcc
+; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v4
+; GFX8-NEXT:    v_cndmask_b32_e32 v21, v21, v0, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v4, v20, v4, vcc
+; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v3
+; GFX8-NEXT:    v_lshrrev_b32_e32 v20, 16, v19
+; GFX8-NEXT:    v_cndmask_b32_e32 v20, v20, v0, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v19, v3, vcc
+; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v2
+; GFX8-NEXT:    v_lshrrev_b32_e32 v19, 16, v18
+; GFX8-NEXT:    v_cndmask_b32_e32 v19, v19, v0, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v18, v2, vcc
+; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v1
+; GFX8-NEXT:    v_lshrrev_b32_e32 v18, 16, v17
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v18, v0, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v17, v1, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX8-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v19
+; GFX8-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v20
+; GFX8-NEXT:    v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v21
+; GFX8-NEXT:    v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v22
+; GFX8-NEXT:    v_or_b32_sdwa v4, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v23
+; GFX8-NEXT:    v_or_b32_sdwa v5, v6, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v24
+; GFX8-NEXT:    v_or_b32_sdwa v6, v7, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    v_lshlrev_b32_e32 v7, 16, v25
+; GFX8-NEXT:    v_or_b32_sdwa v7, v8, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    v_lshlrev_b32_e32 v8, 16, v26
+; GFX8-NEXT:    v_or_b32_sdwa v8, v9, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 16, v27
+; GFX8-NEXT:    v_or_b32_sdwa v9, v10, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    v_lshlrev_b32_e32 v10, 16, v28
+; GFX8-NEXT:    v_or_b32_sdwa v10, v11, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    v_lshlrev_b32_e32 v11, 16, v29
+; GFX8-NEXT:    v_or_b32_sdwa v11, v12, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    v_lshlrev_b32_e32 v12, 16, v30
+; GFX8-NEXT:    v_or_b32_sdwa v12, v13, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    v_lshlrev_b32_e32 v13, 16, v31
+; GFX8-NEXT:    v_or_b32_sdwa v13, v14, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    v_lshlrev_b32_e32 v14, 16, v32
+; GFX8-NEXT:    v_or_b32_sdwa v14, v15, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    v_lshlrev_b32_e32 v15, 16, v33
+; GFX8-NEXT:    v_or_b32_sdwa v15, v16, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_select_v32bf16:
@@ -27088,90 +26614,25 @@ define <32 x bfloat> @v_select_v32bf16(i1 %cond, <32 x bfloat> %a, <32 x bfloat>
 ; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v17, v1, vcc
-; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX9-NEXT:    v_lshrrev_b32_e32 v17, 16, v17
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v17, v1, vcc
-; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
-; GFX9-NEXT:    v_perm_b32 v0, v1, v0, s4
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v18, v2, vcc
-; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
-; GFX9-NEXT:    v_lshrrev_b32_e32 v17, 16, v18
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, v17, v2, vcc
-; GFX9-NEXT:    v_perm_b32 v1, v2, v1, s4
+; GFX9-NEXT:    buffer_load_dword v17, off, s[0:3], s32
+; GFX9-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:4
 ; GFX9-NEXT:    v_cndmask_b32_e32 v2, v19, v3, vcc
-; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
-; GFX9-NEXT:    v_lshrrev_b32_e32 v17, 16, v19
-; GFX9-NEXT:    v_cndmask_b32_e32 v3, v17, v3, vcc
-; GFX9-NEXT:    v_perm_b32 v2, v3, v2, s4
 ; GFX9-NEXT:    v_cndmask_b32_e32 v3, v20, v4, vcc
-; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
-; GFX9-NEXT:    v_lshrrev_b32_e32 v17, 16, v20
-; GFX9-NEXT:    v_cndmask_b32_e32 v4, v17, v4, vcc
-; GFX9-NEXT:    buffer_load_dword v17, off, s[0:3], s32
-; GFX9-NEXT:    v_perm_b32 v3, v4, v3, s4
 ; GFX9-NEXT:    v_cndmask_b32_e32 v4, v21, v5, vcc
-; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
-; GFX9-NEXT:    v_lshrrev_b32_e32 v18, 16, v21
-; GFX9-NEXT:    v_cndmask_b32_e32 v5, v18, v5, vcc
-; GFX9-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:4
-; GFX9-NEXT:    v_perm_b32 v4, v5, v4, s4
 ; GFX9-NEXT:    v_cndmask_b32_e32 v5, v22, v6, vcc
-; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
-; GFX9-NEXT:    v_lshrrev_b32_e32 v19, 16, v22
-; GFX9-NEXT:    v_cndmask_b32_e32 v6, v19, v6, vcc
-; GFX9-NEXT:    v_perm_b32 v5, v6, v5, s4
 ; GFX9-NEXT:    v_cndmask_b32_e32 v6, v23, v7, vcc
-; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
-; GFX9-NEXT:    v_lshrrev_b32_e32 v19, 16, v23
-; GFX9-NEXT:    v_cndmask_b32_e32 v7, v19, v7, vcc
-; GFX9-NEXT:    v_perm_b32 v6, v7, v6, s4
 ; GFX9-NEXT:    v_cndmask_b32_e32 v7, v24, v8, vcc
-; GFX9-NEXT:    v_lshrrev_b32_e32 v8, 16, v8
-; GFX9-NEXT:    v_lshrrev_b32_e32 v19, 16, v24
-; GFX9-NEXT:    v_cndmask_b32_e32 v8, v19, v8, vcc
-; GFX9-NEXT:    v_perm_b32 v7, v8, v7, s4
 ; GFX9-NEXT:    v_cndmask_b32_e32 v8, v25, v9, vcc
-; GFX9-NEXT:    v_lshrrev_b32_e32 v9, 16, v9
-; GFX9-NEXT:    v_lshrrev_b32_e32 v19, 16, v25
-; GFX9-NEXT:    v_cndmask_b32_e32 v9, v19, v9, vcc
-; GFX9-NEXT:    v_perm_b32 v8, v9, v8, s4
 ; GFX9-NEXT:    v_cndmask_b32_e32 v9, v26, v10, vcc
-; GFX9-NEXT:    v_lshrrev_b32_e32 v10, 16, v10
-; GFX9-NEXT:    v_lshrrev_b32_e32 v19, 16, v26
-; GFX9-NEXT:    v_cndmask_b32_e32 v10, v19, v10, vcc
-; GFX9-NEXT:    v_perm_b32 v9, v10, v9, s4
 ; GFX9-NEXT:    v_cndmask_b32_e32 v10, v27, v11, vcc
-; GFX9-NEXT:    v_lshrrev_b32_e32 v11, 16, v11
-; GFX9-NEXT:    v_lshrrev_b32_e32 v19, 16, v27
-; GFX9-NEXT:    v_cndmask_b32_e32 v11, v19, v11, vcc
-; GFX9-NEXT:    v_perm_b32 v10, v11, v10, s4
 ; GFX9-NEXT:    v_cndmask_b32_e32 v11, v28, v12, vcc
-; GFX9-NEXT:    v_lshrrev_b32_e32 v12, 16, v12
-; GFX9-NEXT:    v_lshrrev_b32_e32 v19, 16, v28
-; GFX9-NEXT:    v_cndmask_b32_e32 v12, v19, v12, vcc
-; GFX9-NEXT:    v_perm_b32 v11, v12, v11, s4
 ; GFX9-NEXT:    v_cndmask_b32_e32 v12, v29, v13, vcc
-; GFX9-NEXT:    v_lshrrev_b32_e32 v13, 16, v13
-; GFX9-NEXT:    v_lshrrev_b32_e32 v19, 16, v29
-; GFX9-NEXT:    v_cndmask_b32_e32 v13, v19, v13, vcc
-; GFX9-NEXT:    v_perm_b32 v12, v13, v12, s4
 ; GFX9-NEXT:    v_cndmask_b32_e32 v13, v30, v14, vcc
-; GFX9-NEXT:    v_lshrrev_b32_e32 v14, 16, v14
-; GFX9-NEXT:    v_lshrrev_b32_e32 v19, 16, v30
-; GFX9-NEXT:    v_cndmask_b32_e32 v14, v19, v14, vcc
-; GFX9-NEXT:    v_perm_b32 v13, v14, v13, s4
 ; GFX9-NEXT:    s_waitcnt vmcnt(1)
 ; GFX9-NEXT:    v_cndmask_b32_e32 v14, v17, v15, vcc
-; GFX9-NEXT:    v_lshrrev_b32_e32 v15, 16, v15
-; GFX9-NEXT:    v_lshrrev_b32_e32 v17, 16, v17
-; GFX9-NEXT:    v_cndmask_b32_e32 v15, v17, v15, vcc
-; GFX9-NEXT:    v_perm_b32 v14, v15, v14, s4
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_cndmask_b32_e32 v15, v18, v16, vcc
-; GFX9-NEXT:    v_lshrrev_b32_e32 v16, 16, v16
-; GFX9-NEXT:    v_lshrrev_b32_e32 v17, 16, v18
-; GFX9-NEXT:    v_cndmask_b32_e32 v16, v17, v16, vcc
-; GFX9-NEXT:    v_perm_b32 v15, v16, v15, s4
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_select_v32bf16:
@@ -27181,89 +26642,25 @@ define <32 x bfloat> @v_select_v32bf16(i1 %cond, <32 x bfloat> %a, <32 x bfloat>
 ; GFX10-NEXT:    buffer_load_dword v31, off, s[0:3], s32
 ; GFX10-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:4
 ; GFX10-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX10-NEXT:    v_lshrrev_b32_e32 v33, 16, v1
-; GFX10-NEXT:    v_lshrrev_b32_e32 v34, 16, v17
-; GFX10-NEXT:    v_lshrrev_b32_e32 v35, 16, v2
-; GFX10-NEXT:    v_lshrrev_b32_e32 v36, 16, v18
-; GFX10-NEXT:    v_lshrrev_b32_e32 v37, 16, v3
-; GFX10-NEXT:    v_lshrrev_b32_e32 v38, 16, v19
-; GFX10-NEXT:    v_lshrrev_b32_e32 v39, 16, v4
-; GFX10-NEXT:    v_lshrrev_b32_e32 v48, 16, v20
-; GFX10-NEXT:    v_lshrrev_b32_e32 v49, 16, v5
-; GFX10-NEXT:    v_lshrrev_b32_e32 v50, 16, v21
-; GFX10-NEXT:    v_lshrrev_b32_e32 v51, 16, v6
-; GFX10-NEXT:    v_lshrrev_b32_e32 v52, 16, v22
-; GFX10-NEXT:    v_lshrrev_b32_e32 v53, 16, v7
-; GFX10-NEXT:    v_lshrrev_b32_e32 v54, 16, v23
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX10-NEXT:    v_lshrrev_b32_e32 v55, 16, v8
-; GFX10-NEXT:    v_lshrrev_b32_e32 v64, 16, v24
-; GFX10-NEXT:    v_lshrrev_b32_e32 v65, 16, v9
-; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 16, v25
-; GFX10-NEXT:    v_lshrrev_b32_e32 v66, 16, v10
-; GFX10-NEXT:    v_lshrrev_b32_e32 v67, 16, v26
-; GFX10-NEXT:    v_lshrrev_b32_e32 v68, 16, v11
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v17, v1, vcc_lo
-; GFX10-NEXT:    v_lshrrev_b32_e32 v17, 16, v27
-; GFX10-NEXT:    v_cndmask_b32_e32 v33, v34, v33, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v2, v18, v2, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v35, v36, v35, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v3, v19, v3, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v37, v38, v37, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v4, v20, v4, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v39, v48, v39, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v5, v21, v5, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v49, v50, v49, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v6, v22, v6, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v22, v52, v51, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v7, v23, v7, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v23, v54, v53, vcc_lo
-; GFX10-NEXT:    v_lshrrev_b32_e32 v34, 16, v12
-; GFX10-NEXT:    v_lshrrev_b32_e32 v18, 16, v28
-; GFX10-NEXT:    v_lshrrev_b32_e32 v36, 16, v13
-; GFX10-NEXT:    v_lshrrev_b32_e32 v19, 16, v29
-; GFX10-NEXT:    v_lshrrev_b32_e32 v38, 16, v14
-; GFX10-NEXT:    v_lshrrev_b32_e32 v20, 16, v30
-; GFX10-NEXT:    v_lshrrev_b32_e32 v48, 16, v15
-; GFX10-NEXT:    v_lshrrev_b32_e32 v21, 16, v16
-; GFX10-NEXT:    v_cndmask_b32_e32 v8, v24, v8, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v24, v64, v55, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v9, v25, v9, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v25, v0, v65, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v10, v26, v10, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v26, v67, v66, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v11, v27, v11, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v17, v17, v68, vcc_lo
-; GFX10-NEXT:    v_perm_b32 v0, v33, v1, 0x5040100
-; GFX10-NEXT:    v_perm_b32 v1, v35, v2, 0x5040100
-; GFX10-NEXT:    v_perm_b32 v2, v37, v3, 0x5040100
-; GFX10-NEXT:    v_perm_b32 v3, v39, v4, 0x5040100
-; GFX10-NEXT:    v_perm_b32 v4, v49, v5, 0x5040100
-; GFX10-NEXT:    v_perm_b32 v5, v22, v6, 0x5040100
-; GFX10-NEXT:    v_perm_b32 v6, v23, v7, 0x5040100
-; GFX10-NEXT:    v_cndmask_b32_e32 v12, v28, v12, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v18, v18, v34, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v13, v29, v13, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v19, v19, v36, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v14, v30, v14, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v20, v20, v38, vcc_lo
-; GFX10-NEXT:    v_perm_b32 v7, v24, v8, 0x5040100
-; GFX10-NEXT:    v_perm_b32 v8, v25, v9, 0x5040100
-; GFX10-NEXT:    v_perm_b32 v9, v26, v10, 0x5040100
-; GFX10-NEXT:    v_perm_b32 v10, v17, v11, 0x5040100
-; GFX10-NEXT:    v_perm_b32 v11, v18, v12, 0x5040100
-; GFX10-NEXT:    v_perm_b32 v12, v19, v13, 0x5040100
-; GFX10-NEXT:    v_perm_b32 v13, v20, v14, 0x5040100
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v17, v1, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v18, v2, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, v19, v3, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v3, v20, v4, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v4, v21, v5, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v5, v22, v6, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v6, v23, v7, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v7, v24, v8, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v8, v25, v9, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v9, v26, v10, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v10, v27, v11, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v11, v28, v12, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v12, v29, v13, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v13, v30, v14, vcc_lo
 ; GFX10-NEXT:    s_waitcnt vmcnt(1)
-; GFX10-NEXT:    v_lshrrev_b32_e32 v22, 16, v31
+; GFX10-NEXT:    v_cndmask_b32_e32 v14, v31, v15, vcc_lo
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_lshrrev_b32_e32 v23, 16, v32
-; GFX10-NEXT:    v_cndmask_b32_e32 v15, v31, v15, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v16, v32, v16, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v17, v22, v48, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v21, v23, v21, vcc_lo
-; GFX10-NEXT:    v_perm_b32 v14, v17, v15, 0x5040100
-; GFX10-NEXT:    v_perm_b32 v15, v21, v16, 0x5040100
+; GFX10-NEXT:    v_cndmask_b32_e32 v15, v32, v16, vcc_lo
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_select_v32bf16:
@@ -27273,80 +26670,17 @@ define <32 x bfloat> @v_select_v32bf16(i1 %cond, <32 x bfloat> %a, <32 x bfloat>
 ; GFX11-NEXT:    scratch_load_b32 v31, off, s32
 ; GFX11-NEXT:    scratch_load_b32 v32, off, s32 offset:4
 ; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX11-NEXT:    v_lshrrev_b32_e32 v33, 16, v1
-; GFX11-NEXT:    v_lshrrev_b32_e32 v34, 16, v17
-; GFX11-NEXT:    v_lshrrev_b32_e32 v35, 16, v2
-; GFX11-NEXT:    v_lshrrev_b32_e32 v36, 16, v18
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX11-NEXT:    v_lshrrev_b32_e32 v69, 16, v11
-; GFX11-NEXT:    v_lshrrev_b32_e32 v70, 16, v27
-; GFX11-NEXT:    v_lshrrev_b32_e32 v71, 16, v12
-; GFX11-NEXT:    v_lshrrev_b32_e32 v80, 16, v28
-; GFX11-NEXT:    v_cndmask_b32_e32 v1, v17, v1, vcc_lo
-; GFX11-NEXT:    v_lshrrev_b32_e32 v81, 16, v13
-; GFX11-NEXT:    v_lshrrev_b32_e32 v82, 16, v29
-; GFX11-NEXT:    v_lshrrev_b32_e32 v83, 16, v14
-; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v30
-; GFX11-NEXT:    v_cndmask_b32_e32 v17, v34, v33, vcc_lo
-; GFX11-NEXT:    v_cndmask_b32_e32 v2, v18, v2, vcc_lo
-; GFX11-NEXT:    v_cndmask_b32_e32 v18, v36, v35, vcc_lo
-; GFX11-NEXT:    v_lshrrev_b32_e32 v37, 16, v3
-; GFX11-NEXT:    v_cndmask_b32_e32 v3, v19, v3, vcc_lo
-; GFX11-NEXT:    v_lshrrev_b32_e32 v38, 16, v19
-; GFX11-NEXT:    v_lshrrev_b32_e32 v39, 16, v4
-; GFX11-NEXT:    v_lshrrev_b32_e32 v48, 16, v20
-; GFX11-NEXT:    v_lshrrev_b32_e32 v49, 16, v5
-; GFX11-NEXT:    v_lshrrev_b32_e32 v50, 16, v21
-; GFX11-NEXT:    v_lshrrev_b32_e32 v51, 16, v6
-; GFX11-NEXT:    v_lshrrev_b32_e32 v52, 16, v22
-; GFX11-NEXT:    v_lshrrev_b32_e32 v53, 16, v7
-; GFX11-NEXT:    v_lshrrev_b32_e32 v54, 16, v23
-; GFX11-NEXT:    v_lshrrev_b32_e32 v55, 16, v8
-; GFX11-NEXT:    v_lshrrev_b32_e32 v64, 16, v24
-; GFX11-NEXT:    v_lshrrev_b32_e32 v65, 16, v9
-; GFX11-NEXT:    v_lshrrev_b32_e32 v66, 16, v25
-; GFX11-NEXT:    v_lshrrev_b32_e32 v67, 16, v10
-; GFX11-NEXT:    v_lshrrev_b32_e32 v68, 16, v26
-; GFX11-NEXT:    v_lshrrev_b32_e32 v84, 16, v15
-; GFX11-NEXT:    v_lshrrev_b32_e32 v85, 16, v16
-; GFX11-NEXT:    v_dual_cndmask_b32 v10, v26, v10 :: v_dual_cndmask_b32 v11, v27, v11
-; GFX11-NEXT:    v_dual_cndmask_b32 v27, v70, v69 :: v_dual_cndmask_b32 v12, v28, v12
-; GFX11-NEXT:    v_dual_cndmask_b32 v28, v80, v71 :: v_dual_cndmask_b32 v13, v29, v13
-; GFX11-NEXT:    v_cndmask_b32_e32 v29, v82, v81, vcc_lo
-; GFX11-NEXT:    v_cndmask_b32_e32 v14, v30, v14, vcc_lo
-; GFX11-NEXT:    v_cndmask_b32_e32 v30, v0, v83, vcc_lo
-; GFX11-NEXT:    v_perm_b32 v0, v17, v1, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v1, v18, v2, 0x5040100
-; GFX11-NEXT:    v_dual_cndmask_b32 v19, v38, v37 :: v_dual_cndmask_b32 v4, v20, v4
-; GFX11-NEXT:    v_dual_cndmask_b32 v20, v48, v39 :: v_dual_cndmask_b32 v5, v21, v5
-; GFX11-NEXT:    v_cndmask_b32_e32 v21, v50, v49, vcc_lo
-; GFX11-NEXT:    v_dual_cndmask_b32 v6, v22, v6 :: v_dual_cndmask_b32 v7, v23, v7
-; GFX11-NEXT:    v_dual_cndmask_b32 v22, v52, v51 :: v_dual_cndmask_b32 v23, v54, v53
-; GFX11-NEXT:    v_cndmask_b32_e32 v8, v24, v8, vcc_lo
-; GFX11-NEXT:    v_dual_cndmask_b32 v24, v64, v55 :: v_dual_cndmask_b32 v9, v25, v9
-; GFX11-NEXT:    v_dual_cndmask_b32 v25, v66, v65 :: v_dual_cndmask_b32 v26, v68, v67
-; GFX11-NEXT:    v_perm_b32 v2, v19, v3, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v3, v20, v4, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v4, v21, v5, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v5, v22, v6, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v6, v23, v7, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v7, v24, v8, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v8, v25, v9, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v9, v26, v10, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v10, v27, v11, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v11, v28, v12, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v12, v29, v13, 0x5040100
-; GFX11-NEXT:    s_waitcnt vmcnt(1)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v17, 16, v31
+; GFX11-NEXT:    v_dual_cndmask_b32 v0, v17, v1 :: v_dual_cndmask_b32 v1, v18, v2
+; GFX11-NEXT:    v_dual_cndmask_b32 v2, v19, v3 :: v_dual_cndmask_b32 v3, v20, v4
+; GFX11-NEXT:    v_dual_cndmask_b32 v4, v21, v5 :: v_dual_cndmask_b32 v5, v22, v6
+; GFX11-NEXT:    v_dual_cndmask_b32 v6, v23, v7 :: v_dual_cndmask_b32 v7, v24, v8
+; GFX11-NEXT:    v_dual_cndmask_b32 v8, v25, v9 :: v_dual_cndmask_b32 v9, v26, v10
+; GFX11-NEXT:    v_dual_cndmask_b32 v10, v27, v11 :: v_dual_cndmask_b32 v11, v28, v12
+; GFX11-NEXT:    v_dual_cndmask_b32 v12, v29, v13 :: v_dual_cndmask_b32 v13, v30, v14
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v18, 16, v32
-; GFX11-NEXT:    v_dual_cndmask_b32 v15, v31, v15 :: v_dual_cndmask_b32 v16, v32, v16
-; GFX11-NEXT:    v_perm_b32 v13, v30, v14, 0x5040100
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_dual_cndmask_b32 v17, v17, v84 :: v_dual_cndmask_b32 v18, v18, v85
-; GFX11-NEXT:    v_perm_b32 v14, v17, v15, 0x5040100
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT:    v_perm_b32 v15, v18, v16, 0x5040100
+; GFX11-NEXT:    v_dual_cndmask_b32 v14, v31, v15 :: v_dual_cndmask_b32 v15, v32, v16
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = select i1 %cond, <32 x bfloat> %a, <32 x bfloat> %b
   ret <32 x bfloat> %op
@@ -27413,10 +26747,17 @@ define amdgpu_ps <2 x i32> @s_select_v3bf16(<3 x bfloat> inreg %a, <3 x bfloat>
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
+; GFX8-NEXT:    s_lshr_b32 s0, s1, 16
+; GFX8-NEXT:    s_lshr_b32 s2, s3, 16
 ; GFX8-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX8-NEXT:    v_mov_b32_e32 v1, s3
-; GFX8-NEXT:    v_mov_b32_e32 v2, s1
+; GFX8-NEXT:    v_mov_b32_e32 v1, s2
+; GFX8-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
+; GFX8-NEXT:    v_mov_b32_e32 v2, s3
+; GFX8-NEXT:    v_mov_b32_e32 v3, s1
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
+; GFX8-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX8-NEXT:    v_readfirstlane_b32 s1, v1
@@ -27424,17 +26765,10 @@ define amdgpu_ps <2 x i32> @s_select_v3bf16(<3 x bfloat> inreg %a, <3 x bfloat>
 ;
 ; GFX9-LABEL: s_select_v3bf16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_lshr_b32 s4, s0, 16
-; GFX9-NEXT:    s_lshr_b32 s5, s2, 16
-; GFX9-NEXT:    v_mov_b32_e32 v1, s5
-; GFX9-NEXT:    v_mov_b32_e32 v2, s4
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s0
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
-; GFX9-NEXT:    s_mov_b32 s0, 0x5040100
-; GFX9-NEXT:    v_perm_b32 v0, v0, v1, s0
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
@@ -27445,39 +26779,28 @@ define amdgpu_ps <2 x i32> @s_select_v3bf16(<3 x bfloat> inreg %a, <3 x bfloat>
 ;
 ; GFX10-LABEL: s_select_v3bf16:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_lshr_b32 s4, s0, 16
-; GFX10-NEXT:    v_mov_b32_e32 v2, s0
-; GFX10-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-NEXT:    v_mov_b32_e32 v1, s0
+; GFX10-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10-NEXT:    v_mov_b32_e32 v0, s1
-; GFX10-NEXT:    s_lshr_b32 s5, s2, 16
-; GFX10-NEXT:    v_cndmask_b32_e32 v2, s2, v2, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, s5, v1, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, s3, v0, vcc_lo
-; GFX10-NEXT:    v_perm_b32 v1, v1, v2, 0x5040100
-; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX10-NEXT:    v_readfirstlane_b32 s0, v1
-; GFX10-NEXT:    v_readfirstlane_b32 s1, v0
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, s2, v1, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, s3, v2, vcc_lo
+; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX10-NEXT:    v_readfirstlane_b32 s1, v1
 ; GFX10-NEXT:    ; return to shader part epilog
 ;
 ; GFX11-LABEL: s_select_v3bf16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_lshr_b32 s4, s0, 16
+; GFX11-NEXT:    v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v2, s1
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-NEXT:    v_dual_mov_b32 v1, s4 :: v_dual_mov_b32 v2, s0
-; GFX11-NEXT:    v_mov_b32_e32 v0, s1
-; GFX11-NEXT:    s_lshr_b32 s5, s2, 16
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT:    v_cndmask_b32_e32 v1, s5, v1, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_cndmask_b32_e32 v2, s2, v2, vcc_lo
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, s3, v0, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_perm_b32 v1, v1, v2, 0x5040100
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, s2, v1, vcc_lo
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, s3, v2, vcc_lo
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_readfirstlane_b32 s0, v1
-; GFX11-NEXT:    v_readfirstlane_b32 s1, v0
+; GFX11-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_readfirstlane_b32 s1, v1
 ; GFX11-NEXT:    ; return to shader part epilog
   %cond = icmp eq i32 %c, 0
   %op = select i1 %cond, <3 x bfloat> %a, <3 x bfloat> %b
@@ -27582,73 +26905,40 @@ define amdgpu_ps <2 x i32> @s_select_v4bf16(<4 x bfloat> inreg %a, <4 x bfloat>
 ; GFX8-NEXT:    v_readfirstlane_b32 s1, v0
 ; GFX8-NEXT:    ; return to shader part epilog
 ;
-; GFX9-LABEL: s_select_v4bf16:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_lshr_b32 s4, s1, 16
-; GFX9-NEXT:    s_lshr_b32 s5, s3, 16
-; GFX9-NEXT:    v_mov_b32_e32 v1, s5
-; GFX9-NEXT:    v_mov_b32_e32 v2, s4
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
+; GFX9-LABEL: s_select_v4bf16:
+; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s1
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
+; GFX9-NEXT:    v_mov_b32_e32 v1, s2
+; GFX9-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
-; GFX9-NEXT:    s_mov_b32 s1, 0x5040100
-; GFX9-NEXT:    s_lshr_b32 s3, s0, 16
-; GFX9-NEXT:    s_lshr_b32 s4, s2, 16
-; GFX9-NEXT:    v_perm_b32 v0, v0, v1, s1
-; GFX9-NEXT:    v_mov_b32_e32 v1, s4
-; GFX9-NEXT:    v_mov_b32_e32 v2, s3
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
-; GFX9-NEXT:    v_mov_b32_e32 v2, s2
-; GFX9-NEXT:    v_mov_b32_e32 v3, s0
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
-; GFX9-NEXT:    v_perm_b32 v1, v1, v2, s1
 ; GFX9-NEXT:    v_readfirstlane_b32 s0, v1
 ; GFX9-NEXT:    v_readfirstlane_b32 s1, v0
 ; GFX9-NEXT:    ; return to shader part epilog
 ;
 ; GFX10-LABEL: s_select_v4bf16:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_lshr_b32 s4, s1, 16
-; GFX10-NEXT:    s_lshr_b32 s6, s0, 16
-; GFX10-NEXT:    v_mov_b32_e32 v1, s4
-; GFX10-NEXT:    v_mov_b32_e32 v2, s6
+; GFX10-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-NEXT:    v_mov_b32_e32 v3, s1
-; GFX10-NEXT:    s_lshr_b32 s5, s3, 16
-; GFX10-NEXT:    s_lshr_b32 s0, s2, 16
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, s5, v1, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v2, s0, v2, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, s2, v0, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v3, s3, v3, vcc_lo
-; GFX10-NEXT:    v_perm_b32 v0, v2, v0, 0x5040100
-; GFX10-NEXT:    v_perm_b32 v1, v1, v3, 0x5040100
-; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
-; GFX10-NEXT:    v_readfirstlane_b32 s1, v1
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, s3, v1, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, s2, v2, vcc_lo
+; GFX10-NEXT:    v_readfirstlane_b32 s1, v0
+; GFX10-NEXT:    v_readfirstlane_b32 s0, v1
 ; GFX10-NEXT:    ; return to shader part epilog
 ;
 ; GFX11-LABEL: s_select_v4bf16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_lshr_b32 s4, s1, 16
-; GFX11-NEXT:    s_lshr_b32 s6, s0, 16
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    v_dual_mov_b32 v1, s4 :: v_dual_mov_b32 v2, s6
+; GFX11-NEXT:    v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s0
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s1
-; GFX11-NEXT:    s_lshr_b32 s5, s3, 16
-; GFX11-NEXT:    s_lshr_b32 s0, s2, 16
-; GFX11-NEXT:    v_cndmask_b32_e32 v1, s5, v1, vcc_lo
-; GFX11-NEXT:    v_cndmask_b32_e32 v2, s0, v2, vcc_lo
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, s2, v0, vcc_lo
-; GFX11-NEXT:    v_cndmask_b32_e32 v3, s3, v3, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_perm_b32 v0, v2, v0, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v1, v1, v3, 0x5040100
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, s3, v1, vcc_lo
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, s2, v2, vcc_lo
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_readfirstlane_b32 s0, v0
-; GFX11-NEXT:    v_readfirstlane_b32 s1, v1
+; GFX11-NEXT:    v_readfirstlane_b32 s1, v0
+; GFX11-NEXT:    v_readfirstlane_b32 s0, v1
 ; GFX11-NEXT:    ; return to shader part epilog
   %cond = icmp eq i32 %c, 0
   %op = select i1 %cond, <4 x bfloat> %a, <4 x bfloat> %b
@@ -27887,96 +27177,95 @@ define <4 x bfloat> @v_vselect_v4bf16(<4 x i1> %cond, <4 x bfloat> %a, <4 x bflo
 ; GFX8-LABEL: v_vselect_v4bf16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX8-NEXT:    v_and_b32_e32 v3, 1, v3
+; GFX8-NEXT:    v_and_b32_e32 v2, 1, v2
+; GFX8-NEXT:    v_lshrrev_b32_e32 v8, 16, v5
+; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 16, v7
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v3
 ; GFX8-NEXT:    v_and_b32_e32 v1, 1, v1
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v9, v8, vcc
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v2
+; GFX8-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v7, v5, vcc
+; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v4
+; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 16, v6
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v1
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v7, v5, vcc
 ; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v6, v4, vcc
-; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
-; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v1
-; GFX8-NEXT:    v_and_b32_e32 v2, 1, v2
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v6, v4, vcc
-; GFX8-NEXT:    v_and_b32_e32 v3, 1, v3
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v2
 ; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v7, v5, vcc
-; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v5
-; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v7
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v3
-; GFX8-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX8-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
+; GFX8-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_vselect_v4bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v2, 1, v2
+; GFX9-NEXT:    v_and_b32_e32 v3, 1, v3
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v2
 ; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v7, v5, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
+; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v3
 ; GFX9-NEXT:    v_and_b32_e32 v1, 1, v1
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v7, v5, vcc
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX9-NEXT:    v_and_b32_e32 v2, 1, v2
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v6, v4, vcc
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
-; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
+; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 16, v6
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v1
-; GFX9-NEXT:    v_and_b32_e32 v3, 1, v3
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v6, v4, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v4, vcc
 ; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v2
 ; GFX9-NEXT:    v_perm_b32 v0, v1, v0, s4
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v7, v5, vcc
-; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v5
-; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 16, v7
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v3
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
-; GFX9-NEXT:    v_perm_b32 v1, v2, v1, s4
+; GFX9-NEXT:    v_perm_b32 v1, v3, v2, s4
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_vselect_v4bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_and_b32_e32 v2, 1, v2
 ; GFX10-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX10-NEXT:    v_and_b32_e32 v1, 1, v1
-; GFX10-NEXT:    v_and_b32_e32 v2, 1, v2
+; GFX10-NEXT:    v_and_b32_e32 v3, 1, v3
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v8, 16, v4
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v2
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v9, 16, v6
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, v7, v5, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX10-NEXT:    v_and_b32_e32 v3, 1, v3
+; GFX10-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
+; GFX10-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v6, v4, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v1
-; GFX10-NEXT:    v_lshrrev_b32_e32 v4, 16, v5
-; GFX10-NEXT:    v_lshrrev_b32_e32 v6, 16, v7
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v9, v8, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v2
-; GFX10-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
-; GFX10-NEXT:    v_cndmask_b32_e32 v2, v7, v5, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v3
-; GFX10-NEXT:    v_cndmask_b32_e32 v3, v6, v4, vcc_lo
+; GFX10-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
+; GFX10-NEXT:    v_cndmask_b32_e32 v3, v7, v5, vcc_lo
 ; GFX10-NEXT:    v_perm_b32 v1, v3, v2, 0x5040100
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_vselect_v4bf16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX11-NEXT:    v_and_b32_e32 v1, 1, v1
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v8, 16, v4
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v9, 16, v6
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-NEXT:    v_and_b32_e32 v2, 1, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v2
+; GFX11-NEXT:    v_dual_cndmask_b32 v2, v7, v5 :: v_dual_and_b32 v3, 1, v3
+; GFX11-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
+; GFX11-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
+; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX11-NEXT:    v_dual_cndmask_b32 v0, v6, v4 :: v_dual_and_b32 v3, 1, v3
+; GFX11-NEXT:    v_dual_cndmask_b32 v0, v6, v4 :: v_dual_and_b32 v1, 1, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v1
-; GFX11-NEXT:    v_and_b32_e32 v2, 1, v2
-; GFX11-NEXT:    v_lshrrev_b32_e32 v4, 16, v5
-; GFX11-NEXT:    v_lshrrev_b32_e32 v6, 16, v7
 ; GFX11-NEXT:    v_cndmask_b32_e32 v1, v9, v8, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v2
-; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
-; GFX11-NEXT:    v_cndmask_b32_e32 v2, v7, v5, vcc_lo
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v3
-; GFX11-NEXT:    v_cndmask_b32_e32 v3, v6, v4, vcc_lo
+; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
+; GFX11-NEXT:    v_cndmask_b32_e32 v3, v7, v5, vcc_lo
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_perm_b32 v1, v3, v2, 0x5040100
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
@@ -28062,172 +27351,170 @@ define <8 x bfloat> @v_vselect_v8bf16(<8 x i1> %cond, <8 x bfloat> %a, <8 x bflo
 ; GFX8-LABEL: v_vselect_v8bf16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX8-NEXT:    v_and_b32_e32 v7, 1, v7
+; GFX8-NEXT:    v_and_b32_e32 v6, 1, v6
+; GFX8-NEXT:    v_lshrrev_b32_e32 v16, 16, v11
+; GFX8-NEXT:    v_lshrrev_b32_e32 v17, 16, v15
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v7
+; GFX8-NEXT:    v_and_b32_e32 v5, 1, v5
+; GFX8-NEXT:    v_cndmask_b32_e32 v7, v17, v16, vcc
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v6
+; GFX8-NEXT:    v_and_b32_e32 v4, 1, v4
+; GFX8-NEXT:    v_cndmask_b32_e32 v6, v15, v11, vcc
+; GFX8-NEXT:    v_lshrrev_b32_e32 v11, 16, v10
+; GFX8-NEXT:    v_lshrrev_b32_e32 v15, 16, v14
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v5
+; GFX8-NEXT:    v_and_b32_e32 v3, 1, v3
+; GFX8-NEXT:    v_cndmask_b32_e32 v5, v15, v11, vcc
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v4
+; GFX8-NEXT:    v_and_b32_e32 v2, 1, v2
+; GFX8-NEXT:    v_cndmask_b32_e32 v4, v14, v10, vcc
+; GFX8-NEXT:    v_lshrrev_b32_e32 v10, 16, v9
+; GFX8-NEXT:    v_lshrrev_b32_e32 v11, 16, v13
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v3
 ; GFX8-NEXT:    v_and_b32_e32 v1, 1, v1
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v11, v10, vcc
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v2
+; GFX8-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v13, v9, vcc
+; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 16, v8
+; GFX8-NEXT:    v_lshrrev_b32_e32 v10, 16, v12
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v1
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v10, v9, vcc
 ; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v12, v8, vcc
-; GFX8-NEXT:    v_lshrrev_b32_e32 v8, 16, v8
-; GFX8-NEXT:    v_lshrrev_b32_e32 v12, 16, v12
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v1
-; GFX8-NEXT:    v_and_b32_e32 v2, 1, v2
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v12, v8, vcc
-; GFX8-NEXT:    v_and_b32_e32 v3, 1, v3
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v2
 ; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v13, v9, vcc
-; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v9
-; GFX8-NEXT:    v_lshrrev_b32_e32 v8, 16, v13
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v3
-; GFX8-NEXT:    v_and_b32_e32 v4, 1, v4
-; GFX8-NEXT:    v_cndmask_b32_e32 v2, v8, v2, vcc
-; GFX8-NEXT:    v_and_b32_e32 v5, 1, v5
-; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v4
-; GFX8-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX8-NEXT:    v_cndmask_b32_e32 v2, v14, v10, vcc
-; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v10
-; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v14
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v5
-; GFX8-NEXT:    v_and_b32_e32 v6, 1, v6
-; GFX8-NEXT:    v_cndmask_b32_e32 v3, v4, v3, vcc
-; GFX8-NEXT:    v_and_b32_e32 v7, 1, v7
-; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v6
-; GFX8-NEXT:    v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX8-NEXT:    v_cndmask_b32_e32 v3, v15, v11, vcc
-; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v11
-; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v15
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v7
-; GFX8-NEXT:    v_cndmask_b32_e32 v4, v5, v4, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX8-NEXT:    v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
+; GFX8-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v5
+; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v7
+; GFX8-NEXT:    v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_sdwa v3, v6, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_vselect_v8bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v6, 1, v6
+; GFX9-NEXT:    v_and_b32_e32 v7, 1, v7
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v6
+; GFX9-NEXT:    v_and_b32_e32 v4, 1, v4
+; GFX9-NEXT:    v_cndmask_b32_e32 v6, v15, v11, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v11, 16, v11
+; GFX9-NEXT:    v_lshrrev_b32_e32 v15, 16, v15
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v7
+; GFX9-NEXT:    v_and_b32_e32 v5, 1, v5
+; GFX9-NEXT:    v_cndmask_b32_e32 v7, v15, v11, vcc
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v4
+; GFX9-NEXT:    v_and_b32_e32 v2, 1, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v4, v14, v10, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v10, 16, v10
+; GFX9-NEXT:    v_lshrrev_b32_e32 v11, 16, v14
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v5
+; GFX9-NEXT:    v_and_b32_e32 v3, 1, v3
+; GFX9-NEXT:    v_cndmask_b32_e32 v5, v11, v10, vcc
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v2
 ; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v13, v9, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v9, 16, v9
+; GFX9-NEXT:    v_lshrrev_b32_e32 v10, 16, v13
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v3
 ; GFX9-NEXT:    v_and_b32_e32 v1, 1, v1
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v10, v9, vcc
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX9-NEXT:    v_and_b32_e32 v2, 1, v2
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v12, v8, vcc
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v8, 16, v8
-; GFX9-NEXT:    v_lshrrev_b32_e32 v12, 16, v12
+; GFX9-NEXT:    v_lshrrev_b32_e32 v9, 16, v12
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v1
-; GFX9-NEXT:    v_and_b32_e32 v3, 1, v3
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v12, v8, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v9, v8, vcc
 ; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v2
-; GFX9-NEXT:    v_and_b32_e32 v4, 1, v4
 ; GFX9-NEXT:    v_perm_b32 v0, v1, v0, s4
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v13, v9, vcc
-; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v9
-; GFX9-NEXT:    v_lshrrev_b32_e32 v8, 16, v13
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v3
-; GFX9-NEXT:    v_and_b32_e32 v5, 1, v5
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, v8, v2, vcc
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v4
-; GFX9-NEXT:    v_and_b32_e32 v6, 1, v6
-; GFX9-NEXT:    v_perm_b32 v1, v2, v1, s4
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, v14, v10, vcc
-; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v10
-; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 16, v14
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v5
-; GFX9-NEXT:    v_and_b32_e32 v7, 1, v7
-; GFX9-NEXT:    v_cndmask_b32_e32 v3, v4, v3, vcc
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v6
-; GFX9-NEXT:    v_perm_b32 v2, v3, v2, s4
-; GFX9-NEXT:    v_cndmask_b32_e32 v3, v15, v11, vcc
-; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 16, v11
-; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 16, v15
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v7
-; GFX9-NEXT:    v_cndmask_b32_e32 v4, v5, v4, vcc
-; GFX9-NEXT:    v_perm_b32 v3, v4, v3, s4
+; GFX9-NEXT:    v_perm_b32 v1, v3, v2, s4
+; GFX9-NEXT:    v_perm_b32 v2, v5, v4, s4
+; GFX9-NEXT:    v_perm_b32 v3, v7, v6, s4
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_vselect_v8bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_and_b32_e32 v6, 1, v6
+; GFX10-NEXT:    v_and_b32_e32 v4, 1, v4
+; GFX10-NEXT:    v_and_b32_e32 v5, 1, v5
+; GFX10-NEXT:    v_and_b32_e32 v2, 1, v2
+; GFX10-NEXT:    v_lshrrev_b32_e32 v16, 16, v10
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v6
+; GFX10-NEXT:    v_lshrrev_b32_e32 v17, 16, v14
 ; GFX10-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX10-NEXT:    v_and_b32_e32 v1, 1, v1
-; GFX10-NEXT:    v_and_b32_e32 v2, 1, v2
-; GFX10-NEXT:    v_lshrrev_b32_e32 v16, 16, v8
-; GFX10-NEXT:    v_lshrrev_b32_e32 v17, 16, v12
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
 ; GFX10-NEXT:    v_and_b32_e32 v3, 1, v3
-; GFX10-NEXT:    v_and_b32_e32 v4, 1, v4
-; GFX10-NEXT:    v_and_b32_e32 v5, 1, v5
-; GFX10-NEXT:    v_and_b32_e32 v6, 1, v6
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v12, v8, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v1
-; GFX10-NEXT:    v_lshrrev_b32_e32 v8, 16, v9
-; GFX10-NEXT:    v_lshrrev_b32_e32 v12, 16, v13
+; GFX10-NEXT:    v_cndmask_b32_e32 v6, v15, v11, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v4
 ; GFX10-NEXT:    v_and_b32_e32 v7, 1, v7
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v17, v16, vcc_lo
+; GFX10-NEXT:    v_lshrrev_b32_e32 v11, 16, v11
+; GFX10-NEXT:    v_lshrrev_b32_e32 v15, 16, v15
+; GFX10-NEXT:    v_cndmask_b32_e32 v4, v14, v10, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v5
+; GFX10-NEXT:    v_lshrrev_b32_e32 v10, 16, v8
+; GFX10-NEXT:    v_lshrrev_b32_e32 v14, 16, v12
+; GFX10-NEXT:    v_cndmask_b32_e32 v5, v17, v16, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v2
-; GFX10-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
 ; GFX10-NEXT:    v_cndmask_b32_e32 v2, v13, v9, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
+; GFX10-NEXT:    v_lshrrev_b32_e32 v9, 16, v9
+; GFX10-NEXT:    v_lshrrev_b32_e32 v13, 16, v13
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v12, v8, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v1
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v14, v10, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v3
-; GFX10-NEXT:    v_lshrrev_b32_e32 v9, 16, v14
-; GFX10-NEXT:    v_cndmask_b32_e32 v3, v12, v8, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v4
-; GFX10-NEXT:    v_lshrrev_b32_e32 v8, 16, v10
-; GFX10-NEXT:    v_lshrrev_b32_e32 v12, 16, v15
+; GFX10-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
+; GFX10-NEXT:    v_cndmask_b32_e32 v3, v13, v9, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v7
 ; GFX10-NEXT:    v_perm_b32 v1, v3, v2, 0x5040100
-; GFX10-NEXT:    v_cndmask_b32_e32 v4, v14, v10, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v5
-; GFX10-NEXT:    v_lshrrev_b32_e32 v10, 16, v11
-; GFX10-NEXT:    v_cndmask_b32_e32 v5, v9, v8, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v6
+; GFX10-NEXT:    v_cndmask_b32_e32 v7, v15, v11, vcc_lo
 ; GFX10-NEXT:    v_perm_b32 v2, v5, v4, 0x5040100
-; GFX10-NEXT:    v_cndmask_b32_e32 v6, v15, v11, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v7
-; GFX10-NEXT:    v_cndmask_b32_e32 v7, v12, v10, vcc_lo
 ; GFX10-NEXT:    v_perm_b32 v3, v7, v6, 0x5040100
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_vselect_v8bf16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v16, 16, v10
+; GFX11-NEXT:    v_lshrrev_b32_e32 v17, 16, v14
+; GFX11-NEXT:    v_and_b32_e32 v6, 1, v6
+; GFX11-NEXT:    v_and_b32_e32 v5, 1, v5
+; GFX11-NEXT:    v_and_b32_e32 v2, 1, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v6
+; GFX11-NEXT:    v_dual_cndmask_b32 v6, v15, v11 :: v_dual_and_b32 v1, 1, v1
+; GFX11-NEXT:    v_lshrrev_b32_e32 v11, 16, v11
+; GFX11-NEXT:    v_lshrrev_b32_e32 v15, 16, v15
+; GFX11-NEXT:    v_and_b32_e32 v4, 1, v4
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v4
+; GFX11-NEXT:    v_dual_cndmask_b32 v4, v14, v10 :: v_dual_and_b32 v3, 1, v3
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v5
+; GFX11-NEXT:    v_lshrrev_b32_e32 v10, 16, v8
+; GFX11-NEXT:    v_lshrrev_b32_e32 v14, 16, v12
 ; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX11-NEXT:    v_and_b32_e32 v1, 1, v1
-; GFX11-NEXT:    v_lshrrev_b32_e32 v16, 16, v8
-; GFX11-NEXT:    v_lshrrev_b32_e32 v17, 16, v12
+; GFX11-NEXT:    v_cndmask_b32_e32 v5, v17, v16, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v2
+; GFX11-NEXT:    v_dual_cndmask_b32 v2, v13, v9 :: v_dual_and_b32 v7, 1, v7
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX11-NEXT:    v_dual_cndmask_b32 v0, v12, v8 :: v_dual_and_b32 v7, 1, v7
-; GFX11-NEXT:    v_lshrrev_b32_e32 v8, 16, v9
-; GFX11-NEXT:    v_lshrrev_b32_e32 v12, 16, v13
-; GFX11-NEXT:    v_and_b32_e32 v2, 1, v2
+; GFX11-NEXT:    v_lshrrev_b32_e32 v9, 16, v9
+; GFX11-NEXT:    v_lshrrev_b32_e32 v13, 16, v13
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, v12, v8, vcc_lo
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v1
-; GFX11-NEXT:    v_dual_cndmask_b32 v1, v17, v16 :: v_dual_and_b32 v6, 1, v6
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v2
-; GFX11-NEXT:    v_and_b32_e32 v3, 1, v3
-; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
-; GFX11-NEXT:    v_cndmask_b32_e32 v2, v13, v9, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, v14, v10, vcc_lo
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v3
-; GFX11-NEXT:    v_lshrrev_b32_e32 v9, 16, v14
-; GFX11-NEXT:    v_cndmask_b32_e32 v3, v12, v8, vcc_lo
-; GFX11-NEXT:    v_lshrrev_b32_e32 v12, 16, v15
-; GFX11-NEXT:    v_and_b32_e32 v4, 1, v4
-; GFX11-NEXT:    v_lshrrev_b32_e32 v8, 16, v10
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
+; GFX11-NEXT:    v_cndmask_b32_e32 v3, v13, v9, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v7
 ; GFX11-NEXT:    v_perm_b32 v1, v3, v2, 0x5040100
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v4
-; GFX11-NEXT:    v_dual_cndmask_b32 v4, v14, v10 :: v_dual_and_b32 v5, 1, v5
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v5
-; GFX11-NEXT:    v_lshrrev_b32_e32 v10, 16, v11
-; GFX11-NEXT:    v_cndmask_b32_e32 v5, v9, v8, vcc_lo
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v6
+; GFX11-NEXT:    v_cndmask_b32_e32 v7, v15, v11, vcc_lo
 ; GFX11-NEXT:    v_perm_b32 v2, v5, v4, 0x5040100
-; GFX11-NEXT:    v_cndmask_b32_e32 v6, v15, v11, vcc_lo
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v7
-; GFX11-NEXT:    v_cndmask_b32_e32 v7, v12, v10, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-NEXT:    v_perm_b32 v3, v7, v6, 0x5040100
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = select <8 x i1> %cond, <8 x bfloat> %a, <8 x bfloat> %b
@@ -28469,244 +27756,259 @@ define <16 x bfloat> @v_vselect_v16bf16(<16 x i1> %cond, <16 x bfloat> %a, <16 x
 ; GFX8-LABEL: v_vselect_v16bf16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX8-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
 ; GFX8-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
 ; GFX8-NEXT:    v_and_b32_e32 v0, 1, v1
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 s[4:5], 1, v0
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v24, v16, vcc
-; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v16
-; GFX8-NEXT:    v_lshrrev_b32_e32 v16, 16, v24
-; GFX8-NEXT:    v_cndmask_b32_e64 v1, v16, v1, s[4:5]
-; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX8-NEXT:    v_and_b32_e32 v1, 1, v2
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v1
-; GFX8-NEXT:    v_and_b32_e32 v1, 1, v3
-; GFX8-NEXT:    v_cmp_eq_u32_e64 s[4:5], 1, v1
-; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v17
-; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v25
-; GFX8-NEXT:    v_cndmask_b32_e64 v2, v3, v2, s[4:5]
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v25, v17, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX8-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX8-NEXT:    v_and_b32_e32 v2, 1, v4
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v2
-; GFX8-NEXT:    v_and_b32_e32 v2, 1, v5
-; GFX8-NEXT:    v_cmp_eq_u32_e64 s[4:5], 1, v2
-; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v18
-; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v26
-; GFX8-NEXT:    v_cndmask_b32_e64 v3, v4, v3, s[4:5]
-; GFX8-NEXT:    v_cndmask_b32_e32 v2, v26, v18, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX8-NEXT:    v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX8-NEXT:    v_and_b32_e32 v3, 1, v6
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v3
-; GFX8-NEXT:    v_and_b32_e32 v3, 1, v7
-; GFX8-NEXT:    buffer_load_dword v7, off, s[0:3], s32
-; GFX8-NEXT:    v_cmp_eq_u32_e64 s[4:5], 1, v3
+; GFX8-NEXT:    v_and_b32_e32 v0, 1, v2
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[6:7], 1, v0
+; GFX8-NEXT:    v_and_b32_e32 v0, 1, v3
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[8:9], 1, v0
+; GFX8-NEXT:    v_and_b32_e32 v0, 1, v4
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[10:11], 1, v0
+; GFX8-NEXT:    v_and_b32_e32 v0, 1, v5
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[12:13], 1, v0
+; GFX8-NEXT:    v_and_b32_e32 v0, 1, v6
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[14:15], 1, v0
+; GFX8-NEXT:    v_and_b32_e32 v0, 1, v7
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[16:17], 1, v0
+; GFX8-NEXT:    v_and_b32_e32 v0, 1, v8
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[18:19], 1, v0
+; GFX8-NEXT:    v_and_b32_e32 v0, 1, v9
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[20:21], 1, v0
+; GFX8-NEXT:    v_and_b32_e32 v0, 1, v10
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[22:23], 1, v0
+; GFX8-NEXT:    buffer_load_dword v0, off, s[0:3], s32
+; GFX8-NEXT:    v_writelane_b32 v31, s30, 0
+; GFX8-NEXT:    v_and_b32_e32 v2, 1, v12
+; GFX8-NEXT:    v_and_b32_e32 v3, 1, v13
+; GFX8-NEXT:    v_writelane_b32 v31, s31, 1
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[26:27], 1, v2
+; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v22
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[28:29], 1, v3
+; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v30
+; GFX8-NEXT:    v_writelane_b32 v31, s34, 2
+; GFX8-NEXT:    v_and_b32_e32 v1, 1, v11
+; GFX8-NEXT:    v_and_b32_e32 v4, 1, v14
+; GFX8-NEXT:    v_and_b32_e32 v5, 1, v15
+; GFX8-NEXT:    v_cndmask_b32_e64 v6, v3, v2, s[28:29]
+; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v20
+; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v28
+; GFX8-NEXT:    v_writelane_b32 v31, s35, 3
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[24:25], 1, v1
+; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v23
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[30:31], 1, v4
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[34:35], 1, v5
+; GFX8-NEXT:    v_cndmask_b32_e64 v10, v3, v2, s[20:21]
+; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v21
+; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v29
+; GFX8-NEXT:    v_cndmask_b32_e64 v5, v5, v4, s[24:25]
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v19
-; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v27
-; GFX8-NEXT:    v_cndmask_b32_e64 v4, v5, v4, s[4:5]
-; GFX8-NEXT:    v_and_b32_e32 v8, 1, v8
-; GFX8-NEXT:    v_cndmask_b32_e32 v3, v27, v19, vcc
-; GFX8-NEXT:    v_and_b32_e32 v9, 1, v9
-; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v8
-; GFX8-NEXT:    v_and_b32_e32 v5, 1, v11
-; GFX8-NEXT:    v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX8-NEXT:    v_cndmask_b32_e32 v4, v28, v20, vcc
-; GFX8-NEXT:    v_lshrrev_b32_e32 v8, 16, v20
-; GFX8-NEXT:    v_lshrrev_b32_e32 v11, 16, v28
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v9
-; GFX8-NEXT:    v_and_b32_e32 v10, 1, v10
-; GFX8-NEXT:    v_cndmask_b32_e32 v8, v11, v8, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v10
-; GFX8-NEXT:    v_or_b32_sdwa v4, v4, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX8-NEXT:    v_cndmask_b32_e32 v8, v29, v21, vcc
-; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 16, v21
-; GFX8-NEXT:    v_lshrrev_b32_e32 v10, 16, v29
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v5
-; GFX8-NEXT:    v_and_b32_e32 v12, 1, v12
-; GFX8-NEXT:    v_cndmask_b32_e32 v5, v10, v9, vcc
-; GFX8-NEXT:    v_and_b32_e32 v6, 1, v13
+; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 16, v27
+; GFX8-NEXT:    v_cndmask_b32_e64 v3, v9, v4, s[16:17]
+; GFX8-NEXT:    v_lshrrev_b32_e32 v15, 16, v24
+; GFX8-NEXT:    v_cndmask_b32_e64 v7, v30, v22, s[26:27]
+; GFX8-NEXT:    v_cndmask_b32_e64 v4, v27, v19, s[14:15]
+; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
+; GFX8-NEXT:    v_cndmask_b32_e64 v8, v29, v21, s[22:23]
+; GFX8-NEXT:    v_cndmask_b32_e64 v11, v28, v20, s[18:19]
+; GFX8-NEXT:    v_cndmask_b32_e64 v9, v26, v18, s[10:11]
+; GFX8-NEXT:    v_cndmask_b32_e64 v14, v25, v17, s[6:7]
+; GFX8-NEXT:    v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v10
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v12
+; GFX8-NEXT:    v_or_b32_sdwa v6, v7, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_sdwa v4, v11, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT:    v_or_b32_sdwa v5, v8, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX8-NEXT:    v_cndmask_b32_e32 v8, v30, v22, vcc
-; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 16, v22
-; GFX8-NEXT:    v_lshrrev_b32_e32 v10, 16, v30
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v6
-; GFX8-NEXT:    v_and_b32_e32 v14, 1, v14
-; GFX8-NEXT:    v_cndmask_b32_e32 v6, v10, v9, vcc
-; GFX8-NEXT:    v_and_b32_e32 v15, 1, v15
-; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v14
-; GFX8-NEXT:    v_or_b32_sdwa v6, v8, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 16, v23
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_cndmask_b32_e32 v8, v7, v23, vcc
-; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v15
-; GFX8-NEXT:    v_cndmask_b32_e32 v7, v7, v9, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
-; GFX8-NEXT:    v_or_b32_sdwa v7, v8, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; GFX8-NEXT:    v_cndmask_b32_e64 v12, v0, v23, s[30:31]
+; GFX8-NEXT:    v_cndmask_b32_e64 v13, v2, v1, s[34:35]
+; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v18
+; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v26
+; GFX8-NEXT:    v_cndmask_b32_e64 v2, v1, v0, s[12:13]
+; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v17
+; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v25
+; GFX8-NEXT:    v_cndmask_b32_e64 v1, v1, v0, s[8:9]
+; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v16
+; GFX8-NEXT:    v_cndmask_b32_e64 v0, v15, v0, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v15, v24, v16, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX8-NEXT:    v_lshlrev_b32_e32 v7, 16, v13
+; GFX8-NEXT:    v_or_b32_sdwa v0, v15, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_sdwa v1, v14, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_sdwa v2, v9, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_sdwa v7, v12, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    v_readlane_b32 s35, v31, 3
+; GFX8-NEXT:    v_readlane_b32 s34, v31, 2
+; GFX8-NEXT:    v_readlane_b32 s31, v31, 1
+; GFX8-NEXT:    v_readlane_b32 s30, v31, 0
+; GFX8-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX8-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_vselect_v16bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v4, 1, v4
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[18:19], 1, v4
+; GFX9-NEXT:    v_and_b32_e32 v4, 1, v14
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[20:21], 1, v4
+; GFX9-NEXT:    v_and_b32_e32 v4, 1, v15
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[22:23], 1, v4
+; GFX9-NEXT:    buffer_load_dword v4, off, s[0:3], s32
+; GFX9-NEXT:    v_and_b32_e32 v12, 1, v12
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v12
+; GFX9-NEXT:    v_and_b32_e32 v12, 1, v13
+; GFX9-NEXT:    v_and_b32_e32 v10, 1, v10
+; GFX9-NEXT:    v_and_b32_e32 v6, 1, v6
+; GFX9-NEXT:    v_and_b32_e32 v5, 1, v5
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[4:5], 1, v12
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[6:7], 1, v10
+; GFX9-NEXT:    v_and_b32_e32 v10, 1, v11
+; GFX9-NEXT:    v_and_b32_e32 v8, 1, v8
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[14:15], 1, v6
+; GFX9-NEXT:    v_and_b32_e32 v6, 1, v7
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[24:25], 1, v5
+; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 16, v22
+; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 16, v30
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[8:9], 1, v10
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[10:11], 1, v8
+; GFX9-NEXT:    v_and_b32_e32 v8, 1, v9
+; GFX9-NEXT:    v_cndmask_b32_e64 v7, v7, v5, s[4:5]
+; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 16, v21
+; GFX9-NEXT:    v_lshrrev_b32_e32 v9, 16, v29
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[12:13], 1, v8
+; GFX9-NEXT:    v_cndmask_b32_e64 v5, v9, v5, s[8:9]
+; GFX9-NEXT:    v_lshrrev_b32_e32 v9, 16, v20
+; GFX9-NEXT:    v_lshrrev_b32_e32 v11, 16, v28
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[16:17], 1, v6
+; GFX9-NEXT:    v_cndmask_b32_e64 v9, v11, v9, s[12:13]
+; GFX9-NEXT:    v_lshrrev_b32_e32 v11, 16, v19
+; GFX9-NEXT:    v_lshrrev_b32_e32 v13, 16, v27
+; GFX9-NEXT:    v_and_b32_e32 v2, 1, v2
+; GFX9-NEXT:    v_cndmask_b32_e64 v11, v13, v11, s[16:17]
+; GFX9-NEXT:    v_lshrrev_b32_e32 v13, 16, v23
+; GFX9-NEXT:    v_cndmask_b32_e32 v6, v30, v22, vcc
+; GFX9-NEXT:    v_and_b32_e32 v3, 1, v3
+; GFX9-NEXT:    v_cndmask_b32_e64 v15, v26, v18, s[18:19]
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v2
 ; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v25, v17, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v17, 16, v17
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v3
+; GFX9-NEXT:    v_and_b32_e32 v1, 1, v1
+; GFX9-NEXT:    v_cndmask_b32_e64 v8, v29, v21, s[6:7]
+; GFX9-NEXT:    v_cndmask_b32_e64 v10, v28, v20, s[10:11]
+; GFX9-NEXT:    v_cndmask_b32_e64 v12, v27, v19, s[14:15]
+; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
+; GFX9-NEXT:    v_perm_b32 v5, v5, v8, s4
+; GFX9-NEXT:    v_perm_b32 v6, v7, v6, s4
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_cndmask_b32_e64 v14, v4, v23, s[20:21]
+; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
+; GFX9-NEXT:    v_cndmask_b32_e64 v13, v4, v13, s[22:23]
+; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 16, v18
+; GFX9-NEXT:    v_lshrrev_b32_e32 v18, 16, v26
+; GFX9-NEXT:    v_cndmask_b32_e64 v4, v18, v4, s[24:25]
+; GFX9-NEXT:    v_lshrrev_b32_e32 v18, 16, v25
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v18, v17, vcc
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX9-NEXT:    v_and_b32_e32 v0, 1, v1
-; GFX9-NEXT:    v_cmp_eq_u32_e64 s[4:5], 1, v0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v24, v16, vcc
-; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v16
-; GFX9-NEXT:    v_lshrrev_b32_e32 v16, 16, v24
-; GFX9-NEXT:    v_cndmask_b32_e64 v1, v16, v1, s[4:5]
-; GFX9-NEXT:    s_mov_b32 s6, 0x5040100
-; GFX9-NEXT:    v_perm_b32 v0, v1, v0, s6
-; GFX9-NEXT:    v_and_b32_e32 v1, 1, v2
+; GFX9-NEXT:    v_lshrrev_b32_e32 v16, 16, v16
+; GFX9-NEXT:    v_lshrrev_b32_e32 v17, 16, v24
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v1
-; GFX9-NEXT:    v_and_b32_e32 v1, 1, v3
-; GFX9-NEXT:    v_cmp_eq_u32_e64 s[4:5], 1, v1
-; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v17
-; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v25
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v25, v17, vcc
-; GFX9-NEXT:    v_cndmask_b32_e64 v2, v3, v2, s[4:5]
-; GFX9-NEXT:    v_perm_b32 v1, v2, v1, s6
-; GFX9-NEXT:    v_and_b32_e32 v2, 1, v4
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v2
-; GFX9-NEXT:    v_and_b32_e32 v2, 1, v5
-; GFX9-NEXT:    v_cmp_eq_u32_e64 s[4:5], 1, v2
-; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v18
-; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 16, v26
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, v26, v18, vcc
-; GFX9-NEXT:    v_cndmask_b32_e64 v3, v4, v3, s[4:5]
-; GFX9-NEXT:    v_perm_b32 v2, v3, v2, s6
-; GFX9-NEXT:    v_and_b32_e32 v3, 1, v6
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v3
-; GFX9-NEXT:    v_and_b32_e32 v3, 1, v7
-; GFX9-NEXT:    buffer_load_dword v7, off, s[0:3], s32
-; GFX9-NEXT:    v_cmp_eq_u32_e64 s[4:5], 1, v3
-; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 16, v19
-; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 16, v27
-; GFX9-NEXT:    v_and_b32_e32 v8, 1, v8
-; GFX9-NEXT:    v_cndmask_b32_e32 v3, v27, v19, vcc
-; GFX9-NEXT:    v_cndmask_b32_e64 v4, v5, v4, s[4:5]
-; GFX9-NEXT:    v_and_b32_e32 v9, 1, v9
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v8
-; GFX9-NEXT:    v_and_b32_e32 v5, 1, v11
-; GFX9-NEXT:    v_and_b32_e32 v10, 1, v10
-; GFX9-NEXT:    v_perm_b32 v3, v4, v3, s6
-; GFX9-NEXT:    v_cndmask_b32_e32 v4, v28, v20, vcc
-; GFX9-NEXT:    v_lshrrev_b32_e32 v8, 16, v20
-; GFX9-NEXT:    v_lshrrev_b32_e32 v11, 16, v28
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v9
-; GFX9-NEXT:    v_cndmask_b32_e32 v8, v11, v8, vcc
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v10
-; GFX9-NEXT:    v_and_b32_e32 v12, 1, v12
-; GFX9-NEXT:    v_perm_b32 v4, v8, v4, s6
-; GFX9-NEXT:    v_cndmask_b32_e32 v8, v29, v21, vcc
-; GFX9-NEXT:    v_lshrrev_b32_e32 v9, 16, v21
-; GFX9-NEXT:    v_lshrrev_b32_e32 v10, 16, v29
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v5
-; GFX9-NEXT:    v_and_b32_e32 v6, 1, v13
-; GFX9-NEXT:    v_cndmask_b32_e32 v5, v10, v9, vcc
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v12
-; GFX9-NEXT:    v_and_b32_e32 v14, 1, v14
-; GFX9-NEXT:    v_perm_b32 v5, v5, v8, s6
-; GFX9-NEXT:    v_cndmask_b32_e32 v8, v30, v22, vcc
-; GFX9-NEXT:    v_lshrrev_b32_e32 v9, 16, v22
-; GFX9-NEXT:    v_lshrrev_b32_e32 v10, 16, v30
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v6
-; GFX9-NEXT:    v_and_b32_e32 v15, 1, v15
-; GFX9-NEXT:    v_cndmask_b32_e32 v6, v10, v9, vcc
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v14
-; GFX9-NEXT:    v_perm_b32 v6, v6, v8, s6
-; GFX9-NEXT:    v_lshrrev_b32_e32 v9, 16, v23
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_cndmask_b32_e32 v8, v7, v23, vcc
-; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v15
-; GFX9-NEXT:    v_cndmask_b32_e32 v7, v7, v9, vcc
-; GFX9-NEXT:    v_perm_b32 v7, v7, v8, s6
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v17, v16, vcc
+; GFX9-NEXT:    v_perm_b32 v0, v1, v0, s4
+; GFX9-NEXT:    v_perm_b32 v1, v3, v2, s4
+; GFX9-NEXT:    v_perm_b32 v2, v4, v15, s4
+; GFX9-NEXT:    v_perm_b32 v3, v11, v12, s4
+; GFX9-NEXT:    v_perm_b32 v4, v9, v10, s4
+; GFX9-NEXT:    v_perm_b32 v7, v13, v14, s4
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_vselect_v16bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    buffer_load_dword v31, off, s[0:3], s32
-; GFX10-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX10-NEXT:    v_and_b32_e32 v1, 1, v1
+; GFX10-NEXT:    v_and_b32_e32 v12, 1, v12
+; GFX10-NEXT:    v_and_b32_e32 v13, 1, v13
+; GFX10-NEXT:    v_and_b32_e32 v10, 1, v10
+; GFX10-NEXT:    v_lshrrev_b32_e32 v33, 16, v22
+; GFX10-NEXT:    v_lshrrev_b32_e32 v34, 16, v30
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v12
+; GFX10-NEXT:    v_and_b32_e32 v11, 1, v11
+; GFX10-NEXT:    v_and_b32_e32 v8, 1, v8
+; GFX10-NEXT:    v_lshrrev_b32_e32 v35, 16, v21
+; GFX10-NEXT:    v_lshrrev_b32_e32 v36, 16, v29
+; GFX10-NEXT:    v_cndmask_b32_e32 v12, v30, v22, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v13
+; GFX10-NEXT:    v_and_b32_e32 v9, 1, v9
+; GFX10-NEXT:    v_and_b32_e32 v6, 1, v6
+; GFX10-NEXT:    v_lshrrev_b32_e32 v37, 16, v20
+; GFX10-NEXT:    v_lshrrev_b32_e32 v38, 16, v28
+; GFX10-NEXT:    v_cndmask_b32_e32 v13, v34, v33, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v10
+; GFX10-NEXT:    v_and_b32_e32 v4, 1, v4
 ; GFX10-NEXT:    v_and_b32_e32 v2, 1, v2
-; GFX10-NEXT:    v_lshrrev_b32_e32 v32, 16, v16
-; GFX10-NEXT:    v_lshrrev_b32_e32 v33, 16, v24
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
 ; GFX10-NEXT:    v_and_b32_e32 v3, 1, v3
-; GFX10-NEXT:    v_and_b32_e32 v4, 1, v4
-; GFX10-NEXT:    v_lshrrev_b32_e32 v34, 16, v17
-; GFX10-NEXT:    v_lshrrev_b32_e32 v35, 16, v25
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v24, v16, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v1
+; GFX10-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX10-NEXT:    v_cndmask_b32_e32 v10, v29, v21, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v11
+; GFX10-NEXT:    v_lshrrev_b32_e32 v51, 16, v17
+; GFX10-NEXT:    v_lshrrev_b32_e32 v52, 16, v25
+; GFX10-NEXT:    v_and_b32_e32 v1, 1, v1
 ; GFX10-NEXT:    v_and_b32_e32 v5, 1, v5
-; GFX10-NEXT:    v_and_b32_e32 v6, 1, v6
-; GFX10-NEXT:    v_lshrrev_b32_e32 v36, 16, v18
-; GFX10-NEXT:    v_lshrrev_b32_e32 v37, 16, v26
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v33, v32, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v2
+; GFX10-NEXT:    v_cndmask_b32_e32 v11, v36, v35, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v8
+; GFX10-NEXT:    v_lshrrev_b32_e32 v53, 16, v16
+; GFX10-NEXT:    v_lshrrev_b32_e32 v54, 16, v24
 ; GFX10-NEXT:    v_and_b32_e32 v7, 1, v7
-; GFX10-NEXT:    v_and_b32_e32 v8, 1, v8
-; GFX10-NEXT:    v_lshrrev_b32_e32 v38, 16, v19
-; GFX10-NEXT:    v_lshrrev_b32_e32 v39, 16, v27
-; GFX10-NEXT:    v_cndmask_b32_e32 v2, v25, v17, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v3
-; GFX10-NEXT:    v_and_b32_e32 v9, 1, v9
-; GFX10-NEXT:    v_and_b32_e32 v10, 1, v10
-; GFX10-NEXT:    v_lshrrev_b32_e32 v48, 16, v20
-; GFX10-NEXT:    v_lshrrev_b32_e32 v49, 16, v28
-; GFX10-NEXT:    v_cndmask_b32_e32 v3, v35, v34, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v4
-; GFX10-NEXT:    v_and_b32_e32 v11, 1, v11
-; GFX10-NEXT:    v_and_b32_e32 v12, 1, v12
-; GFX10-NEXT:    v_lshrrev_b32_e32 v50, 16, v21
-; GFX10-NEXT:    v_lshrrev_b32_e32 v51, 16, v29
-; GFX10-NEXT:    v_cndmask_b32_e32 v4, v26, v18, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v5
-; GFX10-NEXT:    v_and_b32_e32 v13, 1, v13
+; GFX10-NEXT:    v_lshrrev_b32_e32 v49, 16, v18
+; GFX10-NEXT:    v_cndmask_b32_e32 v8, v28, v20, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v9
+; GFX10-NEXT:    v_lshrrev_b32_e32 v50, 16, v26
 ; GFX10-NEXT:    v_and_b32_e32 v14, 1, v14
-; GFX10-NEXT:    v_lshrrev_b32_e32 v52, 16, v22
-; GFX10-NEXT:    v_lshrrev_b32_e32 v53, 16, v30
-; GFX10-NEXT:    v_cndmask_b32_e32 v5, v37, v36, vcc_lo
+; GFX10-NEXT:    v_lshrrev_b32_e32 v39, 16, v19
+; GFX10-NEXT:    v_lshrrev_b32_e32 v48, 16, v27
+; GFX10-NEXT:    v_cndmask_b32_e32 v9, v38, v37, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v6
 ; GFX10-NEXT:    v_and_b32_e32 v15, 1, v15
-; GFX10-NEXT:    v_lshrrev_b32_e32 v54, 16, v23
-; GFX10-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
-; GFX10-NEXT:    v_perm_b32 v1, v3, v2, 0x5040100
+; GFX10-NEXT:    v_lshrrev_b32_e32 v32, 16, v23
 ; GFX10-NEXT:    v_cndmask_b32_e32 v6, v27, v19, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v4
+; GFX10-NEXT:    v_cndmask_b32_e32 v4, v26, v18, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v2
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, v25, v17, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v3
+; GFX10-NEXT:    v_cndmask_b32_e32 v3, v52, v51, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v24, v16, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v1
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v54, v53, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v5
+; GFX10-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
+; GFX10-NEXT:    v_cndmask_b32_e32 v5, v50, v49, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v7
+; GFX10-NEXT:    v_perm_b32 v1, v3, v2, 0x5040100
 ; GFX10-NEXT:    v_perm_b32 v2, v5, v4, 0x5040100
-; GFX10-NEXT:    v_cndmask_b32_e32 v7, v39, v38, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v8
-; GFX10-NEXT:    v_cndmask_b32_e32 v8, v28, v20, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v9
-; GFX10-NEXT:    v_cndmask_b32_e32 v9, v49, v48, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v10
+; GFX10-NEXT:    v_cndmask_b32_e32 v7, v48, v39, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v14
 ; GFX10-NEXT:    v_perm_b32 v4, v9, v8, 0x5040100
-; GFX10-NEXT:    v_cndmask_b32_e32 v10, v29, v21, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v11
-; GFX10-NEXT:    v_cndmask_b32_e32 v11, v51, v50, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v12
 ; GFX10-NEXT:    v_perm_b32 v5, v11, v10, 0x5040100
-; GFX10-NEXT:    v_cndmask_b32_e32 v12, v30, v22, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v13
-; GFX10-NEXT:    v_cndmask_b32_e32 v13, v53, v52, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v14
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 16, v31
 ; GFX10-NEXT:    v_cndmask_b32_e32 v14, v31, v23, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v15
-; GFX10-NEXT:    v_cndmask_b32_e32 v15, v3, v54, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v15, v3, v32, vcc_lo
 ; GFX10-NEXT:    v_perm_b32 v3, v7, v6, 0x5040100
 ; GFX10-NEXT:    v_perm_b32 v6, v13, v12, 0x5040100
 ; GFX10-NEXT:    v_perm_b32 v7, v15, v14, 0x5040100
@@ -28716,78 +28018,79 @@ define <16 x bfloat> @v_vselect_v16bf16(<16 x i1> %cond, <16 x bfloat> %a, <16 x
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    scratch_load_b32 v31, off, s32
-; GFX11-NEXT:    v_lshrrev_b32_e32 v54, 16, v23
-; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX11-NEXT:    v_lshrrev_b32_e32 v39, 16, v19
+; GFX11-NEXT:    v_lshrrev_b32_e32 v48, 16, v27
+; GFX11-NEXT:    v_and_b32_e32 v12, 1, v12
+; GFX11-NEXT:    v_and_b32_e32 v13, 1, v13
+; GFX11-NEXT:    v_lshrrev_b32_e32 v33, 16, v22
+; GFX11-NEXT:    v_lshrrev_b32_e32 v34, 16, v30
+; GFX11-NEXT:    v_lshrrev_b32_e32 v49, 16, v18
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v12
 ; GFX11-NEXT:    v_and_b32_e32 v1, 1, v1
-; GFX11-NEXT:    v_lshrrev_b32_e32 v32, 16, v16
-; GFX11-NEXT:    v_lshrrev_b32_e32 v33, 16, v24
-; GFX11-NEXT:    v_lshrrev_b32_e32 v52, 16, v22
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX11-NEXT:    v_and_b32_e32 v15, 1, v15
-; GFX11-NEXT:    v_lshrrev_b32_e32 v53, 16, v30
+; GFX11-NEXT:    v_lshrrev_b32_e32 v50, 16, v26
+; GFX11-NEXT:    v_and_b32_e32 v10, 1, v10
+; GFX11-NEXT:    v_dual_cndmask_b32 v12, v30, v22 :: v_dual_and_b32 v11, 1, v11
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v13
+; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX11-NEXT:    v_lshrrev_b32_e32 v35, 16, v21
+; GFX11-NEXT:    v_lshrrev_b32_e32 v36, 16, v29
+; GFX11-NEXT:    v_lshrrev_b32_e32 v53, 16, v16
+; GFX11-NEXT:    v_cndmask_b32_e32 v13, v34, v33, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v10
+; GFX11-NEXT:    v_and_b32_e32 v3, 1, v3
+; GFX11-NEXT:    v_lshrrev_b32_e32 v54, 16, v24
+; GFX11-NEXT:    v_and_b32_e32 v8, 1, v8
+; GFX11-NEXT:    v_and_b32_e32 v9, 1, v9
+; GFX11-NEXT:    v_cndmask_b32_e32 v10, v29, v21, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v11
 ; GFX11-NEXT:    v_and_b32_e32 v2, 1, v2
-; GFX11-NEXT:    v_dual_cndmask_b32 v0, v24, v16 :: v_dual_and_b32 v3, 1, v3
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v1
-; GFX11-NEXT:    v_and_b32_e32 v14, 1, v14
-; GFX11-NEXT:    v_lshrrev_b32_e32 v34, 16, v17
-; GFX11-NEXT:    v_lshrrev_b32_e32 v35, 16, v25
-; GFX11-NEXT:    v_lshrrev_b32_e32 v50, 16, v21
-; GFX11-NEXT:    v_cndmask_b32_e32 v1, v33, v32, vcc_lo
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v2
-; GFX11-NEXT:    v_and_b32_e32 v11, 1, v11
-; GFX11-NEXT:    v_lshrrev_b32_e32 v51, 16, v29
-; GFX11-NEXT:    v_and_b32_e32 v4, 1, v4
+; GFX11-NEXT:    v_lshrrev_b32_e32 v37, 16, v20
+; GFX11-NEXT:    v_lshrrev_b32_e32 v38, 16, v28
+; GFX11-NEXT:    v_lshrrev_b32_e32 v51, 16, v17
+; GFX11-NEXT:    v_cndmask_b32_e32 v11, v36, v35, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v8
 ; GFX11-NEXT:    v_and_b32_e32 v5, 1, v5
-; GFX11-NEXT:    v_cndmask_b32_e32 v2, v25, v17, vcc_lo
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v3
-; GFX11-NEXT:    v_and_b32_e32 v12, 1, v12
-; GFX11-NEXT:    v_lshrrev_b32_e32 v36, 16, v18
-; GFX11-NEXT:    v_lshrrev_b32_e32 v37, 16, v26
-; GFX11-NEXT:    v_lshrrev_b32_e32 v48, 16, v20
-; GFX11-NEXT:    v_cndmask_b32_e32 v3, v35, v34, vcc_lo
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v4
-; GFX11-NEXT:    v_and_b32_e32 v13, 1, v13
-; GFX11-NEXT:    v_lshrrev_b32_e32 v49, 16, v28
+; GFX11-NEXT:    v_lshrrev_b32_e32 v52, 16, v25
 ; GFX11-NEXT:    v_and_b32_e32 v6, 1, v6
+; GFX11-NEXT:    v_lshrrev_b32_e32 v32, 16, v23
+; GFX11-NEXT:    v_cndmask_b32_e32 v8, v28, v20, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v9
+; GFX11-NEXT:    v_and_b32_e32 v4, 1, v4
+; GFX11-NEXT:    v_and_b32_e32 v15, 1, v15
+; GFX11-NEXT:    v_cndmask_b32_e32 v9, v38, v37, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v6
+; GFX11-NEXT:    v_cndmask_b32_e32 v6, v27, v19, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v4
 ; GFX11-NEXT:    v_dual_cndmask_b32 v4, v26, v18 :: v_dual_and_b32 v7, 1, v7
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v2
+; GFX11-NEXT:    v_cndmask_b32_e32 v2, v25, v17, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v3
+; GFX11-NEXT:    v_cndmask_b32_e32 v3, v52, v51, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, v24, v16, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v1
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, v54, v53, vcc_lo
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v5
-; GFX11-NEXT:    v_and_b32_e32 v10, 1, v10
-; GFX11-NEXT:    v_lshrrev_b32_e32 v38, 16, v19
-; GFX11-NEXT:    v_lshrrev_b32_e32 v39, 16, v27
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
 ; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
-; GFX11-NEXT:    v_cndmask_b32_e32 v5, v37, v36, vcc_lo
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v6
-; GFX11-NEXT:    v_and_b32_e32 v9, 1, v9
+; GFX11-NEXT:    v_cndmask_b32_e32 v5, v50, v49, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v7
 ; GFX11-NEXT:    v_perm_b32 v1, v3, v2, 0x5040100
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_perm_b32 v2, v5, v4, 0x5040100
-; GFX11-NEXT:    v_cndmask_b32_e32 v6, v27, v19, vcc_lo
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v7
-; GFX11-NEXT:    v_dual_cndmask_b32 v7, v39, v38 :: v_dual_and_b32 v8, 1, v8
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v8
-; GFX11-NEXT:    v_cndmask_b32_e32 v8, v28, v20, vcc_lo
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v9
-; GFX11-NEXT:    v_cndmask_b32_e32 v9, v49, v48, vcc_lo
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v10
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_cndmask_b32_e32 v7, v48, v39, vcc_lo
 ; GFX11-NEXT:    v_perm_b32 v4, v9, v8, 0x5040100
-; GFX11-NEXT:    v_cndmask_b32_e32 v10, v29, v21, vcc_lo
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v11
-; GFX11-NEXT:    v_cndmask_b32_e32 v11, v51, v50, vcc_lo
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v12
 ; GFX11-NEXT:    v_perm_b32 v5, v11, v10, 0x5040100
-; GFX11-NEXT:    v_cndmask_b32_e32 v12, v30, v22, vcc_lo
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v13
-; GFX11-NEXT:    v_cndmask_b32_e32 v13, v53, v52, vcc_lo
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v14
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v31
+; GFX11-NEXT:    v_and_b32_e32 v14, 1, v14
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v14
 ; GFX11-NEXT:    v_cndmask_b32_e32 v14, v31, v23, vcc_lo
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v15
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_cndmask_b32_e32 v15, v3, v54, vcc_lo
+; GFX11-NEXT:    v_cndmask_b32_e32 v15, v3, v32, vcc_lo
 ; GFX11-NEXT:    v_perm_b32 v3, v7, v6, 0x5040100
 ; GFX11-NEXT:    v_perm_b32 v6, v13, v12, 0x5040100
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3)
 ; GFX11-NEXT:    v_perm_b32 v7, v15, v14, 0x5040100
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = select <16 x i1> %cond, <16 x bfloat> %a, <16 x bfloat> %b
@@ -29252,171 +28555,235 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x
 ; GFX8-NEXT:    v_writelane_b32 v31, s30, 0
 ; GFX8-NEXT:    v_writelane_b32 v31, s31, 1
 ; GFX8-NEXT:    v_writelane_b32 v31, s34, 2
+; GFX8-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX8-NEXT:    v_writelane_b32 v31, s35, 3
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
+; GFX8-NEXT:    v_and_b32_e32 v0, 1, v1
 ; GFX8-NEXT:    v_writelane_b32 v31, s36, 4
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[4:5], 1, v0
+; GFX8-NEXT:    v_and_b32_e32 v0, 1, v2
 ; GFX8-NEXT:    v_writelane_b32 v31, s37, 5
-; GFX8-NEXT:    v_and_b32_e32 v21, 1, v21
-; GFX8-NEXT:    v_and_b32_e32 v18, 1, v18
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[6:7], 1, v0
+; GFX8-NEXT:    v_and_b32_e32 v0, 1, v3
 ; GFX8-NEXT:    v_writelane_b32 v31, s38, 6
-; GFX8-NEXT:    v_cmp_eq_u32_e64 s[22:23], 1, v21
-; GFX8-NEXT:    v_cmp_eq_u32_e64 s[28:29], 1, v18
-; GFX8-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:68
-; GFX8-NEXT:    buffer_load_dword v21, off, s[0:3], s32 offset:4
-; GFX8-NEXT:    v_and_b32_e32 v17, 1, v17
-; GFX8-NEXT:    v_and_b32_e32 v16, 1, v16
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[8:9], 1, v0
+; GFX8-NEXT:    v_and_b32_e32 v0, 1, v4
 ; GFX8-NEXT:    v_writelane_b32 v31, s39, 7
-; GFX8-NEXT:    v_cmp_eq_u32_e64 s[30:31], 1, v17
-; GFX8-NEXT:    v_cmp_eq_u32_e64 s[34:35], 1, v16
-; GFX8-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:72
-; GFX8-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:8
-; GFX8-NEXT:    v_and_b32_e32 v15, 1, v15
-; GFX8-NEXT:    v_and_b32_e32 v14, 1, v14
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[10:11], 1, v0
+; GFX8-NEXT:    v_and_b32_e32 v0, 1, v5
 ; GFX8-NEXT:    v_writelane_b32 v31, s40, 8
-; GFX8-NEXT:    v_cmp_eq_u32_e64 s[36:37], 1, v15
-; GFX8-NEXT:    v_cmp_eq_u32_e64 s[38:39], 1, v14
-; GFX8-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:76
-; GFX8-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:12
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[12:13], 1, v0
+; GFX8-NEXT:    v_and_b32_e32 v0, 1, v6
 ; GFX8-NEXT:    v_writelane_b32 v31, s41, 9
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[14:15], 1, v0
+; GFX8-NEXT:    v_and_b32_e32 v0, 1, v7
 ; GFX8-NEXT:    v_writelane_b32 v31, s42, 10
-; GFX8-NEXT:    v_and_b32_e32 v13, 1, v13
-; GFX8-NEXT:    v_and_b32_e32 v12, 1, v12
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[16:17], 1, v0
+; GFX8-NEXT:    v_and_b32_e32 v0, 1, v8
 ; GFX8-NEXT:    v_writelane_b32 v31, s43, 11
-; GFX8-NEXT:    v_and_b32_e32 v20, 1, v20
-; GFX8-NEXT:    v_cmp_eq_u32_e64 s[40:41], 1, v13
-; GFX8-NEXT:    v_cmp_eq_u32_e64 s[42:43], 1, v12
-; GFX8-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:80
-; GFX8-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:16
-; GFX8-NEXT:    v_cmp_eq_u32_e64 s[24:25], 1, v20
-; GFX8-NEXT:    buffer_load_ushort v20, off, s[0:3], s32
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[18:19], 1, v0
+; GFX8-NEXT:    v_and_b32_e32 v0, 1, v9
 ; GFX8-NEXT:    v_writelane_b32 v31, s44, 12
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[20:21], 1, v0
+; GFX8-NEXT:    v_and_b32_e32 v0, 1, v10
 ; GFX8-NEXT:    v_writelane_b32 v31, s45, 13
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[22:23], 1, v0
+; GFX8-NEXT:    v_and_b32_e32 v0, 1, v11
 ; GFX8-NEXT:    v_writelane_b32 v31, s46, 14
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[24:25], 1, v0
+; GFX8-NEXT:    v_and_b32_e32 v0, 1, v12
 ; GFX8-NEXT:    v_writelane_b32 v31, s47, 15
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[26:27], 1, v0
+; GFX8-NEXT:    v_and_b32_e32 v0, 1, v13
 ; GFX8-NEXT:    v_writelane_b32 v31, s48, 16
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[28:29], 1, v0
+; GFX8-NEXT:    v_and_b32_e32 v0, 1, v14
 ; GFX8-NEXT:    v_writelane_b32 v31, s49, 17
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[30:31], 1, v0
+; GFX8-NEXT:    v_and_b32_e32 v0, 1, v15
 ; GFX8-NEXT:    v_writelane_b32 v31, s50, 18
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[34:35], 1, v0
+; GFX8-NEXT:    v_and_b32_e32 v0, 1, v16
 ; GFX8-NEXT:    v_writelane_b32 v31, s51, 19
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[36:37], 1, v0
+; GFX8-NEXT:    v_and_b32_e32 v0, 1, v17
 ; GFX8-NEXT:    v_writelane_b32 v31, s52, 20
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[38:39], 1, v0
+; GFX8-NEXT:    v_and_b32_e32 v0, 1, v18
 ; GFX8-NEXT:    v_writelane_b32 v31, s53, 21
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[40:41], 1, v0
+; GFX8-NEXT:    v_and_b32_e32 v0, 1, v19
 ; GFX8-NEXT:    v_writelane_b32 v31, s54, 22
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[42:43], 1, v0
+; GFX8-NEXT:    v_and_b32_e32 v0, 1, v20
 ; GFX8-NEXT:    v_writelane_b32 v31, s55, 23
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[44:45], 1, v0
+; GFX8-NEXT:    v_and_b32_e32 v0, 1, v21
 ; GFX8-NEXT:    v_writelane_b32 v31, s56, 24
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[46:47], 1, v0
+; GFX8-NEXT:    v_and_b32_e32 v0, 1, v22
 ; GFX8-NEXT:    v_writelane_b32 v31, s57, 25
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[48:49], 1, v0
+; GFX8-NEXT:    v_and_b32_e32 v0, 1, v23
 ; GFX8-NEXT:    v_writelane_b32 v31, s58, 26
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[50:51], 1, v0
+; GFX8-NEXT:    v_and_b32_e32 v0, 1, v24
 ; GFX8-NEXT:    v_writelane_b32 v31, s59, 27
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[52:53], 1, v0
+; GFX8-NEXT:    v_and_b32_e32 v0, 1, v25
 ; GFX8-NEXT:    v_writelane_b32 v31, s60, 28
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[54:55], 1, v0
+; GFX8-NEXT:    v_and_b32_e32 v0, 1, v26
 ; GFX8-NEXT:    v_writelane_b32 v31, s61, 29
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[56:57], 1, v0
+; GFX8-NEXT:    v_and_b32_e32 v0, 1, v27
 ; GFX8-NEXT:    v_writelane_b32 v31, s62, 30
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[58:59], 1, v0
+; GFX8-NEXT:    v_and_b32_e32 v0, 1, v28
 ; GFX8-NEXT:    v_writelane_b32 v31, s63, 31
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[60:61], 1, v0
+; GFX8-NEXT:    v_and_b32_e32 v0, 1, v29
 ; GFX8-NEXT:    v_writelane_b32 v31, s64, 32
-; GFX8-NEXT:    v_and_b32_e32 v8, 1, v8
-; GFX8-NEXT:    v_and_b32_e32 v7, 1, v7
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[62:63], 1, v0
+; GFX8-NEXT:    v_and_b32_e32 v0, 1, v30
 ; GFX8-NEXT:    v_writelane_b32 v31, s65, 33
-; GFX8-NEXT:    v_cmp_eq_u32_e64 s[50:51], 1, v8
-; GFX8-NEXT:    v_cmp_eq_u32_e64 s[52:53], 1, v7
-; GFX8-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:84
-; GFX8-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:20
-; GFX8-NEXT:    v_and_b32_e32 v2, 1, v2
-; GFX8-NEXT:    v_and_b32_e32 v1, 1, v1
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[64:65], 1, v0
+; GFX8-NEXT:    buffer_load_ushort v0, off, s[0:3], s32
 ; GFX8-NEXT:    v_writelane_b32 v31, s66, 34
-; GFX8-NEXT:    v_and_b32_e32 v3, 1, v3
-; GFX8-NEXT:    v_cmp_eq_u32_e64 s[62:63], 1, v2
-; GFX8-NEXT:    v_cmp_eq_u32_e64 s[64:65], 1, v1
-; GFX8-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX8-NEXT:    v_writelane_b32 v31, s67, 35
-; GFX8-NEXT:    v_and_b32_e32 v6, 1, v6
-; GFX8-NEXT:    v_and_b32_e32 v5, 1, v5
-; GFX8-NEXT:    v_and_b32_e32 v4, 1, v4
-; GFX8-NEXT:    v_cmp_eq_u32_e64 s[60:61], 1, v3
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 s[66:67], 1, v0
-; GFX8-NEXT:    v_cmp_eq_u32_e64 s[54:55], 1, v6
-; GFX8-NEXT:    v_cmp_eq_u32_e64 s[56:57], 1, v5
-; GFX8-NEXT:    v_cmp_eq_u32_e64 s[58:59], 1, v4
-; GFX8-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:88
-; GFX8-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:24
-; GFX8-NEXT:    v_and_b32_e32 v10, 1, v10
-; GFX8-NEXT:    v_and_b32_e32 v9, 1, v9
-; GFX8-NEXT:    v_and_b32_e32 v11, 1, v11
-; GFX8-NEXT:    v_cmp_eq_u32_e64 s[46:47], 1, v10
-; GFX8-NEXT:    v_cmp_eq_u32_e64 s[48:49], 1, v9
-; GFX8-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:92
-; GFX8-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:28
-; GFX8-NEXT:    v_and_b32_e32 v25, 1, v25
-; GFX8-NEXT:    v_and_b32_e32 v24, 1, v24
-; GFX8-NEXT:    v_and_b32_e32 v23, 1, v23
-; GFX8-NEXT:    v_and_b32_e32 v22, 1, v22
-; GFX8-NEXT:    v_and_b32_e32 v19, 1, v19
-; GFX8-NEXT:    s_waitcnt vmcnt(14)
-; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v18
-; GFX8-NEXT:    s_waitcnt vmcnt(13)
-; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v21
-; GFX8-NEXT:    v_cndmask_b32_e64 v1, v2, v1, s[64:65]
-; GFX8-NEXT:    v_cndmask_b32_e64 v0, v18, v21, s[66:67]
-; GFX8-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:36
-; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX8-NEXT:    s_waitcnt vmcnt(13)
-; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v16
-; GFX8-NEXT:    s_waitcnt vmcnt(12)
-; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v17
-; GFX8-NEXT:    v_cndmask_b32_e64 v2, v3, v2, s[60:61]
-; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX8-NEXT:    v_cndmask_b32_e64 v1, v16, v17, s[62:63]
-; GFX8-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:40
+; GFX8-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:68
+; GFX8-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:4
+; GFX8-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:72
+; GFX8-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:8
+; GFX8-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:76
+; GFX8-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:12
+; GFX8-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:80
+; GFX8-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:16
+; GFX8-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:84
+; GFX8-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:20
+; GFX8-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:88
+; GFX8-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:24
+; GFX8-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:92
+; GFX8-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:28
+; GFX8-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:96
+; GFX8-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:32
+; GFX8-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:100
+; GFX8-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:36
+; GFX8-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:104
+; GFX8-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:40
+; GFX8-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:108
 ; GFX8-NEXT:    buffer_load_dword v21, off, s[0:3], s32 offset:44
-; GFX8-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:56
-; GFX8-NEXT:    s_waitcnt vmcnt(13)
-; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v15
-; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v14
-; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX8-NEXT:    v_cndmask_b32_e64 v3, v4, v3, s[56:57]
-; GFX8-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX8-NEXT:    v_cndmask_b32_e64 v2, v14, v15, s[58:59]
-; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX8-NEXT:    v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX8-NEXT:    s_waitcnt vmcnt(11)
-; GFX8-NEXT:    v_cndmask_b32_e64 v3, v12, v13, s[54:55]
-; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v13
-; GFX8-NEXT:    v_lshrrev_b32_e32 v12, 16, v12
-; GFX8-NEXT:    v_cmp_eq_u32_e64 s[44:45], 1, v11
-; GFX8-NEXT:    s_waitcnt vmcnt(10)
-; GFX8-NEXT:    v_and_b32_e32 v11, 1, v20
-; GFX8-NEXT:    v_cndmask_b32_e64 v4, v12, v4, s[52:53]
-; GFX8-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:128
-; GFX8-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:116
-; GFX8-NEXT:    v_cmp_eq_u32_e64 s[12:13], 1, v25
-; GFX8-NEXT:    v_cmp_eq_u32_e64 s[14:15], 1, v24
-; GFX8-NEXT:    v_cmp_eq_u32_e64 s[16:17], 1, v23
-; GFX8-NEXT:    v_cmp_eq_u32_e64 s[20:21], 1, v22
-; GFX8-NEXT:    v_cmp_eq_u32_e64 s[26:27], 1, v19
-; GFX8-NEXT:    v_cmp_eq_u32_e64 s[18:19], 1, v11
-; GFX8-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:112
-; GFX8-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:108
-; GFX8-NEXT:    buffer_load_dword v22, off, s[0:3], s32 offset:96
-; GFX8-NEXT:    buffer_load_dword v23, off, s[0:3], s32 offset:32
-; GFX8-NEXT:    buffer_load_dword v24, off, s[0:3], s32 offset:104
-; GFX8-NEXT:    buffer_load_dword v25, off, s[0:3], s32 offset:100
-; GFX8-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:48
-; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX8-NEXT:    v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX8-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:120
-; GFX8-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:124
-; GFX8-NEXT:    v_and_b32_e32 v26, 1, v26
-; GFX8-NEXT:    v_and_b32_e32 v28, 1, v28
-; GFX8-NEXT:    v_and_b32_e32 v27, 1, v27
-; GFX8-NEXT:    v_cmp_eq_u32_e64 s[10:11], 1, v26
-; GFX8-NEXT:    v_and_b32_e32 v29, 1, v29
-; GFX8-NEXT:    v_cmp_eq_u32_e64 s[6:7], 1, v28
-; GFX8-NEXT:    v_cmp_eq_u32_e64 s[8:9], 1, v27
-; GFX8-NEXT:    v_cmp_eq_u32_e64 s[4:5], 1, v29
-; GFX8-NEXT:    v_and_b32_e32 v30, 1, v30
-; GFX8-NEXT:    s_waitcnt vmcnt(14)
-; GFX8-NEXT:    v_cndmask_b32_e64 v4, v7, v8, s[50:51]
-; GFX8-NEXT:    v_lshrrev_b32_e32 v8, 16, v8
-; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
-; GFX8-NEXT:    v_cndmask_b32_e64 v7, v7, v8, s[48:49]
-; GFX8-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
-; GFX8-NEXT:    v_or_b32_sdwa v4, v4, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v30
+; GFX8-NEXT:    buffer_load_dword v22, off, s[0:3], s32 offset:112
+; GFX8-NEXT:    buffer_load_dword v23, off, s[0:3], s32 offset:48
+; GFX8-NEXT:    buffer_load_dword v24, off, s[0:3], s32 offset:116
+; GFX8-NEXT:    buffer_load_dword v25, off, s[0:3], s32 offset:52
+; GFX8-NEXT:    buffer_load_dword v26, off, s[0:3], s32 offset:120
+; GFX8-NEXT:    buffer_load_dword v27, off, s[0:3], s32 offset:56
+; GFX8-NEXT:    buffer_load_dword v30, off, s[0:3], s32 offset:124
+; GFX8-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:60
+; GFX8-NEXT:    buffer_load_dword v29, off, s[0:3], s32 offset:128
+; GFX8-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:64
+; GFX8-NEXT:    s_waitcnt vmcnt(1)
+; GFX8-NEXT:    v_lshrrev_b32_e32 v34, 16, v29
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_lshrrev_b32_e32 v28, 16, v33
+; GFX8-NEXT:    v_cndmask_b32_e64 v28, v34, v28, s[66:67]
+; GFX8-NEXT:    v_cndmask_b32_e64 v29, v29, v33, s[64:65]
+; GFX8-NEXT:    v_lshrrev_b32_e32 v33, 16, v32
+; GFX8-NEXT:    v_lshrrev_b32_e32 v34, 16, v30
+; GFX8-NEXT:    v_cndmask_b32_e64 v33, v34, v33, s[62:63]
+; GFX8-NEXT:    v_cndmask_b32_e64 v30, v30, v32, s[60:61]
+; GFX8-NEXT:    v_lshrrev_b32_e32 v32, 16, v27
+; GFX8-NEXT:    v_lshrrev_b32_e32 v34, 16, v26
+; GFX8-NEXT:    v_cndmask_b32_e64 v32, v34, v32, s[58:59]
+; GFX8-NEXT:    v_cndmask_b32_e64 v26, v26, v27, s[56:57]
+; GFX8-NEXT:    v_lshrrev_b32_e32 v27, 16, v25
+; GFX8-NEXT:    v_lshrrev_b32_e32 v34, 16, v24
+; GFX8-NEXT:    v_cndmask_b32_e64 v27, v34, v27, s[54:55]
+; GFX8-NEXT:    v_cndmask_b32_e64 v24, v24, v25, s[52:53]
+; GFX8-NEXT:    v_lshrrev_b32_e32 v25, 16, v23
+; GFX8-NEXT:    v_lshrrev_b32_e32 v34, 16, v22
+; GFX8-NEXT:    v_cndmask_b32_e64 v25, v34, v25, s[50:51]
+; GFX8-NEXT:    v_cndmask_b32_e64 v22, v22, v23, s[48:49]
+; GFX8-NEXT:    v_lshrrev_b32_e32 v23, 16, v21
+; GFX8-NEXT:    v_lshrrev_b32_e32 v34, 16, v20
+; GFX8-NEXT:    v_cndmask_b32_e64 v23, v34, v23, s[46:47]
+; GFX8-NEXT:    v_cndmask_b32_e64 v20, v20, v21, s[44:45]
+; GFX8-NEXT:    v_lshrrev_b32_e32 v21, 16, v19
+; GFX8-NEXT:    v_lshrrev_b32_e32 v34, 16, v18
+; GFX8-NEXT:    v_cndmask_b32_e64 v21, v34, v21, s[42:43]
+; GFX8-NEXT:    v_cndmask_b32_e64 v18, v18, v19, s[40:41]
+; GFX8-NEXT:    v_lshrrev_b32_e32 v19, 16, v17
+; GFX8-NEXT:    v_lshrrev_b32_e32 v34, 16, v16
+; GFX8-NEXT:    v_cndmask_b32_e64 v19, v34, v19, s[38:39]
+; GFX8-NEXT:    v_cndmask_b32_e64 v16, v16, v17, s[36:37]
+; GFX8-NEXT:    v_lshrrev_b32_e32 v17, 16, v15
+; GFX8-NEXT:    v_lshrrev_b32_e32 v34, 16, v14
+; GFX8-NEXT:    v_cndmask_b32_e64 v17, v34, v17, s[34:35]
+; GFX8-NEXT:    v_cndmask_b32_e64 v14, v14, v15, s[30:31]
+; GFX8-NEXT:    v_lshrrev_b32_e32 v15, 16, v13
+; GFX8-NEXT:    v_lshrrev_b32_e32 v34, 16, v12
+; GFX8-NEXT:    v_cndmask_b32_e64 v15, v34, v15, s[28:29]
+; GFX8-NEXT:    v_cndmask_b32_e64 v12, v12, v13, s[26:27]
+; GFX8-NEXT:    v_lshrrev_b32_e32 v13, 16, v11
+; GFX8-NEXT:    v_lshrrev_b32_e32 v34, 16, v10
+; GFX8-NEXT:    v_cndmask_b32_e64 v13, v34, v13, s[24:25]
+; GFX8-NEXT:    v_cndmask_b32_e64 v10, v10, v11, s[22:23]
+; GFX8-NEXT:    v_lshrrev_b32_e32 v11, 16, v9
+; GFX8-NEXT:    v_lshrrev_b32_e32 v34, 16, v8
+; GFX8-NEXT:    v_cndmask_b32_e64 v11, v34, v11, s[20:21]
+; GFX8-NEXT:    v_cndmask_b32_e64 v8, v8, v9, s[18:19]
+; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 16, v7
+; GFX8-NEXT:    v_lshrrev_b32_e32 v34, 16, v6
+; GFX8-NEXT:    v_cndmask_b32_e64 v9, v34, v9, s[16:17]
+; GFX8-NEXT:    v_cndmask_b32_e64 v6, v6, v7, s[14:15]
+; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 16, v5
+; GFX8-NEXT:    v_lshrrev_b32_e32 v34, 16, v4
+; GFX8-NEXT:    v_cndmask_b32_e64 v7, v34, v7, s[12:13]
+; GFX8-NEXT:    v_cndmask_b32_e64 v4, v4, v5, s[10:11]
+; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v3
+; GFX8-NEXT:    v_lshrrev_b32_e32 v34, 16, v2
+; GFX8-NEXT:    v_cndmask_b32_e64 v5, v34, v5, s[8:9]
+; GFX8-NEXT:    v_cndmask_b32_e64 v2, v2, v3, s[6:7]
+; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
+; GFX8-NEXT:    v_lshrrev_b32_e32 v34, 16, v0
+; GFX8-NEXT:    v_cndmask_b32_e64 v3, v34, v3, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
+; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v5
+; GFX8-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v7
+; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v9
+; GFX8-NEXT:    v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_sdwa v3, v6, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v11
+; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v13
+; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v15
+; GFX8-NEXT:    v_lshlrev_b32_e32 v7, 16, v17
+; GFX8-NEXT:    v_or_b32_sdwa v4, v8, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_sdwa v5, v10, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_sdwa v6, v12, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_sdwa v7, v14, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    v_lshlrev_b32_e32 v8, 16, v19
+; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 16, v21
+; GFX8-NEXT:    v_lshlrev_b32_e32 v10, 16, v23
+; GFX8-NEXT:    v_lshlrev_b32_e32 v11, 16, v25
+; GFX8-NEXT:    v_lshlrev_b32_e32 v12, 16, v27
+; GFX8-NEXT:    v_lshlrev_b32_e32 v13, 16, v32
+; GFX8-NEXT:    v_lshlrev_b32_e32 v14, 16, v33
+; GFX8-NEXT:    v_lshlrev_b32_e32 v15, 16, v28
+; GFX8-NEXT:    v_or_b32_sdwa v8, v16, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_sdwa v9, v18, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_sdwa v10, v20, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_sdwa v11, v22, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_sdwa v12, v24, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_sdwa v13, v26, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_sdwa v14, v30, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_sdwa v15, v29, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT:    v_readlane_b32 s67, v31, 35
 ; GFX8-NEXT:    v_readlane_b32 s66, v31, 34
 ; GFX8-NEXT:    v_readlane_b32 s65, v31, 33
@@ -29427,18 +28794,6 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x
 ; GFX8-NEXT:    v_readlane_b32 s60, v31, 28
 ; GFX8-NEXT:    v_readlane_b32 s59, v31, 27
 ; GFX8-NEXT:    v_readlane_b32 s58, v31, 26
-; GFX8-NEXT:    v_cndmask_b32_e64 v7, v5, v6, s[46:47]
-; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
-; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
-; GFX8-NEXT:    v_cndmask_b32_e64 v5, v5, v6, s[44:45]
-; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX8-NEXT:    v_or_b32_sdwa v5, v7, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 16, v10
-; GFX8-NEXT:    v_lshrrev_b32_e32 v8, 16, v9
-; GFX8-NEXT:    v_cndmask_b32_e64 v6, v9, v10, s[42:43]
-; GFX8-NEXT:    v_cndmask_b32_e64 v7, v8, v7, s[40:41]
-; GFX8-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
-; GFX8-NEXT:    v_or_b32_sdwa v6, v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT:    v_readlane_b32 s57, v31, 25
 ; GFX8-NEXT:    v_readlane_b32 s56, v31, 24
 ; GFX8-NEXT:    v_readlane_b32 s55, v31, 23
@@ -29457,43 +28812,6 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x
 ; GFX8-NEXT:    v_readlane_b32 s42, v31, 10
 ; GFX8-NEXT:    v_readlane_b32 s41, v31, 9
 ; GFX8-NEXT:    v_readlane_b32 s40, v31, 8
-; GFX8-NEXT:    s_waitcnt vmcnt(6)
-; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 16, v22
-; GFX8-NEXT:    s_waitcnt vmcnt(5)
-; GFX8-NEXT:    v_lshrrev_b32_e32 v8, 16, v23
-; GFX8-NEXT:    v_cndmask_b32_e64 v8, v9, v8, s[36:37]
-; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 16, v18
-; GFX8-NEXT:    s_waitcnt vmcnt(3)
-; GFX8-NEXT:    v_lshrrev_b32_e32 v10, 16, v25
-; GFX8-NEXT:    v_cndmask_b32_e64 v7, v22, v23, s[38:39]
-; GFX8-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
-; GFX8-NEXT:    v_cndmask_b32_e64 v9, v10, v9, s[30:31]
-; GFX8-NEXT:    v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX8-NEXT:    v_cndmask_b32_e64 v8, v25, v18, s[34:35]
-; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
-; GFX8-NEXT:    v_or_b32_sdwa v8, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX8-NEXT:    v_cndmask_b32_e64 v9, v24, v16, s[28:29]
-; GFX8-NEXT:    v_lshrrev_b32_e32 v10, 16, v16
-; GFX8-NEXT:    v_lshrrev_b32_e32 v16, 16, v24
-; GFX8-NEXT:    v_cndmask_b32_e64 v10, v16, v10, s[26:27]
-; GFX8-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:52
-; GFX8-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
-; GFX8-NEXT:    v_or_b32_sdwa v9, v9, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX8-NEXT:    v_cndmask_b32_e64 v10, v11, v21, s[24:25]
-; GFX8-NEXT:    v_lshrrev_b32_e32 v16, 16, v21
-; GFX8-NEXT:    v_lshrrev_b32_e32 v11, 16, v11
-; GFX8-NEXT:    v_cndmask_b32_e64 v11, v11, v16, s[22:23]
-; GFX8-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:60
-; GFX8-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
-; GFX8-NEXT:    v_or_b32_sdwa v10, v10, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX8-NEXT:    s_waitcnt vmcnt(4)
-; GFX8-NEXT:    v_cndmask_b32_e64 v11, v19, v20, s[20:21]
-; GFX8-NEXT:    v_lshrrev_b32_e32 v20, 16, v20
-; GFX8-NEXT:    v_lshrrev_b32_e32 v19, 16, v19
-; GFX8-NEXT:    v_cndmask_b32_e64 v19, v19, v20, s[16:17]
-; GFX8-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:64
-; GFX8-NEXT:    v_lshlrev_b32_e32 v19, 16, v19
-; GFX8-NEXT:    v_or_b32_sdwa v11, v11, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT:    v_readlane_b32 s39, v31, 7
 ; GFX8-NEXT:    v_readlane_b32 s38, v31, 6
 ; GFX8-NEXT:    v_readlane_b32 s37, v31, 5
@@ -29502,33 +28820,6 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x
 ; GFX8-NEXT:    v_readlane_b32 s34, v31, 2
 ; GFX8-NEXT:    v_readlane_b32 s31, v31, 1
 ; GFX8-NEXT:    v_readlane_b32 s30, v31, 0
-; GFX8-NEXT:    s_waitcnt vmcnt(2)
-; GFX8-NEXT:    v_cndmask_b32_e64 v19, v12, v18, s[14:15]
-; GFX8-NEXT:    v_lshrrev_b32_e32 v18, 16, v18
-; GFX8-NEXT:    v_lshrrev_b32_e32 v12, 16, v12
-; GFX8-NEXT:    v_cndmask_b32_e64 v12, v12, v18, s[12:13]
-; GFX8-NEXT:    v_cndmask_b32_e64 v18, v13, v17, s[10:11]
-; GFX8-NEXT:    v_lshrrev_b32_e32 v17, 16, v17
-; GFX8-NEXT:    v_lshrrev_b32_e32 v13, 16, v13
-; GFX8-NEXT:    v_cndmask_b32_e64 v13, v13, v17, s[8:9]
-; GFX8-NEXT:    s_waitcnt vmcnt(1)
-; GFX8-NEXT:    v_cndmask_b32_e64 v17, v14, v16, s[6:7]
-; GFX8-NEXT:    v_lshrrev_b32_e32 v16, 16, v16
-; GFX8-NEXT:    v_lshrrev_b32_e32 v14, 16, v14
-; GFX8-NEXT:    v_cndmask_b32_e64 v14, v14, v16, s[4:5]
-; GFX8-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
-; GFX8-NEXT:    v_or_b32_sdwa v14, v17, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_cndmask_b32_e32 v16, v15, v20, vcc
-; GFX8-NEXT:    v_lshrrev_b32_e32 v17, 16, v20
-; GFX8-NEXT:    v_lshrrev_b32_e32 v15, 16, v15
-; GFX8-NEXT:    v_cndmask_b32_e64 v15, v15, v17, s[18:19]
-; GFX8-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
-; GFX8-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
-; GFX8-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
-; GFX8-NEXT:    v_or_b32_sdwa v12, v19, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX8-NEXT:    v_or_b32_sdwa v13, v18, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX8-NEXT:    v_or_b32_sdwa v15, v16, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX8-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
 ; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
@@ -29544,169 +28835,223 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x
 ; GFX9-NEXT:    v_writelane_b32 v31, s30, 0
 ; GFX9-NEXT:    v_writelane_b32 v31, s31, 1
 ; GFX9-NEXT:    v_writelane_b32 v31, s34, 2
+; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX9-NEXT:    v_writelane_b32 v31, s35, 3
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[4:5], 1, v0
+; GFX9-NEXT:    v_and_b32_e32 v0, 1, v3
 ; GFX9-NEXT:    v_writelane_b32 v31, s36, 4
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[6:7], 1, v0
+; GFX9-NEXT:    v_and_b32_e32 v0, 1, v2
 ; GFX9-NEXT:    v_writelane_b32 v31, s37, 5
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[8:9], 1, v0
+; GFX9-NEXT:    v_and_b32_e32 v0, 1, v5
 ; GFX9-NEXT:    v_writelane_b32 v31, s38, 6
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[10:11], 1, v0
+; GFX9-NEXT:    v_and_b32_e32 v0, 1, v4
 ; GFX9-NEXT:    v_writelane_b32 v31, s39, 7
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[12:13], 1, v0
+; GFX9-NEXT:    v_and_b32_e32 v0, 1, v7
 ; GFX9-NEXT:    v_writelane_b32 v31, s40, 8
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[14:15], 1, v0
+; GFX9-NEXT:    v_and_b32_e32 v0, 1, v6
 ; GFX9-NEXT:    v_writelane_b32 v31, s41, 9
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[16:17], 1, v0
+; GFX9-NEXT:    v_and_b32_e32 v0, 1, v9
 ; GFX9-NEXT:    v_writelane_b32 v31, s42, 10
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[18:19], 1, v0
+; GFX9-NEXT:    v_and_b32_e32 v0, 1, v8
 ; GFX9-NEXT:    v_writelane_b32 v31, s43, 11
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[20:21], 1, v0
+; GFX9-NEXT:    v_and_b32_e32 v0, 1, v11
 ; GFX9-NEXT:    v_writelane_b32 v31, s44, 12
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[22:23], 1, v0
+; GFX9-NEXT:    v_and_b32_e32 v0, 1, v10
 ; GFX9-NEXT:    v_writelane_b32 v31, s45, 13
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[24:25], 1, v0
+; GFX9-NEXT:    v_and_b32_e32 v0, 1, v13
 ; GFX9-NEXT:    v_writelane_b32 v31, s46, 14
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[26:27], 1, v0
+; GFX9-NEXT:    v_and_b32_e32 v0, 1, v12
 ; GFX9-NEXT:    v_writelane_b32 v31, s47, 15
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[28:29], 1, v0
+; GFX9-NEXT:    v_and_b32_e32 v0, 1, v15
 ; GFX9-NEXT:    v_writelane_b32 v31, s48, 16
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[30:31], 1, v0
+; GFX9-NEXT:    v_and_b32_e32 v0, 1, v14
 ; GFX9-NEXT:    v_writelane_b32 v31, s49, 17
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[34:35], 1, v0
+; GFX9-NEXT:    v_and_b32_e32 v0, 1, v17
 ; GFX9-NEXT:    v_writelane_b32 v31, s50, 18
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[36:37], 1, v0
+; GFX9-NEXT:    v_and_b32_e32 v0, 1, v16
 ; GFX9-NEXT:    v_writelane_b32 v31, s51, 19
-; GFX9-NEXT:    v_and_b32_e32 v21, 1, v21
-; GFX9-NEXT:    v_and_b32_e32 v18, 1, v18
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[38:39], 1, v0
+; GFX9-NEXT:    v_and_b32_e32 v0, 1, v19
 ; GFX9-NEXT:    v_writelane_b32 v31, s52, 20
-; GFX9-NEXT:    v_cmp_eq_u32_e64 s[22:23], 1, v21
-; GFX9-NEXT:    v_cmp_eq_u32_e64 s[28:29], 1, v18
-; GFX9-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:68
-; GFX9-NEXT:    buffer_load_dword v21, off, s[0:3], s32 offset:4
-; GFX9-NEXT:    v_and_b32_e32 v17, 1, v17
-; GFX9-NEXT:    v_and_b32_e32 v16, 1, v16
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[40:41], 1, v0
+; GFX9-NEXT:    v_and_b32_e32 v0, 1, v18
 ; GFX9-NEXT:    v_writelane_b32 v31, s53, 21
-; GFX9-NEXT:    v_cmp_eq_u32_e64 s[30:31], 1, v17
-; GFX9-NEXT:    v_cmp_eq_u32_e64 s[34:35], 1, v16
-; GFX9-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:72
-; GFX9-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:8
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[42:43], 1, v0
+; GFX9-NEXT:    v_and_b32_e32 v0, 1, v21
 ; GFX9-NEXT:    v_writelane_b32 v31, s54, 22
-; GFX9-NEXT:    v_and_b32_e32 v15, 1, v15
-; GFX9-NEXT:    v_and_b32_e32 v14, 1, v14
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[44:45], 1, v0
+; GFX9-NEXT:    v_and_b32_e32 v0, 1, v20
 ; GFX9-NEXT:    v_writelane_b32 v31, s55, 23
-; GFX9-NEXT:    v_cmp_eq_u32_e64 s[36:37], 1, v15
-; GFX9-NEXT:    v_cmp_eq_u32_e64 s[38:39], 1, v14
-; GFX9-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:76
-; GFX9-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:12
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[46:47], 1, v0
+; GFX9-NEXT:    v_and_b32_e32 v0, 1, v23
 ; GFX9-NEXT:    v_writelane_b32 v31, s56, 24
-; GFX9-NEXT:    v_and_b32_e32 v13, 1, v13
-; GFX9-NEXT:    v_and_b32_e32 v12, 1, v12
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[48:49], 1, v0
+; GFX9-NEXT:    v_and_b32_e32 v0, 1, v22
 ; GFX9-NEXT:    v_writelane_b32 v31, s57, 25
-; GFX9-NEXT:    v_cmp_eq_u32_e64 s[40:41], 1, v13
-; GFX9-NEXT:    v_cmp_eq_u32_e64 s[42:43], 1, v12
-; GFX9-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:80
-; GFX9-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:16
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[50:51], 1, v0
+; GFX9-NEXT:    v_and_b32_e32 v0, 1, v25
 ; GFX9-NEXT:    v_writelane_b32 v31, s58, 26
-; GFX9-NEXT:    v_and_b32_e32 v5, 1, v5
-; GFX9-NEXT:    v_and_b32_e32 v4, 1, v4
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[52:53], 1, v0
+; GFX9-NEXT:    v_and_b32_e32 v0, 1, v24
 ; GFX9-NEXT:    v_writelane_b32 v31, s59, 27
-; GFX9-NEXT:    v_cmp_eq_u32_e64 s[56:57], 1, v5
-; GFX9-NEXT:    v_cmp_eq_u32_e64 s[58:59], 1, v4
-; GFX9-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:84
-; GFX9-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:20
-; GFX9-NEXT:    v_and_b32_e32 v20, 1, v20
-; GFX9-NEXT:    v_cmp_eq_u32_e64 s[24:25], 1, v20
-; GFX9-NEXT:    buffer_load_ushort v20, off, s[0:3], s32
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[54:55], 1, v0
+; GFX9-NEXT:    v_and_b32_e32 v0, 1, v27
 ; GFX9-NEXT:    v_writelane_b32 v31, s60, 28
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[56:57], 1, v0
+; GFX9-NEXT:    v_and_b32_e32 v0, 1, v26
 ; GFX9-NEXT:    v_writelane_b32 v31, s61, 29
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[58:59], 1, v0
+; GFX9-NEXT:    v_and_b32_e32 v0, 1, v29
 ; GFX9-NEXT:    v_writelane_b32 v31, s62, 30
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[60:61], 1, v0
+; GFX9-NEXT:    v_and_b32_e32 v0, 1, v28
 ; GFX9-NEXT:    v_writelane_b32 v31, s63, 31
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[62:63], 1, v0
+; GFX9-NEXT:    buffer_load_ushort v0, off, s[0:3], s32
 ; GFX9-NEXT:    v_writelane_b32 v31, s64, 32
 ; GFX9-NEXT:    v_writelane_b32 v31, s65, 33
 ; GFX9-NEXT:    v_writelane_b32 v31, s66, 34
-; GFX9-NEXT:    v_and_b32_e32 v2, 1, v2
 ; GFX9-NEXT:    v_and_b32_e32 v1, 1, v1
-; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX9-NEXT:    v_writelane_b32 v31, s67, 35
-; GFX9-NEXT:    v_and_b32_e32 v3, 1, v3
-; GFX9-NEXT:    v_cmp_eq_u32_e64 s[62:63], 1, v2
-; GFX9-NEXT:    v_cmp_eq_u32_e64 s[64:65], 1, v1
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v1
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[64:65], 1, v0
+; GFX9-NEXT:    v_and_b32_e32 v0, 1, v30
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 s[66:67], 1, v0
-; GFX9-NEXT:    v_cmp_eq_u32_e64 s[60:61], 1, v3
-; GFX9-NEXT:    v_and_b32_e32 v6, 1, v6
-; GFX9-NEXT:    v_and_b32_e32 v7, 1, v7
-; GFX9-NEXT:    v_cmp_eq_u32_e64 s[54:55], 1, v6
-; GFX9-NEXT:    v_and_b32_e32 v8, 1, v8
-; GFX9-NEXT:    v_cmp_eq_u32_e64 s[52:53], 1, v7
-; GFX9-NEXT:    v_and_b32_e32 v9, 1, v9
-; GFX9-NEXT:    v_cmp_eq_u32_e64 s[50:51], 1, v8
-; GFX9-NEXT:    v_cmp_eq_u32_e64 s[48:49], 1, v9
-; GFX9-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:88
-; GFX9-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:24
-; GFX9-NEXT:    v_and_b32_e32 v24, 1, v24
-; GFX9-NEXT:    v_and_b32_e32 v11, 1, v11
-; GFX9-NEXT:    v_cmp_eq_u32_e64 s[14:15], 1, v24
-; GFX9-NEXT:    v_and_b32_e32 v23, 1, v23
-; GFX9-NEXT:    v_and_b32_e32 v22, 1, v22
-; GFX9-NEXT:    v_and_b32_e32 v19, 1, v19
-; GFX9-NEXT:    v_cmp_eq_u32_e64 s[44:45], 1, v11
-; GFX9-NEXT:    v_and_b32_e32 v10, 1, v10
-; GFX9-NEXT:    buffer_load_dword v24, off, s[0:3], s32 offset:48
-; GFX9-NEXT:    v_cmp_eq_u32_e64 s[16:17], 1, v23
-; GFX9-NEXT:    v_cmp_eq_u32_e64 s[20:21], 1, v22
-; GFX9-NEXT:    v_cmp_eq_u32_e64 s[26:27], 1, v19
-; GFX9-NEXT:    v_cmp_eq_u32_e64 s[46:47], 1, v10
-; GFX9-NEXT:    v_and_b32_e32 v26, 1, v26
-; GFX9-NEXT:    v_and_b32_e32 v25, 1, v25
-; GFX9-NEXT:    v_and_b32_e32 v28, 1, v28
-; GFX9-NEXT:    v_and_b32_e32 v27, 1, v27
-; GFX9-NEXT:    v_cmp_eq_u32_e64 s[10:11], 1, v26
-; GFX9-NEXT:    v_cmp_eq_u32_e64 s[12:13], 1, v25
-; GFX9-NEXT:    v_and_b32_e32 v29, 1, v29
-; GFX9-NEXT:    v_cmp_eq_u32_e64 s[6:7], 1, v28
-; GFX9-NEXT:    v_cmp_eq_u32_e64 s[8:9], 1, v27
-; GFX9-NEXT:    v_and_b32_e32 v30, 1, v30
-; GFX9-NEXT:    s_waitcnt vmcnt(13)
-; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v18
-; GFX9-NEXT:    s_waitcnt vmcnt(12)
-; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v21
-; GFX9-NEXT:    v_cndmask_b32_e64 v0, v18, v21, s[66:67]
-; GFX9-NEXT:    v_cndmask_b32_e64 v1, v2, v1, s[64:65]
-; GFX9-NEXT:    s_mov_b32 s64, 0x5040100
-; GFX9-NEXT:    v_perm_b32 v0, v1, v0, s64
-; GFX9-NEXT:    s_waitcnt vmcnt(11)
-; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v16
-; GFX9-NEXT:    s_waitcnt vmcnt(10)
-; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v17
-; GFX9-NEXT:    v_cndmask_b32_e64 v1, v16, v17, s[62:63]
-; GFX9-NEXT:    v_cndmask_b32_e64 v2, v3, v2, s[60:61]
-; GFX9-NEXT:    v_perm_b32 v1, v2, v1, s64
-; GFX9-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:36
-; GFX9-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:40
+; GFX9-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:68
+; GFX9-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:4
+; GFX9-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:72
+; GFX9-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:8
+; GFX9-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:76
+; GFX9-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:12
+; GFX9-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:80
+; GFX9-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:16
+; GFX9-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:84
+; GFX9-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:20
+; GFX9-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:88
+; GFX9-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:24
+; GFX9-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:92
+; GFX9-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:28
+; GFX9-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:96
+; GFX9-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:32
+; GFX9-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:100
+; GFX9-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:36
+; GFX9-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:104
+; GFX9-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:40
+; GFX9-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:108
 ; GFX9-NEXT:    buffer_load_dword v21, off, s[0:3], s32 offset:44
-; GFX9-NEXT:    s_waitcnt vmcnt(11)
-; GFX9-NEXT:    v_cndmask_b32_e64 v2, v14, v15, s[58:59]
-; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v15
+; GFX9-NEXT:    buffer_load_dword v22, off, s[0:3], s32 offset:112
+; GFX9-NEXT:    buffer_load_dword v23, off, s[0:3], s32 offset:48
+; GFX9-NEXT:    buffer_load_dword v24, off, s[0:3], s32 offset:116
+; GFX9-NEXT:    buffer_load_dword v25, off, s[0:3], s32 offset:52
+; GFX9-NEXT:    buffer_load_dword v26, off, s[0:3], s32 offset:120
+; GFX9-NEXT:    buffer_load_dword v27, off, s[0:3], s32 offset:56
+; GFX9-NEXT:    buffer_load_dword v28, off, s[0:3], s32 offset:124
+; GFX9-NEXT:    buffer_load_dword v30, off, s[0:3], s32 offset:60
+; GFX9-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:128
+; GFX9-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:64
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_cndmask_b32_e64 v29, v32, v33, s[66:67]
+; GFX9-NEXT:    v_lshrrev_b32_e32 v33, 16, v33
+; GFX9-NEXT:    v_lshrrev_b32_e32 v32, 16, v32
+; GFX9-NEXT:    v_cndmask_b32_e64 v32, v32, v33, s[64:65]
+; GFX9-NEXT:    v_cndmask_b32_e64 v33, v28, v30, s[62:63]
+; GFX9-NEXT:    v_lshrrev_b32_e32 v30, 16, v30
+; GFX9-NEXT:    v_lshrrev_b32_e32 v28, 16, v28
+; GFX9-NEXT:    v_cndmask_b32_e64 v28, v28, v30, s[60:61]
+; GFX9-NEXT:    v_cndmask_b32_e64 v30, v26, v27, s[58:59]
+; GFX9-NEXT:    v_lshrrev_b32_e32 v27, 16, v27
+; GFX9-NEXT:    v_lshrrev_b32_e32 v26, 16, v26
+; GFX9-NEXT:    v_cndmask_b32_e64 v26, v26, v27, s[56:57]
+; GFX9-NEXT:    v_cndmask_b32_e64 v27, v24, v25, s[54:55]
+; GFX9-NEXT:    v_lshrrev_b32_e32 v25, 16, v25
+; GFX9-NEXT:    v_lshrrev_b32_e32 v24, 16, v24
+; GFX9-NEXT:    v_cndmask_b32_e64 v24, v24, v25, s[52:53]
+; GFX9-NEXT:    v_cndmask_b32_e64 v25, v22, v23, s[50:51]
+; GFX9-NEXT:    v_lshrrev_b32_e32 v23, 16, v23
+; GFX9-NEXT:    v_lshrrev_b32_e32 v22, 16, v22
+; GFX9-NEXT:    v_cndmask_b32_e64 v22, v22, v23, s[48:49]
+; GFX9-NEXT:    v_cndmask_b32_e64 v23, v20, v21, s[46:47]
+; GFX9-NEXT:    v_lshrrev_b32_e32 v21, 16, v21
+; GFX9-NEXT:    v_lshrrev_b32_e32 v20, 16, v20
+; GFX9-NEXT:    v_cndmask_b32_e64 v20, v20, v21, s[44:45]
+; GFX9-NEXT:    v_cndmask_b32_e64 v21, v18, v19, s[42:43]
+; GFX9-NEXT:    v_lshrrev_b32_e32 v19, 16, v19
+; GFX9-NEXT:    v_lshrrev_b32_e32 v18, 16, v18
+; GFX9-NEXT:    v_cndmask_b32_e64 v18, v18, v19, s[40:41]
+; GFX9-NEXT:    v_cndmask_b32_e64 v19, v16, v17, s[38:39]
+; GFX9-NEXT:    v_lshrrev_b32_e32 v17, 16, v17
+; GFX9-NEXT:    v_lshrrev_b32_e32 v16, 16, v16
+; GFX9-NEXT:    v_cndmask_b32_e64 v16, v16, v17, s[36:37]
+; GFX9-NEXT:    v_cndmask_b32_e64 v17, v14, v15, s[34:35]
+; GFX9-NEXT:    v_lshrrev_b32_e32 v15, 16, v15
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v14, 16, v14
-; GFX9-NEXT:    v_cndmask_b32_e64 v3, v14, v3, s[56:57]
-; GFX9-NEXT:    v_perm_b32 v2, v3, v2, s64
-; GFX9-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:124
-; GFX9-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:128
-; GFX9-NEXT:    s_waitcnt vmcnt(11)
-; GFX9-NEXT:    v_cndmask_b32_e64 v3, v12, v13, s[54:55]
+; GFX9-NEXT:    v_cndmask_b32_e64 v14, v14, v15, s[30:31]
+; GFX9-NEXT:    v_cndmask_b32_e64 v15, v12, v13, s[28:29]
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v13, 16, v13
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v12, 16, v12
-; GFX9-NEXT:    v_cndmask_b32_e64 v12, v12, v13, s[52:53]
-; GFX9-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:120
-; GFX9-NEXT:    v_perm_b32 v3, v12, v3, s64
-; GFX9-NEXT:    v_cmp_eq_u32_e64 s[4:5], 1, v29
-; GFX9-NEXT:    s_waitcnt vmcnt(10)
-; GFX9-NEXT:    v_cndmask_b32_e64 v12, v4, v5, s[50:51]
+; GFX9-NEXT:    v_cndmask_b32_e64 v12, v12, v13, s[26:27]
+; GFX9-NEXT:    v_cndmask_b32_e64 v13, v10, v11, s[24:25]
+; GFX9-NEXT:    v_lshrrev_b32_e32 v11, 16, v11
+; GFX9-NEXT:    v_lshrrev_b32_e32 v10, 16, v10
+; GFX9-NEXT:    v_cndmask_b32_e64 v10, v10, v11, s[22:23]
+; GFX9-NEXT:    v_cndmask_b32_e64 v11, v8, v9, s[20:21]
+; GFX9-NEXT:    v_lshrrev_b32_e32 v9, 16, v9
+; GFX9-NEXT:    v_lshrrev_b32_e32 v8, 16, v8
+; GFX9-NEXT:    v_cndmask_b32_e64 v8, v8, v9, s[18:19]
+; GFX9-NEXT:    v_cndmask_b32_e64 v9, v6, v7, s[16:17]
+; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
+; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
+; GFX9-NEXT:    v_cndmask_b32_e64 v6, v6, v7, s[14:15]
+; GFX9-NEXT:    v_cndmask_b32_e64 v7, v4, v5, s[12:13]
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
-; GFX9-NEXT:    v_cndmask_b32_e64 v4, v4, v5, s[48:49]
-; GFX9-NEXT:    v_perm_b32 v4, v4, v12, s64
-; GFX9-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:116
-; GFX9-NEXT:    s_waitcnt vmcnt(10)
-; GFX9-NEXT:    v_and_b32_e32 v11, 1, v20
-; GFX9-NEXT:    v_cmp_eq_u32_e64 s[18:19], 1, v11
-; GFX9-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:92
-; GFX9-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:28
-; GFX9-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:112
-; GFX9-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:108
-; GFX9-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:104
-; GFX9-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:100
-; GFX9-NEXT:    buffer_load_dword v22, off, s[0:3], s32 offset:96
-; GFX9-NEXT:    buffer_load_dword v23, off, s[0:3], s32 offset:32
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v30
+; GFX9-NEXT:    v_cndmask_b32_e64 v4, v4, v5, s[10:11]
+; GFX9-NEXT:    v_cndmask_b32_e64 v5, v2, v3, s[8:9]
+; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, v2, v3, s[6:7]
+; GFX9-NEXT:    v_cndmask_b32_e64 v3, v0, v1, s[4:5]
+; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
+; GFX9-NEXT:    v_perm_b32 v0, v0, v3, s4
+; GFX9-NEXT:    v_perm_b32 v1, v2, v5, s4
+; GFX9-NEXT:    v_perm_b32 v2, v4, v7, s4
+; GFX9-NEXT:    v_perm_b32 v3, v6, v9, s4
+; GFX9-NEXT:    v_perm_b32 v4, v8, v11, s4
+; GFX9-NEXT:    v_perm_b32 v5, v10, v13, s4
+; GFX9-NEXT:    v_perm_b32 v6, v12, v15, s4
+; GFX9-NEXT:    v_perm_b32 v7, v14, v17, s4
+; GFX9-NEXT:    v_perm_b32 v8, v16, v19, s4
+; GFX9-NEXT:    v_perm_b32 v9, v18, v21, s4
+; GFX9-NEXT:    v_perm_b32 v10, v20, v23, s4
+; GFX9-NEXT:    v_perm_b32 v11, v22, v25, s4
+; GFX9-NEXT:    v_perm_b32 v12, v24, v27, s4
+; GFX9-NEXT:    v_perm_b32 v13, v26, v30, s4
+; GFX9-NEXT:    v_perm_b32 v14, v28, v33, s4
+; GFX9-NEXT:    v_perm_b32 v15, v32, v29, s4
 ; GFX9-NEXT:    v_readlane_b32 s67, v31, 35
 ; GFX9-NEXT:    v_readlane_b32 s66, v31, 34
 ; GFX9-NEXT:    v_readlane_b32 s65, v31, 33
+; GFX9-NEXT:    v_readlane_b32 s64, v31, 32
 ; GFX9-NEXT:    v_readlane_b32 s63, v31, 31
 ; GFX9-NEXT:    v_readlane_b32 s62, v31, 30
 ; GFX9-NEXT:    v_readlane_b32 s61, v31, 29
@@ -29722,54 +29067,11 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x
 ; GFX9-NEXT:    v_readlane_b32 s51, v31, 19
 ; GFX9-NEXT:    v_readlane_b32 s50, v31, 18
 ; GFX9-NEXT:    v_readlane_b32 s49, v31, 17
-; GFX9-NEXT:    s_waitcnt vmcnt(16)
-; GFX9-NEXT:    v_cndmask_b32_e64 v5, v6, v7, s[46:47]
-; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
-; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
-; GFX9-NEXT:    v_cndmask_b32_e64 v6, v6, v7, s[44:45]
-; GFX9-NEXT:    v_perm_b32 v5, v6, v5, s64
 ; GFX9-NEXT:    v_readlane_b32 s48, v31, 16
 ; GFX9-NEXT:    v_readlane_b32 s47, v31, 15
 ; GFX9-NEXT:    v_readlane_b32 s46, v31, 14
 ; GFX9-NEXT:    v_readlane_b32 s45, v31, 13
 ; GFX9-NEXT:    v_readlane_b32 s44, v31, 12
-; GFX9-NEXT:    s_waitcnt vmcnt(6)
-; GFX9-NEXT:    v_cndmask_b32_e64 v6, v8, v9, s[42:43]
-; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 16, v9
-; GFX9-NEXT:    v_lshrrev_b32_e32 v8, 16, v8
-; GFX9-NEXT:    v_cndmask_b32_e64 v7, v8, v7, s[40:41]
-; GFX9-NEXT:    v_perm_b32 v6, v7, v6, s64
-; GFX9-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-NEXT:    v_lshrrev_b32_e32 v9, 16, v22
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_lshrrev_b32_e32 v8, 16, v23
-; GFX9-NEXT:    v_cndmask_b32_e64 v7, v22, v23, s[38:39]
-; GFX9-NEXT:    v_cndmask_b32_e64 v8, v9, v8, s[36:37]
-; GFX9-NEXT:    v_lshrrev_b32_e32 v9, 16, v18
-; GFX9-NEXT:    v_lshrrev_b32_e32 v17, 16, v20
-; GFX9-NEXT:    v_perm_b32 v7, v8, v7, s64
-; GFX9-NEXT:    v_cndmask_b32_e64 v8, v20, v18, s[34:35]
-; GFX9-NEXT:    v_cndmask_b32_e64 v9, v17, v9, s[30:31]
-; GFX9-NEXT:    v_perm_b32 v8, v9, v8, s64
-; GFX9-NEXT:    v_cndmask_b32_e64 v9, v11, v16, s[28:29]
-; GFX9-NEXT:    v_lshrrev_b32_e32 v16, 16, v16
-; GFX9-NEXT:    v_lshrrev_b32_e32 v11, 16, v11
-; GFX9-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:52
-; GFX9-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:56
-; GFX9-NEXT:    v_cndmask_b32_e64 v11, v11, v16, s[26:27]
-; GFX9-NEXT:    v_perm_b32 v9, v11, v9, s64
-; GFX9-NEXT:    v_cndmask_b32_e64 v11, v10, v21, s[24:25]
-; GFX9-NEXT:    v_lshrrev_b32_e32 v16, 16, v21
-; GFX9-NEXT:    v_lshrrev_b32_e32 v10, 16, v10
-; GFX9-NEXT:    v_cndmask_b32_e64 v10, v10, v16, s[22:23]
-; GFX9-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:60
-; GFX9-NEXT:    v_perm_b32 v10, v10, v11, s64
-; GFX9-NEXT:    v_cndmask_b32_e64 v11, v19, v24, s[20:21]
-; GFX9-NEXT:    v_lshrrev_b32_e32 v20, 16, v24
-; GFX9-NEXT:    v_lshrrev_b32_e32 v19, 16, v19
-; GFX9-NEXT:    v_cndmask_b32_e64 v19, v19, v20, s[16:17]
-; GFX9-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:64
-; GFX9-NEXT:    v_perm_b32 v11, v19, v11, s64
 ; GFX9-NEXT:    v_readlane_b32 s43, v31, 11
 ; GFX9-NEXT:    v_readlane_b32 s42, v31, 10
 ; GFX9-NEXT:    v_readlane_b32 s41, v31, 9
@@ -29782,31 +29084,6 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x
 ; GFX9-NEXT:    v_readlane_b32 s34, v31, 2
 ; GFX9-NEXT:    v_readlane_b32 s31, v31, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v31, 0
-; GFX9-NEXT:    s_waitcnt vmcnt(3)
-; GFX9-NEXT:    v_cndmask_b32_e64 v19, v12, v18, s[14:15]
-; GFX9-NEXT:    v_lshrrev_b32_e32 v18, 16, v18
-; GFX9-NEXT:    v_lshrrev_b32_e32 v12, 16, v12
-; GFX9-NEXT:    v_cndmask_b32_e64 v12, v12, v18, s[12:13]
-; GFX9-NEXT:    s_waitcnt vmcnt(2)
-; GFX9-NEXT:    v_cndmask_b32_e64 v18, v13, v17, s[10:11]
-; GFX9-NEXT:    v_lshrrev_b32_e32 v17, 16, v17
-; GFX9-NEXT:    v_lshrrev_b32_e32 v13, 16, v13
-; GFX9-NEXT:    v_cndmask_b32_e64 v13, v13, v17, s[8:9]
-; GFX9-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-NEXT:    v_cndmask_b32_e64 v17, v14, v16, s[6:7]
-; GFX9-NEXT:    v_lshrrev_b32_e32 v16, 16, v16
-; GFX9-NEXT:    v_lshrrev_b32_e32 v14, 16, v14
-; GFX9-NEXT:    v_cndmask_b32_e64 v14, v14, v16, s[4:5]
-; GFX9-NEXT:    v_perm_b32 v14, v14, v17, s64
-; GFX9-NEXT:    v_perm_b32 v12, v12, v19, s64
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_cndmask_b32_e32 v16, v15, v20, vcc
-; GFX9-NEXT:    v_lshrrev_b32_e32 v17, 16, v20
-; GFX9-NEXT:    v_lshrrev_b32_e32 v15, 16, v15
-; GFX9-NEXT:    v_cndmask_b32_e64 v15, v15, v17, s[18:19]
-; GFX9-NEXT:    v_perm_b32 v13, v13, v18, s64
-; GFX9-NEXT:    v_perm_b32 v15, v15, v16, s64
-; GFX9-NEXT:    v_readlane_b32 s64, v31, 32
 ; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX9-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
@@ -29820,205 +29097,208 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x
 ; GFX10-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s4
-; GFX10-NEXT:    v_and_b32_e32 v3, 1, v3
-; GFX10-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX10-NEXT:    v_and_b32_e32 v2, 1, v2
-; GFX10-NEXT:    v_and_b32_e32 v1, 1, v1
-; GFX10-NEXT:    v_and_b32_e32 v4, 1, v4
-; GFX10-NEXT:    v_cmp_eq_u32_e64 s6, 1, v3
-; GFX10-NEXT:    v_and_b32_e32 v3, 1, v6
-; GFX10-NEXT:    v_and_b32_e32 v8, 1, v8
-; GFX10-NEXT:    v_and_b32_e32 v10, 1, v10
-; GFX10-NEXT:    v_and_b32_e32 v12, 1, v12
-; GFX10-NEXT:    v_and_b32_e32 v14, 1, v14
-; GFX10-NEXT:    v_and_b32_e32 v16, 1, v16
-; GFX10-NEXT:    s_clause 0x15
-; GFX10-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:68
-; GFX10-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:4
-; GFX10-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:72
-; GFX10-NEXT:    buffer_load_dword v35, off, s[0:3], s32 offset:8
-; GFX10-NEXT:    buffer_load_ushort v36, off, s[0:3], s32
-; GFX10-NEXT:    buffer_load_dword v37, off, s[0:3], s32 offset:76
-; GFX10-NEXT:    buffer_load_dword v38, off, s[0:3], s32 offset:12
-; GFX10-NEXT:    buffer_load_dword v39, off, s[0:3], s32 offset:80
-; GFX10-NEXT:    buffer_load_dword v48, off, s[0:3], s32 offset:16
-; GFX10-NEXT:    buffer_load_dword v49, off, s[0:3], s32 offset:20
-; GFX10-NEXT:    buffer_load_dword v50, off, s[0:3], s32 offset:84
-; GFX10-NEXT:    buffer_load_dword v51, off, s[0:3], s32 offset:88
-; GFX10-NEXT:    buffer_load_dword v52, off, s[0:3], s32 offset:24
-; GFX10-NEXT:    buffer_load_dword v53, off, s[0:3], s32 offset:92
-; GFX10-NEXT:    buffer_load_dword v54, off, s[0:3], s32 offset:28
-; GFX10-NEXT:    buffer_load_dword v55, off, s[0:3], s32 offset:96
-; GFX10-NEXT:    buffer_load_dword v64, off, s[0:3], s32 offset:32
-; GFX10-NEXT:    buffer_load_dword v65, off, s[0:3], s32 offset:36
-; GFX10-NEXT:    buffer_load_dword v66, off, s[0:3], s32 offset:104
-; GFX10-NEXT:    buffer_load_dword v67, off, s[0:3], s32 offset:40
-; GFX10-NEXT:    buffer_load_dword v68, off, s[0:3], s32 offset:100
-; GFX10-NEXT:    buffer_load_dword v69, off, s[0:3], s32 offset:52
-; GFX10-NEXT:    v_cmp_eq_u32_e64 s4, 1, v0
-; GFX10-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:112
-; GFX10-NEXT:    v_cmp_eq_u32_e64 s5, 1, v2
-; GFX10-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:48
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v1
-; GFX10-NEXT:    v_cmp_eq_u32_e64 s7, 1, v4
-; GFX10-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:120
-; GFX10-NEXT:    v_cmp_eq_u32_e64 s8, 1, v3
-; GFX10-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:56
-; GFX10-NEXT:    v_cmp_eq_u32_e64 s9, 1, v8
-; GFX10-NEXT:    s_clause 0x1
-; GFX10-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:116
-; GFX10-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:108
-; GFX10-NEXT:    v_cmp_eq_u32_e64 s10, 1, v10
-; GFX10-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:124
-; GFX10-NEXT:    v_cmp_eq_u32_e64 s11, 1, v12
-; GFX10-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:60
-; GFX10-NEXT:    v_cmp_eq_u32_e64 s12, 1, v14
-; GFX10-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:128
-; GFX10-NEXT:    v_cmp_eq_u32_e64 s13, 1, v16
-; GFX10-NEXT:    s_clause 0x1
-; GFX10-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:64
-; GFX10-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:44
-; GFX10-NEXT:    v_writelane_b32 v31, s30, 0
+; GFX10-NEXT:    v_and_b32_e32 v29, 1, v29
 ; GFX10-NEXT:    v_and_b32_e32 v30, 1, v30
 ; GFX10-NEXT:    v_and_b32_e32 v28, 1, v28
 ; GFX10-NEXT:    v_and_b32_e32 v26, 1, v26
 ; GFX10-NEXT:    v_and_b32_e32 v24, 1, v24
-; GFX10-NEXT:    v_writelane_b32 v31, s31, 1
 ; GFX10-NEXT:    v_and_b32_e32 v22, 1, v22
 ; GFX10-NEXT:    v_and_b32_e32 v20, 1, v20
-; GFX10-NEXT:    v_and_b32_e32 v17, 1, v17
-; GFX10-NEXT:    v_and_b32_e32 v9, 1, v9
-; GFX10-NEXT:    v_and_b32_e32 v7, 1, v7
-; GFX10-NEXT:    v_writelane_b32 v31, s34, 2
-; GFX10-NEXT:    v_and_b32_e32 v29, 1, v29
-; GFX10-NEXT:    v_and_b32_e32 v27, 1, v27
-; GFX10-NEXT:    v_and_b32_e32 v25, 1, v25
-; GFX10-NEXT:    v_and_b32_e32 v23, 1, v23
-; GFX10-NEXT:    v_and_b32_e32 v21, 1, v21
-; GFX10-NEXT:    v_and_b32_e32 v19, 1, v19
 ; GFX10-NEXT:    v_and_b32_e32 v18, 1, v18
-; GFX10-NEXT:    v_and_b32_e32 v15, 1, v15
-; GFX10-NEXT:    v_and_b32_e32 v13, 1, v13
-; GFX10-NEXT:    v_and_b32_e32 v11, 1, v11
+; GFX10-NEXT:    v_and_b32_e32 v16, 1, v16
+; GFX10-NEXT:    v_and_b32_e32 v14, 1, v14
+; GFX10-NEXT:    v_and_b32_e32 v12, 1, v12
+; GFX10-NEXT:    s_clause 0x14
+; GFX10-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:60
+; GFX10-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:124
+; GFX10-NEXT:    buffer_load_ushort v34, off, s[0:3], s32
+; GFX10-NEXT:    buffer_load_dword v35, off, s[0:3], s32 offset:128
+; GFX10-NEXT:    buffer_load_dword v36, off, s[0:3], s32 offset:64
+; GFX10-NEXT:    buffer_load_dword v37, off, s[0:3], s32 offset:48
+; GFX10-NEXT:    buffer_load_dword v38, off, s[0:3], s32 offset:116
+; GFX10-NEXT:    buffer_load_dword v39, off, s[0:3], s32 offset:52
+; GFX10-NEXT:    buffer_load_dword v48, off, s[0:3], s32 offset:120
+; GFX10-NEXT:    buffer_load_dword v49, off, s[0:3], s32 offset:56
+; GFX10-NEXT:    buffer_load_dword v50, off, s[0:3], s32 offset:32
+; GFX10-NEXT:    buffer_load_dword v51, off, s[0:3], s32 offset:100
+; GFX10-NEXT:    buffer_load_dword v52, off, s[0:3], s32 offset:36
+; GFX10-NEXT:    buffer_load_dword v53, off, s[0:3], s32 offset:104
+; GFX10-NEXT:    buffer_load_dword v54, off, s[0:3], s32 offset:40
+; GFX10-NEXT:    buffer_load_dword v55, off, s[0:3], s32 offset:108
+; GFX10-NEXT:    buffer_load_dword v64, off, s[0:3], s32 offset:44
+; GFX10-NEXT:    buffer_load_dword v65, off, s[0:3], s32 offset:112
+; GFX10-NEXT:    buffer_load_dword v66, off, s[0:3], s32 offset:72
+; GFX10-NEXT:    buffer_load_dword v67, off, s[0:3], s32 offset:76
+; GFX10-NEXT:    buffer_load_dword v68, off, s[0:3], s32 offset:80
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v29
+; GFX10-NEXT:    s_clause 0x1
+; GFX10-NEXT:    buffer_load_dword v29, off, s[0:3], s32 offset:92
+; GFX10-NEXT:    buffer_load_dword v69, off, s[0:3], s32 offset:28
+; GFX10-NEXT:    v_cmp_eq_u32_e64 s4, 1, v30
+; GFX10-NEXT:    buffer_load_dword v30, off, s[0:3], s32 offset:96
+; GFX10-NEXT:    v_cmp_eq_u32_e64 s5, 1, v28
+; GFX10-NEXT:    buffer_load_dword v28, off, s[0:3], s32 offset:88
+; GFX10-NEXT:    v_cmp_eq_u32_e64 s6, 1, v26
+; GFX10-NEXT:    v_cmp_eq_u32_e64 s7, 1, v24
+; GFX10-NEXT:    buffer_load_dword v24, off, s[0:3], s32 offset:84
+; GFX10-NEXT:    v_cmp_eq_u32_e64 s8, 1, v22
+; GFX10-NEXT:    buffer_load_dword v22, off, s[0:3], s32 offset:20
+; GFX10-NEXT:    v_cmp_eq_u32_e64 s9, 1, v20
+; GFX10-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:16
+; GFX10-NEXT:    v_cmp_eq_u32_e64 s10, 1, v18
+; GFX10-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:12
+; GFX10-NEXT:    v_cmp_eq_u32_e64 s11, 1, v16
+; GFX10-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:8
+; GFX10-NEXT:    v_cmp_eq_u32_e64 s12, 1, v14
+; GFX10-NEXT:    s_clause 0x1
+; GFX10-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:68
+; GFX10-NEXT:    buffer_load_dword v26, off, s[0:3], s32 offset:24
+; GFX10-NEXT:    v_cmp_eq_u32_e64 s13, 1, v12
+; GFX10-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:4
+; GFX10-NEXT:    v_writelane_b32 v31, s30, 0
+; GFX10-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX10-NEXT:    v_and_b32_e32 v2, 1, v2
+; GFX10-NEXT:    v_and_b32_e32 v4, 1, v4
+; GFX10-NEXT:    v_and_b32_e32 v6, 1, v6
+; GFX10-NEXT:    v_writelane_b32 v31, s31, 1
+; GFX10-NEXT:    v_and_b32_e32 v8, 1, v8
+; GFX10-NEXT:    v_and_b32_e32 v10, 1, v10
+; GFX10-NEXT:    v_and_b32_e32 v1, 1, v1
+; GFX10-NEXT:    v_and_b32_e32 v3, 1, v3
+; GFX10-NEXT:    v_writelane_b32 v31, s34, 2
 ; GFX10-NEXT:    v_and_b32_e32 v5, 1, v5
-; GFX10-NEXT:    v_cmp_eq_u32_e64 s15, 1, v20
-; GFX10-NEXT:    v_cmp_eq_u32_e64 s16, 1, v22
-; GFX10-NEXT:    v_cmp_eq_u32_e64 s17, 1, v24
-; GFX10-NEXT:    v_cmp_eq_u32_e64 s18, 1, v26
-; GFX10-NEXT:    v_cmp_eq_u32_e64 s19, 1, v28
-; GFX10-NEXT:    v_cmp_eq_u32_e64 s20, 1, v30
-; GFX10-NEXT:    v_cmp_eq_u32_e64 s22, 1, v7
-; GFX10-NEXT:    v_cmp_eq_u32_e64 s23, 1, v9
-; GFX10-NEXT:    v_cmp_eq_u32_e64 s27, 1, v17
+; GFX10-NEXT:    v_and_b32_e32 v7, 1, v7
+; GFX10-NEXT:    v_and_b32_e32 v9, 1, v9
+; GFX10-NEXT:    v_and_b32_e32 v11, 1, v11
+; GFX10-NEXT:    v_and_b32_e32 v13, 1, v13
+; GFX10-NEXT:    v_and_b32_e32 v15, 1, v15
+; GFX10-NEXT:    v_and_b32_e32 v17, 1, v17
+; GFX10-NEXT:    v_and_b32_e32 v19, 1, v19
+; GFX10-NEXT:    v_and_b32_e32 v21, 1, v21
+; GFX10-NEXT:    v_and_b32_e32 v23, 1, v23
+; GFX10-NEXT:    v_and_b32_e32 v25, 1, v25
+; GFX10-NEXT:    v_and_b32_e32 v27, 1, v27
+; GFX10-NEXT:    v_cmp_eq_u32_e64 s14, 1, v10
+; GFX10-NEXT:    v_cmp_eq_u32_e64 s15, 1, v8
+; GFX10-NEXT:    v_cmp_eq_u32_e64 s16, 1, v6
+; GFX10-NEXT:    v_cmp_eq_u32_e64 s17, 1, v4
+; GFX10-NEXT:    v_cmp_eq_u32_e64 s18, 1, v2
+; GFX10-NEXT:    v_cmp_eq_u32_e64 s19, 1, v0
 ; GFX10-NEXT:    v_writelane_b32 v31, s35, 3
-; GFX10-NEXT:    v_cmp_eq_u32_e64 s14, 1, v18
-; GFX10-NEXT:    v_cmp_eq_u32_e64 s21, 1, v5
-; GFX10-NEXT:    v_cmp_eq_u32_e64 s24, 1, v11
-; GFX10-NEXT:    v_cmp_eq_u32_e64 s25, 1, v13
+; GFX10-NEXT:    v_cmp_eq_u32_e64 s20, 1, v27
+; GFX10-NEXT:    v_cmp_eq_u32_e64 s21, 1, v25
+; GFX10-NEXT:    v_cmp_eq_u32_e64 s22, 1, v23
+; GFX10-NEXT:    v_cmp_eq_u32_e64 s23, 1, v21
+; GFX10-NEXT:    v_cmp_eq_u32_e64 s24, 1, v19
+; GFX10-NEXT:    v_cmp_eq_u32_e64 s25, 1, v17
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 s26, 1, v15
-; GFX10-NEXT:    v_cmp_eq_u32_e64 s28, 1, v19
-; GFX10-NEXT:    v_cmp_eq_u32_e64 s29, 1, v21
-; GFX10-NEXT:    v_cmp_eq_u32_e64 s30, 1, v23
-; GFX10-NEXT:    v_cmp_eq_u32_e64 s31, 1, v25
-; GFX10-NEXT:    v_cmp_eq_u32_e64 s34, 1, v27
-; GFX10-NEXT:    v_cmp_eq_u32_e64 s35, 1, v29
+; GFX10-NEXT:    v_cmp_eq_u32_e64 s27, 1, v13
+; GFX10-NEXT:    v_cmp_eq_u32_e64 s28, 1, v11
+; GFX10-NEXT:    v_cmp_eq_u32_e64 s29, 1, v7
+; GFX10-NEXT:    v_cmp_eq_u32_e64 s30, 1, v3
+; GFX10-NEXT:    v_cmp_eq_u32_e64 s31, 1, v1
+; GFX10-NEXT:    v_cmp_eq_u32_e64 s34, 1, v5
+; GFX10-NEXT:    v_cmp_eq_u32_e64 s35, 1, v9
 ; GFX10-NEXT:    s_waitcnt vmcnt(32)
-; GFX10-NEXT:    v_lshrrev_b32_e32 v9, 16, v32
+; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 16, v32
 ; GFX10-NEXT:    s_waitcnt vmcnt(31)
-; GFX10-NEXT:    v_lshrrev_b32_e32 v7, 16, v33
-; GFX10-NEXT:    v_cndmask_b32_e64 v5, v32, v33, s4
+; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v33
+; GFX10-NEXT:    s_waitcnt vmcnt(30)
+; GFX10-NEXT:    v_and_b32_e32 v2, 1, v34
 ; GFX10-NEXT:    s_waitcnt vmcnt(29)
-; GFX10-NEXT:    v_cndmask_b32_e64 v11, v34, v35, s5
+; GFX10-NEXT:    v_lshrrev_b32_e32 v4, 16, v35
 ; GFX10-NEXT:    s_waitcnt vmcnt(28)
-; GFX10-NEXT:    v_and_b32_e32 v17, 1, v36
-; GFX10-NEXT:    v_lshrrev_b32_e32 v13, 16, v35
-; GFX10-NEXT:    v_lshrrev_b32_e32 v15, 16, v34
-; GFX10-NEXT:    s_waitcnt vmcnt(26)
-; GFX10-NEXT:    v_cndmask_b32_e64 v18, v37, v38, s7
-; GFX10-NEXT:    v_lshrrev_b32_e32 v19, 16, v38
-; GFX10-NEXT:    v_lshrrev_b32_e32 v20, 16, v37
+; GFX10-NEXT:    v_cndmask_b32_e64 v15, v35, v36, s4
+; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 16, v36
+; GFX10-NEXT:    v_cndmask_b32_e64 v17, v33, v32, s5
+; GFX10-NEXT:    s_waitcnt vmcnt(25)
+; GFX10-NEXT:    v_cndmask_b32_e64 v19, v38, v39, s7
 ; GFX10-NEXT:    s_waitcnt vmcnt(24)
-; GFX10-NEXT:    v_cndmask_b32_e64 v21, v39, v48, s8
-; GFX10-NEXT:    v_lshrrev_b32_e32 v22, 16, v48
-; GFX10-NEXT:    v_lshrrev_b32_e32 v23, 16, v39
-; GFX10-NEXT:    s_waitcnt vmcnt(22)
-; GFX10-NEXT:    v_cndmask_b32_e64 v24, v50, v49, s9
-; GFX10-NEXT:    v_lshrrev_b32_e32 v25, 16, v49
-; GFX10-NEXT:    v_lshrrev_b32_e32 v26, 16, v50
-; GFX10-NEXT:    s_waitcnt vmcnt(20)
-; GFX10-NEXT:    v_cndmask_b32_e64 v27, v51, v52, s10
-; GFX10-NEXT:    v_lshrrev_b32_e32 v28, 16, v52
-; GFX10-NEXT:    v_lshrrev_b32_e32 v29, 16, v51
+; GFX10-NEXT:    v_lshrrev_b32_e32 v6, 16, v48
+; GFX10-NEXT:    s_waitcnt vmcnt(23)
+; GFX10-NEXT:    v_cndmask_b32_e64 v13, v48, v49, s6
+; GFX10-NEXT:    v_lshrrev_b32_e32 v5, 16, v49
+; GFX10-NEXT:    v_lshrrev_b32_e32 v7, 16, v39
+; GFX10-NEXT:    v_lshrrev_b32_e32 v8, 16, v38
+; GFX10-NEXT:    v_lshrrev_b32_e32 v9, 16, v37
 ; GFX10-NEXT:    s_waitcnt vmcnt(18)
-; GFX10-NEXT:    v_cndmask_b32_e64 v30, v53, v54, s11
+; GFX10-NEXT:    v_cndmask_b32_e64 v27, v53, v54, s10
+; GFX10-NEXT:    s_waitcnt vmcnt(17)
+; GFX10-NEXT:    v_lshrrev_b32_e32 v25, 16, v55
+; GFX10-NEXT:    s_waitcnt vmcnt(16)
+; GFX10-NEXT:    v_cndmask_b32_e64 v21, v55, v64, s9
+; GFX10-NEXT:    s_waitcnt vmcnt(15)
+; GFX10-NEXT:    v_cndmask_b32_e64 v11, v65, v37, s8
+; GFX10-NEXT:    v_lshrrev_b32_e32 v10, 16, v65
+; GFX10-NEXT:    v_lshrrev_b32_e32 v23, 16, v64
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v32, 16, v54
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v33, 16, v53
-; GFX10-NEXT:    s_waitcnt vmcnt(16)
-; GFX10-NEXT:    v_cndmask_b32_e64 v34, v55, v64, s12
-; GFX10-NEXT:    v_lshrrev_b32_e32 v35, 16, v64
-; GFX10-NEXT:    v_lshrrev_b32_e32 v36, 16, v55
-; GFX10-NEXT:    s_waitcnt vmcnt(12)
-; GFX10-NEXT:    v_cndmask_b32_e64 v37, v68, v65, s13
-; GFX10-NEXT:    v_lshrrev_b32_e32 v38, 16, v65
-; GFX10-NEXT:    v_lshrrev_b32_e32 v39, 16, v68
-; GFX10-NEXT:    v_lshrrev_b32_e32 v49, 16, v67
-; GFX10-NEXT:    v_lshrrev_b32_e32 v50, 16, v66
+; GFX10-NEXT:    v_cndmask_b32_e64 v34, v51, v52, s11
+; GFX10-NEXT:    v_lshrrev_b32_e32 v35, 16, v52
+; GFX10-NEXT:    v_lshrrev_b32_e32 v36, 16, v51
 ; GFX10-NEXT:    s_waitcnt vmcnt(9)
-; GFX10-NEXT:    v_cndmask_b32_e64 v52, v0, v2, s16
-; GFX10-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
-; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX10-NEXT:    v_cndmask_b32_e64 v37, v30, v50, s12
+; GFX10-NEXT:    v_lshrrev_b32_e32 v38, 16, v50
+; GFX10-NEXT:    v_lshrrev_b32_e32 v30, 16, v30
+; GFX10-NEXT:    v_cndmask_b32_e64 v39, v29, v69, s13
+; GFX10-NEXT:    v_lshrrev_b32_e32 v48, 16, v69
+; GFX10-NEXT:    v_lshrrev_b32_e32 v29, 16, v29
 ; GFX10-NEXT:    s_waitcnt vmcnt(6)
-; GFX10-NEXT:    v_cndmask_b32_e64 v53, v8, v69, s17
-; GFX10-NEXT:    v_lshrrev_b32_e32 v54, 16, v69
-; GFX10-NEXT:    v_lshrrev_b32_e32 v8, 16, v8
-; GFX10-NEXT:    v_cndmask_b32_e64 v55, v4, v3, s18
-; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
-; GFX10-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
-; GFX10-NEXT:    s_waitcnt vmcnt(3)
-; GFX10-NEXT:    v_cndmask_b32_e64 v64, v10, v12, s19
-; GFX10-NEXT:    v_lshrrev_b32_e32 v12, 16, v12
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_cndmask_b32_e64 v51, v1, v6, s15
-; GFX10-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
-; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX10-NEXT:    v_lshrrev_b32_e32 v10, 16, v10
-; GFX10-NEXT:    v_cndmask_b32_e64 v65, v14, v16, s20
+; GFX10-NEXT:    v_cndmask_b32_e64 v50, v24, v22, s15
+; GFX10-NEXT:    v_lshrrev_b32_e32 v22, 16, v22
+; GFX10-NEXT:    v_lshrrev_b32_e32 v24, 16, v24
+; GFX10-NEXT:    s_waitcnt vmcnt(5)
+; GFX10-NEXT:    v_cndmask_b32_e64 v51, v68, v20, s16
+; GFX10-NEXT:    v_lshrrev_b32_e32 v20, 16, v20
+; GFX10-NEXT:    v_lshrrev_b32_e32 v52, 16, v68
+; GFX10-NEXT:    s_waitcnt vmcnt(4)
+; GFX10-NEXT:    v_cndmask_b32_e64 v53, v67, v18, s17
+; GFX10-NEXT:    v_lshrrev_b32_e32 v18, 16, v18
+; GFX10-NEXT:    s_waitcnt vmcnt(1)
+; GFX10-NEXT:    v_cndmask_b32_e64 v49, v28, v26, s14
+; GFX10-NEXT:    v_lshrrev_b32_e32 v26, 16, v26
+; GFX10-NEXT:    v_lshrrev_b32_e32 v28, 16, v28
+; GFX10-NEXT:    v_lshrrev_b32_e32 v54, 16, v67
+; GFX10-NEXT:    v_cndmask_b32_e64 v55, v66, v16, s18
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v16, 16, v16
+; GFX10-NEXT:    v_lshrrev_b32_e32 v64, 16, v66
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_cndmask_b32_e64 v65, v14, v12, s19
+; GFX10-NEXT:    v_lshrrev_b32_e32 v12, 16, v12
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v14, 16, v14
-; GFX10-NEXT:    v_cndmask_b32_e32 v7, v9, v7, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v17
-; GFX10-NEXT:    v_cndmask_b32_e64 v48, v66, v67, s14
-; GFX10-NEXT:    v_cndmask_b32_e64 v9, v15, v13, s6
-; GFX10-NEXT:    v_cndmask_b32_e64 v13, v20, v19, s21
-; GFX10-NEXT:    v_cndmask_b32_e64 v15, v23, v22, s22
-; GFX10-NEXT:    v_cndmask_b32_e64 v19, v26, v25, s23
-; GFX10-NEXT:    v_cndmask_b32_e64 v20, v29, v28, s24
-; GFX10-NEXT:    v_cndmask_b32_e64 v22, v33, v32, s25
-; GFX10-NEXT:    v_cndmask_b32_e64 v23, v36, v35, s26
-; GFX10-NEXT:    v_cndmask_b32_e64 v25, v39, v38, s27
-; GFX10-NEXT:    v_cndmask_b32_e64 v26, v50, v49, s28
-; GFX10-NEXT:    v_cndmask_b32_e64 v28, v1, v6, s29
-; GFX10-NEXT:    v_cndmask_b32_e64 v17, v0, v2, s30
-; GFX10-NEXT:    v_cndmask_b32_e64 v29, v8, v54, s31
-; GFX10-NEXT:    v_cndmask_b32_e64 v32, v4, v3, s34
-; GFX10-NEXT:    v_cndmask_b32_e64 v33, v10, v12, s35
-; GFX10-NEXT:    v_cndmask_b32_e32 v16, v14, v16, vcc_lo
-; GFX10-NEXT:    v_perm_b32 v0, v7, v5, 0x5040100
-; GFX10-NEXT:    v_perm_b32 v1, v9, v11, 0x5040100
-; GFX10-NEXT:    v_perm_b32 v2, v13, v18, 0x5040100
-; GFX10-NEXT:    v_perm_b32 v3, v15, v21, 0x5040100
-; GFX10-NEXT:    v_perm_b32 v4, v19, v24, 0x5040100
-; GFX10-NEXT:    v_perm_b32 v5, v20, v27, 0x5040100
-; GFX10-NEXT:    v_perm_b32 v6, v22, v30, 0x5040100
-; GFX10-NEXT:    v_perm_b32 v7, v23, v34, 0x5040100
-; GFX10-NEXT:    v_perm_b32 v8, v25, v37, 0x5040100
-; GFX10-NEXT:    v_perm_b32 v9, v26, v48, 0x5040100
-; GFX10-NEXT:    v_perm_b32 v10, v28, v51, 0x5040100
-; GFX10-NEXT:    v_perm_b32 v11, v17, v52, 0x5040100
-; GFX10-NEXT:    v_perm_b32 v12, v29, v53, 0x5040100
-; GFX10-NEXT:    v_perm_b32 v13, v32, v55, 0x5040100
-; GFX10-NEXT:    v_perm_b32 v14, v33, v64, 0x5040100
-; GFX10-NEXT:    v_perm_b32 v15, v16, v65, 0x5040100
+; GFX10-NEXT:    v_cmp_eq_u32_e64 s4, 1, v2
+; GFX10-NEXT:    v_cndmask_b32_e32 v66, v1, v0, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v67, v6, v5, s20
+; GFX10-NEXT:    v_cndmask_b32_e64 v68, v8, v7, s21
+; GFX10-NEXT:    v_cndmask_b32_e64 v69, v10, v9, s22
+; GFX10-NEXT:    v_cndmask_b32_e64 v10, v25, v23, s23
+; GFX10-NEXT:    v_cndmask_b32_e64 v9, v33, v32, s24
+; GFX10-NEXT:    v_cndmask_b32_e64 v8, v36, v35, s25
+; GFX10-NEXT:    v_cndmask_b32_e64 v7, v30, v38, s26
+; GFX10-NEXT:    v_cndmask_b32_e64 v6, v29, v48, s27
+; GFX10-NEXT:    v_cndmask_b32_e64 v5, v28, v26, s28
+; GFX10-NEXT:    v_cndmask_b32_e64 v20, v52, v20, s29
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, v14, v12, s31
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, v64, v16, s30
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, v54, v18, s34
+; GFX10-NEXT:    v_cndmask_b32_e64 v12, v24, v22, s35
+; GFX10-NEXT:    v_cndmask_b32_e64 v16, v4, v3, s4
+; GFX10-NEXT:    v_perm_b32 v0, v0, v65, 0x5040100
+; GFX10-NEXT:    v_perm_b32 v1, v1, v55, 0x5040100
+; GFX10-NEXT:    v_perm_b32 v2, v2, v53, 0x5040100
+; GFX10-NEXT:    v_perm_b32 v3, v20, v51, 0x5040100
+; GFX10-NEXT:    v_perm_b32 v4, v12, v50, 0x5040100
+; GFX10-NEXT:    v_perm_b32 v5, v5, v49, 0x5040100
+; GFX10-NEXT:    v_perm_b32 v6, v6, v39, 0x5040100
+; GFX10-NEXT:    v_perm_b32 v7, v7, v37, 0x5040100
+; GFX10-NEXT:    v_perm_b32 v8, v8, v34, 0x5040100
+; GFX10-NEXT:    v_perm_b32 v9, v9, v27, 0x5040100
+; GFX10-NEXT:    v_perm_b32 v10, v10, v21, 0x5040100
+; GFX10-NEXT:    v_perm_b32 v11, v69, v11, 0x5040100
+; GFX10-NEXT:    v_perm_b32 v12, v68, v19, 0x5040100
+; GFX10-NEXT:    v_perm_b32 v13, v67, v13, 0x5040100
+; GFX10-NEXT:    v_perm_b32 v14, v66, v17, 0x5040100
+; GFX10-NEXT:    v_perm_b32 v15, v16, v15, 0x5040100
 ; GFX10-NEXT:    v_readlane_b32 s35, v31, 3
 ; GFX10-NEXT:    v_readlane_b32 s34, v31, 2
 ; GFX10-NEXT:    v_readlane_b32 s31, v31, 1
@@ -30035,205 +29315,198 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_clause 0x20
 ; GFX11-NEXT:    scratch_load_u16 v31, off, s32
-; GFX11-NEXT:    scratch_load_b32 v32, off, s32 offset:68
-; GFX11-NEXT:    scratch_load_b32 v33, off, s32 offset:4
-; GFX11-NEXT:    scratch_load_b32 v34, off, s32 offset:72
-; GFX11-NEXT:    scratch_load_b32 v35, off, s32 offset:8
-; GFX11-NEXT:    scratch_load_b32 v36, off, s32 offset:76
-; GFX11-NEXT:    scratch_load_b32 v37, off, s32 offset:12
-; GFX11-NEXT:    scratch_load_b32 v38, off, s32 offset:80
-; GFX11-NEXT:    scratch_load_b32 v39, off, s32 offset:16
-; GFX11-NEXT:    scratch_load_b32 v48, off, s32 offset:84
-; GFX11-NEXT:    scratch_load_b32 v49, off, s32 offset:20
-; GFX11-NEXT:    scratch_load_b32 v50, off, s32 offset:88
-; GFX11-NEXT:    scratch_load_b32 v51, off, s32 offset:24
-; GFX11-NEXT:    scratch_load_b32 v52, off, s32 offset:92
-; GFX11-NEXT:    scratch_load_b32 v53, off, s32 offset:28
-; GFX11-NEXT:    scratch_load_b32 v54, off, s32 offset:96
-; GFX11-NEXT:    scratch_load_b32 v55, off, s32 offset:32
-; GFX11-NEXT:    scratch_load_b32 v64, off, s32 offset:100
-; GFX11-NEXT:    scratch_load_b32 v65, off, s32 offset:36
-; GFX11-NEXT:    scratch_load_b32 v66, off, s32 offset:104
-; GFX11-NEXT:    scratch_load_b32 v67, off, s32 offset:40
-; GFX11-NEXT:    scratch_load_b32 v68, off, s32 offset:108
-; GFX11-NEXT:    scratch_load_b32 v69, off, s32 offset:44
-; GFX11-NEXT:    scratch_load_b32 v70, off, s32 offset:112
-; GFX11-NEXT:    scratch_load_b32 v71, off, s32 offset:48
-; GFX11-NEXT:    scratch_load_b32 v80, off, s32 offset:116
-; GFX11-NEXT:    scratch_load_b32 v81, off, s32 offset:52
-; GFX11-NEXT:    scratch_load_b32 v82, off, s32 offset:120
-; GFX11-NEXT:    scratch_load_b32 v83, off, s32 offset:56
-; GFX11-NEXT:    scratch_load_b32 v84, off, s32 offset:124
-; GFX11-NEXT:    scratch_load_b32 v85, off, s32 offset:60
-; GFX11-NEXT:    scratch_load_b32 v86, off, s32 offset:128
-; GFX11-NEXT:    scratch_load_b32 v87, off, s32 offset:64
-; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX11-NEXT:    v_and_b32_e32 v2, 1, v2
-; GFX11-NEXT:    v_and_b32_e32 v4, 1, v4
-; GFX11-NEXT:    v_and_b32_e32 v6, 1, v6
-; GFX11-NEXT:    v_and_b32_e32 v8, 1, v8
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX11-NEXT:    v_and_b32_e32 v27, 1, v27
-; GFX11-NEXT:    v_and_b32_e32 v10, 1, v10
-; GFX11-NEXT:    v_and_b32_e32 v12, 1, v12
-; GFX11-NEXT:    v_and_b32_e32 v14, 1, v14
+; GFX11-NEXT:    scratch_load_b32 v32, off, s32 offset:128
+; GFX11-NEXT:    scratch_load_b32 v33, off, s32 offset:64
+; GFX11-NEXT:    scratch_load_b32 v34, off, s32 offset:124
+; GFX11-NEXT:    scratch_load_b32 v35, off, s32 offset:60
+; GFX11-NEXT:    scratch_load_b32 v36, off, s32 offset:120
+; GFX11-NEXT:    scratch_load_b32 v37, off, s32 offset:56
+; GFX11-NEXT:    scratch_load_b32 v38, off, s32 offset:116
+; GFX11-NEXT:    scratch_load_b32 v39, off, s32 offset:52
+; GFX11-NEXT:    scratch_load_b32 v48, off, s32 offset:112
+; GFX11-NEXT:    scratch_load_b32 v49, off, s32 offset:48
+; GFX11-NEXT:    scratch_load_b32 v50, off, s32 offset:108
+; GFX11-NEXT:    scratch_load_b32 v51, off, s32 offset:44
+; GFX11-NEXT:    scratch_load_b32 v52, off, s32 offset:104
+; GFX11-NEXT:    scratch_load_b32 v53, off, s32 offset:40
+; GFX11-NEXT:    scratch_load_b32 v54, off, s32 offset:100
+; GFX11-NEXT:    scratch_load_b32 v55, off, s32 offset:36
+; GFX11-NEXT:    scratch_load_b32 v64, off, s32 offset:96
+; GFX11-NEXT:    scratch_load_b32 v65, off, s32 offset:32
+; GFX11-NEXT:    scratch_load_b32 v66, off, s32 offset:92
+; GFX11-NEXT:    scratch_load_b32 v67, off, s32 offset:28
+; GFX11-NEXT:    scratch_load_b32 v68, off, s32 offset:88
+; GFX11-NEXT:    scratch_load_b32 v69, off, s32 offset:24
+; GFX11-NEXT:    scratch_load_b32 v70, off, s32 offset:84
+; GFX11-NEXT:    scratch_load_b32 v71, off, s32 offset:20
+; GFX11-NEXT:    scratch_load_b32 v80, off, s32 offset:80
+; GFX11-NEXT:    scratch_load_b32 v81, off, s32 offset:16
+; GFX11-NEXT:    scratch_load_b32 v82, off, s32 offset:76
+; GFX11-NEXT:    scratch_load_b32 v83, off, s32 offset:12
+; GFX11-NEXT:    scratch_load_b32 v84, off, s32 offset:72
+; GFX11-NEXT:    scratch_load_b32 v85, off, s32 offset:8
+; GFX11-NEXT:    scratch_load_b32 v86, off, s32 offset:68
+; GFX11-NEXT:    scratch_load_b32 v87, off, s32 offset:4
+; GFX11-NEXT:    v_and_b32_e32 v30, 1, v30
+; GFX11-NEXT:    v_and_b32_e32 v28, 1, v28
+; GFX11-NEXT:    v_and_b32_e32 v26, 1, v26
+; GFX11-NEXT:    v_and_b32_e32 v24, 1, v24
+; GFX11-NEXT:    v_and_b32_e32 v22, 1, v22
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v30
+; GFX11-NEXT:    v_and_b32_e32 v3, 1, v3
+; GFX11-NEXT:    v_and_b32_e32 v20, 1, v20
+; GFX11-NEXT:    v_and_b32_e32 v18, 1, v18
+; GFX11-NEXT:    v_and_b32_e32 v16, 1, v16
 ; GFX11-NEXT:    s_waitcnt vmcnt(30)
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v32, v33, vcc_lo
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v2
-; GFX11-NEXT:    v_and_b32_e32 v29, 1, v29
+; GFX11-NEXT:    v_cndmask_b32_e32 v30, v32, v33, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v28
+; GFX11-NEXT:    v_and_b32_e32 v1, 1, v1
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v33, 16, v33
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v32, 16, v32
-; GFX11-NEXT:    v_and_b32_e32 v30, 1, v30
+; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX11-NEXT:    s_waitcnt vmcnt(28)
-; GFX11-NEXT:    v_cndmask_b32_e32 v2, v34, v35, vcc_lo
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v4
-; GFX11-NEXT:    v_and_b32_e32 v23, 1, v23
+; GFX11-NEXT:    v_cndmask_b32_e32 v28, v34, v35, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v26
+; GFX11-NEXT:    v_and_b32_e32 v7, 1, v7
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v35, 16, v35
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v34, 16, v34
-; GFX11-NEXT:    v_and_b32_e32 v28, 1, v28
+; GFX11-NEXT:    v_and_b32_e32 v2, 1, v2
 ; GFX11-NEXT:    s_waitcnt vmcnt(26)
-; GFX11-NEXT:    v_cndmask_b32_e32 v4, v36, v37, vcc_lo
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v6
-; GFX11-NEXT:    v_and_b32_e32 v25, 1, v25
+; GFX11-NEXT:    v_cndmask_b32_e32 v26, v36, v37, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v24
+; GFX11-NEXT:    v_and_b32_e32 v5, 1, v5
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v37, 16, v37
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v36, 16, v36
-; GFX11-NEXT:    v_and_b32_e32 v26, 1, v26
+; GFX11-NEXT:    v_and_b32_e32 v4, 1, v4
 ; GFX11-NEXT:    s_waitcnt vmcnt(24)
-; GFX11-NEXT:    v_cndmask_b32_e32 v6, v38, v39, vcc_lo
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v8
-; GFX11-NEXT:    v_and_b32_e32 v19, 1, v19
+; GFX11-NEXT:    v_cndmask_b32_e32 v24, v38, v39, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v22
+; GFX11-NEXT:    v_and_b32_e32 v11, 1, v11
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v39, 16, v39
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v38, 16, v38
-; GFX11-NEXT:    v_and_b32_e32 v24, 1, v24
+; GFX11-NEXT:    v_and_b32_e32 v6, 1, v6
 ; GFX11-NEXT:    s_waitcnt vmcnt(22)
-; GFX11-NEXT:    v_cndmask_b32_e32 v8, v48, v49, vcc_lo
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v10
-; GFX11-NEXT:    v_and_b32_e32 v21, 1, v21
+; GFX11-NEXT:    v_cndmask_b32_e32 v22, v48, v49, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v20
+; GFX11-NEXT:    v_and_b32_e32 v9, 1, v9
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v49, 16, v49
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v48, 16, v48
-; GFX11-NEXT:    v_and_b32_e32 v22, 1, v22
+; GFX11-NEXT:    v_and_b32_e32 v8, 1, v8
 ; GFX11-NEXT:    s_waitcnt vmcnt(20)
-; GFX11-NEXT:    v_cndmask_b32_e32 v10, v50, v51, vcc_lo
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v12
+; GFX11-NEXT:    v_cndmask_b32_e32 v20, v50, v51, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v18
 ; GFX11-NEXT:    v_and_b32_e32 v15, 1, v15
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v51, 16, v51
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v50, 16, v50
-; GFX11-NEXT:    v_and_b32_e32 v20, 1, v20
+; GFX11-NEXT:    v_and_b32_e32 v10, 1, v10
 ; GFX11-NEXT:    s_waitcnt vmcnt(18)
-; GFX11-NEXT:    v_cndmask_b32_e32 v12, v52, v53, vcc_lo
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v14
-; GFX11-NEXT:    v_and_b32_e32 v17, 1, v17
+; GFX11-NEXT:    v_cndmask_b32_e32 v18, v52, v53, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v16
+; GFX11-NEXT:    v_and_b32_e32 v13, 1, v13
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v53, 16, v53
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v52, 16, v52
-; GFX11-NEXT:    v_and_b32_e32 v18, 1, v18
+; GFX11-NEXT:    v_and_b32_e32 v12, 1, v12
 ; GFX11-NEXT:    s_waitcnt vmcnt(16)
-; GFX11-NEXT:    v_cndmask_b32_e32 v14, v54, v55, vcc_lo
+; GFX11-NEXT:    v_cndmask_b32_e32 v16, v54, v55, vcc_lo
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v55, 16, v55
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v54, 16, v54
-; GFX11-NEXT:    v_and_b32_e32 v16, 1, v16
+; GFX11-NEXT:    v_and_b32_e32 v14, 1, v14
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v16
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v14
 ; GFX11-NEXT:    s_waitcnt vmcnt(14)
-; GFX11-NEXT:    v_dual_cndmask_b32 v16, v64, v65 :: v_dual_and_b32 v11, 1, v11
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v18
-; GFX11-NEXT:    v_and_b32_e32 v13, 1, v13
+; GFX11-NEXT:    v_dual_cndmask_b32 v14, v64, v65 :: v_dual_and_b32 v19, 1, v19
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v12
+; GFX11-NEXT:    v_and_b32_e32 v17, 1, v17
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v65, 16, v65
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v64, 16, v64
 ; GFX11-NEXT:    s_waitcnt vmcnt(12)
-; GFX11-NEXT:    v_cndmask_b32_e32 v18, v66, v67, vcc_lo
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v20
-; GFX11-NEXT:    v_and_b32_e32 v7, 1, v7
+; GFX11-NEXT:    v_cndmask_b32_e32 v12, v66, v67, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v10
+; GFX11-NEXT:    v_and_b32_e32 v23, 1, v23
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v67, 16, v67
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v66, 16, v66
 ; GFX11-NEXT:    s_waitcnt vmcnt(10)
-; GFX11-NEXT:    v_cndmask_b32_e32 v20, v68, v69, vcc_lo
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v22
-; GFX11-NEXT:    v_and_b32_e32 v9, 1, v9
+; GFX11-NEXT:    v_cndmask_b32_e32 v10, v68, v69, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v8
+; GFX11-NEXT:    v_and_b32_e32 v21, 1, v21
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v69, 16, v69
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v68, 16, v68
 ; GFX11-NEXT:    s_waitcnt vmcnt(8)
-; GFX11-NEXT:    v_cndmask_b32_e32 v22, v70, v71, vcc_lo
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v24
-; GFX11-NEXT:    v_and_b32_e32 v3, 1, v3
+; GFX11-NEXT:    v_cndmask_b32_e32 v8, v70, v71, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v6
+; GFX11-NEXT:    v_and_b32_e32 v27, 1, v27
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v71, 16, v71
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v70, 16, v70
 ; GFX11-NEXT:    s_waitcnt vmcnt(6)
-; GFX11-NEXT:    v_cndmask_b32_e32 v24, v80, v81, vcc_lo
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v26
-; GFX11-NEXT:    v_and_b32_e32 v5, 1, v5
+; GFX11-NEXT:    v_cndmask_b32_e32 v6, v80, v81, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v4
+; GFX11-NEXT:    v_and_b32_e32 v25, 1, v25
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v81, 16, v81
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v80, 16, v80
 ; GFX11-NEXT:    s_waitcnt vmcnt(4)
-; GFX11-NEXT:    v_cndmask_b32_e32 v26, v82, v83, vcc_lo
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v28
+; GFX11-NEXT:    v_cndmask_b32_e32 v4, v82, v83, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v2
 ; GFX11-NEXT:    v_and_b32_e32 v31, 1, v31
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v83, 16, v83
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v82, 16, v82
 ; GFX11-NEXT:    s_waitcnt vmcnt(2)
-; GFX11-NEXT:    v_cndmask_b32_e32 v28, v84, v85, vcc_lo
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v30
-; GFX11-NEXT:    v_and_b32_e32 v1, 1, v1
+; GFX11-NEXT:    v_cndmask_b32_e32 v2, v84, v85, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
+; GFX11-NEXT:    v_and_b32_e32 v29, 1, v29
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v85, 16, v85
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v84, 16, v84
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_cndmask_b32_e32 v30, v86, v87, vcc_lo
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v1
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, v86, v87, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v31
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v87, 16, v87
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v86, 16, v86
-; GFX11-NEXT:    v_cndmask_b32_e32 v1, v32, v33, vcc_lo
+; GFX11-NEXT:    v_cndmask_b32_e32 v31, v32, v33, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v29
+; GFX11-NEXT:    v_cndmask_b32_e32 v29, v34, v35, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v27
+; GFX11-NEXT:    v_cndmask_b32_e32 v27, v36, v37, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v25
+; GFX11-NEXT:    v_cndmask_b32_e32 v25, v38, v39, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v23
+; GFX11-NEXT:    v_cndmask_b32_e32 v23, v48, v49, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v21
+; GFX11-NEXT:    v_cndmask_b32_e32 v21, v50, v51, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v19
+; GFX11-NEXT:    v_cndmask_b32_e32 v19, v52, v53, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v17
+; GFX11-NEXT:    v_cndmask_b32_e32 v17, v54, v55, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v15
+; GFX11-NEXT:    v_cndmask_b32_e32 v15, v64, v65, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v13
+; GFX11-NEXT:    v_cndmask_b32_e32 v13, v66, v67, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v11
+; GFX11-NEXT:    v_cndmask_b32_e32 v11, v68, v69, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v7
+; GFX11-NEXT:    v_cndmask_b32_e32 v7, v80, v81, vcc_lo
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
-; GFX11-NEXT:    v_cndmask_b32_e32 v3, v34, v35, vcc_lo
+; GFX11-NEXT:    v_cndmask_b32_e32 v3, v84, v85, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v1
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, v86, v87, vcc_lo
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v5
-; GFX11-NEXT:    v_perm_b32 v1, v3, v2, 0x5040100
-; GFX11-NEXT:    v_cndmask_b32_e32 v5, v36, v37, vcc_lo
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v7
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_perm_b32 v2, v5, v4, 0x5040100
-; GFX11-NEXT:    v_cndmask_b32_e32 v7, v38, v39, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
+; GFX11-NEXT:    v_cndmask_b32_e32 v5, v82, v83, vcc_lo
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v9
+; GFX11-NEXT:    v_perm_b32 v1, v3, v2, 0x5040100
 ; GFX11-NEXT:    v_perm_b32 v3, v7, v6, 0x5040100
-; GFX11-NEXT:    v_cndmask_b32_e32 v9, v48, v49, vcc_lo
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v11
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_perm_b32 v4, v9, v8, 0x5040100
-; GFX11-NEXT:    v_cndmask_b32_e32 v11, v50, v51, vcc_lo
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v13
-; GFX11-NEXT:    v_perm_b32 v5, v11, v10, 0x5040100
-; GFX11-NEXT:    v_cndmask_b32_e32 v13, v52, v53, vcc_lo
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v15
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_perm_b32 v6, v13, v12, 0x5040100
-; GFX11-NEXT:    v_cndmask_b32_e32 v15, v54, v55, vcc_lo
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v17
+; GFX11-NEXT:    v_perm_b32 v2, v5, v4, 0x5040100
+; GFX11-NEXT:    v_cndmask_b32_e32 v9, v70, v71, vcc_lo
+; GFX11-NEXT:    v_perm_b32 v5, v11, v10, 0x5040100
 ; GFX11-NEXT:    v_perm_b32 v7, v15, v14, 0x5040100
-; GFX11-NEXT:    v_cndmask_b32_e32 v17, v64, v65, vcc_lo
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v19
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_perm_b32 v8, v17, v16, 0x5040100
-; GFX11-NEXT:    v_cndmask_b32_e32 v19, v66, v67, vcc_lo
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v21
-; GFX11-NEXT:    v_perm_b32 v9, v19, v18, 0x5040100
-; GFX11-NEXT:    v_cndmask_b32_e32 v21, v68, v69, vcc_lo
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v23
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_perm_b32 v10, v21, v20, 0x5040100
-; GFX11-NEXT:    v_cndmask_b32_e32 v23, v70, v71, vcc_lo
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v25
 ; GFX11-NEXT:    v_perm_b32 v11, v23, v22, 0x5040100
-; GFX11-NEXT:    v_cndmask_b32_e32 v25, v80, v81, vcc_lo
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v27
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_perm_b32 v4, v9, v8, 0x5040100
+; GFX11-NEXT:    v_perm_b32 v8, v17, v16, 0x5040100
+; GFX11-NEXT:    v_perm_b32 v9, v19, v18, 0x5040100
 ; GFX11-NEXT:    v_perm_b32 v12, v25, v24, 0x5040100
-; GFX11-NEXT:    v_cndmask_b32_e32 v27, v82, v83, vcc_lo
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v29
 ; GFX11-NEXT:    v_perm_b32 v13, v27, v26, 0x5040100
-; GFX11-NEXT:    v_cndmask_b32_e32 v29, v84, v85, vcc_lo
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v31
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_perm_b32 v14, v29, v28, 0x5040100
-; GFX11-NEXT:    v_cndmask_b32_e32 v31, v86, v87, vcc_lo
 ; GFX11-NEXT:    v_perm_b32 v15, v31, v30, 0x5040100
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = select <32 x i1> %cond, <32 x bfloat> %a, <32 x bfloat> %b
@@ -30446,98 +29719,61 @@ define <3 x bfloat> @v_fma_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b, <3 x bfloat>
 ; GFX8-LABEL: v_fma_v3bf16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v4
-; GFX8-NEXT:    v_lshlrev_b32_e32 v7, 16, v2
-; GFX8-NEXT:    v_lshlrev_b32_e32 v8, 16, v0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-NEXT:    v_fma_f32 v1, v1, v3, v5
+; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v4
+; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v2
+; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v0
 ; GFX8-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
 ; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GFX8-NEXT:    v_fma_f32 v0, v0, v2, v4
-; GFX8-NEXT:    v_fma_f32 v6, v8, v7, v6
+; GFX8-NEXT:    v_fma_f32 v3, v6, v5, v3
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX8-NEXT:    v_alignbit_b32 v0, v0, v6, 16
-; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v5
-; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
-; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v1
-; GFX8-NEXT:    v_fma_f32 v2, v6, v4, v2
-; GFX8-NEXT:    v_and_b32_e32 v4, 0xffff0000, v5
-; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
-; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX8-NEXT:    v_fma_f32 v1, v1, v3, v4
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX8-NEXT:    v_alignbit_b32 v1, v1, v2, 16
+; GFX8-NEXT:    v_alignbit_b32 v0, v0, v3, 16
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_fma_v3bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_lshlrev_b32_e32 v6, 16, v4
-; GFX9-NEXT:    v_lshlrev_b32_e32 v7, 16, v2
-; GFX9-NEXT:    v_lshlrev_b32_e32 v8, 16, v0
+; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX9-NEXT:    v_fma_f32 v1, v1, v3, v5
+; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 16, v4
+; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 16, v2
+; GFX9-NEXT:    v_lshlrev_b32_e32 v6, 16, v0
 ; GFX9-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
 ; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX9-NEXT:    v_fma_f32 v6, v8, v7, v6
+; GFX9-NEXT:    v_fma_f32 v3, v6, v5, v3
 ; GFX9-NEXT:    v_fma_f32 v0, v0, v2, v4
 ; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
-; GFX9-NEXT:    v_perm_b32 v0, v0, v6, s4
-; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v5
-; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
-; GFX9-NEXT:    v_lshlrev_b32_e32 v6, 16, v1
-; GFX9-NEXT:    v_fma_f32 v2, v6, v4, v2
-; GFX9-NEXT:    v_and_b32_e32 v4, 0xffff0000, v5
-; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
-; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX9-NEXT:    v_fma_f32 v1, v1, v3, v4
-; GFX9-NEXT:    v_perm_b32 v1, v1, v2, s4
+; GFX9-NEXT:    v_perm_b32 v0, v0, v3, s4
+; GFX9-NEXT:    v_alignbit_b32 v1, s4, v1, 16
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fma_v3bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 16, v4
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 16, v2
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v8, 16, v0
 ; GFX10-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
 ; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX10-NEXT:    v_lshlrev_b32_e32 v9, 16, v5
-; GFX10-NEXT:    v_lshlrev_b32_e32 v10, 16, v3
-; GFX10-NEXT:    v_lshlrev_b32_e32 v11, 16, v1
-; GFX10-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
-; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
-; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX10-NEXT:    v_fmac_f32_e32 v6, v8, v7
 ; GFX10-NEXT:    v_fmac_f32_e32 v4, v0, v2
-; GFX10-NEXT:    v_fmac_f32_e32 v9, v11, v10
 ; GFX10-NEXT:    v_fmac_f32_e32 v5, v1, v3
 ; GFX10-NEXT:    v_perm_b32 v0, v4, v6, 0x7060302
-; GFX10-NEXT:    v_perm_b32 v1, v5, v9, 0x7060302
+; GFX10-NEXT:    v_alignbit_b32 v1, s4, v5, 16
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: v_fma_v3bf16:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_lshlrev_b32_e32 v7, 16, v2
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
-; GFX11-NEXT:    v_lshlrev_b32_e32 v9, 16, v5
-; GFX11-NEXT:    v_lshlrev_b32_e32 v10, 16, v3
-; GFX11-NEXT:    v_lshlrev_b32_e32 v11, 16, v1
-; GFX11-NEXT:    v_lshlrev_b32_e32 v6, 16, v4
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
-; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX11-NEXT:    v_lshlrev_b32_e32 v8, 16, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_dual_fmac_f32 v5, v1, v3 :: v_dual_and_b32 v0, 0xffff0000, v0
-; GFX11-NEXT:    v_fmac_f32_e32 v4, v0, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_dual_fmac_f32 v9, v11, v10 :: v_dual_fmac_f32 v6, v8, v7
-; GFX11-NEXT:    v_perm_b32 v1, v5, v9, 0x7060302
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT:    v_perm_b32 v0, v4, v6, 0x7060302
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = call <3 x bfloat> @llvm.fma.v3bf16(<3 x bfloat> %a, <3 x bfloat> %b, <3 x bfloat> %c)
   ret <3 x bfloat> %op
 }
@@ -30596,97 +29832,97 @@ define <4 x bfloat> @v_fma_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfloat>
 ; GFX8-LABEL: v_fma_v4bf16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v4
-; GFX8-NEXT:    v_lshlrev_b32_e32 v7, 16, v2
-; GFX8-NEXT:    v_lshlrev_b32_e32 v8, 16, v0
-; GFX8-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
-; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
-; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX8-NEXT:    v_fma_f32 v0, v0, v2, v4
-; GFX8-NEXT:    v_fma_f32 v6, v8, v7, v6
-; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX8-NEXT:    v_alignbit_b32 v0, v0, v6, 16
-; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v5
-; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
-; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v1
-; GFX8-NEXT:    v_fma_f32 v2, v6, v4, v2
-; GFX8-NEXT:    v_and_b32_e32 v4, 0xffff0000, v5
+; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v5
+; GFX8-NEXT:    v_lshlrev_b32_e32 v7, 16, v3
+; GFX8-NEXT:    v_lshlrev_b32_e32 v8, 16, v1
+; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
 ; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
 ; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX8-NEXT:    v_fma_f32 v1, v1, v3, v4
+; GFX8-NEXT:    v_fma_f32 v6, v8, v7, v6
+; GFX8-NEXT:    v_fma_f32 v1, v1, v3, v5
+; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v4
+; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v2
+; GFX8-NEXT:    v_lshlrev_b32_e32 v7, 16, v0
+; GFX8-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    v_fma_f32 v0, v0, v2, v4
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX8-NEXT:    v_alignbit_b32 v1, v1, v2, 16
+; GFX8-NEXT:    v_fma_f32 v3, v7, v5, v3
+; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX8-NEXT:    v_alignbit_b32 v0, v0, v3, 16
+; GFX8-NEXT:    v_alignbit_b32 v1, v1, v6, 16
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_fma_v4bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_lshlrev_b32_e32 v6, 16, v4
-; GFX9-NEXT:    v_lshlrev_b32_e32 v7, 16, v2
-; GFX9-NEXT:    v_lshlrev_b32_e32 v8, 16, v0
+; GFX9-NEXT:    v_lshlrev_b32_e32 v6, 16, v5
+; GFX9-NEXT:    v_lshlrev_b32_e32 v7, 16, v3
+; GFX9-NEXT:    v_lshlrev_b32_e32 v8, 16, v1
+; GFX9-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
+; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX9-NEXT:    v_fma_f32 v6, v8, v7, v6
+; GFX9-NEXT:    v_fma_f32 v1, v1, v3, v5
+; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 16, v4
+; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 16, v2
+; GFX9-NEXT:    v_lshlrev_b32_e32 v7, 16, v0
 ; GFX9-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
 ; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX9-NEXT:    v_fma_f32 v6, v8, v7, v6
+; GFX9-NEXT:    v_fma_f32 v3, v7, v5, v3
 ; GFX9-NEXT:    v_fma_f32 v0, v0, v2, v4
 ; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
-; GFX9-NEXT:    v_perm_b32 v0, v0, v6, s4
-; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v5
-; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
-; GFX9-NEXT:    v_lshlrev_b32_e32 v6, 16, v1
-; GFX9-NEXT:    v_fma_f32 v2, v6, v4, v2
-; GFX9-NEXT:    v_and_b32_e32 v4, 0xffff0000, v5
-; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
-; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX9-NEXT:    v_fma_f32 v1, v1, v3, v4
-; GFX9-NEXT:    v_perm_b32 v1, v1, v2, s4
+; GFX9-NEXT:    v_perm_b32 v0, v0, v3, s4
+; GFX9-NEXT:    v_perm_b32 v1, v1, v6, s4
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fma_v4bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 16, v4
-; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 16, v2
-; GFX10-NEXT:    v_lshlrev_b32_e32 v8, 16, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 16, v5
+; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 16, v3
+; GFX10-NEXT:    v_lshlrev_b32_e32 v8, 16, v1
+; GFX10-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
+; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX10-NEXT:    v_lshlrev_b32_e32 v9, 16, v4
+; GFX10-NEXT:    v_lshlrev_b32_e32 v10, 16, v2
+; GFX10-NEXT:    v_lshlrev_b32_e32 v11, 16, v0
 ; GFX10-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
 ; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX10-NEXT:    v_lshlrev_b32_e32 v9, 16, v5
-; GFX10-NEXT:    v_lshlrev_b32_e32 v10, 16, v3
-; GFX10-NEXT:    v_lshlrev_b32_e32 v11, 16, v1
-; GFX10-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
-; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
 ; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX10-NEXT:    v_fmac_f32_e32 v6, v8, v7
-; GFX10-NEXT:    v_fmac_f32_e32 v4, v0, v2
 ; GFX10-NEXT:    v_fmac_f32_e32 v9, v11, v10
+; GFX10-NEXT:    v_fmac_f32_e32 v4, v0, v2
 ; GFX10-NEXT:    v_fmac_f32_e32 v5, v1, v3
-; GFX10-NEXT:    v_perm_b32 v0, v4, v6, 0x7060302
-; GFX10-NEXT:    v_perm_b32 v1, v5, v9, 0x7060302
+; GFX10-NEXT:    v_perm_b32 v0, v4, v9, 0x7060302
+; GFX10-NEXT:    v_perm_b32 v1, v5, v6, 0x7060302
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fma_v4bf16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_lshlrev_b32_e32 v7, 16, v2
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
-; GFX11-NEXT:    v_lshlrev_b32_e32 v9, 16, v5
-; GFX11-NEXT:    v_lshlrev_b32_e32 v10, 16, v3
-; GFX11-NEXT:    v_lshlrev_b32_e32 v11, 16, v1
-; GFX11-NEXT:    v_lshlrev_b32_e32 v6, 16, v4
+; GFX11-NEXT:    v_lshlrev_b32_e32 v9, 16, v4
+; GFX11-NEXT:    v_lshlrev_b32_e32 v10, 16, v2
+; GFX11-NEXT:    v_lshlrev_b32_e32 v11, 16, v0
 ; GFX11-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_dual_fmac_f32 v9, v11, v10 :: v_dual_lshlrev_b32 v6, 16, v5
+; GFX11-NEXT:    v_lshlrev_b32_e32 v7, 16, v3
+; GFX11-NEXT:    v_dual_fmac_f32 v4, v0, v2 :: v_dual_and_b32 v3, 0xffff0000, v3
 ; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX11-NEXT:    v_lshlrev_b32_e32 v8, 16, v1
 ; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX11-NEXT:    v_lshlrev_b32_e32 v8, 16, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_dual_fmac_f32 v5, v1, v3 :: v_dual_and_b32 v0, 0xffff0000, v0
-; GFX11-NEXT:    v_fmac_f32_e32 v4, v0, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_dual_fmac_f32 v9, v11, v10 :: v_dual_fmac_f32 v6, v8, v7
-; GFX11-NEXT:    v_perm_b32 v1, v5, v9, 0x7060302
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT:    v_perm_b32 v0, v4, v6, 0x7060302
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_perm_b32 v0, v4, v9, 0x7060302
+; GFX11-NEXT:    v_fmac_f32_e32 v5, v1, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_fmac_f32_e32 v6, v8, v7
+; GFX11-NEXT:    v_perm_b32 v1, v5, v6, 0x7060302
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = call <4 x bfloat> @llvm.fma.v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfloat> %c)
   ret <4 x bfloat> %op
@@ -30948,133 +30184,79 @@ define <3 x bfloat> @v_fmuladd_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b, <3 x bfl
 ; GFX8-LABEL: v_fmuladd_v3bf16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v2
-; GFX8-NEXT:    v_lshlrev_b32_e32 v7, 16, v0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-NEXT:    v_mul_f32_e32 v1, v1, v3
+; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v5
+; GFX8-NEXT:    v_add_f32_e32 v1, v1, v3
+; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
+; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
 ; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GFX8-NEXT:    v_mul_f32_e32 v0, v0, v2
+; GFX8-NEXT:    v_mul_f32_e32 v3, v5, v3
 ; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v4
-; GFX8-NEXT:    v_mul_f32_e32 v6, v7, v6
-; GFX8-NEXT:    v_lshlrev_b32_e32 v7, 16, v4
-; GFX8-NEXT:    v_add_f32_e32 v0, v0, v2
-; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
-; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v1
 ; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
-; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX8-NEXT:    v_mul_f32_e32 v1, v1, v3
-; GFX8-NEXT:    v_mul_f32_e32 v2, v4, v2
-; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v5
-; GFX8-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
-; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
-; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v5
-; GFX8-NEXT:    v_add_f32_e32 v1, v1, v3
-; GFX8-NEXT:    v_add_f32_e32 v6, v6, v7
+; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v4
+; GFX8-NEXT:    v_add_f32_e32 v0, v0, v2
+; GFX8-NEXT:    v_add_f32_e32 v3, v3, v5
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX8-NEXT:    v_add_f32_e32 v2, v2, v4
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX8-NEXT:    v_alignbit_b32 v0, v0, v6, 16
-; GFX8-NEXT:    v_alignbit_b32 v1, v1, v2, 16
+; GFX8-NEXT:    v_alignbit_b32 v0, v0, v3, 16
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_fmuladd_v3bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_lshlrev_b32_e32 v6, 16, v2
-; GFX9-NEXT:    v_lshlrev_b32_e32 v7, 16, v0
+; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX9-NEXT:    v_mul_f32_e32 v1, v1, v3
+; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 16, v5
+; GFX9-NEXT:    v_add_f32_e32 v1, v1, v3
+; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
+; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
 ; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    v_mul_f32_e32 v3, v5, v3
 ; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v2
+; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 16, v4
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff0000, v4
-; GFX9-NEXT:    v_mul_f32_e32 v6, v7, v6
-; GFX9-NEXT:    v_lshlrev_b32_e32 v7, 16, v4
+; GFX9-NEXT:    v_add_f32_e32 v3, v3, v5
 ; GFX9-NEXT:    v_add_f32_e32 v0, v0, v2
-; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
-; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 16, v1
-; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
-; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX9-NEXT:    v_mul_f32_e32 v2, v4, v2
-; GFX9-NEXT:    v_mul_f32_e32 v1, v1, v3
-; GFX9-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
-; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
-; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 16, v5
-; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v5
-; GFX9-NEXT:    v_add_f32_e32 v6, v6, v7
 ; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
-; GFX9-NEXT:    v_add_f32_e32 v2, v2, v4
-; GFX9-NEXT:    v_add_f32_e32 v1, v1, v3
-; GFX9-NEXT:    v_perm_b32 v0, v0, v6, s4
-; GFX9-NEXT:    v_perm_b32 v1, v1, v2, s4
+; GFX9-NEXT:    v_perm_b32 v0, v0, v3, s4
+; GFX9-NEXT:    v_alignbit_b32 v1, s4, v1, 16
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fmuladd_v3bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 16, v2
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 16, v0
 ; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX10-NEXT:    v_lshlrev_b32_e32 v8, 16, v1
-; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX10-NEXT:    v_mul_f32_e32 v6, v7, v6
-; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 16, v3
-; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
-; GFX10-NEXT:    v_mul_f32_e32 v0, v0, v2
-; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v6
-; GFX10-NEXT:    v_mul_f32_e32 v6, v8, v7
 ; GFX10-NEXT:    v_mul_f32_e32 v1, v1, v3
-; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 16, v4
-; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v4
-; GFX10-NEXT:    v_and_b32_e32 v4, 0xffff0000, v6
-; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 16, v5
+; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX10-NEXT:    v_mul_f32_e32 v3, v7, v6
+; GFX10-NEXT:    v_mul_f32_e32 v0, v0, v2
 ; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX10-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
-; GFX10-NEXT:    v_add_f32_e32 v2, v2, v7
-; GFX10-NEXT:    v_add_f32_e32 v0, v0, v3
-; GFX10-NEXT:    v_add_f32_e32 v3, v4, v6
+; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v3
+; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v4
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
 ; GFX10-NEXT:    v_add_f32_e32 v1, v1, v5
+; GFX10-NEXT:    v_add_f32_e32 v2, v2, v3
+; GFX10-NEXT:    v_add_f32_e32 v0, v0, v4
+; GFX10-NEXT:    v_alignbit_b32 v1, s4, v1, 16
 ; GFX10-NEXT:    v_perm_b32 v0, v0, v2, 0x7060302
-; GFX10-NEXT:    v_perm_b32 v1, v1, v3, 0x7060302
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: v_fmuladd_v3bf16:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_lshlrev_b32_e32 v8, 16, v1
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX11-NEXT:    v_lshlrev_b32_e32 v6, 16, v2
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
-; GFX11-NEXT:    v_lshlrev_b32_e32 v7, 16, v0
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_mul_f32_e32 v0, v0, v2
-; GFX11-NEXT:    v_dual_mul_f32 v6, v7, v6 :: v_dual_lshlrev_b32 v7, 16, v3
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v6
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_mul_f32_e32 v6, v8, v7
-; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v3
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v4
-; GFX11-NEXT:    v_lshlrev_b32_e32 v7, 16, v4
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xffff0000, v6
-; GFX11-NEXT:    v_lshlrev_b32_e32 v6, 16, v5
-; GFX11-NEXT:    v_dual_add_f32 v0, v0, v3 :: v_dual_and_b32 v1, 0xffff0000, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_dual_add_f32 v2, v2, v7 :: v_dual_and_b32 v5, 0xffff0000, v5
-; GFX11-NEXT:    v_add_f32_e32 v3, v4, v6
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_add_f32_e32 v1, v1, v5
-; GFX11-NEXT:    v_perm_b32 v0, v0, v2, 0x7060302
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT:    v_perm_b32 v1, v1, v3, 0x7060302
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = call <3 x bfloat> @llvm.fmuladd.v3bf16(<3 x bfloat> %a, <3 x bfloat> %b, <3 x bfloat> %c)
   ret <3 x bfloat> %op
 }
@@ -31149,132 +30331,131 @@ define <4 x bfloat> @v_fmuladd_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfl
 ; GFX8-LABEL: v_fmuladd_v4bf16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v2
-; GFX8-NEXT:    v_lshlrev_b32_e32 v7, 16, v0
-; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
-; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX8-NEXT:    v_mul_f32_e32 v0, v0, v2
-; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v4
-; GFX8-NEXT:    v_mul_f32_e32 v6, v7, v6
-; GFX8-NEXT:    v_lshlrev_b32_e32 v7, 16, v4
-; GFX8-NEXT:    v_add_f32_e32 v0, v0, v2
-; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
-; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v1
+; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v3
+; GFX8-NEXT:    v_lshlrev_b32_e32 v7, 16, v1
 ; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
 ; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX8-NEXT:    v_mul_f32_e32 v1, v1, v3
-; GFX8-NEXT:    v_mul_f32_e32 v2, v4, v2
 ; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v5
-; GFX8-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
-; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
-; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v5
+; GFX8-NEXT:    v_mul_f32_e32 v6, v7, v6
+; GFX8-NEXT:    v_lshlrev_b32_e32 v7, 16, v5
 ; GFX8-NEXT:    v_add_f32_e32 v1, v1, v3
+; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
+; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
+; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    v_mul_f32_e32 v0, v0, v2
+; GFX8-NEXT:    v_mul_f32_e32 v3, v5, v3
+; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v4
+; GFX8-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
+; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v4
+; GFX8-NEXT:    v_add_f32_e32 v0, v0, v2
 ; GFX8-NEXT:    v_add_f32_e32 v6, v6, v7
-; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX8-NEXT:    v_add_f32_e32 v2, v2, v4
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX8-NEXT:    v_alignbit_b32 v0, v0, v6, 16
-; GFX8-NEXT:    v_alignbit_b32 v1, v1, v2, 16
+; GFX8-NEXT:    v_add_f32_e32 v3, v3, v5
+; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX8-NEXT:    v_alignbit_b32 v0, v0, v3, 16
+; GFX8-NEXT:    v_alignbit_b32 v1, v1, v6, 16
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_fmuladd_v4bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_lshlrev_b32_e32 v6, 16, v2
-; GFX9-NEXT:    v_lshlrev_b32_e32 v7, 16, v0
-; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
-; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v2
-; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff0000, v4
-; GFX9-NEXT:    v_mul_f32_e32 v6, v7, v6
-; GFX9-NEXT:    v_lshlrev_b32_e32 v7, 16, v4
-; GFX9-NEXT:    v_add_f32_e32 v0, v0, v2
-; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
-; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 16, v1
+; GFX9-NEXT:    v_lshlrev_b32_e32 v6, 16, v3
+; GFX9-NEXT:    v_lshlrev_b32_e32 v7, 16, v1
 ; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
 ; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX9-NEXT:    v_mul_f32_e32 v2, v4, v2
 ; GFX9-NEXT:    v_mul_f32_e32 v1, v1, v3
-; GFX9-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
-; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
-; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 16, v5
 ; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v5
+; GFX9-NEXT:    v_mul_f32_e32 v6, v7, v6
+; GFX9-NEXT:    v_lshlrev_b32_e32 v7, 16, v5
+; GFX9-NEXT:    v_add_f32_e32 v1, v1, v3
+; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
+; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
+; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    v_mul_f32_e32 v3, v5, v3
+; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v2
+; GFX9-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
+; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 16, v4
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff0000, v4
 ; GFX9-NEXT:    v_add_f32_e32 v6, v6, v7
+; GFX9-NEXT:    v_add_f32_e32 v3, v3, v5
+; GFX9-NEXT:    v_add_f32_e32 v0, v0, v2
 ; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
-; GFX9-NEXT:    v_add_f32_e32 v2, v2, v4
-; GFX9-NEXT:    v_add_f32_e32 v1, v1, v3
-; GFX9-NEXT:    v_perm_b32 v0, v0, v6, s4
-; GFX9-NEXT:    v_perm_b32 v1, v1, v2, s4
+; GFX9-NEXT:    v_perm_b32 v0, v0, v3, s4
+; GFX9-NEXT:    v_perm_b32 v1, v1, v6, s4
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fmuladd_v4bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 16, v2
-; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 16, v0
-; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
-; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX10-NEXT:    v_lshlrev_b32_e32 v8, 16, v1
+; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 16, v3
+; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 16, v1
+; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
 ; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX10-NEXT:    v_lshlrev_b32_e32 v8, 16, v0
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GFX10-NEXT:    v_mul_f32_e32 v6, v7, v6
-; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 16, v3
-; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
-; GFX10-NEXT:    v_mul_f32_e32 v0, v0, v2
-; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v6
-; GFX10-NEXT:    v_mul_f32_e32 v6, v8, v7
+; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 16, v2
+; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX10-NEXT:    v_mul_f32_e32 v1, v1, v3
+; GFX10-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
+; GFX10-NEXT:    v_mul_f32_e32 v3, v8, v7
+; GFX10-NEXT:    v_mul_f32_e32 v0, v0, v2
+; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v5
+; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 16, v4
+; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
 ; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v4
-; GFX10-NEXT:    v_and_b32_e32 v4, 0xffff0000, v6
-; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 16, v5
-; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX10-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
 ; GFX10-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
-; GFX10-NEXT:    v_add_f32_e32 v2, v2, v7
-; GFX10-NEXT:    v_add_f32_e32 v0, v0, v3
-; GFX10-NEXT:    v_add_f32_e32 v3, v4, v6
+; GFX10-NEXT:    v_add_f32_e32 v2, v6, v2
+; GFX10-NEXT:    v_add_f32_e32 v3, v3, v7
+; GFX10-NEXT:    v_add_f32_e32 v0, v0, v4
 ; GFX10-NEXT:    v_add_f32_e32 v1, v1, v5
-; GFX10-NEXT:    v_perm_b32 v0, v0, v2, 0x7060302
-; GFX10-NEXT:    v_perm_b32 v1, v1, v3, 0x7060302
+; GFX10-NEXT:    v_perm_b32 v0, v0, v3, 0x7060302
+; GFX10-NEXT:    v_perm_b32 v1, v1, v2, 0x7060302
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fmuladd_v4bf16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_lshlrev_b32_e32 v8, 16, v1
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX11-NEXT:    v_lshlrev_b32_e32 v6, 16, v2
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
-; GFX11-NEXT:    v_lshlrev_b32_e32 v7, 16, v0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v6, 16, v3
+; GFX11-NEXT:    v_lshlrev_b32_e32 v8, 16, v0
 ; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v7, 16, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_dual_mul_f32 v6, v7, v6 :: v_dual_and_b32 v3, 0xffff0000, v3
+; GFX11-NEXT:    v_lshlrev_b32_e32 v7, 16, v2
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX11-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4)
 ; GFX11-NEXT:    v_mul_f32_e32 v0, v0, v2
-; GFX11-NEXT:    v_dual_mul_f32 v6, v7, v6 :: v_dual_lshlrev_b32 v7, 16, v3
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v5
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
 ; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v6
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_mul_f32_e32 v6, v8, v7
-; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v3
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v4
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_dual_add_f32 v2, v6, v2 :: v_dual_mul_f32 v1, v1, v3
+; GFX11-NEXT:    v_mul_f32_e32 v3, v8, v7
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v7, 16, v4
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xffff0000, v6
-; GFX11-NEXT:    v_lshlrev_b32_e32 v6, 16, v5
-; GFX11-NEXT:    v_dual_add_f32 v0, v0, v3 :: v_dual_and_b32 v1, 0xffff0000, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_dual_add_f32 v2, v2, v7 :: v_dual_and_b32 v5, 0xffff0000, v5
-; GFX11-NEXT:    v_add_f32_e32 v3, v4, v6
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_dual_add_f32 v0, v0, v4 :: v_dual_and_b32 v3, 0xffff0000, v3
 ; GFX11-NEXT:    v_add_f32_e32 v1, v1, v5
-; GFX11-NEXT:    v_perm_b32 v0, v0, v2, 0x7060302
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_add_f32_e32 v3, v3, v7
+; GFX11-NEXT:    v_perm_b32 v1, v1, v2, 0x7060302
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT:    v_perm_b32 v1, v1, v3, 0x7060302
+; GFX11-NEXT:    v_perm_b32 v0, v0, v3, 0x7060302
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = call <4 x bfloat> @llvm.fmuladd.v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfloat> %c)
   ret <4 x bfloat> %op
diff --git a/llvm/test/CodeGen/AMDGPU/function-args.ll b/llvm/test/CodeGen/AMDGPU/function-args.ll
index ef2be37d62d9df..160bae51193ea0 100644
--- a/llvm/test/CodeGen/AMDGPU/function-args.ll
+++ b/llvm/test/CodeGen/AMDGPU/function-args.ll
@@ -3165,11 +3165,11 @@ define void @void_func_v32i32_v2i16_v2f16_v2bf16_v4bf16(<32 x i32> %arg0, <2 x i
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_store_dwordx4 v[16:19], off, s[4:7], 0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:8
-; VI-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:12
-; VI-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:20
-; VI-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:16
-; VI-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:4
+; VI-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:4
+; VI-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:8
+; VI-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:20
+; VI-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:16
+; VI-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:12
 ; VI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[4:7], 0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[4:7], 0
@@ -3178,16 +3178,14 @@ define void @void_func_v32i32_v2i16_v2f16_v2bf16_v4bf16(<32 x i32> %arg0, <2 x i
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    buffer_store_dword v20, off, s[4:7], 0
-; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    buffer_store_dword v16, off, s[4:7], 0
-; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    buffer_store_dword v17, off, s[4:7], 0
-; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_store_dword v18, off, s[4:7], 0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_store_dword v19, off, s[4:7], 0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    buffer_store_dword v20, off, s[4:7], 0
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    buffer_store_dwordx2 v[16:17], off, s[4:7], 0
+; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: void_func_v32i32_v2i16_v2f16_v2bf16_v4bf16:
@@ -3205,11 +3203,11 @@ define void @void_func_v32i32_v2i16_v2f16_v2bf16_v4bf16(<32 x i32> %arg0, <2 x i
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_store_dwordx4 v[16:19], off, s[4:7], 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:8
-; GFX9-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:12
-; GFX9-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:20
-; GFX9-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:16
-; GFX9-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:4
+; GFX9-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:4
+; GFX9-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:8
+; GFX9-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:20
+; GFX9-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:16
+; GFX9-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:12
 ; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_store_dwordx4 v[12:15], off, s[4:7], 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
@@ -3219,16 +3217,14 @@ define void @void_func_v32i32_v2i16_v2f16_v2bf16_v4bf16(<32 x i32> %arg0, <2 x i
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    buffer_store_dword v20, off, s[4:7], 0
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    buffer_store_dword v16, off, s[4:7], 0
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    buffer_store_dword v17, off, s[4:7], 0
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_store_dword v18, off, s[4:7], 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_store_dword v19, off, s[4:7], 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_store_dword v20, off, s[4:7], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_store_dwordx2 v[16:17], off, s[4:7], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: void_func_v32i32_v2i16_v2f16_v2bf16_v4bf16:
@@ -3236,11 +3232,11 @@ define void @void_func_v32i32_v2i16_v2f16_v2bf16_v4bf16(<32 x i32> %arg0, <2 x i
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_clause 0x5
 ; GFX11-NEXT:    scratch_load_b32 v31, off, s32
-; GFX11-NEXT:    scratch_load_b32 v32, off, s32 offset:4
-; GFX11-NEXT:    scratch_load_b32 v33, off, s32 offset:8
-; GFX11-NEXT:    scratch_load_b32 v34, off, s32 offset:12
-; GFX11-NEXT:    scratch_load_b32 v35, off, s32 offset:20
-; GFX11-NEXT:    scratch_load_b32 v36, off, s32 offset:16
+; GFX11-NEXT:    scratch_load_b32 v33, off, s32 offset:20
+; GFX11-NEXT:    scratch_load_b32 v34, off, s32 offset:4
+; GFX11-NEXT:    scratch_load_b32 v35, off, s32 offset:8
+; GFX11-NEXT:    scratch_load_b32 v36, off, s32 offset:12
+; GFX11-NEXT:    scratch_load_b32 v32, off, s32 offset:16
 ; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX11-NEXT:    s_mov_b32 s2, -1
 ; GFX11-NEXT:    s_waitcnt vmcnt(5)
@@ -3260,20 +3256,17 @@ define void @void_func_v32i32_v2i16_v2f16_v2bf16_v4bf16(<32 x i32> %arg0, <2 x i
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    buffer_store_b128 v[0:3], off, s[0:3], 0 dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    s_waitcnt vmcnt(4)
-; GFX11-NEXT:    buffer_store_b32 v32, off, s[0:3], 0 dlc
-; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    s_waitcnt vmcnt(3)
-; GFX11-NEXT:    buffer_store_b32 v33, off, s[0:3], 0 dlc
+; GFX11-NEXT:    buffer_store_b32 v34, off, s[0:3], 0 dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    s_waitcnt vmcnt(2)
-; GFX11-NEXT:    buffer_store_b32 v34, off, s[0:3], 0 dlc
+; GFX11-NEXT:    buffer_store_b32 v35, off, s[0:3], 0 dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    s_waitcnt vmcnt(1)
-; GFX11-NEXT:    buffer_store_b32 v35, off, s[0:3], 0 dlc
+; GFX11-NEXT:    buffer_store_b32 v36, off, s[0:3], 0 dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    buffer_store_b32 v36, off, s[0:3], 0 dlc
+; GFX11-NEXT:    buffer_store_b64 v[32:33], off, s[0:3], 0 dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   store volatile <32 x i32> %arg0, ptr addrspace(1) undef
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.bf16.ll
index ca3336beccf773..8dcd0f504e5091 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.bf16.ll
@@ -948,57 +948,52 @@ define <3 x i1> @isnan_v3bf16(<3 x bfloat> %x) nounwind {
 ; GFX8CHECK-LABEL: isnan_v3bf16:
 ; GFX8CHECK:       ; %bb.0:
 ; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8CHECK-NEXT:    v_mov_b32_e32 v2, 0x7fff
-; GFX8CHECK-NEXT:    v_and_b32_sdwa v3, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8CHECK-NEXT:    v_and_b32_e32 v2, 0x7fff, v1
 ; GFX8CHECK-NEXT:    s_movk_i32 s4, 0x7f80
-; GFX8CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
-; GFX8CHECK-NEXT:    v_and_b32_e32 v1, 0x7fff, v1
+; GFX8CHECK-NEXT:    v_bfe_u32 v1, v0, 16, 15
+; GFX8CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff7fff, v0
 ; GFX8CHECK-NEXT:    v_cmp_lt_i16_e32 vcc, s4, v0
 ; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
 ; GFX8CHECK-NEXT:    v_cmp_lt_i16_e32 vcc, s4, v1
-; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX8CHECK-NEXT:    v_cmp_lt_i16_e32 vcc, s4, v3
 ; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX8CHECK-NEXT:    v_cmp_lt_i16_e32 vcc, s4, v2
+; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
 ; GFX8CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9CHECK-LABEL: isnan_v3bf16:
 ; GFX9CHECK:       ; %bb.0:
 ; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9CHECK-NEXT:    s_movk_i32 s4, 0x7fff
-; GFX9CHECK-NEXT:    v_and_b32_sdwa v3, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9CHECK-NEXT:    v_and_b32_e32 v3, 0x7fff7fff, v0
 ; GFX9CHECK-NEXT:    s_movk_i32 s4, 0x7f80
-; GFX9CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX9CHECK-NEXT:    v_and_b32_e32 v1, 0x7fff, v1
-; GFX9CHECK-NEXT:    v_cmp_lt_i16_e32 vcc, s4, v0
+; GFX9CHECK-NEXT:    v_cmp_lt_i16_e32 vcc, s4, v3
 ; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
 ; GFX9CHECK-NEXT:    v_cmp_lt_i16_e32 vcc, s4, v1
+; GFX9CHECK-NEXT:    v_cmp_gt_i16_sdwa s[4:5], v3, s4 src0_sel:WORD_1 src1_sel:DWORD
 ; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX9CHECK-NEXT:    v_cmp_lt_i16_e32 vcc, s4, v3
-; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[4:5]
 ; GFX9CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10CHECK-LABEL: isnan_v3bf16:
 ; GFX10CHECK:       ; %bb.0:
 ; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10CHECK-NEXT:    v_mov_b32_e32 v2, 0x7fff
-; GFX10CHECK-NEXT:    v_and_b32_e32 v3, 0x7fff, v0
+; GFX10CHECK-NEXT:    v_and_b32_e32 v2, 0x7fff7fff, v0
+; GFX10CHECK-NEXT:    v_mov_b32_e32 v3, 0x7f80
 ; GFX10CHECK-NEXT:    v_and_b32_e32 v4, 0x7fff, v1
-; GFX10CHECK-NEXT:    v_and_b32_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX10CHECK-NEXT:    v_cmp_lt_i16_e32 vcc_lo, 0x7f80, v3
-; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
 ; GFX10CHECK-NEXT:    v_cmp_lt_i16_e32 vcc_lo, 0x7f80, v2
-; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
+; GFX10CHECK-NEXT:    v_cmp_gt_i16_sdwa s4, v2, v3 src0_sel:WORD_1 src1_sel:DWORD
+; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
 ; GFX10CHECK-NEXT:    v_cmp_lt_i16_e32 vcc_lo, 0x7f80, v4
+; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s4
 ; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc_lo
 ; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11CHECK-LABEL: isnan_v3bf16:
 ; GFX11CHECK:       ; %bb.0:
 ; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11CHECK-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
-; GFX11CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
+; GFX11CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff7fff, v0
 ; GFX11CHECK-NEXT:    v_and_b32_e32 v3, 0x7fff, v1
-; GFX11CHECK-NEXT:    v_and_b32_e32 v2, 0x7fff, v2
+; GFX11CHECK-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
 ; GFX11CHECK-NEXT:    v_cmp_lt_i16_e32 vcc_lo, 0x7f80, v0
 ; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
 ; GFX11CHECK-NEXT:    v_cmp_lt_i16_e32 vcc_lo, 0x7f80, v2
@@ -1032,18 +1027,17 @@ define <4 x i1> @isnan_v4bf16(<4 x bfloat> %x) nounwind {
 ; GFX8CHECK-LABEL: isnan_v4bf16:
 ; GFX8CHECK:       ; %bb.0:
 ; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8CHECK-NEXT:    v_mov_b32_e32 v2, 0x7fff
+; GFX8CHECK-NEXT:    v_bfe_u32 v3, v1, 16, 15
 ; GFX8CHECK-NEXT:    s_movk_i32 s4, 0x7f80
-; GFX8CHECK-NEXT:    v_and_b32_sdwa v4, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX8CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
-; GFX8CHECK-NEXT:    v_and_b32_sdwa v3, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX8CHECK-NEXT:    v_and_b32_e32 v1, 0x7fff, v1
+; GFX8CHECK-NEXT:    v_and_b32_e32 v2, 0x7fff7fff, v1
+; GFX8CHECK-NEXT:    v_bfe_u32 v1, v0, 16, 15
+; GFX8CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff7fff, v0
 ; GFX8CHECK-NEXT:    v_cmp_lt_i16_e32 vcc, s4, v0
 ; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
 ; GFX8CHECK-NEXT:    v_cmp_lt_i16_e32 vcc, s4, v1
-; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX8CHECK-NEXT:    v_cmp_lt_i16_e32 vcc, s4, v4
 ; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX8CHECK-NEXT:    v_cmp_lt_i16_e32 vcc, s4, v2
+; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
 ; GFX8CHECK-NEXT:    v_cmp_lt_i16_e32 vcc, s4, v3
 ; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
 ; GFX8CHECK-NEXT:    s_setpc_b64 s[30:31]
@@ -1051,56 +1045,49 @@ define <4 x i1> @isnan_v4bf16(<4 x bfloat> %x) nounwind {
 ; GFX9CHECK-LABEL: isnan_v4bf16:
 ; GFX9CHECK:       ; %bb.0:
 ; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9CHECK-NEXT:    s_movk_i32 s4, 0x7fff
-; GFX9CHECK-NEXT:    s_movk_i32 s5, 0x7f80
-; GFX9CHECK-NEXT:    v_and_b32_sdwa v4, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
-; GFX9CHECK-NEXT:    v_and_b32_sdwa v3, v1, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9CHECK-NEXT:    v_and_b32_e32 v1, 0x7fff, v1
-; GFX9CHECK-NEXT:    v_cmp_lt_i16_e32 vcc, s5, v0
+; GFX9CHECK-NEXT:    v_and_b32_e32 v3, 0x7fff7fff, v1
+; GFX9CHECK-NEXT:    s_movk_i32 s6, 0x7f80
+; GFX9CHECK-NEXT:    v_and_b32_e32 v1, 0x7fff7fff, v0
+; GFX9CHECK-NEXT:    v_cmp_lt_i16_e32 vcc, s6, v1
+; GFX9CHECK-NEXT:    v_cmp_gt_i16_sdwa s[4:5], v1, s6 src0_sel:WORD_1 src1_sel:DWORD
 ; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
-; GFX9CHECK-NEXT:    v_cmp_lt_i16_e32 vcc, s5, v1
+; GFX9CHECK-NEXT:    v_cmp_lt_i16_e32 vcc, s6, v3
+; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[4:5]
+; GFX9CHECK-NEXT:    v_cmp_gt_i16_sdwa s[4:5], v3, s6 src0_sel:WORD_1 src1_sel:DWORD
 ; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX9CHECK-NEXT:    v_cmp_lt_i16_e32 vcc, s5, v4
-; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
-; GFX9CHECK-NEXT:    v_cmp_lt_i16_e32 vcc, s5, v3
-; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
+; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s[4:5]
 ; GFX9CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10CHECK-LABEL: isnan_v4bf16:
 ; GFX10CHECK:       ; %bb.0:
 ; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10CHECK-NEXT:    v_and_b32_e32 v3, 0x7fff, v0
-; GFX10CHECK-NEXT:    v_mov_b32_e32 v2, 0x7fff
-; GFX10CHECK-NEXT:    v_and_b32_e32 v4, 0x7fff, v1
+; GFX10CHECK-NEXT:    v_and_b32_e32 v3, 0x7fff7fff, v0
+; GFX10CHECK-NEXT:    v_mov_b32_e32 v5, 0x7f80
+; GFX10CHECK-NEXT:    v_and_b32_e32 v4, 0x7fff7fff, v1
 ; GFX10CHECK-NEXT:    v_cmp_lt_i16_e32 vcc_lo, 0x7f80, v3
-; GFX10CHECK-NEXT:    v_and_b32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX10CHECK-NEXT:    v_and_b32_sdwa v1, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10CHECK-NEXT:    v_cmp_gt_i16_sdwa s4, v3, v5 src0_sel:WORD_1 src1_sel:DWORD
 ; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
 ; GFX10CHECK-NEXT:    v_cmp_lt_i16_e32 vcc_lo, 0x7f80, v4
+; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s4
+; GFX10CHECK-NEXT:    v_cmp_gt_i16_sdwa s4, v4, v5 src0_sel:WORD_1 src1_sel:DWORD
 ; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc_lo
-; GFX10CHECK-NEXT:    v_cmp_lt_i16_e32 vcc_lo, 0x7f80, v1
-; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
-; GFX10CHECK-NEXT:    v_cmp_lt_i16_e32 vcc_lo, 0x7f80, v5
-; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc_lo
+; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s4
 ; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11CHECK-LABEL: isnan_v4bf16:
 ; GFX11CHECK:       ; %bb.0:
 ; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11CHECK-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
-; GFX11CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
-; GFX11CHECK-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
-; GFX11CHECK-NEXT:    v_and_b32_e32 v1, 0x7fff, v1
-; GFX11CHECK-NEXT:    v_and_b32_e32 v3, 0x7fff, v3
+; GFX11CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff7fff, v0
+; GFX11CHECK-NEXT:    v_and_b32_e32 v1, 0x7fff7fff, v1
 ; GFX11CHECK-NEXT:    v_cmp_lt_i16_e32 vcc_lo, 0x7f80, v0
-; GFX11CHECK-NEXT:    v_and_b32_e32 v4, 0x7fff, v2
+; GFX11CHECK-NEXT:    v_lshrrev_b32_e32 v4, 16, v0
+; GFX11CHECK-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
 ; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
 ; GFX11CHECK-NEXT:    v_cmp_lt_i16_e32 vcc_lo, 0x7f80, v1
 ; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc_lo
-; GFX11CHECK-NEXT:    v_cmp_lt_i16_e32 vcc_lo, 0x7f80, v3
-; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
 ; GFX11CHECK-NEXT:    v_cmp_lt_i16_e32 vcc_lo, 0x7f80, v4
+; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
+; GFX11CHECK-NEXT:    v_cmp_lt_i16_e32 vcc_lo, 0x7f80, v3
 ; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc_lo
 ; GFX11CHECK-NEXT:    s_setpc_b64 s[30:31]
   %1 = call <4 x i1> @llvm.is.fpclass.v4bf16(<4 x bfloat> %x, i32 3)  ; nan
diff --git a/llvm/test/CodeGen/AMDGPU/select-undef.ll b/llvm/test/CodeGen/AMDGPU/select-undef.ll
index 8ca33dce73f880..5b9866a3c91571 100644
--- a/llvm/test/CodeGen/AMDGPU/select-undef.ll
+++ b/llvm/test/CodeGen/AMDGPU/select-undef.ll
@@ -303,9 +303,8 @@ ret:
   ret void
 }
 
-; FIXME: This shouldn't have the 0 initialization
 ; GCN-LABEL: {{^}}undef_v3bf16:
-; GCN: v_mov_b32_e32 v{{[0-9]+}}, 0{{$}}
+; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0{{$}}
 ; GCN-NOT: s_mov_b32 s{{[0-9]+}}, 0{{$}}
 ; GCN: s_cbranch_vccnz
 define amdgpu_kernel void @undef_v3bf16(ptr addrspace(3) %ptr, i1 %cond) {
diff --git a/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll b/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll
index dfe98bbbaddf9e..d76bb48b4a82a3 100644
--- a/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll
+++ b/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll
@@ -2703,30 +2703,30 @@ define <4 x bfloat> @shuffle_v4bf16_234u(ptr addrspace(1) %arg0, ptr addrspace(1
 ; GFX9-LABEL: shuffle_v4bf16_234u:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v4, v[0:1], off offset:4
-; GFX9-NEXT:    global_load_dword v5, v[2:3], off
+; GFX9-NEXT:    global_load_dword v6, v[0:1], off offset:4
+; GFX9-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off
 ; GFX9-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-NEXT:    v_mov_b32_e32 v0, v4
+; GFX9-NEXT:    v_mov_b32_e32 v0, v6
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v1, v5
+; GFX9-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v4bf16_234u:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v4, v[0:1], off offset:4
-; GFX10-NEXT:    global_load_dword v5, v[2:3], off
+; GFX10-NEXT:    global_load_dword v6, v[0:1], off offset:4
+; GFX10-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(1)
-; GFX10-NEXT:    v_mov_b32_e32 v0, v4
+; GFX10-NEXT:    v_mov_b32_e32 v0, v6
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v1, v5
+; GFX10-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: shuffle_v4bf16_234u:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b32 v0, v[0:1], off offset:4
-; GFX11-NEXT:    global_load_b32 v1, v[2:3], off
+; GFX11-NEXT:    global_load_b64 v[1:2], v[2:3], off
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0
@@ -2935,32 +2935,35 @@ define <4 x bfloat> @shuffle_v4bf16_357u(ptr addrspace(1) %arg0, ptr addrspace(1
 ; GFX9-LABEL: shuffle_v4bf16_357u:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v6, v[0:1], off offset:4
 ; GFX9-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off
+; GFX9-NEXT:    global_load_dword v6, v[0:1], off offset:4
 ; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
+; GFX9-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-NEXT:    v_alignbit_b32 v1, s4, v5, 16
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_perm_b32 v0, v4, v6, s4
-; GFX9-NEXT:    v_alignbit_b32 v1, s4, v5, 16
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v4bf16_357u:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v6, v[0:1], off offset:4
 ; GFX10-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off
+; GFX10-NEXT:    global_load_dword v6, v[0:1], off offset:4
+; GFX10-NEXT:    s_waitcnt vmcnt(1)
+; GFX10-NEXT:    v_alignbit_b32 v1, s4, v5, 16
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_perm_b32 v0, v4, v6, 0x7060302
-; GFX10-NEXT:    v_alignbit_b32 v1, s4, v5, 16
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: shuffle_v4bf16_357u:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    global_load_b32 v4, v[0:1], off offset:4
-; GFX11-NEXT:    global_load_b64 v[0:1], v[2:3], off
+; GFX11-NEXT:    global_load_b64 v[2:3], v[2:3], off
+; GFX11-NEXT:    global_load_b32 v0, v[0:1], off offset:4
+; GFX11-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-NEXT:    v_alignbit_b32 v1, s0, v3, 16
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_perm_b32 v0, v0, v4, 0x7060302
-; GFX11-NEXT:    v_alignbit_b32 v1, s0, v1, 16
+; GFX11-NEXT:    v_perm_b32 v0, v2, v0, 0x7060302
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0
   %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1
@@ -2972,24 +2975,29 @@ define <4 x bfloat> @shuffle_v4bf16_0101(ptr addrspace(1) %arg0, ptr addrspace(1
 ; GFX9-LABEL: shuffle_v4bf16_0101:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v0, v[0:1], off
+; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
+; GFX9-NEXT:    s_mov_b32 s4, 0xffff
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_bfi_b32 v0, s4, v0, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v4bf16_0101:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v0, v[0:1], off
+; GFX10-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_bfi_b32 v0, 0xffff, v0, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: shuffle_v4bf16_0101:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    global_load_b32 v0, v[0:1], off
+; GFX11-NEXT:    global_load_b64 v[0:1], v[0:1], off
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_bfi_b32 v0, 0xffff, v0, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0
@@ -3029,31 +3037,39 @@ define <4 x bfloat> @shuffle_v4bf16_0145(ptr addrspace(1) %arg0, ptr addrspace(1
 ; GFX9-LABEL: shuffle_v4bf16_0145:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v4, v[0:1], off
-; GFX9-NEXT:    global_load_dword v5, v[2:3], off
+; GFX9-NEXT:    global_load_dwordx2 v[4:5], v[0:1], off
+; GFX9-NEXT:    global_load_dwordx2 v[5:6], v[2:3], off
+; GFX9-NEXT:    s_mov_b32 s4, 0xffff
+; GFX9-NEXT:    ; kill: killed $vgpr0 killed $vgpr1
+; GFX9-NEXT:    ; kill: killed $vgpr2 killed $vgpr3
 ; GFX9-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-NEXT:    v_mov_b32_e32 v0, v4
+; GFX9-NEXT:    v_bfi_b32 v0, s4, v4, v4
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v1, v5
+; GFX9-NEXT:    v_bfi_b32 v1, s4, v5, v5
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v4bf16_0145:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v4, v[0:1], off
-; GFX10-NEXT:    global_load_dword v5, v[2:3], off
+; GFX10-NEXT:    global_load_dwordx2 v[4:5], v[0:1], off
+; GFX10-NEXT:    global_load_dwordx2 v[5:6], v[2:3], off
+; GFX10-NEXT:    ; kill: killed $vgpr0 killed $vgpr1
+; GFX10-NEXT:    ; kill: killed $vgpr2 killed $vgpr3
 ; GFX10-NEXT:    s_waitcnt vmcnt(1)
-; GFX10-NEXT:    v_mov_b32_e32 v0, v4
+; GFX10-NEXT:    v_bfi_b32 v0, 0xffff, v4, v4
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v1, v5
+; GFX10-NEXT:    v_bfi_b32 v1, 0xffff, v5, v5
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: shuffle_v4bf16_0145:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    global_load_b32 v0, v[0:1], off
-; GFX11-NEXT:    global_load_b32 v1, v[2:3], off
+; GFX11-NEXT:    global_load_b64 v[0:1], v[0:1], off
+; GFX11-NEXT:    global_load_b64 v[1:2], v[2:3], off
+; GFX11-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-NEXT:    v_bfi_b32 v0, 0xffff, v0, v0
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_bfi_b32 v1, 0xffff, v1, v1
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0
   %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1
@@ -3065,30 +3081,33 @@ define <4 x bfloat> @shuffle_v4bf16_0167(ptr addrspace(1) %arg0, ptr addrspace(1
 ; GFX9-LABEL: shuffle_v4bf16_0167:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v4, v[0:1], off
-; GFX9-NEXT:    global_load_dword v5, v[2:3], off offset:4
+; GFX9-NEXT:    global_load_dwordx2 v[5:6], v[0:1], off
+; GFX9-NEXT:    global_load_dword v4, v[2:3], off offset:4
+; GFX9-NEXT:    s_mov_b32 s4, 0xffff
 ; GFX9-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-NEXT:    v_mov_b32_e32 v0, v4
+; GFX9-NEXT:    v_bfi_b32 v0, s4, v5, v5
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v1, v5
+; GFX9-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v4bf16_0167:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v4, v[0:1], off
-; GFX10-NEXT:    global_load_dword v5, v[2:3], off offset:4
+; GFX10-NEXT:    global_load_dwordx2 v[5:6], v[0:1], off
+; GFX10-NEXT:    global_load_dword v4, v[2:3], off offset:4
 ; GFX10-NEXT:    s_waitcnt vmcnt(1)
-; GFX10-NEXT:    v_mov_b32_e32 v0, v4
+; GFX10-NEXT:    v_bfi_b32 v0, 0xffff, v5, v5
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v1, v5
+; GFX10-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: shuffle_v4bf16_0167:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    global_load_b32 v0, v[0:1], off
+; GFX11-NEXT:    global_load_b64 v[0:1], v[0:1], off
 ; GFX11-NEXT:    global_load_b32 v1, v[2:3], off offset:4
+; GFX11-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-NEXT:    v_bfi_b32 v0, 0xffff, v0, v0
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0
@@ -3102,7 +3121,9 @@ define <4 x bfloat> @shuffle_v4bf16_2301(ptr addrspace(1) %arg0, ptr addrspace(1
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dwordx2 v[1:2], v[0:1], off
+; GFX9-NEXT:    s_mov_b32 s4, 0xffff
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_bfi_b32 v1, s4, v1, v1
 ; GFX9-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -3111,6 +3132,7 @@ define <4 x bfloat> @shuffle_v4bf16_2301(ptr addrspace(1) %arg0, ptr addrspace(1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dwordx2 v[1:2], v[0:1], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_bfi_b32 v1, 0xffff, v1, v1
 ; GFX10-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -3119,6 +3141,7 @@ define <4 x bfloat> @shuffle_v4bf16_2301(ptr addrspace(1) %arg0, ptr addrspace(1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b64 v[1:2], v[0:1], off
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_bfi_b32 v1, 0xffff, v1, v1
 ; GFX11-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0
@@ -3161,30 +3184,33 @@ define <4 x bfloat> @shuffle_v4bf16_2345(ptr addrspace(1) %arg0, ptr addrspace(1
 ; GFX9-LABEL: shuffle_v4bf16_2345:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dwordx2 v[5:6], v[2:3], off
 ; GFX9-NEXT:    global_load_dword v4, v[0:1], off offset:4
-; GFX9-NEXT:    global_load_dword v5, v[2:3], off
+; GFX9-NEXT:    s_mov_b32 s4, 0xffff
 ; GFX9-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-NEXT:    v_mov_b32_e32 v0, v4
+; GFX9-NEXT:    v_bfi_b32 v1, s4, v5, v5
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v1, v5
+; GFX9-NEXT:    v_mov_b32_e32 v0, v4
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v4bf16_2345:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dwordx2 v[5:6], v[2:3], off
 ; GFX10-NEXT:    global_load_dword v4, v[0:1], off offset:4
-; GFX10-NEXT:    global_load_dword v5, v[2:3], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(1)
-; GFX10-NEXT:    v_mov_b32_e32 v0, v4
+; GFX10-NEXT:    v_bfi_b32 v1, 0xffff, v5, v5
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v1, v5
+; GFX10-NEXT:    v_mov_b32_e32 v0, v4
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: shuffle_v4bf16_2345:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    global_load_b64 v[2:3], v[2:3], off
 ; GFX11-NEXT:    global_load_b32 v0, v[0:1], off offset:4
-; GFX11-NEXT:    global_load_b32 v1, v[2:3], off
+; GFX11-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-NEXT:    v_bfi_b32 v1, 0xffff, v2, v2
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0
@@ -3233,33 +3259,39 @@ define <4 x bfloat> @shuffle_v4bf16_4501(ptr addrspace(1) %arg0, ptr addrspace(1
 ; GFX9-LABEL: shuffle_v4bf16_4501:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v4, v[2:3], off
-; GFX9-NEXT:    global_load_dword v5, v[0:1], off
+; GFX9-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off
+; GFX9-NEXT:    global_load_dwordx2 v[5:6], v[0:1], off
+; GFX9-NEXT:    s_mov_b32 s4, 0xffff
+; GFX9-NEXT:    ; kill: killed $vgpr0 killed $vgpr1
+; GFX9-NEXT:    ; kill: killed $vgpr2 killed $vgpr3
 ; GFX9-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-NEXT:    v_mov_b32_e32 v0, v4
+; GFX9-NEXT:    v_bfi_b32 v0, s4, v4, v4
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v1, v5
+; GFX9-NEXT:    v_bfi_b32 v1, s4, v5, v5
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v4bf16_4501:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v4, v[2:3], off
-; GFX10-NEXT:    global_load_dword v5, v[0:1], off
+; GFX10-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off
+; GFX10-NEXT:    global_load_dwordx2 v[5:6], v[0:1], off
+; GFX10-NEXT:    ; kill: killed $vgpr0 killed $vgpr1
+; GFX10-NEXT:    ; kill: killed $vgpr2 killed $vgpr3
 ; GFX10-NEXT:    s_waitcnt vmcnt(1)
-; GFX10-NEXT:    v_mov_b32_e32 v0, v4
+; GFX10-NEXT:    v_bfi_b32 v0, 0xffff, v4, v4
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v1, v5
+; GFX10-NEXT:    v_bfi_b32 v1, 0xffff, v5, v5
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: shuffle_v4bf16_4501:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    global_load_b32 v2, v[2:3], off
-; GFX11-NEXT:    global_load_b32 v1, v[0:1], off
+; GFX11-NEXT:    global_load_b64 v[2:3], v[2:3], off
+; GFX11-NEXT:    global_load_b64 v[3:4], v[0:1], off
 ; GFX11-NEXT:    s_waitcnt vmcnt(1)
-; GFX11-NEXT:    v_mov_b32_e32 v0, v2
+; GFX11-NEXT:    v_bfi_b32 v0, 0xffff, v2, v2
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_bfi_b32 v1, 0xffff, v3, v3
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0
   %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1
@@ -3271,32 +3303,33 @@ define <4 x bfloat> @shuffle_v4bf16_4523(ptr addrspace(1) %arg0, ptr addrspace(1
 ; GFX9-LABEL: shuffle_v4bf16_4523:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v4, v[2:3], off
-; GFX9-NEXT:    global_load_dword v5, v[0:1], off offset:4
+; GFX9-NEXT:    global_load_dwordx2 v[5:6], v[2:3], off
+; GFX9-NEXT:    global_load_dword v4, v[0:1], off offset:4
+; GFX9-NEXT:    s_mov_b32 s4, 0xffff
 ; GFX9-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-NEXT:    v_mov_b32_e32 v0, v4
+; GFX9-NEXT:    v_bfi_b32 v0, s4, v5, v5
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v1, v5
+; GFX9-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v4bf16_4523:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v4, v[2:3], off
-; GFX10-NEXT:    global_load_dword v5, v[0:1], off offset:4
+; GFX10-NEXT:    global_load_dwordx2 v[5:6], v[2:3], off
+; GFX10-NEXT:    global_load_dword v4, v[0:1], off offset:4
 ; GFX10-NEXT:    s_waitcnt vmcnt(1)
-; GFX10-NEXT:    v_mov_b32_e32 v0, v4
+; GFX10-NEXT:    v_bfi_b32 v0, 0xffff, v5, v5
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v1, v5
+; GFX10-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: shuffle_v4bf16_4523:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    global_load_b32 v2, v[2:3], off
+; GFX11-NEXT:    global_load_b64 v[2:3], v[2:3], off
 ; GFX11-NEXT:    global_load_b32 v1, v[0:1], off offset:4
 ; GFX11-NEXT:    s_waitcnt vmcnt(1)
-; GFX11-NEXT:    v_mov_b32_e32 v0, v2
+; GFX11-NEXT:    v_bfi_b32 v0, 0xffff, v2, v2
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0
@@ -3309,24 +3342,29 @@ define <4 x bfloat> @shuffle_v4bf16_4545(ptr addrspace(1) %arg0, ptr addrspace(1
 ; GFX9-LABEL: shuffle_v4bf16_4545:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v0, v[2:3], off
+; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[2:3], off
+; GFX9-NEXT:    s_mov_b32 s4, 0xffff
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_bfi_b32 v0, s4, v0, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v4bf16_4545:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v0, v[2:3], off
+; GFX10-NEXT:    global_load_dwordx2 v[0:1], v[2:3], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_bfi_b32 v0, 0xffff, v0, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: shuffle_v4bf16_4545:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    global_load_b32 v0, v[2:3], off
+; GFX11-NEXT:    global_load_b64 v[0:1], v[2:3], off
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_bfi_b32 v0, 0xffff, v0, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0
@@ -3366,32 +3404,33 @@ define <4 x bfloat> @shuffle_v4bf16_6701(ptr addrspace(1) %arg0, ptr addrspace(1
 ; GFX9-LABEL: shuffle_v4bf16_6701:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dwordx2 v[5:6], v[0:1], off
 ; GFX9-NEXT:    global_load_dword v4, v[2:3], off offset:4
-; GFX9-NEXT:    global_load_dword v5, v[0:1], off
+; GFX9-NEXT:    s_mov_b32 s4, 0xffff
 ; GFX9-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-NEXT:    v_mov_b32_e32 v0, v4
+; GFX9-NEXT:    v_bfi_b32 v1, s4, v5, v5
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v1, v5
+; GFX9-NEXT:    v_mov_b32_e32 v0, v4
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v4bf16_6701:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dwordx2 v[5:6], v[0:1], off
 ; GFX10-NEXT:    global_load_dword v4, v[2:3], off offset:4
-; GFX10-NEXT:    global_load_dword v5, v[0:1], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(1)
-; GFX10-NEXT:    v_mov_b32_e32 v0, v4
+; GFX10-NEXT:    v_bfi_b32 v1, 0xffff, v5, v5
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v1, v5
+; GFX10-NEXT:    v_mov_b32_e32 v0, v4
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: shuffle_v4bf16_6701:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    global_load_b32 v2, v[2:3], off offset:4
-; GFX11-NEXT:    global_load_b32 v1, v[0:1], off
+; GFX11-NEXT:    global_load_b64 v[4:5], v[0:1], off
+; GFX11-NEXT:    global_load_b32 v0, v[2:3], off offset:4
 ; GFX11-NEXT:    s_waitcnt vmcnt(1)
-; GFX11-NEXT:    v_mov_b32_e32 v0, v2
+; GFX11-NEXT:    v_bfi_b32 v1, 0xffff, v4, v4
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0
@@ -3443,7 +3482,9 @@ define <4 x bfloat> @shuffle_v4bf16_6745(ptr addrspace(1) %arg0, ptr addrspace(1
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dwordx2 v[1:2], v[2:3], off
+; GFX9-NEXT:    s_mov_b32 s4, 0xffff
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_bfi_b32 v1, s4, v1, v1
 ; GFX9-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -3452,6 +3493,7 @@ define <4 x bfloat> @shuffle_v4bf16_6745(ptr addrspace(1) %arg0, ptr addrspace(1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dwordx2 v[1:2], v[2:3], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_bfi_b32 v1, 0xffff, v1, v1
 ; GFX10-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -3460,6 +3502,7 @@ define <4 x bfloat> @shuffle_v4bf16_6745(ptr addrspace(1) %arg0, ptr addrspace(1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b64 v[1:2], v[2:3], off
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_bfi_b32 v1, 0xffff, v1, v1
 ; GFX11-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0
@@ -3578,31 +3621,34 @@ define <4 x bfloat> @shuffle_v4bf16_3456(ptr addrspace(1) %arg0, ptr addrspace(1
 ; GFX9-LABEL: shuffle_v4bf16_3456:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v6, v[0:1], off offset:4
 ; GFX9-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off
+; GFX9-NEXT:    global_load_dword v6, v[0:1], off offset:4
+; GFX9-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-NEXT:    v_alignbit_b32 v1, v5, v4, 16
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_alignbit_b32 v0, v4, v6, 16
-; GFX9-NEXT:    v_alignbit_b32 v1, v5, v4, 16
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v4bf16_3456:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v6, v[0:1], off offset:4
 ; GFX10-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off
+; GFX10-NEXT:    global_load_dword v6, v[0:1], off offset:4
+; GFX10-NEXT:    s_waitcnt vmcnt(1)
+; GFX10-NEXT:    v_alignbit_b32 v1, v5, v4, 16
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_alignbit_b32 v0, v4, v6, 16
-; GFX10-NEXT:    v_alignbit_b32 v1, v5, v4, 16
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: shuffle_v4bf16_3456:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    global_load_b64 v[2:3], v[2:3], off
 ; GFX11-NEXT:    global_load_b32 v0, v[0:1], off offset:4
-; GFX11-NEXT:    global_load_b64 v[1:2], v[2:3], off
+; GFX11-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-NEXT:    v_alignbit_b32 v1, v3, v2, 16
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_alignbit_b32 v0, v1, v0, 16
-; GFX11-NEXT:    v_alignbit_b32 v1, v2, v1, 16
+; GFX11-NEXT:    v_alignbit_b32 v0, v2, v0, 16
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0
   %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1
@@ -3693,7 +3739,7 @@ define <4 x bfloat> @shuffle_v4bf16_0000(ptr addrspace(1) %arg0, ptr addrspace(1
 ; GFX9-LABEL: shuffle_v4bf16_0000:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v0, v[0:1], off
+; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
 ; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_perm_b32 v0, v0, v0, s4
@@ -3703,7 +3749,7 @@ define <4 x bfloat> @shuffle_v4bf16_0000(ptr addrspace(1) %arg0, ptr addrspace(1
 ; GFX10-LABEL: shuffle_v4bf16_0000:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v0, v[0:1], off
+; GFX10-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_perm_b32 v0, v0, v0, 0x5040100
 ; GFX10-NEXT:    v_mov_b32_e32 v1, v0
@@ -3712,7 +3758,7 @@ define <4 x bfloat> @shuffle_v4bf16_0000(ptr addrspace(1) %arg0, ptr addrspace(1
 ; GFX11-LABEL: shuffle_v4bf16_0000:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    global_load_b32 v0, v[0:1], off
+; GFX11-NEXT:    global_load_b64 v[0:1], v[0:1], off
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_perm_b32 v0, v0, v0, 0x5040100
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
@@ -3728,7 +3774,7 @@ define <4 x bfloat> @shuffle_v4bf16_1010(ptr addrspace(1) %arg0, ptr addrspace(1
 ; GFX9-LABEL: shuffle_v4bf16_1010:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v0, v[0:1], off
+; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_alignbit_b32 v0, v0, v0, 16
 ; GFX9-NEXT:    v_mov_b32_e32 v1, v0
@@ -3737,7 +3783,7 @@ define <4 x bfloat> @shuffle_v4bf16_1010(ptr addrspace(1) %arg0, ptr addrspace(1
 ; GFX10-LABEL: shuffle_v4bf16_1010:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v0, v[0:1], off
+; GFX10-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_alignbit_b32 v0, v0, v0, 16
 ; GFX10-NEXT:    v_mov_b32_e32 v1, v0
@@ -3746,7 +3792,7 @@ define <4 x bfloat> @shuffle_v4bf16_1010(ptr addrspace(1) %arg0, ptr addrspace(1
 ; GFX11-LABEL: shuffle_v4bf16_1010:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    global_load_b32 v0, v[0:1], off
+; GFX11-NEXT:    global_load_b64 v[0:1], v[0:1], off
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_alignbit_b32 v0, v0, v0, 16
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
@@ -3762,7 +3808,7 @@ define <4 x bfloat> @shuffle_v4bf16_1100(ptr addrspace(1) %arg0, ptr addrspace(1
 ; GFX9-LABEL: shuffle_v4bf16_1100:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v[0:1], off
+; GFX9-NEXT:    global_load_dwordx2 v[1:2], v[0:1], off
 ; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
 ; GFX9-NEXT:    s_mov_b32 s5, 0x5040100
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
@@ -3773,7 +3819,7 @@ define <4 x bfloat> @shuffle_v4bf16_1100(ptr addrspace(1) %arg0, ptr addrspace(1
 ; GFX10-LABEL: shuffle_v4bf16_1100:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v1, v[0:1], off
+; GFX10-NEXT:    global_load_dwordx2 v[1:2], v[0:1], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_perm_b32 v0, v1, v1, 0x7060302
 ; GFX10-NEXT:    v_perm_b32 v1, v1, v1, 0x5040100
@@ -3782,7 +3828,7 @@ define <4 x bfloat> @shuffle_v4bf16_1100(ptr addrspace(1) %arg0, ptr addrspace(1
 ; GFX11-LABEL: shuffle_v4bf16_1100:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    global_load_b32 v1, v[0:1], off
+; GFX11-NEXT:    global_load_b64 v[1:2], v[0:1], off
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_perm_b32 v0, v1, v1, 0x7060302
 ; GFX11-NEXT:    v_perm_b32 v1, v1, v1, 0x5040100
@@ -3897,24 +3943,29 @@ define <4 x bfloat> @shuffle_v8bf16_0101(ptr addrspace(1) %arg0, ptr addrspace(1
 ; GFX9-LABEL: shuffle_v8bf16_0101:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v0, v[0:1], off
+; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
+; GFX9-NEXT:    s_mov_b32 s4, 0xffff
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_bfi_b32 v0, s4, v0, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v8bf16_0101:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v0, v[0:1], off
+; GFX10-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_bfi_b32 v0, 0xffff, v0, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: shuffle_v8bf16_0101:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    global_load_b32 v0, v[0:1], off
+; GFX11-NEXT:    global_load_b64 v[0:1], v[0:1], off
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_bfi_b32 v0, 0xffff, v0, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %val0 = load <8 x bfloat>, ptr addrspace(1) %arg0
@@ -3954,31 +4005,39 @@ define <4 x bfloat> @shuffle_v8bf16_4589(ptr addrspace(1) %arg0, ptr addrspace(1
 ; GFX9-LABEL: shuffle_v8bf16_4589:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v4, v[0:1], off offset:8
-; GFX9-NEXT:    global_load_dword v5, v[2:3], off
+; GFX9-NEXT:    global_load_dwordx2 v[4:5], v[0:1], off offset:8
+; GFX9-NEXT:    global_load_dwordx2 v[5:6], v[2:3], off
+; GFX9-NEXT:    s_mov_b32 s4, 0xffff
+; GFX9-NEXT:    ; kill: killed $vgpr0 killed $vgpr1
+; GFX9-NEXT:    ; kill: killed $vgpr2 killed $vgpr3
 ; GFX9-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-NEXT:    v_mov_b32_e32 v0, v4
+; GFX9-NEXT:    v_bfi_b32 v0, s4, v4, v4
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v1, v5
+; GFX9-NEXT:    v_bfi_b32 v1, s4, v5, v5
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v8bf16_4589:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v4, v[0:1], off offset:8
-; GFX10-NEXT:    global_load_dword v5, v[2:3], off
+; GFX10-NEXT:    global_load_dwordx2 v[4:5], v[0:1], off offset:8
+; GFX10-NEXT:    global_load_dwordx2 v[5:6], v[2:3], off
+; GFX10-NEXT:    ; kill: killed $vgpr0 killed $vgpr1
+; GFX10-NEXT:    ; kill: killed $vgpr2 killed $vgpr3
 ; GFX10-NEXT:    s_waitcnt vmcnt(1)
-; GFX10-NEXT:    v_mov_b32_e32 v0, v4
+; GFX10-NEXT:    v_bfi_b32 v0, 0xffff, v4, v4
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v1, v5
+; GFX10-NEXT:    v_bfi_b32 v1, 0xffff, v5, v5
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: shuffle_v8bf16_4589:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    global_load_b32 v0, v[0:1], off offset:8
-; GFX11-NEXT:    global_load_b32 v1, v[2:3], off
+; GFX11-NEXT:    global_load_b64 v[0:1], v[0:1], off offset:8
+; GFX11-NEXT:    global_load_b64 v[1:2], v[2:3], off
+; GFX11-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-NEXT:    v_bfi_b32 v0, 0xffff, v0, v0
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_bfi_b32 v1, 0xffff, v1, v1
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %val0 = load <8 x bfloat>, ptr addrspace(1) %arg0
   %val1 = load <8 x bfloat>, ptr addrspace(1) %arg1
@@ -4067,9 +4126,11 @@ define <4 x bfloat> @shuffle_v3bf16_0122(ptr addrspace(1) %arg0, ptr addrspace(1
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
-; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
+; GFX9-NEXT:    s_mov_b32 s4, 0xffff
+; GFX9-NEXT:    s_mov_b32 s5, 0x5040100
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_perm_b32 v1, v1, v1, s4
+; GFX9-NEXT:    v_bfi_b32 v0, s4, v0, v0
+; GFX9-NEXT:    v_perm_b32 v1, v1, v1, s5
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v3bf16_0122:
@@ -4077,6 +4138,7 @@ define <4 x bfloat> @shuffle_v3bf16_0122(ptr addrspace(1) %arg0, ptr addrspace(1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_bfi_b32 v0, 0xffff, v0, v0
 ; GFX10-NEXT:    v_perm_b32 v1, v1, v1, 0x5040100
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -4085,6 +4147,7 @@ define <4 x bfloat> @shuffle_v3bf16_0122(ptr addrspace(1) %arg0, ptr addrspace(1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b64 v[0:1], v[0:1], off
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_bfi_b32 v0, 0xffff, v0, v0
 ; GFX11-NEXT:    v_perm_b32 v1, v1, v1, 0x5040100
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %val0 = load <3 x bfloat>, ptr addrspace(1) %arg0
@@ -4190,27 +4253,27 @@ define amdgpu_kernel void @fma_shuffle_v2bf16(ptr addrspace(1) nocapture readonl
 ; GFX9-NEXT:    v_and_b32_e32 v9, 0xffff0000, v4
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX9-NEXT:    v_lshlrev_b32_e32 v11, 16, v3
-; GFX9-NEXT:    v_lshlrev_b32_e32 v12, 16, v1
-; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX9-NEXT:    v_and_b32_e32 v11, 0xffff0000, v1
+; GFX9-NEXT:    v_lshlrev_b32_e32 v12, 16, v3
+; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX9-NEXT:    v_fma_f32 v7, v8, v9, v7
 ; GFX9-NEXT:    v_fma_f32 v0, v8, v4, v0
-; GFX9-NEXT:    v_fma_f32 v4, v11, v4, v12
-; GFX9-NEXT:    v_fma_f32 v1, v11, v9, v1
+; GFX9-NEXT:    v_fma_f32 v8, v12, v9, v11
+; GFX9-NEXT:    v_fma_f32 v1, v12, v4, v1
 ; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v10, 16, v5
 ; GFX9-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
 ; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX9-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
+; GFX9-NEXT:    v_and_b32_e32 v4, 0xffff0000, v7
 ; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX9-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GFX9-NEXT:    v_and_b32_e32 v7, 0xffff0000, v8
 ; GFX9-NEXT:    v_fma_f32 v0, v2, v10, v0
-; GFX9-NEXT:    v_fma_f32 v2, v2, v5, v7
-; GFX9-NEXT:    v_fma_f32 v1, v3, v5, v1
-; GFX9-NEXT:    v_fma_f32 v3, v3, v10, v4
+; GFX9-NEXT:    v_fma_f32 v2, v2, v5, v4
+; GFX9-NEXT:    v_fma_f32 v1, v3, v10, v1
+; GFX9-NEXT:    v_fma_f32 v3, v3, v5, v7
+; GFX9-NEXT:    v_perm_b32 v1, v3, v1, s0
 ; GFX9-NEXT:    v_perm_b32 v0, v2, v0, s0
-; GFX9-NEXT:    v_perm_b32 v1, v1, v3, s0
 ; GFX9-NEXT:    global_store_dwordx2 v6, v[0:1], s[6:7]
 ; GFX9-NEXT:    s_endpgm
 ;
@@ -4233,27 +4296,27 @@ define amdgpu_kernel void @fma_shuffle_v2bf16(ptr addrspace(1) nocapture readonl
 ; GFX10-NEXT:    v_and_b32_e32 v9, 0xffff0000, v4
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX10-NEXT:    v_lshlrev_b32_e32 v11, 16, v3
-; GFX10-NEXT:    v_and_b32_e32 v12, 0xffff0000, v1
-; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX10-NEXT:    v_lshlrev_b32_e32 v10, 16, v3
+; GFX10-NEXT:    v_lshlrev_b32_e32 v11, 16, v1
+; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX10-NEXT:    v_fmac_f32_e32 v7, v8, v9
 ; GFX10-NEXT:    v_fmac_f32_e32 v0, v8, v4
 ; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
-; GFX10-NEXT:    v_fmac_f32_e32 v12, v11, v9
-; GFX10-NEXT:    v_fmac_f32_e32 v1, v11, v4
-; GFX10-NEXT:    v_lshlrev_b32_e32 v10, 16, v5
+; GFX10-NEXT:    v_fmac_f32_e32 v11, v10, v4
+; GFX10-NEXT:    v_fmac_f32_e32 v1, v10, v9
+; GFX10-NEXT:    v_lshlrev_b32_e32 v12, 16, v5
 ; GFX10-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
 ; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
 ; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX10-NEXT:    v_and_b32_e32 v4, 0xffff0000, v7
-; GFX10-NEXT:    v_and_b32_e32 v7, 0xffff0000, v12
+; GFX10-NEXT:    v_and_b32_e32 v4, 0xffff0000, v11
 ; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX10-NEXT:    v_fmac_f32_e32 v0, v2, v10
-; GFX10-NEXT:    v_fmac_f32_e32 v4, v2, v5
-; GFX10-NEXT:    v_fmac_f32_e32 v7, v3, v5
-; GFX10-NEXT:    v_fmac_f32_e32 v1, v3, v10
-; GFX10-NEXT:    v_perm_b32 v0, v4, v0, 0x7060302
-; GFX10-NEXT:    v_perm_b32 v1, v7, v1, 0x7060302
+; GFX10-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
+; GFX10-NEXT:    v_fmac_f32_e32 v0, v2, v12
+; GFX10-NEXT:    v_fmac_f32_e32 v4, v3, v12
+; GFX10-NEXT:    v_fmac_f32_e32 v1, v3, v5
+; GFX10-NEXT:    v_fmac_f32_e32 v7, v2, v5
+; GFX10-NEXT:    v_perm_b32 v1, v1, v4, 0x7060302
+; GFX10-NEXT:    v_perm_b32 v0, v7, v0, 0x7060302
 ; GFX10-NEXT:    global_store_dwordx2 v6, v[0:1], s[6:7]
 ; GFX10-NEXT:    s_endpgm
 ;
@@ -4269,36 +4332,36 @@ define amdgpu_kernel void @fma_shuffle_v2bf16(ptr addrspace(1) nocapture readonl
 ; GFX11-NEXT:    global_load_b64 v[2:3], v6, s[0:1]
 ; GFX11-NEXT:    global_load_b64 v[4:5], v6, s[2:3]
 ; GFX11-NEXT:    s_waitcnt vmcnt(2)
-; GFX11-NEXT:    v_and_b32_e32 v12, 0xffff0000, v1
-; GFX11-NEXT:    s_waitcnt vmcnt(1)
-; GFX11-NEXT:    v_lshlrev_b32_e32 v11, 16, v3
+; GFX11-NEXT:    v_lshlrev_b32_e32 v11, 16, v1
+; GFX11-NEXT:    v_and_b32_e32 v7, 0xffff0000, v0
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_lshlrev_b32_e32 v10, 16, v5
 ; GFX11-NEXT:    v_and_b32_e32 v9, 0xffff0000, v4
-; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    v_lshlrev_b32_e32 v12, 16, v5
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
+; GFX11-NEXT:    v_lshlrev_b32_e32 v10, 16, v3
 ; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
-; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX11-NEXT:    v_lshlrev_b32_e32 v8, 16, v2
-; GFX11-NEXT:    v_and_b32_e32 v7, 0xffff0000, v0
-; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_dual_fmac_f32 v1, v11, v4 :: v_dual_and_b32 v2, 0xffff0000, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_fmac_f32_e32 v1, v10, v9
 ; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_dual_fmac_f32 v1, v3, v10 :: v_dual_fmac_f32 v0, v8, v4
+; GFX11-NEXT:    v_dual_fmac_f32 v1, v3, v5 :: v_dual_lshlrev_b32 v4, 16, v4
+; GFX11-NEXT:    v_dual_fmac_f32 v11, v10, v4 :: v_dual_lshlrev_b32 v8, 16, v2
+; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_dual_fmac_f32 v7, v8, v9 :: v_dual_and_b32 v2, 0xffff0000, v2
+; GFX11-NEXT:    v_fmac_f32_e32 v0, v8, v4
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_and_b32_e32 v4, 0xffff0000, v11
+; GFX11-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_fmac_f32_e32 v0, v2, v10
-; GFX11-NEXT:    v_fmac_f32_e32 v12, v11, v9
-; GFX11-NEXT:    v_fmac_f32_e32 v7, v8, v9
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xffff0000, v7
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_dual_fmac_f32 v4, v2, v5 :: v_dual_and_b32 v7, 0xffff0000, v12
-; GFX11-NEXT:    v_fmac_f32_e32 v7, v3, v5
+; GFX11-NEXT:    v_dual_fmac_f32 v4, v3, v12 :: v_dual_fmac_f32 v7, v2, v5
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_perm_b32 v0, v4, v0, 0x7060302
-; GFX11-NEXT:    v_perm_b32 v1, v7, v1, 0x7060302
+; GFX11-NEXT:    v_fmac_f32_e32 v0, v2, v12
+; GFX11-NEXT:    v_perm_b32 v1, v1, v4, 0x7060302
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT:    v_perm_b32 v0, v7, v0, 0x7060302
 ; GFX11-NEXT:    global_store_b64 v6, v[0:1], s[4:5]
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -4336,28 +4399,32 @@ define <4 x bfloat> @shuffle_v4bf16_0456(ptr addrspace(1) %arg0, ptr addrspace(1
 ; GFX9-LABEL: shuffle_v4bf16_0456:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v6, v[0:1], off
-; GFX9-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off
+; GFX9-NEXT:    global_load_dwordx2 v[4:5], v[0:1], off
+; GFX9-NEXT:    global_load_dwordx2 v[5:6], v[2:3], off
 ; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
+; GFX9-NEXT:    ; kill: killed $vgpr0 killed $vgpr1
+; GFX9-NEXT:    ; kill: killed $vgpr2 killed $vgpr3
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_perm_b32 v0, v4, v6, s4
-; GFX9-NEXT:    v_alignbit_b32 v1, v5, v4, 16
+; GFX9-NEXT:    v_perm_b32 v0, v5, v4, s4
+; GFX9-NEXT:    v_alignbit_b32 v1, v6, v5, 16
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v4bf16_0456:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v6, v[0:1], off
-; GFX10-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off
+; GFX10-NEXT:    global_load_dwordx2 v[4:5], v[0:1], off
+; GFX10-NEXT:    global_load_dwordx2 v[5:6], v[2:3], off
+; GFX10-NEXT:    ; kill: killed $vgpr0 killed $vgpr1
+; GFX10-NEXT:    ; kill: killed $vgpr2 killed $vgpr3
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_perm_b32 v0, v4, v6, 0x5040100
-; GFX10-NEXT:    v_alignbit_b32 v1, v5, v4, 16
+; GFX10-NEXT:    v_perm_b32 v0, v5, v4, 0x5040100
+; GFX10-NEXT:    v_alignbit_b32 v1, v6, v5, 16
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: shuffle_v4bf16_0456:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    global_load_b32 v0, v[0:1], off
+; GFX11-NEXT:    global_load_b64 v[0:1], v[0:1], off
 ; GFX11-NEXT:    global_load_b64 v[1:2], v[2:3], off
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100

>From 584120967a4d717a150dd7c75d575d343d52ded2 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault at amd.com>
Date: Thu, 4 Jan 2024 23:25:14 +0700
Subject: [PATCH 4/4] Update select-undef.ll

---
 llvm/test/CodeGen/AMDGPU/select-undef.ll | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/llvm/test/CodeGen/AMDGPU/select-undef.ll b/llvm/test/CodeGen/AMDGPU/select-undef.ll
index 8ca33dce73f880..5b9866a3c91571 100644
--- a/llvm/test/CodeGen/AMDGPU/select-undef.ll
+++ b/llvm/test/CodeGen/AMDGPU/select-undef.ll
@@ -303,9 +303,8 @@ ret:
   ret void
 }
 
-; FIXME: This shouldn't have the 0 initialization
 ; GCN-LABEL: {{^}}undef_v3bf16:
-; GCN: v_mov_b32_e32 v{{[0-9]+}}, 0{{$}}
+; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0{{$}}
 ; GCN-NOT: s_mov_b32 s{{[0-9]+}}, 0{{$}}
 ; GCN: s_cbranch_vccnz
 define amdgpu_kernel void @undef_v3bf16(ptr addrspace(3) %ptr, i1 %cond) {



More information about the cfe-commits mailing list