[llvm] [GISel] Explicitly disable BF16 tablegen patterns. (PR #124113)

David Green via llvm-commits llvm-commits at lists.llvm.org
Fri Jan 24 03:52:28 PST 2025


https://github.com/davemgreen updated https://github.com/llvm/llvm-project/pull/124113

>From e33932c51bc66801103bc0af9fbf2464411c16e3 Mon Sep 17 00:00:00 2001
From: David Green <david.green at arm.com>
Date: Fri, 24 Jan 2025 11:51:25 +0000
Subject: [PATCH] [GISel] Explicitly disable BF16 tablegen patterns and
 codegen.

We currently have an issue where bf16 patterns can be used to match fp16 types,
as GISel does not know about the difference between the two types. This patch
explicitly disables them to make sure that they are never used.

The opposite can also happen, where fp16 patterns are used for operators
that should be bf16. So any operations with bf16 types now cause a fallback to
SDAG. For the moment this includes data-processing only instructions (loads,
stores, shuffles, etc.).

The pass setup for GISel has been slightly adjusted to make sure that a verify
pass does not get added between AMD-SDAG and SIFixSGPRCopiesPass, which
otherwise can cause verifier issues when falling back.
---
 llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp  |   13 +-
 llvm/lib/CodeGen/TargetPassConfig.cpp         |   13 +-
 .../test/CodeGen/AArch64/bf16-instructions.ll | 1672 +++++---------
 llvm/test/CodeGen/AArch64/concat-vector.ll    |    4 +-
 llvm/test/CodeGen/AArch64/dup.ll              |   51 +-
 llvm/test/CodeGen/AArch64/fptrunc.ll          |    1 +
 .../AMDGPU/GlobalISel/atomic_load_flat.ll     |   61 +-
 .../AMDGPU/GlobalISel/atomic_load_global.ll   |   18 +-
 .../AMDGPU/GlobalISel/atomic_load_local_2.ll  |    9 +-
 .../AMDGPU/GlobalISel/irtranslate-bf16.ll     |  161 +-
 .../AMDGPU/GlobalISel/irtranslator-call.ll    |   50 +-
 .../GlobalISel/irtranslator-function-args.ll  |   24 +-
 .../GlobalISel/llvm.amdgcn.set.inactive.ll    |   36 +-
 ...ffer-fat-pointers-contents-legalization.ll |    7 +-
 .../CodeGen/AMDGPU/dagcombine-fmul-sel.ll     | 2034 +++++++----------
 .../test/CodeGen/AMDGPU/fp-atomics-gfx1200.ll |   14 +-
 llvm/test/CodeGen/AMDGPU/fp-atomics-gfx950.ll |   14 +-
 llvm/test/CodeGen/AMDGPU/fptoi.i128.ll        |  918 ++------
 .../CodeGen/AMDGPU/isel-amdgpu-cs-chain-cc.ll |   48 +-
 .../isel-amdgpu-cs-chain-preserve-cc.ll       |   46 +-
 .../llvm.amdgcn.cvt.scalef32.pk.gfx950.ll     |  198 +-
 .../AMDGPU/llvm.amdgcn.cvt.scalef32.pk.ll     |  264 +--
 .../AMDGPU/llvm.amdgcn.cvt.scalef32.sr.ll     |    2 +-
 .../llvm.amdgcn.cvt.scalef32.sr.pk.gfx950.ll  |    2 +-
 .../AMDGPU/llvm.amdgcn.cvt.scalef32.sr.pk.ll  |  252 +-
 .../test/CodeGen/AMDGPU/llvm.amdgcn.cvt.sr.ll |    2 +-
 .../AMDGPU/llvm.amdgcn.ds.read.tr.gfx950.ll   |    8 +-
 .../AMDGPU/llvm.amdgcn.fdot2.f32.bf16.ll      |   28 +-
 .../AMDGPU/llvm.amdgcn.global.load.tr-w32.ll  |    2 +-
 .../AMDGPU/llvm.amdgcn.global.load.tr-w64.ll  |    2 +-
 .../AMDGPU/llvm.amdgcn.image.atomic.pk.add.ll |   32 +-
 .../CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll |  248 +-
 .../CodeGen/AMDGPU/llvm.amdgcn.mov.dpp8.ll    |    6 +-
 .../CodeGen/AMDGPU/llvm.amdgcn.permlane.ll    |    6 +-
 ...mdgcn.ptr.buffer.atomic.fadd_rtn_errors.ll |   18 +-
 .../AMDGPU/llvm.amdgcn.readfirstlane.ll       |    2 +-
 .../CodeGen/AMDGPU/llvm.amdgcn.readlane.ll    |    2 +-
 .../AMDGPU/llvm.amdgcn.smfmac.gfx950.ll       | 1110 +++------
 .../AMDGPU/llvm.amdgcn.update.dpp.gfx90a.ll   |    6 +-
 .../CodeGen/AMDGPU/llvm.amdgcn.writelane.ll   |   10 +-
 .../RISCV/GlobalISel/irtranslator/vec-ret.ll  |  100 +-
 llvm/utils/TableGen/GlobalISelEmitter.cpp     |   21 +
 42 files changed, 2483 insertions(+), 5032 deletions(-)

diff --git a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
index f668e41094bbc8..2c46c6a6b768b6 100644
--- a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
@@ -1553,10 +1553,6 @@ bool IRTranslator::translateBitCast(const User &U,
 
 bool IRTranslator::translateCast(unsigned Opcode, const User &U,
                                  MachineIRBuilder &MIRBuilder) {
-  if (U.getType()->getScalarType()->isBFloatTy() ||
-      U.getOperand(0)->getType()->getScalarType()->isBFloatTy())
-    return false;
-
   uint32_t Flags = 0;
   if (const Instruction *I = dyn_cast<Instruction>(&U))
     Flags = MachineInstr::copyFlagsFromInstruction(*I);
@@ -3618,6 +3614,15 @@ bool IRTranslator::translate(const Instruction &Inst) {
   CurBuilder->setPCSections(Inst.getMetadata(LLVMContext::MD_pcsections));
   CurBuilder->setMMRAMetadata(Inst.getMetadata(LLVMContext::MD_mmra));
 
+  // BF16 cannot currently be represented by LLT, to avoid miscompiles we
+  // prevent any instructions using them. FIXME: This can be removed once LLT
+  // supports bfloat.
+  if (Inst.getType()->getScalarType()->isBFloatTy() ||
+      any_of(Inst.operands(), [](Value *V) {
+        return V->getType()->getScalarType()->isBFloatTy();
+      }))
+    return false;
+
   if (TLI->fallBackToDAGISel(Inst))
     return false;
 
diff --git a/llvm/lib/CodeGen/TargetPassConfig.cpp b/llvm/lib/CodeGen/TargetPassConfig.cpp
index d8d9f38da3eae0..847a1aef39c565 100644
--- a/llvm/lib/CodeGen/TargetPassConfig.cpp
+++ b/llvm/lib/CodeGen/TargetPassConfig.cpp
@@ -1017,7 +1017,7 @@ bool TargetPassConfig::addCoreISelPasses() {
   if (Selector != SelectorType::GlobalISel || !isGlobalISelAbortEnabled())
     DebugifyIsSafe = false;
 
-  // Add instruction selector passes.
+  // Add instruction selector passes for global isel if enabled.
   if (Selector == SelectorType::GlobalISel) {
     SaveAndRestore SavedAddingMachinePasses(AddingMachinePasses, true);
     if (addIRTranslator())
@@ -1043,15 +1043,14 @@ bool TargetPassConfig::addCoreISelPasses() {
     // Pass to reset the MachineFunction if the ISel failed.
     addPass(createResetMachineFunctionPass(
         reportDiagnosticWhenGlobalISelFallback(), isGlobalISelAbortEnabled()));
+  }
 
-    // Provide a fallback path when we do not want to abort on
-    // not-yet-supported input.
-    if (!isGlobalISelAbortEnabled() && addInstSelector())
+  // Run the SDAG InstSelector, providing a fallback path when we do not want to
+  // abort on not-yet-supported input.
+  if (Selector != SelectorType::GlobalISel || !isGlobalISelAbortEnabled())
+    if (addInstSelector())
       return true;
 
-  } else if (addInstSelector())
-    return true;
-
   // Expand pseudo-instructions emitted by ISel. Don't run the verifier before
   // FinalizeISel.
   addPass(&FinalizeISelID);
diff --git a/llvm/test/CodeGen/AArch64/bf16-instructions.ll b/llvm/test/CodeGen/AArch64/bf16-instructions.ll
index ecf64ecbbd3fff..36c7e13e8a503a 100644
--- a/llvm/test/CodeGen/AArch64/bf16-instructions.ll
+++ b/llvm/test/CodeGen/AArch64/bf16-instructions.ll
@@ -3,7 +3,39 @@
 ; RUN: llc < %s -mtriple aarch64-unknown-unknown -mattr=+bf16 | FileCheck %s --check-prefixes=CHECK,CHECK-BF16,CHECK-SD
 ; RUN: llc < %s -mtriple aarch64-unknown-unknown -mattr=+bf16,+fullfp16 -global-isel -global-isel-abort=2 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-BF16,CHECK-GI
 
-; CHECK-GI:       warning: Instruction selection used fallback path for test_fptosi_i32
+; CHECK-GI:       warning: Instruction selection used fallback path for test_fadd
+; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for test_fsub
+; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for test_fmul
+; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for test_fmadd
+; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for test_fdiv
+; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for test_frem
+; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for test_store
+; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for test_load
+; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for test_call
+; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for test_call_flipped
+; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for test_tailcall_flipped
+; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for test_select
+; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for test_select_cc
+; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for test_select_cc_f32_f16
+; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for test_select_cc_f16_f32
+; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for test_fcmp_une
+; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for test_fcmp_ueq
+; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for test_fcmp_ugt
+; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for test_fcmp_uge
+; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for test_fcmp_ult
+; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for test_fcmp_ule
+; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for test_fcmp_uno
+; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for test_fcmp_one
+; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for test_fcmp_oeq
+; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for test_fcmp_ogt
+; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for test_fcmp_oge
+; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for test_fcmp_olt
+; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for test_fcmp_ole
+; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for test_fcmp_ord
+; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for test_fccmp
+; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for test_br_cc
+; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for test_phi
+; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for test_fptosi_i32
 ; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for test_fptosi_i64
 ; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for test_fptoui_i32
 ; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for test_fptoui_i64
@@ -17,9 +49,42 @@
 ; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for test_fptrunc_double
 ; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for test_fpext_float
 ; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for test_fpext_double
+; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for test_bitcast_bfloattoi16
+; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for test_bitcast_i16tobfloat
+; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for test_sqrt
+; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for test_powi
+; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for test_sin
+; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for test_cos
+; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for test_tan
+; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for test_acos
+; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for test_asin
+; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for test_atan
+; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for test_atan2
+; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for test_cosh
+; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for test_sinh
+; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for test_tanh
+; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for test_pow
+; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for test_exp
+; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for test_exp2
+; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for test_log
+; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for test_log10
+; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for test_log2
+; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for test_fma
+; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for test_fabs
+; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for test_minnum
+; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for test_maxnum
+; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for test_copysign
 ; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for test_copysign_f32
 ; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for test_copysign_f64
 ; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for test_copysign_extended
+; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for test_floor
+; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for test_ceil
+; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for test_trunc
+; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for test_rint
+; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for test_nearbyint
+; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for test_round
+; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for test_roundeven
+; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for test_fmuladd
 
 define bfloat @test_fadd(bfloat %a, bfloat %b) #0 {
 ; CHECK-CVT-LABEL: test_fadd:
@@ -39,20 +104,15 @@ define bfloat @test_fadd(bfloat %a, bfloat %b) #0 {
 ; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 killed $s0
 ; CHECK-CVT-NEXT:    ret
 ;
-; CHECK-SD-LABEL: test_fadd:
-; CHECK-SD:       // %bb.0:
-; CHECK-SD-NEXT:    // kill: def $h1 killed $h1 def $d1
-; CHECK-SD-NEXT:    // kill: def $h0 killed $h0 def $d0
-; CHECK-SD-NEXT:    shll v1.4s, v1.4h, #16
-; CHECK-SD-NEXT:    shll v0.4s, v0.4h, #16
-; CHECK-SD-NEXT:    fadd s0, s0, s1
-; CHECK-SD-NEXT:    bfcvt h0, s0
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: test_fadd:
-; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    fadd h0, h0, h1
-; CHECK-GI-NEXT:    ret
+; CHECK-BF16-LABEL: test_fadd:
+; CHECK-BF16:       // %bb.0:
+; CHECK-BF16-NEXT:    // kill: def $h1 killed $h1 def $d1
+; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-BF16-NEXT:    shll v1.4s, v1.4h, #16
+; CHECK-BF16-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-BF16-NEXT:    fadd s0, s0, s1
+; CHECK-BF16-NEXT:    bfcvt h0, s0
+; CHECK-BF16-NEXT:    ret
   %r = fadd bfloat %a, %b
   ret bfloat %r
 }
@@ -75,20 +135,15 @@ define bfloat @test_fsub(bfloat %a, bfloat %b) #0 {
 ; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 killed $s0
 ; CHECK-CVT-NEXT:    ret
 ;
-; CHECK-SD-LABEL: test_fsub:
-; CHECK-SD:       // %bb.0:
-; CHECK-SD-NEXT:    // kill: def $h1 killed $h1 def $d1
-; CHECK-SD-NEXT:    // kill: def $h0 killed $h0 def $d0
-; CHECK-SD-NEXT:    shll v1.4s, v1.4h, #16
-; CHECK-SD-NEXT:    shll v0.4s, v0.4h, #16
-; CHECK-SD-NEXT:    fsub s0, s0, s1
-; CHECK-SD-NEXT:    bfcvt h0, s0
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: test_fsub:
-; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    fsub h0, h0, h1
-; CHECK-GI-NEXT:    ret
+; CHECK-BF16-LABEL: test_fsub:
+; CHECK-BF16:       // %bb.0:
+; CHECK-BF16-NEXT:    // kill: def $h1 killed $h1 def $d1
+; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-BF16-NEXT:    shll v1.4s, v1.4h, #16
+; CHECK-BF16-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-BF16-NEXT:    fsub s0, s0, s1
+; CHECK-BF16-NEXT:    bfcvt h0, s0
+; CHECK-BF16-NEXT:    ret
   %r = fsub bfloat %a, %b
   ret bfloat %r
 }
@@ -111,20 +166,15 @@ define bfloat @test_fmul(bfloat %a, bfloat %b) #0 {
 ; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 killed $s0
 ; CHECK-CVT-NEXT:    ret
 ;
-; CHECK-SD-LABEL: test_fmul:
-; CHECK-SD:       // %bb.0:
-; CHECK-SD-NEXT:    // kill: def $h1 killed $h1 def $d1
-; CHECK-SD-NEXT:    // kill: def $h0 killed $h0 def $d0
-; CHECK-SD-NEXT:    shll v1.4s, v1.4h, #16
-; CHECK-SD-NEXT:    shll v0.4s, v0.4h, #16
-; CHECK-SD-NEXT:    fmul s0, s0, s1
-; CHECK-SD-NEXT:    bfcvt h0, s0
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: test_fmul:
-; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    fmul h0, h0, h1
-; CHECK-GI-NEXT:    ret
+; CHECK-BF16-LABEL: test_fmul:
+; CHECK-BF16:       // %bb.0:
+; CHECK-BF16-NEXT:    // kill: def $h1 killed $h1 def $d1
+; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-BF16-NEXT:    shll v1.4s, v1.4h, #16
+; CHECK-BF16-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-BF16-NEXT:    fmul s0, s0, s1
+; CHECK-BF16-NEXT:    bfcvt h0, s0
+; CHECK-BF16-NEXT:    ret
   %r = fmul bfloat %a, %b
   ret bfloat %r
 }
@@ -157,25 +207,20 @@ define bfloat @test_fmadd(bfloat %a, bfloat %b, bfloat %c) #0 {
 ; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 killed $s0
 ; CHECK-CVT-NEXT:    ret
 ;
-; CHECK-SD-LABEL: test_fmadd:
-; CHECK-SD:       // %bb.0:
-; CHECK-SD-NEXT:    // kill: def $h1 killed $h1 def $d1
-; CHECK-SD-NEXT:    // kill: def $h0 killed $h0 def $d0
-; CHECK-SD-NEXT:    // kill: def $h2 killed $h2 def $d2
-; CHECK-SD-NEXT:    shll v1.4s, v1.4h, #16
-; CHECK-SD-NEXT:    shll v0.4s, v0.4h, #16
-; CHECK-SD-NEXT:    fmul s0, s0, s1
-; CHECK-SD-NEXT:    shll v1.4s, v2.4h, #16
-; CHECK-SD-NEXT:    bfcvt h0, s0
-; CHECK-SD-NEXT:    shll v0.4s, v0.4h, #16
-; CHECK-SD-NEXT:    fadd s0, s0, s1
-; CHECK-SD-NEXT:    bfcvt h0, s0
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: test_fmadd:
-; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    fmadd h0, h0, h1, h2
-; CHECK-GI-NEXT:    ret
+; CHECK-BF16-LABEL: test_fmadd:
+; CHECK-BF16:       // %bb.0:
+; CHECK-BF16-NEXT:    // kill: def $h1 killed $h1 def $d1
+; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-BF16-NEXT:    // kill: def $h2 killed $h2 def $d2
+; CHECK-BF16-NEXT:    shll v1.4s, v1.4h, #16
+; CHECK-BF16-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-BF16-NEXT:    fmul s0, s0, s1
+; CHECK-BF16-NEXT:    shll v1.4s, v2.4h, #16
+; CHECK-BF16-NEXT:    bfcvt h0, s0
+; CHECK-BF16-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-BF16-NEXT:    fadd s0, s0, s1
+; CHECK-BF16-NEXT:    bfcvt h0, s0
+; CHECK-BF16-NEXT:    ret
   %mul = fmul fast bfloat %a, %b
   %r = fadd fast bfloat %mul, %c
   ret bfloat %r
@@ -199,20 +244,15 @@ define bfloat @test_fdiv(bfloat %a, bfloat %b) #0 {
 ; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 killed $s0
 ; CHECK-CVT-NEXT:    ret
 ;
-; CHECK-SD-LABEL: test_fdiv:
-; CHECK-SD:       // %bb.0:
-; CHECK-SD-NEXT:    // kill: def $h1 killed $h1 def $d1
-; CHECK-SD-NEXT:    // kill: def $h0 killed $h0 def $d0
-; CHECK-SD-NEXT:    shll v1.4s, v1.4h, #16
-; CHECK-SD-NEXT:    shll v0.4s, v0.4h, #16
-; CHECK-SD-NEXT:    fdiv s0, s0, s1
-; CHECK-SD-NEXT:    bfcvt h0, s0
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: test_fdiv:
-; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    fdiv h0, h0, h1
-; CHECK-GI-NEXT:    ret
+; CHECK-BF16-LABEL: test_fdiv:
+; CHECK-BF16:       // %bb.0:
+; CHECK-BF16-NEXT:    // kill: def $h1 killed $h1 def $d1
+; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-BF16-NEXT:    shll v1.4s, v1.4h, #16
+; CHECK-BF16-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-BF16-NEXT:    fdiv s0, s0, s1
+; CHECK-BF16-NEXT:    bfcvt h0, s0
+; CHECK-BF16-NEXT:    ret
   %r = fdiv bfloat %a, %b
   ret bfloat %r
 }
@@ -239,29 +279,19 @@ define bfloat @test_frem(bfloat %a, bfloat %b) #0 {
 ; CHECK-CVT-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
 ; CHECK-CVT-NEXT:    ret
 ;
-; CHECK-SD-LABEL: test_frem:
-; CHECK-SD:       // %bb.0:
-; CHECK-SD-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-SD-NEXT:    // kill: def $h1 killed $h1 def $d1
-; CHECK-SD-NEXT:    // kill: def $h0 killed $h0 def $d0
-; CHECK-SD-NEXT:    shll v0.4s, v0.4h, #16
-; CHECK-SD-NEXT:    shll v1.4s, v1.4h, #16
-; CHECK-SD-NEXT:    // kill: def $s0 killed $s0 killed $q0
-; CHECK-SD-NEXT:    // kill: def $s1 killed $s1 killed $q1
-; CHECK-SD-NEXT:    bl fmodf
-; CHECK-SD-NEXT:    bfcvt h0, s0
-; CHECK-SD-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: test_frem:
-; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-GI-NEXT:    fcvt s0, h0
-; CHECK-GI-NEXT:    fcvt s1, h1
-; CHECK-GI-NEXT:    bl fmodf
-; CHECK-GI-NEXT:    bfcvt h0, s0
-; CHECK-GI-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
-; CHECK-GI-NEXT:    ret
+; CHECK-BF16-LABEL: test_frem:
+; CHECK-BF16:       // %bb.0:
+; CHECK-BF16-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-BF16-NEXT:    // kill: def $h1 killed $h1 def $d1
+; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-BF16-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-BF16-NEXT:    shll v1.4s, v1.4h, #16
+; CHECK-BF16-NEXT:    // kill: def $s0 killed $s0 killed $q0
+; CHECK-BF16-NEXT:    // kill: def $s1 killed $s1 killed $q1
+; CHECK-BF16-NEXT:    bl fmodf
+; CHECK-BF16-NEXT:    bfcvt h0, s0
+; CHECK-BF16-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-BF16-NEXT:    ret
   %r = frem bfloat %a, %b
   ret bfloat %r
 }
@@ -343,14 +373,8 @@ define bfloat @test_select(bfloat %a, bfloat %b, i1 zeroext %c) #0 {
 ;
 ; CHECK-GI-LABEL: test_select:
 ; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-GI-NEXT:    // kill: def $h1 killed $h1 def $s1
-; CHECK-GI-NEXT:    fmov w8, s0
-; CHECK-GI-NEXT:    fmov w9, s1
-; CHECK-GI-NEXT:    tst w0, #0x1
-; CHECK-GI-NEXT:    csel w8, w8, w9, ne
-; CHECK-GI-NEXT:    fmov s0, w8
-; CHECK-GI-NEXT:    // kill: def $h0 killed $h0 killed $s0
+; CHECK-GI-NEXT:    cmp w0, #0
+; CHECK-GI-NEXT:    fcsel h0, h0, h1, ne
 ; CHECK-GI-NEXT:    ret
   %r = select i1 %c, bfloat %a, bfloat %b
   ret bfloat %r
@@ -385,14 +409,12 @@ define bfloat @test_select_cc(bfloat %a, bfloat %b, bfloat %c, bfloat %d) #0 {
 ;
 ; CHECK-GI-LABEL: test_select_cc:
 ; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-GI-NEXT:    // kill: def $h1 killed $h1 def $s1
-; CHECK-GI-NEXT:    fcmp h2, h3
-; CHECK-GI-NEXT:    fmov w8, s0
-; CHECK-GI-NEXT:    fmov w9, s1
-; CHECK-GI-NEXT:    csel w8, w8, w9, ne
-; CHECK-GI-NEXT:    fmov s0, w8
-; CHECK-GI-NEXT:    // kill: def $h0 killed $h0 killed $s0
+; CHECK-GI-NEXT:    // kill: def $h3 killed $h3 def $d3
+; CHECK-GI-NEXT:    // kill: def $h2 killed $h2 def $d2
+; CHECK-GI-NEXT:    shll v3.4s, v3.4h, #16
+; CHECK-GI-NEXT:    shll v2.4s, v2.4h, #16
+; CHECK-GI-NEXT:    fcmp s2, s3
+; CHECK-GI-NEXT:    fcsel h0, h0, h1, ne
 ; CHECK-GI-NEXT:    ret
   %cc = fcmp une bfloat %c, %d
   %r = select i1 %cc, bfloat %a, bfloat %b
@@ -400,31 +422,15 @@ define bfloat @test_select_cc(bfloat %a, bfloat %b, bfloat %c, bfloat %d) #0 {
 }
 
 define float @test_select_cc_f32_f16(float %a, float %b, bfloat %c, bfloat %d) #0 {
-; CHECK-CVT-LABEL: test_select_cc_f32_f16:
-; CHECK-CVT:       // %bb.0:
-; CHECK-CVT-NEXT:    // kill: def $h3 killed $h3 def $d3
-; CHECK-CVT-NEXT:    // kill: def $h2 killed $h2 def $d2
-; CHECK-CVT-NEXT:    shll v3.4s, v3.4h, #16
-; CHECK-CVT-NEXT:    shll v2.4s, v2.4h, #16
-; CHECK-CVT-NEXT:    fcmp s2, s3
-; CHECK-CVT-NEXT:    fcsel s0, s0, s1, ne
-; CHECK-CVT-NEXT:    ret
-;
-; CHECK-SD-LABEL: test_select_cc_f32_f16:
-; CHECK-SD:       // %bb.0:
-; CHECK-SD-NEXT:    // kill: def $h3 killed $h3 def $d3
-; CHECK-SD-NEXT:    // kill: def $h2 killed $h2 def $d2
-; CHECK-SD-NEXT:    shll v3.4s, v3.4h, #16
-; CHECK-SD-NEXT:    shll v2.4s, v2.4h, #16
-; CHECK-SD-NEXT:    fcmp s2, s3
-; CHECK-SD-NEXT:    fcsel s0, s0, s1, ne
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: test_select_cc_f32_f16:
-; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    fcmp h2, h3
-; CHECK-GI-NEXT:    fcsel s0, s0, s1, ne
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: test_select_cc_f32_f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $h3 killed $h3 def $d3
+; CHECK-NEXT:    // kill: def $h2 killed $h2 def $d2
+; CHECK-NEXT:    shll v3.4s, v3.4h, #16
+; CHECK-NEXT:    shll v2.4s, v2.4h, #16
+; CHECK-NEXT:    fcmp s2, s3
+; CHECK-NEXT:    fcsel s0, s0, s1, ne
+; CHECK-NEXT:    ret
   %cc = fcmp une bfloat %c, %d
   %r = select i1 %cc, float %a, float %b
   ret float %r
@@ -451,14 +457,8 @@ define bfloat @test_select_cc_f16_f32(bfloat %a, bfloat %b, float %c, float %d)
 ;
 ; CHECK-GI-LABEL: test_select_cc_f16_f32:
 ; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-GI-NEXT:    // kill: def $h1 killed $h1 def $s1
 ; CHECK-GI-NEXT:    fcmp s2, s3
-; CHECK-GI-NEXT:    fmov w8, s0
-; CHECK-GI-NEXT:    fmov w9, s1
-; CHECK-GI-NEXT:    csel w8, w8, w9, ne
-; CHECK-GI-NEXT:    fmov s0, w8
-; CHECK-GI-NEXT:    // kill: def $h0 killed $h0 killed $s0
+; CHECK-GI-NEXT:    fcsel h0, h0, h1, ne
 ; CHECK-GI-NEXT:    ret
   %cc = fcmp une float %c, %d
   %r = select i1 %cc, bfloat %a, bfloat %b
@@ -466,429 +466,199 @@ define bfloat @test_select_cc_f16_f32(bfloat %a, bfloat %b, float %c, float %d)
 }
 
 define i1 @test_fcmp_une(bfloat %a, bfloat %b) #0 {
-; CHECK-CVT-LABEL: test_fcmp_une:
-; CHECK-CVT:       // %bb.0:
-; CHECK-CVT-NEXT:    // kill: def $h1 killed $h1 def $d1
-; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $d0
-; CHECK-CVT-NEXT:    shll v1.4s, v1.4h, #16
-; CHECK-CVT-NEXT:    shll v0.4s, v0.4h, #16
-; CHECK-CVT-NEXT:    fcmp s0, s1
-; CHECK-CVT-NEXT:    cset w0, ne
-; CHECK-CVT-NEXT:    ret
-;
-; CHECK-SD-LABEL: test_fcmp_une:
-; CHECK-SD:       // %bb.0:
-; CHECK-SD-NEXT:    // kill: def $h1 killed $h1 def $d1
-; CHECK-SD-NEXT:    // kill: def $h0 killed $h0 def $d0
-; CHECK-SD-NEXT:    shll v1.4s, v1.4h, #16
-; CHECK-SD-NEXT:    shll v0.4s, v0.4h, #16
-; CHECK-SD-NEXT:    fcmp s0, s1
-; CHECK-SD-NEXT:    cset w0, ne
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: test_fcmp_une:
-; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    fcmp h0, h1
-; CHECK-GI-NEXT:    cset w0, ne
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: test_fcmp_une:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $h1 killed $h1 def $d1
+; CHECK-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-NEXT:    shll v1.4s, v1.4h, #16
+; CHECK-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-NEXT:    fcmp s0, s1
+; CHECK-NEXT:    cset w0, ne
+; CHECK-NEXT:    ret
   %r = fcmp une bfloat %a, %b
   ret i1 %r
 }
 
 define i1 @test_fcmp_ueq(bfloat %a, bfloat %b) #0 {
-; CHECK-CVT-LABEL: test_fcmp_ueq:
-; CHECK-CVT:       // %bb.0:
-; CHECK-CVT-NEXT:    // kill: def $h1 killed $h1 def $d1
-; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $d0
-; CHECK-CVT-NEXT:    shll v1.4s, v1.4h, #16
-; CHECK-CVT-NEXT:    shll v0.4s, v0.4h, #16
-; CHECK-CVT-NEXT:    fcmp s0, s1
-; CHECK-CVT-NEXT:    cset w8, eq
-; CHECK-CVT-NEXT:    csinc w0, w8, wzr, vc
-; CHECK-CVT-NEXT:    ret
-;
-; CHECK-SD-LABEL: test_fcmp_ueq:
-; CHECK-SD:       // %bb.0:
-; CHECK-SD-NEXT:    // kill: def $h1 killed $h1 def $d1
-; CHECK-SD-NEXT:    // kill: def $h0 killed $h0 def $d0
-; CHECK-SD-NEXT:    shll v1.4s, v1.4h, #16
-; CHECK-SD-NEXT:    shll v0.4s, v0.4h, #16
-; CHECK-SD-NEXT:    fcmp s0, s1
-; CHECK-SD-NEXT:    cset w8, eq
-; CHECK-SD-NEXT:    csinc w0, w8, wzr, vc
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: test_fcmp_ueq:
-; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    fcmp h0, h1
-; CHECK-GI-NEXT:    cset w8, eq
-; CHECK-GI-NEXT:    cset w9, vs
-; CHECK-GI-NEXT:    orr w0, w8, w9
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: test_fcmp_ueq:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $h1 killed $h1 def $d1
+; CHECK-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-NEXT:    shll v1.4s, v1.4h, #16
+; CHECK-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-NEXT:    fcmp s0, s1
+; CHECK-NEXT:    cset w8, eq
+; CHECK-NEXT:    csinc w0, w8, wzr, vc
+; CHECK-NEXT:    ret
   %r = fcmp ueq bfloat %a, %b
   ret i1 %r
 }
 
 define i1 @test_fcmp_ugt(bfloat %a, bfloat %b) #0 {
-; CHECK-CVT-LABEL: test_fcmp_ugt:
-; CHECK-CVT:       // %bb.0:
-; CHECK-CVT-NEXT:    // kill: def $h1 killed $h1 def $d1
-; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $d0
-; CHECK-CVT-NEXT:    shll v1.4s, v1.4h, #16
-; CHECK-CVT-NEXT:    shll v0.4s, v0.4h, #16
-; CHECK-CVT-NEXT:    fcmp s0, s1
-; CHECK-CVT-NEXT:    cset w0, hi
-; CHECK-CVT-NEXT:    ret
-;
-; CHECK-SD-LABEL: test_fcmp_ugt:
-; CHECK-SD:       // %bb.0:
-; CHECK-SD-NEXT:    // kill: def $h1 killed $h1 def $d1
-; CHECK-SD-NEXT:    // kill: def $h0 killed $h0 def $d0
-; CHECK-SD-NEXT:    shll v1.4s, v1.4h, #16
-; CHECK-SD-NEXT:    shll v0.4s, v0.4h, #16
-; CHECK-SD-NEXT:    fcmp s0, s1
-; CHECK-SD-NEXT:    cset w0, hi
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: test_fcmp_ugt:
-; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    fcmp h0, h1
-; CHECK-GI-NEXT:    cset w0, hi
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: test_fcmp_ugt:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $h1 killed $h1 def $d1
+; CHECK-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-NEXT:    shll v1.4s, v1.4h, #16
+; CHECK-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-NEXT:    fcmp s0, s1
+; CHECK-NEXT:    cset w0, hi
+; CHECK-NEXT:    ret
   %r = fcmp ugt bfloat %a, %b
   ret i1 %r
 }
 
 define i1 @test_fcmp_uge(bfloat %a, bfloat %b) #0 {
-; CHECK-CVT-LABEL: test_fcmp_uge:
-; CHECK-CVT:       // %bb.0:
-; CHECK-CVT-NEXT:    // kill: def $h1 killed $h1 def $d1
-; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $d0
-; CHECK-CVT-NEXT:    shll v1.4s, v1.4h, #16
-; CHECK-CVT-NEXT:    shll v0.4s, v0.4h, #16
-; CHECK-CVT-NEXT:    fcmp s0, s1
-; CHECK-CVT-NEXT:    cset w0, pl
-; CHECK-CVT-NEXT:    ret
-;
-; CHECK-SD-LABEL: test_fcmp_uge:
-; CHECK-SD:       // %bb.0:
-; CHECK-SD-NEXT:    // kill: def $h1 killed $h1 def $d1
-; CHECK-SD-NEXT:    // kill: def $h0 killed $h0 def $d0
-; CHECK-SD-NEXT:    shll v1.4s, v1.4h, #16
-; CHECK-SD-NEXT:    shll v0.4s, v0.4h, #16
-; CHECK-SD-NEXT:    fcmp s0, s1
-; CHECK-SD-NEXT:    cset w0, pl
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: test_fcmp_uge:
-; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    fcmp h0, h1
-; CHECK-GI-NEXT:    cset w0, pl
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: test_fcmp_uge:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $h1 killed $h1 def $d1
+; CHECK-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-NEXT:    shll v1.4s, v1.4h, #16
+; CHECK-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-NEXT:    fcmp s0, s1
+; CHECK-NEXT:    cset w0, pl
+; CHECK-NEXT:    ret
   %r = fcmp uge bfloat %a, %b
   ret i1 %r
 }
 
 define i1 @test_fcmp_ult(bfloat %a, bfloat %b) #0 {
-; CHECK-CVT-LABEL: test_fcmp_ult:
-; CHECK-CVT:       // %bb.0:
-; CHECK-CVT-NEXT:    // kill: def $h1 killed $h1 def $d1
-; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $d0
-; CHECK-CVT-NEXT:    shll v1.4s, v1.4h, #16
-; CHECK-CVT-NEXT:    shll v0.4s, v0.4h, #16
-; CHECK-CVT-NEXT:    fcmp s0, s1
-; CHECK-CVT-NEXT:    cset w0, lt
-; CHECK-CVT-NEXT:    ret
-;
-; CHECK-SD-LABEL: test_fcmp_ult:
-; CHECK-SD:       // %bb.0:
-; CHECK-SD-NEXT:    // kill: def $h1 killed $h1 def $d1
-; CHECK-SD-NEXT:    // kill: def $h0 killed $h0 def $d0
-; CHECK-SD-NEXT:    shll v1.4s, v1.4h, #16
-; CHECK-SD-NEXT:    shll v0.4s, v0.4h, #16
-; CHECK-SD-NEXT:    fcmp s0, s1
-; CHECK-SD-NEXT:    cset w0, lt
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: test_fcmp_ult:
-; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    fcmp h0, h1
-; CHECK-GI-NEXT:    cset w0, lt
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: test_fcmp_ult:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $h1 killed $h1 def $d1
+; CHECK-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-NEXT:    shll v1.4s, v1.4h, #16
+; CHECK-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-NEXT:    fcmp s0, s1
+; CHECK-NEXT:    cset w0, lt
+; CHECK-NEXT:    ret
   %r = fcmp ult bfloat %a, %b
   ret i1 %r
 }
 
 define i1 @test_fcmp_ule(bfloat %a, bfloat %b) #0 {
-; CHECK-CVT-LABEL: test_fcmp_ule:
-; CHECK-CVT:       // %bb.0:
-; CHECK-CVT-NEXT:    // kill: def $h1 killed $h1 def $d1
-; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $d0
-; CHECK-CVT-NEXT:    shll v1.4s, v1.4h, #16
-; CHECK-CVT-NEXT:    shll v0.4s, v0.4h, #16
-; CHECK-CVT-NEXT:    fcmp s0, s1
-; CHECK-CVT-NEXT:    cset w0, le
-; CHECK-CVT-NEXT:    ret
-;
-; CHECK-SD-LABEL: test_fcmp_ule:
-; CHECK-SD:       // %bb.0:
-; CHECK-SD-NEXT:    // kill: def $h1 killed $h1 def $d1
-; CHECK-SD-NEXT:    // kill: def $h0 killed $h0 def $d0
-; CHECK-SD-NEXT:    shll v1.4s, v1.4h, #16
-; CHECK-SD-NEXT:    shll v0.4s, v0.4h, #16
-; CHECK-SD-NEXT:    fcmp s0, s1
-; CHECK-SD-NEXT:    cset w0, le
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: test_fcmp_ule:
-; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    fcmp h0, h1
-; CHECK-GI-NEXT:    cset w0, le
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: test_fcmp_ule:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $h1 killed $h1 def $d1
+; CHECK-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-NEXT:    shll v1.4s, v1.4h, #16
+; CHECK-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-NEXT:    fcmp s0, s1
+; CHECK-NEXT:    cset w0, le
+; CHECK-NEXT:    ret
   %r = fcmp ule bfloat %a, %b
   ret i1 %r
 }
 
 define i1 @test_fcmp_uno(bfloat %a, bfloat %b) #0 {
-; CHECK-CVT-LABEL: test_fcmp_uno:
-; CHECK-CVT:       // %bb.0:
-; CHECK-CVT-NEXT:    // kill: def $h1 killed $h1 def $d1
-; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $d0
-; CHECK-CVT-NEXT:    shll v1.4s, v1.4h, #16
-; CHECK-CVT-NEXT:    shll v0.4s, v0.4h, #16
-; CHECK-CVT-NEXT:    fcmp s0, s1
-; CHECK-CVT-NEXT:    cset w0, vs
-; CHECK-CVT-NEXT:    ret
-;
-; CHECK-SD-LABEL: test_fcmp_uno:
-; CHECK-SD:       // %bb.0:
-; CHECK-SD-NEXT:    // kill: def $h1 killed $h1 def $d1
-; CHECK-SD-NEXT:    // kill: def $h0 killed $h0 def $d0
-; CHECK-SD-NEXT:    shll v1.4s, v1.4h, #16
-; CHECK-SD-NEXT:    shll v0.4s, v0.4h, #16
-; CHECK-SD-NEXT:    fcmp s0, s1
-; CHECK-SD-NEXT:    cset w0, vs
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: test_fcmp_uno:
-; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    fcmp h0, h1
-; CHECK-GI-NEXT:    cset w0, vs
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: test_fcmp_uno:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $h1 killed $h1 def $d1
+; CHECK-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-NEXT:    shll v1.4s, v1.4h, #16
+; CHECK-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-NEXT:    fcmp s0, s1
+; CHECK-NEXT:    cset w0, vs
+; CHECK-NEXT:    ret
   %r = fcmp uno bfloat %a, %b
   ret i1 %r
 }
 
 define i1 @test_fcmp_one(bfloat %a, bfloat %b) #0 {
-; CHECK-CVT-LABEL: test_fcmp_one:
-; CHECK-CVT:       // %bb.0:
-; CHECK-CVT-NEXT:    // kill: def $h1 killed $h1 def $d1
-; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $d0
-; CHECK-CVT-NEXT:    shll v1.4s, v1.4h, #16
-; CHECK-CVT-NEXT:    shll v0.4s, v0.4h, #16
-; CHECK-CVT-NEXT:    fcmp s0, s1
-; CHECK-CVT-NEXT:    cset w8, mi
-; CHECK-CVT-NEXT:    csinc w0, w8, wzr, le
-; CHECK-CVT-NEXT:    ret
-;
-; CHECK-SD-LABEL: test_fcmp_one:
-; CHECK-SD:       // %bb.0:
-; CHECK-SD-NEXT:    // kill: def $h1 killed $h1 def $d1
-; CHECK-SD-NEXT:    // kill: def $h0 killed $h0 def $d0
-; CHECK-SD-NEXT:    shll v1.4s, v1.4h, #16
-; CHECK-SD-NEXT:    shll v0.4s, v0.4h, #16
-; CHECK-SD-NEXT:    fcmp s0, s1
-; CHECK-SD-NEXT:    cset w8, mi
-; CHECK-SD-NEXT:    csinc w0, w8, wzr, le
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: test_fcmp_one:
-; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    fcmp h0, h1
-; CHECK-GI-NEXT:    cset w8, mi
-; CHECK-GI-NEXT:    cset w9, gt
-; CHECK-GI-NEXT:    orr w0, w8, w9
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: test_fcmp_one:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $h1 killed $h1 def $d1
+; CHECK-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-NEXT:    shll v1.4s, v1.4h, #16
+; CHECK-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-NEXT:    fcmp s0, s1
+; CHECK-NEXT:    cset w8, mi
+; CHECK-NEXT:    csinc w0, w8, wzr, le
+; CHECK-NEXT:    ret
   %r = fcmp one bfloat %a, %b
   ret i1 %r
 }
 
 define i1 @test_fcmp_oeq(bfloat %a, bfloat %b) #0 {
-; CHECK-CVT-LABEL: test_fcmp_oeq:
-; CHECK-CVT:       // %bb.0:
-; CHECK-CVT-NEXT:    // kill: def $h1 killed $h1 def $d1
-; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $d0
-; CHECK-CVT-NEXT:    shll v1.4s, v1.4h, #16
-; CHECK-CVT-NEXT:    shll v0.4s, v0.4h, #16
-; CHECK-CVT-NEXT:    fcmp s0, s1
-; CHECK-CVT-NEXT:    cset w0, eq
-; CHECK-CVT-NEXT:    ret
-;
-; CHECK-SD-LABEL: test_fcmp_oeq:
-; CHECK-SD:       // %bb.0:
-; CHECK-SD-NEXT:    // kill: def $h1 killed $h1 def $d1
-; CHECK-SD-NEXT:    // kill: def $h0 killed $h0 def $d0
-; CHECK-SD-NEXT:    shll v1.4s, v1.4h, #16
-; CHECK-SD-NEXT:    shll v0.4s, v0.4h, #16
-; CHECK-SD-NEXT:    fcmp s0, s1
-; CHECK-SD-NEXT:    cset w0, eq
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: test_fcmp_oeq:
-; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    fcmp h0, h1
-; CHECK-GI-NEXT:    cset w0, eq
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: test_fcmp_oeq:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $h1 killed $h1 def $d1
+; CHECK-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-NEXT:    shll v1.4s, v1.4h, #16
+; CHECK-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-NEXT:    fcmp s0, s1
+; CHECK-NEXT:    cset w0, eq
+; CHECK-NEXT:    ret
   %r = fcmp oeq bfloat %a, %b
   ret i1 %r
 }
 
 define i1 @test_fcmp_ogt(bfloat %a, bfloat %b) #0 {
-; CHECK-CVT-LABEL: test_fcmp_ogt:
-; CHECK-CVT:       // %bb.0:
-; CHECK-CVT-NEXT:    // kill: def $h1 killed $h1 def $d1
-; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $d0
-; CHECK-CVT-NEXT:    shll v1.4s, v1.4h, #16
-; CHECK-CVT-NEXT:    shll v0.4s, v0.4h, #16
-; CHECK-CVT-NEXT:    fcmp s0, s1
-; CHECK-CVT-NEXT:    cset w0, gt
-; CHECK-CVT-NEXT:    ret
-;
-; CHECK-SD-LABEL: test_fcmp_ogt:
-; CHECK-SD:       // %bb.0:
-; CHECK-SD-NEXT:    // kill: def $h1 killed $h1 def $d1
-; CHECK-SD-NEXT:    // kill: def $h0 killed $h0 def $d0
-; CHECK-SD-NEXT:    shll v1.4s, v1.4h, #16
-; CHECK-SD-NEXT:    shll v0.4s, v0.4h, #16
-; CHECK-SD-NEXT:    fcmp s0, s1
-; CHECK-SD-NEXT:    cset w0, gt
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: test_fcmp_ogt:
-; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    fcmp h0, h1
-; CHECK-GI-NEXT:    cset w0, gt
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: test_fcmp_ogt:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $h1 killed $h1 def $d1
+; CHECK-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-NEXT:    shll v1.4s, v1.4h, #16
+; CHECK-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-NEXT:    fcmp s0, s1
+; CHECK-NEXT:    cset w0, gt
+; CHECK-NEXT:    ret
   %r = fcmp ogt bfloat %a, %b
   ret i1 %r
 }
 
 define i1 @test_fcmp_oge(bfloat %a, bfloat %b) #0 {
-; CHECK-CVT-LABEL: test_fcmp_oge:
-; CHECK-CVT:       // %bb.0:
-; CHECK-CVT-NEXT:    // kill: def $h1 killed $h1 def $d1
-; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $d0
-; CHECK-CVT-NEXT:    shll v1.4s, v1.4h, #16
-; CHECK-CVT-NEXT:    shll v0.4s, v0.4h, #16
-; CHECK-CVT-NEXT:    fcmp s0, s1
-; CHECK-CVT-NEXT:    cset w0, ge
-; CHECK-CVT-NEXT:    ret
-;
-; CHECK-SD-LABEL: test_fcmp_oge:
-; CHECK-SD:       // %bb.0:
-; CHECK-SD-NEXT:    // kill: def $h1 killed $h1 def $d1
-; CHECK-SD-NEXT:    // kill: def $h0 killed $h0 def $d0
-; CHECK-SD-NEXT:    shll v1.4s, v1.4h, #16
-; CHECK-SD-NEXT:    shll v0.4s, v0.4h, #16
-; CHECK-SD-NEXT:    fcmp s0, s1
-; CHECK-SD-NEXT:    cset w0, ge
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: test_fcmp_oge:
-; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    fcmp h0, h1
-; CHECK-GI-NEXT:    cset w0, ge
-; CHECK-GI-NEXT:    ret
-  %r = fcmp oge bfloat %a, %b
-  ret i1 %r
-}
-
-define i1 @test_fcmp_olt(bfloat %a, bfloat %b) #0 {
-; CHECK-CVT-LABEL: test_fcmp_olt:
-; CHECK-CVT:       // %bb.0:
-; CHECK-CVT-NEXT:    // kill: def $h1 killed $h1 def $d1
-; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $d0
-; CHECK-CVT-NEXT:    shll v1.4s, v1.4h, #16
-; CHECK-CVT-NEXT:    shll v0.4s, v0.4h, #16
-; CHECK-CVT-NEXT:    fcmp s0, s1
-; CHECK-CVT-NEXT:    cset w0, mi
-; CHECK-CVT-NEXT:    ret
-;
-; CHECK-SD-LABEL: test_fcmp_olt:
-; CHECK-SD:       // %bb.0:
-; CHECK-SD-NEXT:    // kill: def $h1 killed $h1 def $d1
-; CHECK-SD-NEXT:    // kill: def $h0 killed $h0 def $d0
-; CHECK-SD-NEXT:    shll v1.4s, v1.4h, #16
-; CHECK-SD-NEXT:    shll v0.4s, v0.4h, #16
-; CHECK-SD-NEXT:    fcmp s0, s1
-; CHECK-SD-NEXT:    cset w0, mi
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: test_fcmp_olt:
-; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    fcmp h0, h1
-; CHECK-GI-NEXT:    cset w0, mi
-; CHECK-GI-NEXT:    ret
-  %r = fcmp olt bfloat %a, %b
-  ret i1 %r
-}
-
-define i1 @test_fcmp_ole(bfloat %a, bfloat %b) #0 {
-; CHECK-CVT-LABEL: test_fcmp_ole:
-; CHECK-CVT:       // %bb.0:
-; CHECK-CVT-NEXT:    // kill: def $h1 killed $h1 def $d1
-; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $d0
-; CHECK-CVT-NEXT:    shll v1.4s, v1.4h, #16
-; CHECK-CVT-NEXT:    shll v0.4s, v0.4h, #16
-; CHECK-CVT-NEXT:    fcmp s0, s1
-; CHECK-CVT-NEXT:    cset w0, ls
-; CHECK-CVT-NEXT:    ret
-;
-; CHECK-SD-LABEL: test_fcmp_ole:
-; CHECK-SD:       // %bb.0:
-; CHECK-SD-NEXT:    // kill: def $h1 killed $h1 def $d1
-; CHECK-SD-NEXT:    // kill: def $h0 killed $h0 def $d0
-; CHECK-SD-NEXT:    shll v1.4s, v1.4h, #16
-; CHECK-SD-NEXT:    shll v0.4s, v0.4h, #16
-; CHECK-SD-NEXT:    fcmp s0, s1
-; CHECK-SD-NEXT:    cset w0, ls
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: test_fcmp_ole:
-; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    fcmp h0, h1
-; CHECK-GI-NEXT:    cset w0, ls
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: test_fcmp_oge:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $h1 killed $h1 def $d1
+; CHECK-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-NEXT:    shll v1.4s, v1.4h, #16
+; CHECK-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-NEXT:    fcmp s0, s1
+; CHECK-NEXT:    cset w0, ge
+; CHECK-NEXT:    ret
+  %r = fcmp oge bfloat %a, %b
+  ret i1 %r
+}
+
+define i1 @test_fcmp_olt(bfloat %a, bfloat %b) #0 {
+; CHECK-LABEL: test_fcmp_olt:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $h1 killed $h1 def $d1
+; CHECK-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-NEXT:    shll v1.4s, v1.4h, #16
+; CHECK-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-NEXT:    fcmp s0, s1
+; CHECK-NEXT:    cset w0, mi
+; CHECK-NEXT:    ret
+  %r = fcmp olt bfloat %a, %b
+  ret i1 %r
+}
+
+define i1 @test_fcmp_ole(bfloat %a, bfloat %b) #0 {
+; CHECK-LABEL: test_fcmp_ole:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $h1 killed $h1 def $d1
+; CHECK-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-NEXT:    shll v1.4s, v1.4h, #16
+; CHECK-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-NEXT:    fcmp s0, s1
+; CHECK-NEXT:    cset w0, ls
+; CHECK-NEXT:    ret
   %r = fcmp ole bfloat %a, %b
   ret i1 %r
 }
 
 define i1 @test_fcmp_ord(bfloat %a, bfloat %b) #0 {
-; CHECK-CVT-LABEL: test_fcmp_ord:
-; CHECK-CVT:       // %bb.0:
-; CHECK-CVT-NEXT:    // kill: def $h1 killed $h1 def $d1
-; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $d0
-; CHECK-CVT-NEXT:    shll v1.4s, v1.4h, #16
-; CHECK-CVT-NEXT:    shll v0.4s, v0.4h, #16
-; CHECK-CVT-NEXT:    fcmp s0, s1
-; CHECK-CVT-NEXT:    cset w0, vc
-; CHECK-CVT-NEXT:    ret
-;
-; CHECK-SD-LABEL: test_fcmp_ord:
-; CHECK-SD:       // %bb.0:
-; CHECK-SD-NEXT:    // kill: def $h1 killed $h1 def $d1
-; CHECK-SD-NEXT:    // kill: def $h0 killed $h0 def $d0
-; CHECK-SD-NEXT:    shll v1.4s, v1.4h, #16
-; CHECK-SD-NEXT:    shll v0.4s, v0.4h, #16
-; CHECK-SD-NEXT:    fcmp s0, s1
-; CHECK-SD-NEXT:    cset w0, vc
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: test_fcmp_ord:
-; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    fcmp h0, h1
-; CHECK-GI-NEXT:    cset w0, vc
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: test_fcmp_ord:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $h1 killed $h1 def $d1
+; CHECK-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-NEXT:    shll v1.4s, v1.4h, #16
+; CHECK-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-NEXT:    fcmp s0, s1
+; CHECK-NEXT:    cset w0, vc
+; CHECK-NEXT:    ret
   %r = fcmp ord bfloat %a, %b
   ret i1 %r
 }
@@ -924,15 +694,15 @@ define void @test_fccmp(bfloat %in, ptr %out) {
 ;
 ; CHECK-GI-LABEL: test_fccmp:
 ; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    movi v1.2s, #69, lsl #24
+; CHECK-GI-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-GI-NEXT:    shll v2.4s, v0.4h, #16
+; CHECK-GI-NEXT:    movi v3.2s, #72, lsl #24
+; CHECK-GI-NEXT:    fcmp s2, s1
 ; CHECK-GI-NEXT:    fmov h1, #5.00000000
-; CHECK-GI-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-GI-NEXT:    fmov h2, #8.00000000
-; CHECK-GI-NEXT:    fmov w8, s0
-; CHECK-GI-NEXT:    fcmp h0, h1
-; CHECK-GI-NEXT:    fmov w9, s1
-; CHECK-GI-NEXT:    fccmp h0, h2, #4, mi
-; CHECK-GI-NEXT:    csel w8, w8, w9, gt
-; CHECK-GI-NEXT:    strh w8, [x0]
+; CHECK-GI-NEXT:    fccmp s2, s3, #4, mi
+; CHECK-GI-NEXT:    fcsel h0, h0, h1, gt
+; CHECK-GI-NEXT:    str h0, [x0]
 ; CHECK-GI-NEXT:    ret
   %cmp1 = fcmp ogt bfloat %in, 0xR4800
   %cmp2 = fcmp olt bfloat %in, 0xR4500
@@ -943,34 +713,16 @@ define void @test_fccmp(bfloat %in, ptr %out) {
 }
 
 define void @test_br_cc(bfloat %a, bfloat %b, ptr %p1, ptr %p2) #0 {
-; CHECK-CVT-LABEL: test_br_cc:
-; CHECK-CVT:       // %bb.0: // %common.ret
-; CHECK-CVT-NEXT:    // kill: def $h1 killed $h1 def $d1
-; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $d0
-; CHECK-CVT-NEXT:    shll v1.4s, v1.4h, #16
-; CHECK-CVT-NEXT:    shll v0.4s, v0.4h, #16
-; CHECK-CVT-NEXT:    fcmp s0, s1
-; CHECK-CVT-NEXT:    csel x8, x0, x1, pl
-; CHECK-CVT-NEXT:    str wzr, [x8]
-; CHECK-CVT-NEXT:    ret
-;
-; CHECK-SD-LABEL: test_br_cc:
-; CHECK-SD:       // %bb.0: // %common.ret
-; CHECK-SD-NEXT:    // kill: def $h1 killed $h1 def $d1
-; CHECK-SD-NEXT:    // kill: def $h0 killed $h0 def $d0
-; CHECK-SD-NEXT:    shll v1.4s, v1.4h, #16
-; CHECK-SD-NEXT:    shll v0.4s, v0.4h, #16
-; CHECK-SD-NEXT:    fcmp s0, s1
-; CHECK-SD-NEXT:    csel x8, x0, x1, pl
-; CHECK-SD-NEXT:    str wzr, [x8]
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: test_br_cc:
-; CHECK-GI:       // %bb.0: // %common.ret
-; CHECK-GI-NEXT:    fcmp h0, h1
-; CHECK-GI-NEXT:    csel x8, x0, x1, pl
-; CHECK-GI-NEXT:    str wzr, [x8]
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: test_br_cc:
+; CHECK:       // %bb.0: // %common.ret
+; CHECK-NEXT:    // kill: def $h1 killed $h1 def $d1
+; CHECK-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-NEXT:    shll v1.4s, v1.4h, #16
+; CHECK-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-NEXT:    fcmp s0, s1
+; CHECK-NEXT:    csel x8, x0, x1, pl
+; CHECK-NEXT:    str wzr, [x8]
+; CHECK-NEXT:    ret
   %c = fcmp uge bfloat %a, %b
   br i1 %c, label %then, label %else
 then:
@@ -1426,18 +1178,13 @@ define bfloat @test_sqrt(bfloat %a) #0 {
 ; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 killed $s0
 ; CHECK-CVT-NEXT:    ret
 ;
-; CHECK-SD-LABEL: test_sqrt:
-; CHECK-SD:       // %bb.0:
-; CHECK-SD-NEXT:    // kill: def $h0 killed $h0 def $d0
-; CHECK-SD-NEXT:    shll v0.4s, v0.4h, #16
-; CHECK-SD-NEXT:    fsqrt s0, s0
-; CHECK-SD-NEXT:    bfcvt h0, s0
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: test_sqrt:
-; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    fsqrt h0, h0
-; CHECK-GI-NEXT:    ret
+; CHECK-BF16-LABEL: test_sqrt:
+; CHECK-BF16:       // %bb.0:
+; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-BF16-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-BF16-NEXT:    fsqrt s0, s0
+; CHECK-BF16-NEXT:    bfcvt h0, s0
+; CHECK-BF16-NEXT:    ret
   %r = call bfloat @llvm.sqrt.f16(bfloat %a)
   ret bfloat %r
 }
@@ -1461,25 +1208,16 @@ define bfloat @test_powi(bfloat %a, i32 %b) #0 {
 ; CHECK-CVT-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
 ; CHECK-CVT-NEXT:    ret
 ;
-; CHECK-SD-LABEL: test_powi:
-; CHECK-SD:       // %bb.0:
-; CHECK-SD-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-SD-NEXT:    // kill: def $h0 killed $h0 def $d0
-; CHECK-SD-NEXT:    shll v0.4s, v0.4h, #16
-; CHECK-SD-NEXT:    // kill: def $s0 killed $s0 killed $q0
-; CHECK-SD-NEXT:    bl __powisf2
-; CHECK-SD-NEXT:    bfcvt h0, s0
-; CHECK-SD-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: test_powi:
-; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-GI-NEXT:    fcvt s0, h0
-; CHECK-GI-NEXT:    bl __powisf2
-; CHECK-GI-NEXT:    bfcvt h0, s0
-; CHECK-GI-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
-; CHECK-GI-NEXT:    ret
+; CHECK-BF16-LABEL: test_powi:
+; CHECK-BF16:       // %bb.0:
+; CHECK-BF16-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-BF16-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-BF16-NEXT:    // kill: def $s0 killed $s0 killed $q0
+; CHECK-BF16-NEXT:    bl __powisf2
+; CHECK-BF16-NEXT:    bfcvt h0, s0
+; CHECK-BF16-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-BF16-NEXT:    ret
   %r = call bfloat @llvm.powi.f16.i32(bfloat %a, i32 %b)
   ret bfloat %r
 }
@@ -1504,25 +1242,16 @@ define bfloat @test_sin(bfloat %a) #0 {
 ; CHECK-CVT-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
 ; CHECK-CVT-NEXT:    ret
 ;
-; CHECK-SD-LABEL: test_sin:
-; CHECK-SD:       // %bb.0:
-; CHECK-SD-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-SD-NEXT:    // kill: def $h0 killed $h0 def $d0
-; CHECK-SD-NEXT:    shll v0.4s, v0.4h, #16
-; CHECK-SD-NEXT:    // kill: def $s0 killed $s0 killed $q0
-; CHECK-SD-NEXT:    bl sinf
-; CHECK-SD-NEXT:    bfcvt h0, s0
-; CHECK-SD-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: test_sin:
-; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-GI-NEXT:    fcvt s0, h0
-; CHECK-GI-NEXT:    bl sinf
-; CHECK-GI-NEXT:    bfcvt h0, s0
-; CHECK-GI-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
-; CHECK-GI-NEXT:    ret
+; CHECK-BF16-LABEL: test_sin:
+; CHECK-BF16:       // %bb.0:
+; CHECK-BF16-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-BF16-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-BF16-NEXT:    // kill: def $s0 killed $s0 killed $q0
+; CHECK-BF16-NEXT:    bl sinf
+; CHECK-BF16-NEXT:    bfcvt h0, s0
+; CHECK-BF16-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-BF16-NEXT:    ret
   %r = call bfloat @llvm.sin.f16(bfloat %a)
   ret bfloat %r
 }
@@ -1546,25 +1275,16 @@ define bfloat @test_cos(bfloat %a) #0 {
 ; CHECK-CVT-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
 ; CHECK-CVT-NEXT:    ret
 ;
-; CHECK-SD-LABEL: test_cos:
-; CHECK-SD:       // %bb.0:
-; CHECK-SD-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-SD-NEXT:    // kill: def $h0 killed $h0 def $d0
-; CHECK-SD-NEXT:    shll v0.4s, v0.4h, #16
-; CHECK-SD-NEXT:    // kill: def $s0 killed $s0 killed $q0
-; CHECK-SD-NEXT:    bl cosf
-; CHECK-SD-NEXT:    bfcvt h0, s0
-; CHECK-SD-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: test_cos:
-; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-GI-NEXT:    fcvt s0, h0
-; CHECK-GI-NEXT:    bl cosf
-; CHECK-GI-NEXT:    bfcvt h0, s0
-; CHECK-GI-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
-; CHECK-GI-NEXT:    ret
+; CHECK-BF16-LABEL: test_cos:
+; CHECK-BF16:       // %bb.0:
+; CHECK-BF16-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-BF16-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-BF16-NEXT:    // kill: def $s0 killed $s0 killed $q0
+; CHECK-BF16-NEXT:    bl cosf
+; CHECK-BF16-NEXT:    bfcvt h0, s0
+; CHECK-BF16-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-BF16-NEXT:    ret
   %r = call bfloat @llvm.cos.f16(bfloat %a)
   ret bfloat %r
 }
@@ -1588,25 +1308,16 @@ define bfloat @test_tan(bfloat %a) #0 {
 ; CHECK-CVT-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
 ; CHECK-CVT-NEXT:    ret
 ;
-; CHECK-SD-LABEL: test_tan:
-; CHECK-SD:       // %bb.0:
-; CHECK-SD-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-SD-NEXT:    // kill: def $h0 killed $h0 def $d0
-; CHECK-SD-NEXT:    shll v0.4s, v0.4h, #16
-; CHECK-SD-NEXT:    // kill: def $s0 killed $s0 killed $q0
-; CHECK-SD-NEXT:    bl tanf
-; CHECK-SD-NEXT:    bfcvt h0, s0
-; CHECK-SD-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: test_tan:
-; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-GI-NEXT:    fcvt s0, h0
-; CHECK-GI-NEXT:    bl tanf
-; CHECK-GI-NEXT:    bfcvt h0, s0
-; CHECK-GI-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
-; CHECK-GI-NEXT:    ret
+; CHECK-BF16-LABEL: test_tan:
+; CHECK-BF16:       // %bb.0:
+; CHECK-BF16-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-BF16-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-BF16-NEXT:    // kill: def $s0 killed $s0 killed $q0
+; CHECK-BF16-NEXT:    bl tanf
+; CHECK-BF16-NEXT:    bfcvt h0, s0
+; CHECK-BF16-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-BF16-NEXT:    ret
   %r = call bfloat @llvm.tan.f16(bfloat %a)
   ret bfloat %r
 }
@@ -1630,25 +1341,16 @@ define bfloat @test_acos(bfloat %a) #0 {
 ; CHECK-CVT-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
 ; CHECK-CVT-NEXT:    ret
 ;
-; CHECK-SD-LABEL: test_acos:
-; CHECK-SD:       // %bb.0:
-; CHECK-SD-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-SD-NEXT:    // kill: def $h0 killed $h0 def $d0
-; CHECK-SD-NEXT:    shll v0.4s, v0.4h, #16
-; CHECK-SD-NEXT:    // kill: def $s0 killed $s0 killed $q0
-; CHECK-SD-NEXT:    bl acosf
-; CHECK-SD-NEXT:    bfcvt h0, s0
-; CHECK-SD-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: test_acos:
-; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-GI-NEXT:    fcvt s0, h0
-; CHECK-GI-NEXT:    bl acosf
-; CHECK-GI-NEXT:    bfcvt h0, s0
-; CHECK-GI-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
-; CHECK-GI-NEXT:    ret
+; CHECK-BF16-LABEL: test_acos:
+; CHECK-BF16:       // %bb.0:
+; CHECK-BF16-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-BF16-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-BF16-NEXT:    // kill: def $s0 killed $s0 killed $q0
+; CHECK-BF16-NEXT:    bl acosf
+; CHECK-BF16-NEXT:    bfcvt h0, s0
+; CHECK-BF16-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-BF16-NEXT:    ret
   %r = call bfloat @llvm.acos.f16(bfloat %a)
   ret bfloat %r
 }
@@ -1672,25 +1374,16 @@ define bfloat @test_asin(bfloat %a) #0 {
 ; CHECK-CVT-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
 ; CHECK-CVT-NEXT:    ret
 ;
-; CHECK-SD-LABEL: test_asin:
-; CHECK-SD:       // %bb.0:
-; CHECK-SD-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-SD-NEXT:    // kill: def $h0 killed $h0 def $d0
-; CHECK-SD-NEXT:    shll v0.4s, v0.4h, #16
-; CHECK-SD-NEXT:    // kill: def $s0 killed $s0 killed $q0
-; CHECK-SD-NEXT:    bl asinf
-; CHECK-SD-NEXT:    bfcvt h0, s0
-; CHECK-SD-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: test_asin:
-; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-GI-NEXT:    fcvt s0, h0
-; CHECK-GI-NEXT:    bl asinf
-; CHECK-GI-NEXT:    bfcvt h0, s0
-; CHECK-GI-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
-; CHECK-GI-NEXT:    ret
+; CHECK-BF16-LABEL: test_asin:
+; CHECK-BF16:       // %bb.0:
+; CHECK-BF16-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-BF16-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-BF16-NEXT:    // kill: def $s0 killed $s0 killed $q0
+; CHECK-BF16-NEXT:    bl asinf
+; CHECK-BF16-NEXT:    bfcvt h0, s0
+; CHECK-BF16-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-BF16-NEXT:    ret
   %r = call bfloat @llvm.asin.f16(bfloat %a)
   ret bfloat %r
 }
@@ -1714,25 +1407,16 @@ define bfloat @test_atan(bfloat %a) #0 {
 ; CHECK-CVT-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
 ; CHECK-CVT-NEXT:    ret
 ;
-; CHECK-SD-LABEL: test_atan:
-; CHECK-SD:       // %bb.0:
-; CHECK-SD-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-SD-NEXT:    // kill: def $h0 killed $h0 def $d0
-; CHECK-SD-NEXT:    shll v0.4s, v0.4h, #16
-; CHECK-SD-NEXT:    // kill: def $s0 killed $s0 killed $q0
-; CHECK-SD-NEXT:    bl atanf
-; CHECK-SD-NEXT:    bfcvt h0, s0
-; CHECK-SD-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: test_atan:
-; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-GI-NEXT:    fcvt s0, h0
-; CHECK-GI-NEXT:    bl atanf
-; CHECK-GI-NEXT:    bfcvt h0, s0
-; CHECK-GI-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
-; CHECK-GI-NEXT:    ret
+; CHECK-BF16-LABEL: test_atan:
+; CHECK-BF16:       // %bb.0:
+; CHECK-BF16-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-BF16-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-BF16-NEXT:    // kill: def $s0 killed $s0 killed $q0
+; CHECK-BF16-NEXT:    bl atanf
+; CHECK-BF16-NEXT:    bfcvt h0, s0
+; CHECK-BF16-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-BF16-NEXT:    ret
   %r = call bfloat @llvm.atan.f16(bfloat %a)
   ret bfloat %r
 }
@@ -1759,29 +1443,19 @@ define bfloat @test_atan2(bfloat %a, bfloat %b) #0 {
 ; CHECK-CVT-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
 ; CHECK-CVT-NEXT:    ret
 ;
-; CHECK-SD-LABEL: test_atan2:
-; CHECK-SD:       // %bb.0:
-; CHECK-SD-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-SD-NEXT:    // kill: def $h1 killed $h1 def $d1
-; CHECK-SD-NEXT:    // kill: def $h0 killed $h0 def $d0
-; CHECK-SD-NEXT:    shll v0.4s, v0.4h, #16
-; CHECK-SD-NEXT:    shll v1.4s, v1.4h, #16
-; CHECK-SD-NEXT:    // kill: def $s0 killed $s0 killed $q0
-; CHECK-SD-NEXT:    // kill: def $s1 killed $s1 killed $q1
-; CHECK-SD-NEXT:    bl atan2f
-; CHECK-SD-NEXT:    bfcvt h0, s0
-; CHECK-SD-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: test_atan2:
-; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-GI-NEXT:    fcvt s0, h0
-; CHECK-GI-NEXT:    fcvt s1, h1
-; CHECK-GI-NEXT:    bl atan2f
-; CHECK-GI-NEXT:    bfcvt h0, s0
-; CHECK-GI-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
-; CHECK-GI-NEXT:    ret
+; CHECK-BF16-LABEL: test_atan2:
+; CHECK-BF16:       // %bb.0:
+; CHECK-BF16-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-BF16-NEXT:    // kill: def $h1 killed $h1 def $d1
+; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-BF16-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-BF16-NEXT:    shll v1.4s, v1.4h, #16
+; CHECK-BF16-NEXT:    // kill: def $s0 killed $s0 killed $q0
+; CHECK-BF16-NEXT:    // kill: def $s1 killed $s1 killed $q1
+; CHECK-BF16-NEXT:    bl atan2f
+; CHECK-BF16-NEXT:    bfcvt h0, s0
+; CHECK-BF16-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-BF16-NEXT:    ret
   %r = call bfloat @llvm.atan2.f16(bfloat %a, bfloat %b)
   ret bfloat %r
 }
@@ -1805,25 +1479,16 @@ define bfloat @test_cosh(bfloat %a) #0 {
 ; CHECK-CVT-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
 ; CHECK-CVT-NEXT:    ret
 ;
-; CHECK-SD-LABEL: test_cosh:
-; CHECK-SD:       // %bb.0:
-; CHECK-SD-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-SD-NEXT:    // kill: def $h0 killed $h0 def $d0
-; CHECK-SD-NEXT:    shll v0.4s, v0.4h, #16
-; CHECK-SD-NEXT:    // kill: def $s0 killed $s0 killed $q0
-; CHECK-SD-NEXT:    bl coshf
-; CHECK-SD-NEXT:    bfcvt h0, s0
-; CHECK-SD-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: test_cosh:
-; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-GI-NEXT:    fcvt s0, h0
-; CHECK-GI-NEXT:    bl coshf
-; CHECK-GI-NEXT:    bfcvt h0, s0
-; CHECK-GI-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
-; CHECK-GI-NEXT:    ret
+; CHECK-BF16-LABEL: test_cosh:
+; CHECK-BF16:       // %bb.0:
+; CHECK-BF16-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-BF16-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-BF16-NEXT:    // kill: def $s0 killed $s0 killed $q0
+; CHECK-BF16-NEXT:    bl coshf
+; CHECK-BF16-NEXT:    bfcvt h0, s0
+; CHECK-BF16-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-BF16-NEXT:    ret
   %r = call bfloat @llvm.cosh.f16(bfloat %a)
   ret bfloat %r
 }
@@ -1847,25 +1512,16 @@ define bfloat @test_sinh(bfloat %a) #0 {
 ; CHECK-CVT-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
 ; CHECK-CVT-NEXT:    ret
 ;
-; CHECK-SD-LABEL: test_sinh:
-; CHECK-SD:       // %bb.0:
-; CHECK-SD-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-SD-NEXT:    // kill: def $h0 killed $h0 def $d0
-; CHECK-SD-NEXT:    shll v0.4s, v0.4h, #16
-; CHECK-SD-NEXT:    // kill: def $s0 killed $s0 killed $q0
-; CHECK-SD-NEXT:    bl sinhf
-; CHECK-SD-NEXT:    bfcvt h0, s0
-; CHECK-SD-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: test_sinh:
-; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-GI-NEXT:    fcvt s0, h0
-; CHECK-GI-NEXT:    bl sinhf
-; CHECK-GI-NEXT:    bfcvt h0, s0
-; CHECK-GI-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
-; CHECK-GI-NEXT:    ret
+; CHECK-BF16-LABEL: test_sinh:
+; CHECK-BF16:       // %bb.0:
+; CHECK-BF16-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-BF16-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-BF16-NEXT:    // kill: def $s0 killed $s0 killed $q0
+; CHECK-BF16-NEXT:    bl sinhf
+; CHECK-BF16-NEXT:    bfcvt h0, s0
+; CHECK-BF16-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-BF16-NEXT:    ret
   %r = call bfloat @llvm.sinh.f16(bfloat %a)
   ret bfloat %r
 }
@@ -1889,25 +1545,16 @@ define bfloat @test_tanh(bfloat %a) #0 {
 ; CHECK-CVT-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
 ; CHECK-CVT-NEXT:    ret
 ;
-; CHECK-SD-LABEL: test_tanh:
-; CHECK-SD:       // %bb.0:
-; CHECK-SD-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-SD-NEXT:    // kill: def $h0 killed $h0 def $d0
-; CHECK-SD-NEXT:    shll v0.4s, v0.4h, #16
-; CHECK-SD-NEXT:    // kill: def $s0 killed $s0 killed $q0
-; CHECK-SD-NEXT:    bl tanhf
-; CHECK-SD-NEXT:    bfcvt h0, s0
-; CHECK-SD-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: test_tanh:
-; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-GI-NEXT:    fcvt s0, h0
-; CHECK-GI-NEXT:    bl tanhf
-; CHECK-GI-NEXT:    bfcvt h0, s0
-; CHECK-GI-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
-; CHECK-GI-NEXT:    ret
+; CHECK-BF16-LABEL: test_tanh:
+; CHECK-BF16:       // %bb.0:
+; CHECK-BF16-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-BF16-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-BF16-NEXT:    // kill: def $s0 killed $s0 killed $q0
+; CHECK-BF16-NEXT:    bl tanhf
+; CHECK-BF16-NEXT:    bfcvt h0, s0
+; CHECK-BF16-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-BF16-NEXT:    ret
   %r = call bfloat @llvm.tanh.f16(bfloat %a)
   ret bfloat %r
 }
@@ -1934,29 +1581,19 @@ define bfloat @test_pow(bfloat %a, bfloat %b) #0 {
 ; CHECK-CVT-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
 ; CHECK-CVT-NEXT:    ret
 ;
-; CHECK-SD-LABEL: test_pow:
-; CHECK-SD:       // %bb.0:
-; CHECK-SD-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-SD-NEXT:    // kill: def $h1 killed $h1 def $d1
-; CHECK-SD-NEXT:    // kill: def $h0 killed $h0 def $d0
-; CHECK-SD-NEXT:    shll v0.4s, v0.4h, #16
-; CHECK-SD-NEXT:    shll v1.4s, v1.4h, #16
-; CHECK-SD-NEXT:    // kill: def $s0 killed $s0 killed $q0
-; CHECK-SD-NEXT:    // kill: def $s1 killed $s1 killed $q1
-; CHECK-SD-NEXT:    bl powf
-; CHECK-SD-NEXT:    bfcvt h0, s0
-; CHECK-SD-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: test_pow:
-; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-GI-NEXT:    fcvt s0, h0
-; CHECK-GI-NEXT:    fcvt s1, h1
-; CHECK-GI-NEXT:    bl powf
-; CHECK-GI-NEXT:    bfcvt h0, s0
-; CHECK-GI-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
-; CHECK-GI-NEXT:    ret
+; CHECK-BF16-LABEL: test_pow:
+; CHECK-BF16:       // %bb.0:
+; CHECK-BF16-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-BF16-NEXT:    // kill: def $h1 killed $h1 def $d1
+; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-BF16-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-BF16-NEXT:    shll v1.4s, v1.4h, #16
+; CHECK-BF16-NEXT:    // kill: def $s0 killed $s0 killed $q0
+; CHECK-BF16-NEXT:    // kill: def $s1 killed $s1 killed $q1
+; CHECK-BF16-NEXT:    bl powf
+; CHECK-BF16-NEXT:    bfcvt h0, s0
+; CHECK-BF16-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-BF16-NEXT:    ret
   %r = call bfloat @llvm.pow.f16(bfloat %a, bfloat %b)
   ret bfloat %r
 }
@@ -1973,32 +1610,23 @@ define bfloat @test_exp(bfloat %a) #0 {
 ; CHECK-CVT-NEXT:    mov w8, #32767 // =0x7fff
 ; CHECK-CVT-NEXT:    ubfx w10, w9, #16, #1
 ; CHECK-CVT-NEXT:    add w8, w9, w8
-; CHECK-CVT-NEXT:    add w8, w10, w8
-; CHECK-CVT-NEXT:    lsr w8, w8, #16
-; CHECK-CVT-NEXT:    fmov s0, w8
-; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 killed $s0
-; CHECK-CVT-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
-; CHECK-CVT-NEXT:    ret
-;
-; CHECK-SD-LABEL: test_exp:
-; CHECK-SD:       // %bb.0:
-; CHECK-SD-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-SD-NEXT:    // kill: def $h0 killed $h0 def $d0
-; CHECK-SD-NEXT:    shll v0.4s, v0.4h, #16
-; CHECK-SD-NEXT:    // kill: def $s0 killed $s0 killed $q0
-; CHECK-SD-NEXT:    bl expf
-; CHECK-SD-NEXT:    bfcvt h0, s0
-; CHECK-SD-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
-; CHECK-SD-NEXT:    ret
+; CHECK-CVT-NEXT:    add w8, w10, w8
+; CHECK-CVT-NEXT:    lsr w8, w8, #16
+; CHECK-CVT-NEXT:    fmov s0, w8
+; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 killed $s0
+; CHECK-CVT-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-CVT-NEXT:    ret
 ;
-; CHECK-GI-LABEL: test_exp:
-; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-GI-NEXT:    fcvt s0, h0
-; CHECK-GI-NEXT:    bl expf
-; CHECK-GI-NEXT:    bfcvt h0, s0
-; CHECK-GI-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
-; CHECK-GI-NEXT:    ret
+; CHECK-BF16-LABEL: test_exp:
+; CHECK-BF16:       // %bb.0:
+; CHECK-BF16-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-BF16-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-BF16-NEXT:    // kill: def $s0 killed $s0 killed $q0
+; CHECK-BF16-NEXT:    bl expf
+; CHECK-BF16-NEXT:    bfcvt h0, s0
+; CHECK-BF16-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-BF16-NEXT:    ret
   %r = call bfloat @llvm.exp.f16(bfloat %a)
   ret bfloat %r
 }
@@ -2022,25 +1650,16 @@ define bfloat @test_exp2(bfloat %a) #0 {
 ; CHECK-CVT-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
 ; CHECK-CVT-NEXT:    ret
 ;
-; CHECK-SD-LABEL: test_exp2:
-; CHECK-SD:       // %bb.0:
-; CHECK-SD-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-SD-NEXT:    // kill: def $h0 killed $h0 def $d0
-; CHECK-SD-NEXT:    shll v0.4s, v0.4h, #16
-; CHECK-SD-NEXT:    // kill: def $s0 killed $s0 killed $q0
-; CHECK-SD-NEXT:    bl exp2f
-; CHECK-SD-NEXT:    bfcvt h0, s0
-; CHECK-SD-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: test_exp2:
-; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-GI-NEXT:    fcvt s0, h0
-; CHECK-GI-NEXT:    bl exp2f
-; CHECK-GI-NEXT:    bfcvt h0, s0
-; CHECK-GI-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
-; CHECK-GI-NEXT:    ret
+; CHECK-BF16-LABEL: test_exp2:
+; CHECK-BF16:       // %bb.0:
+; CHECK-BF16-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-BF16-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-BF16-NEXT:    // kill: def $s0 killed $s0 killed $q0
+; CHECK-BF16-NEXT:    bl exp2f
+; CHECK-BF16-NEXT:    bfcvt h0, s0
+; CHECK-BF16-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-BF16-NEXT:    ret
   %r = call bfloat @llvm.exp2.f16(bfloat %a)
   ret bfloat %r
 }
@@ -2064,25 +1683,16 @@ define bfloat @test_log(bfloat %a) #0 {
 ; CHECK-CVT-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
 ; CHECK-CVT-NEXT:    ret
 ;
-; CHECK-SD-LABEL: test_log:
-; CHECK-SD:       // %bb.0:
-; CHECK-SD-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-SD-NEXT:    // kill: def $h0 killed $h0 def $d0
-; CHECK-SD-NEXT:    shll v0.4s, v0.4h, #16
-; CHECK-SD-NEXT:    // kill: def $s0 killed $s0 killed $q0
-; CHECK-SD-NEXT:    bl logf
-; CHECK-SD-NEXT:    bfcvt h0, s0
-; CHECK-SD-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: test_log:
-; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-GI-NEXT:    fcvt s0, h0
-; CHECK-GI-NEXT:    bl logf
-; CHECK-GI-NEXT:    bfcvt h0, s0
-; CHECK-GI-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
-; CHECK-GI-NEXT:    ret
+; CHECK-BF16-LABEL: test_log:
+; CHECK-BF16:       // %bb.0:
+; CHECK-BF16-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-BF16-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-BF16-NEXT:    // kill: def $s0 killed $s0 killed $q0
+; CHECK-BF16-NEXT:    bl logf
+; CHECK-BF16-NEXT:    bfcvt h0, s0
+; CHECK-BF16-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-BF16-NEXT:    ret
   %r = call bfloat @llvm.log.f16(bfloat %a)
   ret bfloat %r
 }
@@ -2106,25 +1716,16 @@ define bfloat @test_log10(bfloat %a) #0 {
 ; CHECK-CVT-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
 ; CHECK-CVT-NEXT:    ret
 ;
-; CHECK-SD-LABEL: test_log10:
-; CHECK-SD:       // %bb.0:
-; CHECK-SD-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-SD-NEXT:    // kill: def $h0 killed $h0 def $d0
-; CHECK-SD-NEXT:    shll v0.4s, v0.4h, #16
-; CHECK-SD-NEXT:    // kill: def $s0 killed $s0 killed $q0
-; CHECK-SD-NEXT:    bl log10f
-; CHECK-SD-NEXT:    bfcvt h0, s0
-; CHECK-SD-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: test_log10:
-; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-GI-NEXT:    fcvt s0, h0
-; CHECK-GI-NEXT:    bl log10f
-; CHECK-GI-NEXT:    bfcvt h0, s0
-; CHECK-GI-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
-; CHECK-GI-NEXT:    ret
+; CHECK-BF16-LABEL: test_log10:
+; CHECK-BF16:       // %bb.0:
+; CHECK-BF16-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-BF16-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-BF16-NEXT:    // kill: def $s0 killed $s0 killed $q0
+; CHECK-BF16-NEXT:    bl log10f
+; CHECK-BF16-NEXT:    bfcvt h0, s0
+; CHECK-BF16-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-BF16-NEXT:    ret
   %r = call bfloat @llvm.log10.f16(bfloat %a)
   ret bfloat %r
 }
@@ -2148,25 +1749,16 @@ define bfloat @test_log2(bfloat %a) #0 {
 ; CHECK-CVT-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
 ; CHECK-CVT-NEXT:    ret
 ;
-; CHECK-SD-LABEL: test_log2:
-; CHECK-SD:       // %bb.0:
-; CHECK-SD-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-SD-NEXT:    // kill: def $h0 killed $h0 def $d0
-; CHECK-SD-NEXT:    shll v0.4s, v0.4h, #16
-; CHECK-SD-NEXT:    // kill: def $s0 killed $s0 killed $q0
-; CHECK-SD-NEXT:    bl log2f
-; CHECK-SD-NEXT:    bfcvt h0, s0
-; CHECK-SD-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: test_log2:
-; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-GI-NEXT:    fcvt s0, h0
-; CHECK-GI-NEXT:    bl log2f
-; CHECK-GI-NEXT:    bfcvt h0, s0
-; CHECK-GI-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
-; CHECK-GI-NEXT:    ret
+; CHECK-BF16-LABEL: test_log2:
+; CHECK-BF16:       // %bb.0:
+; CHECK-BF16-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-BF16-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-BF16-NEXT:    // kill: def $s0 killed $s0 killed $q0
+; CHECK-BF16-NEXT:    bl log2f
+; CHECK-BF16-NEXT:    bfcvt h0, s0
+; CHECK-BF16-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-BF16-NEXT:    ret
   %r = call bfloat @llvm.log2.f16(bfloat %a)
   ret bfloat %r
 }
@@ -2191,49 +1783,30 @@ define bfloat @test_fma(bfloat %a, bfloat %b, bfloat %c) #0 {
 ; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 killed $s0
 ; CHECK-CVT-NEXT:    ret
 ;
-; CHECK-SD-LABEL: test_fma:
-; CHECK-SD:       // %bb.0:
-; CHECK-SD-NEXT:    // kill: def $h2 killed $h2 def $d2
-; CHECK-SD-NEXT:    // kill: def $h1 killed $h1 def $d1
-; CHECK-SD-NEXT:    // kill: def $h0 killed $h0 def $d0
-; CHECK-SD-NEXT:    shll v2.4s, v2.4h, #16
-; CHECK-SD-NEXT:    shll v1.4s, v1.4h, #16
-; CHECK-SD-NEXT:    shll v0.4s, v0.4h, #16
-; CHECK-SD-NEXT:    fmadd s0, s0, s1, s2
-; CHECK-SD-NEXT:    bfcvt h0, s0
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: test_fma:
-; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    fmadd h0, h0, h1, h2
-; CHECK-GI-NEXT:    ret
+; CHECK-BF16-LABEL: test_fma:
+; CHECK-BF16:       // %bb.0:
+; CHECK-BF16-NEXT:    // kill: def $h2 killed $h2 def $d2
+; CHECK-BF16-NEXT:    // kill: def $h1 killed $h1 def $d1
+; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-BF16-NEXT:    shll v2.4s, v2.4h, #16
+; CHECK-BF16-NEXT:    shll v1.4s, v1.4h, #16
+; CHECK-BF16-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-BF16-NEXT:    fmadd s0, s0, s1, s2
+; CHECK-BF16-NEXT:    bfcvt h0, s0
+; CHECK-BF16-NEXT:    ret
   %r = call bfloat @llvm.fma.f16(bfloat %a, bfloat %b, bfloat %c)
   ret bfloat %r
 }
 
 define bfloat @test_fabs(bfloat %a) #0 {
-; CHECK-CVT-LABEL: test_fabs:
-; CHECK-CVT:       // %bb.0:
-; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-CVT-NEXT:    fmov w8, s0
-; CHECK-CVT-NEXT:    and w8, w8, #0x7fff
-; CHECK-CVT-NEXT:    fmov s0, w8
-; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 killed $s0
-; CHECK-CVT-NEXT:    ret
-;
-; CHECK-SD-LABEL: test_fabs:
-; CHECK-SD:       // %bb.0:
-; CHECK-SD-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-SD-NEXT:    fmov w8, s0
-; CHECK-SD-NEXT:    and w8, w8, #0x7fff
-; CHECK-SD-NEXT:    fmov s0, w8
-; CHECK-SD-NEXT:    // kill: def $h0 killed $h0 killed $s0
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: test_fabs:
-; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    fabs h0, h0
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: test_fabs:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $h0 killed $h0 def $s0
+; CHECK-NEXT:    fmov w8, s0
+; CHECK-NEXT:    and w8, w8, #0x7fff
+; CHECK-NEXT:    fmov s0, w8
+; CHECK-NEXT:    // kill: def $h0 killed $h0 killed $s0
+; CHECK-NEXT:    ret
   %r = call bfloat @llvm.fabs.f16(bfloat %a)
   ret bfloat %r
 }
@@ -2256,20 +1829,15 @@ define bfloat @test_minnum(bfloat %a, bfloat %b) #0 {
 ; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 killed $s0
 ; CHECK-CVT-NEXT:    ret
 ;
-; CHECK-SD-LABEL: test_minnum:
-; CHECK-SD:       // %bb.0:
-; CHECK-SD-NEXT:    // kill: def $h1 killed $h1 def $d1
-; CHECK-SD-NEXT:    // kill: def $h0 killed $h0 def $d0
-; CHECK-SD-NEXT:    shll v1.4s, v1.4h, #16
-; CHECK-SD-NEXT:    shll v0.4s, v0.4h, #16
-; CHECK-SD-NEXT:    fminnm s0, s0, s1
-; CHECK-SD-NEXT:    bfcvt h0, s0
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: test_minnum:
-; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    fminnm h0, h0, h1
-; CHECK-GI-NEXT:    ret
+; CHECK-BF16-LABEL: test_minnum:
+; CHECK-BF16:       // %bb.0:
+; CHECK-BF16-NEXT:    // kill: def $h1 killed $h1 def $d1
+; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-BF16-NEXT:    shll v1.4s, v1.4h, #16
+; CHECK-BF16-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-BF16-NEXT:    fminnm s0, s0, s1
+; CHECK-BF16-NEXT:    bfcvt h0, s0
+; CHECK-BF16-NEXT:    ret
   %r = call bfloat @llvm.minnum.f16(bfloat %a, bfloat %b)
   ret bfloat %r
 }
@@ -2292,20 +1860,15 @@ define bfloat @test_maxnum(bfloat %a, bfloat %b) #0 {
 ; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 killed $s0
 ; CHECK-CVT-NEXT:    ret
 ;
-; CHECK-SD-LABEL: test_maxnum:
-; CHECK-SD:       // %bb.0:
-; CHECK-SD-NEXT:    // kill: def $h1 killed $h1 def $d1
-; CHECK-SD-NEXT:    // kill: def $h0 killed $h0 def $d0
-; CHECK-SD-NEXT:    shll v1.4s, v1.4h, #16
-; CHECK-SD-NEXT:    shll v0.4s, v0.4h, #16
-; CHECK-SD-NEXT:    fmaxnm s0, s0, s1
-; CHECK-SD-NEXT:    bfcvt h0, s0
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: test_maxnum:
-; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    fmaxnm h0, h0, h1
-; CHECK-GI-NEXT:    ret
+; CHECK-BF16-LABEL: test_maxnum:
+; CHECK-BF16:       // %bb.0:
+; CHECK-BF16-NEXT:    // kill: def $h1 killed $h1 def $d1
+; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-BF16-NEXT:    shll v1.4s, v1.4h, #16
+; CHECK-BF16-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-BF16-NEXT:    fmaxnm s0, s0, s1
+; CHECK-BF16-NEXT:    bfcvt h0, s0
+; CHECK-BF16-NEXT:    ret
   %r = call bfloat @llvm.maxnum.f16(bfloat %a, bfloat %b)
   ret bfloat %r
 }
@@ -2338,11 +1901,11 @@ define bfloat @test_copysign(bfloat %a, bfloat %b) #0 {
 ;
 ; CHECK-GI-LABEL: test_copysign:
 ; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    mvni v2.4h, #128, lsl #8
-; CHECK-GI-NEXT:    // kill: def $h0 killed $h0 def $d0
-; CHECK-GI-NEXT:    // kill: def $h1 killed $h1 def $d1
-; CHECK-GI-NEXT:    bif v0.8b, v1.8b, v2.8b
-; CHECK-GI-NEXT:    // kill: def $h0 killed $h0 killed $d0
+; CHECK-GI-NEXT:    mvni v2.8h, #128, lsl #8
+; CHECK-GI-NEXT:    // kill: def $h0 killed $h0 def $q0
+; CHECK-GI-NEXT:    // kill: def $h1 killed $h1 def $q1
+; CHECK-GI-NEXT:    bif v0.16b, v1.16b, v2.16b
+; CHECK-GI-NEXT:    // kill: def $h0 killed $h0 killed $q0
 ; CHECK-GI-NEXT:    ret
   %r = call bfloat @llvm.copysign.f16(bfloat %a, bfloat %b)
   ret bfloat %r
@@ -2488,18 +2051,13 @@ define bfloat @test_floor(bfloat %a) #0 {
 ; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 killed $s0
 ; CHECK-CVT-NEXT:    ret
 ;
-; CHECK-SD-LABEL: test_floor:
-; CHECK-SD:       // %bb.0:
-; CHECK-SD-NEXT:    // kill: def $h0 killed $h0 def $d0
-; CHECK-SD-NEXT:    shll v0.4s, v0.4h, #16
-; CHECK-SD-NEXT:    frintm s0, s0
-; CHECK-SD-NEXT:    bfcvt h0, s0
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: test_floor:
-; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    frintm h0, h0
-; CHECK-GI-NEXT:    ret
+; CHECK-BF16-LABEL: test_floor:
+; CHECK-BF16:       // %bb.0:
+; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-BF16-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-BF16-NEXT:    frintm s0, s0
+; CHECK-BF16-NEXT:    bfcvt h0, s0
+; CHECK-BF16-NEXT:    ret
   %r = call bfloat @llvm.floor.f16(bfloat %a)
   ret bfloat %r
 }
@@ -2520,18 +2078,13 @@ define bfloat @test_ceil(bfloat %a) #0 {
 ; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 killed $s0
 ; CHECK-CVT-NEXT:    ret
 ;
-; CHECK-SD-LABEL: test_ceil:
-; CHECK-SD:       // %bb.0:
-; CHECK-SD-NEXT:    // kill: def $h0 killed $h0 def $d0
-; CHECK-SD-NEXT:    shll v0.4s, v0.4h, #16
-; CHECK-SD-NEXT:    frintp s0, s0
-; CHECK-SD-NEXT:    bfcvt h0, s0
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: test_ceil:
-; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    frintp h0, h0
-; CHECK-GI-NEXT:    ret
+; CHECK-BF16-LABEL: test_ceil:
+; CHECK-BF16:       // %bb.0:
+; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-BF16-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-BF16-NEXT:    frintp s0, s0
+; CHECK-BF16-NEXT:    bfcvt h0, s0
+; CHECK-BF16-NEXT:    ret
   %r = call bfloat @llvm.ceil.f16(bfloat %a)
   ret bfloat %r
 }
@@ -2552,18 +2105,13 @@ define bfloat @test_trunc(bfloat %a) #0 {
 ; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 killed $s0
 ; CHECK-CVT-NEXT:    ret
 ;
-; CHECK-SD-LABEL: test_trunc:
-; CHECK-SD:       // %bb.0:
-; CHECK-SD-NEXT:    // kill: def $h0 killed $h0 def $d0
-; CHECK-SD-NEXT:    shll v0.4s, v0.4h, #16
-; CHECK-SD-NEXT:    frintz s0, s0
-; CHECK-SD-NEXT:    bfcvt h0, s0
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: test_trunc:
-; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    frintz h0, h0
-; CHECK-GI-NEXT:    ret
+; CHECK-BF16-LABEL: test_trunc:
+; CHECK-BF16:       // %bb.0:
+; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-BF16-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-BF16-NEXT:    frintz s0, s0
+; CHECK-BF16-NEXT:    bfcvt h0, s0
+; CHECK-BF16-NEXT:    ret
   %r = call bfloat @llvm.trunc.f16(bfloat %a)
   ret bfloat %r
 }
@@ -2584,18 +2132,13 @@ define bfloat @test_rint(bfloat %a) #0 {
 ; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 killed $s0
 ; CHECK-CVT-NEXT:    ret
 ;
-; CHECK-SD-LABEL: test_rint:
-; CHECK-SD:       // %bb.0:
-; CHECK-SD-NEXT:    // kill: def $h0 killed $h0 def $d0
-; CHECK-SD-NEXT:    shll v0.4s, v0.4h, #16
-; CHECK-SD-NEXT:    frintx s0, s0
-; CHECK-SD-NEXT:    bfcvt h0, s0
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: test_rint:
-; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    frintx h0, h0
-; CHECK-GI-NEXT:    ret
+; CHECK-BF16-LABEL: test_rint:
+; CHECK-BF16:       // %bb.0:
+; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-BF16-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-BF16-NEXT:    frintx s0, s0
+; CHECK-BF16-NEXT:    bfcvt h0, s0
+; CHECK-BF16-NEXT:    ret
   %r = call bfloat @llvm.rint.f16(bfloat %a)
   ret bfloat %r
 }
@@ -2616,18 +2159,13 @@ define bfloat @test_nearbyint(bfloat %a) #0 {
 ; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 killed $s0
 ; CHECK-CVT-NEXT:    ret
 ;
-; CHECK-SD-LABEL: test_nearbyint:
-; CHECK-SD:       // %bb.0:
-; CHECK-SD-NEXT:    // kill: def $h0 killed $h0 def $d0
-; CHECK-SD-NEXT:    shll v0.4s, v0.4h, #16
-; CHECK-SD-NEXT:    frinti s0, s0
-; CHECK-SD-NEXT:    bfcvt h0, s0
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: test_nearbyint:
-; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    frinti h0, h0
-; CHECK-GI-NEXT:    ret
+; CHECK-BF16-LABEL: test_nearbyint:
+; CHECK-BF16:       // %bb.0:
+; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-BF16-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-BF16-NEXT:    frinti s0, s0
+; CHECK-BF16-NEXT:    bfcvt h0, s0
+; CHECK-BF16-NEXT:    ret
   %r = call bfloat @llvm.nearbyint.f16(bfloat %a)
   ret bfloat %r
 }
@@ -2648,18 +2186,13 @@ define bfloat @test_round(bfloat %a) #0 {
 ; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 killed $s0
 ; CHECK-CVT-NEXT:    ret
 ;
-; CHECK-SD-LABEL: test_round:
-; CHECK-SD:       // %bb.0:
-; CHECK-SD-NEXT:    // kill: def $h0 killed $h0 def $d0
-; CHECK-SD-NEXT:    shll v0.4s, v0.4h, #16
-; CHECK-SD-NEXT:    frinta s0, s0
-; CHECK-SD-NEXT:    bfcvt h0, s0
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: test_round:
-; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    frinta h0, h0
-; CHECK-GI-NEXT:    ret
+; CHECK-BF16-LABEL: test_round:
+; CHECK-BF16:       // %bb.0:
+; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-BF16-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-BF16-NEXT:    frinta s0, s0
+; CHECK-BF16-NEXT:    bfcvt h0, s0
+; CHECK-BF16-NEXT:    ret
   %r = call bfloat @llvm.round.f16(bfloat %a)
   ret bfloat %r
 }
@@ -2680,18 +2213,13 @@ define bfloat @test_roundeven(bfloat %a) #0 {
 ; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 killed $s0
 ; CHECK-CVT-NEXT:    ret
 ;
-; CHECK-SD-LABEL: test_roundeven:
-; CHECK-SD:       // %bb.0:
-; CHECK-SD-NEXT:    // kill: def $h0 killed $h0 def $d0
-; CHECK-SD-NEXT:    shll v0.4s, v0.4h, #16
-; CHECK-SD-NEXT:    frintn s0, s0
-; CHECK-SD-NEXT:    bfcvt h0, s0
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: test_roundeven:
-; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    frintn h0, h0
-; CHECK-GI-NEXT:    ret
+; CHECK-BF16-LABEL: test_roundeven:
+; CHECK-BF16:       // %bb.0:
+; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-BF16-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-BF16-NEXT:    frintn s0, s0
+; CHECK-BF16-NEXT:    bfcvt h0, s0
+; CHECK-BF16-NEXT:    ret
   %r = call bfloat @llvm.roundeven.f16(bfloat %a)
   ret bfloat %r
 }
@@ -2724,26 +2252,20 @@ define bfloat @test_fmuladd(bfloat %a, bfloat %b, bfloat %c) #0 {
 ; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 killed $s0
 ; CHECK-CVT-NEXT:    ret
 ;
-; CHECK-SD-LABEL: test_fmuladd:
-; CHECK-SD:       // %bb.0:
-; CHECK-SD-NEXT:    // kill: def $h1 killed $h1 def $d1
-; CHECK-SD-NEXT:    // kill: def $h0 killed $h0 def $d0
-; CHECK-SD-NEXT:    // kill: def $h2 killed $h2 def $d2
-; CHECK-SD-NEXT:    shll v1.4s, v1.4h, #16
-; CHECK-SD-NEXT:    shll v0.4s, v0.4h, #16
-; CHECK-SD-NEXT:    fmul s0, s0, s1
-; CHECK-SD-NEXT:    shll v1.4s, v2.4h, #16
-; CHECK-SD-NEXT:    bfcvt h0, s0
-; CHECK-SD-NEXT:    shll v0.4s, v0.4h, #16
-; CHECK-SD-NEXT:    fadd s0, s0, s1
-; CHECK-SD-NEXT:    bfcvt h0, s0
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: test_fmuladd:
-; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    fmul h0, h0, h1
-; CHECK-GI-NEXT:    fadd h0, h0, h2
-; CHECK-GI-NEXT:    ret
+; CHECK-BF16-LABEL: test_fmuladd:
+; CHECK-BF16:       // %bb.0:
+; CHECK-BF16-NEXT:    // kill: def $h1 killed $h1 def $d1
+; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-BF16-NEXT:    // kill: def $h2 killed $h2 def $d2
+; CHECK-BF16-NEXT:    shll v1.4s, v1.4h, #16
+; CHECK-BF16-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-BF16-NEXT:    fmul s0, s0, s1
+; CHECK-BF16-NEXT:    shll v1.4s, v2.4h, #16
+; CHECK-BF16-NEXT:    bfcvt h0, s0
+; CHECK-BF16-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-BF16-NEXT:    fadd s0, s0, s1
+; CHECK-BF16-NEXT:    bfcvt h0, s0
+; CHECK-BF16-NEXT:    ret
   %r = call bfloat @llvm.fmuladd.f16(bfloat %a, bfloat %b, bfloat %c)
   ret bfloat %r
 }
diff --git a/llvm/test/CodeGen/AArch64/concat-vector.ll b/llvm/test/CodeGen/AArch64/concat-vector.ll
index 0daa6e7f16202a..da46dff7f63383 100644
--- a/llvm/test/CodeGen/AArch64/concat-vector.ll
+++ b/llvm/test/CodeGen/AArch64/concat-vector.ll
@@ -1,6 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mtriple=aarch64 %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD
-; RUN: llc -mtriple=aarch64 -global-isel %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-GI
+; RUN: llc -mtriple=aarch64 -global-isel -global-isel-abort=2 %s -o - 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI
+
+; CHECK-GI:       warning: Instruction selection used fallback path for concat_high_high_v8bf16
 
 define <4 x i8> @concat1(<2 x i8> %A, <2 x i8> %B) {
 ; CHECK-SD-LABEL: concat1:
diff --git a/llvm/test/CodeGen/AArch64/dup.ll b/llvm/test/CodeGen/AArch64/dup.ll
index bfc0ef0826f682..ed6fde55c36016 100644
--- a/llvm/test/CodeGen/AArch64/dup.ll
+++ b/llvm/test/CodeGen/AArch64/dup.ll
@@ -5,6 +5,21 @@
 ; CHECK-GI:       warning: Instruction selection used fallback path for dup_v2i8
 ; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for duplane0_v2i8
 ; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for loaddup_v2i8
+; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for dup_v2bfloat
+; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for duplane0_v2bfloat
+; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for loaddup_v2bfloat
+; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for dup_v3bfloat
+; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for duplane0_v3bfloat
+; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for loaddup_v3bfloat
+; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for dup_v4bfloat
+; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for duplane0_v4bfloat
+; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for loaddup_v4bfloat
+; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for dup_v8bfloat
+; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for duplane0_v8bfloat
+; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for loaddup_v8bfloat
+; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for dup_v16bfloat
+; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for duplane0_v16bfloat
+; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for loaddup_v16bfloat
 
 define <2 x i8> @dup_v2i8(i8 %a) {
 ; CHECK-LABEL: dup_v2i8:
@@ -1248,20 +1263,12 @@ entry:
 }
 
 define <16 x bfloat> @dup_v16bfloat(bfloat %a) {
-; CHECK-SD-LABEL: dup_v16bfloat:
-; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    // kill: def $h0 killed $h0 def $q0
-; CHECK-SD-NEXT:    dup v0.8h, v0.h[0]
-; CHECK-SD-NEXT:    mov v1.16b, v0.16b
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: dup_v16bfloat:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    // kill: def $h0 killed $h0 def $q0
-; CHECK-GI-NEXT:    dup v2.8h, v0.h[0]
-; CHECK-GI-NEXT:    dup v1.8h, v0.h[0]
-; CHECK-GI-NEXT:    mov v0.16b, v2.16b
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: dup_v16bfloat:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $h0 killed $h0 def $q0
+; CHECK-NEXT:    dup v0.8h, v0.h[0]
+; CHECK-NEXT:    mov v1.16b, v0.16b
+; CHECK-NEXT:    ret
 entry:
   %b = insertelement <16 x bfloat> poison, bfloat %a, i64 0
   %c = shufflevector <16 x bfloat> %b, <16 x bfloat> poison, <16 x i32> zeroinitializer
@@ -1280,17 +1287,11 @@ entry:
 }
 
 define <16 x bfloat> @loaddup_v16bfloat(ptr %p) {
-; CHECK-SD-LABEL: loaddup_v16bfloat:
-; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    ld1r { v0.8h }, [x0]
-; CHECK-SD-NEXT:    mov v1.16b, v0.16b
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: loaddup_v16bfloat:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    ld1r { v0.8h }, [x0]
-; CHECK-GI-NEXT:    ld1r { v1.8h }, [x0]
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: loaddup_v16bfloat:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ld1r { v0.8h }, [x0]
+; CHECK-NEXT:    mov v1.16b, v0.16b
+; CHECK-NEXT:    ret
 entry:
   %a = load bfloat, ptr %p
   %b = insertelement <16 x bfloat> poison, bfloat %a, i64 0
diff --git a/llvm/test/CodeGen/AArch64/fptrunc.ll b/llvm/test/CodeGen/AArch64/fptrunc.ll
index 2187717c4148ae..b4c38e9f2df3b2 100644
--- a/llvm/test/CodeGen/AArch64/fptrunc.ll
+++ b/llvm/test/CodeGen/AArch64/fptrunc.ll
@@ -1,6 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
 ; RUN: llc -mtriple=aarch64 -global-isel=0 -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD
 ; RUN: llc -mtriple=aarch64 -global-isel=1 -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-GI
+; RUN: llc -mtriple=aarch64 -global-isel=1 -mattr=+fullfp16,+bf16 -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-GI
 
 define float @fptrunc_f64_f32(double %a) {
 ; CHECK-LABEL: fptrunc_f64_f32:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_load_flat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_load_flat.ll
index 83912b1e77db20..6e072e50ebbb11 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_load_flat.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_load_flat.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri < %s | FileCheck -check-prefixes=GCN,GFX7 %s
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=fiji < %s | FileCheck -check-prefixes=GCN,GFX8 %s
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s
+; RUN: llc -global-isel -global-isel-abort=2 -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri < %s | FileCheck -check-prefixes=GCN,GFX7 %s
+; RUN: llc -global-isel -global-isel-abort=2 -mtriple=amdgcn-amd-amdhsa -mcpu=fiji < %s | FileCheck -check-prefixes=GCN,GFX8 %s
+; RUN: llc -global-isel -global-isel-abort=2 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s
 
 define i8 @atomic_load_flat_monotonic_i8(ptr %ptr) {
 ; GCN-LABEL: atomic_load_flat_monotonic_i8:
@@ -109,12 +109,27 @@ define half @atomic_load_flat_monotonic_f16(ptr %ptr) {
 }
 
 define bfloat @atomic_load_flat_monotonic_bf16(ptr %ptr) {
-; GCN-LABEL: atomic_load_flat_monotonic_bf16:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    flat_load_ushort v0, v[0:1] glc
-; GCN-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; GFX7-LABEL: atomic_load_flat_monotonic_bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    flat_load_ushort v0, v[0:1] glc
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: atomic_load_flat_monotonic_bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    flat_load_ushort v0, v[0:1] glc
+; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: atomic_load_flat_monotonic_bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    flat_load_ushort v0, v[0:1] glc
+; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %load = load atomic bfloat, ptr %ptr monotonic, align 2
   ret bfloat %load
 }
@@ -133,12 +148,28 @@ define i32 @atomic_load_flat_monotonic_f16_zext_to_i32(ptr %ptr) {
 }
 
 define i32 @atomic_load_flat_monotonic_bf16_zext_to_i32(ptr %ptr) {
-; GCN-LABEL: atomic_load_flat_monotonic_bf16_zext_to_i32:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    flat_load_ushort v0, v[0:1] glc
-; GCN-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; GFX7-LABEL: atomic_load_flat_monotonic_bf16_zext_to_i32:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    flat_load_ushort v0, v[0:1] glc
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: atomic_load_flat_monotonic_bf16_zext_to_i32:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    flat_load_ushort v0, v[0:1] glc
+; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: atomic_load_flat_monotonic_bf16_zext_to_i32:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    flat_load_ushort v0, v[0:1] glc
+; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %load = load atomic bfloat, ptr %ptr monotonic, align 2
   %cast = bitcast bfloat %load to i16
   %ext = zext i16 %cast to i32
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_load_global.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_load_global.ll
index e2906c3d4fdb24..85c1c31813afa3 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_load_global.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_load_global.ll
@@ -1,8 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,GFX6 %s
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri < %s | FileCheck -check-prefixes=GCN,GFX7 %s
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=fiji < %s | FileCheck -check-prefixes=GCN,GFX8 %s
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s
+; RUN: llc -global-isel -global-isel-abort=2 -mtriple=amdgcn-amd-amdhsa -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,GFX6 %s
+; RUN: llc -global-isel -global-isel-abort=2 -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri < %s | FileCheck -check-prefixes=GCN,GFX7 %s
+; RUN: llc -global-isel -global-isel-abort=2 -mtriple=amdgcn-amd-amdhsa -mcpu=fiji < %s | FileCheck -check-prefixes=GCN,GFX8 %s
+; RUN: llc -global-isel -global-isel-abort=2 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s
 
 define i8 @atomic_load_global_monotonic_i8(ptr addrspace(1) %ptr) {
 ; GFX6-LABEL: atomic_load_global_monotonic_i8:
@@ -331,9 +331,11 @@ define bfloat @atomic_load_global_monotonic_bf16(ptr addrspace(1) %ptr) {
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_mov_b32 s6, 0
 ; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b64 s[4:5], 0
+; GFX6-NEXT:    s_mov_b32 s4, s6
+; GFX6-NEXT:    s_mov_b32 s5, s6
 ; GFX6-NEXT:    buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: atomic_load_global_monotonic_bf16:
@@ -341,6 +343,7 @@ define bfloat @atomic_load_global_monotonic_bf16(ptr addrspace(1) %ptr) {
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_load_ushort v0, v[0:1] glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: atomic_load_global_monotonic_bf16:
@@ -403,7 +406,8 @@ define i32 @atomic_load_global_monotonic_bf16_zext_to_i32(ptr addrspace(1) %ptr)
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_mov_b32 s6, 0
 ; GFX6-NEXT:    s_mov_b32 s7, 0x100f000
-; GFX6-NEXT:    s_mov_b64 s[4:5], 0
+; GFX6-NEXT:    s_mov_b32 s4, s6
+; GFX6-NEXT:    s_mov_b32 s5, s6
 ; GFX6-NEXT:    buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
@@ -420,6 +424,7 @@ define i32 @atomic_load_global_monotonic_bf16_zext_to_i32(ptr addrspace(1) %ptr)
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    flat_load_ushort v0, v[0:1] glc
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: atomic_load_global_monotonic_bf16_zext_to_i32:
@@ -427,6 +432,7 @@ define i32 @atomic_load_global_monotonic_bf16_zext_to_i32(ptr addrspace(1) %ptr)
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_ushort v0, v[0:1], off glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %load = load atomic bfloat, ptr addrspace(1) %ptr monotonic, align 2
   %cast = bitcast bfloat %load to i16
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_load_local_2.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_load_local_2.ll
index 1656814d6fb06b..9d7cc1889928f8 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_load_local_2.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_load_local_2.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri < %s | FileCheck -check-prefixes=GCN,GFX7 %s
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=fiji < %s | FileCheck -check-prefixes=GCN,GFX8 %s
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s
+; RUN: llc -global-isel -global-isel-abort=2 -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri < %s | FileCheck -check-prefixes=GCN,GFX7 %s
+; RUN: llc -global-isel -global-isel-abort=2 -mtriple=amdgcn-amd-amdhsa -mcpu=fiji < %s | FileCheck -check-prefixes=GCN,GFX8 %s
+; RUN: llc -global-isel -global-isel-abort=2 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s
 
 ; TODO: Merge with atomic_load_local.ll
 
@@ -261,6 +261,7 @@ define bfloat @atomic_load_local_monotonic_bf16(ptr addrspace(3) %ptr) {
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    ds_read_u16 v0, v0
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: atomic_load_local_monotonic_bf16:
@@ -325,6 +326,7 @@ define i32 @atomic_load_local_monotonic_bf16_zext_to_i32(ptr addrspace(3) %ptr)
 ; GFX8-NEXT:    s_mov_b32 m0, -1
 ; GFX8-NEXT:    ds_read_u16 v0, v0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: atomic_load_local_monotonic_bf16_zext_to_i32:
@@ -332,6 +334,7 @@ define i32 @atomic_load_local_monotonic_bf16_zext_to_i32(ptr addrspace(3) %ptr)
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    ds_read_u16 v0, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %load = load atomic bfloat, ptr addrspace(3) %ptr monotonic, align 2
   %cast = bitcast bfloat %load to i16
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslate-bf16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslate-bf16.ll
index 3206f8e55f44eb..5991f18e1f6dea 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslate-bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslate-bf16.ll
@@ -1,11 +1,13 @@
 ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 4
-; RUN: llc < %s -global-isel -stop-after=irtranslator -mtriple=amdgcn -mcpu=gfx900 | FileCheck %s -check-prefixes=GFX9
+; RUN: llc < %s -global-isel -global-isel-abort=2 -stop-after=irtranslator -mtriple=amdgcn -mcpu=gfx900 | FileCheck %s -check-prefixes=GFX9
 
 ; tests bf16 argument & return values lowering.
+; BF16 is currently expected to fall back to SDAG.
 
 define <3 x bfloat> @v3bf16(<3 x bfloat> %arg0) {
   ; GFX9-LABEL: name: v3bf16
-  ; GFX9: bb.1 (%ir-block.0):
+  ; GFX9: bb.0:
+  ; GFX9-NEXT:   successors: %bb.1(0x80000000)
   ; GFX9-NEXT:   liveins: $vgpr0, $vgpr1
   ; GFX9-NEXT: {{  $}}
   ; GFX9-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
@@ -18,22 +20,16 @@ define <3 x bfloat> @v3bf16(<3 x bfloat> %arg0) {
   ; GFX9-NEXT:   [[ANYEXT3:%[0-9]+]]:_(s32) = G_ANYEXT [[UV3]](s16)
   ; GFX9-NEXT:   [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[ANYEXT]](s32), [[ANYEXT1]](s32), [[ANYEXT2]](s32)
   ; GFX9-NEXT:   [[TRUNC:%[0-9]+]]:_(<3 x s16>) = G_TRUNC [[BUILD_VECTOR]](<3 x s32>)
-  ; GFX9-NEXT:   [[C:%[0-9]+]]:_(s16) = G_FCONSTANT bfloat 0xR0000
-  ; GFX9-NEXT:   [[BUILD_VECTOR1:%[0-9]+]]:_(<3 x s16>) = G_BUILD_VECTOR [[C]](s16), [[C]](s16), [[C]](s16)
-  ; GFX9-NEXT:   [[SHUF:%[0-9]+]]:_(<3 x s16>) = G_SHUFFLE_VECTOR [[TRUNC]](<3 x s16>), [[BUILD_VECTOR1]], shufflemask(3, 1, 2)
-  ; GFX9-NEXT:   [[UV4:%[0-9]+]]:_(s16), [[UV5:%[0-9]+]]:_(s16), [[UV6:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[SHUF]](<3 x s16>)
-  ; GFX9-NEXT:   [[ANYEXT4:%[0-9]+]]:_(s32) = G_ANYEXT [[UV4]](s16)
-  ; GFX9-NEXT:   [[ANYEXT5:%[0-9]+]]:_(s32) = G_ANYEXT [[UV5]](s16)
-  ; GFX9-NEXT:   $vgpr0 = COPY [[ANYEXT4]](s32)
-  ; GFX9-NEXT:   $vgpr1 = COPY [[ANYEXT5]](s32)
-  ; GFX9-NEXT:   SI_RETURN implicit $vgpr0, implicit $vgpr1
+  ; GFX9-NEXT: {{  $}}
+  ; GFX9-NEXT: bb.1 (%ir-block.0):
   %res = shufflevector <3 x bfloat> %arg0, <3 x bfloat> zeroinitializer, <3 x i32> <i32 3, i32 1, i32 2>
   ret <3 x bfloat> %res
 }
 
 define <4 x bfloat> @v4bf16(<4 x bfloat> %arg0) {
   ; GFX9-LABEL: name: v4bf16
-  ; GFX9: bb.1 (%ir-block.0):
+  ; GFX9: bb.0:
+  ; GFX9-NEXT:   successors: %bb.1(0x80000000)
   ; GFX9-NEXT:   liveins: $vgpr0, $vgpr1
   ; GFX9-NEXT: {{  $}}
   ; GFX9-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
@@ -46,22 +42,16 @@ define <4 x bfloat> @v4bf16(<4 x bfloat> %arg0) {
   ; GFX9-NEXT:   [[ANYEXT3:%[0-9]+]]:_(s32) = G_ANYEXT [[UV3]](s16)
   ; GFX9-NEXT:   [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[ANYEXT]](s32), [[ANYEXT1]](s32), [[ANYEXT2]](s32), [[ANYEXT3]](s32)
   ; GFX9-NEXT:   [[TRUNC:%[0-9]+]]:_(<4 x s16>) = G_TRUNC [[BUILD_VECTOR]](<4 x s32>)
-  ; GFX9-NEXT:   [[C:%[0-9]+]]:_(s16) = G_FCONSTANT bfloat 0xR0000
-  ; GFX9-NEXT:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s16>) = G_BUILD_VECTOR [[C]](s16), [[C]](s16), [[C]](s16), [[C]](s16)
-  ; GFX9-NEXT:   [[SHUF:%[0-9]+]]:_(<4 x s16>) = G_SHUFFLE_VECTOR [[TRUNC]](<4 x s16>), [[BUILD_VECTOR1]], shufflemask(3, 1, 2, 0)
-  ; GFX9-NEXT:   [[UV4:%[0-9]+]]:_(s16), [[UV5:%[0-9]+]]:_(s16), [[UV6:%[0-9]+]]:_(s16), [[UV7:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[SHUF]](<4 x s16>)
-  ; GFX9-NEXT:   [[ANYEXT4:%[0-9]+]]:_(s32) = G_ANYEXT [[UV4]](s16)
-  ; GFX9-NEXT:   [[ANYEXT5:%[0-9]+]]:_(s32) = G_ANYEXT [[UV5]](s16)
-  ; GFX9-NEXT:   $vgpr0 = COPY [[ANYEXT4]](s32)
-  ; GFX9-NEXT:   $vgpr1 = COPY [[ANYEXT5]](s32)
-  ; GFX9-NEXT:   SI_RETURN implicit $vgpr0, implicit $vgpr1
+  ; GFX9-NEXT: {{  $}}
+  ; GFX9-NEXT: bb.1 (%ir-block.0):
   %res = shufflevector <4 x bfloat> %arg0, <4 x bfloat> zeroinitializer, <4 x i32> <i32 3, i32 1, i32 2, i32 0>
   ret <4 x bfloat> %res
 }
 
 define <5 x bfloat> @v5bf16(<5 x bfloat> %arg0) {
   ; GFX9-LABEL: name: v5bf16
-  ; GFX9: bb.1 (%ir-block.0):
+  ; GFX9: bb.0:
+  ; GFX9-NEXT:   successors: %bb.1(0x80000000)
   ; GFX9-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2
   ; GFX9-NEXT: {{  $}}
   ; GFX9-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
@@ -78,24 +68,16 @@ define <5 x bfloat> @v5bf16(<5 x bfloat> %arg0) {
   ; GFX9-NEXT:   [[ANYEXT5:%[0-9]+]]:_(s32) = G_ANYEXT [[UV5]](s16)
   ; GFX9-NEXT:   [[BUILD_VECTOR:%[0-9]+]]:_(<5 x s32>) = G_BUILD_VECTOR [[ANYEXT]](s32), [[ANYEXT1]](s32), [[ANYEXT2]](s32), [[ANYEXT3]](s32), [[ANYEXT4]](s32)
   ; GFX9-NEXT:   [[TRUNC:%[0-9]+]]:_(<5 x s16>) = G_TRUNC [[BUILD_VECTOR]](<5 x s32>)
-  ; GFX9-NEXT:   [[C:%[0-9]+]]:_(s16) = G_FCONSTANT bfloat 0xR0000
-  ; GFX9-NEXT:   [[BUILD_VECTOR1:%[0-9]+]]:_(<5 x s16>) = G_BUILD_VECTOR [[C]](s16), [[C]](s16), [[C]](s16), [[C]](s16), [[C]](s16)
-  ; GFX9-NEXT:   [[SHUF:%[0-9]+]]:_(<5 x s16>) = G_SHUFFLE_VECTOR [[TRUNC]](<5 x s16>), [[BUILD_VECTOR1]], shufflemask(3, 1, 2, 0, 4)
-  ; GFX9-NEXT:   [[UV6:%[0-9]+]]:_(s16), [[UV7:%[0-9]+]]:_(s16), [[UV8:%[0-9]+]]:_(s16), [[UV9:%[0-9]+]]:_(s16), [[UV10:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[SHUF]](<5 x s16>)
-  ; GFX9-NEXT:   [[ANYEXT6:%[0-9]+]]:_(s32) = G_ANYEXT [[UV6]](s16)
-  ; GFX9-NEXT:   [[ANYEXT7:%[0-9]+]]:_(s32) = G_ANYEXT [[UV7]](s16)
-  ; GFX9-NEXT:   [[ANYEXT8:%[0-9]+]]:_(s32) = G_ANYEXT [[UV8]](s16)
-  ; GFX9-NEXT:   $vgpr0 = COPY [[ANYEXT6]](s32)
-  ; GFX9-NEXT:   $vgpr1 = COPY [[ANYEXT7]](s32)
-  ; GFX9-NEXT:   $vgpr2 = COPY [[ANYEXT8]](s32)
-  ; GFX9-NEXT:   SI_RETURN implicit $vgpr0, implicit $vgpr1, implicit $vgpr2
+  ; GFX9-NEXT: {{  $}}
+  ; GFX9-NEXT: bb.1 (%ir-block.0):
   %res = shufflevector <5 x bfloat> %arg0, <5 x bfloat> zeroinitializer, <5 x i32> <i32 3, i32 1, i32 2, i32 0, i32 4>
   ret <5 x bfloat> %res
 }
 
 define <6 x bfloat> @v6bf16(<6 x bfloat> %arg0) {
   ; GFX9-LABEL: name: v6bf16
-  ; GFX9: bb.1 (%ir-block.0):
+  ; GFX9: bb.0:
+  ; GFX9-NEXT:   successors: %bb.1(0x80000000)
   ; GFX9-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2
   ; GFX9-NEXT: {{  $}}
   ; GFX9-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
@@ -112,24 +94,16 @@ define <6 x bfloat> @v6bf16(<6 x bfloat> %arg0) {
   ; GFX9-NEXT:   [[ANYEXT5:%[0-9]+]]:_(s32) = G_ANYEXT [[UV5]](s16)
   ; GFX9-NEXT:   [[BUILD_VECTOR:%[0-9]+]]:_(<6 x s32>) = G_BUILD_VECTOR [[ANYEXT]](s32), [[ANYEXT1]](s32), [[ANYEXT2]](s32), [[ANYEXT3]](s32), [[ANYEXT4]](s32), [[ANYEXT5]](s32)
   ; GFX9-NEXT:   [[TRUNC:%[0-9]+]]:_(<6 x s16>) = G_TRUNC [[BUILD_VECTOR]](<6 x s32>)
-  ; GFX9-NEXT:   [[C:%[0-9]+]]:_(s16) = G_FCONSTANT bfloat 0xR0000
-  ; GFX9-NEXT:   [[BUILD_VECTOR1:%[0-9]+]]:_(<6 x s16>) = G_BUILD_VECTOR [[C]](s16), [[C]](s16), [[C]](s16), [[C]](s16), [[C]](s16), [[C]](s16)
-  ; GFX9-NEXT:   [[SHUF:%[0-9]+]]:_(<6 x s16>) = G_SHUFFLE_VECTOR [[TRUNC]](<6 x s16>), [[BUILD_VECTOR1]], shufflemask(3, 1, 2, 0, 4, 5)
-  ; GFX9-NEXT:   [[UV6:%[0-9]+]]:_(s16), [[UV7:%[0-9]+]]:_(s16), [[UV8:%[0-9]+]]:_(s16), [[UV9:%[0-9]+]]:_(s16), [[UV10:%[0-9]+]]:_(s16), [[UV11:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[SHUF]](<6 x s16>)
-  ; GFX9-NEXT:   [[ANYEXT6:%[0-9]+]]:_(s32) = G_ANYEXT [[UV6]](s16)
-  ; GFX9-NEXT:   [[ANYEXT7:%[0-9]+]]:_(s32) = G_ANYEXT [[UV7]](s16)
-  ; GFX9-NEXT:   [[ANYEXT8:%[0-9]+]]:_(s32) = G_ANYEXT [[UV8]](s16)
-  ; GFX9-NEXT:   $vgpr0 = COPY [[ANYEXT6]](s32)
-  ; GFX9-NEXT:   $vgpr1 = COPY [[ANYEXT7]](s32)
-  ; GFX9-NEXT:   $vgpr2 = COPY [[ANYEXT8]](s32)
-  ; GFX9-NEXT:   SI_RETURN implicit $vgpr0, implicit $vgpr1, implicit $vgpr2
+  ; GFX9-NEXT: {{  $}}
+  ; GFX9-NEXT: bb.1 (%ir-block.0):
   %res = shufflevector <6 x bfloat> %arg0, <6 x bfloat> zeroinitializer, <6 x i32> <i32 3, i32 1, i32 2, i32 0, i32 4, i32 5>
   ret <6 x bfloat> %res
 }
 
 define <7 x bfloat> @v7bf16(<7 x bfloat> %arg0) {
   ; GFX9-LABEL: name: v7bf16
-  ; GFX9: bb.1 (%ir-block.0):
+  ; GFX9: bb.0:
+  ; GFX9-NEXT:   successors: %bb.1(0x80000000)
   ; GFX9-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
   ; GFX9-NEXT: {{  $}}
   ; GFX9-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
@@ -150,26 +124,16 @@ define <7 x bfloat> @v7bf16(<7 x bfloat> %arg0) {
   ; GFX9-NEXT:   [[ANYEXT7:%[0-9]+]]:_(s32) = G_ANYEXT [[UV7]](s16)
   ; GFX9-NEXT:   [[BUILD_VECTOR:%[0-9]+]]:_(<7 x s32>) = G_BUILD_VECTOR [[ANYEXT]](s32), [[ANYEXT1]](s32), [[ANYEXT2]](s32), [[ANYEXT3]](s32), [[ANYEXT4]](s32), [[ANYEXT5]](s32), [[ANYEXT6]](s32)
   ; GFX9-NEXT:   [[TRUNC:%[0-9]+]]:_(<7 x s16>) = G_TRUNC [[BUILD_VECTOR]](<7 x s32>)
-  ; GFX9-NEXT:   [[C:%[0-9]+]]:_(s16) = G_FCONSTANT bfloat 0xR0000
-  ; GFX9-NEXT:   [[BUILD_VECTOR1:%[0-9]+]]:_(<7 x s16>) = G_BUILD_VECTOR [[C]](s16), [[C]](s16), [[C]](s16), [[C]](s16), [[C]](s16), [[C]](s16), [[C]](s16)
-  ; GFX9-NEXT:   [[SHUF:%[0-9]+]]:_(<7 x s16>) = G_SHUFFLE_VECTOR [[TRUNC]](<7 x s16>), [[BUILD_VECTOR1]], shufflemask(3, 1, 2, 0, 4, 5, 6)
-  ; GFX9-NEXT:   [[UV8:%[0-9]+]]:_(s16), [[UV9:%[0-9]+]]:_(s16), [[UV10:%[0-9]+]]:_(s16), [[UV11:%[0-9]+]]:_(s16), [[UV12:%[0-9]+]]:_(s16), [[UV13:%[0-9]+]]:_(s16), [[UV14:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[SHUF]](<7 x s16>)
-  ; GFX9-NEXT:   [[ANYEXT8:%[0-9]+]]:_(s32) = G_ANYEXT [[UV8]](s16)
-  ; GFX9-NEXT:   [[ANYEXT9:%[0-9]+]]:_(s32) = G_ANYEXT [[UV9]](s16)
-  ; GFX9-NEXT:   [[ANYEXT10:%[0-9]+]]:_(s32) = G_ANYEXT [[UV10]](s16)
-  ; GFX9-NEXT:   [[ANYEXT11:%[0-9]+]]:_(s32) = G_ANYEXT [[UV11]](s16)
-  ; GFX9-NEXT:   $vgpr0 = COPY [[ANYEXT8]](s32)
-  ; GFX9-NEXT:   $vgpr1 = COPY [[ANYEXT9]](s32)
-  ; GFX9-NEXT:   $vgpr2 = COPY [[ANYEXT10]](s32)
-  ; GFX9-NEXT:   $vgpr3 = COPY [[ANYEXT11]](s32)
-  ; GFX9-NEXT:   SI_RETURN implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
+  ; GFX9-NEXT: {{  $}}
+  ; GFX9-NEXT: bb.1 (%ir-block.0):
   %res = shufflevector <7 x bfloat> %arg0, <7 x bfloat> zeroinitializer, <7 x i32> <i32 3, i32 1, i32 2, i32 0, i32 4, i32 5, i32 6>
   ret <7 x bfloat> %res
 }
 
 define <8 x bfloat> @v8bf16(<8 x bfloat> %arg0) {
   ; GFX9-LABEL: name: v8bf16
-  ; GFX9: bb.1 (%ir-block.0):
+  ; GFX9: bb.0:
+  ; GFX9-NEXT:   successors: %bb.1(0x80000000)
   ; GFX9-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
   ; GFX9-NEXT: {{  $}}
   ; GFX9-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
@@ -190,26 +154,16 @@ define <8 x bfloat> @v8bf16(<8 x bfloat> %arg0) {
   ; GFX9-NEXT:   [[ANYEXT7:%[0-9]+]]:_(s32) = G_ANYEXT [[UV7]](s16)
   ; GFX9-NEXT:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[ANYEXT]](s32), [[ANYEXT1]](s32), [[ANYEXT2]](s32), [[ANYEXT3]](s32), [[ANYEXT4]](s32), [[ANYEXT5]](s32), [[ANYEXT6]](s32), [[ANYEXT7]](s32)
   ; GFX9-NEXT:   [[TRUNC:%[0-9]+]]:_(<8 x s16>) = G_TRUNC [[BUILD_VECTOR]](<8 x s32>)
-  ; GFX9-NEXT:   [[C:%[0-9]+]]:_(s16) = G_FCONSTANT bfloat 0xR0000
-  ; GFX9-NEXT:   [[BUILD_VECTOR1:%[0-9]+]]:_(<8 x s16>) = G_BUILD_VECTOR [[C]](s16), [[C]](s16), [[C]](s16), [[C]](s16), [[C]](s16), [[C]](s16), [[C]](s16), [[C]](s16)
-  ; GFX9-NEXT:   [[SHUF:%[0-9]+]]:_(<8 x s16>) = G_SHUFFLE_VECTOR [[TRUNC]](<8 x s16>), [[BUILD_VECTOR1]], shufflemask(3, 1, 2, 0, 4, 5, 6, 7)
-  ; GFX9-NEXT:   [[UV8:%[0-9]+]]:_(s16), [[UV9:%[0-9]+]]:_(s16), [[UV10:%[0-9]+]]:_(s16), [[UV11:%[0-9]+]]:_(s16), [[UV12:%[0-9]+]]:_(s16), [[UV13:%[0-9]+]]:_(s16), [[UV14:%[0-9]+]]:_(s16), [[UV15:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[SHUF]](<8 x s16>)
-  ; GFX9-NEXT:   [[ANYEXT8:%[0-9]+]]:_(s32) = G_ANYEXT [[UV8]](s16)
-  ; GFX9-NEXT:   [[ANYEXT9:%[0-9]+]]:_(s32) = G_ANYEXT [[UV9]](s16)
-  ; GFX9-NEXT:   [[ANYEXT10:%[0-9]+]]:_(s32) = G_ANYEXT [[UV10]](s16)
-  ; GFX9-NEXT:   [[ANYEXT11:%[0-9]+]]:_(s32) = G_ANYEXT [[UV11]](s16)
-  ; GFX9-NEXT:   $vgpr0 = COPY [[ANYEXT8]](s32)
-  ; GFX9-NEXT:   $vgpr1 = COPY [[ANYEXT9]](s32)
-  ; GFX9-NEXT:   $vgpr2 = COPY [[ANYEXT10]](s32)
-  ; GFX9-NEXT:   $vgpr3 = COPY [[ANYEXT11]](s32)
-  ; GFX9-NEXT:   SI_RETURN implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
+  ; GFX9-NEXT: {{  $}}
+  ; GFX9-NEXT: bb.1 (%ir-block.0):
   %res = shufflevector <8 x bfloat> %arg0, <8 x bfloat> zeroinitializer, <8 x i32> <i32 3, i32 1, i32 2, i32 0, i32 4, i32 5, i32 6, i32 7>
   ret <8 x bfloat> %res
 }
 
 define <16 x bfloat> @v16bf16(<16 x bfloat> %arg0) {
   ; GFX9-LABEL: name: v16bf16
-  ; GFX9: bb.1 (%ir-block.0):
+  ; GFX9: bb.0:
+  ; GFX9-NEXT:   successors: %bb.1(0x80000000)
   ; GFX9-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7
   ; GFX9-NEXT: {{  $}}
   ; GFX9-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
@@ -246,30 +200,15 @@ define <16 x bfloat> @v16bf16(<16 x bfloat> %arg0) {
   ; GFX9-NEXT:   [[ANYEXT15:%[0-9]+]]:_(s32) = G_ANYEXT [[UV15]](s16)
   ; GFX9-NEXT:   [[BUILD_VECTOR:%[0-9]+]]:_(<16 x s32>) = G_BUILD_VECTOR [[ANYEXT]](s32), [[ANYEXT1]](s32), [[ANYEXT2]](s32), [[ANYEXT3]](s32), [[ANYEXT4]](s32), [[ANYEXT5]](s32), [[ANYEXT6]](s32), [[ANYEXT7]](s32), [[ANYEXT8]](s32), [[ANYEXT9]](s32), [[ANYEXT10]](s32), [[ANYEXT11]](s32), [[ANYEXT12]](s32), [[ANYEXT13]](s32), [[ANYEXT14]](s32), [[ANYEXT15]](s32)
   ; GFX9-NEXT:   [[TRUNC:%[0-9]+]]:_(<16 x s16>) = G_TRUNC [[BUILD_VECTOR]](<16 x s32>)
-  ; GFX9-NEXT:   [[UV16:%[0-9]+]]:_(s16), [[UV17:%[0-9]+]]:_(s16), [[UV18:%[0-9]+]]:_(s16), [[UV19:%[0-9]+]]:_(s16), [[UV20:%[0-9]+]]:_(s16), [[UV21:%[0-9]+]]:_(s16), [[UV22:%[0-9]+]]:_(s16), [[UV23:%[0-9]+]]:_(s16), [[UV24:%[0-9]+]]:_(s16), [[UV25:%[0-9]+]]:_(s16), [[UV26:%[0-9]+]]:_(s16), [[UV27:%[0-9]+]]:_(s16), [[UV28:%[0-9]+]]:_(s16), [[UV29:%[0-9]+]]:_(s16), [[UV30:%[0-9]+]]:_(s16), [[UV31:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[TRUNC]](<16 x s16>)
-  ; GFX9-NEXT:   [[ANYEXT16:%[0-9]+]]:_(s32) = G_ANYEXT [[UV16]](s16)
-  ; GFX9-NEXT:   [[ANYEXT17:%[0-9]+]]:_(s32) = G_ANYEXT [[UV17]](s16)
-  ; GFX9-NEXT:   [[ANYEXT18:%[0-9]+]]:_(s32) = G_ANYEXT [[UV18]](s16)
-  ; GFX9-NEXT:   [[ANYEXT19:%[0-9]+]]:_(s32) = G_ANYEXT [[UV19]](s16)
-  ; GFX9-NEXT:   [[ANYEXT20:%[0-9]+]]:_(s32) = G_ANYEXT [[UV20]](s16)
-  ; GFX9-NEXT:   [[ANYEXT21:%[0-9]+]]:_(s32) = G_ANYEXT [[UV21]](s16)
-  ; GFX9-NEXT:   [[ANYEXT22:%[0-9]+]]:_(s32) = G_ANYEXT [[UV22]](s16)
-  ; GFX9-NEXT:   [[ANYEXT23:%[0-9]+]]:_(s32) = G_ANYEXT [[UV23]](s16)
-  ; GFX9-NEXT:   $vgpr0 = COPY [[ANYEXT16]](s32)
-  ; GFX9-NEXT:   $vgpr1 = COPY [[ANYEXT17]](s32)
-  ; GFX9-NEXT:   $vgpr2 = COPY [[ANYEXT18]](s32)
-  ; GFX9-NEXT:   $vgpr3 = COPY [[ANYEXT19]](s32)
-  ; GFX9-NEXT:   $vgpr4 = COPY [[ANYEXT20]](s32)
-  ; GFX9-NEXT:   $vgpr5 = COPY [[ANYEXT21]](s32)
-  ; GFX9-NEXT:   $vgpr6 = COPY [[ANYEXT22]](s32)
-  ; GFX9-NEXT:   $vgpr7 = COPY [[ANYEXT23]](s32)
-  ; GFX9-NEXT:   SI_RETURN implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7
+  ; GFX9-NEXT: {{  $}}
+  ; GFX9-NEXT: bb.1 (%ir-block.0):
   ret <16 x bfloat> %arg0
 }
 
 define <32 x bfloat> @v32bf16(<32 x bfloat> %arg0) {
   ; GFX9-LABEL: name: v32bf16
-  ; GFX9: bb.1 (%ir-block.0):
+  ; GFX9: bb.0:
+  ; GFX9-NEXT:   successors: %bb.1(0x80000000)
   ; GFX9-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15
   ; GFX9-NEXT: {{  $}}
   ; GFX9-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
@@ -338,39 +277,7 @@ define <32 x bfloat> @v32bf16(<32 x bfloat> %arg0) {
   ; GFX9-NEXT:   [[ANYEXT31:%[0-9]+]]:_(s32) = G_ANYEXT [[UV31]](s16)
   ; GFX9-NEXT:   [[BUILD_VECTOR:%[0-9]+]]:_(<32 x s32>) = G_BUILD_VECTOR [[ANYEXT]](s32), [[ANYEXT1]](s32), [[ANYEXT2]](s32), [[ANYEXT3]](s32), [[ANYEXT4]](s32), [[ANYEXT5]](s32), [[ANYEXT6]](s32), [[ANYEXT7]](s32), [[ANYEXT8]](s32), [[ANYEXT9]](s32), [[ANYEXT10]](s32), [[ANYEXT11]](s32), [[ANYEXT12]](s32), [[ANYEXT13]](s32), [[ANYEXT14]](s32), [[ANYEXT15]](s32), [[ANYEXT16]](s32), [[ANYEXT17]](s32), [[ANYEXT18]](s32), [[ANYEXT19]](s32), [[ANYEXT20]](s32), [[ANYEXT21]](s32), [[ANYEXT22]](s32), [[ANYEXT23]](s32), [[ANYEXT24]](s32), [[ANYEXT25]](s32), [[ANYEXT26]](s32), [[ANYEXT27]](s32), [[ANYEXT28]](s32), [[ANYEXT29]](s32), [[ANYEXT30]](s32), [[ANYEXT31]](s32)
   ; GFX9-NEXT:   [[TRUNC:%[0-9]+]]:_(<32 x s16>) = G_TRUNC [[BUILD_VECTOR]](<32 x s32>)
-  ; GFX9-NEXT:   [[UV32:%[0-9]+]]:_(s16), [[UV33:%[0-9]+]]:_(s16), [[UV34:%[0-9]+]]:_(s16), [[UV35:%[0-9]+]]:_(s16), [[UV36:%[0-9]+]]:_(s16), [[UV37:%[0-9]+]]:_(s16), [[UV38:%[0-9]+]]:_(s16), [[UV39:%[0-9]+]]:_(s16), [[UV40:%[0-9]+]]:_(s16), [[UV41:%[0-9]+]]:_(s16), [[UV42:%[0-9]+]]:_(s16), [[UV43:%[0-9]+]]:_(s16), [[UV44:%[0-9]+]]:_(s16), [[UV45:%[0-9]+]]:_(s16), [[UV46:%[0-9]+]]:_(s16), [[UV47:%[0-9]+]]:_(s16), [[UV48:%[0-9]+]]:_(s16), [[UV49:%[0-9]+]]:_(s16), [[UV50:%[0-9]+]]:_(s16), [[UV51:%[0-9]+]]:_(s16), [[UV52:%[0-9]+]]:_(s16), [[UV53:%[0-9]+]]:_(s16), [[UV54:%[0-9]+]]:_(s16), [[UV55:%[0-9]+]]:_(s16), [[UV56:%[0-9]+]]:_(s16), [[UV57:%[0-9]+]]:_(s16), [[UV58:%[0-9]+]]:_(s16), [[UV59:%[0-9]+]]:_(s16), [[UV60:%[0-9]+]]:_(s16), [[UV61:%[0-9]+]]:_(s16), [[UV62:%[0-9]+]]:_(s16), [[UV63:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[TRUNC]](<32 x s16>)
-  ; GFX9-NEXT:   [[ANYEXT32:%[0-9]+]]:_(s32) = G_ANYEXT [[UV32]](s16)
-  ; GFX9-NEXT:   [[ANYEXT33:%[0-9]+]]:_(s32) = G_ANYEXT [[UV33]](s16)
-  ; GFX9-NEXT:   [[ANYEXT34:%[0-9]+]]:_(s32) = G_ANYEXT [[UV34]](s16)
-  ; GFX9-NEXT:   [[ANYEXT35:%[0-9]+]]:_(s32) = G_ANYEXT [[UV35]](s16)
-  ; GFX9-NEXT:   [[ANYEXT36:%[0-9]+]]:_(s32) = G_ANYEXT [[UV36]](s16)
-  ; GFX9-NEXT:   [[ANYEXT37:%[0-9]+]]:_(s32) = G_ANYEXT [[UV37]](s16)
-  ; GFX9-NEXT:   [[ANYEXT38:%[0-9]+]]:_(s32) = G_ANYEXT [[UV38]](s16)
-  ; GFX9-NEXT:   [[ANYEXT39:%[0-9]+]]:_(s32) = G_ANYEXT [[UV39]](s16)
-  ; GFX9-NEXT:   [[ANYEXT40:%[0-9]+]]:_(s32) = G_ANYEXT [[UV40]](s16)
-  ; GFX9-NEXT:   [[ANYEXT41:%[0-9]+]]:_(s32) = G_ANYEXT [[UV41]](s16)
-  ; GFX9-NEXT:   [[ANYEXT42:%[0-9]+]]:_(s32) = G_ANYEXT [[UV42]](s16)
-  ; GFX9-NEXT:   [[ANYEXT43:%[0-9]+]]:_(s32) = G_ANYEXT [[UV43]](s16)
-  ; GFX9-NEXT:   [[ANYEXT44:%[0-9]+]]:_(s32) = G_ANYEXT [[UV44]](s16)
-  ; GFX9-NEXT:   [[ANYEXT45:%[0-9]+]]:_(s32) = G_ANYEXT [[UV45]](s16)
-  ; GFX9-NEXT:   [[ANYEXT46:%[0-9]+]]:_(s32) = G_ANYEXT [[UV46]](s16)
-  ; GFX9-NEXT:   [[ANYEXT47:%[0-9]+]]:_(s32) = G_ANYEXT [[UV47]](s16)
-  ; GFX9-NEXT:   $vgpr0 = COPY [[ANYEXT32]](s32)
-  ; GFX9-NEXT:   $vgpr1 = COPY [[ANYEXT33]](s32)
-  ; GFX9-NEXT:   $vgpr2 = COPY [[ANYEXT34]](s32)
-  ; GFX9-NEXT:   $vgpr3 = COPY [[ANYEXT35]](s32)
-  ; GFX9-NEXT:   $vgpr4 = COPY [[ANYEXT36]](s32)
-  ; GFX9-NEXT:   $vgpr5 = COPY [[ANYEXT37]](s32)
-  ; GFX9-NEXT:   $vgpr6 = COPY [[ANYEXT38]](s32)
-  ; GFX9-NEXT:   $vgpr7 = COPY [[ANYEXT39]](s32)
-  ; GFX9-NEXT:   $vgpr8 = COPY [[ANYEXT40]](s32)
-  ; GFX9-NEXT:   $vgpr9 = COPY [[ANYEXT41]](s32)
-  ; GFX9-NEXT:   $vgpr10 = COPY [[ANYEXT42]](s32)
-  ; GFX9-NEXT:   $vgpr11 = COPY [[ANYEXT43]](s32)
-  ; GFX9-NEXT:   $vgpr12 = COPY [[ANYEXT44]](s32)
-  ; GFX9-NEXT:   $vgpr13 = COPY [[ANYEXT45]](s32)
-  ; GFX9-NEXT:   $vgpr14 = COPY [[ANYEXT46]](s32)
-  ; GFX9-NEXT:   $vgpr15 = COPY [[ANYEXT47]](s32)
-  ; GFX9-NEXT:   SI_RETURN implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $vgpr12, implicit $vgpr13, implicit $vgpr14, implicit $vgpr15
+  ; GFX9-NEXT: {{  $}}
+  ; GFX9-NEXT: bb.1 (%ir-block.0):
   ret <32 x bfloat> %arg0
 }
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call.ll
index 7691f4c30de04a..78f33a174980d3 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-; RUN: llc -global-isel -stop-after=irtranslator -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -verify-machineinstrs -o - %s | FileCheck -enable-var-scope %s
+; RUN: llc -global-isel -global-isel-abort=2 -stop-after=irtranslator -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -verify-machineinstrs -o - %s | FileCheck -enable-var-scope %s
 
 declare hidden void @external_void_func_void() #0
 
@@ -5594,48 +5594,14 @@ define void @test_call_external_void_func_f16_inreg(half inreg %arg) #0 {
 
 define void @test_call_external_void_func_bf16_inreg(bfloat inreg %arg) #0 {
   ; CHECK-LABEL: name: test_call_external_void_func_bf16_inreg
-  ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK-NEXT:   liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   successors: %bb.1(0x80000000)
+  ; CHECK-NEXT:   liveins: $sgpr16
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr31
-  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr15
-  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr14
-  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr13
-  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr12
-  ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11
-  ; CHECK-NEXT:   [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9
-  ; CHECK-NEXT:   [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7
-  ; CHECK-NEXT:   [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5
-  ; CHECK-NEXT:   [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr16
-  ; CHECK-NEXT:   [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY9]](s32)
-  ; CHECK-NEXT:   ADJCALLSTACKUP 0, 0, implicit-def $scc
-  ; CHECK-NEXT:   [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_bf16_inreg
-  ; CHECK-NEXT:   [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]]
-  ; CHECK-NEXT:   [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]]
-  ; CHECK-NEXT:   [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY6]]
-  ; CHECK-NEXT:   [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY5]]
-  ; CHECK-NEXT:   [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY4]]
-  ; CHECK-NEXT:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY3]]
-  ; CHECK-NEXT:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY2]]
-  ; CHECK-NEXT:   [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY1]]
-  ; CHECK-NEXT:   [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
-  ; CHECK-NEXT:   [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[TRUNC]](s16)
-  ; CHECK-NEXT:   [[INTRINSIC_CONVERGENT:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[ANYEXT]](s32)
-  ; CHECK-NEXT:   $sgpr0 = COPY [[INTRINSIC_CONVERGENT]](s32)
-  ; CHECK-NEXT:   [[COPY19:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
-  ; CHECK-NEXT:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY19]](<4 x s32>)
-  ; CHECK-NEXT:   $sgpr4_sgpr5 = COPY [[COPY10]](p4)
-  ; CHECK-NEXT:   $sgpr6_sgpr7 = COPY [[COPY11]](p4)
-  ; CHECK-NEXT:   $sgpr8_sgpr9 = COPY [[COPY12]](p4)
-  ; CHECK-NEXT:   $sgpr10_sgpr11 = COPY [[COPY13]](s64)
-  ; CHECK-NEXT:   $sgpr12 = COPY [[COPY14]](s32)
-  ; CHECK-NEXT:   $sgpr13 = COPY [[COPY15]](s32)
-  ; CHECK-NEXT:   $sgpr14 = COPY [[COPY16]](s32)
-  ; CHECK-NEXT:   $sgpr15 = COPY [[COPY17]](s32)
-  ; CHECK-NEXT:   $vgpr31 = COPY [[COPY18]](s32)
-  ; CHECK-NEXT:   $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_bf16_inreg, csr_amdgpu, implicit $sgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31
-  ; CHECK-NEXT:   ADJCALLSTACKDOWN 0, 0, implicit-def $scc
-  ; CHECK-NEXT:   SI_RETURN
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr16
+  ; CHECK-NEXT:   [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1 (%ir-block.0):
   call void @external_void_func_bf16_inreg(bfloat inreg %arg)
   ret void
 }
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-function-args.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-function-args.ll
index 6d32d4c720c991..9307126f5f68b8 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-function-args.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-function-args.ll
@@ -2,7 +2,7 @@
 ; Note update_mir_test_checks does not support generating checks for
 ; the frame info, so some functions have manually added stack object
 ; checks.
-; RUN: llc -mtriple=amdgcn -mcpu=fiji -O0 -stop-after=irtranslator -global-isel -verify-machineinstrs -o - %s | FileCheck %s
+; RUN: llc -mtriple=amdgcn -mcpu=fiji -O0 -stop-after=irtranslator -global-isel -global-isel-abort=2 -verify-machineinstrs -o - %s | FileCheck %s
 ; FIXME: pre-VI should have same ABI without legal i16 operations.
 
 define void @void_func_empty_arg({} %arg0, i32 %arg1) #0 {
@@ -97,8 +97,8 @@ define void @i1_arg_i1_use(i1 %arg) #0 {
   ; CHECK-NEXT:   [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; CHECK-NEXT:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; CHECK-NEXT:   [[XOR:%[0-9]+]]:_(s1) = G_XOR [[TRUNC]], [[C]]
-  ; CHECK-NEXT:   [[INTRINSIC_W_SIDE_EFFECTS:%[0-9]+]]:_(s1), [[INTRINSIC_W_SIDE_EFFECTS1:%[0-9]+]]:_(s64) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.if), [[XOR]](s1)
-  ; CHECK-NEXT:   G_BRCOND [[INTRINSIC_W_SIDE_EFFECTS]](s1), %bb.2
+  ; CHECK-NEXT:   [[INT:%[0-9]+]]:_(s1), [[INT1:%[0-9]+]]:_(s64) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.if), [[XOR]](s1)
+  ; CHECK-NEXT:   G_BRCOND [[INT]](s1), %bb.2
   ; CHECK-NEXT:   G_BR %bb.3
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2.bb1:
@@ -108,7 +108,7 @@ define void @i1_arg_i1_use(i1 %arg) #0 {
   ; CHECK-NEXT:   G_BR %bb.3
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.3.bb2:
-  ; CHECK-NEXT:   G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[INTRINSIC_W_SIDE_EFFECTS1]](s64)
+  ; CHECK-NEXT:   G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[INT1]](s64)
   ; CHECK-NEXT:   SI_RETURN
 bb:
   br i1 %arg, label %bb2, label %bb1
@@ -2913,14 +2913,14 @@ define void @void_func_f16_inreg(half inreg %arg0) #0 {
 
 define void @void_func_bf16_inreg(bfloat inreg %arg0) #0 {
   ; CHECK-LABEL: name: void_func_bf16_inreg
-  ; CHECK: bb.1 (%ir-block.0):
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   successors: %bb.1(0x80000000)
   ; CHECK-NEXT:   liveins: $sgpr16
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr16
   ; CHECK-NEXT:   [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
-  ; CHECK-NEXT:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
-  ; CHECK-NEXT:   G_STORE [[TRUNC]](s16), [[DEF]](p1) :: (store (s16) into `ptr addrspace(1) undef`, addrspace 1)
-  ; CHECK-NEXT:   SI_RETURN
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1 (%ir-block.0):
   store bfloat %arg0, ptr addrspace(1) undef
   ret void
 }
@@ -3018,14 +3018,14 @@ define void @void_func_v2f16_inreg(<2 x half> inreg %arg0) #0 {
 
 define void @void_func_v2bf16_inreg(<2 x bfloat> inreg %arg0) #0 {
   ; CHECK-LABEL: name: void_func_v2bf16_inreg
-  ; CHECK: bb.1 (%ir-block.0):
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   successors: %bb.1(0x80000000)
   ; CHECK-NEXT:   liveins: $sgpr16
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr16
   ; CHECK-NEXT:   [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY]](s32)
-  ; CHECK-NEXT:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
-  ; CHECK-NEXT:   G_STORE [[BITCAST]](<2 x s16>), [[DEF]](p1) :: (store (<2 x s16>) into `ptr addrspace(1) undef`, addrspace 1)
-  ; CHECK-NEXT:   SI_RETURN
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1 (%ir-block.0):
   store <2 x bfloat> %arg0, ptr addrspace(1) undef
   ret void
 }
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll
index 73b891e43de99e..ee89b28a0d2bb1 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -global-isel -global-isel-abort=2 -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
 
 define amdgpu_kernel void @set_inactive(ptr addrspace(1) %out, i32 %in) {
 ; GCN-LABEL: set_inactive:
@@ -284,17 +284,15 @@ define amdgpu_kernel void @set_inactive_v2bf16(ptr addrspace(1) %out, <2 x bfloa
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dword s6, s[4:5], 0x2c
 ; GCN-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GCN-NEXT:    s_or_saveexec_b64 s[2:3], -1
-; GCN-NEXT:    v_mov_b32_e32 v0, 0x3f803f80
-; GCN-NEXT:    s_mov_b64 exec, s[2:3]
+; GCN-NEXT:    s_mov_b32 s3, 0xf000
 ; GCN-NEXT:    s_mov_b32 s2, -1
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v1, s6
 ; GCN-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GCN-NEXT:    v_mov_b32_e32 v0, 0x3f803f80
 ; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[4:5]
 ; GCN-NEXT:    s_mov_b64 exec, s[4:5]
 ; GCN-NEXT:    v_mov_b32_e32 v1, v0
-; GCN-NEXT:    s_mov_b32 s3, 0xf000
 ; GCN-NEXT:    buffer_store_dword v1, off, s[0:3], 0
 ; GCN-NEXT:    s_endpgm
   %tmp.0 = call <2 x bfloat> @llvm.amdgcn.set.inactive.v2bf16(<2 x bfloat> %in, <2 x bfloat> <bfloat 1.0, bfloat 1.0>) #0
@@ -359,21 +357,23 @@ define amdgpu_kernel void @set_inactive_v4bf16(ptr addrspace(1) %out, <4 x bfloa
 ; GCN-LABEL: set_inactive_v4bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GCN-NEXT:    s_or_saveexec_b64 s[4:5], -1
-; GCN-NEXT:    v_mov_b32_e32 v0, 0x3f803f80
-; GCN-NEXT:    s_mov_b64 exec, s[4:5]
+; GCN-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-NEXT:    s_mov_b32 s6, -1
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v3, s2
-; GCN-NEXT:    v_mov_b32_e32 v4, s3
-; GCN-NEXT:    s_or_saveexec_b64 s[4:5], -1
-; GCN-NEXT:    s_mov_b32 s2, -1
-; GCN-NEXT:    v_cndmask_b32_e64 v1, v0, v3, s[4:5]
-; GCN-NEXT:    v_cndmask_b32_e64 v2, v0, v4, s[4:5]
-; GCN-NEXT:    s_mov_b64 exec, s[4:5]
+; GCN-NEXT:    s_mov_b32 s4, s0
+; GCN-NEXT:    s_mov_b32 s5, s1
+; GCN-NEXT:    v_mov_b32_e32 v2, s3
+; GCN-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GCN-NEXT:    v_mov_b32_e32 v0, 0x3f803f80
+; GCN-NEXT:    v_cndmask_b32_e64 v1, v0, v2, s[0:1]
+; GCN-NEXT:    s_mov_b64 exec, s[0:1]
+; GCN-NEXT:    v_mov_b32_e32 v2, s2
+; GCN-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
+; GCN-NEXT:    s_mov_b64 exec, s[0:1]
+; GCN-NEXT:    v_mov_b32_e32 v2, v0
 ; GCN-NEXT:    v_mov_b32_e32 v3, v1
-; GCN-NEXT:    v_mov_b32_e32 v4, v2
-; GCN-NEXT:    s_mov_b32 s3, 0xf000
-; GCN-NEXT:    buffer_store_dwordx2 v[3:4], off, s[0:3], 0
+; GCN-NEXT:    buffer_store_dwordx2 v[2:3], off, s[4:7], 0
 ; GCN-NEXT:    s_endpgm
   %tmp.0 = call <4 x bfloat> @llvm.amdgcn.set.inactive.v4bf16(<4 x bfloat> %in, <4 x bfloat> <bfloat 1.0, bfloat 1.0, bfloat 1.0, bfloat 1.0>) #0
   %tmp = call <4 x bfloat> @llvm.amdgcn.strict.wwm.v4bf16(<4 x bfloat> %tmp.0)
diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-contents-legalization.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-contents-legalization.ll
index cdfaed0a203e92..31a8635785066a 100644
--- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-contents-legalization.ll
+++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-contents-legalization.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefix=SDAG %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -global-isel=1 < %s | FileCheck -check-prefix=GISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -global-isel=1 -global-isel-abort=2 < %s | FileCheck -check-prefix=GISEL %s
 
 ; Note: if you're adding tests here, also add them to
 ; lower-buffer-fat-pointers-contents-legalization.ll to verify the IR produced by
@@ -629,7 +629,6 @@ define <4 x bfloat> @load_v4bf16(ptr addrspace(8) inreg %buf) {
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    buffer_load_dwordx2 v[0:1], off, s[16:19], 0
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GISEL-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
   %ret = load <4 x bfloat>, ptr addrspace(7) %p
@@ -647,10 +646,6 @@ define void @store_v4bf16(<4 x bfloat> %data, ptr addrspace(8) inreg %buf) {
 ; GISEL-LABEL: store_v4bf16:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
-; GISEL-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
-; GISEL-NEXT:    v_mov_b32_sdwa v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GISEL-NEXT:    v_mov_b32_sdwa v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
 ; GISEL-NEXT:    buffer_store_dwordx2 v[0:1], off, s[16:19], 0
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/dagcombine-fmul-sel.ll b/llvm/test/CodeGen/AMDGPU/dagcombine-fmul-sel.ll
index b128be2186df29..935ae48654b648 100644
--- a/llvm/test/CodeGen/AMDGPU/dagcombine-fmul-sel.ll
+++ b/llvm/test/CodeGen/AMDGPU/dagcombine-fmul-sel.ll
@@ -1,12 +1,12 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
 ; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 < %s | FileCheck -check-prefixes=GFX7,GFX7-SDAG %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 < %s | FileCheck -check-prefixes=GFX7,GFX7-GISEL %s
+; RUN: llc -global-isel=1 -global-isel-abort=2 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 < %s | FileCheck -check-prefixes=GFX7,GFX7-GISEL %s
 ; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9,GFX9-SDAG %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9,GFX9-GISEL %s
+; RUN: llc -global-isel=1 -global-isel-abort=2 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9,GFX9-GISEL %s
 ; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 < %s | FileCheck -check-prefixes=GFX10,GFX10-SDAG %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 < %s | FileCheck -check-prefixes=GFX10,GFX10-GISEL %s
+; RUN: llc -global-isel=1 -global-isel-abort=2 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 < %s | FileCheck -check-prefixes=GFX10,GFX10-GISEL %s
 ; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11,GFX11-SDAG %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11,GFX11-GISEL %s
+; RUN: llc -global-isel=1 -global-isel-abort=2 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11,GFX11-GISEL %s
 
 define float @fmul_select_f32_test1(float %x, i32 %bool.arg1, i32 %bool.arg2) {
 ; GFX7-LABEL: fmul_select_f32_test1:
@@ -2541,114 +2541,72 @@ define half @fmul_select_f16_test11_sel_log2val_pos7_neg14(half %x, i32 %bool.ar
 }
 
 define bfloat @fmul_select_bf16_test1(bfloat %x, i32 %bool.arg1, i32 %bool.arg2) {
-; GFX7-SDAG-LABEL: fmul_select_bf16_test1:
-; GFX7-SDAG:       ; %bb.0:
-; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-SDAG-NEXT:    v_mul_f32_e32 v0, 1.0, v0
-; GFX7-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX7-SDAG-NEXT:    v_cndmask_b32_e64 v1, 1.0, 2.0, vcc
-; GFX7-SDAG-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-SDAG-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX7-SDAG-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX7-GISEL-LABEL: fmul_select_bf16_test1:
-; GFX7-GISEL:       ; %bb.0:
-; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX7-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX7-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
-; GFX7-GISEL-NEXT:    v_ldexp_f32_e32 v0, v0, v1
-; GFX7-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX7-GISEL-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-SDAG-LABEL: fmul_select_bf16_test1:
-; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT:    v_mov_b32_e32 v3, 0x3f80
-; GFX9-SDAG-NEXT:    v_mov_b32_e32 v4, 0x4000
-; GFX9-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX9-SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
-; GFX9-SDAG-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX9-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX9-SDAG-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX9-SDAG-NEXT:    v_bfe_u32 v1, v0, 16, 1
-; GFX9-SDAG-NEXT:    s_movk_i32 s4, 0x7fff
-; GFX9-SDAG-NEXT:    v_add3_u32 v1, v1, v0, s4
-; GFX9-SDAG-NEXT:    v_or_b32_e32 v2, 0x400000, v0
-; GFX9-SDAG-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
-; GFX9-SDAG-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
-; GFX9-SDAG-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-GISEL-LABEL: fmul_select_bf16_test1:
-; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0xffff8000
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0x7fff
-; GFX9-GISEL-NEXT:    v_med3_i32 v1, v1, v2, v3
-; GFX9-GISEL-NEXT:    v_ldexp_f16_e32 v0, v0, v1
-; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-SDAG-LABEL: fmul_select_bf16_test1:
-; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-SDAG-NEXT:    v_mov_b32_e32 v3, 0x4000
-; GFX10-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX10-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX10-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0x3f80, v3, vcc_lo
-; GFX10-SDAG-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX10-SDAG-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX10-SDAG-NEXT:    v_bfe_u32 v1, v0, 16, 1
-; GFX10-SDAG-NEXT:    v_or_b32_e32 v2, 0x400000, v0
-; GFX10-SDAG-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX10-SDAG-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
-; GFX10-SDAG-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
-; GFX10-SDAG-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX7-LABEL: fmul_select_bf16_test1:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX7-NEXT:    v_cndmask_b32_e64 v1, 1.0, 2.0, vcc
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-GISEL-LABEL: fmul_select_bf16_test1:
-; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, 0x7fff
-; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
-; GFX10-GISEL-NEXT:    v_med3_i32 v1, 0xffff8000, v1, v2
-; GFX10-GISEL-NEXT:    v_ldexp_f16_e32 v0, v0, v1
-; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-LABEL: fmul_select_bf16_test1:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v3, 0x3f80
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0x4000
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
+; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX9-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
+; GFX9-NEXT:    v_add3_u32 v1, v1, v0, s4
+; GFX9-NEXT:    v_or_b32_e32 v2, 0x400000, v0
+; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-LABEL: fmul_select_bf16_test1:
-; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_dual_mov_b32 v3, 0x4000 :: v_dual_lshlrev_b32 v0, 16, v0
-; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0x3f80, v3, vcc_lo
-; GFX11-SDAG-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX11-SDAG-NEXT:    v_bfe_u32 v1, v0, 16, 1
-; GFX11-SDAG-NEXT:    v_or_b32_e32 v2, 0x400000, v0
-; GFX11-SDAG-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
-; GFX11-SDAG-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX10-LABEL: fmul_select_bf16_test1:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_mov_b32_e32 v3, 0x4000
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, 0x3f80, v3, vcc_lo
+; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX10-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX10-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX10-NEXT:    v_or_b32_e32 v2, 0x400000, v0
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX10-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-GISEL-LABEL: fmul_select_bf16_test1:
-; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX11-GISEL-NEXT:    v_mov_b32_e32 v2, 0x7fff
-; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_med3_i32 v1, 0xffff8000, v1, v2
-; GFX11-GISEL-NEXT:    v_ldexp_f16_e32 v0, v0, v1
-; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-LABEL: fmul_select_bf16_test1:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_dual_mov_b32 v3, 0x4000 :: v_dual_lshlrev_b32 v0, 16, v0
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, 0x3f80, v3, vcc_lo
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX11-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX11-NEXT:    v_or_b32_e32 v2, 0x400000, v0
+; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %bool = icmp eq i32 %bool.arg1, %bool.arg2
   %y = select i1 %bool, bfloat 2.000000e+00, bfloat 1.000000e+00
   %ldexp = fmul bfloat %x, %y
@@ -2656,114 +2614,72 @@ define bfloat @fmul_select_bf16_test1(bfloat %x, i32 %bool.arg1, i32 %bool.arg2)
 }
 
 define bfloat @fmul_select_bf16_test2(bfloat %x, i32 %bool.arg1, i32 %bool.arg2) {
-; GFX7-SDAG-LABEL: fmul_select_bf16_test2:
-; GFX7-SDAG:       ; %bb.0:
-; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-SDAG-NEXT:    v_mul_f32_e32 v0, 1.0, v0
-; GFX7-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX7-SDAG-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0.5, vcc
-; GFX7-SDAG-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-SDAG-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX7-SDAG-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX7-GISEL-LABEL: fmul_select_bf16_test2:
-; GFX7-GISEL:       ; %bb.0:
-; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX7-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX7-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc
-; GFX7-GISEL-NEXT:    v_ldexp_f32_e32 v0, v0, v1
-; GFX7-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX7-GISEL-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-SDAG-LABEL: fmul_select_bf16_test2:
-; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT:    v_mov_b32_e32 v3, 0x3f80
-; GFX9-SDAG-NEXT:    v_mov_b32_e32 v4, 0x3f00
-; GFX9-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX9-SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
-; GFX9-SDAG-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX9-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX9-SDAG-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX9-SDAG-NEXT:    v_bfe_u32 v1, v0, 16, 1
-; GFX9-SDAG-NEXT:    s_movk_i32 s4, 0x7fff
-; GFX9-SDAG-NEXT:    v_add3_u32 v1, v1, v0, s4
-; GFX9-SDAG-NEXT:    v_or_b32_e32 v2, 0x400000, v0
-; GFX9-SDAG-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
-; GFX9-SDAG-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
-; GFX9-SDAG-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-GISEL-LABEL: fmul_select_bf16_test2:
-; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0xffff8000
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0x7fff
-; GFX9-GISEL-NEXT:    v_med3_i32 v1, v1, v2, v3
-; GFX9-GISEL-NEXT:    v_ldexp_f16_e32 v0, v0, v1
-; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-SDAG-LABEL: fmul_select_bf16_test2:
-; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-SDAG-NEXT:    v_mov_b32_e32 v3, 0x3f00
-; GFX10-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX10-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX10-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0x3f80, v3, vcc_lo
-; GFX10-SDAG-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX10-SDAG-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX10-SDAG-NEXT:    v_bfe_u32 v1, v0, 16, 1
-; GFX10-SDAG-NEXT:    v_or_b32_e32 v2, 0x400000, v0
-; GFX10-SDAG-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX10-SDAG-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
-; GFX10-SDAG-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
-; GFX10-SDAG-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX7-LABEL: fmul_select_bf16_test2:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX7-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0.5, vcc
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-GISEL-LABEL: fmul_select_bf16_test2:
-; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, 0x7fff
-; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc_lo
-; GFX10-GISEL-NEXT:    v_med3_i32 v1, 0xffff8000, v1, v2
-; GFX10-GISEL-NEXT:    v_ldexp_f16_e32 v0, v0, v1
-; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-LABEL: fmul_select_bf16_test2:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v3, 0x3f80
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0x3f00
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
+; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX9-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
+; GFX9-NEXT:    v_add3_u32 v1, v1, v0, s4
+; GFX9-NEXT:    v_or_b32_e32 v2, 0x400000, v0
+; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-LABEL: fmul_select_bf16_test2:
-; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_dual_mov_b32 v3, 0x3f00 :: v_dual_lshlrev_b32 v0, 16, v0
-; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0x3f80, v3, vcc_lo
-; GFX11-SDAG-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX11-SDAG-NEXT:    v_bfe_u32 v1, v0, 16, 1
-; GFX11-SDAG-NEXT:    v_or_b32_e32 v2, 0x400000, v0
-; GFX11-SDAG-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
-; GFX11-SDAG-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX10-LABEL: fmul_select_bf16_test2:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_mov_b32_e32 v3, 0x3f00
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, 0x3f80, v3, vcc_lo
+; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX10-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX10-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX10-NEXT:    v_or_b32_e32 v2, 0x400000, v0
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX10-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-GISEL-LABEL: fmul_select_bf16_test2:
-; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX11-GISEL-NEXT:    v_mov_b32_e32 v2, 0x7fff
-; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc_lo
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_med3_i32 v1, 0xffff8000, v1, v2
-; GFX11-GISEL-NEXT:    v_ldexp_f16_e32 v0, v0, v1
-; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-LABEL: fmul_select_bf16_test2:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_dual_mov_b32 v3, 0x3f00 :: v_dual_lshlrev_b32 v0, 16, v0
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, 0x3f80, v3, vcc_lo
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX11-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX11-NEXT:    v_or_b32_e32 v2, 0x400000, v0
+; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %bool = icmp eq i32 %bool.arg1, %bool.arg2
   %y = select i1 %bool, bfloat 5.000000e-01, bfloat 1.000000e+00
   %ldexp = fmul bfloat %x, %y
@@ -2771,158 +2687,111 @@ define bfloat @fmul_select_bf16_test2(bfloat %x, i32 %bool.arg1, i32 %bool.arg2)
 }
 
 define <2 x bfloat> @fmul_select_v2bf16_test3(<2 x bfloat> %x, <2 x i32> %bool.arg1, <2 x i32> %bool.arg2) {
-; GFX7-SDAG-LABEL: fmul_select_v2bf16_test3:
-; GFX7-SDAG:       ; %bb.0:
-; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v4
-; GFX7-SDAG-NEXT:    v_mul_f32_e32 v0, 1.0, v0
-; GFX7-SDAG-NEXT:    v_mul_f32_e32 v1, 1.0, v1
-; GFX7-SDAG-NEXT:    v_cndmask_b32_e64 v2, 1.0, 2.0, vcc
-; GFX7-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
-; GFX7-SDAG-NEXT:    v_cndmask_b32_e64 v3, 1.0, 2.0, vcc
-; GFX7-SDAG-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX7-SDAG-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-SDAG-NEXT:    v_mul_f32_e32 v1, v1, v3
-; GFX7-SDAG-NEXT:    v_mul_f32_e32 v0, v0, v2
-; GFX7-SDAG-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-SDAG-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX7-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX7-GISEL-LABEL: fmul_select_v2bf16_test3:
-; GFX7-GISEL:       ; %bb.0:
-; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX7-GISEL-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; GFX7-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v4
-; GFX7-GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX7-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
-; GFX7-GISEL-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
-; GFX7-GISEL-NEXT:    v_ldexp_f32_e32 v0, v0, v2
-; GFX7-GISEL-NEXT:    v_ldexp_f32_e32 v1, v1, v3
-; GFX7-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX7-GISEL-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; GFX7-GISEL-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-SDAG-LABEL: fmul_select_v2bf16_test3:
-; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT:    v_mov_b32_e32 v5, 0x3f80
-; GFX9-SDAG-NEXT:    v_mov_b32_e32 v6, 0x4000
-; GFX9-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v4
-; GFX9-SDAG-NEXT:    v_cndmask_b32_e32 v2, v5, v6, vcc
-; GFX9-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v3
-; GFX9-SDAG-NEXT:    v_cndmask_b32_e32 v1, v5, v6, vcc
-; GFX9-SDAG-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
-; GFX9-SDAG-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX9-SDAG-NEXT:    v_mul_f32_e32 v1, v3, v1
-; GFX9-SDAG-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX9-SDAG-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX9-SDAG-NEXT:    v_bfe_u32 v3, v1, 16, 1
-; GFX9-SDAG-NEXT:    s_movk_i32 s4, 0x7fff
-; GFX9-SDAG-NEXT:    v_mul_f32_e32 v0, v0, v2
-; GFX9-SDAG-NEXT:    v_add3_u32 v3, v3, v1, s4
-; GFX9-SDAG-NEXT:    v_or_b32_e32 v4, 0x400000, v1
-; GFX9-SDAG-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
-; GFX9-SDAG-NEXT:    v_bfe_u32 v2, v0, 16, 1
-; GFX9-SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
-; GFX9-SDAG-NEXT:    v_add3_u32 v2, v2, v0, s4
-; GFX9-SDAG-NEXT:    v_or_b32_e32 v3, 0x400000, v0
-; GFX9-SDAG-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
-; GFX9-SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc
-; GFX9-SDAG-NEXT:    s_mov_b32 s4, 0x7060302
-; GFX9-SDAG-NEXT:    v_perm_b32 v0, v0, v1, s4
-; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-GISEL-LABEL: fmul_select_v2bf16_test3:
-; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v3
-; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0xffff8000
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0x7fff
-; GFX9-GISEL-NEXT:    v_med3_i32 v1, v1, v2, v3
-; GFX9-GISEL-NEXT:    v_ldexp_f16_e32 v0, v0, v1
-; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-SDAG-LABEL: fmul_select_v2bf16_test3:
-; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-SDAG-NEXT:    v_mov_b32_e32 v5, 0x4000
-; GFX10-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v3
-; GFX10-SDAG-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
-; GFX10-SDAG-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX10-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0x3f80, v5, vcc_lo
-; GFX10-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v4
-; GFX10-SDAG-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX10-SDAG-NEXT:    v_cndmask_b32_e32 v2, 0x3f80, v5, vcc_lo
-; GFX10-SDAG-NEXT:    v_mul_f32_e32 v1, v3, v1
-; GFX10-SDAG-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX10-SDAG-NEXT:    v_or_b32_e32 v4, 0x400000, v1
-; GFX10-SDAG-NEXT:    v_mul_f32_e32 v0, v0, v2
-; GFX10-SDAG-NEXT:    v_bfe_u32 v2, v1, 16, 1
-; GFX10-SDAG-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX10-SDAG-NEXT:    v_bfe_u32 v3, v0, 16, 1
-; GFX10-SDAG-NEXT:    v_add3_u32 v2, v2, v1, 0x7fff
-; GFX10-SDAG-NEXT:    v_or_b32_e32 v5, 0x400000, v0
-; GFX10-SDAG-NEXT:    v_add3_u32 v3, v3, v0, 0x7fff
-; GFX10-SDAG-NEXT:    v_cndmask_b32_e32 v1, v2, v4, vcc_lo
-; GFX10-SDAG-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX10-SDAG-NEXT:    v_cndmask_b32_e32 v0, v3, v5, vcc_lo
-; GFX10-SDAG-NEXT:    v_perm_b32 v0, v0, v1, 0x7060302
-; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX7-LABEL: fmul_select_v2bf16_test3:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v4
+; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v1
+; GFX7-NEXT:    v_cndmask_b32_e64 v2, 1.0, 2.0, vcc
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX7-NEXT:    v_cndmask_b32_e64 v3, 1.0, 2.0, vcc
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    v_mul_f32_e32 v1, v1, v3
+; GFX7-NEXT:    v_mul_f32_e32 v0, v0, v2
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-GISEL-LABEL: fmul_select_v2bf16_test3:
-; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v3
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, 0x7fff
-; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
-; GFX10-GISEL-NEXT:    v_med3_i32 v1, 0xffff8000, v1, v2
-; GFX10-GISEL-NEXT:    v_ldexp_f16_e32 v0, v0, v1
-; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-LABEL: fmul_select_v2bf16_test3:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v5, 0x3f80
+; GFX9-NEXT:    v_mov_b32_e32 v6, 0x4000
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v4
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v5, v6, vcc
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v3
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v6, vcc
+; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
+; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX9-NEXT:    v_mul_f32_e32 v1, v3, v1
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX9-NEXT:    v_bfe_u32 v3, v1, 16, 1
+; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
+; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v2
+; GFX9-NEXT:    v_add3_u32 v3, v3, v1, s4
+; GFX9-NEXT:    v_or_b32_e32 v4, 0x400000, v1
+; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
+; GFX9-NEXT:    v_bfe_u32 v2, v0, 16, 1
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
+; GFX9-NEXT:    v_add3_u32 v2, v2, v0, s4
+; GFX9-NEXT:    v_or_b32_e32 v3, 0x400000, v0
+; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc
+; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
+; GFX9-NEXT:    v_perm_b32 v0, v0, v1, s4
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-LABEL: fmul_select_v2bf16_test3:
-; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_mov_b32_e32 v5, 0x4000
-; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v3
-; GFX11-SDAG-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
-; GFX11-SDAG-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0x3f80, v5, vcc_lo
-; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v4
-; GFX11-SDAG-NEXT:    v_cndmask_b32_e32 v2, 0x3f80, v5, vcc_lo
-; GFX11-SDAG-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_dual_mul_f32 v0, v0, v2 :: v_dual_lshlrev_b32 v1, 16, v1
-; GFX11-SDAG-NEXT:    v_or_b32_e32 v5, 0x400000, v0
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-SDAG-NEXT:    v_mul_f32_e32 v1, v3, v1
-; GFX11-SDAG-NEXT:    v_bfe_u32 v3, v0, 16, 1
-; GFX11-SDAG-NEXT:    v_bfe_u32 v2, v1, 16, 1
-; GFX11-SDAG-NEXT:    v_or_b32_e32 v4, 0x400000, v1
-; GFX11-SDAG-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-SDAG-NEXT:    v_add3_u32 v3, v3, v0, 0x7fff
-; GFX11-SDAG-NEXT:    v_add3_u32 v2, v2, v1, 0x7fff
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-SDAG-NEXT:    v_cndmask_b32_e32 v1, v2, v4, vcc_lo
-; GFX11-SDAG-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-SDAG-NEXT:    v_cndmask_b32_e32 v0, v3, v5, vcc_lo
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_perm_b32 v0, v0, v1, 0x7060302
-; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX10-LABEL: fmul_select_v2bf16_test3:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_mov_b32_e32 v5, 0x4000
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v3
+; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, 0x3f80, v5, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v4
+; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, 0x3f80, v5, vcc_lo
+; GFX10-NEXT:    v_mul_f32_e32 v1, v3, v1
+; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX10-NEXT:    v_or_b32_e32 v4, 0x400000, v1
+; GFX10-NEXT:    v_mul_f32_e32 v0, v0, v2
+; GFX10-NEXT:    v_bfe_u32 v2, v1, 16, 1
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX10-NEXT:    v_bfe_u32 v3, v0, 16, 1
+; GFX10-NEXT:    v_add3_u32 v2, v2, v1, 0x7fff
+; GFX10-NEXT:    v_or_b32_e32 v5, 0x400000, v0
+; GFX10-NEXT:    v_add3_u32 v3, v3, v0, 0x7fff
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v2, v4, vcc_lo
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v3, v5, vcc_lo
+; GFX10-NEXT:    v_perm_b32 v0, v0, v1, 0x7060302
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-GISEL-LABEL: fmul_select_v2bf16_test3:
-; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v3
-; GFX11-GISEL-NEXT:    v_mov_b32_e32 v2, 0x7fff
-; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_med3_i32 v1, 0xffff8000, v1, v2
-; GFX11-GISEL-NEXT:    v_ldexp_f16_e32 v0, v0, v1
-; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-LABEL: fmul_select_v2bf16_test3:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_mov_b32_e32 v5, 0x4000
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v3
+; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, 0x3f80, v5, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v4
+; GFX11-NEXT:    v_cndmask_b32_e32 v2, 0x3f80, v5, vcc_lo
+; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_dual_mul_f32 v0, v0, v2 :: v_dual_lshlrev_b32 v1, 16, v1
+; GFX11-NEXT:    v_or_b32_e32 v5, 0x400000, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_mul_f32_e32 v1, v3, v1
+; GFX11-NEXT:    v_bfe_u32 v3, v0, 16, 1
+; GFX11-NEXT:    v_bfe_u32 v2, v1, 16, 1
+; GFX11-NEXT:    v_or_b32_e32 v4, 0x400000, v1
+; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_add3_u32 v3, v3, v0, 0x7fff
+; GFX11-NEXT:    v_add3_u32 v2, v2, v1, 0x7fff
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, v2, v4, vcc_lo
+; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, v3, v5, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_perm_b32 v0, v0, v1, 0x7060302
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %bool = icmp eq <2 x i32> %bool.arg1, %bool.arg2
   %y = select <2 x i1> %bool, <2 x bfloat> <bfloat 2.000000e+00, bfloat 2.000000e+00>, <2 x bfloat> <bfloat 1.000000e+00, bfloat 1.000000e+00>
   %ldexp = fmul <2 x bfloat> %x, %y
@@ -2930,267 +2799,185 @@ define <2 x bfloat> @fmul_select_v2bf16_test3(<2 x bfloat> %x, <2 x i32> %bool.a
 }
 
 define <2 x bfloat> @fmul_select_v2bf16_test4(<2 x bfloat> %x, <2 x i32> %bool.arg1, <2 x i32> %bool.arg2) {
-; GFX7-SDAG-LABEL: fmul_select_v2bf16_test4:
-; GFX7-SDAG:       ; %bb.0:
-; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v4
-; GFX7-SDAG-NEXT:    v_mul_f32_e32 v0, 1.0, v0
-; GFX7-SDAG-NEXT:    v_mul_f32_e32 v1, 1.0, v1
-; GFX7-SDAG-NEXT:    v_cndmask_b32_e64 v2, 1.0, 0.5, vcc
-; GFX7-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
-; GFX7-SDAG-NEXT:    v_cndmask_b32_e64 v3, 1.0, 0.5, vcc
-; GFX7-SDAG-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX7-SDAG-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-SDAG-NEXT:    v_mul_f32_e32 v1, v1, v3
-; GFX7-SDAG-NEXT:    v_mul_f32_e32 v0, v0, v2
-; GFX7-SDAG-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-SDAG-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX7-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX7-GISEL-LABEL: fmul_select_v2bf16_test4:
-; GFX7-GISEL:       ; %bb.0:
-; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX7-GISEL-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; GFX7-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v4
-; GFX7-GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, -1, vcc
-; GFX7-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
-; GFX7-GISEL-NEXT:    v_cndmask_b32_e64 v3, 0, -1, vcc
-; GFX7-GISEL-NEXT:    v_ldexp_f32_e32 v0, v0, v2
-; GFX7-GISEL-NEXT:    v_ldexp_f32_e32 v1, v1, v3
-; GFX7-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX7-GISEL-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; GFX7-GISEL-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-SDAG-LABEL: fmul_select_v2bf16_test4:
-; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT:    v_mov_b32_e32 v5, 0x3f80
-; GFX9-SDAG-NEXT:    v_mov_b32_e32 v6, 0x3f00
-; GFX9-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v4
-; GFX9-SDAG-NEXT:    v_cndmask_b32_e32 v2, v5, v6, vcc
-; GFX9-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v3
-; GFX9-SDAG-NEXT:    v_cndmask_b32_e32 v1, v5, v6, vcc
-; GFX9-SDAG-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
-; GFX9-SDAG-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX9-SDAG-NEXT:    v_mul_f32_e32 v1, v3, v1
-; GFX9-SDAG-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX9-SDAG-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX9-SDAG-NEXT:    v_bfe_u32 v3, v1, 16, 1
-; GFX9-SDAG-NEXT:    s_movk_i32 s4, 0x7fff
-; GFX9-SDAG-NEXT:    v_mul_f32_e32 v0, v0, v2
-; GFX9-SDAG-NEXT:    v_add3_u32 v3, v3, v1, s4
-; GFX9-SDAG-NEXT:    v_or_b32_e32 v4, 0x400000, v1
-; GFX9-SDAG-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
-; GFX9-SDAG-NEXT:    v_bfe_u32 v2, v0, 16, 1
-; GFX9-SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
-; GFX9-SDAG-NEXT:    v_add3_u32 v2, v2, v0, s4
-; GFX9-SDAG-NEXT:    v_or_b32_e32 v3, 0x400000, v0
-; GFX9-SDAG-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
-; GFX9-SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc
-; GFX9-SDAG-NEXT:    s_mov_b32 s4, 0x7060302
-; GFX9-SDAG-NEXT:    v_perm_b32 v0, v0, v1, s4
-; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-GISEL-LABEL: fmul_select_v2bf16_test4:
-; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v3
-; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0xffff8000
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0x7fff
-; GFX9-GISEL-NEXT:    v_med3_i32 v1, v1, v2, v3
-; GFX9-GISEL-NEXT:    v_ldexp_f16_e32 v0, v0, v1
-; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-SDAG-LABEL: fmul_select_v2bf16_test4:
-; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-SDAG-NEXT:    v_mov_b32_e32 v5, 0x3f00
-; GFX10-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v3
-; GFX10-SDAG-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
-; GFX10-SDAG-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX10-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0x3f80, v5, vcc_lo
-; GFX10-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v4
-; GFX10-SDAG-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX10-SDAG-NEXT:    v_cndmask_b32_e32 v2, 0x3f80, v5, vcc_lo
-; GFX10-SDAG-NEXT:    v_mul_f32_e32 v1, v3, v1
-; GFX10-SDAG-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX10-SDAG-NEXT:    v_or_b32_e32 v4, 0x400000, v1
-; GFX10-SDAG-NEXT:    v_mul_f32_e32 v0, v0, v2
-; GFX10-SDAG-NEXT:    v_bfe_u32 v2, v1, 16, 1
-; GFX10-SDAG-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX10-SDAG-NEXT:    v_bfe_u32 v3, v0, 16, 1
-; GFX10-SDAG-NEXT:    v_add3_u32 v2, v2, v1, 0x7fff
-; GFX10-SDAG-NEXT:    v_or_b32_e32 v5, 0x400000, v0
-; GFX10-SDAG-NEXT:    v_add3_u32 v3, v3, v0, 0x7fff
-; GFX10-SDAG-NEXT:    v_cndmask_b32_e32 v1, v2, v4, vcc_lo
-; GFX10-SDAG-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX10-SDAG-NEXT:    v_cndmask_b32_e32 v0, v3, v5, vcc_lo
-; GFX10-SDAG-NEXT:    v_perm_b32 v0, v0, v1, 0x7060302
-; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX7-LABEL: fmul_select_v2bf16_test4:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v4
+; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v1
+; GFX7-NEXT:    v_cndmask_b32_e64 v2, 1.0, 0.5, vcc
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX7-NEXT:    v_cndmask_b32_e64 v3, 1.0, 0.5, vcc
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    v_mul_f32_e32 v1, v1, v3
+; GFX7-NEXT:    v_mul_f32_e32 v0, v0, v2
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-GISEL-LABEL: fmul_select_v2bf16_test4:
-; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v3
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, 0x7fff
-; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc_lo
-; GFX10-GISEL-NEXT:    v_med3_i32 v1, 0xffff8000, v1, v2
-; GFX10-GISEL-NEXT:    v_ldexp_f16_e32 v0, v0, v1
-; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-LABEL: fmul_select_v2bf16_test4:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v5, 0x3f80
+; GFX9-NEXT:    v_mov_b32_e32 v6, 0x3f00
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v4
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v5, v6, vcc
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v3
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v6, vcc
+; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
+; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX9-NEXT:    v_mul_f32_e32 v1, v3, v1
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX9-NEXT:    v_bfe_u32 v3, v1, 16, 1
+; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
+; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v2
+; GFX9-NEXT:    v_add3_u32 v3, v3, v1, s4
+; GFX9-NEXT:    v_or_b32_e32 v4, 0x400000, v1
+; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
+; GFX9-NEXT:    v_bfe_u32 v2, v0, 16, 1
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
+; GFX9-NEXT:    v_add3_u32 v2, v2, v0, s4
+; GFX9-NEXT:    v_or_b32_e32 v3, 0x400000, v0
+; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc
+; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
+; GFX9-NEXT:    v_perm_b32 v0, v0, v1, s4
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-LABEL: fmul_select_v2bf16_test4:
-; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_mov_b32_e32 v5, 0x3f00
-; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v3
-; GFX11-SDAG-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
-; GFX11-SDAG-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0x3f80, v5, vcc_lo
-; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v4
-; GFX11-SDAG-NEXT:    v_cndmask_b32_e32 v2, 0x3f80, v5, vcc_lo
-; GFX11-SDAG-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_dual_mul_f32 v0, v0, v2 :: v_dual_lshlrev_b32 v1, 16, v1
-; GFX11-SDAG-NEXT:    v_or_b32_e32 v5, 0x400000, v0
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-SDAG-NEXT:    v_mul_f32_e32 v1, v3, v1
-; GFX11-SDAG-NEXT:    v_bfe_u32 v3, v0, 16, 1
-; GFX11-SDAG-NEXT:    v_bfe_u32 v2, v1, 16, 1
-; GFX11-SDAG-NEXT:    v_or_b32_e32 v4, 0x400000, v1
-; GFX11-SDAG-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-SDAG-NEXT:    v_add3_u32 v3, v3, v0, 0x7fff
-; GFX11-SDAG-NEXT:    v_add3_u32 v2, v2, v1, 0x7fff
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-SDAG-NEXT:    v_cndmask_b32_e32 v1, v2, v4, vcc_lo
-; GFX11-SDAG-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-SDAG-NEXT:    v_cndmask_b32_e32 v0, v3, v5, vcc_lo
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_perm_b32 v0, v0, v1, 0x7060302
-; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX10-LABEL: fmul_select_v2bf16_test4:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_mov_b32_e32 v5, 0x3f00
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v3
+; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, 0x3f80, v5, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v4
+; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, 0x3f80, v5, vcc_lo
+; GFX10-NEXT:    v_mul_f32_e32 v1, v3, v1
+; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX10-NEXT:    v_or_b32_e32 v4, 0x400000, v1
+; GFX10-NEXT:    v_mul_f32_e32 v0, v0, v2
+; GFX10-NEXT:    v_bfe_u32 v2, v1, 16, 1
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX10-NEXT:    v_bfe_u32 v3, v0, 16, 1
+; GFX10-NEXT:    v_add3_u32 v2, v2, v1, 0x7fff
+; GFX10-NEXT:    v_or_b32_e32 v5, 0x400000, v0
+; GFX10-NEXT:    v_add3_u32 v3, v3, v0, 0x7fff
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v2, v4, vcc_lo
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v3, v5, vcc_lo
+; GFX10-NEXT:    v_perm_b32 v0, v0, v1, 0x7060302
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-GISEL-LABEL: fmul_select_v2bf16_test4:
-; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v3
-; GFX11-GISEL-NEXT:    v_mov_b32_e32 v2, 0x7fff
-; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc_lo
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_med3_i32 v1, 0xffff8000, v1, v2
-; GFX11-GISEL-NEXT:    v_ldexp_f16_e32 v0, v0, v1
-; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-LABEL: fmul_select_v2bf16_test4:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_mov_b32_e32 v5, 0x3f00
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v3
+; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, 0x3f80, v5, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v4
+; GFX11-NEXT:    v_cndmask_b32_e32 v2, 0x3f80, v5, vcc_lo
+; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_dual_mul_f32 v0, v0, v2 :: v_dual_lshlrev_b32 v1, 16, v1
+; GFX11-NEXT:    v_or_b32_e32 v5, 0x400000, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_mul_f32_e32 v1, v3, v1
+; GFX11-NEXT:    v_bfe_u32 v3, v0, 16, 1
+; GFX11-NEXT:    v_bfe_u32 v2, v1, 16, 1
+; GFX11-NEXT:    v_or_b32_e32 v4, 0x400000, v1
+; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_add3_u32 v3, v3, v0, 0x7fff
+; GFX11-NEXT:    v_add3_u32 v2, v2, v1, 0x7fff
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, v2, v4, vcc_lo
+; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, v3, v5, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_perm_b32 v0, v0, v1, 0x7060302
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %bool = icmp eq <2 x i32> %bool.arg1, %bool.arg2
   %y = select <2 x i1> %bool, <2 x bfloat> <bfloat 5.000000e-01, bfloat 5.000000e-01>, <2 x bfloat> <bfloat 1.000000e+00, bfloat 1.000000e+00>
   %ldexp = fmul <2 x bfloat> %x, %y
   ret <2 x bfloat> %ldexp
 }
-
-define bfloat @fmul_select_bf16_test5(bfloat %x, i32 %bool.arg1, i32 %bool.arg2) {
-; GFX7-SDAG-LABEL: fmul_select_bf16_test5:
-; GFX7-SDAG:       ; %bb.0:
-; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-SDAG-NEXT:    v_mul_f32_e32 v0, 1.0, v0
-; GFX7-SDAG-NEXT:    v_mov_b32_e32 v3, 0x41000000
-; GFX7-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX7-SDAG-NEXT:    v_cndmask_b32_e64 v1, v3, 2.0, vcc
-; GFX7-SDAG-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-SDAG-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX7-SDAG-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX7-GISEL-LABEL: fmul_select_bf16_test5:
-; GFX7-GISEL:       ; %bb.0:
-; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX7-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX7-GISEL-NEXT:    v_cndmask_b32_e64 v1, 3, 1, vcc
-; GFX7-GISEL-NEXT:    v_ldexp_f32_e32 v0, v0, v1
-; GFX7-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX7-GISEL-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-SDAG-LABEL: fmul_select_bf16_test5:
-; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT:    v_mov_b32_e32 v3, 0x4100
-; GFX9-SDAG-NEXT:    v_mov_b32_e32 v4, 0x4000
-; GFX9-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX9-SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
-; GFX9-SDAG-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX9-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX9-SDAG-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX9-SDAG-NEXT:    v_bfe_u32 v1, v0, 16, 1
-; GFX9-SDAG-NEXT:    s_movk_i32 s4, 0x7fff
-; GFX9-SDAG-NEXT:    v_add3_u32 v1, v1, v0, s4
-; GFX9-SDAG-NEXT:    v_or_b32_e32 v2, 0x400000, v0
-; GFX9-SDAG-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
-; GFX9-SDAG-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
-; GFX9-SDAG-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-GISEL-LABEL: fmul_select_bf16_test5:
-; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v1, 3, 1, vcc
-; GFX9-GISEL-NEXT:    v_ldexp_f16_e32 v0, v0, v1
-; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-SDAG-LABEL: fmul_select_bf16_test5:
-; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-SDAG-NEXT:    v_mov_b32_e32 v3, 0x4000
-; GFX10-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX10-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX10-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0x4100, v3, vcc_lo
-; GFX10-SDAG-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX10-SDAG-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX10-SDAG-NEXT:    v_bfe_u32 v1, v0, 16, 1
-; GFX10-SDAG-NEXT:    v_or_b32_e32 v2, 0x400000, v0
-; GFX10-SDAG-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX10-SDAG-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
-; GFX10-SDAG-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
-; GFX10-SDAG-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
+
+define bfloat @fmul_select_bf16_test5(bfloat %x, i32 %bool.arg1, i32 %bool.arg2) {
+; GFX7-LABEL: fmul_select_bf16_test5:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; GFX7-NEXT:    v_mov_b32_e32 v3, 0x41000000
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX7-NEXT:    v_cndmask_b32_e64 v1, v3, 2.0, vcc
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-GISEL-LABEL: fmul_select_bf16_test5:
-; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v1, 3, 1, vcc_lo
-; GFX10-GISEL-NEXT:    v_ldexp_f16_e32 v0, v0, v1
-; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-LABEL: fmul_select_bf16_test5:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v3, 0x4100
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0x4000
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
+; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX9-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
+; GFX9-NEXT:    v_add3_u32 v1, v1, v0, s4
+; GFX9-NEXT:    v_or_b32_e32 v2, 0x400000, v0
+; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-LABEL: fmul_select_bf16_test5:
-; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_dual_mov_b32 v3, 0x4000 :: v_dual_lshlrev_b32 v0, 16, v0
-; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0x4100, v3, vcc_lo
-; GFX11-SDAG-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX11-SDAG-NEXT:    v_bfe_u32 v1, v0, 16, 1
-; GFX11-SDAG-NEXT:    v_or_b32_e32 v2, 0x400000, v0
-; GFX11-SDAG-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
-; GFX11-SDAG-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX10-LABEL: fmul_select_bf16_test5:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_mov_b32_e32 v3, 0x4000
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, 0x4100, v3, vcc_lo
+; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX10-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX10-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX10-NEXT:    v_or_b32_e32 v2, 0x400000, v0
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX10-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-GISEL-LABEL: fmul_select_bf16_test5:
-; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v1, 3, 1, vcc_lo
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_ldexp_f16_e32 v0, v0, v1
-; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-LABEL: fmul_select_bf16_test5:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_dual_mov_b32 v3, 0x4000 :: v_dual_lshlrev_b32 v0, 16, v0
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, 0x4100, v3, vcc_lo
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX11-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX11-NEXT:    v_or_b32_e32 v2, 0x400000, v0
+; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %bool = icmp eq i32 %bool.arg1, %bool.arg2
   %y = select i1 %bool, bfloat 2.000000e+00, bfloat 8.000000e+00
   %ldexp = fmul bfloat %x, %y
@@ -3198,116 +2985,74 @@ define bfloat @fmul_select_bf16_test5(bfloat %x, i32 %bool.arg1, i32 %bool.arg2)
 }
 
 define bfloat @fmul_select_bf16_test6(bfloat %x, i32 %bool.arg1, i32 %bool.arg2) {
-; GFX7-SDAG-LABEL: fmul_select_bf16_test6:
-; GFX7-SDAG:       ; %bb.0:
-; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-SDAG-NEXT:    v_mul_f32_e32 v0, 1.0, v0
-; GFX7-SDAG-NEXT:    v_mov_b32_e32 v3, 0x40400000
-; GFX7-SDAG-NEXT:    v_mov_b32_e32 v4, 0xc1000000
-; GFX7-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX7-SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
-; GFX7-SDAG-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-SDAG-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX7-SDAG-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX7-GISEL-LABEL: fmul_select_bf16_test6:
-; GFX7-GISEL:       ; %bb.0:
-; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-GISEL-NEXT:    v_mov_b32_e32 v3, 0xc100
-; GFX7-GISEL-NEXT:    v_mov_b32_e32 v4, 0x4040
-; GFX7-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX7-GISEL-NEXT:    v_cndmask_b32_e32 v1, v4, v3, vcc
-; GFX7-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX7-GISEL-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; GFX7-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX7-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX7-GISEL-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-SDAG-LABEL: fmul_select_bf16_test6:
-; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT:    v_mov_b32_e32 v3, 0x4040
-; GFX9-SDAG-NEXT:    v_mov_b32_e32 v4, 0xffffc100
-; GFX9-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX9-SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
-; GFX9-SDAG-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX9-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX9-SDAG-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX9-SDAG-NEXT:    v_bfe_u32 v1, v0, 16, 1
-; GFX9-SDAG-NEXT:    s_movk_i32 s4, 0x7fff
-; GFX9-SDAG-NEXT:    v_add3_u32 v1, v1, v0, s4
-; GFX9-SDAG-NEXT:    v_or_b32_e32 v2, 0x400000, v0
-; GFX9-SDAG-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
-; GFX9-SDAG-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
-; GFX9-SDAG-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-GISEL-LABEL: fmul_select_bf16_test6:
-; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0xc100
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v4, 0x4040
-; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v1, v4, v3, vcc
-; GFX9-GISEL-NEXT:    v_mul_f16_e32 v0, v0, v1
-; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-SDAG-LABEL: fmul_select_bf16_test6:
-; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-SDAG-NEXT:    v_mov_b32_e32 v3, 0xffffc100
-; GFX10-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX10-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX10-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0x4040, v3, vcc_lo
-; GFX10-SDAG-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX10-SDAG-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX10-SDAG-NEXT:    v_bfe_u32 v1, v0, 16, 1
-; GFX10-SDAG-NEXT:    v_or_b32_e32 v2, 0x400000, v0
-; GFX10-SDAG-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX10-SDAG-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
-; GFX10-SDAG-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
-; GFX10-SDAG-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX7-LABEL: fmul_select_bf16_test6:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; GFX7-NEXT:    v_mov_b32_e32 v3, 0x40400000
+; GFX7-NEXT:    v_mov_b32_e32 v4, 0xc1000000
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-GISEL-LABEL: fmul_select_bf16_test6:
-; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v3, 0x4040
-; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v1, v3, 0xc100, vcc_lo
-; GFX10-GISEL-NEXT:    v_mul_f16_e32 v0, v0, v1
-; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-LABEL: fmul_select_bf16_test6:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v3, 0x4040
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0xffffc100
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
+; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX9-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
+; GFX9-NEXT:    v_add3_u32 v1, v1, v0, s4
+; GFX9-NEXT:    v_or_b32_e32 v2, 0x400000, v0
+; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-LABEL: fmul_select_bf16_test6:
-; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_dual_mov_b32 v3, 0xffffc100 :: v_dual_lshlrev_b32 v0, 16, v0
-; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0x4040, v3, vcc_lo
-; GFX11-SDAG-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX11-SDAG-NEXT:    v_bfe_u32 v1, v0, 16, 1
-; GFX11-SDAG-NEXT:    v_or_b32_e32 v2, 0x400000, v0
-; GFX11-SDAG-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
-; GFX11-SDAG-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX10-LABEL: fmul_select_bf16_test6:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_mov_b32_e32 v3, 0xffffc100
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, 0x4040, v3, vcc_lo
+; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX10-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX10-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX10-NEXT:    v_or_b32_e32 v2, 0x400000, v0
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX10-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-GISEL-LABEL: fmul_select_bf16_test6:
-; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_mov_b32_e32 v3, 0x4040
-; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v1, v3, 0xc100, vcc_lo
-; GFX11-GISEL-NEXT:    v_mul_f16_e32 v0, v0, v1
-; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-LABEL: fmul_select_bf16_test6:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_dual_mov_b32 v3, 0xffffc100 :: v_dual_lshlrev_b32 v0, 16, v0
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, 0x4040, v3, vcc_lo
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX11-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX11-NEXT:    v_or_b32_e32 v2, 0x400000, v0
+; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %bool = icmp eq i32 %bool.arg1, %bool.arg2
   %y = select i1 %bool, bfloat -8.000000e+00, bfloat 3.000000e+00
   %ldexp = fmul bfloat %x, %y
@@ -3315,115 +3060,73 @@ define bfloat @fmul_select_bf16_test6(bfloat %x, i32 %bool.arg1, i32 %bool.arg2)
 }
 
 define bfloat @fmul_select_bf16_test7(bfloat %x, i32 %bool.arg1, i32 %bool.arg2) {
-; GFX7-SDAG-LABEL: fmul_select_bf16_test7:
-; GFX7-SDAG:       ; %bb.0:
-; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-SDAG-NEXT:    v_mul_f32_e32 v0, 1.0, v0
-; GFX7-SDAG-NEXT:    v_mov_b32_e32 v3, 0x41000000
-; GFX7-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX7-SDAG-NEXT:    v_cndmask_b32_e32 v1, -4.0, v3, vcc
-; GFX7-SDAG-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-SDAG-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX7-SDAG-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX7-GISEL-LABEL: fmul_select_bf16_test7:
-; GFX7-GISEL:       ; %bb.0:
-; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-GISEL-NEXT:    v_mov_b32_e32 v3, 0x4100
-; GFX7-GISEL-NEXT:    v_mov_b32_e32 v4, 0xc080
-; GFX7-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX7-GISEL-NEXT:    v_cndmask_b32_e32 v1, v4, v3, vcc
-; GFX7-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX7-GISEL-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; GFX7-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX7-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX7-GISEL-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-SDAG-LABEL: fmul_select_bf16_test7:
-; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT:    v_mov_b32_e32 v3, 0xffffc080
-; GFX9-SDAG-NEXT:    v_mov_b32_e32 v4, 0x4100
-; GFX9-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX9-SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
-; GFX9-SDAG-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX9-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX9-SDAG-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX9-SDAG-NEXT:    v_bfe_u32 v1, v0, 16, 1
-; GFX9-SDAG-NEXT:    s_movk_i32 s4, 0x7fff
-; GFX9-SDAG-NEXT:    v_add3_u32 v1, v1, v0, s4
-; GFX9-SDAG-NEXT:    v_or_b32_e32 v2, 0x400000, v0
-; GFX9-SDAG-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
-; GFX9-SDAG-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
-; GFX9-SDAG-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-GISEL-LABEL: fmul_select_bf16_test7:
-; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0x4100
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v4, 0xc080
-; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v1, v4, v3, vcc
-; GFX9-GISEL-NEXT:    v_mul_f16_e32 v0, v0, v1
-; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-SDAG-LABEL: fmul_select_bf16_test7:
-; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-SDAG-NEXT:    v_mov_b32_e32 v3, 0x4100
-; GFX10-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX10-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX10-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0xffffc080, v3, vcc_lo
-; GFX10-SDAG-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX10-SDAG-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX10-SDAG-NEXT:    v_bfe_u32 v1, v0, 16, 1
-; GFX10-SDAG-NEXT:    v_or_b32_e32 v2, 0x400000, v0
-; GFX10-SDAG-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX10-SDAG-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
-; GFX10-SDAG-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
-; GFX10-SDAG-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX7-LABEL: fmul_select_bf16_test7:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; GFX7-NEXT:    v_mov_b32_e32 v3, 0x41000000
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, -4.0, v3, vcc
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-GISEL-LABEL: fmul_select_bf16_test7:
-; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v3, 0xc080
-; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v1, v3, 0x4100, vcc_lo
-; GFX10-GISEL-NEXT:    v_mul_f16_e32 v0, v0, v1
-; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-LABEL: fmul_select_bf16_test7:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v3, 0xffffc080
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0x4100
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
+; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX9-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
+; GFX9-NEXT:    v_add3_u32 v1, v1, v0, s4
+; GFX9-NEXT:    v_or_b32_e32 v2, 0x400000, v0
+; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-LABEL: fmul_select_bf16_test7:
-; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_dual_mov_b32 v3, 0x4100 :: v_dual_lshlrev_b32 v0, 16, v0
-; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0xffffc080, v3, vcc_lo
-; GFX11-SDAG-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX11-SDAG-NEXT:    v_bfe_u32 v1, v0, 16, 1
-; GFX11-SDAG-NEXT:    v_or_b32_e32 v2, 0x400000, v0
-; GFX11-SDAG-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
-; GFX11-SDAG-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX10-LABEL: fmul_select_bf16_test7:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_mov_b32_e32 v3, 0x4100
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, 0xffffc080, v3, vcc_lo
+; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX10-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX10-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX10-NEXT:    v_or_b32_e32 v2, 0x400000, v0
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX10-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-GISEL-LABEL: fmul_select_bf16_test7:
-; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_mov_b32_e32 v3, 0xc080
-; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v1, v3, 0x4100, vcc_lo
-; GFX11-GISEL-NEXT:    v_mul_f16_e32 v0, v0, v1
-; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-LABEL: fmul_select_bf16_test7:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_dual_mov_b32 v3, 0x4100 :: v_dual_lshlrev_b32 v0, 16, v0
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, 0xffffc080, v3, vcc_lo
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX11-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX11-NEXT:    v_or_b32_e32 v2, 0x400000, v0
+; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %bool = icmp eq i32 %bool.arg1, %bool.arg2
   %y = select i1 %bool, bfloat 8.000000e+00, bfloat -4.000000e+00
   %ldexp = fmul bfloat %x, %y
@@ -3431,111 +3134,73 @@ define bfloat @fmul_select_bf16_test7(bfloat %x, i32 %bool.arg1, i32 %bool.arg2)
 }
 
 define bfloat @fmul_select_bf16_test8(bfloat %x, i32 %bool.arg1, i32 %bool.arg2) {
-; GFX7-SDAG-LABEL: fmul_select_bf16_test8:
-; GFX7-SDAG:       ; %bb.0:
-; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX7-SDAG-NEXT:    v_mul_f32_e32 v0, 1.0, v0
-; GFX7-SDAG-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
-; GFX7-SDAG-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-SDAG-NEXT:    v_lshlrev_b32_e32 v1, 31, v1
-; GFX7-SDAG-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX7-SDAG-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX7-GISEL-LABEL: fmul_select_bf16_test8:
-; GFX7-GISEL:       ; %bb.0:
-; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-GISEL-NEXT:    v_mov_b32_e32 v3, 0x8000
-; GFX7-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX7-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v3, vcc
-; GFX7-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX7-GISEL-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; GFX7-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX7-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX7-GISEL-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-SDAG-LABEL: fmul_select_bf16_test8:
-; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX9-SDAG-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
-; GFX9-SDAG-NEXT:    v_mov_b32_e32 v2, 15
-; GFX9-SDAG-NEXT:    v_lshlrev_b16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX9-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX9-SDAG-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX9-SDAG-NEXT:    v_bfe_u32 v1, v0, 16, 1
-; GFX9-SDAG-NEXT:    s_movk_i32 s4, 0x7fff
-; GFX9-SDAG-NEXT:    v_add3_u32 v1, v1, v0, s4
-; GFX9-SDAG-NEXT:    v_or_b32_e32 v2, 0x400000, v0
-; GFX9-SDAG-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
-; GFX9-SDAG-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
-; GFX9-SDAG-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-GISEL-LABEL: fmul_select_bf16_test8:
-; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0x8000
-; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v3, vcc
-; GFX9-GISEL-NEXT:    v_mul_f16_e32 v0, v0, v1
-; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-SDAG-LABEL: fmul_select_bf16_test8:
-; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX10-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX10-SDAG-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
-; GFX10-SDAG-NEXT:    v_lshlrev_b16 v1, 15, v1
-; GFX10-SDAG-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX10-SDAG-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX10-SDAG-NEXT:    v_bfe_u32 v1, v0, 16, 1
-; GFX10-SDAG-NEXT:    v_or_b32_e32 v2, 0x400000, v0
-; GFX10-SDAG-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX10-SDAG-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
-; GFX10-SDAG-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
-; GFX10-SDAG-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-GISEL-LABEL: fmul_select_bf16_test8:
-; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 0x8000, vcc_lo
-; GFX10-GISEL-NEXT:    v_mul_f16_e32 v0, v0, v1
-; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX7-LABEL: fmul_select_bf16_test8:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; GFX7-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 31, v1
+; GFX7-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-LABEL: fmul_select_bf16_test8:
-; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX11-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX11-SDAG-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_lshlrev_b16 v1, 15, v1
-; GFX11-SDAG-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX11-SDAG-NEXT:    v_bfe_u32 v1, v0, 16, 1
-; GFX11-SDAG-NEXT:    v_or_b32_e32 v2, 0x400000, v0
-; GFX11-SDAG-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
-; GFX11-SDAG-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-LABEL: fmul_select_bf16_test8:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX9-NEXT:    v_mov_b32_e32 v2, 15
+; GFX9-NEXT:    v_lshlrev_b16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX9-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
+; GFX9-NEXT:    v_add3_u32 v1, v1, v0, s4
+; GFX9-NEXT:    v_or_b32_e32 v2, 0x400000, v0
+; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-GISEL-LABEL: fmul_select_bf16_test8:
-; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 0x8000, vcc_lo
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_mul_f16_e32 v0, v0, v1
-; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX10-LABEL: fmul_select_bf16_test8:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
+; GFX10-NEXT:    v_lshlrev_b16 v1, 15, v1
+; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX10-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX10-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX10-NEXT:    v_or_b32_e32 v2, 0x400000, v0
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX10-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: fmul_select_bf16_test8:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_lshlrev_b16 v1, 15, v1
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX11-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX11-NEXT:    v_or_b32_e32 v2, 0x400000, v0
+; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %bool = icmp eq i32 %bool.arg1, %bool.arg2
   %y = select i1 %bool, bfloat -0.000000e+00, bfloat 0.000000e+00
   %ldexp = fmul bfloat %x, %y
@@ -3543,121 +3208,74 @@ define bfloat @fmul_select_bf16_test8(bfloat %x, i32 %bool.arg1, i32 %bool.arg2)
 }
 
 define bfloat @fmul_select_bf16_test9(bfloat %x, i32 %bool.arg1, i32 %bool.arg2) {
-; GFX7-SDAG-LABEL: fmul_select_bf16_test9:
-; GFX7-SDAG:       ; %bb.0:
-; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-SDAG-NEXT:    v_mul_f32_e32 v0, 1.0, v0
-; GFX7-SDAG-NEXT:    v_mov_b32_e32 v3, 0xc2000000
-; GFX7-SDAG-NEXT:    v_mov_b32_e32 v4, 0xc1800000
-; GFX7-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX7-SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
-; GFX7-SDAG-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-SDAG-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX7-SDAG-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX7-GISEL-LABEL: fmul_select_bf16_test9:
-; GFX7-GISEL:       ; %bb.0:
-; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-GISEL-NEXT:    v_cvt_f32_f16_e64 v0, -v0
-; GFX7-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX7-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc
-; GFX7-GISEL-NEXT:    v_add_i32_e32 v1, vcc, 5, v1
-; GFX7-GISEL-NEXT:    v_ldexp_f32_e32 v0, v0, v1
-; GFX7-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX7-GISEL-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-SDAG-LABEL: fmul_select_bf16_test9:
-; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT:    v_mov_b32_e32 v3, 0xffffc200
-; GFX9-SDAG-NEXT:    v_mov_b32_e32 v4, 0xffffc180
-; GFX9-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX9-SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
-; GFX9-SDAG-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX9-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX9-SDAG-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX9-SDAG-NEXT:    v_bfe_u32 v1, v0, 16, 1
-; GFX9-SDAG-NEXT:    s_movk_i32 s4, 0x7fff
-; GFX9-SDAG-NEXT:    v_add3_u32 v1, v1, v0, s4
-; GFX9-SDAG-NEXT:    v_or_b32_e32 v2, 0x400000, v0
-; GFX9-SDAG-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
-; GFX9-SDAG-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
-; GFX9-SDAG-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-GISEL-LABEL: fmul_select_bf16_test9:
-; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc
-; GFX9-GISEL-NEXT:    v_add_u32_e32 v1, 5, v1
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0xffff8000
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0x7fff
-; GFX9-GISEL-NEXT:    v_med3_i32 v1, v1, v2, v3
-; GFX9-GISEL-NEXT:    v_ldexp_f16_e64 v0, -v0, v1
-; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-SDAG-LABEL: fmul_select_bf16_test9:
-; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-SDAG-NEXT:    v_mov_b32_e32 v3, 0xffffc180
-; GFX10-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX10-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX10-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0xffffc200, v3, vcc_lo
-; GFX10-SDAG-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX10-SDAG-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX10-SDAG-NEXT:    v_bfe_u32 v1, v0, 16, 1
-; GFX10-SDAG-NEXT:    v_or_b32_e32 v2, 0x400000, v0
-; GFX10-SDAG-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX10-SDAG-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
-; GFX10-SDAG-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
-; GFX10-SDAG-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX7-LABEL: fmul_select_bf16_test9:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; GFX7-NEXT:    v_mov_b32_e32 v3, 0xc2000000
+; GFX7-NEXT:    v_mov_b32_e32 v4, 0xc1800000
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-GISEL-LABEL: fmul_select_bf16_test9:
-; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, 0x7fff
-; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc_lo
-; GFX10-GISEL-NEXT:    v_add_nc_u32_e32 v1, 5, v1
-; GFX10-GISEL-NEXT:    v_med3_i32 v1, 0xffff8000, v1, v2
-; GFX10-GISEL-NEXT:    v_ldexp_f16_e64 v0, -v0, v1
-; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-LABEL: fmul_select_bf16_test9:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v3, 0xffffc200
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0xffffc180
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
+; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX9-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
+; GFX9-NEXT:    v_add3_u32 v1, v1, v0, s4
+; GFX9-NEXT:    v_or_b32_e32 v2, 0x400000, v0
+; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-LABEL: fmul_select_bf16_test9:
-; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_dual_mov_b32 v3, 0xffffc180 :: v_dual_lshlrev_b32 v0, 16, v0
-; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0xffffc200, v3, vcc_lo
-; GFX11-SDAG-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX11-SDAG-NEXT:    v_bfe_u32 v1, v0, 16, 1
-; GFX11-SDAG-NEXT:    v_or_b32_e32 v2, 0x400000, v0
-; GFX11-SDAG-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
-; GFX11-SDAG-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX10-LABEL: fmul_select_bf16_test9:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_mov_b32_e32 v3, 0xffffc180
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, 0xffffc200, v3, vcc_lo
+; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX10-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX10-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX10-NEXT:    v_or_b32_e32 v2, 0x400000, v0
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX10-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-GISEL-LABEL: fmul_select_bf16_test9:
-; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX11-GISEL-NEXT:    v_mov_b32_e32 v2, 0x7fff
-; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc_lo
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_add_nc_u32_e32 v1, 5, v1
-; GFX11-GISEL-NEXT:    v_med3_i32 v1, 0xffff8000, v1, v2
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_ldexp_f16_e64 v0, -v0, v1
-; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-LABEL: fmul_select_bf16_test9:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_dual_mov_b32 v3, 0xffffc180 :: v_dual_lshlrev_b32 v0, 16, v0
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, 0xffffc200, v3, vcc_lo
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX11-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX11-NEXT:    v_or_b32_e32 v2, 0x400000, v0
+; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %bool = icmp eq i32 %bool.arg1, %bool.arg2
   %y = select i1 %bool, bfloat -1.600000e+01, bfloat -3.200000e+01
   %ldexp = fmul bfloat %x, %y
@@ -3665,111 +3283,74 @@ define bfloat @fmul_select_bf16_test9(bfloat %x, i32 %bool.arg1, i32 %bool.arg2)
 }
 
 define bfloat @fmul_select_bf16_test10_sel_log2val_pos65_pos56(bfloat %x, i32 %bool.arg1, i32 %bool.arg2) {
-; GFX7-SDAG-LABEL: fmul_select_bf16_test10_sel_log2val_pos65_pos56:
-; GFX7-SDAG:       ; %bb.0:
-; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-SDAG-NEXT:    v_mul_f32_e32 v0, 1.0, v0
-; GFX7-SDAG-NEXT:    v_mov_b32_e32 v3, 0xdb800000
-; GFX7-SDAG-NEXT:    v_bfrev_b32_e32 v4, 7
-; GFX7-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX7-SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
-; GFX7-SDAG-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-SDAG-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX7-SDAG-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX7-GISEL-LABEL: fmul_select_bf16_test10_sel_log2val_pos65_pos56:
-; GFX7-GISEL:       ; %bb.0:
-; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-GISEL-NEXT:    v_cvt_f32_f16_e64 v0, -v0
-; GFX7-GISEL-NEXT:    v_mov_b32_e32 v3, 0x41
-; GFX7-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX7-GISEL-NEXT:    v_cndmask_b32_e32 v1, 56, v3, vcc
-; GFX7-GISEL-NEXT:    v_ldexp_f32_e32 v0, v0, v1
-; GFX7-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX7-GISEL-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-SDAG-LABEL: fmul_select_bf16_test10_sel_log2val_pos65_pos56:
-; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT:    v_mov_b32_e32 v3, 0xffffdb80
-; GFX9-SDAG-NEXT:    v_mov_b32_e32 v4, 0xffffe000
-; GFX9-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX9-SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
-; GFX9-SDAG-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX9-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX9-SDAG-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX9-SDAG-NEXT:    v_bfe_u32 v1, v0, 16, 1
-; GFX9-SDAG-NEXT:    s_movk_i32 s4, 0x7fff
-; GFX9-SDAG-NEXT:    v_add3_u32 v1, v1, v0, s4
-; GFX9-SDAG-NEXT:    v_or_b32_e32 v2, 0x400000, v0
-; GFX9-SDAG-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
-; GFX9-SDAG-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
-; GFX9-SDAG-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-GISEL-LABEL: fmul_select_bf16_test10_sel_log2val_pos65_pos56:
-; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0x41
-; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v1, 56, v3, vcc
-; GFX9-GISEL-NEXT:    v_ldexp_f16_e64 v0, -v0, v1
-; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-SDAG-LABEL: fmul_select_bf16_test10_sel_log2val_pos65_pos56:
-; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-SDAG-NEXT:    v_mov_b32_e32 v3, 0xffffe000
-; GFX10-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX10-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX10-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0xffffdb80, v3, vcc_lo
-; GFX10-SDAG-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX10-SDAG-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX10-SDAG-NEXT:    v_bfe_u32 v1, v0, 16, 1
-; GFX10-SDAG-NEXT:    v_or_b32_e32 v2, 0x400000, v0
-; GFX10-SDAG-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX10-SDAG-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
-; GFX10-SDAG-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
-; GFX10-SDAG-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX7-LABEL: fmul_select_bf16_test10_sel_log2val_pos65_pos56:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; GFX7-NEXT:    v_mov_b32_e32 v3, 0xdb800000
+; GFX7-NEXT:    v_bfrev_b32_e32 v4, 7
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-GISEL-LABEL: fmul_select_bf16_test10_sel_log2val_pos65_pos56:
-; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v1, 56, 0x41, vcc_lo
-; GFX10-GISEL-NEXT:    v_ldexp_f16_e64 v0, -v0, v1
-; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-LABEL: fmul_select_bf16_test10_sel_log2val_pos65_pos56:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v3, 0xffffdb80
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0xffffe000
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
+; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX9-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
+; GFX9-NEXT:    v_add3_u32 v1, v1, v0, s4
+; GFX9-NEXT:    v_or_b32_e32 v2, 0x400000, v0
+; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-LABEL: fmul_select_bf16_test10_sel_log2val_pos65_pos56:
-; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_dual_mov_b32 v3, 0xffffe000 :: v_dual_lshlrev_b32 v0, 16, v0
-; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0xffffdb80, v3, vcc_lo
-; GFX11-SDAG-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX11-SDAG-NEXT:    v_bfe_u32 v1, v0, 16, 1
-; GFX11-SDAG-NEXT:    v_or_b32_e32 v2, 0x400000, v0
-; GFX11-SDAG-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
-; GFX11-SDAG-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX10-LABEL: fmul_select_bf16_test10_sel_log2val_pos65_pos56:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_mov_b32_e32 v3, 0xffffe000
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, 0xffffdb80, v3, vcc_lo
+; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX10-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX10-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX10-NEXT:    v_or_b32_e32 v2, 0x400000, v0
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX10-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-GISEL-LABEL: fmul_select_bf16_test10_sel_log2val_pos65_pos56:
-; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v1, 56, 0x41, vcc_lo
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_ldexp_f16_e64 v0, -v0, v1
-; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-LABEL: fmul_select_bf16_test10_sel_log2val_pos65_pos56:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_dual_mov_b32 v3, 0xffffe000 :: v_dual_lshlrev_b32 v0, 16, v0
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, 0xffffdb80, v3, vcc_lo
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX11-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX11-NEXT:    v_or_b32_e32 v2, 0x400000, v0
+; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %bool = icmp eq i32 %bool.arg1, %bool.arg2
   %y = select i1 %bool, bfloat 0xRE000, bfloat 0xRDB80
   %ldexp = fmul bfloat %x, %y
@@ -3777,111 +3358,74 @@ define bfloat @fmul_select_bf16_test10_sel_log2val_pos65_pos56(bfloat %x, i32 %b
 }
 
 define bfloat @fmul_select_bf16_test11_sel_log2val_neg22_pos25(bfloat %x, i32 %bool.arg1, i32 %bool.arg2) {
-; GFX7-SDAG-LABEL: fmul_select_bf16_test11_sel_log2val_neg22_pos25:
-; GFX7-SDAG:       ; %bb.0:
-; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-SDAG-NEXT:    v_mul_f32_e32 v0, 1.0, v0
-; GFX7-SDAG-NEXT:    v_bfrev_b32_e32 v3, 50
-; GFX7-SDAG-NEXT:    v_mov_b32_e32 v4, 0x34800000
-; GFX7-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX7-SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
-; GFX7-SDAG-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-SDAG-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX7-SDAG-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX7-GISEL-LABEL: fmul_select_bf16_test11_sel_log2val_neg22_pos25:
-; GFX7-GISEL:       ; %bb.0:
-; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX7-GISEL-NEXT:    v_not_b32_e32 v3, 21
-; GFX7-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX7-GISEL-NEXT:    v_cndmask_b32_e32 v1, 25, v3, vcc
-; GFX7-GISEL-NEXT:    v_ldexp_f32_e32 v0, v0, v1
-; GFX7-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX7-GISEL-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-SDAG-LABEL: fmul_select_bf16_test11_sel_log2val_neg22_pos25:
-; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT:    v_mov_b32_e32 v3, 0x4c00
-; GFX9-SDAG-NEXT:    v_mov_b32_e32 v4, 0x3480
-; GFX9-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX9-SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
-; GFX9-SDAG-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX9-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX9-SDAG-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX9-SDAG-NEXT:    v_bfe_u32 v1, v0, 16, 1
-; GFX9-SDAG-NEXT:    s_movk_i32 s4, 0x7fff
-; GFX9-SDAG-NEXT:    v_add3_u32 v1, v1, v0, s4
-; GFX9-SDAG-NEXT:    v_or_b32_e32 v2, 0x400000, v0
-; GFX9-SDAG-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
-; GFX9-SDAG-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
-; GFX9-SDAG-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-GISEL-LABEL: fmul_select_bf16_test11_sel_log2val_neg22_pos25:
-; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_not_b32_e32 v3, 21
-; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v1, 25, v3, vcc
-; GFX9-GISEL-NEXT:    v_ldexp_f16_e32 v0, v0, v1
-; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-SDAG-LABEL: fmul_select_bf16_test11_sel_log2val_neg22_pos25:
-; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-SDAG-NEXT:    v_mov_b32_e32 v3, 0x3480
-; GFX10-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX10-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX10-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0x4c00, v3, vcc_lo
-; GFX10-SDAG-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX10-SDAG-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX10-SDAG-NEXT:    v_bfe_u32 v1, v0, 16, 1
-; GFX10-SDAG-NEXT:    v_or_b32_e32 v2, 0x400000, v0
-; GFX10-SDAG-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX10-SDAG-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
-; GFX10-SDAG-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
-; GFX10-SDAG-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX7-LABEL: fmul_select_bf16_test11_sel_log2val_neg22_pos25:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; GFX7-NEXT:    v_bfrev_b32_e32 v3, 50
+; GFX7-NEXT:    v_mov_b32_e32 v4, 0x34800000
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-GISEL-LABEL: fmul_select_bf16_test11_sel_log2val_neg22_pos25:
-; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v1, 25, 0xffffffea, vcc_lo
-; GFX10-GISEL-NEXT:    v_ldexp_f16_e32 v0, v0, v1
-; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-LABEL: fmul_select_bf16_test11_sel_log2val_neg22_pos25:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v3, 0x4c00
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0x3480
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
+; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX9-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
+; GFX9-NEXT:    v_add3_u32 v1, v1, v0, s4
+; GFX9-NEXT:    v_or_b32_e32 v2, 0x400000, v0
+; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-LABEL: fmul_select_bf16_test11_sel_log2val_neg22_pos25:
-; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_dual_mov_b32 v3, 0x3480 :: v_dual_lshlrev_b32 v0, 16, v0
-; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0x4c00, v3, vcc_lo
-; GFX11-SDAG-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX11-SDAG-NEXT:    v_bfe_u32 v1, v0, 16, 1
-; GFX11-SDAG-NEXT:    v_or_b32_e32 v2, 0x400000, v0
-; GFX11-SDAG-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
-; GFX11-SDAG-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX10-LABEL: fmul_select_bf16_test11_sel_log2val_neg22_pos25:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_mov_b32_e32 v3, 0x3480
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, 0x4c00, v3, vcc_lo
+; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX10-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX10-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX10-NEXT:    v_or_b32_e32 v2, 0x400000, v0
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX10-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-GISEL-LABEL: fmul_select_bf16_test11_sel_log2val_neg22_pos25:
-; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v1, 25, 0xffffffea, vcc_lo
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_ldexp_f16_e32 v0, v0, v1
-; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-LABEL: fmul_select_bf16_test11_sel_log2val_neg22_pos25:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_dual_mov_b32 v3, 0x3480 :: v_dual_lshlrev_b32 v0, 16, v0
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, 0x4c00, v3, vcc_lo
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX11-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX11-NEXT:    v_or_b32_e32 v2, 0x400000, v0
+; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %bool = icmp eq i32 %bool.arg1, %bool.arg2
   %y = select i1 %bool, bfloat 0xR3480, bfloat 0xR4C00
   %ldexp = fmul bfloat %x, %y
diff --git a/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx1200.ll b/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx1200.ll
index a0578756433ff8..62f16fe2760ef2 100644
--- a/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx1200.ll
+++ b/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx1200.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1200 -global-isel=0 | FileCheck %s -check-prefix=GFX12-SDAG
-; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1200 -global-isel=1 | FileCheck %s -check-prefix=GFX12-GISEL
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1200 -global-isel=1 -global-isel-abort=2 | FileCheck %s -check-prefix=GFX12-GISEL
 
 declare <2 x half> @llvm.amdgcn.struct.buffer.atomic.fadd.v2f16(<2 x half>, <4 x i32>, i32, i32, i32, i32 immarg)
 declare <2 x bfloat> @llvm.amdgcn.struct.buffer.atomic.fadd.v2bf16(<2 x bfloat>, <4 x i32>, i32, i32, i32, i32 immarg)
@@ -15,7 +15,7 @@ define amdgpu_ps void @raw_buffer_atomic_add_v2f16_noret_offset(<2 x half> %val,
 ;
 ; GFX12-GISEL-LABEL: raw_buffer_atomic_add_v2f16_noret_offset:
 ; GFX12-GISEL:       ; %bb.0:
-; GFX12-GISEL-NEXT:    buffer_atomic_pk_add_bf16 v0, off, s[0:3], s4 offset:92
+; GFX12-GISEL-NEXT:    buffer_atomic_pk_add_f16 v0, off, s[0:3], s4 offset:92
 ; GFX12-GISEL-NEXT:    s_endpgm
   %ret = call <2 x half> @llvm.amdgcn.raw.buffer.atomic.fadd.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 92, i32 %soffset, i32 0)
   ret void
@@ -29,7 +29,7 @@ define amdgpu_ps void @raw_buffer_atomic_add_v2f16_noret(<2 x half> %val, <4 x i
 ;
 ; GFX12-GISEL-LABEL: raw_buffer_atomic_add_v2f16_noret:
 ; GFX12-GISEL:       ; %bb.0:
-; GFX12-GISEL-NEXT:    buffer_atomic_pk_add_bf16 v0, v1, s[0:3], s4 offen
+; GFX12-GISEL-NEXT:    buffer_atomic_pk_add_f16 v0, v1, s[0:3], s4 offen
 ; GFX12-GISEL-NEXT:    s_endpgm
   %ret = call <2 x half> @llvm.amdgcn.raw.buffer.atomic.fadd.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
   ret void
@@ -44,7 +44,7 @@ define amdgpu_ps <2 x half> @raw_buffer_atomic_add_v2f16_ret_offset(<2 x half> %
 ;
 ; GFX12-GISEL-LABEL: raw_buffer_atomic_add_v2f16_ret_offset:
 ; GFX12-GISEL:       ; %bb.0:
-; GFX12-GISEL-NEXT:    buffer_atomic_pk_add_bf16 v0, off, s[0:3], s4 offset:92 th:TH_ATOMIC_RETURN
+; GFX12-GISEL-NEXT:    buffer_atomic_pk_add_f16 v0, off, s[0:3], s4 offset:92 th:TH_ATOMIC_RETURN
 ; GFX12-GISEL-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-GISEL-NEXT:    ; return to shader part epilog
   %ret = call <2 x half> @llvm.amdgcn.raw.buffer.atomic.fadd.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 92, i32 %soffset, i32 0)
@@ -60,7 +60,7 @@ define amdgpu_ps <2 x half> @raw_buffer_atomic_add_v2f16_ret(<2 x half> %val, <4
 ;
 ; GFX12-GISEL-LABEL: raw_buffer_atomic_add_v2f16_ret:
 ; GFX12-GISEL:       ; %bb.0:
-; GFX12-GISEL-NEXT:    buffer_atomic_pk_add_bf16 v0, v1, s[0:3], s4 offen th:TH_ATOMIC_RETURN
+; GFX12-GISEL-NEXT:    buffer_atomic_pk_add_f16 v0, v1, s[0:3], s4 offen th:TH_ATOMIC_RETURN
 ; GFX12-GISEL-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-GISEL-NEXT:    ; return to shader part epilog
   %ret = call <2 x half> @llvm.amdgcn.raw.buffer.atomic.fadd.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
@@ -76,7 +76,7 @@ define amdgpu_ps float @struct_buffer_atomic_add_v2f16_ret(<2 x half> %val, <4 x
 ;
 ; GFX12-GISEL-LABEL: struct_buffer_atomic_add_v2f16_ret:
 ; GFX12-GISEL:       ; %bb.0:
-; GFX12-GISEL-NEXT:    buffer_atomic_pk_add_bf16 v0, v[1:2], s[0:3], s4 idxen offen th:TH_ATOMIC_RETURN
+; GFX12-GISEL-NEXT:    buffer_atomic_pk_add_f16 v0, v[1:2], s[0:3], s4 idxen offen th:TH_ATOMIC_RETURN
 ; GFX12-GISEL-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-GISEL-NEXT:    ; return to shader part epilog
   %orig = call <2 x half> @llvm.amdgcn.struct.buffer.atomic.fadd.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
@@ -92,7 +92,7 @@ define amdgpu_ps void @struct_buffer_atomic_add_v2f16_noret(<2 x half> %val, <4
 ;
 ; GFX12-GISEL-LABEL: struct_buffer_atomic_add_v2f16_noret:
 ; GFX12-GISEL:       ; %bb.0:
-; GFX12-GISEL-NEXT:    buffer_atomic_pk_add_bf16 v0, v[1:2], s[0:3], s4 idxen offen
+; GFX12-GISEL-NEXT:    buffer_atomic_pk_add_f16 v0, v[1:2], s[0:3], s4 idxen offen
 ; GFX12-GISEL-NEXT:    s_endpgm
   %orig = call <2 x half> @llvm.amdgcn.struct.buffer.atomic.fadd.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
   ret void
diff --git a/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx950.ll b/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx950.ll
index d8ea0ddf77b7a1..5d9944add13a37 100644
--- a/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx950.ll
+++ b/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx950.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
 ; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx950 -global-isel=0 | FileCheck %s -check-prefix=GFX950-SDAG
-; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx950 -global-isel=1 | FileCheck %s -check-prefix=GFX950-GISEL
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx950 -global-isel=1 -global-isel-abort=2 | FileCheck %s -check-prefix=GFX950-GISEL
 
 declare <2 x bfloat> @llvm.amdgcn.struct.buffer.atomic.fadd.v2bf16(<2 x bfloat>, <4 x i32>, i32, i32, i32, i32 immarg)
 declare <2 x bfloat> @llvm.amdgcn.raw.buffer.atomic.fadd.v2bf16(<2 x bfloat> %val, <4 x i32> %rsrc, i32, i32, i32)
@@ -20,9 +20,9 @@ define amdgpu_ps float @struct_buffer_atomic_add_v2bf16_ret(<2 x bfloat> %val, <
 ;
 ; GFX950-GISEL-LABEL: struct_buffer_atomic_add_v2bf16_ret:
 ; GFX950-GISEL:       ; %bb.0:
-; GFX950-GISEL-NEXT:    v_mov_b32_e32 v4, v1
-; GFX950-GISEL-NEXT:    v_mov_b32_e32 v5, v2
-; GFX950-GISEL-NEXT:    buffer_atomic_pk_add_bf16 v0, v[4:5], s[0:3], s4 idxen offen sc0
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v3, v2
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v2, v1
+; GFX950-GISEL-NEXT:    buffer_atomic_pk_add_bf16 v0, v[2:3], s[0:3], s4 idxen offen sc0
 ; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[2:3], 0
 ; GFX950-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX950-GISEL-NEXT:    flat_store_dword v[2:3], v0
@@ -44,9 +44,9 @@ define amdgpu_ps void @struct_buffer_atomic_add_v2bf16_noret(<2 x bfloat> %val,
 ;
 ; GFX950-GISEL-LABEL: struct_buffer_atomic_add_v2bf16_noret:
 ; GFX950-GISEL:       ; %bb.0:
-; GFX950-GISEL-NEXT:    v_mov_b32_e32 v4, v1
-; GFX950-GISEL-NEXT:    v_mov_b32_e32 v5, v2
-; GFX950-GISEL-NEXT:    buffer_atomic_pk_add_bf16 v0, v[4:5], s[0:3], s4 idxen offen
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v3, v2
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v2, v1
+; GFX950-GISEL-NEXT:    buffer_atomic_pk_add_bf16 v0, v[2:3], s[0:3], s4 idxen offen
 ; GFX950-GISEL-NEXT:    s_endpgm
   %orig = call <2 x bfloat> @llvm.amdgcn.struct.buffer.atomic.fadd.v2bf16(<2 x bfloat> %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
   ret void
diff --git a/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll b/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll
index 3a7f3e41002d28..8ce50638fa58b9 100644
--- a/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll
+++ b/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
 ; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,SDAG %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GISEL %s
+; RUN: llc -global-isel=1 -global-isel-abort=2 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GISEL %s
 
 define i128 @fptosi_f64_to_i128(double %x) {
 ; SDAG-LABEL: fptosi_f64_to_i128:
@@ -1477,699 +1477,239 @@ define i128 @fptoui_f16_to_i128(half %x) {
 }
 
 define i128 @fptosi_bf16_to_i128(bfloat %x) {
-; SDAG-LABEL: fptosi_bf16_to_i128:
-; SDAG:       ; %bb.0: ; %fp-to-i-entry
-; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_mov_b32_e32 v4, v0
-; SDAG-NEXT:    v_bfe_u32 v5, v4, 7, 8
-; SDAG-NEXT:    s_movk_i32 s4, 0x7e
-; SDAG-NEXT:    v_mov_b32_e32 v0, 0
-; SDAG-NEXT:    v_mov_b32_e32 v2, 0
-; SDAG-NEXT:    v_mov_b32_e32 v6, 0
-; SDAG-NEXT:    v_mov_b32_e32 v1, 0
-; SDAG-NEXT:    v_mov_b32_e32 v3, 0
-; SDAG-NEXT:    v_cmp_lt_u32_e32 vcc, s4, v5
-; SDAG-NEXT:    s_and_saveexec_b64 s[8:9], vcc
-; SDAG-NEXT:    s_cbranch_execz .LBB6_10
-; SDAG-NEXT:  ; %bb.1: ; %fp-to-i-if-end
-; SDAG-NEXT:    v_add_co_u32_e32 v0, vcc, 0xffffff01, v5
-; SDAG-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v6, vcc
-; SDAG-NEXT:    v_addc_co_u32_e32 v2, vcc, -1, v6, vcc
-; SDAG-NEXT:    s_movk_i32 s6, 0xff7f
-; SDAG-NEXT:    v_addc_co_u32_e32 v3, vcc, -1, v6, vcc
-; SDAG-NEXT:    s_mov_b32 s7, -1
-; SDAG-NEXT:    v_cmp_eq_u64_e64 s[4:5], -1, v[2:3]
-; SDAG-NEXT:    v_cmp_lt_u64_e64 s[6:7], s[6:7], v[0:1]
-; SDAG-NEXT:    v_cmp_lt_i16_e32 vcc, -1, v4
-; SDAG-NEXT:    s_and_b64 s[4:5], s[4:5], s[6:7]
-; SDAG-NEXT:    ; implicit-def: $vgpr0_vgpr1
-; SDAG-NEXT:    ; implicit-def: $vgpr2_vgpr3
-; SDAG-NEXT:    s_and_saveexec_b64 s[6:7], s[4:5]
-; SDAG-NEXT:    s_xor_b64 s[10:11], exec, s[6:7]
-; SDAG-NEXT:    s_cbranch_execz .LBB6_7
-; SDAG-NEXT:  ; %bb.2: ; %fp-to-i-if-end9
-; SDAG-NEXT:    s_movk_i32 s4, 0x7f
-; SDAG-NEXT:    v_and_b32_sdwa v0, v4, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; SDAG-NEXT:    s_mov_b64 s[4:5], 0x85
-; SDAG-NEXT:    v_cmp_lt_u64_e64 s[4:5], s[4:5], v[5:6]
-; SDAG-NEXT:    v_mov_b32_e32 v7, 0
-; SDAG-NEXT:    v_cndmask_b32_e64 v9, -1, 0, vcc
-; SDAG-NEXT:    v_cndmask_b32_e64 v8, -1, 1, vcc
-; SDAG-NEXT:    v_or_b32_e32 v6, 0x80, v0
-; SDAG-NEXT:    ; implicit-def: $vgpr0_vgpr1
-; SDAG-NEXT:    ; implicit-def: $vgpr2_vgpr3
-; SDAG-NEXT:    s_and_saveexec_b64 s[6:7], s[4:5]
-; SDAG-NEXT:    s_xor_b64 s[12:13], exec, s[6:7]
-; SDAG-NEXT:    s_cbranch_execz .LBB6_4
-; SDAG-NEXT:  ; %bb.3: ; %fp-to-i-if-else
-; SDAG-NEXT:    v_sub_u32_e32 v0, 0xc6, v5
-; SDAG-NEXT:    v_add_u32_e32 v2, 0xffffff3a, v5
-; SDAG-NEXT:    v_add_u32_e32 v4, 0xffffff7a, v5
-; SDAG-NEXT:    v_lshrrev_b64 v[0:1], v0, v[6:7]
-; SDAG-NEXT:    v_lshlrev_b64 v[2:3], v2, v[6:7]
-; SDAG-NEXT:    v_cmp_gt_u32_e64 s[4:5], 64, v4
-; SDAG-NEXT:    v_cndmask_b32_e64 v1, v3, v1, s[4:5]
-; SDAG-NEXT:    v_cmp_ne_u32_e64 s[6:7], 0, v4
-; SDAG-NEXT:    v_cndmask_b32_e64 v3, 0, v1, s[6:7]
-; SDAG-NEXT:    v_cndmask_b32_e64 v2, v2, v0, s[4:5]
-; SDAG-NEXT:    v_lshlrev_b64 v[0:1], v4, v[6:7]
-; SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, v2, s[6:7]
-; SDAG-NEXT:    v_cndmask_b32_e64 v12, 0, v0, s[4:5]
-; SDAG-NEXT:    v_cndmask_b32_e64 v11, 0, v1, s[4:5]
-; SDAG-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v12, v8, 0
-; SDAG-NEXT:    v_mul_lo_u32 v13, v9, v2
-; SDAG-NEXT:    v_mul_lo_u32 v14, v8, v3
-; SDAG-NEXT:    v_mov_b32_e32 v6, v1
-; SDAG-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v11, v8, v[6:7]
-; SDAG-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v8, v2, 0
-; SDAG-NEXT:    v_mov_b32_e32 v8, v5
-; SDAG-NEXT:    v_mov_b32_e32 v5, v7
-; SDAG-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v12, v9, v[4:5]
-; SDAG-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
-; SDAG-NEXT:    v_add_co_u32_e64 v6, s[4:5], -1, v10
-; SDAG-NEXT:    v_add3_u32 v3, v3, v14, v13
-; SDAG-NEXT:    v_mad_u64_u32 v[1:2], s[4:5], v6, v12, v[2:3]
-; SDAG-NEXT:    v_add_co_u32_e64 v5, s[4:5], v8, v5
-; SDAG-NEXT:    v_mul_lo_u32 v3, v6, v11
-; SDAG-NEXT:    v_mul_lo_u32 v7, v6, v12
-; SDAG-NEXT:    v_addc_co_u32_e64 v6, s[4:5], 0, 0, s[4:5]
-; SDAG-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], v11, v9, v[5:6]
-; SDAG-NEXT:    v_add3_u32 v3, v7, v2, v3
-; SDAG-NEXT:    ; implicit-def: $vgpr8
-; SDAG-NEXT:    v_add_co_u32_e64 v2, s[4:5], v5, v1
-; SDAG-NEXT:    v_addc_co_u32_e64 v3, s[4:5], v6, v3, s[4:5]
-; SDAG-NEXT:    ; implicit-def: $vgpr5_vgpr6
-; SDAG-NEXT:    v_mov_b32_e32 v1, v4
-; SDAG-NEXT:    ; implicit-def: $vgpr6_vgpr7
-; SDAG-NEXT:  .LBB6_4: ; %Flow
-; SDAG-NEXT:    s_andn2_saveexec_b64 s[6:7], s[12:13]
-; SDAG-NEXT:    s_cbranch_execz .LBB6_6
-; SDAG-NEXT:  ; %bb.5: ; %fp-to-i-if-then12
-; SDAG-NEXT:    v_sub_u32_e32 v2, 0x86, v5
-; SDAG-NEXT:    v_lshrrev_b64 v[0:1], v2, v[6:7]
-; SDAG-NEXT:    v_cmp_gt_u32_e64 s[4:5], 64, v2
-; SDAG-NEXT:    v_cndmask_b32_e64 v0, 0, v0, s[4:5]
-; SDAG-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v2
-; SDAG-NEXT:    v_cndmask_b32_e64 v0, v0, v6, s[4:5]
-; SDAG-NEXT:    v_mul_hi_i32_i24_e32 v1, v0, v8
-; SDAG-NEXT:    v_ashrrev_i32_e32 v2, 31, v1
-; SDAG-NEXT:    v_mul_i32_i24_e32 v0, v0, v8
-; SDAG-NEXT:    v_mov_b32_e32 v3, v2
-; SDAG-NEXT:  .LBB6_6: ; %Flow1
-; SDAG-NEXT:    s_or_b64 exec, exec, s[6:7]
-; SDAG-NEXT:  .LBB6_7: ; %Flow2
-; SDAG-NEXT:    s_andn2_saveexec_b64 s[4:5], s[10:11]
-; SDAG-NEXT:  ; %bb.8: ; %fp-to-i-if-then5
-; SDAG-NEXT:    v_bfrev_b32_e32 v0, 1
-; SDAG-NEXT:    v_bfrev_b32_e32 v1, -2
-; SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, -1, vcc
-; SDAG-NEXT:    v_cndmask_b32_e32 v3, v0, v1, vcc
-; SDAG-NEXT:    v_mov_b32_e32 v0, v2
-; SDAG-NEXT:    v_mov_b32_e32 v1, v2
-; SDAG-NEXT:  ; %bb.9: ; %Flow3
-; SDAG-NEXT:    s_or_b64 exec, exec, s[4:5]
-; SDAG-NEXT:  .LBB6_10: ; %fp-to-i-cleanup
-; SDAG-NEXT:    s_or_b64 exec, exec, s[8:9]
-; SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: fptosi_bf16_to_i128:
-; GISEL:       ; %bb.0: ; %fp-to-i-entry
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    v_mov_b32_e32 v4, v0
-; GISEL-NEXT:    v_and_b32_e32 v5, 0xffff, v4
-; GISEL-NEXT:    v_mov_b32_e32 v6, 0
-; GISEL-NEXT:    v_lshrrev_b64 v[0:1], 7, v[5:6]
-; GISEL-NEXT:    v_mov_b32_e32 v1, 0x7f
-; GISEL-NEXT:    s_mov_b64 s[4:5], 0
-; GISEL-NEXT:    v_mov_b32_e32 v2, 0
-; GISEL-NEXT:    v_bfe_u32 v5, v0, 0, 8
-; GISEL-NEXT:    v_cmp_ge_u64_e32 vcc, v[5:6], v[1:2]
-; GISEL-NEXT:    s_mov_b64 s[6:7], s[4:5]
-; GISEL-NEXT:    v_mov_b32_e32 v0, s4
-; GISEL-NEXT:    v_mov_b32_e32 v1, s5
-; GISEL-NEXT:    v_mov_b32_e32 v2, s6
-; GISEL-NEXT:    v_mov_b32_e32 v3, s7
-; GISEL-NEXT:    s_and_saveexec_b64 s[12:13], vcc
-; GISEL-NEXT:    s_cbranch_execz .LBB6_10
-; GISEL-NEXT:  ; %bb.1: ; %fp-to-i-if-end
-; GISEL-NEXT:    v_add_co_u32_e32 v0, vcc, 0xffffff01, v5
-; GISEL-NEXT:    v_mov_b32_e32 v2, 0xffffff80
-; GISEL-NEXT:    v_addc_co_u32_e64 v1, s[6:7], 0, -1, vcc
-; GISEL-NEXT:    v_mov_b32_e32 v3, -1
-; GISEL-NEXT:    v_addc_co_u32_e64 v7, s[6:7], 0, -1, s[6:7]
-; GISEL-NEXT:    v_cmp_ge_u64_e32 vcc, v[0:1], v[2:3]
-; GISEL-NEXT:    v_addc_co_u32_e64 v8, s[6:7], 0, -1, s[6:7]
-; GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
-; GISEL-NEXT:    v_cmp_le_u64_e32 vcc, -1, v[7:8]
-; GISEL-NEXT:    v_cmp_lt_i16_e64 s[4:5], -1, v4
-; GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
-; GISEL-NEXT:    v_cmp_eq_u64_e32 vcc, -1, v[7:8]
-; GISEL-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
-; GISEL-NEXT:    v_and_b32_e32 v0, 1, v0
-; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
-; GISEL-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
-; GISEL-NEXT:    s_and_saveexec_b64 s[6:7], vcc
-; GISEL-NEXT:    s_xor_b64 s[14:15], exec, s[6:7]
-; GISEL-NEXT:    s_cbranch_execz .LBB6_7
-; GISEL-NEXT:  ; %bb.2: ; %fp-to-i-if-end9
-; GISEL-NEXT:    s_xor_b64 s[6:7], s[4:5], -1
-; GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, -1, s[6:7]
-; GISEL-NEXT:    v_and_b32_e32 v0, 1, v0
-; GISEL-NEXT:    v_lshlrev_b16_e32 v2, 1, v0
-; GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[6:7]
-; GISEL-NEXT:    v_lshlrev_b16_e32 v3, 2, v0
-; GISEL-NEXT:    v_lshlrev_b16_e32 v7, 3, v0
-; GISEL-NEXT:    v_lshlrev_b16_e32 v8, 4, v0
-; GISEL-NEXT:    v_lshlrev_b16_e32 v9, 5, v0
-; GISEL-NEXT:    v_lshlrev_b16_e32 v10, 6, v0
-; GISEL-NEXT:    v_lshlrev_b16_e32 v11, 7, v0
-; GISEL-NEXT:    v_lshlrev_b16_e32 v12, 8, v0
-; GISEL-NEXT:    v_lshlrev_b16_e32 v13, 9, v0
-; GISEL-NEXT:    v_lshlrev_b16_e32 v14, 10, v0
-; GISEL-NEXT:    v_lshlrev_b16_e32 v15, 11, v0
-; GISEL-NEXT:    v_lshlrev_b16_e32 v16, 12, v0
-; GISEL-NEXT:    v_lshlrev_b16_e32 v17, 13, v0
-; GISEL-NEXT:    v_lshlrev_b16_e32 v18, 14, v0
-; GISEL-NEXT:    v_lshlrev_b16_e32 v19, 15, v0
-; GISEL-NEXT:    v_or_b32_e32 v0, v0, v2
-; GISEL-NEXT:    v_or_b32_e32 v1, v1, v2
-; GISEL-NEXT:    v_or_b32_e32 v0, v0, v3
-; GISEL-NEXT:    v_or_b32_e32 v1, v1, v3
-; GISEL-NEXT:    v_or_b32_e32 v0, v0, v7
-; GISEL-NEXT:    v_or_b32_e32 v1, v1, v7
-; GISEL-NEXT:    v_or_b32_e32 v0, v0, v8
-; GISEL-NEXT:    v_or_b32_e32 v1, v1, v8
-; GISEL-NEXT:    v_or_b32_e32 v0, v0, v9
-; GISEL-NEXT:    v_or_b32_e32 v1, v1, v9
-; GISEL-NEXT:    v_or_b32_e32 v0, v0, v10
-; GISEL-NEXT:    v_or_b32_e32 v1, v1, v10
-; GISEL-NEXT:    v_or_b32_e32 v0, v0, v11
-; GISEL-NEXT:    v_or_b32_e32 v1, v1, v11
-; GISEL-NEXT:    v_or_b32_e32 v0, v0, v12
-; GISEL-NEXT:    v_or_b32_e32 v1, v1, v12
-; GISEL-NEXT:    v_or_b32_e32 v0, v0, v13
-; GISEL-NEXT:    v_or_b32_e32 v1, v1, v13
-; GISEL-NEXT:    v_or_b32_e32 v0, v0, v14
-; GISEL-NEXT:    v_or_b32_e32 v1, v1, v14
-; GISEL-NEXT:    v_or_b32_e32 v0, v0, v15
-; GISEL-NEXT:    v_or_b32_e32 v1, v1, v15
-; GISEL-NEXT:    v_or_b32_e32 v0, v0, v16
-; GISEL-NEXT:    v_or_b32_e32 v1, v1, v16
-; GISEL-NEXT:    v_or_b32_e32 v0, v0, v17
-; GISEL-NEXT:    v_or_b32_e32 v1, v1, v17
-; GISEL-NEXT:    v_or_b32_e32 v0, v0, v18
-; GISEL-NEXT:    v_or_b32_e32 v1, v1, v18
-; GISEL-NEXT:    v_or_b32_e32 v0, v0, v19
-; GISEL-NEXT:    v_or_b32_e32 v1, v1, v19
-; GISEL-NEXT:    v_and_b32_e32 v11, 0xffff, v0
-; GISEL-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; GISEL-NEXT:    v_lshlrev_b32_e32 v0, 16, v11
-; GISEL-NEXT:    v_or3_b32 v9, v1, v0, 1
-; GISEL-NEXT:    v_or3_b32 v10, v11, v0, 0
-; GISEL-NEXT:    v_mov_b32_e32 v0, 0x86
-; GISEL-NEXT:    v_mov_b32_e32 v1, 0
-; GISEL-NEXT:    v_and_b32_e32 v2, 0x7f, v4
-; GISEL-NEXT:    v_cmp_ge_u64_e32 vcc, v[5:6], v[0:1]
-; GISEL-NEXT:    v_or_b32_e32 v7, 0x80, v2
-; GISEL-NEXT:    v_mov_b32_e32 v8, 0
-; GISEL-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
-; GISEL-NEXT:    s_and_saveexec_b64 s[6:7], vcc
-; GISEL-NEXT:    s_xor_b64 s[16:17], exec, s[6:7]
-; GISEL-NEXT:    s_cbranch_execz .LBB6_4
-; GISEL-NEXT:  ; %bb.3: ; %fp-to-i-if-else
-; GISEL-NEXT:    v_add_u32_e32 v6, 0xffffff7a, v5
-; GISEL-NEXT:    v_lshlrev_b64 v[0:1], v6, v[7:8]
-; GISEL-NEXT:    v_add_u32_e32 v4, 0xffffff3a, v5
-; GISEL-NEXT:    v_sub_u32_e32 v2, 64, v6
-; GISEL-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v6
-; GISEL-NEXT:    v_lshl_or_b32 v11, v11, 16, v11
-; GISEL-NEXT:    v_lshrrev_b64 v[2:3], v2, v[7:8]
-; GISEL-NEXT:    v_lshlrev_b64 v[4:5], v4, v[7:8]
-; GISEL-NEXT:    v_cndmask_b32_e32 v8, 0, v0, vcc
-; GISEL-NEXT:    v_cndmask_b32_e32 v12, 0, v1, vcc
-; GISEL-NEXT:    v_mad_u64_u32 v[0:1], s[6:7], v8, v11, 0
-; GISEL-NEXT:    v_cmp_eq_u32_e64 s[6:7], 0, v6
-; GISEL-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
-; GISEL-NEXT:    v_mad_u64_u32 v[6:7], s[8:9], v12, v10, v[0:1]
-; GISEL-NEXT:    v_cndmask_b32_e64 v13, v2, 0, s[6:7]
-; GISEL-NEXT:    v_mad_u64_u32 v[0:1], s[8:9], v8, v9, 0
-; GISEL-NEXT:    v_mad_u64_u32 v[6:7], s[8:9], v13, v9, v[6:7]
-; GISEL-NEXT:    v_mul_lo_u32 v4, v12, v11
-; GISEL-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
-; GISEL-NEXT:    v_mov_b32_e32 v2, v6
-; GISEL-NEXT:    v_mad_u64_u32 v[1:2], s[8:9], v8, v10, v[1:2]
-; GISEL-NEXT:    v_mul_lo_u32 v6, v8, v11
-; GISEL-NEXT:    v_cndmask_b32_e64 v3, v3, 0, s[6:7]
-; GISEL-NEXT:    v_mad_u64_u32 v[1:2], s[10:11], v12, v9, v[1:2]
-; GISEL-NEXT:    v_addc_co_u32_e64 v6, s[10:11], v7, v6, s[10:11]
-; GISEL-NEXT:    v_addc_co_u32_e64 v4, s[8:9], v6, v4, s[8:9]
-; GISEL-NEXT:    v_mad_u64_u32 v[6:7], s[8:9], v13, v10, v[4:5]
-; GISEL-NEXT:    ; implicit-def: $vgpr5
-; GISEL-NEXT:    v_mad_u64_u32 v[3:4], s[6:7], v3, v9, v[6:7]
-; GISEL-NEXT:    ; implicit-def: $vgpr7_vgpr8
-; GISEL-NEXT:    ; implicit-def: $vgpr9
-; GISEL-NEXT:  .LBB6_4: ; %Flow
-; GISEL-NEXT:    s_andn2_saveexec_b64 s[6:7], s[16:17]
-; GISEL-NEXT:    s_cbranch_execz .LBB6_6
-; GISEL-NEXT:  ; %bb.5: ; %fp-to-i-if-then12
-; GISEL-NEXT:    v_sub_co_u32_e32 v2, vcc, 0x86, v5
-; GISEL-NEXT:    v_lshrrev_b64 v[0:1], v2, v[7:8]
-; GISEL-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v2
-; GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
-; GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
-; GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v7, vcc
-; GISEL-NEXT:    v_mul_hi_i32_i24_e32 v1, v0, v9
-; GISEL-NEXT:    v_ashrrev_i32_e32 v2, 31, v1
-; GISEL-NEXT:    v_mul_i32_i24_e32 v0, v0, v9
-; GISEL-NEXT:    v_mov_b32_e32 v3, v2
-; GISEL-NEXT:  .LBB6_6: ; %Flow1
-; GISEL-NEXT:    s_or_b64 exec, exec, s[6:7]
-; GISEL-NEXT:  .LBB6_7: ; %Flow2
-; GISEL-NEXT:    s_andn2_saveexec_b64 s[6:7], s[14:15]
-; GISEL-NEXT:    s_cbranch_execz .LBB6_9
-; GISEL-NEXT:  ; %bb.8: ; %fp-to-i-if-then5
-; GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, -1, s[4:5]
-; GISEL-NEXT:    v_and_b32_e32 v1, 1, v1
-; GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
-; GISEL-NEXT:    v_lshlrev_b32_e32 v2, 1, v1
-; GISEL-NEXT:    v_or_b32_e32 v0, v0, v2
-; GISEL-NEXT:    v_lshlrev_b32_e32 v3, 2, v1
-; GISEL-NEXT:    v_lshlrev_b32_e32 v4, 3, v1
-; GISEL-NEXT:    v_or_b32_e32 v2, v1, v2
-; GISEL-NEXT:    v_or3_b32 v0, v0, v3, v4
-; GISEL-NEXT:    v_lshlrev_b32_e32 v5, 4, v1
-; GISEL-NEXT:    v_lshlrev_b32_e32 v6, 5, v1
-; GISEL-NEXT:    v_or3_b32 v2, v2, v3, v4
-; GISEL-NEXT:    v_or3_b32 v0, v0, v5, v6
-; GISEL-NEXT:    v_lshlrev_b32_e32 v7, 6, v1
-; GISEL-NEXT:    v_lshlrev_b32_e32 v8, 7, v1
-; GISEL-NEXT:    v_or3_b32 v2, v2, v5, v6
-; GISEL-NEXT:    v_or3_b32 v0, v0, v7, v8
-; GISEL-NEXT:    v_lshlrev_b32_e32 v9, 8, v1
-; GISEL-NEXT:    v_lshlrev_b32_e32 v10, 9, v1
-; GISEL-NEXT:    v_or3_b32 v2, v2, v7, v8
-; GISEL-NEXT:    v_or3_b32 v0, v0, v9, v10
-; GISEL-NEXT:    v_lshlrev_b32_e32 v11, 10, v1
-; GISEL-NEXT:    v_lshlrev_b32_e32 v12, 11, v1
-; GISEL-NEXT:    v_or3_b32 v2, v2, v9, v10
-; GISEL-NEXT:    v_or3_b32 v0, v0, v11, v12
-; GISEL-NEXT:    v_lshlrev_b32_e32 v13, 12, v1
-; GISEL-NEXT:    v_lshlrev_b32_e32 v14, 13, v1
-; GISEL-NEXT:    v_or3_b32 v2, v2, v11, v12
-; GISEL-NEXT:    v_or3_b32 v0, v0, v13, v14
-; GISEL-NEXT:    v_lshlrev_b32_e32 v15, 14, v1
-; GISEL-NEXT:    v_lshlrev_b32_e32 v16, 15, v1
-; GISEL-NEXT:    v_or3_b32 v2, v2, v13, v14
-; GISEL-NEXT:    v_or3_b32 v0, v0, v15, v16
-; GISEL-NEXT:    v_lshlrev_b32_e32 v17, 16, v1
-; GISEL-NEXT:    v_lshlrev_b32_e32 v18, 17, v1
-; GISEL-NEXT:    v_or3_b32 v2, v2, v15, v16
-; GISEL-NEXT:    v_or3_b32 v0, v0, v17, v18
-; GISEL-NEXT:    v_lshlrev_b32_e32 v19, 18, v1
-; GISEL-NEXT:    v_lshlrev_b32_e32 v3, 19, v1
-; GISEL-NEXT:    v_or3_b32 v2, v2, v17, v18
-; GISEL-NEXT:    v_or3_b32 v0, v0, v19, v3
-; GISEL-NEXT:    v_lshlrev_b32_e32 v4, 20, v1
-; GISEL-NEXT:    v_lshlrev_b32_e32 v5, 21, v1
-; GISEL-NEXT:    v_or3_b32 v2, v2, v19, v3
-; GISEL-NEXT:    v_or3_b32 v0, v0, v4, v5
-; GISEL-NEXT:    v_lshlrev_b32_e32 v6, 22, v1
-; GISEL-NEXT:    v_lshlrev_b32_e32 v7, 23, v1
-; GISEL-NEXT:    v_or3_b32 v2, v2, v4, v5
-; GISEL-NEXT:    v_or3_b32 v0, v0, v6, v7
-; GISEL-NEXT:    v_lshlrev_b32_e32 v8, 24, v1
-; GISEL-NEXT:    v_lshlrev_b32_e32 v9, 25, v1
-; GISEL-NEXT:    v_or3_b32 v2, v2, v6, v7
-; GISEL-NEXT:    v_or3_b32 v0, v0, v8, v9
-; GISEL-NEXT:    v_lshlrev_b32_e32 v10, 26, v1
-; GISEL-NEXT:    v_lshlrev_b32_e32 v11, 27, v1
-; GISEL-NEXT:    v_or3_b32 v2, v2, v8, v9
-; GISEL-NEXT:    v_or3_b32 v0, v0, v10, v11
-; GISEL-NEXT:    v_lshlrev_b32_e32 v12, 28, v1
-; GISEL-NEXT:    v_lshlrev_b32_e32 v13, 29, v1
-; GISEL-NEXT:    v_or3_b32 v2, v2, v10, v11
-; GISEL-NEXT:    v_or3_b32 v0, v0, v12, v13
-; GISEL-NEXT:    v_lshlrev_b32_e32 v14, 30, v1
-; GISEL-NEXT:    v_lshlrev_b32_e32 v1, 31, v1
-; GISEL-NEXT:    v_or3_b32 v2, v2, v12, v13
-; GISEL-NEXT:    v_or3_b32 v0, v0, v14, v1
-; GISEL-NEXT:    v_or3_b32 v1, v2, v14, v1
-; GISEL-NEXT:    v_add_u32_e32 v3, 0x80000000, v1
-; GISEL-NEXT:    v_mov_b32_e32 v2, v1
-; GISEL-NEXT:  .LBB6_9: ; %Flow3
-; GISEL-NEXT:    s_or_b64 exec, exec, s[6:7]
-; GISEL-NEXT:  .LBB6_10: ; %fp-to-i-cleanup
-; GISEL-NEXT:    s_or_b64 exec, exec, s[12:13]
-; GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GCN-LABEL: fptosi_bf16_to_i128:
+; GCN:       ; %bb.0: ; %fp-to-i-entry
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_mov_b32_e32 v4, v0
+; GCN-NEXT:    v_bfe_u32 v5, v4, 7, 8
+; GCN-NEXT:    s_movk_i32 s4, 0x7e
+; GCN-NEXT:    v_mov_b32_e32 v0, 0
+; GCN-NEXT:    v_mov_b32_e32 v2, 0
+; GCN-NEXT:    v_mov_b32_e32 v6, 0
+; GCN-NEXT:    v_mov_b32_e32 v1, 0
+; GCN-NEXT:    v_mov_b32_e32 v3, 0
+; GCN-NEXT:    v_cmp_lt_u32_e32 vcc, s4, v5
+; GCN-NEXT:    s_and_saveexec_b64 s[8:9], vcc
+; GCN-NEXT:    s_cbranch_execz .LBB6_10
+; GCN-NEXT:  ; %bb.1: ; %fp-to-i-if-end
+; GCN-NEXT:    v_add_co_u32_e32 v0, vcc, 0xffffff01, v5
+; GCN-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v6, vcc
+; GCN-NEXT:    v_addc_co_u32_e32 v2, vcc, -1, v6, vcc
+; GCN-NEXT:    s_movk_i32 s6, 0xff7f
+; GCN-NEXT:    v_addc_co_u32_e32 v3, vcc, -1, v6, vcc
+; GCN-NEXT:    s_mov_b32 s7, -1
+; GCN-NEXT:    v_cmp_eq_u64_e64 s[4:5], -1, v[2:3]
+; GCN-NEXT:    v_cmp_lt_u64_e64 s[6:7], s[6:7], v[0:1]
+; GCN-NEXT:    v_cmp_lt_i16_e32 vcc, -1, v4
+; GCN-NEXT:    s_and_b64 s[4:5], s[4:5], s[6:7]
+; GCN-NEXT:    ; implicit-def: $vgpr0_vgpr1
+; GCN-NEXT:    ; implicit-def: $vgpr2_vgpr3
+; GCN-NEXT:    s_and_saveexec_b64 s[6:7], s[4:5]
+; GCN-NEXT:    s_xor_b64 s[10:11], exec, s[6:7]
+; GCN-NEXT:    s_cbranch_execz .LBB6_7
+; GCN-NEXT:  ; %bb.2: ; %fp-to-i-if-end9
+; GCN-NEXT:    s_movk_i32 s4, 0x7f
+; GCN-NEXT:    v_and_b32_sdwa v0, v4, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GCN-NEXT:    s_mov_b64 s[4:5], 0x85
+; GCN-NEXT:    v_cmp_lt_u64_e64 s[4:5], s[4:5], v[5:6]
+; GCN-NEXT:    v_mov_b32_e32 v7, 0
+; GCN-NEXT:    v_cndmask_b32_e64 v9, -1, 0, vcc
+; GCN-NEXT:    v_cndmask_b32_e64 v8, -1, 1, vcc
+; GCN-NEXT:    v_or_b32_e32 v6, 0x80, v0
+; GCN-NEXT:    ; implicit-def: $vgpr0_vgpr1
+; GCN-NEXT:    ; implicit-def: $vgpr2_vgpr3
+; GCN-NEXT:    s_and_saveexec_b64 s[6:7], s[4:5]
+; GCN-NEXT:    s_xor_b64 s[12:13], exec, s[6:7]
+; GCN-NEXT:    s_cbranch_execz .LBB6_4
+; GCN-NEXT:  ; %bb.3: ; %fp-to-i-if-else
+; GCN-NEXT:    v_sub_u32_e32 v0, 0xc6, v5
+; GCN-NEXT:    v_add_u32_e32 v2, 0xffffff3a, v5
+; GCN-NEXT:    v_add_u32_e32 v4, 0xffffff7a, v5
+; GCN-NEXT:    v_lshrrev_b64 v[0:1], v0, v[6:7]
+; GCN-NEXT:    v_lshlrev_b64 v[2:3], v2, v[6:7]
+; GCN-NEXT:    v_cmp_gt_u32_e64 s[4:5], 64, v4
+; GCN-NEXT:    v_cndmask_b32_e64 v1, v3, v1, s[4:5]
+; GCN-NEXT:    v_cmp_ne_u32_e64 s[6:7], 0, v4
+; GCN-NEXT:    v_cndmask_b32_e64 v3, 0, v1, s[6:7]
+; GCN-NEXT:    v_cndmask_b32_e64 v2, v2, v0, s[4:5]
+; GCN-NEXT:    v_lshlrev_b64 v[0:1], v4, v[6:7]
+; GCN-NEXT:    v_cndmask_b32_e64 v2, 0, v2, s[6:7]
+; GCN-NEXT:    v_cndmask_b32_e64 v12, 0, v0, s[4:5]
+; GCN-NEXT:    v_cndmask_b32_e64 v11, 0, v1, s[4:5]
+; GCN-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v12, v8, 0
+; GCN-NEXT:    v_mul_lo_u32 v13, v9, v2
+; GCN-NEXT:    v_mul_lo_u32 v14, v8, v3
+; GCN-NEXT:    v_mov_b32_e32 v6, v1
+; GCN-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v11, v8, v[6:7]
+; GCN-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v8, v2, 0
+; GCN-NEXT:    v_mov_b32_e32 v8, v5
+; GCN-NEXT:    v_mov_b32_e32 v5, v7
+; GCN-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v12, v9, v[4:5]
+; GCN-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
+; GCN-NEXT:    v_add_co_u32_e64 v6, s[4:5], -1, v10
+; GCN-NEXT:    v_add3_u32 v3, v3, v14, v13
+; GCN-NEXT:    v_mad_u64_u32 v[1:2], s[4:5], v6, v12, v[2:3]
+; GCN-NEXT:    v_add_co_u32_e64 v5, s[4:5], v8, v5
+; GCN-NEXT:    v_mul_lo_u32 v3, v6, v11
+; GCN-NEXT:    v_mul_lo_u32 v7, v6, v12
+; GCN-NEXT:    v_addc_co_u32_e64 v6, s[4:5], 0, 0, s[4:5]
+; GCN-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], v11, v9, v[5:6]
+; GCN-NEXT:    v_add3_u32 v3, v7, v2, v3
+; GCN-NEXT:    ; implicit-def: $vgpr8
+; GCN-NEXT:    v_add_co_u32_e64 v2, s[4:5], v5, v1
+; GCN-NEXT:    v_addc_co_u32_e64 v3, s[4:5], v6, v3, s[4:5]
+; GCN-NEXT:    ; implicit-def: $vgpr5_vgpr6
+; GCN-NEXT:    v_mov_b32_e32 v1, v4
+; GCN-NEXT:    ; implicit-def: $vgpr6_vgpr7
+; GCN-NEXT:  .LBB6_4: ; %Flow
+; GCN-NEXT:    s_andn2_saveexec_b64 s[6:7], s[12:13]
+; GCN-NEXT:    s_cbranch_execz .LBB6_6
+; GCN-NEXT:  ; %bb.5: ; %fp-to-i-if-then12
+; GCN-NEXT:    v_sub_u32_e32 v2, 0x86, v5
+; GCN-NEXT:    v_lshrrev_b64 v[0:1], v2, v[6:7]
+; GCN-NEXT:    v_cmp_gt_u32_e64 s[4:5], 64, v2
+; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, v0, s[4:5]
+; GCN-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v2
+; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, v6, s[4:5]
+; GCN-NEXT:    v_mul_hi_i32_i24_e32 v1, v0, v8
+; GCN-NEXT:    v_ashrrev_i32_e32 v2, 31, v1
+; GCN-NEXT:    v_mul_i32_i24_e32 v0, v0, v8
+; GCN-NEXT:    v_mov_b32_e32 v3, v2
+; GCN-NEXT:  .LBB6_6: ; %Flow1
+; GCN-NEXT:    s_or_b64 exec, exec, s[6:7]
+; GCN-NEXT:  .LBB6_7: ; %Flow2
+; GCN-NEXT:    s_andn2_saveexec_b64 s[4:5], s[10:11]
+; GCN-NEXT:  ; %bb.8: ; %fp-to-i-if-then5
+; GCN-NEXT:    v_bfrev_b32_e32 v0, 1
+; GCN-NEXT:    v_bfrev_b32_e32 v1, -2
+; GCN-NEXT:    v_cndmask_b32_e64 v2, 0, -1, vcc
+; GCN-NEXT:    v_cndmask_b32_e32 v3, v0, v1, vcc
+; GCN-NEXT:    v_mov_b32_e32 v0, v2
+; GCN-NEXT:    v_mov_b32_e32 v1, v2
+; GCN-NEXT:  ; %bb.9: ; %Flow3
+; GCN-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GCN-NEXT:  .LBB6_10: ; %fp-to-i-cleanup
+; GCN-NEXT:    s_or_b64 exec, exec, s[8:9]
+; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cvt = fptosi bfloat %x to i128
   ret i128 %cvt
 }
 
 define i128 @fptoui_bf16_to_i128(bfloat %x) {
-; SDAG-LABEL: fptoui_bf16_to_i128:
-; SDAG:       ; %bb.0: ; %fp-to-i-entry
-; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_mov_b32_e32 v4, v0
-; SDAG-NEXT:    v_bfe_u32 v5, v4, 7, 8
-; SDAG-NEXT:    s_movk_i32 s4, 0x7e
-; SDAG-NEXT:    v_mov_b32_e32 v0, 0
-; SDAG-NEXT:    v_mov_b32_e32 v2, 0
-; SDAG-NEXT:    v_mov_b32_e32 v6, 0
-; SDAG-NEXT:    v_mov_b32_e32 v1, 0
-; SDAG-NEXT:    v_mov_b32_e32 v3, 0
-; SDAG-NEXT:    v_cmp_lt_u32_e32 vcc, s4, v5
-; SDAG-NEXT:    s_and_saveexec_b64 s[8:9], vcc
-; SDAG-NEXT:    s_cbranch_execz .LBB7_10
-; SDAG-NEXT:  ; %bb.1: ; %fp-to-i-if-end
-; SDAG-NEXT:    v_add_co_u32_e32 v0, vcc, 0xffffff01, v5
-; SDAG-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v6, vcc
-; SDAG-NEXT:    v_addc_co_u32_e32 v2, vcc, -1, v6, vcc
-; SDAG-NEXT:    s_movk_i32 s6, 0xff7f
-; SDAG-NEXT:    v_addc_co_u32_e32 v3, vcc, -1, v6, vcc
-; SDAG-NEXT:    s_mov_b32 s7, -1
-; SDAG-NEXT:    v_cmp_eq_u64_e64 s[4:5], -1, v[2:3]
-; SDAG-NEXT:    v_cmp_lt_u64_e64 s[6:7], s[6:7], v[0:1]
-; SDAG-NEXT:    v_cmp_lt_i16_e32 vcc, -1, v4
-; SDAG-NEXT:    s_and_b64 s[4:5], s[4:5], s[6:7]
-; SDAG-NEXT:    ; implicit-def: $vgpr0_vgpr1
-; SDAG-NEXT:    ; implicit-def: $vgpr2_vgpr3
-; SDAG-NEXT:    s_and_saveexec_b64 s[6:7], s[4:5]
-; SDAG-NEXT:    s_xor_b64 s[10:11], exec, s[6:7]
-; SDAG-NEXT:    s_cbranch_execz .LBB7_7
-; SDAG-NEXT:  ; %bb.2: ; %fp-to-i-if-end9
-; SDAG-NEXT:    s_movk_i32 s4, 0x7f
-; SDAG-NEXT:    v_and_b32_sdwa v0, v4, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; SDAG-NEXT:    s_mov_b64 s[4:5], 0x85
-; SDAG-NEXT:    v_cmp_lt_u64_e64 s[4:5], s[4:5], v[5:6]
-; SDAG-NEXT:    v_mov_b32_e32 v7, 0
-; SDAG-NEXT:    v_cndmask_b32_e64 v9, -1, 0, vcc
-; SDAG-NEXT:    v_cndmask_b32_e64 v8, -1, 1, vcc
-; SDAG-NEXT:    v_or_b32_e32 v6, 0x80, v0
-; SDAG-NEXT:    ; implicit-def: $vgpr0_vgpr1
-; SDAG-NEXT:    ; implicit-def: $vgpr2_vgpr3
-; SDAG-NEXT:    s_and_saveexec_b64 s[6:7], s[4:5]
-; SDAG-NEXT:    s_xor_b64 s[12:13], exec, s[6:7]
-; SDAG-NEXT:    s_cbranch_execz .LBB7_4
-; SDAG-NEXT:  ; %bb.3: ; %fp-to-i-if-else
-; SDAG-NEXT:    v_sub_u32_e32 v0, 0xc6, v5
-; SDAG-NEXT:    v_add_u32_e32 v2, 0xffffff3a, v5
-; SDAG-NEXT:    v_add_u32_e32 v4, 0xffffff7a, v5
-; SDAG-NEXT:    v_lshrrev_b64 v[0:1], v0, v[6:7]
-; SDAG-NEXT:    v_lshlrev_b64 v[2:3], v2, v[6:7]
-; SDAG-NEXT:    v_cmp_gt_u32_e64 s[4:5], 64, v4
-; SDAG-NEXT:    v_cndmask_b32_e64 v1, v3, v1, s[4:5]
-; SDAG-NEXT:    v_cmp_ne_u32_e64 s[6:7], 0, v4
-; SDAG-NEXT:    v_cndmask_b32_e64 v3, 0, v1, s[6:7]
-; SDAG-NEXT:    v_cndmask_b32_e64 v2, v2, v0, s[4:5]
-; SDAG-NEXT:    v_lshlrev_b64 v[0:1], v4, v[6:7]
-; SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, v2, s[6:7]
-; SDAG-NEXT:    v_cndmask_b32_e64 v12, 0, v0, s[4:5]
-; SDAG-NEXT:    v_cndmask_b32_e64 v11, 0, v1, s[4:5]
-; SDAG-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v12, v8, 0
-; SDAG-NEXT:    v_mul_lo_u32 v13, v9, v2
-; SDAG-NEXT:    v_mul_lo_u32 v14, v8, v3
-; SDAG-NEXT:    v_mov_b32_e32 v6, v1
-; SDAG-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v11, v8, v[6:7]
-; SDAG-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v8, v2, 0
-; SDAG-NEXT:    v_mov_b32_e32 v8, v5
-; SDAG-NEXT:    v_mov_b32_e32 v5, v7
-; SDAG-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v12, v9, v[4:5]
-; SDAG-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
-; SDAG-NEXT:    v_add_co_u32_e64 v6, s[4:5], -1, v10
-; SDAG-NEXT:    v_add3_u32 v3, v3, v14, v13
-; SDAG-NEXT:    v_mad_u64_u32 v[1:2], s[4:5], v6, v12, v[2:3]
-; SDAG-NEXT:    v_add_co_u32_e64 v5, s[4:5], v8, v5
-; SDAG-NEXT:    v_mul_lo_u32 v3, v6, v11
-; SDAG-NEXT:    v_mul_lo_u32 v7, v6, v12
-; SDAG-NEXT:    v_addc_co_u32_e64 v6, s[4:5], 0, 0, s[4:5]
-; SDAG-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], v11, v9, v[5:6]
-; SDAG-NEXT:    v_add3_u32 v3, v7, v2, v3
-; SDAG-NEXT:    ; implicit-def: $vgpr8
-; SDAG-NEXT:    v_add_co_u32_e64 v2, s[4:5], v5, v1
-; SDAG-NEXT:    v_addc_co_u32_e64 v3, s[4:5], v6, v3, s[4:5]
-; SDAG-NEXT:    ; implicit-def: $vgpr5_vgpr6
-; SDAG-NEXT:    v_mov_b32_e32 v1, v4
-; SDAG-NEXT:    ; implicit-def: $vgpr6_vgpr7
-; SDAG-NEXT:  .LBB7_4: ; %Flow
-; SDAG-NEXT:    s_andn2_saveexec_b64 s[6:7], s[12:13]
-; SDAG-NEXT:    s_cbranch_execz .LBB7_6
-; SDAG-NEXT:  ; %bb.5: ; %fp-to-i-if-then12
-; SDAG-NEXT:    v_sub_u32_e32 v2, 0x86, v5
-; SDAG-NEXT:    v_lshrrev_b64 v[0:1], v2, v[6:7]
-; SDAG-NEXT:    v_cmp_gt_u32_e64 s[4:5], 64, v2
-; SDAG-NEXT:    v_cndmask_b32_e64 v0, 0, v0, s[4:5]
-; SDAG-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v2
-; SDAG-NEXT:    v_cndmask_b32_e64 v0, v0, v6, s[4:5]
-; SDAG-NEXT:    v_mul_hi_i32_i24_e32 v1, v0, v8
-; SDAG-NEXT:    v_ashrrev_i32_e32 v2, 31, v1
-; SDAG-NEXT:    v_mul_i32_i24_e32 v0, v0, v8
-; SDAG-NEXT:    v_mov_b32_e32 v3, v2
-; SDAG-NEXT:  .LBB7_6: ; %Flow1
-; SDAG-NEXT:    s_or_b64 exec, exec, s[6:7]
-; SDAG-NEXT:  .LBB7_7: ; %Flow2
-; SDAG-NEXT:    s_andn2_saveexec_b64 s[4:5], s[10:11]
-; SDAG-NEXT:  ; %bb.8: ; %fp-to-i-if-then5
-; SDAG-NEXT:    v_bfrev_b32_e32 v0, 1
-; SDAG-NEXT:    v_bfrev_b32_e32 v1, -2
-; SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, -1, vcc
-; SDAG-NEXT:    v_cndmask_b32_e32 v3, v0, v1, vcc
-; SDAG-NEXT:    v_mov_b32_e32 v0, v2
-; SDAG-NEXT:    v_mov_b32_e32 v1, v2
-; SDAG-NEXT:  ; %bb.9: ; %Flow3
-; SDAG-NEXT:    s_or_b64 exec, exec, s[4:5]
-; SDAG-NEXT:  .LBB7_10: ; %fp-to-i-cleanup
-; SDAG-NEXT:    s_or_b64 exec, exec, s[8:9]
-; SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: fptoui_bf16_to_i128:
-; GISEL:       ; %bb.0: ; %fp-to-i-entry
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    v_mov_b32_e32 v4, v0
-; GISEL-NEXT:    v_and_b32_e32 v5, 0xffff, v4
-; GISEL-NEXT:    v_mov_b32_e32 v6, 0
-; GISEL-NEXT:    v_lshrrev_b64 v[0:1], 7, v[5:6]
-; GISEL-NEXT:    v_mov_b32_e32 v1, 0x7f
-; GISEL-NEXT:    s_mov_b64 s[4:5], 0
-; GISEL-NEXT:    v_mov_b32_e32 v2, 0
-; GISEL-NEXT:    v_bfe_u32 v5, v0, 0, 8
-; GISEL-NEXT:    v_cmp_ge_u64_e32 vcc, v[5:6], v[1:2]
-; GISEL-NEXT:    s_mov_b64 s[6:7], s[4:5]
-; GISEL-NEXT:    v_mov_b32_e32 v0, s4
-; GISEL-NEXT:    v_mov_b32_e32 v1, s5
-; GISEL-NEXT:    v_mov_b32_e32 v2, s6
-; GISEL-NEXT:    v_mov_b32_e32 v3, s7
-; GISEL-NEXT:    s_and_saveexec_b64 s[12:13], vcc
-; GISEL-NEXT:    s_cbranch_execz .LBB7_10
-; GISEL-NEXT:  ; %bb.1: ; %fp-to-i-if-end
-; GISEL-NEXT:    v_add_co_u32_e32 v0, vcc, 0xffffff01, v5
-; GISEL-NEXT:    v_mov_b32_e32 v2, 0xffffff80
-; GISEL-NEXT:    v_addc_co_u32_e64 v1, s[6:7], 0, -1, vcc
-; GISEL-NEXT:    v_mov_b32_e32 v3, -1
-; GISEL-NEXT:    v_addc_co_u32_e64 v7, s[6:7], 0, -1, s[6:7]
-; GISEL-NEXT:    v_cmp_ge_u64_e32 vcc, v[0:1], v[2:3]
-; GISEL-NEXT:    v_addc_co_u32_e64 v8, s[6:7], 0, -1, s[6:7]
-; GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
-; GISEL-NEXT:    v_cmp_le_u64_e32 vcc, -1, v[7:8]
-; GISEL-NEXT:    v_cmp_lt_i16_e64 s[4:5], -1, v4
-; GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
-; GISEL-NEXT:    v_cmp_eq_u64_e32 vcc, -1, v[7:8]
-; GISEL-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
-; GISEL-NEXT:    v_and_b32_e32 v0, 1, v0
-; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
-; GISEL-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
-; GISEL-NEXT:    s_and_saveexec_b64 s[6:7], vcc
-; GISEL-NEXT:    s_xor_b64 s[14:15], exec, s[6:7]
-; GISEL-NEXT:    s_cbranch_execz .LBB7_7
-; GISEL-NEXT:  ; %bb.2: ; %fp-to-i-if-end9
-; GISEL-NEXT:    s_xor_b64 s[6:7], s[4:5], -1
-; GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, -1, s[6:7]
-; GISEL-NEXT:    v_and_b32_e32 v0, 1, v0
-; GISEL-NEXT:    v_lshlrev_b16_e32 v2, 1, v0
-; GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[6:7]
-; GISEL-NEXT:    v_lshlrev_b16_e32 v3, 2, v0
-; GISEL-NEXT:    v_lshlrev_b16_e32 v7, 3, v0
-; GISEL-NEXT:    v_lshlrev_b16_e32 v8, 4, v0
-; GISEL-NEXT:    v_lshlrev_b16_e32 v9, 5, v0
-; GISEL-NEXT:    v_lshlrev_b16_e32 v10, 6, v0
-; GISEL-NEXT:    v_lshlrev_b16_e32 v11, 7, v0
-; GISEL-NEXT:    v_lshlrev_b16_e32 v12, 8, v0
-; GISEL-NEXT:    v_lshlrev_b16_e32 v13, 9, v0
-; GISEL-NEXT:    v_lshlrev_b16_e32 v14, 10, v0
-; GISEL-NEXT:    v_lshlrev_b16_e32 v15, 11, v0
-; GISEL-NEXT:    v_lshlrev_b16_e32 v16, 12, v0
-; GISEL-NEXT:    v_lshlrev_b16_e32 v17, 13, v0
-; GISEL-NEXT:    v_lshlrev_b16_e32 v18, 14, v0
-; GISEL-NEXT:    v_lshlrev_b16_e32 v19, 15, v0
-; GISEL-NEXT:    v_or_b32_e32 v0, v0, v2
-; GISEL-NEXT:    v_or_b32_e32 v1, v1, v2
-; GISEL-NEXT:    v_or_b32_e32 v0, v0, v3
-; GISEL-NEXT:    v_or_b32_e32 v1, v1, v3
-; GISEL-NEXT:    v_or_b32_e32 v0, v0, v7
-; GISEL-NEXT:    v_or_b32_e32 v1, v1, v7
-; GISEL-NEXT:    v_or_b32_e32 v0, v0, v8
-; GISEL-NEXT:    v_or_b32_e32 v1, v1, v8
-; GISEL-NEXT:    v_or_b32_e32 v0, v0, v9
-; GISEL-NEXT:    v_or_b32_e32 v1, v1, v9
-; GISEL-NEXT:    v_or_b32_e32 v0, v0, v10
-; GISEL-NEXT:    v_or_b32_e32 v1, v1, v10
-; GISEL-NEXT:    v_or_b32_e32 v0, v0, v11
-; GISEL-NEXT:    v_or_b32_e32 v1, v1, v11
-; GISEL-NEXT:    v_or_b32_e32 v0, v0, v12
-; GISEL-NEXT:    v_or_b32_e32 v1, v1, v12
-; GISEL-NEXT:    v_or_b32_e32 v0, v0, v13
-; GISEL-NEXT:    v_or_b32_e32 v1, v1, v13
-; GISEL-NEXT:    v_or_b32_e32 v0, v0, v14
-; GISEL-NEXT:    v_or_b32_e32 v1, v1, v14
-; GISEL-NEXT:    v_or_b32_e32 v0, v0, v15
-; GISEL-NEXT:    v_or_b32_e32 v1, v1, v15
-; GISEL-NEXT:    v_or_b32_e32 v0, v0, v16
-; GISEL-NEXT:    v_or_b32_e32 v1, v1, v16
-; GISEL-NEXT:    v_or_b32_e32 v0, v0, v17
-; GISEL-NEXT:    v_or_b32_e32 v1, v1, v17
-; GISEL-NEXT:    v_or_b32_e32 v0, v0, v18
-; GISEL-NEXT:    v_or_b32_e32 v1, v1, v18
-; GISEL-NEXT:    v_or_b32_e32 v0, v0, v19
-; GISEL-NEXT:    v_or_b32_e32 v1, v1, v19
-; GISEL-NEXT:    v_and_b32_e32 v11, 0xffff, v0
-; GISEL-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; GISEL-NEXT:    v_lshlrev_b32_e32 v0, 16, v11
-; GISEL-NEXT:    v_or3_b32 v9, v1, v0, 1
-; GISEL-NEXT:    v_or3_b32 v10, v11, v0, 0
-; GISEL-NEXT:    v_mov_b32_e32 v0, 0x86
-; GISEL-NEXT:    v_mov_b32_e32 v1, 0
-; GISEL-NEXT:    v_and_b32_e32 v2, 0x7f, v4
-; GISEL-NEXT:    v_cmp_ge_u64_e32 vcc, v[5:6], v[0:1]
-; GISEL-NEXT:    v_or_b32_e32 v7, 0x80, v2
-; GISEL-NEXT:    v_mov_b32_e32 v8, 0
-; GISEL-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
-; GISEL-NEXT:    s_and_saveexec_b64 s[6:7], vcc
-; GISEL-NEXT:    s_xor_b64 s[16:17], exec, s[6:7]
-; GISEL-NEXT:    s_cbranch_execz .LBB7_4
-; GISEL-NEXT:  ; %bb.3: ; %fp-to-i-if-else
-; GISEL-NEXT:    v_add_u32_e32 v6, 0xffffff7a, v5
-; GISEL-NEXT:    v_lshlrev_b64 v[0:1], v6, v[7:8]
-; GISEL-NEXT:    v_add_u32_e32 v4, 0xffffff3a, v5
-; GISEL-NEXT:    v_sub_u32_e32 v2, 64, v6
-; GISEL-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v6
-; GISEL-NEXT:    v_lshl_or_b32 v11, v11, 16, v11
-; GISEL-NEXT:    v_lshrrev_b64 v[2:3], v2, v[7:8]
-; GISEL-NEXT:    v_lshlrev_b64 v[4:5], v4, v[7:8]
-; GISEL-NEXT:    v_cndmask_b32_e32 v8, 0, v0, vcc
-; GISEL-NEXT:    v_cndmask_b32_e32 v12, 0, v1, vcc
-; GISEL-NEXT:    v_mad_u64_u32 v[0:1], s[6:7], v8, v11, 0
-; GISEL-NEXT:    v_cmp_eq_u32_e64 s[6:7], 0, v6
-; GISEL-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
-; GISEL-NEXT:    v_mad_u64_u32 v[6:7], s[8:9], v12, v10, v[0:1]
-; GISEL-NEXT:    v_cndmask_b32_e64 v13, v2, 0, s[6:7]
-; GISEL-NEXT:    v_mad_u64_u32 v[0:1], s[8:9], v8, v9, 0
-; GISEL-NEXT:    v_mad_u64_u32 v[6:7], s[8:9], v13, v9, v[6:7]
-; GISEL-NEXT:    v_mul_lo_u32 v4, v12, v11
-; GISEL-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
-; GISEL-NEXT:    v_mov_b32_e32 v2, v6
-; GISEL-NEXT:    v_mad_u64_u32 v[1:2], s[8:9], v8, v10, v[1:2]
-; GISEL-NEXT:    v_mul_lo_u32 v6, v8, v11
-; GISEL-NEXT:    v_cndmask_b32_e64 v3, v3, 0, s[6:7]
-; GISEL-NEXT:    v_mad_u64_u32 v[1:2], s[10:11], v12, v9, v[1:2]
-; GISEL-NEXT:    v_addc_co_u32_e64 v6, s[10:11], v7, v6, s[10:11]
-; GISEL-NEXT:    v_addc_co_u32_e64 v4, s[8:9], v6, v4, s[8:9]
-; GISEL-NEXT:    v_mad_u64_u32 v[6:7], s[8:9], v13, v10, v[4:5]
-; GISEL-NEXT:    ; implicit-def: $vgpr5
-; GISEL-NEXT:    v_mad_u64_u32 v[3:4], s[6:7], v3, v9, v[6:7]
-; GISEL-NEXT:    ; implicit-def: $vgpr7_vgpr8
-; GISEL-NEXT:    ; implicit-def: $vgpr9
-; GISEL-NEXT:  .LBB7_4: ; %Flow
-; GISEL-NEXT:    s_andn2_saveexec_b64 s[6:7], s[16:17]
-; GISEL-NEXT:    s_cbranch_execz .LBB7_6
-; GISEL-NEXT:  ; %bb.5: ; %fp-to-i-if-then12
-; GISEL-NEXT:    v_sub_co_u32_e32 v2, vcc, 0x86, v5
-; GISEL-NEXT:    v_lshrrev_b64 v[0:1], v2, v[7:8]
-; GISEL-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v2
-; GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
-; GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
-; GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v7, vcc
-; GISEL-NEXT:    v_mul_hi_i32_i24_e32 v1, v0, v9
-; GISEL-NEXT:    v_ashrrev_i32_e32 v2, 31, v1
-; GISEL-NEXT:    v_mul_i32_i24_e32 v0, v0, v9
-; GISEL-NEXT:    v_mov_b32_e32 v3, v2
-; GISEL-NEXT:  .LBB7_6: ; %Flow1
-; GISEL-NEXT:    s_or_b64 exec, exec, s[6:7]
-; GISEL-NEXT:  .LBB7_7: ; %Flow2
-; GISEL-NEXT:    s_andn2_saveexec_b64 s[6:7], s[14:15]
-; GISEL-NEXT:    s_cbranch_execz .LBB7_9
-; GISEL-NEXT:  ; %bb.8: ; %fp-to-i-if-then5
-; GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, -1, s[4:5]
-; GISEL-NEXT:    v_and_b32_e32 v1, 1, v1
-; GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
-; GISEL-NEXT:    v_lshlrev_b32_e32 v2, 1, v1
-; GISEL-NEXT:    v_or_b32_e32 v0, v0, v2
-; GISEL-NEXT:    v_lshlrev_b32_e32 v3, 2, v1
-; GISEL-NEXT:    v_lshlrev_b32_e32 v4, 3, v1
-; GISEL-NEXT:    v_or_b32_e32 v2, v1, v2
-; GISEL-NEXT:    v_or3_b32 v0, v0, v3, v4
-; GISEL-NEXT:    v_lshlrev_b32_e32 v5, 4, v1
-; GISEL-NEXT:    v_lshlrev_b32_e32 v6, 5, v1
-; GISEL-NEXT:    v_or3_b32 v2, v2, v3, v4
-; GISEL-NEXT:    v_or3_b32 v0, v0, v5, v6
-; GISEL-NEXT:    v_lshlrev_b32_e32 v7, 6, v1
-; GISEL-NEXT:    v_lshlrev_b32_e32 v8, 7, v1
-; GISEL-NEXT:    v_or3_b32 v2, v2, v5, v6
-; GISEL-NEXT:    v_or3_b32 v0, v0, v7, v8
-; GISEL-NEXT:    v_lshlrev_b32_e32 v9, 8, v1
-; GISEL-NEXT:    v_lshlrev_b32_e32 v10, 9, v1
-; GISEL-NEXT:    v_or3_b32 v2, v2, v7, v8
-; GISEL-NEXT:    v_or3_b32 v0, v0, v9, v10
-; GISEL-NEXT:    v_lshlrev_b32_e32 v11, 10, v1
-; GISEL-NEXT:    v_lshlrev_b32_e32 v12, 11, v1
-; GISEL-NEXT:    v_or3_b32 v2, v2, v9, v10
-; GISEL-NEXT:    v_or3_b32 v0, v0, v11, v12
-; GISEL-NEXT:    v_lshlrev_b32_e32 v13, 12, v1
-; GISEL-NEXT:    v_lshlrev_b32_e32 v14, 13, v1
-; GISEL-NEXT:    v_or3_b32 v2, v2, v11, v12
-; GISEL-NEXT:    v_or3_b32 v0, v0, v13, v14
-; GISEL-NEXT:    v_lshlrev_b32_e32 v15, 14, v1
-; GISEL-NEXT:    v_lshlrev_b32_e32 v16, 15, v1
-; GISEL-NEXT:    v_or3_b32 v2, v2, v13, v14
-; GISEL-NEXT:    v_or3_b32 v0, v0, v15, v16
-; GISEL-NEXT:    v_lshlrev_b32_e32 v17, 16, v1
-; GISEL-NEXT:    v_lshlrev_b32_e32 v18, 17, v1
-; GISEL-NEXT:    v_or3_b32 v2, v2, v15, v16
-; GISEL-NEXT:    v_or3_b32 v0, v0, v17, v18
-; GISEL-NEXT:    v_lshlrev_b32_e32 v19, 18, v1
-; GISEL-NEXT:    v_lshlrev_b32_e32 v3, 19, v1
-; GISEL-NEXT:    v_or3_b32 v2, v2, v17, v18
-; GISEL-NEXT:    v_or3_b32 v0, v0, v19, v3
-; GISEL-NEXT:    v_lshlrev_b32_e32 v4, 20, v1
-; GISEL-NEXT:    v_lshlrev_b32_e32 v5, 21, v1
-; GISEL-NEXT:    v_or3_b32 v2, v2, v19, v3
-; GISEL-NEXT:    v_or3_b32 v0, v0, v4, v5
-; GISEL-NEXT:    v_lshlrev_b32_e32 v6, 22, v1
-; GISEL-NEXT:    v_lshlrev_b32_e32 v7, 23, v1
-; GISEL-NEXT:    v_or3_b32 v2, v2, v4, v5
-; GISEL-NEXT:    v_or3_b32 v0, v0, v6, v7
-; GISEL-NEXT:    v_lshlrev_b32_e32 v8, 24, v1
-; GISEL-NEXT:    v_lshlrev_b32_e32 v9, 25, v1
-; GISEL-NEXT:    v_or3_b32 v2, v2, v6, v7
-; GISEL-NEXT:    v_or3_b32 v0, v0, v8, v9
-; GISEL-NEXT:    v_lshlrev_b32_e32 v10, 26, v1
-; GISEL-NEXT:    v_lshlrev_b32_e32 v11, 27, v1
-; GISEL-NEXT:    v_or3_b32 v2, v2, v8, v9
-; GISEL-NEXT:    v_or3_b32 v0, v0, v10, v11
-; GISEL-NEXT:    v_lshlrev_b32_e32 v12, 28, v1
-; GISEL-NEXT:    v_lshlrev_b32_e32 v13, 29, v1
-; GISEL-NEXT:    v_or3_b32 v2, v2, v10, v11
-; GISEL-NEXT:    v_or3_b32 v0, v0, v12, v13
-; GISEL-NEXT:    v_lshlrev_b32_e32 v14, 30, v1
-; GISEL-NEXT:    v_lshlrev_b32_e32 v1, 31, v1
-; GISEL-NEXT:    v_or3_b32 v2, v2, v12, v13
-; GISEL-NEXT:    v_or3_b32 v0, v0, v14, v1
-; GISEL-NEXT:    v_or3_b32 v1, v2, v14, v1
-; GISEL-NEXT:    v_add_u32_e32 v3, 0x80000000, v1
-; GISEL-NEXT:    v_mov_b32_e32 v2, v1
-; GISEL-NEXT:  .LBB7_9: ; %Flow3
-; GISEL-NEXT:    s_or_b64 exec, exec, s[6:7]
-; GISEL-NEXT:  .LBB7_10: ; %fp-to-i-cleanup
-; GISEL-NEXT:    s_or_b64 exec, exec, s[12:13]
-; GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GCN-LABEL: fptoui_bf16_to_i128:
+; GCN:       ; %bb.0: ; %fp-to-i-entry
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_mov_b32_e32 v4, v0
+; GCN-NEXT:    v_bfe_u32 v5, v4, 7, 8
+; GCN-NEXT:    s_movk_i32 s4, 0x7e
+; GCN-NEXT:    v_mov_b32_e32 v0, 0
+; GCN-NEXT:    v_mov_b32_e32 v2, 0
+; GCN-NEXT:    v_mov_b32_e32 v6, 0
+; GCN-NEXT:    v_mov_b32_e32 v1, 0
+; GCN-NEXT:    v_mov_b32_e32 v3, 0
+; GCN-NEXT:    v_cmp_lt_u32_e32 vcc, s4, v5
+; GCN-NEXT:    s_and_saveexec_b64 s[8:9], vcc
+; GCN-NEXT:    s_cbranch_execz .LBB7_10
+; GCN-NEXT:  ; %bb.1: ; %fp-to-i-if-end
+; GCN-NEXT:    v_add_co_u32_e32 v0, vcc, 0xffffff01, v5
+; GCN-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v6, vcc
+; GCN-NEXT:    v_addc_co_u32_e32 v2, vcc, -1, v6, vcc
+; GCN-NEXT:    s_movk_i32 s6, 0xff7f
+; GCN-NEXT:    v_addc_co_u32_e32 v3, vcc, -1, v6, vcc
+; GCN-NEXT:    s_mov_b32 s7, -1
+; GCN-NEXT:    v_cmp_eq_u64_e64 s[4:5], -1, v[2:3]
+; GCN-NEXT:    v_cmp_lt_u64_e64 s[6:7], s[6:7], v[0:1]
+; GCN-NEXT:    v_cmp_lt_i16_e32 vcc, -1, v4
+; GCN-NEXT:    s_and_b64 s[4:5], s[4:5], s[6:7]
+; GCN-NEXT:    ; implicit-def: $vgpr0_vgpr1
+; GCN-NEXT:    ; implicit-def: $vgpr2_vgpr3
+; GCN-NEXT:    s_and_saveexec_b64 s[6:7], s[4:5]
+; GCN-NEXT:    s_xor_b64 s[10:11], exec, s[6:7]
+; GCN-NEXT:    s_cbranch_execz .LBB7_7
+; GCN-NEXT:  ; %bb.2: ; %fp-to-i-if-end9
+; GCN-NEXT:    s_movk_i32 s4, 0x7f
+; GCN-NEXT:    v_and_b32_sdwa v0, v4, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GCN-NEXT:    s_mov_b64 s[4:5], 0x85
+; GCN-NEXT:    v_cmp_lt_u64_e64 s[4:5], s[4:5], v[5:6]
+; GCN-NEXT:    v_mov_b32_e32 v7, 0
+; GCN-NEXT:    v_cndmask_b32_e64 v9, -1, 0, vcc
+; GCN-NEXT:    v_cndmask_b32_e64 v8, -1, 1, vcc
+; GCN-NEXT:    v_or_b32_e32 v6, 0x80, v0
+; GCN-NEXT:    ; implicit-def: $vgpr0_vgpr1
+; GCN-NEXT:    ; implicit-def: $vgpr2_vgpr3
+; GCN-NEXT:    s_and_saveexec_b64 s[6:7], s[4:5]
+; GCN-NEXT:    s_xor_b64 s[12:13], exec, s[6:7]
+; GCN-NEXT:    s_cbranch_execz .LBB7_4
+; GCN-NEXT:  ; %bb.3: ; %fp-to-i-if-else
+; GCN-NEXT:    v_sub_u32_e32 v0, 0xc6, v5
+; GCN-NEXT:    v_add_u32_e32 v2, 0xffffff3a, v5
+; GCN-NEXT:    v_add_u32_e32 v4, 0xffffff7a, v5
+; GCN-NEXT:    v_lshrrev_b64 v[0:1], v0, v[6:7]
+; GCN-NEXT:    v_lshlrev_b64 v[2:3], v2, v[6:7]
+; GCN-NEXT:    v_cmp_gt_u32_e64 s[4:5], 64, v4
+; GCN-NEXT:    v_cndmask_b32_e64 v1, v3, v1, s[4:5]
+; GCN-NEXT:    v_cmp_ne_u32_e64 s[6:7], 0, v4
+; GCN-NEXT:    v_cndmask_b32_e64 v3, 0, v1, s[6:7]
+; GCN-NEXT:    v_cndmask_b32_e64 v2, v2, v0, s[4:5]
+; GCN-NEXT:    v_lshlrev_b64 v[0:1], v4, v[6:7]
+; GCN-NEXT:    v_cndmask_b32_e64 v2, 0, v2, s[6:7]
+; GCN-NEXT:    v_cndmask_b32_e64 v12, 0, v0, s[4:5]
+; GCN-NEXT:    v_cndmask_b32_e64 v11, 0, v1, s[4:5]
+; GCN-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v12, v8, 0
+; GCN-NEXT:    v_mul_lo_u32 v13, v9, v2
+; GCN-NEXT:    v_mul_lo_u32 v14, v8, v3
+; GCN-NEXT:    v_mov_b32_e32 v6, v1
+; GCN-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v11, v8, v[6:7]
+; GCN-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v8, v2, 0
+; GCN-NEXT:    v_mov_b32_e32 v8, v5
+; GCN-NEXT:    v_mov_b32_e32 v5, v7
+; GCN-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v12, v9, v[4:5]
+; GCN-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
+; GCN-NEXT:    v_add_co_u32_e64 v6, s[4:5], -1, v10
+; GCN-NEXT:    v_add3_u32 v3, v3, v14, v13
+; GCN-NEXT:    v_mad_u64_u32 v[1:2], s[4:5], v6, v12, v[2:3]
+; GCN-NEXT:    v_add_co_u32_e64 v5, s[4:5], v8, v5
+; GCN-NEXT:    v_mul_lo_u32 v3, v6, v11
+; GCN-NEXT:    v_mul_lo_u32 v7, v6, v12
+; GCN-NEXT:    v_addc_co_u32_e64 v6, s[4:5], 0, 0, s[4:5]
+; GCN-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], v11, v9, v[5:6]
+; GCN-NEXT:    v_add3_u32 v3, v7, v2, v3
+; GCN-NEXT:    ; implicit-def: $vgpr8
+; GCN-NEXT:    v_add_co_u32_e64 v2, s[4:5], v5, v1
+; GCN-NEXT:    v_addc_co_u32_e64 v3, s[4:5], v6, v3, s[4:5]
+; GCN-NEXT:    ; implicit-def: $vgpr5_vgpr6
+; GCN-NEXT:    v_mov_b32_e32 v1, v4
+; GCN-NEXT:    ; implicit-def: $vgpr6_vgpr7
+; GCN-NEXT:  .LBB7_4: ; %Flow
+; GCN-NEXT:    s_andn2_saveexec_b64 s[6:7], s[12:13]
+; GCN-NEXT:    s_cbranch_execz .LBB7_6
+; GCN-NEXT:  ; %bb.5: ; %fp-to-i-if-then12
+; GCN-NEXT:    v_sub_u32_e32 v2, 0x86, v5
+; GCN-NEXT:    v_lshrrev_b64 v[0:1], v2, v[6:7]
+; GCN-NEXT:    v_cmp_gt_u32_e64 s[4:5], 64, v2
+; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, v0, s[4:5]
+; GCN-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v2
+; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, v6, s[4:5]
+; GCN-NEXT:    v_mul_hi_i32_i24_e32 v1, v0, v8
+; GCN-NEXT:    v_ashrrev_i32_e32 v2, 31, v1
+; GCN-NEXT:    v_mul_i32_i24_e32 v0, v0, v8
+; GCN-NEXT:    v_mov_b32_e32 v3, v2
+; GCN-NEXT:  .LBB7_6: ; %Flow1
+; GCN-NEXT:    s_or_b64 exec, exec, s[6:7]
+; GCN-NEXT:  .LBB7_7: ; %Flow2
+; GCN-NEXT:    s_andn2_saveexec_b64 s[4:5], s[10:11]
+; GCN-NEXT:  ; %bb.8: ; %fp-to-i-if-then5
+; GCN-NEXT:    v_bfrev_b32_e32 v0, 1
+; GCN-NEXT:    v_bfrev_b32_e32 v1, -2
+; GCN-NEXT:    v_cndmask_b32_e64 v2, 0, -1, vcc
+; GCN-NEXT:    v_cndmask_b32_e32 v3, v0, v1, vcc
+; GCN-NEXT:    v_mov_b32_e32 v0, v2
+; GCN-NEXT:    v_mov_b32_e32 v1, v2
+; GCN-NEXT:  ; %bb.9: ; %Flow3
+; GCN-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GCN-NEXT:  .LBB7_10: ; %fp-to-i-cleanup
+; GCN-NEXT:    s_or_b64 exec, exec, s[8:9]
+; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cvt = fptoui bfloat %x to i128
   ret i128 %cvt
 }
diff --git a/llvm/test/CodeGen/AMDGPU/isel-amdgpu-cs-chain-cc.ll b/llvm/test/CodeGen/AMDGPU/isel-amdgpu-cs-chain-cc.ll
index acd48a64dea1f1..befe0d405307be 100644
--- a/llvm/test/CodeGen/AMDGPU/isel-amdgpu-cs-chain-cc.ll
+++ b/llvm/test/CodeGen/AMDGPU/isel-amdgpu-cs-chain-cc.ll
@@ -1,8 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=GISEL-GFX11 %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=GISEL-GFX11 %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize32 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=GISEL-GFX10 %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=GISEL-GFX10 %s
+; RUN: llc -global-isel=1 -global-isel-abort=2 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=GISEL-GFX11 %s
+; RUN: llc -global-isel=1 -global-isel-abort=2 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=GISEL-GFX11 %s
+; RUN: llc -global-isel=1 -global-isel-abort=2 -mtriple=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize32 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=GISEL-GFX10 %s
+; RUN: llc -global-isel=1 -global-isel-abort=2 -mtriple=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=GISEL-GFX10 %s
 ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=DAGISEL-GFX11 %s
 ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=DAGISEL-GFX11 %s
 ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize32 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=DAGISEL-GFX10 %s
@@ -594,35 +594,35 @@ define amdgpu_cs_chain void @amdgpu_cs_chain_cc_half(half inreg %a, half %b) {
 
 define amdgpu_cs_chain void @amdgpu_cs_chain_cc_bfloat(bfloat inreg %a, bfloat %b) {
   ; GISEL-GFX11-LABEL: name: amdgpu_cs_chain_cc_bfloat
-  ; GISEL-GFX11: bb.1 (%ir-block.0):
+  ; GISEL-GFX11: bb.0 (%ir-block.0):
   ; GISEL-GFX11-NEXT:   liveins: $sgpr0, $vgpr8
   ; GISEL-GFX11-NEXT: {{  $}}
-  ; GISEL-GFX11-NEXT:   [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
-  ; GISEL-GFX11-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr8
-  ; GISEL-GFX11-NEXT:   ADJCALLSTACKUP 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32
-  ; GISEL-GFX11-NEXT:   $vgpr0 = COPY [[COPY]]
-  ; GISEL-GFX11-NEXT:   $vgpr1 = COPY [[COPY1]]
-  ; GISEL-GFX11-NEXT:   [[SI_PC_ADD_REL_OFFSET:%[0-9]+]]:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @use, target-flags(amdgpu-gotprel32-hi) @use, implicit-def $scc
-  ; GISEL-GFX11-NEXT:   [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[SI_PC_ADD_REL_OFFSET]], 0, 0 :: (dereferenceable invariant load (p0) from got, addrspace 4)
-  ; GISEL-GFX11-NEXT:   $sgpr30_sgpr31 = noconvergent SI_CALL [[S_LOAD_DWORDX2_IMM]], @use, csr_amdgpu_si_gfx, implicit $vgpr0, implicit $vgpr1
-  ; GISEL-GFX11-NEXT:   ADJCALLSTACKDOWN 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32
+  ; GISEL-GFX11-NEXT:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr8
+  ; GISEL-GFX11-NEXT:   [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+  ; GISEL-GFX11-NEXT:   ADJCALLSTACKUP 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
+  ; GISEL-GFX11-NEXT:   [[SI_PC_ADD_REL_OFFSET:%[0-9]+]]:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @use, target-flags(amdgpu-gotprel32-hi) @use, implicit-def dead $scc
+  ; GISEL-GFX11-NEXT:   [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM killed [[SI_PC_ADD_REL_OFFSET]], 0, 0 :: (dereferenceable invariant load (s64) from got, addrspace 4)
+  ; GISEL-GFX11-NEXT:   $vgpr0 = COPY [[COPY1]]
+  ; GISEL-GFX11-NEXT:   $vgpr1 = COPY [[COPY]]
+  ; GISEL-GFX11-NEXT:   $sgpr30_sgpr31 = SI_CALL killed [[S_LOAD_DWORDX2_IMM]], @use, csr_amdgpu_si_gfx, implicit $vgpr0, implicit $vgpr1
+  ; GISEL-GFX11-NEXT:   ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
   ; GISEL-GFX11-NEXT:   S_ENDPGM 0
   ;
   ; GISEL-GFX10-LABEL: name: amdgpu_cs_chain_cc_bfloat
-  ; GISEL-GFX10: bb.1 (%ir-block.0):
+  ; GISEL-GFX10: bb.0 (%ir-block.0):
   ; GISEL-GFX10-NEXT:   liveins: $sgpr0, $vgpr8
   ; GISEL-GFX10-NEXT: {{  $}}
-  ; GISEL-GFX10-NEXT:   [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
-  ; GISEL-GFX10-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr8
-  ; GISEL-GFX10-NEXT:   ADJCALLSTACKUP 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32
-  ; GISEL-GFX10-NEXT:   $vgpr0 = COPY [[COPY]]
-  ; GISEL-GFX10-NEXT:   $vgpr1 = COPY [[COPY1]]
+  ; GISEL-GFX10-NEXT:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr8
+  ; GISEL-GFX10-NEXT:   [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+  ; GISEL-GFX10-NEXT:   ADJCALLSTACKUP 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
+  ; GISEL-GFX10-NEXT:   [[SI_PC_ADD_REL_OFFSET:%[0-9]+]]:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @use, target-flags(amdgpu-gotprel32-hi) @use, implicit-def dead $scc
+  ; GISEL-GFX10-NEXT:   [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM killed [[SI_PC_ADD_REL_OFFSET]], 0, 0 :: (dereferenceable invariant load (s64) from got, addrspace 4)
   ; GISEL-GFX10-NEXT:   [[COPY2:%[0-9]+]]:sgpr_128 = COPY $sgpr48_sgpr49_sgpr50_sgpr51
   ; GISEL-GFX10-NEXT:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY2]]
-  ; GISEL-GFX10-NEXT:   [[SI_PC_ADD_REL_OFFSET:%[0-9]+]]:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @use, target-flags(amdgpu-gotprel32-hi) @use, implicit-def $scc
-  ; GISEL-GFX10-NEXT:   [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[SI_PC_ADD_REL_OFFSET]], 0, 0 :: (dereferenceable invariant load (p0) from got, addrspace 4)
-  ; GISEL-GFX10-NEXT:   $sgpr30_sgpr31 = noconvergent SI_CALL [[S_LOAD_DWORDX2_IMM]], @use, csr_amdgpu_si_gfx, implicit $vgpr0, implicit $vgpr1, implicit $sgpr0_sgpr1_sgpr2_sgpr3
-  ; GISEL-GFX10-NEXT:   ADJCALLSTACKDOWN 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32
+  ; GISEL-GFX10-NEXT:   $vgpr0 = COPY [[COPY1]]
+  ; GISEL-GFX10-NEXT:   $vgpr1 = COPY [[COPY]]
+  ; GISEL-GFX10-NEXT:   $sgpr30_sgpr31 = SI_CALL killed [[S_LOAD_DWORDX2_IMM]], @use, csr_amdgpu_si_gfx, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $vgpr0, implicit $vgpr1
+  ; GISEL-GFX10-NEXT:   ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
   ; GISEL-GFX10-NEXT:   S_ENDPGM 0
   ;
   ; DAGISEL-GFX11-LABEL: name: amdgpu_cs_chain_cc_bfloat
diff --git a/llvm/test/CodeGen/AMDGPU/isel-amdgpu-cs-chain-preserve-cc.ll b/llvm/test/CodeGen/AMDGPU/isel-amdgpu-cs-chain-preserve-cc.ll
index 2e2a1094ba99ae..ef91f36d60373f 100644
--- a/llvm/test/CodeGen/AMDGPU/isel-amdgpu-cs-chain-preserve-cc.ll
+++ b/llvm/test/CodeGen/AMDGPU/isel-amdgpu-cs-chain-preserve-cc.ll
@@ -1,8 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=GISEL-GFX11 %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=GISEL-GFX11 %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize32 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=GISEL-GFX10 %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=GISEL-GFX10 %s
+; RUN: llc -global-isel=1 -global-isel-abort=2 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=GISEL-GFX11 %s
+; RUN: llc -global-isel=1 -global-isel-abort=2 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=GISEL-GFX11 %s
+; RUN: llc -global-isel=1 -global-isel-abort=2 -mtriple=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize32 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=GISEL-GFX10 %s
+; RUN: llc -global-isel=1 -global-isel-abort=2 -mtriple=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=GISEL-GFX10 %s
 ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=DAGISEL-GFX11-WF32 %s
 ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=DAGISEL-GFX11-WF64 %s
 ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize32 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=DAGISEL-GFX10-WF32 %s
@@ -873,32 +873,6 @@ define amdgpu_cs_chain_preserve void @amdgpu_cs_chain_preserve_cc_half(half inre
 }
 
 define amdgpu_cs_chain_preserve void @amdgpu_cs_chain_cc_bfloat(bfloat inreg %a, bfloat %b) {
-  ; GISEL-GFX11-LABEL: name: amdgpu_cs_chain_cc_bfloat
-  ; GISEL-GFX11: bb.1 (%ir-block.0):
-  ; GISEL-GFX11-NEXT:   liveins: $sgpr0, $vgpr8
-  ; GISEL-GFX11-NEXT: {{  $}}
-  ; GISEL-GFX11-NEXT:   [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
-  ; GISEL-GFX11-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr8
-  ; GISEL-GFX11-NEXT:   [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
-  ; GISEL-GFX11-NEXT:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]]
-  ; GISEL-GFX11-NEXT:   [[V_ADD_F16_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F16_fake16_e64 0, [[COPY2]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec
-  ; GISEL-GFX11-NEXT:   [[COPY3:%[0-9]+]]:vreg_64 = COPY [[DEF]]
-  ; GISEL-GFX11-NEXT:   FLAT_STORE_SHORT [[COPY3]], [[V_ADD_F16_fake16_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s16) into `ptr poison`)
-  ; GISEL-GFX11-NEXT:   S_ENDPGM 0
-  ;
-  ; GISEL-GFX10-LABEL: name: amdgpu_cs_chain_cc_bfloat
-  ; GISEL-GFX10: bb.1 (%ir-block.0):
-  ; GISEL-GFX10-NEXT:   liveins: $sgpr0, $vgpr8
-  ; GISEL-GFX10-NEXT: {{  $}}
-  ; GISEL-GFX10-NEXT:   [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
-  ; GISEL-GFX10-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr8
-  ; GISEL-GFX10-NEXT:   [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
-  ; GISEL-GFX10-NEXT:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]]
-  ; GISEL-GFX10-NEXT:   [[V_ADD_F16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F16_e64 0, [[COPY2]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec
-  ; GISEL-GFX10-NEXT:   [[COPY3:%[0-9]+]]:vreg_64 = COPY [[DEF]]
-  ; GISEL-GFX10-NEXT:   FLAT_STORE_SHORT [[COPY3]], [[V_ADD_F16_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s16) into `ptr poison`)
-  ; GISEL-GFX10-NEXT:   S_ENDPGM 0
-  ;
   ; DAGISEL-GFX11-WF32-LABEL: name: amdgpu_cs_chain_cc_bfloat
   ; DAGISEL-GFX11-WF32: bb.0 (%ir-block.0):
   ; DAGISEL-GFX11-WF32-NEXT:   liveins: $sgpr0, $vgpr8
@@ -996,9 +970,9 @@ define amdgpu_cs_chain_preserve void @amdgpu_cs_chain_preserve_cc_i16(i16 inreg
   ; GISEL-GFX11-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr8
   ; GISEL-GFX11-NEXT:   [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
   ; GISEL-GFX11-NEXT:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]]
-  ; GISEL-GFX11-NEXT:   [[V_ADD_NC_U16_e64_:%[0-9]+]]:vgpr_32 = V_ADD_NC_U16_fake16_e64 0, [[COPY2]], 0, [[COPY1]], 0, 0, implicit $exec
+  ; GISEL-GFX11-NEXT:   [[V_ADD_NC_U16_fake16_e64_:%[0-9]+]]:vgpr_32 = V_ADD_NC_U16_fake16_e64 0, [[COPY2]], 0, [[COPY1]], 0, 0, implicit $exec
   ; GISEL-GFX11-NEXT:   [[COPY3:%[0-9]+]]:vreg_64 = COPY [[DEF]]
-  ; GISEL-GFX11-NEXT:   FLAT_STORE_SHORT [[COPY3]], [[V_ADD_NC_U16_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s16) into `ptr poison`)
+  ; GISEL-GFX11-NEXT:   FLAT_STORE_SHORT [[COPY3]], [[V_ADD_NC_U16_fake16_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s16) into `ptr poison`)
   ; GISEL-GFX11-NEXT:   S_ENDPGM 0
   ;
   ; GISEL-GFX10-LABEL: name: amdgpu_cs_chain_preserve_cc_i16
@@ -1020,10 +994,10 @@ define amdgpu_cs_chain_preserve void @amdgpu_cs_chain_preserve_cc_i16(i16 inreg
   ; DAGISEL-GFX11-WF32-NEXT: {{  $}}
   ; DAGISEL-GFX11-WF32-NEXT:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr8
   ; DAGISEL-GFX11-WF32-NEXT:   [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-  ; DAGISEL-GFX11-WF32-NEXT:   [[V_ADD_NC_U16_e64_:%[0-9]+]]:vgpr_32 = V_ADD_NC_U16_fake16_e64 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $exec
+  ; DAGISEL-GFX11-WF32-NEXT:   [[V_ADD_NC_U16_fake16_e64_:%[0-9]+]]:vgpr_32 = V_ADD_NC_U16_fake16_e64 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $exec
   ; DAGISEL-GFX11-WF32-NEXT:   [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
   ; DAGISEL-GFX11-WF32-NEXT:   [[COPY2:%[0-9]+]]:vreg_64 = COPY [[DEF]]
-  ; DAGISEL-GFX11-WF32-NEXT:   FLAT_STORE_SHORT killed [[COPY2]], killed [[V_ADD_NC_U16_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s16) into `ptr poison`)
+  ; DAGISEL-GFX11-WF32-NEXT:   FLAT_STORE_SHORT killed [[COPY2]], killed [[V_ADD_NC_U16_fake16_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s16) into `ptr poison`)
   ; DAGISEL-GFX11-WF32-NEXT:   S_ENDPGM 0
   ;
   ; DAGISEL-GFX11-WF64-LABEL: name: amdgpu_cs_chain_preserve_cc_i16
@@ -1032,10 +1006,10 @@ define amdgpu_cs_chain_preserve void @amdgpu_cs_chain_preserve_cc_i16(i16 inreg
   ; DAGISEL-GFX11-WF64-NEXT: {{  $}}
   ; DAGISEL-GFX11-WF64-NEXT:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr8
   ; DAGISEL-GFX11-WF64-NEXT:   [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-  ; DAGISEL-GFX11-WF64-NEXT:   [[V_ADD_NC_U16_e64_:%[0-9]+]]:vgpr_32 = V_ADD_NC_U16_fake16_e64 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $exec
+  ; DAGISEL-GFX11-WF64-NEXT:   [[V_ADD_NC_U16_fake16_e64_:%[0-9]+]]:vgpr_32 = V_ADD_NC_U16_fake16_e64 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $exec
   ; DAGISEL-GFX11-WF64-NEXT:   [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
   ; DAGISEL-GFX11-WF64-NEXT:   [[COPY2:%[0-9]+]]:vreg_64 = COPY [[DEF]]
-  ; DAGISEL-GFX11-WF64-NEXT:   FLAT_STORE_SHORT killed [[COPY2]], killed [[V_ADD_NC_U16_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s16) into `ptr poison`)
+  ; DAGISEL-GFX11-WF64-NEXT:   FLAT_STORE_SHORT killed [[COPY2]], killed [[V_ADD_NC_U16_fake16_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s16) into `ptr poison`)
   ; DAGISEL-GFX11-WF64-NEXT:   S_ENDPGM 0
   ;
   ; DAGISEL-GFX10-WF32-LABEL: name: amdgpu_cs_chain_preserve_cc_i16
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.pk.gfx950.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.pk.gfx950.ll
index 046a72b9307d09..a0ba97d3b639ca 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.pk.gfx950.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.pk.gfx950.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
 ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx950 -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX950-SDAG %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx950 -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX950-GISEL %s
+; RUN: llc -global-isel=1 -global-isel-abort=2 -mtriple=amdgcn -mcpu=gfx950 -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX950-GISEL %s
 
 declare <6 x i32> @llvm.amdgcn.cvt.scalef32.2xpk16.bf6.f32(<16 x float> %src0, <16 x float> %src1, float %scale)
 declare <6 x i32> @llvm.amdgcn.cvt.scalef32.2xpk16.fp6.f32(<16 x float> %src0, <16 x float> %src1, float %scale)
@@ -983,85 +983,35 @@ define <32 x half> @test_cvt_scalef32_pk32_f16_fp6_sl(<6 x i32> inreg %src) {
 }
 
 define <32 x bfloat> @test_cvt_scalef32_pk32_bf16_fp6_vv(<6 x i32> %src, float %scale) {
-; GFX950-SDAG-LABEL: test_cvt_scalef32_pk32_bf16_fp6_vv:
-; GFX950-SDAG:       ; %bb.0:
-; GFX950-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-SDAG-NEXT:    v_mov_b32_e32 v22, v6
-; GFX950-SDAG-NEXT:    v_mov_b32_e32 v21, v5
-; GFX950-SDAG-NEXT:    v_mov_b32_e32 v20, v4
-; GFX950-SDAG-NEXT:    v_mov_b32_e32 v19, v3
-; GFX950-SDAG-NEXT:    v_mov_b32_e32 v18, v2
-; GFX950-SDAG-NEXT:    v_mov_b32_e32 v17, v1
-; GFX950-SDAG-NEXT:    v_mov_b32_e32 v16, v0
-; GFX950-SDAG-NEXT:    v_cvt_scalef32_pk32_bf16_fp6 v[0:15], v[16:21], v22
-; GFX950-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX950-GISEL-LABEL: test_cvt_scalef32_pk32_bf16_fp6_vv:
-; GFX950-GISEL:       ; %bb.0:
-; GFX950-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-GISEL-NEXT:    v_cvt_scalef32_pk32_bf16_fp6 v[16:31], v[0:5], v6
-; GFX950-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 16, v16
-; GFX950-GISEL-NEXT:    v_lshrrev_b32_e32 v3, 16, v17
-; GFX950-GISEL-NEXT:    v_lshrrev_b32_e32 v5, 16, v18
-; GFX950-GISEL-NEXT:    v_lshrrev_b32_e32 v7, 16, v19
-; GFX950-GISEL-NEXT:    v_lshrrev_b32_e32 v9, 16, v20
-; GFX950-GISEL-NEXT:    v_lshrrev_b32_e32 v11, 16, v21
-; GFX950-GISEL-NEXT:    v_lshrrev_b32_e32 v13, 16, v22
-; GFX950-GISEL-NEXT:    v_lshrrev_b32_e32 v15, 16, v23
-; GFX950-GISEL-NEXT:    v_mov_b32_e32 v0, v16
-; GFX950-GISEL-NEXT:    v_mov_b32_e32 v2, v17
-; GFX950-GISEL-NEXT:    v_mov_b32_e32 v4, v18
-; GFX950-GISEL-NEXT:    v_mov_b32_e32 v6, v19
-; GFX950-GISEL-NEXT:    v_mov_b32_e32 v8, v20
-; GFX950-GISEL-NEXT:    v_mov_b32_e32 v10, v21
-; GFX950-GISEL-NEXT:    v_mov_b32_e32 v12, v22
-; GFX950-GISEL-NEXT:    v_mov_b32_e32 v14, v23
-; GFX950-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GCN-LABEL: test_cvt_scalef32_pk32_bf16_fp6_vv:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_mov_b32_e32 v22, v6
+; GCN-NEXT:    v_mov_b32_e32 v21, v5
+; GCN-NEXT:    v_mov_b32_e32 v20, v4
+; GCN-NEXT:    v_mov_b32_e32 v19, v3
+; GCN-NEXT:    v_mov_b32_e32 v18, v2
+; GCN-NEXT:    v_mov_b32_e32 v17, v1
+; GCN-NEXT:    v_mov_b32_e32 v16, v0
+; GCN-NEXT:    v_cvt_scalef32_pk32_bf16_fp6 v[0:15], v[16:21], v22
+; GCN-NEXT:    s_setpc_b64 s[30:31]
   %ret = tail call <32 x bfloat> @llvm.amdgcn.cvt.scalef32.pk32.bf16.fp6(<6 x i32> %src, float %scale)
   ret <32 x bfloat> %ret
 }
 
 define <32 x bfloat> @test_cvt_scalef32_pk32_bf16_fp6_sl(<6 x i32> inreg %src) {
-; GFX950-SDAG-LABEL: test_cvt_scalef32_pk32_bf16_fp6_sl:
-; GFX950-SDAG:       ; %bb.0:
-; GFX950-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-SDAG-NEXT:    v_mov_b32_e32 v16, s0
-; GFX950-SDAG-NEXT:    v_mov_b32_e32 v17, s1
-; GFX950-SDAG-NEXT:    v_mov_b32_e32 v18, s2
-; GFX950-SDAG-NEXT:    v_mov_b32_e32 v19, s3
-; GFX950-SDAG-NEXT:    v_mov_b32_e32 v20, s16
-; GFX950-SDAG-NEXT:    v_mov_b32_e32 v21, s17
-; GFX950-SDAG-NEXT:    s_mov_b32 s0, 0x42c80000
-; GFX950-SDAG-NEXT:    v_cvt_scalef32_pk32_bf16_fp6 v[0:15], v[16:21], s0
-; GFX950-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX950-GISEL-LABEL: test_cvt_scalef32_pk32_bf16_fp6_sl:
-; GFX950-GISEL:       ; %bb.0:
-; GFX950-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-GISEL-NEXT:    s_mov_b32 s4, s16
-; GFX950-GISEL-NEXT:    s_mov_b32 s5, s17
-; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
-; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[4:5]
-; GFX950-GISEL-NEXT:    v_mov_b32_e32 v6, 0x42c80000
-; GFX950-GISEL-NEXT:    v_cvt_scalef32_pk32_bf16_fp6 v[16:31], v[0:5], v6
-; GFX950-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 16, v16
-; GFX950-GISEL-NEXT:    v_lshrrev_b32_e32 v3, 16, v17
-; GFX950-GISEL-NEXT:    v_lshrrev_b32_e32 v5, 16, v18
-; GFX950-GISEL-NEXT:    v_lshrrev_b32_e32 v7, 16, v19
-; GFX950-GISEL-NEXT:    v_lshrrev_b32_e32 v9, 16, v20
-; GFX950-GISEL-NEXT:    v_lshrrev_b32_e32 v11, 16, v21
-; GFX950-GISEL-NEXT:    v_lshrrev_b32_e32 v13, 16, v22
-; GFX950-GISEL-NEXT:    v_lshrrev_b32_e32 v15, 16, v23
-; GFX950-GISEL-NEXT:    v_mov_b32_e32 v0, v16
-; GFX950-GISEL-NEXT:    v_mov_b32_e32 v2, v17
-; GFX950-GISEL-NEXT:    v_mov_b32_e32 v4, v18
-; GFX950-GISEL-NEXT:    v_mov_b32_e32 v6, v19
-; GFX950-GISEL-NEXT:    v_mov_b32_e32 v8, v20
-; GFX950-GISEL-NEXT:    v_mov_b32_e32 v10, v21
-; GFX950-GISEL-NEXT:    v_mov_b32_e32 v12, v22
-; GFX950-GISEL-NEXT:    v_mov_b32_e32 v14, v23
-; GFX950-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GCN-LABEL: test_cvt_scalef32_pk32_bf16_fp6_sl:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_mov_b32_e32 v16, s0
+; GCN-NEXT:    v_mov_b32_e32 v17, s1
+; GCN-NEXT:    v_mov_b32_e32 v18, s2
+; GCN-NEXT:    v_mov_b32_e32 v19, s3
+; GCN-NEXT:    v_mov_b32_e32 v20, s16
+; GCN-NEXT:    v_mov_b32_e32 v21, s17
+; GCN-NEXT:    s_mov_b32 s0, 0x42c80000
+; GCN-NEXT:    v_cvt_scalef32_pk32_bf16_fp6 v[0:15], v[16:21], s0
+; GCN-NEXT:    s_setpc_b64 s[30:31]
   %ret = tail call <32 x bfloat> @llvm.amdgcn.cvt.scalef32.pk32.bf16.fp6(<6 x i32> %src, float 100.0)
   ret <32 x bfloat> %ret
 }
@@ -1126,85 +1076,35 @@ define <32 x half> @test_cvt_scalef32_pk32_f16_bf6_sl(<6 x i32> inreg %src) {
 }
 
 define <32 x bfloat> @test_cvt_scalef32_pk32_bf16_bf6_vv(<6 x i32> %src, float %scale) {
-; GFX950-SDAG-LABEL: test_cvt_scalef32_pk32_bf16_bf6_vv:
-; GFX950-SDAG:       ; %bb.0:
-; GFX950-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-SDAG-NEXT:    v_mov_b32_e32 v22, v6
-; GFX950-SDAG-NEXT:    v_mov_b32_e32 v21, v5
-; GFX950-SDAG-NEXT:    v_mov_b32_e32 v20, v4
-; GFX950-SDAG-NEXT:    v_mov_b32_e32 v19, v3
-; GFX950-SDAG-NEXT:    v_mov_b32_e32 v18, v2
-; GFX950-SDAG-NEXT:    v_mov_b32_e32 v17, v1
-; GFX950-SDAG-NEXT:    v_mov_b32_e32 v16, v0
-; GFX950-SDAG-NEXT:    v_cvt_scalef32_pk32_bf16_bf6 v[0:15], v[16:21], v22
-; GFX950-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX950-GISEL-LABEL: test_cvt_scalef32_pk32_bf16_bf6_vv:
-; GFX950-GISEL:       ; %bb.0:
-; GFX950-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-GISEL-NEXT:    v_cvt_scalef32_pk32_bf16_bf6 v[16:31], v[0:5], v6
-; GFX950-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 16, v16
-; GFX950-GISEL-NEXT:    v_lshrrev_b32_e32 v3, 16, v17
-; GFX950-GISEL-NEXT:    v_lshrrev_b32_e32 v5, 16, v18
-; GFX950-GISEL-NEXT:    v_lshrrev_b32_e32 v7, 16, v19
-; GFX950-GISEL-NEXT:    v_lshrrev_b32_e32 v9, 16, v20
-; GFX950-GISEL-NEXT:    v_lshrrev_b32_e32 v11, 16, v21
-; GFX950-GISEL-NEXT:    v_lshrrev_b32_e32 v13, 16, v22
-; GFX950-GISEL-NEXT:    v_lshrrev_b32_e32 v15, 16, v23
-; GFX950-GISEL-NEXT:    v_mov_b32_e32 v0, v16
-; GFX950-GISEL-NEXT:    v_mov_b32_e32 v2, v17
-; GFX950-GISEL-NEXT:    v_mov_b32_e32 v4, v18
-; GFX950-GISEL-NEXT:    v_mov_b32_e32 v6, v19
-; GFX950-GISEL-NEXT:    v_mov_b32_e32 v8, v20
-; GFX950-GISEL-NEXT:    v_mov_b32_e32 v10, v21
-; GFX950-GISEL-NEXT:    v_mov_b32_e32 v12, v22
-; GFX950-GISEL-NEXT:    v_mov_b32_e32 v14, v23
-; GFX950-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GCN-LABEL: test_cvt_scalef32_pk32_bf16_bf6_vv:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_mov_b32_e32 v22, v6
+; GCN-NEXT:    v_mov_b32_e32 v21, v5
+; GCN-NEXT:    v_mov_b32_e32 v20, v4
+; GCN-NEXT:    v_mov_b32_e32 v19, v3
+; GCN-NEXT:    v_mov_b32_e32 v18, v2
+; GCN-NEXT:    v_mov_b32_e32 v17, v1
+; GCN-NEXT:    v_mov_b32_e32 v16, v0
+; GCN-NEXT:    v_cvt_scalef32_pk32_bf16_bf6 v[0:15], v[16:21], v22
+; GCN-NEXT:    s_setpc_b64 s[30:31]
   %ret = tail call <32 x bfloat> @llvm.amdgcn.cvt.scalef32.pk32.bf16.bf6(<6 x i32> %src, float %scale)
   ret <32 x bfloat> %ret
 }
 
 define <32 x bfloat> @test_cvt_scalef32_pk32_bf16_bf6_sl(<6 x i32> inreg %src) {
-; GFX950-SDAG-LABEL: test_cvt_scalef32_pk32_bf16_bf6_sl:
-; GFX950-SDAG:       ; %bb.0:
-; GFX950-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-SDAG-NEXT:    v_mov_b32_e32 v16, s0
-; GFX950-SDAG-NEXT:    v_mov_b32_e32 v17, s1
-; GFX950-SDAG-NEXT:    v_mov_b32_e32 v18, s2
-; GFX950-SDAG-NEXT:    v_mov_b32_e32 v19, s3
-; GFX950-SDAG-NEXT:    v_mov_b32_e32 v20, s16
-; GFX950-SDAG-NEXT:    v_mov_b32_e32 v21, s17
-; GFX950-SDAG-NEXT:    s_mov_b32 s0, 0x42c80000
-; GFX950-SDAG-NEXT:    v_cvt_scalef32_pk32_bf16_bf6 v[0:15], v[16:21], s0
-; GFX950-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX950-GISEL-LABEL: test_cvt_scalef32_pk32_bf16_bf6_sl:
-; GFX950-GISEL:       ; %bb.0:
-; GFX950-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-GISEL-NEXT:    s_mov_b32 s4, s16
-; GFX950-GISEL-NEXT:    s_mov_b32 s5, s17
-; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
-; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[4:5]
-; GFX950-GISEL-NEXT:    v_mov_b32_e32 v6, 0x42c80000
-; GFX950-GISEL-NEXT:    v_cvt_scalef32_pk32_bf16_bf6 v[16:31], v[0:5], v6
-; GFX950-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 16, v16
-; GFX950-GISEL-NEXT:    v_lshrrev_b32_e32 v3, 16, v17
-; GFX950-GISEL-NEXT:    v_lshrrev_b32_e32 v5, 16, v18
-; GFX950-GISEL-NEXT:    v_lshrrev_b32_e32 v7, 16, v19
-; GFX950-GISEL-NEXT:    v_lshrrev_b32_e32 v9, 16, v20
-; GFX950-GISEL-NEXT:    v_lshrrev_b32_e32 v11, 16, v21
-; GFX950-GISEL-NEXT:    v_lshrrev_b32_e32 v13, 16, v22
-; GFX950-GISEL-NEXT:    v_lshrrev_b32_e32 v15, 16, v23
-; GFX950-GISEL-NEXT:    v_mov_b32_e32 v0, v16
-; GFX950-GISEL-NEXT:    v_mov_b32_e32 v2, v17
-; GFX950-GISEL-NEXT:    v_mov_b32_e32 v4, v18
-; GFX950-GISEL-NEXT:    v_mov_b32_e32 v6, v19
-; GFX950-GISEL-NEXT:    v_mov_b32_e32 v8, v20
-; GFX950-GISEL-NEXT:    v_mov_b32_e32 v10, v21
-; GFX950-GISEL-NEXT:    v_mov_b32_e32 v12, v22
-; GFX950-GISEL-NEXT:    v_mov_b32_e32 v14, v23
-; GFX950-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GCN-LABEL: test_cvt_scalef32_pk32_bf16_bf6_sl:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_mov_b32_e32 v16, s0
+; GCN-NEXT:    v_mov_b32_e32 v17, s1
+; GCN-NEXT:    v_mov_b32_e32 v18, s2
+; GCN-NEXT:    v_mov_b32_e32 v19, s3
+; GCN-NEXT:    v_mov_b32_e32 v20, s16
+; GCN-NEXT:    v_mov_b32_e32 v21, s17
+; GCN-NEXT:    s_mov_b32 s0, 0x42c80000
+; GCN-NEXT:    v_cvt_scalef32_pk32_bf16_bf6 v[0:15], v[16:21], s0
+; GCN-NEXT:    s_setpc_b64 s[30:31]
   %ret = tail call <32 x bfloat> @llvm.amdgcn.cvt.scalef32.pk32.bf16.bf6(<6 x i32> %src, float 100.0)
   ret <32 x bfloat> %ret
 }
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.pk.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.pk.ll
index f9fd7e253b1243..517c87193598d5 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.pk.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.pk.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
 ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx950 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX950-SDAG %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx950 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX950-GISEL %s
+; RUN: llc -global-isel=1 -global-isel-abort=2 -mtriple=amdgcn -mcpu=gfx950 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX950-GISEL %s
 
 declare <6 x i32> @llvm.amdgcn.cvt.scalef32.pk32.bf6.bf16(<32 x bfloat> %src, float %scale)
 declare <6 x i32> @llvm.amdgcn.cvt.scalef32.pk32.bf6.f16(<32 x half> %src, float %scale)
@@ -19,44 +19,11 @@ define amdgpu_ps void @test_scalef32_pk32_bf6_bf16_vv(<32 x bfloat> %src, float
 ;
 ; GFX950-GISEL-LABEL: test_scalef32_pk32_bf6_bf16_vv:
 ; GFX950-GISEL:       ; %bb.0:
-; GFX950-GISEL-NEXT:    v_mov_b32_e32 v24, v17
 ; GFX950-GISEL-NEXT:    v_mov_b32_e32 v25, v18
-; GFX950-GISEL-NEXT:    v_lshrrev_b32_e32 v17, 16, v0
-; GFX950-GISEL-NEXT:    v_lshrrev_b32_e32 v18, 16, v1
-; GFX950-GISEL-NEXT:    v_lshrrev_b32_e32 v19, 16, v2
-; GFX950-GISEL-NEXT:    v_lshrrev_b32_e32 v20, 16, v3
-; GFX950-GISEL-NEXT:    v_lshrrev_b32_e32 v21, 16, v4
-; GFX950-GISEL-NEXT:    v_lshrrev_b32_e32 v22, 16, v5
-; GFX950-GISEL-NEXT:    v_lshrrev_b32_e32 v23, 16, v6
-; GFX950-GISEL-NEXT:    v_lshrrev_b32_e32 v26, 16, v7
-; GFX950-GISEL-NEXT:    v_lshrrev_b32_e32 v27, 16, v8
-; GFX950-GISEL-NEXT:    v_lshrrev_b32_e32 v28, 16, v9
-; GFX950-GISEL-NEXT:    v_lshrrev_b32_e32 v29, 16, v10
-; GFX950-GISEL-NEXT:    v_lshrrev_b32_e32 v30, 16, v11
-; GFX950-GISEL-NEXT:    v_lshrrev_b32_e32 v31, 16, v12
-; GFX950-GISEL-NEXT:    v_lshrrev_b32_e32 v32, 16, v13
-; GFX950-GISEL-NEXT:    v_lshrrev_b32_e32 v33, 16, v14
-; GFX950-GISEL-NEXT:    v_lshrrev_b32_e32 v34, 16, v15
-; GFX950-GISEL-NEXT:    v_mov_b32_sdwa v0, v17 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX950-GISEL-NEXT:    v_mov_b32_sdwa v1, v18 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX950-GISEL-NEXT:    v_mov_b32_sdwa v2, v19 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX950-GISEL-NEXT:    v_mov_b32_sdwa v3, v20 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX950-GISEL-NEXT:    v_mov_b32_sdwa v4, v21 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX950-GISEL-NEXT:    v_mov_b32_sdwa v5, v22 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX950-GISEL-NEXT:    v_mov_b32_sdwa v6, v23 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX950-GISEL-NEXT:    v_mov_b32_sdwa v7, v26 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX950-GISEL-NEXT:    v_mov_b32_sdwa v8, v27 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX950-GISEL-NEXT:    v_mov_b32_sdwa v9, v28 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX950-GISEL-NEXT:    v_mov_b32_sdwa v10, v29 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX950-GISEL-NEXT:    v_mov_b32_sdwa v11, v30 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX950-GISEL-NEXT:    v_mov_b32_sdwa v12, v31 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX950-GISEL-NEXT:    v_mov_b32_sdwa v13, v32 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX950-GISEL-NEXT:    v_mov_b32_sdwa v14, v33 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX950-GISEL-NEXT:    v_mov_b32_sdwa v15, v34 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX950-GISEL-NEXT:    s_nop 0
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v24, v17
 ; GFX950-GISEL-NEXT:    v_cvt_scalef32_pk32_bf6_bf16 v[18:23], v[0:15], v16
-; GFX950-GISEL-NEXT:    global_store_dwordx4 v[24:25], v[18:21], off
 ; GFX950-GISEL-NEXT:    global_store_dwordx2 v[24:25], v[22:23], off offset:16
+; GFX950-GISEL-NEXT:    global_store_dwordx4 v[24:25], v[18:21], off
 ; GFX950-GISEL-NEXT:    s_endpgm
   %cvt = tail call <6 x i32> @llvm.amdgcn.cvt.scalef32.pk32.bf6.bf16(<32 x bfloat> %src, float %scale)
   store <6 x i32> %cvt, ptr addrspace(1) %out, align 8
@@ -90,82 +57,26 @@ define amdgpu_ps void @test_scalef32_pk32_bf6_bf16_sl(<32 x bfloat> inreg %src,
 ;
 ; GFX950-GISEL-LABEL: test_scalef32_pk32_bf6_bf16_sl:
 ; GFX950-GISEL:       ; %bb.0:
-; GFX950-GISEL-NEXT:    s_lshr_b32 s16, s0, 16
-; GFX950-GISEL-NEXT:    s_lshr_b32 s17, s1, 16
-; GFX950-GISEL-NEXT:    s_lshl_b32 s16, s16, 16
-; GFX950-GISEL-NEXT:    s_and_b32 s0, s0, 0xffff
-; GFX950-GISEL-NEXT:    s_lshr_b32 s18, s2, 16
-; GFX950-GISEL-NEXT:    s_or_b32 s0, s16, s0
-; GFX950-GISEL-NEXT:    s_lshl_b32 s16, s17, 16
-; GFX950-GISEL-NEXT:    s_and_b32 s1, s1, 0xffff
-; GFX950-GISEL-NEXT:    s_lshr_b32 s19, s3, 16
-; GFX950-GISEL-NEXT:    s_or_b32 s1, s16, s1
-; GFX950-GISEL-NEXT:    s_lshl_b32 s16, s18, 16
-; GFX950-GISEL-NEXT:    s_and_b32 s2, s2, 0xffff
-; GFX950-GISEL-NEXT:    s_lshr_b32 s20, s4, 16
-; GFX950-GISEL-NEXT:    s_or_b32 s2, s16, s2
-; GFX950-GISEL-NEXT:    s_lshl_b32 s16, s19, 16
-; GFX950-GISEL-NEXT:    s_and_b32 s3, s3, 0xffff
-; GFX950-GISEL-NEXT:    s_lshr_b32 s21, s5, 16
-; GFX950-GISEL-NEXT:    s_or_b32 s3, s16, s3
-; GFX950-GISEL-NEXT:    s_lshl_b32 s16, s20, 16
-; GFX950-GISEL-NEXT:    s_and_b32 s4, s4, 0xffff
-; GFX950-GISEL-NEXT:    s_lshr_b32 s22, s6, 16
-; GFX950-GISEL-NEXT:    s_or_b32 s4, s16, s4
-; GFX950-GISEL-NEXT:    s_lshl_b32 s16, s21, 16
-; GFX950-GISEL-NEXT:    s_and_b32 s5, s5, 0xffff
-; GFX950-GISEL-NEXT:    s_lshr_b32 s23, s7, 16
-; GFX950-GISEL-NEXT:    s_or_b32 s5, s16, s5
-; GFX950-GISEL-NEXT:    s_lshl_b32 s16, s22, 16
-; GFX950-GISEL-NEXT:    s_and_b32 s6, s6, 0xffff
-; GFX950-GISEL-NEXT:    s_lshr_b32 s24, s8, 16
-; GFX950-GISEL-NEXT:    s_or_b32 s6, s16, s6
-; GFX950-GISEL-NEXT:    s_lshl_b32 s16, s23, 16
-; GFX950-GISEL-NEXT:    s_and_b32 s7, s7, 0xffff
-; GFX950-GISEL-NEXT:    s_lshr_b32 s25, s9, 16
-; GFX950-GISEL-NEXT:    s_or_b32 s7, s16, s7
-; GFX950-GISEL-NEXT:    s_lshl_b32 s16, s24, 16
-; GFX950-GISEL-NEXT:    s_and_b32 s8, s8, 0xffff
-; GFX950-GISEL-NEXT:    s_lshr_b32 s26, s10, 16
-; GFX950-GISEL-NEXT:    s_or_b32 s8, s16, s8
-; GFX950-GISEL-NEXT:    s_lshl_b32 s16, s25, 16
-; GFX950-GISEL-NEXT:    s_and_b32 s9, s9, 0xffff
-; GFX950-GISEL-NEXT:    s_lshr_b32 s27, s11, 16
-; GFX950-GISEL-NEXT:    s_or_b32 s9, s16, s9
-; GFX950-GISEL-NEXT:    s_lshl_b32 s16, s26, 16
-; GFX950-GISEL-NEXT:    s_and_b32 s10, s10, 0xffff
-; GFX950-GISEL-NEXT:    s_lshr_b32 s28, s12, 16
-; GFX950-GISEL-NEXT:    s_or_b32 s10, s16, s10
-; GFX950-GISEL-NEXT:    s_lshl_b32 s16, s27, 16
-; GFX950-GISEL-NEXT:    s_and_b32 s11, s11, 0xffff
-; GFX950-GISEL-NEXT:    s_lshr_b32 s29, s13, 16
-; GFX950-GISEL-NEXT:    s_or_b32 s11, s16, s11
-; GFX950-GISEL-NEXT:    s_lshl_b32 s16, s28, 16
-; GFX950-GISEL-NEXT:    s_and_b32 s12, s12, 0xffff
-; GFX950-GISEL-NEXT:    s_lshr_b32 s30, s14, 16
-; GFX950-GISEL-NEXT:    s_or_b32 s12, s16, s12
-; GFX950-GISEL-NEXT:    s_lshl_b32 s16, s29, 16
-; GFX950-GISEL-NEXT:    s_and_b32 s13, s13, 0xffff
-; GFX950-GISEL-NEXT:    s_lshr_b32 s31, s15, 16
-; GFX950-GISEL-NEXT:    s_or_b32 s13, s16, s13
-; GFX950-GISEL-NEXT:    s_lshl_b32 s16, s30, 16
-; GFX950-GISEL-NEXT:    s_and_b32 s14, s14, 0xffff
-; GFX950-GISEL-NEXT:    s_or_b32 s14, s16, s14
-; GFX950-GISEL-NEXT:    s_lshl_b32 s16, s31, 16
-; GFX950-GISEL-NEXT:    s_and_b32 s15, s15, 0xffff
-; GFX950-GISEL-NEXT:    s_or_b32 s15, s16, s15
-; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[16:17], s[14:15]
-; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[14:15], s[12:13]
-; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[12:13], s[10:11]
-; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[10:11], s[8:9]
-; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[8:9], s[6:7]
-; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[6:7], s[4:5]
-; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[2:3]
-; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[0:1]
-; GFX950-GISEL-NEXT:    v_mov_b32_e32 v24, 0x42c80000
-; GFX950-GISEL-NEXT:    v_cvt_scalef32_pk32_bf6_bf16 v[18:23], v[2:17], v24
-; GFX950-GISEL-NEXT:    global_store_dwordx4 v[0:1], v[18:21], off
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v2, s0
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v3, s1
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v4, s2
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v5, s3
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v6, s4
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v7, s5
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v8, s6
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v9, s7
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v10, s8
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v11, s9
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v12, s10
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v13, s11
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v14, s12
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v15, s13
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v16, s14
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v17, s15
+; GFX950-GISEL-NEXT:    s_mov_b32 s0, 0x42c80000
+; GFX950-GISEL-NEXT:    v_cvt_scalef32_pk32_bf6_bf16 v[18:23], v[2:17], s0
 ; GFX950-GISEL-NEXT:    global_store_dwordx2 v[0:1], v[22:23], off offset:16
+; GFX950-GISEL-NEXT:    global_store_dwordx4 v[0:1], v[18:21], off
 ; GFX950-GISEL-NEXT:    s_endpgm
   %cvt = tail call <6 x i32> @llvm.amdgcn.cvt.scalef32.pk32.bf6.bf16(<32 x bfloat> %src, float 100.0)
   store <6 x i32> %cvt, ptr addrspace(1) %out, align 8
@@ -252,44 +163,11 @@ define amdgpu_ps void @test_scalef32_pk32_fp6_bf16_vv(<32 x bfloat> %src, float
 ;
 ; GFX950-GISEL-LABEL: test_scalef32_pk32_fp6_bf16_vv:
 ; GFX950-GISEL:       ; %bb.0:
-; GFX950-GISEL-NEXT:    v_mov_b32_e32 v24, v17
 ; GFX950-GISEL-NEXT:    v_mov_b32_e32 v25, v18
-; GFX950-GISEL-NEXT:    v_lshrrev_b32_e32 v17, 16, v0
-; GFX950-GISEL-NEXT:    v_lshrrev_b32_e32 v18, 16, v1
-; GFX950-GISEL-NEXT:    v_lshrrev_b32_e32 v19, 16, v2
-; GFX950-GISEL-NEXT:    v_lshrrev_b32_e32 v20, 16, v3
-; GFX950-GISEL-NEXT:    v_lshrrev_b32_e32 v21, 16, v4
-; GFX950-GISEL-NEXT:    v_lshrrev_b32_e32 v22, 16, v5
-; GFX950-GISEL-NEXT:    v_lshrrev_b32_e32 v23, 16, v6
-; GFX950-GISEL-NEXT:    v_lshrrev_b32_e32 v26, 16, v7
-; GFX950-GISEL-NEXT:    v_lshrrev_b32_e32 v27, 16, v8
-; GFX950-GISEL-NEXT:    v_lshrrev_b32_e32 v28, 16, v9
-; GFX950-GISEL-NEXT:    v_lshrrev_b32_e32 v29, 16, v10
-; GFX950-GISEL-NEXT:    v_lshrrev_b32_e32 v30, 16, v11
-; GFX950-GISEL-NEXT:    v_lshrrev_b32_e32 v31, 16, v12
-; GFX950-GISEL-NEXT:    v_lshrrev_b32_e32 v32, 16, v13
-; GFX950-GISEL-NEXT:    v_lshrrev_b32_e32 v33, 16, v14
-; GFX950-GISEL-NEXT:    v_lshrrev_b32_e32 v34, 16, v15
-; GFX950-GISEL-NEXT:    v_mov_b32_sdwa v0, v17 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX950-GISEL-NEXT:    v_mov_b32_sdwa v1, v18 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX950-GISEL-NEXT:    v_mov_b32_sdwa v2, v19 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX950-GISEL-NEXT:    v_mov_b32_sdwa v3, v20 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX950-GISEL-NEXT:    v_mov_b32_sdwa v4, v21 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX950-GISEL-NEXT:    v_mov_b32_sdwa v5, v22 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX950-GISEL-NEXT:    v_mov_b32_sdwa v6, v23 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX950-GISEL-NEXT:    v_mov_b32_sdwa v7, v26 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX950-GISEL-NEXT:    v_mov_b32_sdwa v8, v27 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX950-GISEL-NEXT:    v_mov_b32_sdwa v9, v28 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX950-GISEL-NEXT:    v_mov_b32_sdwa v10, v29 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX950-GISEL-NEXT:    v_mov_b32_sdwa v11, v30 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX950-GISEL-NEXT:    v_mov_b32_sdwa v12, v31 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX950-GISEL-NEXT:    v_mov_b32_sdwa v13, v32 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX950-GISEL-NEXT:    v_mov_b32_sdwa v14, v33 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX950-GISEL-NEXT:    v_mov_b32_sdwa v15, v34 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX950-GISEL-NEXT:    s_nop 0
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v24, v17
 ; GFX950-GISEL-NEXT:    v_cvt_scalef32_pk32_fp6_bf16 v[18:23], v[0:15], v16
-; GFX950-GISEL-NEXT:    global_store_dwordx4 v[24:25], v[18:21], off
 ; GFX950-GISEL-NEXT:    global_store_dwordx2 v[24:25], v[22:23], off offset:16
+; GFX950-GISEL-NEXT:    global_store_dwordx4 v[24:25], v[18:21], off
 ; GFX950-GISEL-NEXT:    s_endpgm
   %cvt = tail call <6 x i32> @llvm.amdgcn.cvt.scalef32.pk32.fp6.bf16(<32 x bfloat> %src, float %scale)
   store <6 x i32> %cvt, ptr addrspace(1) %out, align 8
@@ -323,82 +201,26 @@ define amdgpu_ps void @test_scalef32_pk32_fp6_bf16_sl(<32 x bfloat> inreg %src,
 ;
 ; GFX950-GISEL-LABEL: test_scalef32_pk32_fp6_bf16_sl:
 ; GFX950-GISEL:       ; %bb.0:
-; GFX950-GISEL-NEXT:    s_lshr_b32 s16, s0, 16
-; GFX950-GISEL-NEXT:    s_lshr_b32 s17, s1, 16
-; GFX950-GISEL-NEXT:    s_lshl_b32 s16, s16, 16
-; GFX950-GISEL-NEXT:    s_and_b32 s0, s0, 0xffff
-; GFX950-GISEL-NEXT:    s_lshr_b32 s18, s2, 16
-; GFX950-GISEL-NEXT:    s_or_b32 s0, s16, s0
-; GFX950-GISEL-NEXT:    s_lshl_b32 s16, s17, 16
-; GFX950-GISEL-NEXT:    s_and_b32 s1, s1, 0xffff
-; GFX950-GISEL-NEXT:    s_lshr_b32 s19, s3, 16
-; GFX950-GISEL-NEXT:    s_or_b32 s1, s16, s1
-; GFX950-GISEL-NEXT:    s_lshl_b32 s16, s18, 16
-; GFX950-GISEL-NEXT:    s_and_b32 s2, s2, 0xffff
-; GFX950-GISEL-NEXT:    s_lshr_b32 s20, s4, 16
-; GFX950-GISEL-NEXT:    s_or_b32 s2, s16, s2
-; GFX950-GISEL-NEXT:    s_lshl_b32 s16, s19, 16
-; GFX950-GISEL-NEXT:    s_and_b32 s3, s3, 0xffff
-; GFX950-GISEL-NEXT:    s_lshr_b32 s21, s5, 16
-; GFX950-GISEL-NEXT:    s_or_b32 s3, s16, s3
-; GFX950-GISEL-NEXT:    s_lshl_b32 s16, s20, 16
-; GFX950-GISEL-NEXT:    s_and_b32 s4, s4, 0xffff
-; GFX950-GISEL-NEXT:    s_lshr_b32 s22, s6, 16
-; GFX950-GISEL-NEXT:    s_or_b32 s4, s16, s4
-; GFX950-GISEL-NEXT:    s_lshl_b32 s16, s21, 16
-; GFX950-GISEL-NEXT:    s_and_b32 s5, s5, 0xffff
-; GFX950-GISEL-NEXT:    s_lshr_b32 s23, s7, 16
-; GFX950-GISEL-NEXT:    s_or_b32 s5, s16, s5
-; GFX950-GISEL-NEXT:    s_lshl_b32 s16, s22, 16
-; GFX950-GISEL-NEXT:    s_and_b32 s6, s6, 0xffff
-; GFX950-GISEL-NEXT:    s_lshr_b32 s24, s8, 16
-; GFX950-GISEL-NEXT:    s_or_b32 s6, s16, s6
-; GFX950-GISEL-NEXT:    s_lshl_b32 s16, s23, 16
-; GFX950-GISEL-NEXT:    s_and_b32 s7, s7, 0xffff
-; GFX950-GISEL-NEXT:    s_lshr_b32 s25, s9, 16
-; GFX950-GISEL-NEXT:    s_or_b32 s7, s16, s7
-; GFX950-GISEL-NEXT:    s_lshl_b32 s16, s24, 16
-; GFX950-GISEL-NEXT:    s_and_b32 s8, s8, 0xffff
-; GFX950-GISEL-NEXT:    s_lshr_b32 s26, s10, 16
-; GFX950-GISEL-NEXT:    s_or_b32 s8, s16, s8
-; GFX950-GISEL-NEXT:    s_lshl_b32 s16, s25, 16
-; GFX950-GISEL-NEXT:    s_and_b32 s9, s9, 0xffff
-; GFX950-GISEL-NEXT:    s_lshr_b32 s27, s11, 16
-; GFX950-GISEL-NEXT:    s_or_b32 s9, s16, s9
-; GFX950-GISEL-NEXT:    s_lshl_b32 s16, s26, 16
-; GFX950-GISEL-NEXT:    s_and_b32 s10, s10, 0xffff
-; GFX950-GISEL-NEXT:    s_lshr_b32 s28, s12, 16
-; GFX950-GISEL-NEXT:    s_or_b32 s10, s16, s10
-; GFX950-GISEL-NEXT:    s_lshl_b32 s16, s27, 16
-; GFX950-GISEL-NEXT:    s_and_b32 s11, s11, 0xffff
-; GFX950-GISEL-NEXT:    s_lshr_b32 s29, s13, 16
-; GFX950-GISEL-NEXT:    s_or_b32 s11, s16, s11
-; GFX950-GISEL-NEXT:    s_lshl_b32 s16, s28, 16
-; GFX950-GISEL-NEXT:    s_and_b32 s12, s12, 0xffff
-; GFX950-GISEL-NEXT:    s_lshr_b32 s30, s14, 16
-; GFX950-GISEL-NEXT:    s_or_b32 s12, s16, s12
-; GFX950-GISEL-NEXT:    s_lshl_b32 s16, s29, 16
-; GFX950-GISEL-NEXT:    s_and_b32 s13, s13, 0xffff
-; GFX950-GISEL-NEXT:    s_lshr_b32 s31, s15, 16
-; GFX950-GISEL-NEXT:    s_or_b32 s13, s16, s13
-; GFX950-GISEL-NEXT:    s_lshl_b32 s16, s30, 16
-; GFX950-GISEL-NEXT:    s_and_b32 s14, s14, 0xffff
-; GFX950-GISEL-NEXT:    s_or_b32 s14, s16, s14
-; GFX950-GISEL-NEXT:    s_lshl_b32 s16, s31, 16
-; GFX950-GISEL-NEXT:    s_and_b32 s15, s15, 0xffff
-; GFX950-GISEL-NEXT:    s_or_b32 s15, s16, s15
-; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[16:17], s[14:15]
-; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[14:15], s[12:13]
-; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[12:13], s[10:11]
-; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[10:11], s[8:9]
-; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[8:9], s[6:7]
-; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[6:7], s[4:5]
-; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[2:3]
-; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[0:1]
-; GFX950-GISEL-NEXT:    v_mov_b32_e32 v24, 0x42c80000
-; GFX950-GISEL-NEXT:    v_cvt_scalef32_pk32_fp6_bf16 v[18:23], v[2:17], v24
-; GFX950-GISEL-NEXT:    global_store_dwordx4 v[0:1], v[18:21], off
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v2, s0
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v3, s1
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v4, s2
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v5, s3
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v6, s4
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v7, s5
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v8, s6
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v9, s7
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v10, s8
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v11, s9
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v12, s10
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v13, s11
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v14, s12
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v15, s13
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v16, s14
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v17, s15
+; GFX950-GISEL-NEXT:    s_mov_b32 s0, 0x42c80000
+; GFX950-GISEL-NEXT:    v_cvt_scalef32_pk32_fp6_bf16 v[18:23], v[2:17], s0
 ; GFX950-GISEL-NEXT:    global_store_dwordx2 v[0:1], v[22:23], off offset:16
+; GFX950-GISEL-NEXT:    global_store_dwordx4 v[0:1], v[18:21], off
 ; GFX950-GISEL-NEXT:    s_endpgm
   %cvt = tail call <6 x i32> @llvm.amdgcn.cvt.scalef32.pk32.fp6.bf16(<32 x bfloat> %src, float 100.0)
   store <6 x i32> %cvt, ptr addrspace(1) %out, align 8
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.sr.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.sr.ll
index e1bf9f0daa1efa..d3851b1a084d68 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.sr.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.sr.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
 ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx950 -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX950 %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx950 -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX950 %s
+; RUN: llc -global-isel=1 -global-isel-abort=2 -mtriple=amdgcn -mcpu=gfx950 -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX950 %s
 
 declare i32 @llvm.amdgcn.cvt.scalef32.sr.bf8.bf16(i32 %old, bfloat %src, i32 %seed, float %scale, i32 %dst_sel)
 declare i32 @llvm.amdgcn.cvt.scalef32.sr.bf8.f16(i32 %old, half %src, i32 %seed, float %scale, i32 %dst_sel)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.sr.pk.gfx950.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.sr.pk.gfx950.ll
index 1107b46f8f6d38..7433f6611cd9bd 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.sr.pk.gfx950.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.sr.pk.gfx950.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
 ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx950 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX950 %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx950 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX950 %s
+; RUN: llc -global-isel=1 -global-isel-abort=2 -mtriple=amdgcn -mcpu=gfx950 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX950 %s
 
 declare i32 @llvm.amdgcn.cvt.scalef32.sr.pk.fp4.f16(i32 %old, <2 x half> %src, i32 %seed, float %scale, i32 %dst_sel)
 declare i32 @llvm.amdgcn.cvt.scalef32.sr.pk.fp4.bf16(i32 %old, <2 x bfloat> %src, i32 %seed, float %scale, i32 %dst_sel)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.sr.pk.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.sr.pk.ll
index 0d4598f316c411..18b20e101a9383 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.sr.pk.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.sr.pk.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
 ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx950 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX950-SDAG %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx950 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX950-GISEL %s
+; RUN: llc -global-isel=1 -global-isel-abort=2 -mtriple=amdgcn -mcpu=gfx950 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX950-GISEL %s
 
 declare <6 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk32.bf6.bf16(<32 x bfloat> %src, i32 %sr, float %scale)
 declare <6 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk32.bf6.f16(<32 x half> %src, i32 %sr, float %scale)
@@ -19,42 +19,9 @@ define amdgpu_ps void @test_scalef32_sr_pk32_bf6_bf16_vv(<32 x bfloat> %src, i32
 ;
 ; GFX950-GISEL-LABEL: test_scalef32_sr_pk32_bf6_bf16_vv:
 ; GFX950-GISEL:       ; %bb.0:
-; GFX950-GISEL-NEXT:    v_lshrrev_b32_e32 v20, 16, v0
-; GFX950-GISEL-NEXT:    v_lshrrev_b32_e32 v21, 16, v1
-; GFX950-GISEL-NEXT:    v_lshrrev_b32_e32 v22, 16, v2
-; GFX950-GISEL-NEXT:    v_lshrrev_b32_e32 v23, 16, v3
-; GFX950-GISEL-NEXT:    v_lshrrev_b32_e32 v24, 16, v4
-; GFX950-GISEL-NEXT:    v_lshrrev_b32_e32 v25, 16, v5
-; GFX950-GISEL-NEXT:    v_lshrrev_b32_e32 v26, 16, v6
-; GFX950-GISEL-NEXT:    v_lshrrev_b32_e32 v27, 16, v7
-; GFX950-GISEL-NEXT:    v_lshrrev_b32_e32 v28, 16, v8
-; GFX950-GISEL-NEXT:    v_lshrrev_b32_e32 v29, 16, v9
-; GFX950-GISEL-NEXT:    v_lshrrev_b32_e32 v30, 16, v10
-; GFX950-GISEL-NEXT:    v_lshrrev_b32_e32 v31, 16, v11
-; GFX950-GISEL-NEXT:    v_lshrrev_b32_e32 v32, 16, v12
-; GFX950-GISEL-NEXT:    v_lshrrev_b32_e32 v33, 16, v13
-; GFX950-GISEL-NEXT:    v_lshrrev_b32_e32 v34, 16, v14
-; GFX950-GISEL-NEXT:    v_lshrrev_b32_e32 v35, 16, v15
-; GFX950-GISEL-NEXT:    v_mov_b32_sdwa v0, v20 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX950-GISEL-NEXT:    v_mov_b32_sdwa v1, v21 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX950-GISEL-NEXT:    v_mov_b32_sdwa v2, v22 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX950-GISEL-NEXT:    v_mov_b32_sdwa v3, v23 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX950-GISEL-NEXT:    v_mov_b32_sdwa v4, v24 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX950-GISEL-NEXT:    v_mov_b32_sdwa v5, v25 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX950-GISEL-NEXT:    v_mov_b32_sdwa v6, v26 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX950-GISEL-NEXT:    v_mov_b32_sdwa v7, v27 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX950-GISEL-NEXT:    v_mov_b32_sdwa v8, v28 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX950-GISEL-NEXT:    v_mov_b32_sdwa v9, v29 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX950-GISEL-NEXT:    v_mov_b32_sdwa v10, v30 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX950-GISEL-NEXT:    v_mov_b32_sdwa v11, v31 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX950-GISEL-NEXT:    v_mov_b32_sdwa v12, v32 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX950-GISEL-NEXT:    v_mov_b32_sdwa v13, v33 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX950-GISEL-NEXT:    v_mov_b32_sdwa v14, v34 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX950-GISEL-NEXT:    v_mov_b32_sdwa v15, v35 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX950-GISEL-NEXT:    s_nop 0
 ; GFX950-GISEL-NEXT:    v_cvt_scalef32_sr_pk32_bf6_bf16 v[20:25], v[0:15], v16, v17
-; GFX950-GISEL-NEXT:    global_store_dwordx4 v[18:19], v[20:23], off
 ; GFX950-GISEL-NEXT:    global_store_dwordx2 v[18:19], v[24:25], off offset:16
+; GFX950-GISEL-NEXT:    global_store_dwordx4 v[18:19], v[20:23], off
 ; GFX950-GISEL-NEXT:    s_endpgm
   %cvt = tail call <6 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk32.bf6.bf16(<32 x bfloat> %src, i32 %sr, float %scale)
   store <6 x i32> %cvt, ptr addrspace(1) %out, align 8
@@ -88,82 +55,26 @@ define amdgpu_ps void @test_scalef32_sr_pk32_bf6_bf16_sl(<32 x bfloat> inreg %sr
 ;
 ; GFX950-GISEL-LABEL: test_scalef32_sr_pk32_bf6_bf16_sl:
 ; GFX950-GISEL:       ; %bb.0:
-; GFX950-GISEL-NEXT:    s_lshr_b32 s17, s0, 16
-; GFX950-GISEL-NEXT:    s_lshr_b32 s18, s1, 16
-; GFX950-GISEL-NEXT:    s_lshl_b32 s17, s17, 16
-; GFX950-GISEL-NEXT:    s_and_b32 s0, s0, 0xffff
-; GFX950-GISEL-NEXT:    s_lshr_b32 s19, s2, 16
-; GFX950-GISEL-NEXT:    s_or_b32 s0, s17, s0
-; GFX950-GISEL-NEXT:    s_lshl_b32 s17, s18, 16
-; GFX950-GISEL-NEXT:    s_and_b32 s1, s1, 0xffff
-; GFX950-GISEL-NEXT:    s_lshr_b32 s20, s3, 16
-; GFX950-GISEL-NEXT:    s_or_b32 s1, s17, s1
-; GFX950-GISEL-NEXT:    s_lshl_b32 s17, s19, 16
-; GFX950-GISEL-NEXT:    s_and_b32 s2, s2, 0xffff
-; GFX950-GISEL-NEXT:    s_lshr_b32 s21, s4, 16
-; GFX950-GISEL-NEXT:    s_or_b32 s2, s17, s2
-; GFX950-GISEL-NEXT:    s_lshl_b32 s17, s20, 16
-; GFX950-GISEL-NEXT:    s_and_b32 s3, s3, 0xffff
-; GFX950-GISEL-NEXT:    s_lshr_b32 s22, s5, 16
-; GFX950-GISEL-NEXT:    s_or_b32 s3, s17, s3
-; GFX950-GISEL-NEXT:    s_lshl_b32 s17, s21, 16
-; GFX950-GISEL-NEXT:    s_and_b32 s4, s4, 0xffff
-; GFX950-GISEL-NEXT:    s_lshr_b32 s23, s6, 16
-; GFX950-GISEL-NEXT:    s_or_b32 s4, s17, s4
-; GFX950-GISEL-NEXT:    s_lshl_b32 s17, s22, 16
-; GFX950-GISEL-NEXT:    s_and_b32 s5, s5, 0xffff
-; GFX950-GISEL-NEXT:    s_lshr_b32 s24, s7, 16
-; GFX950-GISEL-NEXT:    s_or_b32 s5, s17, s5
-; GFX950-GISEL-NEXT:    s_lshl_b32 s17, s23, 16
-; GFX950-GISEL-NEXT:    s_and_b32 s6, s6, 0xffff
-; GFX950-GISEL-NEXT:    s_lshr_b32 s25, s8, 16
-; GFX950-GISEL-NEXT:    s_or_b32 s6, s17, s6
-; GFX950-GISEL-NEXT:    s_lshl_b32 s17, s24, 16
-; GFX950-GISEL-NEXT:    s_and_b32 s7, s7, 0xffff
-; GFX950-GISEL-NEXT:    s_lshr_b32 s26, s9, 16
-; GFX950-GISEL-NEXT:    s_or_b32 s7, s17, s7
-; GFX950-GISEL-NEXT:    s_lshl_b32 s17, s25, 16
-; GFX950-GISEL-NEXT:    s_and_b32 s8, s8, 0xffff
-; GFX950-GISEL-NEXT:    s_lshr_b32 s27, s10, 16
-; GFX950-GISEL-NEXT:    s_or_b32 s8, s17, s8
-; GFX950-GISEL-NEXT:    s_lshl_b32 s17, s26, 16
-; GFX950-GISEL-NEXT:    s_and_b32 s9, s9, 0xffff
-; GFX950-GISEL-NEXT:    s_lshr_b32 s28, s11, 16
-; GFX950-GISEL-NEXT:    s_or_b32 s9, s17, s9
-; GFX950-GISEL-NEXT:    s_lshl_b32 s17, s27, 16
-; GFX950-GISEL-NEXT:    s_and_b32 s10, s10, 0xffff
-; GFX950-GISEL-NEXT:    s_lshr_b32 s29, s12, 16
-; GFX950-GISEL-NEXT:    s_or_b32 s10, s17, s10
-; GFX950-GISEL-NEXT:    s_lshl_b32 s17, s28, 16
-; GFX950-GISEL-NEXT:    s_and_b32 s11, s11, 0xffff
-; GFX950-GISEL-NEXT:    s_lshr_b32 s30, s13, 16
-; GFX950-GISEL-NEXT:    s_or_b32 s11, s17, s11
-; GFX950-GISEL-NEXT:    s_lshl_b32 s17, s29, 16
-; GFX950-GISEL-NEXT:    s_and_b32 s12, s12, 0xffff
-; GFX950-GISEL-NEXT:    s_lshr_b32 s31, s14, 16
-; GFX950-GISEL-NEXT:    s_or_b32 s12, s17, s12
-; GFX950-GISEL-NEXT:    s_lshl_b32 s17, s30, 16
-; GFX950-GISEL-NEXT:    s_and_b32 s13, s13, 0xffff
-; GFX950-GISEL-NEXT:    s_lshr_b32 s33, s15, 16
-; GFX950-GISEL-NEXT:    s_or_b32 s13, s17, s13
-; GFX950-GISEL-NEXT:    s_lshl_b32 s17, s31, 16
-; GFX950-GISEL-NEXT:    s_and_b32 s14, s14, 0xffff
-; GFX950-GISEL-NEXT:    s_or_b32 s14, s17, s14
-; GFX950-GISEL-NEXT:    s_lshl_b32 s17, s33, 16
-; GFX950-GISEL-NEXT:    s_and_b32 s15, s15, 0xffff
-; GFX950-GISEL-NEXT:    s_or_b32 s15, s17, s15
-; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[16:17], s[14:15]
-; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[14:15], s[12:13]
-; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[12:13], s[10:11]
-; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[10:11], s[8:9]
-; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[8:9], s[6:7]
-; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[6:7], s[4:5]
-; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[2:3]
-; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[0:1]
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v2, s0
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v3, s1
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v4, s2
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v5, s3
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v6, s4
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v7, s5
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v8, s6
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v9, s7
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v10, s8
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v11, s9
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v12, s10
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v13, s11
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v14, s12
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v15, s13
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v16, s14
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v17, s15
 ; GFX950-GISEL-NEXT:    v_mov_b32_e32 v24, 0x42c80000
 ; GFX950-GISEL-NEXT:    v_cvt_scalef32_sr_pk32_bf6_bf16 v[18:23], v[2:17], s16, v24
-; GFX950-GISEL-NEXT:    global_store_dwordx4 v[0:1], v[18:21], off
 ; GFX950-GISEL-NEXT:    global_store_dwordx2 v[0:1], v[22:23], off offset:16
+; GFX950-GISEL-NEXT:    global_store_dwordx4 v[0:1], v[18:21], off
 ; GFX950-GISEL-NEXT:    s_endpgm
   %cvt = tail call <6 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk32.bf6.bf16(<32 x bfloat> %src, i32 %sr, float 100.0)
   store <6 x i32> %cvt, ptr addrspace(1) %out, align 8
@@ -244,42 +155,9 @@ define amdgpu_ps void @test_scalef32_sr_pk32_fp6_bf16_vv(<32 x bfloat> %src, i32
 ;
 ; GFX950-GISEL-LABEL: test_scalef32_sr_pk32_fp6_bf16_vv:
 ; GFX950-GISEL:       ; %bb.0:
-; GFX950-GISEL-NEXT:    v_lshrrev_b32_e32 v20, 16, v0
-; GFX950-GISEL-NEXT:    v_lshrrev_b32_e32 v21, 16, v1
-; GFX950-GISEL-NEXT:    v_lshrrev_b32_e32 v22, 16, v2
-; GFX950-GISEL-NEXT:    v_lshrrev_b32_e32 v23, 16, v3
-; GFX950-GISEL-NEXT:    v_lshrrev_b32_e32 v24, 16, v4
-; GFX950-GISEL-NEXT:    v_lshrrev_b32_e32 v25, 16, v5
-; GFX950-GISEL-NEXT:    v_lshrrev_b32_e32 v26, 16, v6
-; GFX950-GISEL-NEXT:    v_lshrrev_b32_e32 v27, 16, v7
-; GFX950-GISEL-NEXT:    v_lshrrev_b32_e32 v28, 16, v8
-; GFX950-GISEL-NEXT:    v_lshrrev_b32_e32 v29, 16, v9
-; GFX950-GISEL-NEXT:    v_lshrrev_b32_e32 v30, 16, v10
-; GFX950-GISEL-NEXT:    v_lshrrev_b32_e32 v31, 16, v11
-; GFX950-GISEL-NEXT:    v_lshrrev_b32_e32 v32, 16, v12
-; GFX950-GISEL-NEXT:    v_lshrrev_b32_e32 v33, 16, v13
-; GFX950-GISEL-NEXT:    v_lshrrev_b32_e32 v34, 16, v14
-; GFX950-GISEL-NEXT:    v_lshrrev_b32_e32 v35, 16, v15
-; GFX950-GISEL-NEXT:    v_mov_b32_sdwa v0, v20 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX950-GISEL-NEXT:    v_mov_b32_sdwa v1, v21 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX950-GISEL-NEXT:    v_mov_b32_sdwa v2, v22 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX950-GISEL-NEXT:    v_mov_b32_sdwa v3, v23 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX950-GISEL-NEXT:    v_mov_b32_sdwa v4, v24 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX950-GISEL-NEXT:    v_mov_b32_sdwa v5, v25 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX950-GISEL-NEXT:    v_mov_b32_sdwa v6, v26 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX950-GISEL-NEXT:    v_mov_b32_sdwa v7, v27 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX950-GISEL-NEXT:    v_mov_b32_sdwa v8, v28 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX950-GISEL-NEXT:    v_mov_b32_sdwa v9, v29 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX950-GISEL-NEXT:    v_mov_b32_sdwa v10, v30 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX950-GISEL-NEXT:    v_mov_b32_sdwa v11, v31 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX950-GISEL-NEXT:    v_mov_b32_sdwa v12, v32 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX950-GISEL-NEXT:    v_mov_b32_sdwa v13, v33 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX950-GISEL-NEXT:    v_mov_b32_sdwa v14, v34 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX950-GISEL-NEXT:    v_mov_b32_sdwa v15, v35 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX950-GISEL-NEXT:    s_nop 0
 ; GFX950-GISEL-NEXT:    v_cvt_scalef32_sr_pk32_fp6_bf16 v[20:25], v[0:15], v16, v17
-; GFX950-GISEL-NEXT:    global_store_dwordx4 v[18:19], v[20:23], off
 ; GFX950-GISEL-NEXT:    global_store_dwordx2 v[18:19], v[24:25], off offset:16
+; GFX950-GISEL-NEXT:    global_store_dwordx4 v[18:19], v[20:23], off
 ; GFX950-GISEL-NEXT:    s_endpgm
   %cvt = tail call <6 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk32.fp6.bf16(<32 x bfloat> %src, i32 %sr, float %scale)
   store <6 x i32> %cvt, ptr addrspace(1) %out, align 8
@@ -313,82 +191,26 @@ define amdgpu_ps void @test_scalef32_sr_pk32_fp6_bf16_sl(<32 x bfloat> inreg %sr
 ;
 ; GFX950-GISEL-LABEL: test_scalef32_sr_pk32_fp6_bf16_sl:
 ; GFX950-GISEL:       ; %bb.0:
-; GFX950-GISEL-NEXT:    s_lshr_b32 s17, s0, 16
-; GFX950-GISEL-NEXT:    s_lshr_b32 s18, s1, 16
-; GFX950-GISEL-NEXT:    s_lshl_b32 s17, s17, 16
-; GFX950-GISEL-NEXT:    s_and_b32 s0, s0, 0xffff
-; GFX950-GISEL-NEXT:    s_lshr_b32 s19, s2, 16
-; GFX950-GISEL-NEXT:    s_or_b32 s0, s17, s0
-; GFX950-GISEL-NEXT:    s_lshl_b32 s17, s18, 16
-; GFX950-GISEL-NEXT:    s_and_b32 s1, s1, 0xffff
-; GFX950-GISEL-NEXT:    s_lshr_b32 s20, s3, 16
-; GFX950-GISEL-NEXT:    s_or_b32 s1, s17, s1
-; GFX950-GISEL-NEXT:    s_lshl_b32 s17, s19, 16
-; GFX950-GISEL-NEXT:    s_and_b32 s2, s2, 0xffff
-; GFX950-GISEL-NEXT:    s_lshr_b32 s21, s4, 16
-; GFX950-GISEL-NEXT:    s_or_b32 s2, s17, s2
-; GFX950-GISEL-NEXT:    s_lshl_b32 s17, s20, 16
-; GFX950-GISEL-NEXT:    s_and_b32 s3, s3, 0xffff
-; GFX950-GISEL-NEXT:    s_lshr_b32 s22, s5, 16
-; GFX950-GISEL-NEXT:    s_or_b32 s3, s17, s3
-; GFX950-GISEL-NEXT:    s_lshl_b32 s17, s21, 16
-; GFX950-GISEL-NEXT:    s_and_b32 s4, s4, 0xffff
-; GFX950-GISEL-NEXT:    s_lshr_b32 s23, s6, 16
-; GFX950-GISEL-NEXT:    s_or_b32 s4, s17, s4
-; GFX950-GISEL-NEXT:    s_lshl_b32 s17, s22, 16
-; GFX950-GISEL-NEXT:    s_and_b32 s5, s5, 0xffff
-; GFX950-GISEL-NEXT:    s_lshr_b32 s24, s7, 16
-; GFX950-GISEL-NEXT:    s_or_b32 s5, s17, s5
-; GFX950-GISEL-NEXT:    s_lshl_b32 s17, s23, 16
-; GFX950-GISEL-NEXT:    s_and_b32 s6, s6, 0xffff
-; GFX950-GISEL-NEXT:    s_lshr_b32 s25, s8, 16
-; GFX950-GISEL-NEXT:    s_or_b32 s6, s17, s6
-; GFX950-GISEL-NEXT:    s_lshl_b32 s17, s24, 16
-; GFX950-GISEL-NEXT:    s_and_b32 s7, s7, 0xffff
-; GFX950-GISEL-NEXT:    s_lshr_b32 s26, s9, 16
-; GFX950-GISEL-NEXT:    s_or_b32 s7, s17, s7
-; GFX950-GISEL-NEXT:    s_lshl_b32 s17, s25, 16
-; GFX950-GISEL-NEXT:    s_and_b32 s8, s8, 0xffff
-; GFX950-GISEL-NEXT:    s_lshr_b32 s27, s10, 16
-; GFX950-GISEL-NEXT:    s_or_b32 s8, s17, s8
-; GFX950-GISEL-NEXT:    s_lshl_b32 s17, s26, 16
-; GFX950-GISEL-NEXT:    s_and_b32 s9, s9, 0xffff
-; GFX950-GISEL-NEXT:    s_lshr_b32 s28, s11, 16
-; GFX950-GISEL-NEXT:    s_or_b32 s9, s17, s9
-; GFX950-GISEL-NEXT:    s_lshl_b32 s17, s27, 16
-; GFX950-GISEL-NEXT:    s_and_b32 s10, s10, 0xffff
-; GFX950-GISEL-NEXT:    s_lshr_b32 s29, s12, 16
-; GFX950-GISEL-NEXT:    s_or_b32 s10, s17, s10
-; GFX950-GISEL-NEXT:    s_lshl_b32 s17, s28, 16
-; GFX950-GISEL-NEXT:    s_and_b32 s11, s11, 0xffff
-; GFX950-GISEL-NEXT:    s_lshr_b32 s30, s13, 16
-; GFX950-GISEL-NEXT:    s_or_b32 s11, s17, s11
-; GFX950-GISEL-NEXT:    s_lshl_b32 s17, s29, 16
-; GFX950-GISEL-NEXT:    s_and_b32 s12, s12, 0xffff
-; GFX950-GISEL-NEXT:    s_lshr_b32 s31, s14, 16
-; GFX950-GISEL-NEXT:    s_or_b32 s12, s17, s12
-; GFX950-GISEL-NEXT:    s_lshl_b32 s17, s30, 16
-; GFX950-GISEL-NEXT:    s_and_b32 s13, s13, 0xffff
-; GFX950-GISEL-NEXT:    s_lshr_b32 s33, s15, 16
-; GFX950-GISEL-NEXT:    s_or_b32 s13, s17, s13
-; GFX950-GISEL-NEXT:    s_lshl_b32 s17, s31, 16
-; GFX950-GISEL-NEXT:    s_and_b32 s14, s14, 0xffff
-; GFX950-GISEL-NEXT:    s_or_b32 s14, s17, s14
-; GFX950-GISEL-NEXT:    s_lshl_b32 s17, s33, 16
-; GFX950-GISEL-NEXT:    s_and_b32 s15, s15, 0xffff
-; GFX950-GISEL-NEXT:    s_or_b32 s15, s17, s15
-; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[16:17], s[14:15]
-; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[14:15], s[12:13]
-; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[12:13], s[10:11]
-; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[10:11], s[8:9]
-; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[8:9], s[6:7]
-; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[6:7], s[4:5]
-; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[2:3]
-; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[0:1]
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v2, s0
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v3, s1
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v4, s2
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v5, s3
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v6, s4
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v7, s5
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v8, s6
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v9, s7
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v10, s8
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v11, s9
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v12, s10
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v13, s11
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v14, s12
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v15, s13
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v16, s14
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v17, s15
 ; GFX950-GISEL-NEXT:    v_mov_b32_e32 v24, 0x42c80000
 ; GFX950-GISEL-NEXT:    v_cvt_scalef32_sr_pk32_fp6_bf16 v[18:23], v[2:17], s16, v24
-; GFX950-GISEL-NEXT:    global_store_dwordx4 v[0:1], v[18:21], off
 ; GFX950-GISEL-NEXT:    global_store_dwordx2 v[0:1], v[22:23], off offset:16
+; GFX950-GISEL-NEXT:    global_store_dwordx4 v[0:1], v[18:21], off
 ; GFX950-GISEL-NEXT:    s_endpgm
   %cvt = tail call <6 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk32.fp6.bf16(<32 x bfloat> %src, i32 %sr, float 100.0)
   store <6 x i32> %cvt, ptr addrspace(1) %out, align 8
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.sr.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.sr.ll
index 4a58d6346fc573..64a15bc1027596 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.sr.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.sr.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
 ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx950 -o - %s | FileCheck -check-prefix=GFX950 %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx950 -o - %s | FileCheck -check-prefix=GFX950 %s
+; RUN: llc -global-isel=1 -global-isel-abort=2 -mtriple=amdgcn -mcpu=gfx950 -o - %s | FileCheck -check-prefix=GFX950 %s
 
 declare <2 x half> @llvm.amdgcn.cvt.sr.f16.f32(<2 x half>, float, i32, i1)
 declare <2 x bfloat> @llvm.amdgcn.cvt.sr.bf16.f32(<2 x bfloat>, float, i32, i1)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.read.tr.gfx950.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.read.tr.gfx950.ll
index f694d55f83b688..eb5bded6d26100 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.read.tr.gfx950.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.read.tr.gfx950.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
 ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx950 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX950-SDAG %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx950 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX950-GISEL %s
+; RUN: llc -global-isel=1 -global-isel-abort=2 -mtriple=amdgcn -mcpu=gfx950 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX950-GISEL %s
 
 declare <2 x i32>    @llvm.amdgcn.ds.read.tr4.b64.v2i32.p3(ptr addrspace(3))
 declare <2 x i32>    @llvm.amdgcn.ds.read.tr8.b64.v2i32.p3(ptr addrspace(3))
@@ -146,11 +146,11 @@ define amdgpu_ps void @ds_read_b64_tr_b16_v4bf16(ptr addrspace(3) %addr, ptr add
 ;
 ; GFX950-GISEL-LABEL: ds_read_b64_tr_b16_v4bf16:
 ; GFX950-GISEL:       ; %bb.0: ; %entry
-; GFX950-GISEL-NEXT:    v_mov_b32_e32 v4, v1
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v3, v2
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v2, v1
 ; GFX950-GISEL-NEXT:    ds_read_b64_tr_b16 v[0:1], v0 offset:32
-; GFX950-GISEL-NEXT:    v_mov_b32_e32 v5, v2
 ; GFX950-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX950-GISEL-NEXT:    global_store_dwordx2 v[4:5], v[0:1], off
+; GFX950-GISEL-NEXT:    global_store_dwordx2 v[2:3], v[0:1], off
 ; GFX950-GISEL-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i64, ptr addrspace(3) %addr, i32 4
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f32.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f32.bf16.ll
index 42acf089e86488..159592cab6a34a 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f32.bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f32.bf16.ll
@@ -1,8 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GFX11
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GFX11
+; RUN: llc -global-isel -global-isel-abort=2 -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GFX11
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx950 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GFX950
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx950 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GFX950-ISEL
+; RUN: llc -global-isel -global-isel-abort=2 -mtriple=amdgcn -mcpu=gfx950 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GFX950-ISEL
 
 declare float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %a, <2 x bfloat> %b, float %c, i1 %clamp)
 
@@ -40,17 +40,17 @@ define amdgpu_kernel void @test_llvm_amdgcn_fdot2_f32_bf16_clamp(
 ; GFX950-ISEL-LABEL: test_llvm_amdgcn_fdot2_f32_bf16_clamp:
 ; GFX950-ISEL:       ; %bb.0: ; %entry
 ; GFX950-ISEL-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX950-ISEL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX950-ISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX950-ISEL-NEXT:    s_load_dword s0, s[12:13], 0x0
 ; GFX950-ISEL-NEXT:    s_load_dword s1, s[14:15], 0x0
 ; GFX950-ISEL-NEXT:    s_load_dword s2, s[10:11], 0x0
 ; GFX950-ISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX950-ISEL-NEXT:    v_mov_b32_e32 v0, s0
-; GFX950-ISEL-NEXT:    v_mov_b32_e32 v1, s1
-; GFX950-ISEL-NEXT:    v_dot2_f32_bf16 v0, s2, v0, v1 clamp
-; GFX950-ISEL-NEXT:    v_mov_b32_e32 v1, 0
-; GFX950-ISEL-NEXT:    s_nop 1
-; GFX950-ISEL-NEXT:    global_store_dword v1, v0, s[8:9]
+; GFX950-ISEL-NEXT:    v_mov_b32_e32 v1, s0
+; GFX950-ISEL-NEXT:    v_mov_b32_e32 v2, s1
+; GFX950-ISEL-NEXT:    v_dot2_f32_bf16 v1, s2, v1, v2 clamp
+; GFX950-ISEL-NEXT:    s_nop 2
+; GFX950-ISEL-NEXT:    global_store_dword v0, v1, s[8:9]
 ; GFX950-ISEL-NEXT:    s_endpgm
     ptr addrspace(1) %r,
     ptr addrspace(1) %a,
@@ -100,17 +100,17 @@ define amdgpu_kernel void @test_llvm_amdgcn_fdot2_f32_bf16_no_clamp(
 ; GFX950-ISEL-LABEL: test_llvm_amdgcn_fdot2_f32_bf16_no_clamp:
 ; GFX950-ISEL:       ; %bb.0: ; %entry
 ; GFX950-ISEL-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX950-ISEL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX950-ISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX950-ISEL-NEXT:    s_load_dword s0, s[12:13], 0x0
 ; GFX950-ISEL-NEXT:    s_load_dword s1, s[14:15], 0x0
 ; GFX950-ISEL-NEXT:    s_load_dword s2, s[10:11], 0x0
 ; GFX950-ISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX950-ISEL-NEXT:    v_mov_b32_e32 v0, s0
-; GFX950-ISEL-NEXT:    v_mov_b32_e32 v1, s1
-; GFX950-ISEL-NEXT:    v_dot2c_f32_bf16_e32 v1, s2, v0
-; GFX950-ISEL-NEXT:    v_mov_b32_e32 v0, 0
-; GFX950-ISEL-NEXT:    s_nop 1
-; GFX950-ISEL-NEXT:    global_store_dword v0, v1, s[8:9]
+; GFX950-ISEL-NEXT:    v_mov_b32_e32 v1, s0
+; GFX950-ISEL-NEXT:    v_mov_b32_e32 v2, s1
+; GFX950-ISEL-NEXT:    v_dot2c_f32_bf16_e32 v2, s2, v1
+; GFX950-ISEL-NEXT:    s_nop 2
+; GFX950-ISEL-NEXT:    global_store_dword v0, v2, s[8:9]
 ; GFX950-ISEL-NEXT:    s_endpgm
     ptr addrspace(1) %r,
     ptr addrspace(1) %a,
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w32.ll
index 8427b4e7f6f35a..537aab9a3e9c5a 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w32.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w32.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -mattr=+wavefrontsize32 < %s | FileCheck -check-prefix=GFX12 %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -mattr=+wavefrontsize32 < %s | FileCheck -check-prefix=GFX12 %s
+; RUN: llc -global-isel=1 -global-isel-abort=2 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -mattr=+wavefrontsize32 < %s | FileCheck -check-prefix=GFX12 %s
 
 declare <2 x i32> @llvm.amdgcn.global.load.tr.b64.v2i32.p1(ptr addrspace(1))
 declare <8 x i16> @llvm.amdgcn.global.load.tr.b128.v8i16.p1(ptr addrspace(1))
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w64.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w64.ll
index be4fa79951daff..4db256de1ce1b5 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w64.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w64.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -mattr=+wavefrontsize64 < %s | FileCheck -check-prefix=GFX12 %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -mattr=+wavefrontsize64 < %s | FileCheck -check-prefix=GFX12 %s
+; RUN: llc -global-isel=1 -global-isel-abort=2 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -mattr=+wavefrontsize64 < %s | FileCheck -check-prefix=GFX12 %s
 
 declare i32 @llvm.amdgcn.global.load.tr.b64.i32.p1(ptr addrspace(1))
 declare <4 x i16> @llvm.amdgcn.global.load.tr.b128.v4i16.p1(ptr addrspace(1))
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.atomic.pk.add.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.atomic.pk.add.ll
index 12f9029392a431..7be0d9ca329aaa 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.atomic.pk.add.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.atomic.pk.add.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mtriple=amdgcn -global-isel=0 -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX12-SDAG %s
-; RUN: llc -mtriple=amdgcn -global-isel=1 -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX12-GISEL %s
+; RUN: llc -mtriple=amdgcn -global-isel=1 -global-isel-abort=2 -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX12-GISEL %s
 
 define amdgpu_ps float @atomic_pk_add_f16_1d_v2(<8 x i32> inreg %rsrc, <2 x half> %data, i32 %s) {
 ; GFX12-SDAG-LABEL: atomic_pk_add_f16_1d_v2:
@@ -156,16 +156,6 @@ define amdgpu_ps float @atomic_pk_add_bf16_1d_v4(<8 x i32> inreg %rsrc, <4 x bfl
 ;
 ; GFX12-GISEL-LABEL: atomic_pk_add_bf16_1d_v4:
 ; GFX12-GISEL:       ; %bb.0: ; %main_body
-; GFX12-GISEL-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
-; GFX12-GISEL-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-GISEL-NEXT:    v_or_b32_e32 v0, v3, v0
-; GFX12-GISEL-NEXT:    v_or_b32_e32 v1, v4, v1
 ; GFX12-GISEL-NEXT:    image_atomic_pk_add_bf16 v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_RETURN
 ; GFX12-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX12-GISEL-NEXT:    v_mov_b32_e32 v3, 0
@@ -190,16 +180,6 @@ define amdgpu_ps float @atomic_pk_add_bf16_1d_v4_noret(<8 x i32> inreg %rsrc, <4
 ;
 ; GFX12-GISEL-LABEL: atomic_pk_add_bf16_1d_v4_noret:
 ; GFX12-GISEL:       ; %bb.0: ; %main_body
-; GFX12-GISEL-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
-; GFX12-GISEL-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-GISEL-NEXT:    v_or_b32_e32 v0, v3, v0
-; GFX12-GISEL-NEXT:    v_or_b32_e32 v1, v4, v1
 ; GFX12-GISEL-NEXT:    image_atomic_pk_add_bf16 v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_RETURN
 ; GFX12-GISEL-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-GISEL-NEXT:    v_mov_b32_e32 v0, 1.0
@@ -219,16 +199,6 @@ define amdgpu_ps float @atomic_pk_add_bf16_1d_v4_nt(<8 x i32> inreg %rsrc, <4 x
 ;
 ; GFX12-GISEL-LABEL: atomic_pk_add_bf16_1d_v4_nt:
 ; GFX12-GISEL:       ; %bb.0: ; %main_body
-; GFX12-GISEL-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
-; GFX12-GISEL-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-GISEL-NEXT:    v_or_b32_e32 v0, v3, v0
-; GFX12-GISEL-NEXT:    v_or_b32_e32 v1, v4, v1
 ; GFX12-GISEL-NEXT:    image_atomic_pk_add_bf16 v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_NT_RETURN
 ; GFX12-GISEL-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-GISEL-NEXT:    v_mov_b32_e32 v0, 1.0
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll
index 722c53a9dd607e..d9ee276c3f076e 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx950 -global-isel=0 < %s | FileCheck -enable-var-scope --check-prefixes=GCN,SDAG %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx950 -global-isel=1 < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx950 -global-isel=1 -global-isel-abort=2 < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GISEL %s
 
 declare <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.f16(<8 x half>, <8 x half>, <4 x float>, i32 immarg, i32 immarg, i32 immarg)
 declare <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half>, <8 x half>, <16 x float>, i32 immarg, i32 immarg, i32 immarg)
@@ -1856,198 +1856,92 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd_mac_flags(<4 x i32>
 declare <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.bf16(<8 x bfloat>, <8 x bfloat>, <4 x float>, i32 immarg, i32 immarg, i32 immarg)
 
 define <4 x float> @test_mfma_f32_16x16x32_bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <4 x float> %arg2) {
-; SDAG-LABEL: test_mfma_f32_16x16x32_bf16:
-; SDAG:       ; %bb.0:
-; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_accvgpr_write_b32 a0, v8
-; SDAG-NEXT:    v_accvgpr_write_b32 a1, v9
-; SDAG-NEXT:    v_accvgpr_write_b32 a2, v10
-; SDAG-NEXT:    v_accvgpr_write_b32 a3, v11
-; SDAG-NEXT:    s_nop 1
-; SDAG-NEXT:    v_mfma_f32_16x16x32_bf16 a[0:3], v[0:3], v[4:7], a[0:3]
-; SDAG-NEXT:    s_nop 6
-; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: test_mfma_f32_16x16x32_bf16:
-; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    v_lshrrev_b32_e32 v12, 16, v0
-; GISEL-NEXT:    v_lshrrev_b32_e32 v13, 16, v1
-; GISEL-NEXT:    v_lshrrev_b32_e32 v14, 16, v2
-; GISEL-NEXT:    v_lshrrev_b32_e32 v15, 16, v3
-; GISEL-NEXT:    v_mov_b32_sdwa v0, v12 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GISEL-NEXT:    v_mov_b32_sdwa v1, v13 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GISEL-NEXT:    v_mov_b32_sdwa v2, v14 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GISEL-NEXT:    v_mov_b32_sdwa v3, v15 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GISEL-NEXT:    v_lshrrev_b32_e32 v12, 16, v4
-; GISEL-NEXT:    v_lshrrev_b32_e32 v13, 16, v5
-; GISEL-NEXT:    v_lshrrev_b32_e32 v14, 16, v6
-; GISEL-NEXT:    v_lshrrev_b32_e32 v15, 16, v7
-; GISEL-NEXT:    v_accvgpr_write_b32 a0, v8
-; GISEL-NEXT:    v_mov_b32_sdwa v4, v12 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GISEL-NEXT:    v_mov_b32_sdwa v5, v13 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GISEL-NEXT:    v_mov_b32_sdwa v6, v14 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GISEL-NEXT:    v_mov_b32_sdwa v7, v15 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GISEL-NEXT:    v_accvgpr_write_b32 a1, v9
-; GISEL-NEXT:    v_accvgpr_write_b32 a2, v10
-; GISEL-NEXT:    v_accvgpr_write_b32 a3, v11
-; GISEL-NEXT:    s_nop 1
-; GISEL-NEXT:    v_mfma_f32_16x16x32_bf16 a[0:3], v[0:3], v[4:7], a[0:3]
-; GISEL-NEXT:    s_nop 6
-; GISEL-NEXT:    v_accvgpr_read_b32 v0, a0
-; GISEL-NEXT:    v_accvgpr_read_b32 v1, a1
-; GISEL-NEXT:    v_accvgpr_read_b32 v2, a2
-; GISEL-NEXT:    v_accvgpr_read_b32 v3, a3
-; GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GCN-LABEL: test_mfma_f32_16x16x32_bf16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_accvgpr_write_b32 a0, v8
+; GCN-NEXT:    v_accvgpr_write_b32 a1, v9
+; GCN-NEXT:    v_accvgpr_write_b32 a2, v10
+; GCN-NEXT:    v_accvgpr_write_b32 a3, v11
+; GCN-NEXT:    s_nop 1
+; GCN-NEXT:    v_mfma_f32_16x16x32_bf16 a[0:3], v[0:3], v[4:7], a[0:3]
+; GCN-NEXT:    s_nop 6
+; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
+; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
+; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
+; GCN-NEXT:    v_accvgpr_read_b32 v3, a3
+; GCN-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0)
   ret <4 x float> %result
 }
 
 define <4 x float> @test_mfma_f32_16x16x32_bf16__flags(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <4 x float> %arg2) {
-; SDAG-LABEL: test_mfma_f32_16x16x32_bf16__flags:
-; SDAG:       ; %bb.0:
-; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_accvgpr_write_b32 a0, v8
-; SDAG-NEXT:    v_accvgpr_write_b32 a1, v9
-; SDAG-NEXT:    v_accvgpr_write_b32 a2, v10
-; SDAG-NEXT:    v_accvgpr_write_b32 a3, v11
-; SDAG-NEXT:    s_nop 1
-; SDAG-NEXT:    v_mfma_f32_16x16x32_bf16 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:1 abid:1 blgp:1
-; SDAG-NEXT:    s_nop 6
-; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: test_mfma_f32_16x16x32_bf16__flags:
-; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    v_lshrrev_b32_e32 v12, 16, v0
-; GISEL-NEXT:    v_lshrrev_b32_e32 v13, 16, v1
-; GISEL-NEXT:    v_lshrrev_b32_e32 v14, 16, v2
-; GISEL-NEXT:    v_lshrrev_b32_e32 v15, 16, v3
-; GISEL-NEXT:    v_mov_b32_sdwa v0, v12 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GISEL-NEXT:    v_mov_b32_sdwa v1, v13 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GISEL-NEXT:    v_mov_b32_sdwa v2, v14 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GISEL-NEXT:    v_mov_b32_sdwa v3, v15 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GISEL-NEXT:    v_lshrrev_b32_e32 v12, 16, v4
-; GISEL-NEXT:    v_lshrrev_b32_e32 v13, 16, v5
-; GISEL-NEXT:    v_lshrrev_b32_e32 v14, 16, v6
-; GISEL-NEXT:    v_lshrrev_b32_e32 v15, 16, v7
-; GISEL-NEXT:    v_accvgpr_write_b32 a0, v8
-; GISEL-NEXT:    v_mov_b32_sdwa v4, v12 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GISEL-NEXT:    v_mov_b32_sdwa v5, v13 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GISEL-NEXT:    v_mov_b32_sdwa v6, v14 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GISEL-NEXT:    v_mov_b32_sdwa v7, v15 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GISEL-NEXT:    v_accvgpr_write_b32 a1, v9
-; GISEL-NEXT:    v_accvgpr_write_b32 a2, v10
-; GISEL-NEXT:    v_accvgpr_write_b32 a3, v11
-; GISEL-NEXT:    s_nop 1
-; GISEL-NEXT:    v_mfma_f32_16x16x32_bf16 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:1 abid:1 blgp:1
-; GISEL-NEXT:    s_nop 6
-; GISEL-NEXT:    v_accvgpr_read_b32 v0, a0
-; GISEL-NEXT:    v_accvgpr_read_b32 v1, a1
-; GISEL-NEXT:    v_accvgpr_read_b32 v2, a2
-; GISEL-NEXT:    v_accvgpr_read_b32 v3, a3
-; GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GCN-LABEL: test_mfma_f32_16x16x32_bf16__flags:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_accvgpr_write_b32 a0, v8
+; GCN-NEXT:    v_accvgpr_write_b32 a1, v9
+; GCN-NEXT:    v_accvgpr_write_b32 a2, v10
+; GCN-NEXT:    v_accvgpr_write_b32 a3, v11
+; GCN-NEXT:    s_nop 1
+; GCN-NEXT:    v_mfma_f32_16x16x32_bf16 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:1 abid:1 blgp:1
+; GCN-NEXT:    s_nop 6
+; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
+; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
+; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
+; GCN-NEXT:    v_accvgpr_read_b32 v3, a3
+; GCN-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <4 x float> %arg2, i32 1, i32 1, i32 1)
   ret <4 x float> %result
 }
 
 define amdgpu_kernel void @test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd(ptr addrspace(1) %out, <8 x bfloat> %arg0, <8 x bfloat> %arg1, <4 x float> %arg2) #0 {
-; SDAG-LABEL: test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd:
-; SDAG:       ; %bb.0:
-; SDAG-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x34
-; SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x54
-; SDAG-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
-; SDAG-NEXT:    v_mov_b32_e32 v8, 0
-; SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; SDAG-NEXT:    v_mov_b64_e32 v[0:1], s[8:9]
-; SDAG-NEXT:    v_mov_b64_e32 v[2:3], s[10:11]
-; SDAG-NEXT:    v_mov_b64_e32 v[4:5], s[12:13]
-; SDAG-NEXT:    v_accvgpr_write_b32 a0, s0
-; SDAG-NEXT:    v_mov_b64_e32 v[6:7], s[14:15]
-; SDAG-NEXT:    v_accvgpr_write_b32 a1, s1
-; SDAG-NEXT:    v_accvgpr_write_b32 a2, s2
-; SDAG-NEXT:    v_accvgpr_write_b32 a3, s3
-; SDAG-NEXT:    s_nop 1
-; SDAG-NEXT:    v_mfma_f32_16x16x32_bf16 a[0:3], v[0:3], v[4:7], a[0:3]
-; SDAG-NEXT:    s_nop 6
-; SDAG-NEXT:    global_store_dwordx4 v8, a[0:3], s[6:7]
-; SDAG-NEXT:    s_endpgm
-;
-; GISEL-LABEL: test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd:
-; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x34
-; GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x54
-; GISEL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
-; GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[8:9]
-; GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[10:11]
-; GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[12:13]
-; GISEL-NEXT:    v_accvgpr_write_b32 a0, s0
-; GISEL-NEXT:    v_mov_b64_e32 v[6:7], s[14:15]
-; GISEL-NEXT:    v_accvgpr_write_b32 a1, s1
-; GISEL-NEXT:    v_accvgpr_write_b32 a2, s2
-; GISEL-NEXT:    v_accvgpr_write_b32 a3, s3
-; GISEL-NEXT:    s_nop 1
-; GISEL-NEXT:    v_mfma_f32_16x16x32_bf16 a[0:3], v[0:3], v[4:7], a[0:3]
-; GISEL-NEXT:    v_mov_b32_e32 v0, 0
-; GISEL-NEXT:    s_nop 5
-; GISEL-NEXT:    global_store_dwordx4 v0, a[0:3], s[6:7]
-; GISEL-NEXT:    s_endpgm
+; GCN-LABEL: test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x34
+; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x54
+; GCN-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GCN-NEXT:    v_mov_b32_e32 v8, 0
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    v_mov_b64_e32 v[0:1], s[8:9]
+; GCN-NEXT:    v_mov_b64_e32 v[2:3], s[10:11]
+; GCN-NEXT:    v_mov_b64_e32 v[4:5], s[12:13]
+; GCN-NEXT:    v_accvgpr_write_b32 a0, s0
+; GCN-NEXT:    v_mov_b64_e32 v[6:7], s[14:15]
+; GCN-NEXT:    v_accvgpr_write_b32 a1, s1
+; GCN-NEXT:    v_accvgpr_write_b32 a2, s2
+; GCN-NEXT:    v_accvgpr_write_b32 a3, s3
+; GCN-NEXT:    s_nop 1
+; GCN-NEXT:    v_mfma_f32_16x16x32_bf16 a[0:3], v[0:3], v[4:7], a[0:3]
+; GCN-NEXT:    s_nop 6
+; GCN-NEXT:    global_store_dwordx4 v8, a[0:3], s[6:7]
+; GCN-NEXT:    s_endpgm
   %result = call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0)
   store <4 x float> %result, ptr addrspace(1) %out
   ret void
 }
 
 define amdgpu_kernel void @test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd__flags(ptr addrspace(1) %out, <8 x bfloat> %arg0, <8 x bfloat> %arg1, <4 x float> %arg2) #0 {
-; SDAG-LABEL: test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd__flags:
-; SDAG:       ; %bb.0:
-; SDAG-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x34
-; SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x54
-; SDAG-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
-; SDAG-NEXT:    v_mov_b32_e32 v8, 0
-; SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; SDAG-NEXT:    v_mov_b64_e32 v[0:1], s[8:9]
-; SDAG-NEXT:    v_mov_b64_e32 v[2:3], s[10:11]
-; SDAG-NEXT:    v_mov_b64_e32 v[4:5], s[12:13]
-; SDAG-NEXT:    v_accvgpr_write_b32 a0, s0
-; SDAG-NEXT:    v_mov_b64_e32 v[6:7], s[14:15]
-; SDAG-NEXT:    v_accvgpr_write_b32 a1, s1
-; SDAG-NEXT:    v_accvgpr_write_b32 a2, s2
-; SDAG-NEXT:    v_accvgpr_write_b32 a3, s3
-; SDAG-NEXT:    s_nop 1
-; SDAG-NEXT:    v_mfma_f32_16x16x32_bf16 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:3 abid:2 blgp:1
-; SDAG-NEXT:    s_nop 6
-; SDAG-NEXT:    global_store_dwordx4 v8, a[0:3], s[6:7]
-; SDAG-NEXT:    s_endpgm
-;
-; GISEL-LABEL: test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd__flags:
-; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x34
-; GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x54
-; GISEL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
-; GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[8:9]
-; GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[10:11]
-; GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[12:13]
-; GISEL-NEXT:    v_accvgpr_write_b32 a0, s0
-; GISEL-NEXT:    v_mov_b64_e32 v[6:7], s[14:15]
-; GISEL-NEXT:    v_accvgpr_write_b32 a1, s1
-; GISEL-NEXT:    v_accvgpr_write_b32 a2, s2
-; GISEL-NEXT:    v_accvgpr_write_b32 a3, s3
-; GISEL-NEXT:    s_nop 1
-; GISEL-NEXT:    v_mfma_f32_16x16x32_bf16 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:3 abid:2 blgp:1
-; GISEL-NEXT:    v_mov_b32_e32 v0, 0
-; GISEL-NEXT:    s_nop 5
-; GISEL-NEXT:    global_store_dwordx4 v0, a[0:3], s[6:7]
-; GISEL-NEXT:    s_endpgm
+; GCN-LABEL: test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd__flags:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x34
+; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x54
+; GCN-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GCN-NEXT:    v_mov_b32_e32 v8, 0
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    v_mov_b64_e32 v[0:1], s[8:9]
+; GCN-NEXT:    v_mov_b64_e32 v[2:3], s[10:11]
+; GCN-NEXT:    v_mov_b64_e32 v[4:5], s[12:13]
+; GCN-NEXT:    v_accvgpr_write_b32 a0, s0
+; GCN-NEXT:    v_mov_b64_e32 v[6:7], s[14:15]
+; GCN-NEXT:    v_accvgpr_write_b32 a1, s1
+; GCN-NEXT:    v_accvgpr_write_b32 a2, s2
+; GCN-NEXT:    v_accvgpr_write_b32 a3, s3
+; GCN-NEXT:    s_nop 1
+; GCN-NEXT:    v_mfma_f32_16x16x32_bf16 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:3 abid:2 blgp:1
+; GCN-NEXT:    s_nop 6
+; GCN-NEXT:    global_store_dwordx4 v8, a[0:3], s[6:7]
+; GCN-NEXT:    s_endpgm
   %result = call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <4 x float> %arg2, i32 3, i32 2, i32 1)
   store <4 x float> %result, ptr addrspace(1) %out
   ret void
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mov.dpp8.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mov.dpp8.ll
index 53e37479f68e63..481e721e3c21dc 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mov.dpp8.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mov.dpp8.ll
@@ -1,9 +1,9 @@
 ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10PLUS %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10PLUS %s
+; RUN: llc -global-isel=1 -global-isel-abort=2 -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10PLUS %s
 ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-vopd=0 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10PLUS %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-vopd=0 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10PLUS %s
+; RUN: llc -global-isel=1 -global-isel-abort=2 -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-vopd=0 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10PLUS %s
 ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -amdgpu-enable-vopd=0 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10PLUS %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -amdgpu-enable-vopd=0 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10PLUS %s
+; RUN: llc -global-isel=1 -global-isel-abort=2 -mtriple=amdgcn -mcpu=gfx1200 -amdgpu-enable-vopd=0 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10PLUS %s
 
 ; GFX10PLUS-LABEL: {{^}}dpp8_test:
 ; GFX10PLUS: v_mov_b32_e32 [[SRC:v[0-9]+]], s{{[0-9]+}}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll
index 71961a57bd080d..5eb6d203098ee1 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll
@@ -1,10 +1,10 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -global-isel=0 -amdgpu-load-store-vectorizer=0 -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10,GFX10-SDAG %s
-; RUN: llc -global-isel=1 -amdgpu-load-store-vectorizer=0 -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10,GFX10-GISEL %s
+; RUN: llc -global-isel=1 -global-isel-abort=2 -amdgpu-load-store-vectorizer=0 -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10,GFX10-GISEL %s
 ; RUN: llc -global-isel=0 -amdgpu-load-store-vectorizer=0 -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GFX11-SDAG %s
-; RUN: llc -global-isel=1 -amdgpu-load-store-vectorizer=0 -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GFX11-GISEL %s
+; RUN: llc -global-isel=1 -global-isel-abort=2 -amdgpu-load-store-vectorizer=0 -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GFX11-GISEL %s
 ; RUN: llc -global-isel=0 -amdgpu-load-store-vectorizer=0 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-SDAG %s
-; RUN: llc -global-isel=1 -amdgpu-load-store-vectorizer=0 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL %s
+; RUN: llc -global-isel=1 -global-isel-abort=2 -amdgpu-load-store-vectorizer=0 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL %s
 
 declare i32 @llvm.amdgcn.permlane16(i32, i32, i32, i32, i1, i1)
 declare i32 @llvm.amdgcn.permlanex16(i32, i32, i32, i32, i1, i1)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ptr.buffer.atomic.fadd_rtn_errors.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ptr.buffer.atomic.fadd_rtn_errors.ll
index f8caf84d5c51a2..09cc55b53539b8 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ptr.buffer.atomic.fadd_rtn_errors.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ptr.buffer.atomic.fadd_rtn_errors.ll
@@ -15,14 +15,12 @@
 ; RUN: not --crash llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -filetype=null %t/struct-ret-f32-error.ll 2>&1 | FileCheck -check-prefix=ERR-STRUCT-F32-GISEL %s
 ; RUN: not --crash llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -filetype=null %t/raw-ret-v2f16-error.ll 2>&1 | FileCheck -check-prefix=ERR-RAW-V2F16-GISEL %s
 ; RUN: not --crash llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -filetype=null %t/struct-ret-v2f16-error.ll 2>&1 | FileCheck -check-prefix=ERR-STRUCT-V2F16-GISEL %s
-; RUN: not --crash llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -filetype=null %t/raw-ret-v2bf16-error.ll 2>&1 | FileCheck -check-prefix=ERR-RAW-V2BF16-GISEL %s
-; RUN: not --crash llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -filetype=null %t/struct-ret-v2bf16-error.ll 2>&1 | FileCheck -check-prefix=ERR-STRUCT-V2BF16-GISEL %s
-
-; FIXME: These should fail when bfloat support is handled correctly
-; xUN: not --crash llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -filetype=null %t/raw-ret-v2bf16-error.ll 2>&1 | FileCheck -check-prefix=ERR-RAW-V2BF16-GISEL %s
-; xUN: not --crash llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -filetype=null %t/struct-ret-v2bf16-error.ll 2>&1 | FileCheck -check-prefix=ERR-STRUCT-V2BF16-GISEL %s
-; xUN: not --crash llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -filetype=null %t/raw-ret-v2bf16-error.ll 2>&1 | FileCheck -check-prefix=ERR-RAW-V2BF16-GISEL %s
-; xUN: not --crash llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -filetype=null %t/struct-ret-v2bf16-error.ll 2>&1 | FileCheck -check-prefix=ERR-STRUCT-V2BF16-GISEL %s
+; RUN: not --crash llc -global-isel=1 -global-isel-abort=2 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -filetype=null %t/raw-ret-v2bf16-error.ll 2>&1 | FileCheck -check-prefix=ERR-RAW-V2BF16-GISEL %s
+; RUN: not --crash llc -global-isel=1 -global-isel-abort=2 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -filetype=null %t/struct-ret-v2bf16-error.ll 2>&1 | FileCheck -check-prefix=ERR-STRUCT-V2BF16-GISEL %s
+; RUN: not --crash llc -global-isel=1 -global-isel-abort=2 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -filetype=null %t/raw-ret-v2bf16-error.ll 2>&1 | FileCheck -check-prefix=ERR-RAW-V2BF16-GISEL %s
+; RUN: not --crash llc -global-isel=1 -global-isel-abort=2 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -filetype=null %t/struct-ret-v2bf16-error.ll 2>&1 | FileCheck -check-prefix=ERR-STRUCT-V2BF16-GISEL %s
+; RUN: not --crash llc -global-isel=1 -global-isel-abort=2 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -filetype=null %t/raw-ret-v2bf16-error.ll 2>&1 | FileCheck -check-prefix=ERR-RAW-V2BF16-GISEL %s
+; RUN: not --crash llc -global-isel=1 -global-isel-abort=2 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -filetype=null %t/struct-ret-v2bf16-error.ll 2>&1 | FileCheck -check-prefix=ERR-STRUCT-V2BF16-GISEL %s
 
 ; Make sure buffer fadd atomics with return values are not selected
 ; for gfx908 where they do not work.
@@ -66,7 +64,7 @@ define <2 x half> @struct_ptr_buffer_atomic_fadd_v2f16_rtn(<2 x half> %val, ptr
 
 ;--- raw-ret-v2bf16-error.ll
 ; ERR-RAW-V2BF16-SDAG: LLVM ERROR: Cannot select: {{.+}}: v2bf16,ch = BUFFER_ATOMIC_FADD
-; ERR-RAW-V2BF16-GISEL: LLVM ERROR: cannot select: %{{[0-9]+}}:vgpr_32(<2 x s16>) = G_AMDGPU_BUFFER_ATOMIC_FADD
+; ERR-RAW-V2BF16-GISEL: LLVM ERROR: Cannot select: {{.+}}: v2bf16,ch = BUFFER_ATOMIC_FADD
 
 define <2 x bfloat> @raw_ptr_buffer_atomic_fadd_v2bf16_rtn(<2 x bfloat> %val, <4 x i32> inreg %rsrc, i32 inreg %soffset) {
   %ret = call <2 x bfloat> @llvm.amdgcn.raw.buffer.atomic.fadd.v2bf16(<2 x bfloat> %val, <4 x i32> %rsrc, i32 0, i32 %soffset, i32 0)
@@ -75,7 +73,7 @@ define <2 x bfloat> @raw_ptr_buffer_atomic_fadd_v2bf16_rtn(<2 x bfloat> %val, <4
 
 ;--- struct-ret-v2bf16-error.ll
 ; ERR-STRUCT-V2BF16-SDAG: LLVM ERROR: Cannot select: {{.+}}: v2bf16,ch = BUFFER_ATOMIC_FADD
-; ERR-STRUCT-V2BF16-GISEL: LLVM ERROR: cannot select: %{{[0-9]+}}:vgpr_32(<2 x s16>) = G_AMDGPU_BUFFER_ATOMIC_FADD
+; ERR-STRUCT-V2BF16-GISEL: LLVM ERROR: Cannot select: {{.+}}: v2bf16,ch = BUFFER_ATOMIC_FADD
 
 define <2 x bfloat> @struct_ptr_buffer_atomic_fadd_v2bf16_rtn(<2 x bfloat> %val, ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) {
   %ret = call <2 x bfloat> @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.v2bf16(<2 x bfloat> %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll
index 9a2f0aa5adb772..0605a158b974fc 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
 ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=CHECK-SDAG -enable-var-scope %s
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs -global-isel < %s | FileCheck -check-prefix=CHECK-GISEL -enable-var-scope %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs -global-isel -global-isel-abort=2 < %s | FileCheck -check-prefix=CHECK-GISEL -enable-var-scope %s
 
 define void @test_readfirstlane_i1(ptr addrspace(1) %out, i1 %src) {
 ; CHECK-SDAG-LABEL: test_readfirstlane_i1:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll
index a8560ff1aa2b0c..edb6ebcee13255 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
 ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs < %s | FileCheck --check-prefix=CHECK-SDAG -enable-var-scope %s
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs -global-isel < %s | FileCheck --check-prefix=CHECK-GISEL -enable-var-scope %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs -global-isel -global-isel-abort=2 < %s | FileCheck --check-prefix=CHECK-GISEL -enable-var-scope %s
 
 declare i32 @llvm.amdgcn.readlane.i32(i32, i32) #0
 declare i64 @llvm.amdgcn.readlane.i64(i64, i32) #0
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll
index 0a330e91f82068..66c02a9bd0c6a5 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx950 -global-isel=0 < %s | FileCheck -enable-var-scope --check-prefixes=GCN,SDAG %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx950 -global-isel=1 < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx950 -global-isel=1 -global-isel-abort=2 < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GISEL %s
 
 declare i32 @llvm.amdgcn.workitem.id.x()
 
@@ -628,57 +628,31 @@ define <16 x float> @test_smfmac_f32_32x32x32_f16__sgpr(<8 x half> inreg %arg0,
 declare <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.bf16(<8 x bfloat>, <16 x bfloat>, <4 x float>, i32, i32 immarg, i32 immarg)
 
 define amdgpu_kernel void @test_smfmac_f32_16x16x64_bf16__vgpr(ptr addrspace(1) %arg, <8 x bfloat> %a, <16 x bfloat> %b, i32 %idx) #0 {
-; SDAG-LABEL: test_smfmac_f32_16x16x64_bf16__vgpr:
-; SDAG:       ; %bb.0: ; %bb
-; SDAG-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
-; SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x34
-; SDAG-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
-; SDAG-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
-; SDAG-NEXT:    v_mov_b32_e32 v16, 0
-; SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; SDAG-NEXT:    global_load_dwordx4 v[8:11], v0, s[6:7]
-; SDAG-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x44
-; SDAG-NEXT:    s_load_dword s16, s[4:5], 0x64
-; SDAG-NEXT:    v_mov_b64_e32 v[14:15], s[2:3]
-; SDAG-NEXT:    v_mov_b64_e32 v[12:13], s[0:1]
-; SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; SDAG-NEXT:    v_mov_b64_e32 v[0:1], s[8:9]
-; SDAG-NEXT:    v_mov_b64_e32 v[2:3], s[10:11]
-; SDAG-NEXT:    v_mov_b64_e32 v[4:5], s[12:13]
-; SDAG-NEXT:    v_mov_b64_e32 v[6:7], s[14:15]
-; SDAG-NEXT:    v_mov_b32_e32 v17, s16
-; SDAG-NEXT:    s_waitcnt vmcnt(0)
-; SDAG-NEXT:    s_nop 0
-; SDAG-NEXT:    v_smfmac_f32_16x16x64_bf16 v[8:11], v[12:15], v[0:7], v17 cbsz:1 abid:2
-; SDAG-NEXT:    s_nop 6
-; SDAG-NEXT:    global_store_dwordx4 v16, v[8:11], s[6:7]
-; SDAG-NEXT:    s_endpgm
-;
-; GISEL-LABEL: test_smfmac_f32_16x16x64_bf16__vgpr:
-; GISEL:       ; %bb.0: ; %bb
-; GISEL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
-; GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x34
-; GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
-; GISEL-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
-; GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GISEL-NEXT:    global_load_dwordx4 v[8:11], v0, s[6:7]
-; GISEL-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x44
-; GISEL-NEXT:    s_load_dword s16, s[4:5], 0x64
-; GISEL-NEXT:    v_mov_b64_e32 v[14:15], s[2:3]
-; GISEL-NEXT:    v_mov_b64_e32 v[12:13], s[0:1]
-; GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[8:9]
-; GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[10:11]
-; GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[12:13]
-; GISEL-NEXT:    v_mov_b64_e32 v[6:7], s[14:15]
-; GISEL-NEXT:    v_mov_b32_e32 v16, s16
-; GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GISEL-NEXT:    s_nop 0
-; GISEL-NEXT:    v_smfmac_f32_16x16x64_bf16 v[8:11], v[12:15], v[0:7], v16 cbsz:1 abid:2
-; GISEL-NEXT:    v_mov_b32_e32 v0, 0
-; GISEL-NEXT:    s_nop 5
-; GISEL-NEXT:    global_store_dwordx4 v0, v[8:11], s[6:7]
-; GISEL-NEXT:    s_endpgm
+; GCN-LABEL: test_smfmac_f32_16x16x64_bf16__vgpr:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x34
+; GCN-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GCN-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
+; GCN-NEXT:    v_mov_b32_e32 v16, 0
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    global_load_dwordx4 v[8:11], v0, s[6:7]
+; GCN-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x44
+; GCN-NEXT:    s_load_dword s16, s[4:5], 0x64
+; GCN-NEXT:    v_mov_b64_e32 v[14:15], s[2:3]
+; GCN-NEXT:    v_mov_b64_e32 v[12:13], s[0:1]
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    v_mov_b64_e32 v[0:1], s[8:9]
+; GCN-NEXT:    v_mov_b64_e32 v[2:3], s[10:11]
+; GCN-NEXT:    v_mov_b64_e32 v[4:5], s[12:13]
+; GCN-NEXT:    v_mov_b64_e32 v[6:7], s[14:15]
+; GCN-NEXT:    v_mov_b32_e32 v17, s16
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    s_nop 0
+; GCN-NEXT:    v_smfmac_f32_16x16x64_bf16 v[8:11], v[12:15], v[0:7], v17 cbsz:1 abid:2
+; GCN-NEXT:    s_nop 6
+; GCN-NEXT:    global_store_dwordx4 v16, v[8:11], s[6:7]
+; GCN-NEXT:    s_endpgm
 bb:
   %id = call i32 @llvm.amdgcn.workitem.id.x()
   %gep = getelementptr <4 x float>, ptr addrspace(1) %arg, i32 %id
@@ -689,266 +663,94 @@ bb:
 }
 
 define <4 x float> @test_smfmac_f32_16x16x64_bf16(<8 x bfloat> %arg0, <16 x bfloat> %arg1, <4 x float> %arg2, i32 %arg3) {
-; SDAG-LABEL: test_smfmac_f32_16x16x64_bf16:
-; SDAG:       ; %bb.0:
-; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_accvgpr_write_b32 a0, v12
-; SDAG-NEXT:    v_accvgpr_write_b32 a1, v13
-; SDAG-NEXT:    v_accvgpr_write_b32 a2, v14
-; SDAG-NEXT:    v_accvgpr_write_b32 a3, v15
-; SDAG-NEXT:    s_nop 1
-; SDAG-NEXT:    v_smfmac_f32_16x16x64_bf16 a[0:3], v[0:3], v[4:11], v16
-; SDAG-NEXT:    s_nop 6
-; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: test_smfmac_f32_16x16x64_bf16:
-; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    v_lshrrev_b32_e32 v17, 16, v0
-; GISEL-NEXT:    v_lshrrev_b32_e32 v18, 16, v1
-; GISEL-NEXT:    v_lshrrev_b32_e32 v19, 16, v2
-; GISEL-NEXT:    v_lshrrev_b32_e32 v20, 16, v3
-; GISEL-NEXT:    v_mov_b32_sdwa v0, v17 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GISEL-NEXT:    v_mov_b32_sdwa v1, v18 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GISEL-NEXT:    v_mov_b32_sdwa v2, v19 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GISEL-NEXT:    v_mov_b32_sdwa v3, v20 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GISEL-NEXT:    v_lshrrev_b32_e32 v17, 16, v4
-; GISEL-NEXT:    v_lshrrev_b32_e32 v18, 16, v5
-; GISEL-NEXT:    v_lshrrev_b32_e32 v19, 16, v6
-; GISEL-NEXT:    v_lshrrev_b32_e32 v20, 16, v7
-; GISEL-NEXT:    v_lshrrev_b32_e32 v21, 16, v8
-; GISEL-NEXT:    v_lshrrev_b32_e32 v22, 16, v9
-; GISEL-NEXT:    v_lshrrev_b32_e32 v23, 16, v10
-; GISEL-NEXT:    v_lshrrev_b32_e32 v24, 16, v11
-; GISEL-NEXT:    v_mov_b32_sdwa v4, v17 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GISEL-NEXT:    v_mov_b32_sdwa v5, v18 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GISEL-NEXT:    v_mov_b32_sdwa v6, v19 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GISEL-NEXT:    v_mov_b32_sdwa v7, v20 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GISEL-NEXT:    v_mov_b32_sdwa v8, v21 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GISEL-NEXT:    v_mov_b32_sdwa v9, v22 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GISEL-NEXT:    v_mov_b32_sdwa v10, v23 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GISEL-NEXT:    v_mov_b32_sdwa v11, v24 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GISEL-NEXT:    s_nop 1
-; GISEL-NEXT:    v_smfmac_f32_16x16x64_bf16 v[12:15], v[0:3], v[4:11], v16
-; GISEL-NEXT:    s_nop 6
-; GISEL-NEXT:    v_mov_b32_e32 v0, v12
-; GISEL-NEXT:    v_mov_b32_e32 v1, v13
-; GISEL-NEXT:    v_mov_b32_e32 v2, v14
-; GISEL-NEXT:    v_mov_b32_e32 v3, v15
-; GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GCN-LABEL: test_smfmac_f32_16x16x64_bf16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_accvgpr_write_b32 a0, v12
+; GCN-NEXT:    v_accvgpr_write_b32 a1, v13
+; GCN-NEXT:    v_accvgpr_write_b32 a2, v14
+; GCN-NEXT:    v_accvgpr_write_b32 a3, v15
+; GCN-NEXT:    s_nop 1
+; GCN-NEXT:    v_smfmac_f32_16x16x64_bf16 a[0:3], v[0:3], v[4:11], v16
+; GCN-NEXT:    s_nop 6
+; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
+; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
+; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
+; GCN-NEXT:    v_accvgpr_read_b32 v3, a3
+; GCN-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.bf16(<8 x bfloat> %arg0, <16 x bfloat> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
   ret <4 x float> %result
 }
 
 define <4 x float> @test_smfmac_f32_16x16x64_bf16__flags0(<8 x bfloat> %arg0, <16 x bfloat> %arg1, <4 x float> %arg2, i32 %arg3) {
-; SDAG-LABEL: test_smfmac_f32_16x16x64_bf16__flags0:
-; SDAG:       ; %bb.0:
-; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_accvgpr_write_b32 a0, v12
-; SDAG-NEXT:    v_accvgpr_write_b32 a1, v13
-; SDAG-NEXT:    v_accvgpr_write_b32 a2, v14
-; SDAG-NEXT:    v_accvgpr_write_b32 a3, v15
-; SDAG-NEXT:    s_nop 1
-; SDAG-NEXT:    v_smfmac_f32_16x16x64_bf16 a[0:3], v[0:3], v[4:11], v16 cbsz:1 abid:3
-; SDAG-NEXT:    s_nop 6
-; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: test_smfmac_f32_16x16x64_bf16__flags0:
-; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    v_lshrrev_b32_e32 v17, 16, v0
-; GISEL-NEXT:    v_lshrrev_b32_e32 v18, 16, v1
-; GISEL-NEXT:    v_lshrrev_b32_e32 v19, 16, v2
-; GISEL-NEXT:    v_lshrrev_b32_e32 v20, 16, v3
-; GISEL-NEXT:    v_mov_b32_sdwa v0, v17 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GISEL-NEXT:    v_mov_b32_sdwa v1, v18 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GISEL-NEXT:    v_mov_b32_sdwa v2, v19 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GISEL-NEXT:    v_mov_b32_sdwa v3, v20 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GISEL-NEXT:    v_lshrrev_b32_e32 v17, 16, v4
-; GISEL-NEXT:    v_lshrrev_b32_e32 v18, 16, v5
-; GISEL-NEXT:    v_lshrrev_b32_e32 v19, 16, v6
-; GISEL-NEXT:    v_lshrrev_b32_e32 v20, 16, v7
-; GISEL-NEXT:    v_lshrrev_b32_e32 v21, 16, v8
-; GISEL-NEXT:    v_lshrrev_b32_e32 v22, 16, v9
-; GISEL-NEXT:    v_lshrrev_b32_e32 v23, 16, v10
-; GISEL-NEXT:    v_lshrrev_b32_e32 v24, 16, v11
-; GISEL-NEXT:    v_mov_b32_sdwa v4, v17 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GISEL-NEXT:    v_mov_b32_sdwa v5, v18 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GISEL-NEXT:    v_mov_b32_sdwa v6, v19 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GISEL-NEXT:    v_mov_b32_sdwa v7, v20 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GISEL-NEXT:    v_mov_b32_sdwa v8, v21 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GISEL-NEXT:    v_mov_b32_sdwa v9, v22 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GISEL-NEXT:    v_mov_b32_sdwa v10, v23 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GISEL-NEXT:    v_mov_b32_sdwa v11, v24 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GISEL-NEXT:    s_nop 1
-; GISEL-NEXT:    v_smfmac_f32_16x16x64_bf16 v[12:15], v[0:3], v[4:11], v16 cbsz:1 abid:3
-; GISEL-NEXT:    s_nop 6
-; GISEL-NEXT:    v_mov_b32_e32 v0, v12
-; GISEL-NEXT:    v_mov_b32_e32 v1, v13
-; GISEL-NEXT:    v_mov_b32_e32 v2, v14
-; GISEL-NEXT:    v_mov_b32_e32 v3, v15
-; GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GCN-LABEL: test_smfmac_f32_16x16x64_bf16__flags0:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_accvgpr_write_b32 a0, v12
+; GCN-NEXT:    v_accvgpr_write_b32 a1, v13
+; GCN-NEXT:    v_accvgpr_write_b32 a2, v14
+; GCN-NEXT:    v_accvgpr_write_b32 a3, v15
+; GCN-NEXT:    s_nop 1
+; GCN-NEXT:    v_smfmac_f32_16x16x64_bf16 a[0:3], v[0:3], v[4:11], v16 cbsz:1 abid:3
+; GCN-NEXT:    s_nop 6
+; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
+; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
+; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
+; GCN-NEXT:    v_accvgpr_read_b32 v3, a3
+; GCN-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.bf16(<8 x bfloat> %arg0, <16 x bfloat> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 1, i32 immarg 3)
   ret <4 x float> %result
 }
 
 define <4 x float> @test_smfmac_f32_16x16x64_bf16__flags1(<8 x bfloat> %arg0, <16 x bfloat> %arg1, <4 x float> %arg2, i32 %arg3) {
-; SDAG-LABEL: test_smfmac_f32_16x16x64_bf16__flags1:
-; SDAG:       ; %bb.0:
-; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_accvgpr_write_b32 a0, v12
-; SDAG-NEXT:    v_accvgpr_write_b32 a1, v13
-; SDAG-NEXT:    v_accvgpr_write_b32 a2, v14
-; SDAG-NEXT:    v_accvgpr_write_b32 a3, v15
-; SDAG-NEXT:    s_nop 1
-; SDAG-NEXT:    v_smfmac_f32_16x16x64_bf16 a[0:3], v[0:3], v[4:11], v16 cbsz:3 abid:1
-; SDAG-NEXT:    s_nop 6
-; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: test_smfmac_f32_16x16x64_bf16__flags1:
-; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    v_lshrrev_b32_e32 v17, 16, v0
-; GISEL-NEXT:    v_lshrrev_b32_e32 v18, 16, v1
-; GISEL-NEXT:    v_lshrrev_b32_e32 v19, 16, v2
-; GISEL-NEXT:    v_lshrrev_b32_e32 v20, 16, v3
-; GISEL-NEXT:    v_mov_b32_sdwa v0, v17 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GISEL-NEXT:    v_mov_b32_sdwa v1, v18 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GISEL-NEXT:    v_mov_b32_sdwa v2, v19 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GISEL-NEXT:    v_mov_b32_sdwa v3, v20 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GISEL-NEXT:    v_lshrrev_b32_e32 v17, 16, v4
-; GISEL-NEXT:    v_lshrrev_b32_e32 v18, 16, v5
-; GISEL-NEXT:    v_lshrrev_b32_e32 v19, 16, v6
-; GISEL-NEXT:    v_lshrrev_b32_e32 v20, 16, v7
-; GISEL-NEXT:    v_lshrrev_b32_e32 v21, 16, v8
-; GISEL-NEXT:    v_lshrrev_b32_e32 v22, 16, v9
-; GISEL-NEXT:    v_lshrrev_b32_e32 v23, 16, v10
-; GISEL-NEXT:    v_lshrrev_b32_e32 v24, 16, v11
-; GISEL-NEXT:    v_mov_b32_sdwa v4, v17 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GISEL-NEXT:    v_mov_b32_sdwa v5, v18 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GISEL-NEXT:    v_mov_b32_sdwa v6, v19 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GISEL-NEXT:    v_mov_b32_sdwa v7, v20 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GISEL-NEXT:    v_mov_b32_sdwa v8, v21 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GISEL-NEXT:    v_mov_b32_sdwa v9, v22 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GISEL-NEXT:    v_mov_b32_sdwa v10, v23 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GISEL-NEXT:    v_mov_b32_sdwa v11, v24 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GISEL-NEXT:    s_nop 1
-; GISEL-NEXT:    v_smfmac_f32_16x16x64_bf16 v[12:15], v[0:3], v[4:11], v16 cbsz:3 abid:1
-; GISEL-NEXT:    s_nop 6
-; GISEL-NEXT:    v_mov_b32_e32 v0, v12
-; GISEL-NEXT:    v_mov_b32_e32 v1, v13
-; GISEL-NEXT:    v_mov_b32_e32 v2, v14
-; GISEL-NEXT:    v_mov_b32_e32 v3, v15
-; GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GCN-LABEL: test_smfmac_f32_16x16x64_bf16__flags1:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_accvgpr_write_b32 a0, v12
+; GCN-NEXT:    v_accvgpr_write_b32 a1, v13
+; GCN-NEXT:    v_accvgpr_write_b32 a2, v14
+; GCN-NEXT:    v_accvgpr_write_b32 a3, v15
+; GCN-NEXT:    s_nop 1
+; GCN-NEXT:    v_smfmac_f32_16x16x64_bf16 a[0:3], v[0:3], v[4:11], v16 cbsz:3 abid:1
+; GCN-NEXT:    s_nop 6
+; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
+; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
+; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
+; GCN-NEXT:    v_accvgpr_read_b32 v3, a3
+; GCN-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.bf16(<8 x bfloat> %arg0, <16 x bfloat> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 3, i32 immarg 1)
   ret <4 x float> %result
 }
 
 define <4 x float> @test_smfmac_f32_16x16x64_bf16__sgpr(<8 x bfloat> inreg %arg0, <16 x bfloat> inreg %arg1, <4 x float> inreg %arg2, i32 inreg %arg3) {
-; SDAG-LABEL: test_smfmac_f32_16x16x64_bf16__sgpr:
-; SDAG:       ; %bb.0:
-; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_mov_b32_e32 v8, s0
-; SDAG-NEXT:    v_mov_b32_e32 v9, s1
-; SDAG-NEXT:    v_mov_b32_e32 v10, s2
-; SDAG-NEXT:    v_mov_b32_e32 v11, s3
-; SDAG-NEXT:    v_mov_b32_e32 v0, s16
-; SDAG-NEXT:    v_mov_b32_e32 v1, s17
-; SDAG-NEXT:    v_mov_b32_e32 v2, s18
-; SDAG-NEXT:    v_mov_b32_e32 v3, s19
-; SDAG-NEXT:    v_mov_b32_e32 v4, s20
-; SDAG-NEXT:    v_mov_b32_e32 v5, s21
-; SDAG-NEXT:    v_mov_b32_e32 v6, s22
-; SDAG-NEXT:    v_mov_b32_e32 v7, s23
-; SDAG-NEXT:    v_accvgpr_write_b32 a0, s24
-; SDAG-NEXT:    v_accvgpr_write_b32 a1, s25
-; SDAG-NEXT:    v_accvgpr_write_b32 a2, s26
-; SDAG-NEXT:    v_accvgpr_write_b32 a3, s27
-; SDAG-NEXT:    v_mov_b32_e32 v12, s28
-; SDAG-NEXT:    s_nop 1
-; SDAG-NEXT:    v_smfmac_f32_16x16x64_bf16 a[0:3], v[8:11], v[0:7], v12
-; SDAG-NEXT:    s_nop 6
-; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: test_smfmac_f32_16x16x64_bf16__sgpr:
-; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    s_lshr_b32 s4, s0, 16
-; GISEL-NEXT:    s_lshr_b32 s5, s1, 16
-; GISEL-NEXT:    s_lshl_b32 s4, s4, 16
-; GISEL-NEXT:    s_and_b32 s0, s0, 0xffff
-; GISEL-NEXT:    s_lshr_b32 s6, s2, 16
-; GISEL-NEXT:    s_or_b32 s0, s4, s0
-; GISEL-NEXT:    s_lshl_b32 s4, s5, 16
-; GISEL-NEXT:    s_and_b32 s1, s1, 0xffff
-; GISEL-NEXT:    s_lshr_b32 s7, s3, 16
-; GISEL-NEXT:    s_or_b32 s1, s4, s1
-; GISEL-NEXT:    s_lshl_b32 s4, s6, 16
-; GISEL-NEXT:    s_and_b32 s2, s2, 0xffff
-; GISEL-NEXT:    s_or_b32 s2, s4, s2
-; GISEL-NEXT:    s_lshl_b32 s4, s7, 16
-; GISEL-NEXT:    s_and_b32 s3, s3, 0xffff
-; GISEL-NEXT:    s_or_b32 s3, s4, s3
-; GISEL-NEXT:    s_lshr_b32 s4, s16, 16
-; GISEL-NEXT:    s_lshr_b32 s5, s17, 16
-; GISEL-NEXT:    s_lshl_b32 s4, s4, 16
-; GISEL-NEXT:    s_and_b32 s12, s16, 0xffff
-; GISEL-NEXT:    s_lshr_b32 s6, s18, 16
-; GISEL-NEXT:    s_or_b32 s4, s4, s12
-; GISEL-NEXT:    s_lshl_b32 s5, s5, 16
-; GISEL-NEXT:    s_and_b32 s12, s17, 0xffff
-; GISEL-NEXT:    s_lshr_b32 s7, s19, 16
-; GISEL-NEXT:    s_or_b32 s5, s5, s12
-; GISEL-NEXT:    s_lshl_b32 s6, s6, 16
-; GISEL-NEXT:    s_and_b32 s12, s18, 0xffff
-; GISEL-NEXT:    s_lshr_b32 s8, s20, 16
-; GISEL-NEXT:    s_or_b32 s6, s6, s12
-; GISEL-NEXT:    s_lshl_b32 s7, s7, 16
-; GISEL-NEXT:    s_and_b32 s12, s19, 0xffff
-; GISEL-NEXT:    s_lshr_b32 s9, s21, 16
-; GISEL-NEXT:    s_or_b32 s7, s7, s12
-; GISEL-NEXT:    s_lshl_b32 s8, s8, 16
-; GISEL-NEXT:    s_and_b32 s12, s20, 0xffff
-; GISEL-NEXT:    v_mov_b64_e32 v[14:15], s[2:3]
-; GISEL-NEXT:    s_lshr_b32 s10, s22, 16
-; GISEL-NEXT:    s_or_b32 s8, s8, s12
-; GISEL-NEXT:    s_lshl_b32 s9, s9, 16
-; GISEL-NEXT:    s_and_b32 s12, s21, 0xffff
-; GISEL-NEXT:    v_mov_b64_e32 v[12:13], s[0:1]
-; GISEL-NEXT:    s_lshr_b32 s11, s23, 16
-; GISEL-NEXT:    s_or_b32 s9, s9, s12
-; GISEL-NEXT:    s_lshl_b32 s10, s10, 16
-; GISEL-NEXT:    s_and_b32 s12, s22, 0xffff
-; GISEL-NEXT:    s_or_b32 s10, s10, s12
-; GISEL-NEXT:    s_lshl_b32 s11, s11, 16
-; GISEL-NEXT:    s_and_b32 s12, s23, 0xffff
-; GISEL-NEXT:    s_or_b32 s11, s11, s12
-; GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[4:5]
-; GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[24:25]
-; GISEL-NEXT:    v_mov_b64_e32 v[6:7], s[6:7]
-; GISEL-NEXT:    v_mov_b64_e32 v[8:9], s[8:9]
-; GISEL-NEXT:    v_mov_b64_e32 v[10:11], s[10:11]
-; GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[26:27]
-; GISEL-NEXT:    v_mov_b32_e32 v16, s28
-; GISEL-NEXT:    s_nop 1
-; GISEL-NEXT:    v_smfmac_f32_16x16x64_bf16 v[0:3], v[12:15], v[4:11], v16
-; GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GCN-LABEL: test_smfmac_f32_16x16x64_bf16__sgpr:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_mov_b32_e32 v8, s0
+; GCN-NEXT:    v_mov_b32_e32 v9, s1
+; GCN-NEXT:    v_mov_b32_e32 v10, s2
+; GCN-NEXT:    v_mov_b32_e32 v11, s3
+; GCN-NEXT:    v_mov_b32_e32 v0, s16
+; GCN-NEXT:    v_mov_b32_e32 v1, s17
+; GCN-NEXT:    v_mov_b32_e32 v2, s18
+; GCN-NEXT:    v_mov_b32_e32 v3, s19
+; GCN-NEXT:    v_mov_b32_e32 v4, s20
+; GCN-NEXT:    v_mov_b32_e32 v5, s21
+; GCN-NEXT:    v_mov_b32_e32 v6, s22
+; GCN-NEXT:    v_mov_b32_e32 v7, s23
+; GCN-NEXT:    v_accvgpr_write_b32 a0, s24
+; GCN-NEXT:    v_accvgpr_write_b32 a1, s25
+; GCN-NEXT:    v_accvgpr_write_b32 a2, s26
+; GCN-NEXT:    v_accvgpr_write_b32 a3, s27
+; GCN-NEXT:    v_mov_b32_e32 v12, s28
+; GCN-NEXT:    s_nop 1
+; GCN-NEXT:    v_smfmac_f32_16x16x64_bf16 a[0:3], v[8:11], v[0:7], v12
+; GCN-NEXT:    s_nop 6
+; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
+; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
+; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
+; GCN-NEXT:    v_accvgpr_read_b32 v3, a3
+; GCN-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.bf16(<8 x bfloat> %arg0, <16 x bfloat> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
   ret <4 x float> %result
 }
@@ -960,71 +762,38 @@ define <4 x float> @test_smfmac_f32_16x16x64_bf16__sgpr(<8 x bfloat> inreg %arg0
 declare <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.bf16(<8 x bfloat>, <16 x bfloat>, <16 x float>, i32, i32 immarg, i32 immarg)
 
 define amdgpu_kernel void @test_smfmac_f32_32x32x32_bf16__vgpr(ptr addrspace(1) %arg, <8 x bfloat> %a, <16 x bfloat> %b, i32 %idx) #0 {
-; SDAG-LABEL: test_smfmac_f32_32x32x32_bf16__vgpr:
-; SDAG:       ; %bb.0: ; %bb
-; SDAG-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
-; SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x34
-; SDAG-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
-; SDAG-NEXT:    v_lshlrev_b32_e32 v16, 6, v0
-; SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; SDAG-NEXT:    global_load_dwordx4 v[12:15], v16, s[6:7] offset:48
-; SDAG-NEXT:    global_load_dwordx4 v[8:11], v16, s[6:7] offset:32
-; SDAG-NEXT:    global_load_dwordx4 v[4:7], v16, s[6:7] offset:16
-; SDAG-NEXT:    global_load_dwordx4 v[0:3], v16, s[6:7]
-; SDAG-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x44
-; SDAG-NEXT:    s_load_dword s16, s[4:5], 0x64
-; SDAG-NEXT:    v_mov_b64_e32 v[26:27], s[2:3]
-; SDAG-NEXT:    v_mov_b64_e32 v[24:25], s[0:1]
-; SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; SDAG-NEXT:    v_mov_b64_e32 v[22:23], s[14:15]
-; SDAG-NEXT:    v_mov_b64_e32 v[20:21], s[12:13]
-; SDAG-NEXT:    v_mov_b64_e32 v[18:19], s[10:11]
-; SDAG-NEXT:    v_mov_b64_e32 v[16:17], s[8:9]
-; SDAG-NEXT:    v_mov_b32_e32 v28, s16
-; SDAG-NEXT:    s_waitcnt vmcnt(0)
-; SDAG-NEXT:    s_nop 0
-; SDAG-NEXT:    v_smfmac_f32_32x32x32_bf16 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2
-; SDAG-NEXT:    v_mov_b32_e32 v16, 0
-; SDAG-NEXT:    s_nop 7
-; SDAG-NEXT:    s_nop 1
-; SDAG-NEXT:    global_store_dwordx4 v16, v[8:11], s[6:7] offset:32
-; SDAG-NEXT:    global_store_dwordx4 v16, v[12:15], s[6:7] offset:48
-; SDAG-NEXT:    global_store_dwordx4 v16, v[0:3], s[6:7]
-; SDAG-NEXT:    global_store_dwordx4 v16, v[4:7], s[6:7] offset:16
-; SDAG-NEXT:    s_endpgm
-;
-; GISEL-LABEL: test_smfmac_f32_32x32x32_bf16__vgpr:
-; GISEL:       ; %bb.0: ; %bb
-; GISEL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
-; GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x34
-; GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
-; GISEL-NEXT:    v_lshlrev_b32_e32 v16, 6, v0
-; GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GISEL-NEXT:    global_load_dwordx4 v[0:3], v16, s[6:7]
-; GISEL-NEXT:    global_load_dwordx4 v[4:7], v16, s[6:7] offset:16
-; GISEL-NEXT:    global_load_dwordx4 v[8:11], v16, s[6:7] offset:32
-; GISEL-NEXT:    global_load_dwordx4 v[12:15], v16, s[6:7] offset:48
-; GISEL-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x44
-; GISEL-NEXT:    s_load_dword s16, s[4:5], 0x64
-; GISEL-NEXT:    v_mov_b64_e32 v[26:27], s[2:3]
-; GISEL-NEXT:    v_mov_b64_e32 v[24:25], s[0:1]
-; GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GISEL-NEXT:    v_mov_b64_e32 v[22:23], s[14:15]
-; GISEL-NEXT:    v_mov_b64_e32 v[20:21], s[12:13]
-; GISEL-NEXT:    v_mov_b64_e32 v[18:19], s[10:11]
-; GISEL-NEXT:    v_mov_b64_e32 v[16:17], s[8:9]
-; GISEL-NEXT:    v_mov_b32_e32 v28, s16
-; GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GISEL-NEXT:    s_nop 0
-; GISEL-NEXT:    v_smfmac_f32_32x32x32_bf16 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2
-; GISEL-NEXT:    v_mov_b32_e32 v16, 0
-; GISEL-NEXT:    s_nop 7
-; GISEL-NEXT:    s_nop 1
-; GISEL-NEXT:    global_store_dwordx4 v16, v[0:3], s[6:7]
-; GISEL-NEXT:    global_store_dwordx4 v16, v[4:7], s[6:7] offset:16
-; GISEL-NEXT:    global_store_dwordx4 v16, v[8:11], s[6:7] offset:32
-; GISEL-NEXT:    global_store_dwordx4 v16, v[12:15], s[6:7] offset:48
-; GISEL-NEXT:    s_endpgm
+; GCN-LABEL: test_smfmac_f32_32x32x32_bf16__vgpr:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x34
+; GCN-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GCN-NEXT:    v_lshlrev_b32_e32 v16, 6, v0
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    global_load_dwordx4 v[12:15], v16, s[6:7] offset:48
+; GCN-NEXT:    global_load_dwordx4 v[8:11], v16, s[6:7] offset:32
+; GCN-NEXT:    global_load_dwordx4 v[4:7], v16, s[6:7] offset:16
+; GCN-NEXT:    global_load_dwordx4 v[0:3], v16, s[6:7]
+; GCN-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x44
+; GCN-NEXT:    s_load_dword s16, s[4:5], 0x64
+; GCN-NEXT:    v_mov_b64_e32 v[26:27], s[2:3]
+; GCN-NEXT:    v_mov_b64_e32 v[24:25], s[0:1]
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    v_mov_b64_e32 v[22:23], s[14:15]
+; GCN-NEXT:    v_mov_b64_e32 v[20:21], s[12:13]
+; GCN-NEXT:    v_mov_b64_e32 v[18:19], s[10:11]
+; GCN-NEXT:    v_mov_b64_e32 v[16:17], s[8:9]
+; GCN-NEXT:    v_mov_b32_e32 v28, s16
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    s_nop 0
+; GCN-NEXT:    v_smfmac_f32_32x32x32_bf16 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2
+; GCN-NEXT:    v_mov_b32_e32 v16, 0
+; GCN-NEXT:    s_nop 7
+; GCN-NEXT:    s_nop 1
+; GCN-NEXT:    global_store_dwordx4 v16, v[8:11], s[6:7] offset:32
+; GCN-NEXT:    global_store_dwordx4 v16, v[12:15], s[6:7] offset:48
+; GCN-NEXT:    global_store_dwordx4 v16, v[0:3], s[6:7]
+; GCN-NEXT:    global_store_dwordx4 v16, v[4:7], s[6:7] offset:16
+; GCN-NEXT:    s_endpgm
 bb:
   %id = call i32 @llvm.amdgcn.workitem.id.x()
   %gep = getelementptr <16 x float>, ptr addrspace(1) %arg, i32 %id
@@ -1035,448 +804,209 @@ bb:
 }
 
 define <16 x float> @test_smfmac_f32_32x32x32_bf16(<8 x bfloat> %arg0, <16 x bfloat> %arg1, <16 x float> %arg2, i32 %arg3) {
-; SDAG-LABEL: test_smfmac_f32_32x32x32_bf16:
-; SDAG:       ; %bb.0:
-; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_accvgpr_write_b32 a0, v12
-; SDAG-NEXT:    v_accvgpr_write_b32 a1, v13
-; SDAG-NEXT:    v_accvgpr_write_b32 a2, v14
-; SDAG-NEXT:    v_accvgpr_write_b32 a3, v15
-; SDAG-NEXT:    v_accvgpr_write_b32 a4, v16
-; SDAG-NEXT:    v_accvgpr_write_b32 a5, v17
-; SDAG-NEXT:    v_accvgpr_write_b32 a6, v18
-; SDAG-NEXT:    v_accvgpr_write_b32 a7, v19
-; SDAG-NEXT:    v_accvgpr_write_b32 a8, v20
-; SDAG-NEXT:    v_accvgpr_write_b32 a9, v21
-; SDAG-NEXT:    v_accvgpr_write_b32 a10, v22
-; SDAG-NEXT:    v_accvgpr_write_b32 a11, v23
-; SDAG-NEXT:    v_accvgpr_write_b32 a12, v24
-; SDAG-NEXT:    v_accvgpr_write_b32 a13, v25
-; SDAG-NEXT:    v_accvgpr_write_b32 a14, v26
-; SDAG-NEXT:    v_accvgpr_write_b32 a15, v27
-; SDAG-NEXT:    s_nop 1
-; SDAG-NEXT:    v_smfmac_f32_32x32x32_bf16 a[0:15], v[0:3], v[4:11], v28
-; SDAG-NEXT:    s_nop 7
-; SDAG-NEXT:    s_nop 2
-; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT:    v_accvgpr_read_b32 v4, a4
-; SDAG-NEXT:    v_accvgpr_read_b32 v5, a5
-; SDAG-NEXT:    v_accvgpr_read_b32 v6, a6
-; SDAG-NEXT:    v_accvgpr_read_b32 v7, a7
-; SDAG-NEXT:    v_accvgpr_read_b32 v8, a8
-; SDAG-NEXT:    v_accvgpr_read_b32 v9, a9
-; SDAG-NEXT:    v_accvgpr_read_b32 v10, a10
-; SDAG-NEXT:    v_accvgpr_read_b32 v11, a11
-; SDAG-NEXT:    v_accvgpr_read_b32 v12, a12
-; SDAG-NEXT:    v_accvgpr_read_b32 v13, a13
-; SDAG-NEXT:    v_accvgpr_read_b32 v14, a14
-; SDAG-NEXT:    v_accvgpr_read_b32 v15, a15
-; SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: test_smfmac_f32_32x32x32_bf16:
-; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    v_mov_b32_e32 v48, v0
-; GISEL-NEXT:    v_mov_b32_e32 v49, v1
-; GISEL-NEXT:    v_mov_b32_e32 v50, v2
-; GISEL-NEXT:    v_mov_b32_e32 v51, v3
-; GISEL-NEXT:    v_lshrrev_b32_e32 v0, 16, v48
-; GISEL-NEXT:    v_lshrrev_b32_e32 v1, 16, v49
-; GISEL-NEXT:    v_lshrrev_b32_e32 v2, 16, v50
-; GISEL-NEXT:    v_lshrrev_b32_e32 v3, 16, v51
-; GISEL-NEXT:    v_mov_b32_sdwa v48, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GISEL-NEXT:    v_mov_b32_sdwa v49, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GISEL-NEXT:    v_mov_b32_sdwa v50, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GISEL-NEXT:    v_mov_b32_sdwa v51, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GISEL-NEXT:    v_mov_b32_e32 v30, v4
-; GISEL-NEXT:    v_mov_b32_e32 v31, v5
-; GISEL-NEXT:    v_mov_b32_e32 v32, v6
-; GISEL-NEXT:    v_mov_b32_e32 v33, v7
-; GISEL-NEXT:    v_mov_b32_e32 v34, v8
-; GISEL-NEXT:    v_mov_b32_e32 v35, v9
-; GISEL-NEXT:    v_mov_b32_e32 v36, v10
-; GISEL-NEXT:    v_mov_b32_e32 v37, v11
-; GISEL-NEXT:    v_lshrrev_b32_e32 v0, 16, v30
-; GISEL-NEXT:    v_lshrrev_b32_e32 v1, 16, v31
-; GISEL-NEXT:    v_lshrrev_b32_e32 v2, 16, v32
-; GISEL-NEXT:    v_lshrrev_b32_e32 v3, 16, v33
-; GISEL-NEXT:    v_lshrrev_b32_e32 v4, 16, v34
-; GISEL-NEXT:    v_lshrrev_b32_e32 v5, 16, v35
-; GISEL-NEXT:    v_lshrrev_b32_e32 v6, 16, v36
-; GISEL-NEXT:    v_lshrrev_b32_e32 v7, 16, v37
-; GISEL-NEXT:    v_mov_b32_sdwa v30, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GISEL-NEXT:    v_mov_b32_sdwa v31, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GISEL-NEXT:    v_mov_b32_sdwa v32, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GISEL-NEXT:    v_mov_b32_sdwa v33, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GISEL-NEXT:    v_mov_b32_sdwa v34, v4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GISEL-NEXT:    v_mov_b32_sdwa v35, v5 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GISEL-NEXT:    v_mov_b32_sdwa v36, v6 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GISEL-NEXT:    v_mov_b32_sdwa v37, v7 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GISEL-NEXT:    v_mov_b64_e32 v[0:1], v[12:13]
-; GISEL-NEXT:    v_mov_b64_e32 v[2:3], v[14:15]
-; GISEL-NEXT:    v_mov_b64_e32 v[4:5], v[16:17]
-; GISEL-NEXT:    v_mov_b64_e32 v[6:7], v[18:19]
-; GISEL-NEXT:    v_mov_b64_e32 v[8:9], v[20:21]
-; GISEL-NEXT:    v_mov_b64_e32 v[10:11], v[22:23]
-; GISEL-NEXT:    v_mov_b64_e32 v[12:13], v[24:25]
-; GISEL-NEXT:    v_mov_b64_e32 v[14:15], v[26:27]
-; GISEL-NEXT:    s_nop 1
-; GISEL-NEXT:    v_smfmac_f32_32x32x32_bf16 v[0:15], v[48:51], v[30:37], v28
-; GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GCN-LABEL: test_smfmac_f32_32x32x32_bf16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_accvgpr_write_b32 a0, v12
+; GCN-NEXT:    v_accvgpr_write_b32 a1, v13
+; GCN-NEXT:    v_accvgpr_write_b32 a2, v14
+; GCN-NEXT:    v_accvgpr_write_b32 a3, v15
+; GCN-NEXT:    v_accvgpr_write_b32 a4, v16
+; GCN-NEXT:    v_accvgpr_write_b32 a5, v17
+; GCN-NEXT:    v_accvgpr_write_b32 a6, v18
+; GCN-NEXT:    v_accvgpr_write_b32 a7, v19
+; GCN-NEXT:    v_accvgpr_write_b32 a8, v20
+; GCN-NEXT:    v_accvgpr_write_b32 a9, v21
+; GCN-NEXT:    v_accvgpr_write_b32 a10, v22
+; GCN-NEXT:    v_accvgpr_write_b32 a11, v23
+; GCN-NEXT:    v_accvgpr_write_b32 a12, v24
+; GCN-NEXT:    v_accvgpr_write_b32 a13, v25
+; GCN-NEXT:    v_accvgpr_write_b32 a14, v26
+; GCN-NEXT:    v_accvgpr_write_b32 a15, v27
+; GCN-NEXT:    s_nop 1
+; GCN-NEXT:    v_smfmac_f32_32x32x32_bf16 a[0:15], v[0:3], v[4:11], v28
+; GCN-NEXT:    s_nop 7
+; GCN-NEXT:    s_nop 2
+; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
+; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
+; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
+; GCN-NEXT:    v_accvgpr_read_b32 v3, a3
+; GCN-NEXT:    v_accvgpr_read_b32 v4, a4
+; GCN-NEXT:    v_accvgpr_read_b32 v5, a5
+; GCN-NEXT:    v_accvgpr_read_b32 v6, a6
+; GCN-NEXT:    v_accvgpr_read_b32 v7, a7
+; GCN-NEXT:    v_accvgpr_read_b32 v8, a8
+; GCN-NEXT:    v_accvgpr_read_b32 v9, a9
+; GCN-NEXT:    v_accvgpr_read_b32 v10, a10
+; GCN-NEXT:    v_accvgpr_read_b32 v11, a11
+; GCN-NEXT:    v_accvgpr_read_b32 v12, a12
+; GCN-NEXT:    v_accvgpr_read_b32 v13, a13
+; GCN-NEXT:    v_accvgpr_read_b32 v14, a14
+; GCN-NEXT:    v_accvgpr_read_b32 v15, a15
+; GCN-NEXT:    s_setpc_b64 s[30:31]
   %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.bf16(<8 x bfloat> %arg0, <16 x bfloat> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
   ret <16 x float> %result
 }
 
 define <16 x float> @test_smfmac_f32_32x32x32_bf16__flags0(<8 x bfloat> %arg0, <16 x bfloat> %arg1, <16 x float> %arg2, i32 %arg3) {
-; SDAG-LABEL: test_smfmac_f32_32x32x32_bf16__flags0:
-; SDAG:       ; %bb.0:
-; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_accvgpr_write_b32 a0, v12
-; SDAG-NEXT:    v_accvgpr_write_b32 a1, v13
-; SDAG-NEXT:    v_accvgpr_write_b32 a2, v14
-; SDAG-NEXT:    v_accvgpr_write_b32 a3, v15
-; SDAG-NEXT:    v_accvgpr_write_b32 a4, v16
-; SDAG-NEXT:    v_accvgpr_write_b32 a5, v17
-; SDAG-NEXT:    v_accvgpr_write_b32 a6, v18
-; SDAG-NEXT:    v_accvgpr_write_b32 a7, v19
-; SDAG-NEXT:    v_accvgpr_write_b32 a8, v20
-; SDAG-NEXT:    v_accvgpr_write_b32 a9, v21
-; SDAG-NEXT:    v_accvgpr_write_b32 a10, v22
-; SDAG-NEXT:    v_accvgpr_write_b32 a11, v23
-; SDAG-NEXT:    v_accvgpr_write_b32 a12, v24
-; SDAG-NEXT:    v_accvgpr_write_b32 a13, v25
-; SDAG-NEXT:    v_accvgpr_write_b32 a14, v26
-; SDAG-NEXT:    v_accvgpr_write_b32 a15, v27
-; SDAG-NEXT:    s_nop 1
-; SDAG-NEXT:    v_smfmac_f32_32x32x32_bf16 a[0:15], v[0:3], v[4:11], v28 cbsz:1 abid:3
-; SDAG-NEXT:    s_nop 7
-; SDAG-NEXT:    s_nop 2
-; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT:    v_accvgpr_read_b32 v4, a4
-; SDAG-NEXT:    v_accvgpr_read_b32 v5, a5
-; SDAG-NEXT:    v_accvgpr_read_b32 v6, a6
-; SDAG-NEXT:    v_accvgpr_read_b32 v7, a7
-; SDAG-NEXT:    v_accvgpr_read_b32 v8, a8
-; SDAG-NEXT:    v_accvgpr_read_b32 v9, a9
-; SDAG-NEXT:    v_accvgpr_read_b32 v10, a10
-; SDAG-NEXT:    v_accvgpr_read_b32 v11, a11
-; SDAG-NEXT:    v_accvgpr_read_b32 v12, a12
-; SDAG-NEXT:    v_accvgpr_read_b32 v13, a13
-; SDAG-NEXT:    v_accvgpr_read_b32 v14, a14
-; SDAG-NEXT:    v_accvgpr_read_b32 v15, a15
-; SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: test_smfmac_f32_32x32x32_bf16__flags0:
-; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    v_mov_b32_e32 v48, v0
-; GISEL-NEXT:    v_mov_b32_e32 v49, v1
-; GISEL-NEXT:    v_mov_b32_e32 v50, v2
-; GISEL-NEXT:    v_mov_b32_e32 v51, v3
-; GISEL-NEXT:    v_lshrrev_b32_e32 v0, 16, v48
-; GISEL-NEXT:    v_lshrrev_b32_e32 v1, 16, v49
-; GISEL-NEXT:    v_lshrrev_b32_e32 v2, 16, v50
-; GISEL-NEXT:    v_lshrrev_b32_e32 v3, 16, v51
-; GISEL-NEXT:    v_mov_b32_sdwa v48, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GISEL-NEXT:    v_mov_b32_sdwa v49, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GISEL-NEXT:    v_mov_b32_sdwa v50, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GISEL-NEXT:    v_mov_b32_sdwa v51, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GISEL-NEXT:    v_mov_b32_e32 v30, v4
-; GISEL-NEXT:    v_mov_b32_e32 v31, v5
-; GISEL-NEXT:    v_mov_b32_e32 v32, v6
-; GISEL-NEXT:    v_mov_b32_e32 v33, v7
-; GISEL-NEXT:    v_mov_b32_e32 v34, v8
-; GISEL-NEXT:    v_mov_b32_e32 v35, v9
-; GISEL-NEXT:    v_mov_b32_e32 v36, v10
-; GISEL-NEXT:    v_mov_b32_e32 v37, v11
-; GISEL-NEXT:    v_lshrrev_b32_e32 v0, 16, v30
-; GISEL-NEXT:    v_lshrrev_b32_e32 v1, 16, v31
-; GISEL-NEXT:    v_lshrrev_b32_e32 v2, 16, v32
-; GISEL-NEXT:    v_lshrrev_b32_e32 v3, 16, v33
-; GISEL-NEXT:    v_lshrrev_b32_e32 v4, 16, v34
-; GISEL-NEXT:    v_lshrrev_b32_e32 v5, 16, v35
-; GISEL-NEXT:    v_lshrrev_b32_e32 v6, 16, v36
-; GISEL-NEXT:    v_lshrrev_b32_e32 v7, 16, v37
-; GISEL-NEXT:    v_mov_b32_sdwa v30, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GISEL-NEXT:    v_mov_b32_sdwa v31, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GISEL-NEXT:    v_mov_b32_sdwa v32, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GISEL-NEXT:    v_mov_b32_sdwa v33, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GISEL-NEXT:    v_mov_b32_sdwa v34, v4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GISEL-NEXT:    v_mov_b32_sdwa v35, v5 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GISEL-NEXT:    v_mov_b32_sdwa v36, v6 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GISEL-NEXT:    v_mov_b32_sdwa v37, v7 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GISEL-NEXT:    v_mov_b64_e32 v[0:1], v[12:13]
-; GISEL-NEXT:    v_mov_b64_e32 v[2:3], v[14:15]
-; GISEL-NEXT:    v_mov_b64_e32 v[4:5], v[16:17]
-; GISEL-NEXT:    v_mov_b64_e32 v[6:7], v[18:19]
-; GISEL-NEXT:    v_mov_b64_e32 v[8:9], v[20:21]
-; GISEL-NEXT:    v_mov_b64_e32 v[10:11], v[22:23]
-; GISEL-NEXT:    v_mov_b64_e32 v[12:13], v[24:25]
-; GISEL-NEXT:    v_mov_b64_e32 v[14:15], v[26:27]
-; GISEL-NEXT:    s_nop 1
-; GISEL-NEXT:    v_smfmac_f32_32x32x32_bf16 v[0:15], v[48:51], v[30:37], v28 cbsz:1 abid:3
-; GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GCN-LABEL: test_smfmac_f32_32x32x32_bf16__flags0:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_accvgpr_write_b32 a0, v12
+; GCN-NEXT:    v_accvgpr_write_b32 a1, v13
+; GCN-NEXT:    v_accvgpr_write_b32 a2, v14
+; GCN-NEXT:    v_accvgpr_write_b32 a3, v15
+; GCN-NEXT:    v_accvgpr_write_b32 a4, v16
+; GCN-NEXT:    v_accvgpr_write_b32 a5, v17
+; GCN-NEXT:    v_accvgpr_write_b32 a6, v18
+; GCN-NEXT:    v_accvgpr_write_b32 a7, v19
+; GCN-NEXT:    v_accvgpr_write_b32 a8, v20
+; GCN-NEXT:    v_accvgpr_write_b32 a9, v21
+; GCN-NEXT:    v_accvgpr_write_b32 a10, v22
+; GCN-NEXT:    v_accvgpr_write_b32 a11, v23
+; GCN-NEXT:    v_accvgpr_write_b32 a12, v24
+; GCN-NEXT:    v_accvgpr_write_b32 a13, v25
+; GCN-NEXT:    v_accvgpr_write_b32 a14, v26
+; GCN-NEXT:    v_accvgpr_write_b32 a15, v27
+; GCN-NEXT:    s_nop 1
+; GCN-NEXT:    v_smfmac_f32_32x32x32_bf16 a[0:15], v[0:3], v[4:11], v28 cbsz:1 abid:3
+; GCN-NEXT:    s_nop 7
+; GCN-NEXT:    s_nop 2
+; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
+; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
+; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
+; GCN-NEXT:    v_accvgpr_read_b32 v3, a3
+; GCN-NEXT:    v_accvgpr_read_b32 v4, a4
+; GCN-NEXT:    v_accvgpr_read_b32 v5, a5
+; GCN-NEXT:    v_accvgpr_read_b32 v6, a6
+; GCN-NEXT:    v_accvgpr_read_b32 v7, a7
+; GCN-NEXT:    v_accvgpr_read_b32 v8, a8
+; GCN-NEXT:    v_accvgpr_read_b32 v9, a9
+; GCN-NEXT:    v_accvgpr_read_b32 v10, a10
+; GCN-NEXT:    v_accvgpr_read_b32 v11, a11
+; GCN-NEXT:    v_accvgpr_read_b32 v12, a12
+; GCN-NEXT:    v_accvgpr_read_b32 v13, a13
+; GCN-NEXT:    v_accvgpr_read_b32 v14, a14
+; GCN-NEXT:    v_accvgpr_read_b32 v15, a15
+; GCN-NEXT:    s_setpc_b64 s[30:31]
   %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.bf16(<8 x bfloat> %arg0, <16 x bfloat> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 1, i32 immarg 3)
   ret <16 x float> %result
 }
 
 define <16 x float> @test_smfmac_f32_32x32x32_bf16__flags1(<8 x bfloat> %arg0, <16 x bfloat> %arg1, <16 x float> %arg2, i32 %arg3) {
-; SDAG-LABEL: test_smfmac_f32_32x32x32_bf16__flags1:
-; SDAG:       ; %bb.0:
-; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_accvgpr_write_b32 a0, v12
-; SDAG-NEXT:    v_accvgpr_write_b32 a1, v13
-; SDAG-NEXT:    v_accvgpr_write_b32 a2, v14
-; SDAG-NEXT:    v_accvgpr_write_b32 a3, v15
-; SDAG-NEXT:    v_accvgpr_write_b32 a4, v16
-; SDAG-NEXT:    v_accvgpr_write_b32 a5, v17
-; SDAG-NEXT:    v_accvgpr_write_b32 a6, v18
-; SDAG-NEXT:    v_accvgpr_write_b32 a7, v19
-; SDAG-NEXT:    v_accvgpr_write_b32 a8, v20
-; SDAG-NEXT:    v_accvgpr_write_b32 a9, v21
-; SDAG-NEXT:    v_accvgpr_write_b32 a10, v22
-; SDAG-NEXT:    v_accvgpr_write_b32 a11, v23
-; SDAG-NEXT:    v_accvgpr_write_b32 a12, v24
-; SDAG-NEXT:    v_accvgpr_write_b32 a13, v25
-; SDAG-NEXT:    v_accvgpr_write_b32 a14, v26
-; SDAG-NEXT:    v_accvgpr_write_b32 a15, v27
-; SDAG-NEXT:    s_nop 1
-; SDAG-NEXT:    v_smfmac_f32_32x32x32_bf16 a[0:15], v[0:3], v[4:11], v28 cbsz:3 abid:1
-; SDAG-NEXT:    s_nop 7
-; SDAG-NEXT:    s_nop 2
-; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT:    v_accvgpr_read_b32 v4, a4
-; SDAG-NEXT:    v_accvgpr_read_b32 v5, a5
-; SDAG-NEXT:    v_accvgpr_read_b32 v6, a6
-; SDAG-NEXT:    v_accvgpr_read_b32 v7, a7
-; SDAG-NEXT:    v_accvgpr_read_b32 v8, a8
-; SDAG-NEXT:    v_accvgpr_read_b32 v9, a9
-; SDAG-NEXT:    v_accvgpr_read_b32 v10, a10
-; SDAG-NEXT:    v_accvgpr_read_b32 v11, a11
-; SDAG-NEXT:    v_accvgpr_read_b32 v12, a12
-; SDAG-NEXT:    v_accvgpr_read_b32 v13, a13
-; SDAG-NEXT:    v_accvgpr_read_b32 v14, a14
-; SDAG-NEXT:    v_accvgpr_read_b32 v15, a15
-; SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: test_smfmac_f32_32x32x32_bf16__flags1:
-; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    v_mov_b32_e32 v48, v0
-; GISEL-NEXT:    v_mov_b32_e32 v49, v1
-; GISEL-NEXT:    v_mov_b32_e32 v50, v2
-; GISEL-NEXT:    v_mov_b32_e32 v51, v3
-; GISEL-NEXT:    v_lshrrev_b32_e32 v0, 16, v48
-; GISEL-NEXT:    v_lshrrev_b32_e32 v1, 16, v49
-; GISEL-NEXT:    v_lshrrev_b32_e32 v2, 16, v50
-; GISEL-NEXT:    v_lshrrev_b32_e32 v3, 16, v51
-; GISEL-NEXT:    v_mov_b32_sdwa v48, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GISEL-NEXT:    v_mov_b32_sdwa v49, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GISEL-NEXT:    v_mov_b32_sdwa v50, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GISEL-NEXT:    v_mov_b32_sdwa v51, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GISEL-NEXT:    v_mov_b32_e32 v30, v4
-; GISEL-NEXT:    v_mov_b32_e32 v31, v5
-; GISEL-NEXT:    v_mov_b32_e32 v32, v6
-; GISEL-NEXT:    v_mov_b32_e32 v33, v7
-; GISEL-NEXT:    v_mov_b32_e32 v34, v8
-; GISEL-NEXT:    v_mov_b32_e32 v35, v9
-; GISEL-NEXT:    v_mov_b32_e32 v36, v10
-; GISEL-NEXT:    v_mov_b32_e32 v37, v11
-; GISEL-NEXT:    v_lshrrev_b32_e32 v0, 16, v30
-; GISEL-NEXT:    v_lshrrev_b32_e32 v1, 16, v31
-; GISEL-NEXT:    v_lshrrev_b32_e32 v2, 16, v32
-; GISEL-NEXT:    v_lshrrev_b32_e32 v3, 16, v33
-; GISEL-NEXT:    v_lshrrev_b32_e32 v4, 16, v34
-; GISEL-NEXT:    v_lshrrev_b32_e32 v5, 16, v35
-; GISEL-NEXT:    v_lshrrev_b32_e32 v6, 16, v36
-; GISEL-NEXT:    v_lshrrev_b32_e32 v7, 16, v37
-; GISEL-NEXT:    v_mov_b32_sdwa v30, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GISEL-NEXT:    v_mov_b32_sdwa v31, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GISEL-NEXT:    v_mov_b32_sdwa v32, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GISEL-NEXT:    v_mov_b32_sdwa v33, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GISEL-NEXT:    v_mov_b32_sdwa v34, v4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GISEL-NEXT:    v_mov_b32_sdwa v35, v5 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GISEL-NEXT:    v_mov_b32_sdwa v36, v6 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GISEL-NEXT:    v_mov_b32_sdwa v37, v7 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GISEL-NEXT:    v_mov_b64_e32 v[0:1], v[12:13]
-; GISEL-NEXT:    v_mov_b64_e32 v[2:3], v[14:15]
-; GISEL-NEXT:    v_mov_b64_e32 v[4:5], v[16:17]
-; GISEL-NEXT:    v_mov_b64_e32 v[6:7], v[18:19]
-; GISEL-NEXT:    v_mov_b64_e32 v[8:9], v[20:21]
-; GISEL-NEXT:    v_mov_b64_e32 v[10:11], v[22:23]
-; GISEL-NEXT:    v_mov_b64_e32 v[12:13], v[24:25]
-; GISEL-NEXT:    v_mov_b64_e32 v[14:15], v[26:27]
-; GISEL-NEXT:    s_nop 1
-; GISEL-NEXT:    v_smfmac_f32_32x32x32_bf16 v[0:15], v[48:51], v[30:37], v28 cbsz:3 abid:1
-; GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GCN-LABEL: test_smfmac_f32_32x32x32_bf16__flags1:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_accvgpr_write_b32 a0, v12
+; GCN-NEXT:    v_accvgpr_write_b32 a1, v13
+; GCN-NEXT:    v_accvgpr_write_b32 a2, v14
+; GCN-NEXT:    v_accvgpr_write_b32 a3, v15
+; GCN-NEXT:    v_accvgpr_write_b32 a4, v16
+; GCN-NEXT:    v_accvgpr_write_b32 a5, v17
+; GCN-NEXT:    v_accvgpr_write_b32 a6, v18
+; GCN-NEXT:    v_accvgpr_write_b32 a7, v19
+; GCN-NEXT:    v_accvgpr_write_b32 a8, v20
+; GCN-NEXT:    v_accvgpr_write_b32 a9, v21
+; GCN-NEXT:    v_accvgpr_write_b32 a10, v22
+; GCN-NEXT:    v_accvgpr_write_b32 a11, v23
+; GCN-NEXT:    v_accvgpr_write_b32 a12, v24
+; GCN-NEXT:    v_accvgpr_write_b32 a13, v25
+; GCN-NEXT:    v_accvgpr_write_b32 a14, v26
+; GCN-NEXT:    v_accvgpr_write_b32 a15, v27
+; GCN-NEXT:    s_nop 1
+; GCN-NEXT:    v_smfmac_f32_32x32x32_bf16 a[0:15], v[0:3], v[4:11], v28 cbsz:3 abid:1
+; GCN-NEXT:    s_nop 7
+; GCN-NEXT:    s_nop 2
+; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
+; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
+; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
+; GCN-NEXT:    v_accvgpr_read_b32 v3, a3
+; GCN-NEXT:    v_accvgpr_read_b32 v4, a4
+; GCN-NEXT:    v_accvgpr_read_b32 v5, a5
+; GCN-NEXT:    v_accvgpr_read_b32 v6, a6
+; GCN-NEXT:    v_accvgpr_read_b32 v7, a7
+; GCN-NEXT:    v_accvgpr_read_b32 v8, a8
+; GCN-NEXT:    v_accvgpr_read_b32 v9, a9
+; GCN-NEXT:    v_accvgpr_read_b32 v10, a10
+; GCN-NEXT:    v_accvgpr_read_b32 v11, a11
+; GCN-NEXT:    v_accvgpr_read_b32 v12, a12
+; GCN-NEXT:    v_accvgpr_read_b32 v13, a13
+; GCN-NEXT:    v_accvgpr_read_b32 v14, a14
+; GCN-NEXT:    v_accvgpr_read_b32 v15, a15
+; GCN-NEXT:    s_setpc_b64 s[30:31]
   %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.bf16(<8 x bfloat> %arg0, <16 x bfloat> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 3, i32 immarg 1)
   ret <16 x float> %result
 }
 
 define <16 x float> @test_smfmac_f32_32x32x32_bf16__sgpr(<8 x bfloat> inreg %arg0, <16 x bfloat> inreg %arg1, <16 x float> inreg %arg2, i32 inreg %arg3) {
-; SDAG-LABEL: test_smfmac_f32_32x32x32_bf16__sgpr:
-; SDAG:       ; %bb.0:
-; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_mov_b32_e32 v28, s0
-; SDAG-NEXT:    v_mov_b32_e32 v29, s1
-; SDAG-NEXT:    v_mov_b32_e32 v30, s2
-; SDAG-NEXT:    v_mov_b32_e32 v31, s3
-; SDAG-NEXT:    v_mov_b32_e32 v12, s24
-; SDAG-NEXT:    v_mov_b32_e32 v27, v9
-; SDAG-NEXT:    v_mov_b32_e32 v26, v8
-; SDAG-NEXT:    v_mov_b32_e32 v25, v7
-; SDAG-NEXT:    v_mov_b32_e32 v24, v6
-; SDAG-NEXT:    v_mov_b32_e32 v23, v5
-; SDAG-NEXT:    v_mov_b32_e32 v22, v4
-; SDAG-NEXT:    v_mov_b32_e32 v21, v3
-; SDAG-NEXT:    v_mov_b32_e32 v20, v2
-; SDAG-NEXT:    v_mov_b32_e32 v19, v1
-; SDAG-NEXT:    v_mov_b32_e32 v18, v0
-; SDAG-NEXT:    v_mov_b32_e32 v13, s25
-; SDAG-NEXT:    v_mov_b32_e32 v14, s26
-; SDAG-NEXT:    v_mov_b32_e32 v15, s27
-; SDAG-NEXT:    v_mov_b32_e32 v16, s28
-; SDAG-NEXT:    v_mov_b32_e32 v17, s29
-; SDAG-NEXT:    v_accvgpr_write_b32 a0, v12
-; SDAG-NEXT:    v_mov_b32_e32 v0, s16
-; SDAG-NEXT:    v_mov_b32_e32 v1, s17
-; SDAG-NEXT:    v_mov_b32_e32 v2, s18
-; SDAG-NEXT:    v_mov_b32_e32 v3, s19
-; SDAG-NEXT:    v_mov_b32_e32 v4, s20
-; SDAG-NEXT:    v_mov_b32_e32 v5, s21
-; SDAG-NEXT:    v_mov_b32_e32 v6, s22
-; SDAG-NEXT:    v_mov_b32_e32 v7, s23
-; SDAG-NEXT:    v_accvgpr_write_b32 a1, v13
-; SDAG-NEXT:    v_accvgpr_write_b32 a2, v14
-; SDAG-NEXT:    v_accvgpr_write_b32 a3, v15
-; SDAG-NEXT:    v_accvgpr_write_b32 a4, v16
-; SDAG-NEXT:    v_accvgpr_write_b32 a5, v17
-; SDAG-NEXT:    v_accvgpr_write_b32 a6, v18
-; SDAG-NEXT:    v_accvgpr_write_b32 a7, v19
-; SDAG-NEXT:    v_accvgpr_write_b32 a8, v20
-; SDAG-NEXT:    v_accvgpr_write_b32 a9, v21
-; SDAG-NEXT:    v_accvgpr_write_b32 a10, v22
-; SDAG-NEXT:    v_accvgpr_write_b32 a11, v23
-; SDAG-NEXT:    v_accvgpr_write_b32 a12, v24
-; SDAG-NEXT:    v_accvgpr_write_b32 a13, v25
-; SDAG-NEXT:    v_accvgpr_write_b32 a14, v26
-; SDAG-NEXT:    v_accvgpr_write_b32 a15, v27
-; SDAG-NEXT:    s_nop 1
-; SDAG-NEXT:    v_smfmac_f32_32x32x32_bf16 a[0:15], v[28:31], v[0:7], v10
-; SDAG-NEXT:    s_nop 7
-; SDAG-NEXT:    s_nop 2
-; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT:    v_accvgpr_read_b32 v4, a4
-; SDAG-NEXT:    v_accvgpr_read_b32 v5, a5
-; SDAG-NEXT:    v_accvgpr_read_b32 v6, a6
-; SDAG-NEXT:    v_accvgpr_read_b32 v7, a7
-; SDAG-NEXT:    v_accvgpr_read_b32 v8, a8
-; SDAG-NEXT:    v_accvgpr_read_b32 v9, a9
-; SDAG-NEXT:    v_accvgpr_read_b32 v10, a10
-; SDAG-NEXT:    v_accvgpr_read_b32 v11, a11
-; SDAG-NEXT:    v_accvgpr_read_b32 v12, a12
-; SDAG-NEXT:    v_accvgpr_read_b32 v13, a13
-; SDAG-NEXT:    v_accvgpr_read_b32 v14, a14
-; SDAG-NEXT:    v_accvgpr_read_b32 v15, a15
-; SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: test_smfmac_f32_32x32x32_bf16__sgpr:
-; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    s_lshr_b32 s4, s0, 16
-; GISEL-NEXT:    s_lshr_b32 s5, s1, 16
-; GISEL-NEXT:    s_lshl_b32 s4, s4, 16
-; GISEL-NEXT:    s_and_b32 s0, s0, 0xffff
-; GISEL-NEXT:    s_lshr_b32 s6, s2, 16
-; GISEL-NEXT:    s_or_b32 s8, s4, s0
-; GISEL-NEXT:    s_lshl_b32 s0, s5, 16
-; GISEL-NEXT:    s_and_b32 s1, s1, 0xffff
-; GISEL-NEXT:    s_lshr_b32 s7, s3, 16
-; GISEL-NEXT:    s_or_b32 s9, s0, s1
-; GISEL-NEXT:    s_lshl_b32 s0, s6, 16
-; GISEL-NEXT:    s_and_b32 s1, s2, 0xffff
-; GISEL-NEXT:    s_or_b32 s10, s0, s1
-; GISEL-NEXT:    s_lshl_b32 s0, s7, 16
-; GISEL-NEXT:    s_and_b32 s1, s3, 0xffff
-; GISEL-NEXT:    s_or_b32 s11, s0, s1
-; GISEL-NEXT:    s_lshr_b32 s0, s16, 16
-; GISEL-NEXT:    s_lshr_b32 s1, s17, 16
-; GISEL-NEXT:    s_lshl_b32 s0, s0, 16
-; GISEL-NEXT:    s_and_b32 s12, s16, 0xffff
-; GISEL-NEXT:    s_lshr_b32 s2, s18, 16
-; GISEL-NEXT:    s_or_b32 s0, s0, s12
-; GISEL-NEXT:    s_lshl_b32 s1, s1, 16
-; GISEL-NEXT:    s_and_b32 s12, s17, 0xffff
-; GISEL-NEXT:    s_lshr_b32 s3, s19, 16
-; GISEL-NEXT:    s_or_b32 s1, s1, s12
-; GISEL-NEXT:    s_lshl_b32 s2, s2, 16
-; GISEL-NEXT:    s_and_b32 s12, s18, 0xffff
-; GISEL-NEXT:    s_lshr_b32 s4, s20, 16
-; GISEL-NEXT:    s_or_b32 s2, s2, s12
-; GISEL-NEXT:    s_lshl_b32 s3, s3, 16
-; GISEL-NEXT:    s_and_b32 s12, s19, 0xffff
-; GISEL-NEXT:    s_lshr_b32 s5, s21, 16
-; GISEL-NEXT:    s_or_b32 s3, s3, s12
-; GISEL-NEXT:    s_lshl_b32 s4, s4, 16
-; GISEL-NEXT:    s_and_b32 s12, s20, 0xffff
-; GISEL-NEXT:    s_lshr_b32 s6, s22, 16
-; GISEL-NEXT:    s_or_b32 s4, s4, s12
-; GISEL-NEXT:    s_lshl_b32 s5, s5, 16
-; GISEL-NEXT:    s_and_b32 s12, s21, 0xffff
-; GISEL-NEXT:    v_mov_b64_e32 v[36:37], s[10:11]
-; GISEL-NEXT:    s_lshr_b32 s7, s23, 16
-; GISEL-NEXT:    s_or_b32 s5, s5, s12
-; GISEL-NEXT:    s_lshl_b32 s6, s6, 16
-; GISEL-NEXT:    s_and_b32 s12, s22, 0xffff
-; GISEL-NEXT:    v_mov_b64_e32 v[34:35], s[8:9]
-; GISEL-NEXT:    s_or_b32 s6, s6, s12
-; GISEL-NEXT:    s_lshl_b32 s7, s7, 16
-; GISEL-NEXT:    s_and_b32 s12, s23, 0xffff
-; GISEL-NEXT:    s_or_b32 s7, s7, s12
-; GISEL-NEXT:    v_mov_b32_e32 v18, s24
-; GISEL-NEXT:    v_mov_b32_e32 v19, s25
-; GISEL-NEXT:    v_mov_b32_e32 v24, v0
-; GISEL-NEXT:    v_mov_b32_e32 v25, v1
-; GISEL-NEXT:    v_mov_b32_e32 v26, v2
-; GISEL-NEXT:    v_mov_b32_e32 v27, v3
-; GISEL-NEXT:    v_mov_b32_e32 v28, v4
-; GISEL-NEXT:    v_mov_b32_e32 v29, v5
-; GISEL-NEXT:    v_mov_b32_e32 v30, v6
-; GISEL-NEXT:    v_mov_b32_e32 v31, v7
-; GISEL-NEXT:    v_mov_b32_e32 v32, v8
-; GISEL-NEXT:    v_mov_b32_e32 v33, v9
-; GISEL-NEXT:    v_mov_b32_e32 v16, v10
-; GISEL-NEXT:    v_mov_b32_e32 v20, s26
-; GISEL-NEXT:    v_mov_b32_e32 v21, s27
-; GISEL-NEXT:    v_mov_b32_e32 v22, s28
-; GISEL-NEXT:    v_mov_b32_e32 v23, s29
-; GISEL-NEXT:    v_mov_b64_e32 v[54:55], s[6:7]
-; GISEL-NEXT:    v_mov_b64_e32 v[0:1], v[18:19]
-; GISEL-NEXT:    v_mov_b64_e32 v[52:53], s[4:5]
-; GISEL-NEXT:    v_mov_b64_e32 v[50:51], s[2:3]
-; GISEL-NEXT:    v_mov_b64_e32 v[48:49], s[0:1]
-; GISEL-NEXT:    v_mov_b64_e32 v[2:3], v[20:21]
-; GISEL-NEXT:    v_mov_b64_e32 v[4:5], v[22:23]
-; GISEL-NEXT:    v_mov_b64_e32 v[6:7], v[24:25]
-; GISEL-NEXT:    v_mov_b64_e32 v[8:9], v[26:27]
-; GISEL-NEXT:    v_mov_b64_e32 v[10:11], v[28:29]
-; GISEL-NEXT:    v_mov_b64_e32 v[12:13], v[30:31]
-; GISEL-NEXT:    v_mov_b64_e32 v[14:15], v[32:33]
-; GISEL-NEXT:    s_nop 1
-; GISEL-NEXT:    v_smfmac_f32_32x32x32_bf16 v[0:15], v[34:37], v[48:55], v16
-; GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GCN-LABEL: test_smfmac_f32_32x32x32_bf16__sgpr:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_mov_b32_e32 v28, s0
+; GCN-NEXT:    v_mov_b32_e32 v29, s1
+; GCN-NEXT:    v_mov_b32_e32 v30, s2
+; GCN-NEXT:    v_mov_b32_e32 v31, s3
+; GCN-NEXT:    v_mov_b32_e32 v12, s24
+; GCN-NEXT:    v_mov_b32_e32 v27, v9
+; GCN-NEXT:    v_mov_b32_e32 v26, v8
+; GCN-NEXT:    v_mov_b32_e32 v25, v7
+; GCN-NEXT:    v_mov_b32_e32 v24, v6
+; GCN-NEXT:    v_mov_b32_e32 v23, v5
+; GCN-NEXT:    v_mov_b32_e32 v22, v4
+; GCN-NEXT:    v_mov_b32_e32 v21, v3
+; GCN-NEXT:    v_mov_b32_e32 v20, v2
+; GCN-NEXT:    v_mov_b32_e32 v19, v1
+; GCN-NEXT:    v_mov_b32_e32 v18, v0
+; GCN-NEXT:    v_mov_b32_e32 v13, s25
+; GCN-NEXT:    v_mov_b32_e32 v14, s26
+; GCN-NEXT:    v_mov_b32_e32 v15, s27
+; GCN-NEXT:    v_mov_b32_e32 v16, s28
+; GCN-NEXT:    v_mov_b32_e32 v17, s29
+; GCN-NEXT:    v_accvgpr_write_b32 a0, v12
+; GCN-NEXT:    v_mov_b32_e32 v0, s16
+; GCN-NEXT:    v_mov_b32_e32 v1, s17
+; GCN-NEXT:    v_mov_b32_e32 v2, s18
+; GCN-NEXT:    v_mov_b32_e32 v3, s19
+; GCN-NEXT:    v_mov_b32_e32 v4, s20
+; GCN-NEXT:    v_mov_b32_e32 v5, s21
+; GCN-NEXT:    v_mov_b32_e32 v6, s22
+; GCN-NEXT:    v_mov_b32_e32 v7, s23
+; GCN-NEXT:    v_accvgpr_write_b32 a1, v13
+; GCN-NEXT:    v_accvgpr_write_b32 a2, v14
+; GCN-NEXT:    v_accvgpr_write_b32 a3, v15
+; GCN-NEXT:    v_accvgpr_write_b32 a4, v16
+; GCN-NEXT:    v_accvgpr_write_b32 a5, v17
+; GCN-NEXT:    v_accvgpr_write_b32 a6, v18
+; GCN-NEXT:    v_accvgpr_write_b32 a7, v19
+; GCN-NEXT:    v_accvgpr_write_b32 a8, v20
+; GCN-NEXT:    v_accvgpr_write_b32 a9, v21
+; GCN-NEXT:    v_accvgpr_write_b32 a10, v22
+; GCN-NEXT:    v_accvgpr_write_b32 a11, v23
+; GCN-NEXT:    v_accvgpr_write_b32 a12, v24
+; GCN-NEXT:    v_accvgpr_write_b32 a13, v25
+; GCN-NEXT:    v_accvgpr_write_b32 a14, v26
+; GCN-NEXT:    v_accvgpr_write_b32 a15, v27
+; GCN-NEXT:    s_nop 1
+; GCN-NEXT:    v_smfmac_f32_32x32x32_bf16 a[0:15], v[28:31], v[0:7], v10
+; GCN-NEXT:    s_nop 7
+; GCN-NEXT:    s_nop 2
+; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
+; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
+; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
+; GCN-NEXT:    v_accvgpr_read_b32 v3, a3
+; GCN-NEXT:    v_accvgpr_read_b32 v4, a4
+; GCN-NEXT:    v_accvgpr_read_b32 v5, a5
+; GCN-NEXT:    v_accvgpr_read_b32 v6, a6
+; GCN-NEXT:    v_accvgpr_read_b32 v7, a7
+; GCN-NEXT:    v_accvgpr_read_b32 v8, a8
+; GCN-NEXT:    v_accvgpr_read_b32 v9, a9
+; GCN-NEXT:    v_accvgpr_read_b32 v10, a10
+; GCN-NEXT:    v_accvgpr_read_b32 v11, a11
+; GCN-NEXT:    v_accvgpr_read_b32 v12, a12
+; GCN-NEXT:    v_accvgpr_read_b32 v13, a13
+; GCN-NEXT:    v_accvgpr_read_b32 v14, a14
+; GCN-NEXT:    v_accvgpr_read_b32 v15, a15
+; GCN-NEXT:    s_setpc_b64 s[30:31]
   %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.bf16(<8 x bfloat> %arg0, <16 x bfloat> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
   ret <16 x float> %result
 }
@@ -4627,5 +4157,3 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_fp8__sgpr(<4 x i32> inreg %arg
 }
 
 attributes #0 = { "amdgpu-flat-work-group-size"="1,256" }
-;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; GCN: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.update.dpp.gfx90a.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.update.dpp.gfx90a.ll
index 0ca96d5a1eb19c..fa32ee108d3829 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.update.dpp.gfx90a.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.update.dpp.gfx90a.ll
@@ -1,8 +1,8 @@
 ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx90a < %s | FileCheck --check-prefixes=GCN,GFX90A %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx90a < %s | FileCheck --check-prefixes=GCN,GFX90A %s
+; RUN: llc -global-isel=1 -global-isel-abort=2 -mtriple=amdgcn -mcpu=gfx90a < %s | FileCheck --check-prefixes=GCN,GFX90A %s
 ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck --check-prefixes=GCN,GFX942 %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck --check-prefixes=GCN,GFX942 %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx9-4-generic --amdhsa-code-object-version=6 < %s | FileCheck --check-prefixes=GCN,GFX942 %s
+; RUN: llc -global-isel=1 -global-isel-abort=2 -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck --check-prefixes=GCN,GFX942 %s
+; RUN: llc -global-isel=1 -global-isel-abort=2 -mtriple=amdgcn -mcpu=gfx9-4-generic --amdhsa-code-object-version=6 < %s | FileCheck --check-prefixes=GCN,GFX942 %s
 
 ; DPP control value 337 is valid for 64-bit DPP on gfx942
 
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll
index cbc76a32a75e44..7342c366799e9c 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll
@@ -3,9 +3,9 @@
 ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX1010-SDAG %s
 ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx1100 -verify-machineinstrs -amdgpu-enable-vopd=0 < %s | FileCheck -check-prefixes=GFX1100-SDAG %s
 
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx802 -verify-machineinstrs -global-isel < %s | FileCheck -check-prefixes=GFX802-GISEL %s
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx1010 -verify-machineinstrs -global-isel < %s | FileCheck -check-prefixes=GFX1010-GISEL %s
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx1100 -verify-machineinstrs -amdgpu-enable-vopd=0 -global-isel < %s | FileCheck -check-prefixes=GFX1100-GISEL %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx802 -verify-machineinstrs -global-isel -global-isel-abort=2 < %s | FileCheck -check-prefixes=GFX802-GISEL %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx1010 -verify-machineinstrs -global-isel -global-isel-abort=2 < %s | FileCheck -check-prefixes=GFX1010-GISEL %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx1100 -verify-machineinstrs -amdgpu-enable-vopd=0 -global-isel -global-isel-abort=2 < %s | FileCheck -check-prefixes=GFX1100-GISEL %s
 
 declare i32 @llvm.amdgcn.writelane(i32, i32, i32) #0
 declare i64 @llvm.amdgcn.writelane.i64(i64, i32, i64) #0
@@ -2128,10 +2128,10 @@ define void @test_writelane_bfloat(ptr addrspace(1) %out, bfloat %src, i32 %src1
 ; GFX802-GISEL:       ; %bb.0:
 ; GFX802-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX802-GISEL-NEXT:    flat_load_ushort v4, v[0:1]
-; GFX802-GISEL-NEXT:    v_readfirstlane_b32 s5, v3
+; GFX802-GISEL-NEXT:    v_readfirstlane_b32 m0, v3
 ; GFX802-GISEL-NEXT:    v_readfirstlane_b32 s4, v2
-; GFX802-GISEL-NEXT:    s_mov_b32 m0, s5
 ; GFX802-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX802-GISEL-NEXT:    s_nop 1
 ; GFX802-GISEL-NEXT:    v_writelane_b32 v4, s4, m0
 ; GFX802-GISEL-NEXT:    flat_store_short v[0:1], v4
 ; GFX802-GISEL-NEXT:    s_waitcnt vmcnt(0)
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/irtranslator/vec-ret.ll b/llvm/test/CodeGen/RISCV/GlobalISel/irtranslator/vec-ret.ll
index 8e56942309ae89..ac4b5feb29594b 100644
--- a/llvm/test/CodeGen/RISCV/GlobalISel/irtranslator/vec-ret.ll
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/irtranslator/vec-ret.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-; RUN: llc -mtriple=riscv32 -mattr=+v,+zvfbfmin,+zvfh -global-isel -stop-after=irtranslator \
+; RUN: llc -mtriple=riscv32 -mattr=+v,+zvfbfmin,+zvfh -global-isel -global-isel-abort=2 -stop-after=irtranslator \
 ; RUN:   -verify-machineinstrs < %s | FileCheck -check-prefixes=RV32 %s
-; RUN: llc -mtriple=riscv64 -mattr=+v,+zvfbfmin,+zvfh -global-isel -stop-after=irtranslator \
+; RUN: llc -mtriple=riscv64 -mattr=+v,+zvfbfmin,+zvfh -global-isel -global-isel-abort=2 -stop-after=irtranslator \
 ; RUN:   -verify-machineinstrs < %s | FileCheck -check-prefixes=RV64 %s
 
 ; ==========================================================================
@@ -714,96 +714,96 @@ entry:
 
 define <vscale x 1 x bfloat> @test_ret_nxv1b16() {
   ; RV32-LABEL: name: test_ret_nxv1b16
-  ; RV32: bb.1.entry:
-  ; RV32-NEXT:   [[DEF:%[0-9]+]]:_(<vscale x 1 x s16>) = G_IMPLICIT_DEF
-  ; RV32-NEXT:   $v8 = COPY [[DEF]](<vscale x 1 x s16>)
-  ; RV32-NEXT:   PseudoRET implicit $v8
+  ; RV32: bb.0:
+  ; RV32-NEXT:   successors: %bb.1(0x80000000)
+  ; RV32-NEXT: {{  $}}
+  ; RV32-NEXT: bb.1.entry:
   ;
   ; RV64-LABEL: name: test_ret_nxv1b16
-  ; RV64: bb.1.entry:
-  ; RV64-NEXT:   [[DEF:%[0-9]+]]:_(<vscale x 1 x s16>) = G_IMPLICIT_DEF
-  ; RV64-NEXT:   $v8 = COPY [[DEF]](<vscale x 1 x s16>)
-  ; RV64-NEXT:   PseudoRET implicit $v8
+  ; RV64: bb.0:
+  ; RV64-NEXT:   successors: %bb.1(0x80000000)
+  ; RV64-NEXT: {{  $}}
+  ; RV64-NEXT: bb.1.entry:
 entry:
   ret <vscale x 1 x bfloat> undef
 }
 
 define <vscale x 2 x bfloat> @test_ret_nxv2b16() {
   ; RV32-LABEL: name: test_ret_nxv2b16
-  ; RV32: bb.1.entry:
-  ; RV32-NEXT:   [[DEF:%[0-9]+]]:_(<vscale x 2 x s16>) = G_IMPLICIT_DEF
-  ; RV32-NEXT:   $v8 = COPY [[DEF]](<vscale x 2 x s16>)
-  ; RV32-NEXT:   PseudoRET implicit $v8
+  ; RV32: bb.0:
+  ; RV32-NEXT:   successors: %bb.1(0x80000000)
+  ; RV32-NEXT: {{  $}}
+  ; RV32-NEXT: bb.1.entry:
   ;
   ; RV64-LABEL: name: test_ret_nxv2b16
-  ; RV64: bb.1.entry:
-  ; RV64-NEXT:   [[DEF:%[0-9]+]]:_(<vscale x 2 x s16>) = G_IMPLICIT_DEF
-  ; RV64-NEXT:   $v8 = COPY [[DEF]](<vscale x 2 x s16>)
-  ; RV64-NEXT:   PseudoRET implicit $v8
+  ; RV64: bb.0:
+  ; RV64-NEXT:   successors: %bb.1(0x80000000)
+  ; RV64-NEXT: {{  $}}
+  ; RV64-NEXT: bb.1.entry:
 entry:
   ret <vscale x 2 x bfloat> undef
 }
 
 define <vscale x 4 x bfloat> @test_ret_nxv4b16() {
   ; RV32-LABEL: name: test_ret_nxv4b16
-  ; RV32: bb.1.entry:
-  ; RV32-NEXT:   [[DEF:%[0-9]+]]:_(<vscale x 4 x s16>) = G_IMPLICIT_DEF
-  ; RV32-NEXT:   $v8 = COPY [[DEF]](<vscale x 4 x s16>)
-  ; RV32-NEXT:   PseudoRET implicit $v8
+  ; RV32: bb.0:
+  ; RV32-NEXT:   successors: %bb.1(0x80000000)
+  ; RV32-NEXT: {{  $}}
+  ; RV32-NEXT: bb.1.entry:
   ;
   ; RV64-LABEL: name: test_ret_nxv4b16
-  ; RV64: bb.1.entry:
-  ; RV64-NEXT:   [[DEF:%[0-9]+]]:_(<vscale x 4 x s16>) = G_IMPLICIT_DEF
-  ; RV64-NEXT:   $v8 = COPY [[DEF]](<vscale x 4 x s16>)
-  ; RV64-NEXT:   PseudoRET implicit $v8
+  ; RV64: bb.0:
+  ; RV64-NEXT:   successors: %bb.1(0x80000000)
+  ; RV64-NEXT: {{  $}}
+  ; RV64-NEXT: bb.1.entry:
 entry:
   ret <vscale x 4 x bfloat> undef
 }
 
 define <vscale x 8 x bfloat> @test_ret_nxv8b16() {
   ; RV32-LABEL: name: test_ret_nxv8b16
-  ; RV32: bb.1.entry:
-  ; RV32-NEXT:   [[DEF:%[0-9]+]]:_(<vscale x 8 x s16>) = G_IMPLICIT_DEF
-  ; RV32-NEXT:   $v8m2 = COPY [[DEF]](<vscale x 8 x s16>)
-  ; RV32-NEXT:   PseudoRET implicit $v8m2
+  ; RV32: bb.0:
+  ; RV32-NEXT:   successors: %bb.1(0x80000000)
+  ; RV32-NEXT: {{  $}}
+  ; RV32-NEXT: bb.1.entry:
   ;
   ; RV64-LABEL: name: test_ret_nxv8b16
-  ; RV64: bb.1.entry:
-  ; RV64-NEXT:   [[DEF:%[0-9]+]]:_(<vscale x 8 x s16>) = G_IMPLICIT_DEF
-  ; RV64-NEXT:   $v8m2 = COPY [[DEF]](<vscale x 8 x s16>)
-  ; RV64-NEXT:   PseudoRET implicit $v8m2
+  ; RV64: bb.0:
+  ; RV64-NEXT:   successors: %bb.1(0x80000000)
+  ; RV64-NEXT: {{  $}}
+  ; RV64-NEXT: bb.1.entry:
 entry:
   ret <vscale x 8 x bfloat> undef
 }
 
 define <vscale x 16 x bfloat> @test_ret_nxv16b16() {
   ; RV32-LABEL: name: test_ret_nxv16b16
-  ; RV32: bb.1.entry:
-  ; RV32-NEXT:   [[DEF:%[0-9]+]]:_(<vscale x 16 x s16>) = G_IMPLICIT_DEF
-  ; RV32-NEXT:   $v8m4 = COPY [[DEF]](<vscale x 16 x s16>)
-  ; RV32-NEXT:   PseudoRET implicit $v8m4
+  ; RV32: bb.0:
+  ; RV32-NEXT:   successors: %bb.1(0x80000000)
+  ; RV32-NEXT: {{  $}}
+  ; RV32-NEXT: bb.1.entry:
   ;
   ; RV64-LABEL: name: test_ret_nxv16b16
-  ; RV64: bb.1.entry:
-  ; RV64-NEXT:   [[DEF:%[0-9]+]]:_(<vscale x 16 x s16>) = G_IMPLICIT_DEF
-  ; RV64-NEXT:   $v8m4 = COPY [[DEF]](<vscale x 16 x s16>)
-  ; RV64-NEXT:   PseudoRET implicit $v8m4
+  ; RV64: bb.0:
+  ; RV64-NEXT:   successors: %bb.1(0x80000000)
+  ; RV64-NEXT: {{  $}}
+  ; RV64-NEXT: bb.1.entry:
 entry:
   ret <vscale x 16 x bfloat> undef
 }
 
 define <vscale x 32 x bfloat> @test_ret_nxv32b16() {
   ; RV32-LABEL: name: test_ret_nxv32b16
-  ; RV32: bb.1.entry:
-  ; RV32-NEXT:   [[DEF:%[0-9]+]]:_(<vscale x 32 x s16>) = G_IMPLICIT_DEF
-  ; RV32-NEXT:   $v8m8 = COPY [[DEF]](<vscale x 32 x s16>)
-  ; RV32-NEXT:   PseudoRET implicit $v8m8
+  ; RV32: bb.0:
+  ; RV32-NEXT:   successors: %bb.1(0x80000000)
+  ; RV32-NEXT: {{  $}}
+  ; RV32-NEXT: bb.1.entry:
   ;
   ; RV64-LABEL: name: test_ret_nxv32b16
-  ; RV64: bb.1.entry:
-  ; RV64-NEXT:   [[DEF:%[0-9]+]]:_(<vscale x 32 x s16>) = G_IMPLICIT_DEF
-  ; RV64-NEXT:   $v8m8 = COPY [[DEF]](<vscale x 32 x s16>)
-  ; RV64-NEXT:   PseudoRET implicit $v8m8
+  ; RV64: bb.0:
+  ; RV64-NEXT:   successors: %bb.1(0x80000000)
+  ; RV64-NEXT: {{  $}}
+  ; RV64-NEXT: bb.1.entry:
 entry:
   ret <vscale x 32 x bfloat> undef
 }
diff --git a/llvm/utils/TableGen/GlobalISelEmitter.cpp b/llvm/utils/TableGen/GlobalISelEmitter.cpp
index 5466d315c05a49..2969dd9156ccbf 100644
--- a/llvm/utils/TableGen/GlobalISelEmitter.cpp
+++ b/llvm/utils/TableGen/GlobalISelEmitter.cpp
@@ -2346,6 +2346,20 @@ void GlobalISelEmitter::emitRunCustomAction(raw_ostream &OS) {
      << "}\n";
 }
 
+/// Return true if \p Node or any node below it has a result type containing
+/// bf16, either as a scalar or as the element type of a vector.
+static bool hasBFloatType(const TreePatternNode &Node) {
+  for (unsigned I = 0, E = Node.getNumTypes(); I != E; ++I)
+    for (const auto &T : Node.getType(I))
+      if (T.second == MVT::bf16 ||
+          (T.second.isVector() && T.second.getScalarType() == MVT::bf16))
+        return true;
+  for (const TreePatternNode &C : Node.children())
+    if (hasBFloatType(C))
+      return true;
+  return false;
+}
+
 void GlobalISelEmitter::run(raw_ostream &OS) {
   if (!UseCoverageFile.empty()) {
     RuleCoverage = CodeGenCoverage();
@@ -2382,6 +2396,13 @@ void GlobalISelEmitter::run(raw_ostream &OS) {
 
     if (Pat.getGISelShouldIgnore())
       continue; // skip without warning
+
+    // Skip any patterns containing BF16 types, as GISel cannot currently tell
+    // the difference between fp16 and bf16. FIXME: This can be removed once
+    // BF16 is supported properly.
+    if (hasBFloatType(Pat.getSrcPattern()))
+      continue;
+
     auto MatcherOrErr = runOnPattern(Pat);
 
     // The pattern analysis can fail, indicating an unsupported pattern.



More information about the llvm-commits mailing list