[llvm] [AMDGPU][True16][CodeGen] add a d16 predicate for true16 mode (PR #156574)

Brox Chen via llvm-commits llvm-commits at lists.llvm.org
Mon Sep 8 10:34:30 PDT 2025


https://github.com/broxigarchen updated https://github.com/llvm/llvm-project/pull/156574

>From 90f721f5d5521a16ad36452c2909cad805d56af9 Mon Sep 17 00:00:00 2001
From: guochen2 <guochen2 at amd.com>
Date: Mon, 8 Sep 2025 13:33:54 -0400
Subject: [PATCH] add d16-hw-bug flag

---
 llvm/lib/Target/AMDGPU/AMDGPU.td              |    17 +-
 llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp    |     4 +
 llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h      |     3 +
 llvm/lib/Target/AMDGPU/FLATInstructions.td    |   137 +-
 .../CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll  | 27542 ++++++----------
 .../CodeGen/AMDGPU/amdgcn.bitcast.256bit.ll   |    56 +-
 .../CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll   |   586 +-
 .../CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll   | 10778 +++---
 llvm/test/CodeGen/AMDGPU/bf16.ll              |   217 +-
 .../branch-relaxation-inst-size-gfx11.ll      |    10 +-
 .../CodeGen/AMDGPU/call-argument-types.ll     |     2 +-
 llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll    |   193 +-
 llvm/test/CodeGen/AMDGPU/clamp-modifier.ll    |    24 +-
 llvm/test/CodeGen/AMDGPU/clamp.ll             |     6 +-
 llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll |   309 +-
 llvm/test/CodeGen/AMDGPU/fcanonicalize.ll     |     4 +-
 llvm/test/CodeGen/AMDGPU/fdiv.f16.ll          |    82 +-
 .../test/CodeGen/AMDGPU/flat-address-space.ll |     4 +-
 llvm/test/CodeGen/AMDGPU/flat-scratch.ll      |     8 +-
 llvm/test/CodeGen/AMDGPU/flat_atomics.ll      |    20 +-
 llvm/test/CodeGen/AMDGPU/fma.f16.gfx11plus.ll |    15 +-
 llvm/test/CodeGen/AMDGPU/fmaximum.ll          |    54 +-
 llvm/test/CodeGen/AMDGPU/fmed3.ll             |   100 +-
 llvm/test/CodeGen/AMDGPU/fminimum.ll          |    54 +-
 llvm/test/CodeGen/AMDGPU/fmuladd.f16.ll       |   588 +-
 llvm/test/CodeGen/AMDGPU/fneg-fabs.bf16.ll    |     2 +-
 llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll     |     2 +-
 llvm/test/CodeGen/AMDGPU/fneg.bf16.ll         |    69 +-
 llvm/test/CodeGen/AMDGPU/fneg.f16.ll          |     4 +-
 .../CodeGen/AMDGPU/frame-index-elimination.ll |     2 +-
 llvm/test/CodeGen/AMDGPU/freeze.ll            |   146 +-
 llvm/test/CodeGen/AMDGPU/frem.ll              |   324 +-
 llvm/test/CodeGen/AMDGPU/function-args.ll     |   655 +-
 .../AMDGPU/gfx-callable-argument-types.ll     |   644 +-
 .../AMDGPU/gfx11-user-sgpr-init16-bug.ll      |     8 +-
 .../AMDGPU/global-extload-gfx11plus.ll        |   140 +-
 llvm/test/CodeGen/AMDGPU/global-saddr-load.ll |   171 +-
 llvm/test/CodeGen/AMDGPU/global_atomics.ll    |    98 +-
 llvm/test/CodeGen/AMDGPU/half.ll              |    16 +-
 llvm/test/CodeGen/AMDGPU/icmp.i16.ll          |   300 +-
 llvm/test/CodeGen/AMDGPU/idot4s.ll            |    29 +-
 llvm/test/CodeGen/AMDGPU/idot4u.ll            |   173 +-
 .../AMDGPU/llvm.amdgcn.fdot2.bf16.bf16.ll     |    12 +-
 .../AMDGPU/llvm.amdgcn.fdot2.f16.f16.ll       |    65 +-
 llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll      |     2 +-
 llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll      |     2 +-
 llvm/test/CodeGen/AMDGPU/mad.u16.ll           |    14 +-
 llvm/test/CodeGen/AMDGPU/min.ll               |    73 +-
 llvm/test/CodeGen/AMDGPU/offset-split-flat.ll |  1852 +-
 .../CodeGen/AMDGPU/offset-split-global.ll     |  1844 +-
 .../AMDGPU/promote-constOffset-to-imm.ll      |     2 +-
 llvm/test/CodeGen/AMDGPU/rotl.ll              |    12 +-
 llvm/test/CodeGen/AMDGPU/rotr.ll              |    12 +-
 .../CodeGen/AMDGPU/shrink-add-sub-constant.ll |   198 +-
 llvm/test/CodeGen/AMDGPU/smed3.ll             |     2 +-
 llvm/test/CodeGen/AMDGPU/spillv16.ll          |   150 +-
 llvm/test/CodeGen/AMDGPU/strict_fpext.ll      |     2 +-
 llvm/test/CodeGen/AMDGPU/uaddo.ll             |     2 +-
 llvm/test/CodeGen/AMDGPU/umed3.ll             |     2 +-
 llvm/test/CodeGen/AMDGPU/usubo.ll             |     2 +-
 llvm/test/CodeGen/AMDGPU/v_cndmask.ll         |    14 +-
 llvm/test/CodeGen/AMDGPU/v_pack.ll            |    48 +-
 .../test/CodeGen/AMDGPU/vector_rebroadcast.ll |     2 +-
 .../CodeGen/AMDGPU/vector_shuffle.packed.ll   |    25 +-
 llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll  |     2 +-
 65 files changed, 18800 insertions(+), 29135 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index 0e0b84f7e3374..6f769b553271d 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -583,6 +583,12 @@ def FeatureRealTrue16Insts : SubtargetFeature<"real-true16",
   "Use true 16-bit registers"
 >;
 
+def Feature16bitD16HWBug : SubtargetFeature<"d16-hw-bug",
+  "Enable16bitD16HWBug",
+  "true",
+  "Disable D16 for 16 bit data type for true16 mode"
+>;
+
 def FeatureBF16TransInsts : SubtargetFeature<"bf16-trans-insts",
   "HasBF16TransInsts",
   "true",
@@ -1928,7 +1934,9 @@ def FeatureISAVersion11_Common : FeatureSet<
    FeaturePackedTID,
    FeatureVcmpxPermlaneHazard,
    FeatureMemoryAtomicFAddF32DenormalSupport,
-   FeatureRealTrue16Insts]>;
+   FeatureRealTrue16Insts,
+   Feature16bitD16HWBug,
+]>;
 
 // There are few workarounds that need to be
 // added to all targets. This pessimizes codegen
@@ -2564,6 +2572,13 @@ def UseFakeTrue16Insts : True16PredicateClass<"Subtarget->hasTrue16BitInsts() &&
   // FIXME When we default to RealTrue16 instead of Fake, change the line as follows.
   // AssemblerPredicate<(all_of FeatureTrue16BitInsts, (not FeatureRealTrue16Insts))>;
 
+// Predicates for the 16-bit D16 HW bug (d16-hw-bug) in real true16 mode
+def Has16bitD16HWBug: Predicate<"Subtarget->has16bitD16HWBug()">,
+  AssemblerPredicate<(all_of FeatureTrue16BitInsts, FeatureRealTrue16Insts, Feature16bitD16HWBug)>;
+def NotHas16bitD16HWBug: Predicate<"Subtarget->useRealTrue16Insts() && "
+                                               "!Subtarget->has16bitD16HWBug()">,
+  AssemblerPredicate<(all_of FeatureTrue16BitInsts, FeatureRealTrue16Insts, (not Feature16bitD16HWBug))>;
+
 def HasBF16TransInsts : Predicate<"Subtarget->hasBF16TransInsts()">,
   AssemblerPredicate<(all_of FeatureBF16TransInsts)>;
 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
index 73acb1ddbd2a7..521cd208f5326 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
@@ -38,6 +38,10 @@ bool AMDGPUSubtarget::useRealTrue16Insts() const {
   return hasTrue16BitInsts() && EnableRealTrue16Insts;
 }
 
+bool AMDGPUSubtarget::has16bitD16HWBug() const {
+  return useRealTrue16Insts() && Enable16bitD16HWBug; // implies true16 insts
+}
+
 // Returns the maximum per-workgroup LDS allocation size (in bytes) that still
 // allows the given function to achieve an occupancy of NWaves waves per
 // SIMD / EU, taking into account only the function's *maximum* workgroup size.
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
index 57b757c990e1a..e5203486436e4 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
@@ -59,6 +59,7 @@ class AMDGPUSubtarget {
   bool HasCvtPkF16F32Inst = false;
   bool HasF32ToF16BF16ConversionSRInsts = false;
   bool EnableRealTrue16Insts = false;
+  bool Enable16bitD16HWBug = false;
   bool HasBF16TransInsts = false;
   bool HasBF16ConversionInsts = false;
   bool HasBF16PackedInsts = false;
@@ -224,6 +225,8 @@ class AMDGPUSubtarget {
   // supported and the support for fake True16 instructions is removed.
   bool useRealTrue16Insts() const;
 
+  bool has16bitD16HWBug() const;
+
   bool hasBF16TransInsts() const { return HasBF16TransInsts; }
 
   bool hasBF16ConversionInsts() const {
diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td
index 19f95c5ac4c37..c92a2413c8768 100644
--- a/llvm/lib/Target/AMDGPU/FLATInstructions.td
+++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td
@@ -1319,13 +1319,19 @@ let SubtargetPredicate = HasVmemPrefInsts in {
 }
 
 //===----------------------------------------------------------------------===//
-// Flat Patterns
+// Utilities
 //===----------------------------------------------------------------------===//
+class Mem_wrap<dag op, bit true16> { // Optionally narrows a result dag to lo16.
+  dag ret = !if(true16, (EXTRACT_SUBREG op, lo16), op); // true16=0: op as-is.
+}
 
+//===----------------------------------------------------------------------===//
+// Flat Patterns
+//===----------------------------------------------------------------------===//
 // Patterns for global loads with no offset.
-class FlatLoadPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
+class FlatLoadPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt, bit true16> : GCNPat <
   (vt (node (FlatOffset i64:$vaddr, i32:$offset))),
-  (inst $vaddr, $offset)
+  Mem_wrap<(inst $vaddr, $offset), true16>.ret
 >;
 
 class FlatLoadPat_CPOL <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
@@ -1393,14 +1399,14 @@ class GlobalLoadSaddrPat_D16_t16 <FLAT_Pseudo inst, SDPatternOperator node, Valu
   (inst $saddr, $voffset, $offset, $cpol)
 >;
 
-class FlatLoadSignedPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
+class FlatLoadSignedPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt, bit true16> : GCNPat <
   (vt (node (GlobalOffset (i64 VReg_64:$vaddr), i32:$offset))),
-  (inst $vaddr, $offset)
+  Mem_wrap<(inst $vaddr, $offset), true16>.ret
 >;
 
-class FlatLoadSaddrPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
+class FlatLoadSaddrPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt, bit true16> : GCNPat <
   (vt (node (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset, CPol:$cpol))),
-  (inst $saddr, $voffset, $offset, $cpol)
+  Mem_wrap<(inst $saddr, $voffset, $offset, $cpol), true16>.ret
 >;
 
 class FlatLoadSignedPat_M0 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
@@ -1546,9 +1552,9 @@ multiclass FlatSignedAtomicPat <string inst, string node, ValueType vt,
   def : FlatSignedAtomicPatBase<!cast<FLAT_Pseudo>(inst), noRtnNode, vt, data_vt>;
 }
 
-class ScratchLoadSignedPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
+class ScratchLoadSignedPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt, bit true16> : GCNPat <
   (vt (node (ScratchOffset (i32 VGPR_32:$vaddr), i32:$offset))),
-  (inst $vaddr, $offset)
+  Mem_wrap<(inst $vaddr, $offset), true16>.ret
 >;
 
 class ScratchLoadSignedPat_D16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
@@ -1566,9 +1572,9 @@ class ScratchStoreSignedPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType
   (inst getVregSrcForVT<vt>.ret:$data, $vaddr, $offset)
 >;
 
-class ScratchLoadSaddrPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
+class ScratchLoadSaddrPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt, bit true16> : GCNPat <
   (vt (node (ScratchSAddr (i32 SGPR_32:$saddr), i32:$offset))),
-  (inst $saddr, $offset)
+  Mem_wrap<(inst $saddr, $offset), true16>.ret
 >;
 
 class ScratchLoadSaddrPat_D16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
@@ -1587,9 +1593,9 @@ class ScratchStoreSaddrPat <FLAT_Pseudo inst, SDPatternOperator node,
   (inst getVregSrcForVT<vt>.ret:$data, $saddr, $offset)
 >;
 
-class ScratchLoadSVaddrPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
+class ScratchLoadSVaddrPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt, bit true16> : GCNPat <
   (vt (node (ScratchSVAddr (i32 VGPR_32:$vaddr), (i32 SGPR_32:$saddr), i32:$offset, CPol:$cpol))),
-  (inst $vaddr, $saddr, $offset, $cpol)
+  Mem_wrap<(inst $vaddr, $saddr, $offset, $cpol), true16>.ret
 >;
 
 class ScratchStoreSVaddrPat <FLAT_Pseudo inst, SDPatternOperator node,
@@ -1628,12 +1634,12 @@ multiclass GlobalStoreLDSPats<FLAT_Pseudo inst, SDPatternOperator node> {
   }
 }
 
-multiclass GlobalFLATLoadPats<FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> {
-  def : FlatLoadSignedPat <inst, node, vt> {
+multiclass GlobalFLATLoadPats<FLAT_Pseudo inst, SDPatternOperator node, ValueType vt, bit isTrue16 = 0> {
+  def : FlatLoadSignedPat <inst, node, vt, isTrue16> {
     let AddedComplexity = 10;
   }
 
-  def : FlatLoadSaddrPat<!cast<FLAT_Pseudo>(!cast<string>(inst)#"_SADDR"), node, vt> {
+  def : FlatLoadSaddrPat<!cast<FLAT_Pseudo>(!cast<string>(inst)#"_SADDR"), node, vt, isTrue16> {
     let AddedComplexity = 11;
   }
 }
@@ -1751,16 +1757,16 @@ multiclass GlobalFLATAtomicIntrPats<string inst, string node, ValueType vt,
   defm : GlobalFLATAtomicPats<inst, node, vt, data_vt, /* isIntr */ 1>;
 }
 
-multiclass ScratchFLATLoadPats<FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> {
-  def : ScratchLoadSignedPat <inst, node, vt> {
+multiclass ScratchFLATLoadPats<FLAT_Pseudo inst, SDPatternOperator node, ValueType vt, bit isTrue16 = 0> {
+  def : ScratchLoadSignedPat <inst, node, vt, isTrue16> {
     let AddedComplexity = 25;
   }
 
-  def : ScratchLoadSaddrPat<!cast<FLAT_Pseudo>(!cast<string>(inst)#"_SADDR"), node, vt> {
+  def : ScratchLoadSaddrPat<!cast<FLAT_Pseudo>(!cast<string>(inst)#"_SADDR"), node, vt, isTrue16> {
     let AddedComplexity = 26;
   }
 
-  def : ScratchLoadSVaddrPat<!cast<FLAT_Pseudo>(!cast<string>(inst)#"_SVS"), node, vt> {
+  def : ScratchLoadSVaddrPat<!cast<FLAT_Pseudo>(!cast<string>(inst)#"_SVS"), node, vt, isTrue16> {
     let SubtargetPredicate = HasFlatScratchSVSMode;
     let AddedComplexity = 27;
   }
@@ -1828,10 +1834,10 @@ multiclass ScratchFLATLoadPats_D16_t16<string inst, SDPatternOperator node, Valu
   }
 }
 
-multiclass FlatLoadPats<FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> {
-  def : FlatLoadPat <inst, node, vt>;
+multiclass FlatLoadPats<FLAT_Pseudo inst, SDPatternOperator node, ValueType vt, bit isTrue16 = 0> {
+  def : FlatLoadPat <inst, node, vt, isTrue16>;
 
-  def : FlatLoadSaddrPat<!cast<FLAT_Pseudo>(!cast<string>(inst)#"_SADDR"), node, vt> {
+  def : FlatLoadSaddrPat<!cast<FLAT_Pseudo>(!cast<string>(inst)#"_SADDR"), node, vt, isTrue16> {
     let AddedComplexity = 9;
     let SubtargetPredicate = HasFlatGVSMode;
   }
@@ -1907,6 +1913,13 @@ let True16Predicate = p in {
 }
 
 let OtherPredicates = [D16PreservesUnusedBits, HasFlatAddressSpace], True16Predicate = UseRealTrue16Insts in {
+  defm : FlatStorePats_t16 <FLAT_STORE_BYTE, truncstorei8_flat, i16>;
+  defm : FlatStorePats_t16 <FLAT_STORE_SHORT, store_flat, i16>;
+  def : FlatStorePat <FLAT_STORE_BYTE_t16, atomic_store_8_flat, i16>;
+  def : FlatStorePat <FLAT_STORE_SHORT_t16, atomic_store_16_flat, i16>;
+} // End let OtherPredicates = [D16PreservesUnusedBits, HasFlatAddressSpace], True16Predicate = UseRealTrue16Insts
+
+let OtherPredicates = [D16PreservesUnusedBits, HasFlatAddressSpace, NotHas16bitD16HWBug], True16Predicate = UseRealTrue16Insts in {
   defm : FlatLoadPats_D16_t16<FLAT_LOAD_UBYTE_D16_t16, extloadi8_flat, i16>;
   defm : FlatLoadPats_D16_t16<FLAT_LOAD_UBYTE_D16_t16, zextloadi8_flat, i16>;
   defm : FlatLoadPats_D16_t16<FLAT_LOAD_SBYTE_D16_t16, sextloadi8_flat, i16>;
@@ -1915,11 +1928,18 @@ let OtherPredicates = [D16PreservesUnusedBits, HasFlatAddressSpace], True16Predi
   defm : FlatLoadPats_D16_t16<FLAT_LOAD_UBYTE_D16_t16, atomic_load_zext_8_flat, i16>;
   defm : FlatLoadPats_D16_t16<FLAT_LOAD_SHORT_D16_t16, atomic_load_nonext_16_flat, i16>;
   defm : FlatLoadPats_D16_t16<FLAT_LOAD_SBYTE_D16_t16, atomic_load_sext_8_flat, i16>;
-  defm : FlatStorePats_t16 <FLAT_STORE_BYTE, truncstorei8_flat, i16>;
-  defm : FlatStorePats_t16 <FLAT_STORE_SHORT, store_flat, i16>;
-  def : FlatStorePat <FLAT_STORE_BYTE_t16, atomic_store_8_flat, i16>;
-  def : FlatStorePat <FLAT_STORE_SHORT_t16, atomic_store_16_flat, i16>;
-} // End let OtherPredicates = [D16PreservesUnusedBits, HasFlatAddressSpace], True16Predicate = UseRealTrue16Insts
+}
+
+let OtherPredicates = [D16PreservesUnusedBits, HasFlatAddressSpace, Has16bitD16HWBug], True16Predicate = UseRealTrue16Insts in {
+  defm : FlatLoadPats <FLAT_LOAD_UBYTE, extloadi8_flat, i16, /*true16*/1>;
+  defm : FlatLoadPats <FLAT_LOAD_UBYTE, zextloadi8_flat, i16, /*true16*/1>;
+  defm : FlatLoadPats <FLAT_LOAD_SBYTE, sextloadi8_flat, i16, /*true16*/1>;
+  defm : FlatLoadPats <FLAT_LOAD_USHORT, load_flat, i16, /*true16*/1>;
+  defm : FlatLoadPats <FLAT_LOAD_UBYTE, atomic_load_aext_8_flat, i16, /*true16*/1>;
+  defm : FlatLoadPats <FLAT_LOAD_UBYTE, atomic_load_zext_8_flat, i16, /*true16*/1>;
+  defm : FlatLoadPats <FLAT_LOAD_USHORT, atomic_load_nonext_16_flat, i16, /*true16*/1>;
+  defm : FlatLoadPats <FLAT_LOAD_SBYTE, atomic_load_sext_8_flat, i16, /*true16*/1>;
+} // End let OtherPredicates = [D16PreservesUnusedBits, HasFlatAddressSpace, Has16bitD16HWBug], True16Predicate = UseRealTrue16Insts
 
 defm : FlatLoadPats <FLAT_LOAD_DWORD, atomic_load_nonext_32_flat, i32>;
 defm : FlatLoadPats <FLAT_LOAD_DWORDX2, atomic_load_nonext_64_flat, i64>;
@@ -2056,21 +2076,36 @@ defm : GlobalFLATLoadPats <GLOBAL_LOAD_USHORT, atomic_load_zext_16_global, i16>;
 }
 
 let OtherPredicates = [HasFlatGlobalInsts, D16PreservesUnusedBits], True16Predicate = UseRealTrue16Insts in {
-defm : GlobalFLATLoadPats_D16_t16<"GLOBAL_LOAD_UBYTE_D16", extloadi8_global, i16>;
-defm : GlobalFLATLoadPats_D16_t16<"GLOBAL_LOAD_UBYTE_D16", zextloadi8_global, i16>;
-defm : GlobalFLATLoadPats_D16_t16<"GLOBAL_LOAD_SBYTE_D16", sextloadi8_global, i16>;
-defm : GlobalFLATLoadPats_D16_t16<"GLOBAL_LOAD_SHORT_D16", load_global, i16>;
-defm : GlobalFLATLoadPats_D16_t16<"GLOBAL_LOAD_UBYTE_D16", atomic_load_aext_8_global, i16>;
-defm : GlobalFLATLoadPats_D16_t16<"GLOBAL_LOAD_UBYTE_D16", atomic_load_zext_8_global, i16>;
-defm : GlobalFLATLoadPats_D16_t16<"GLOBAL_LOAD_SBYTE_D16", atomic_load_sext_8_global, i16>;
-defm : GlobalFLATLoadPats_D16_t16<"GLOBAL_LOAD_SHORT_D16", atomic_load_nonext_16_global, i16>;
-defm : GlobalFLATLoadPats_D16_t16<"GLOBAL_LOAD_SHORT_D16", atomic_load_zext_16_global, i16>;
-defm : GlobalFLATStorePats_D16_t16<"GLOBAL_STORE_BYTE", truncstorei8_global, i16>;
-defm : GlobalFLATStorePats_D16_t16<"GLOBAL_STORE_SHORT", store_global, i16>;
-defm : GlobalFLATStorePats_D16_t16<"GLOBAL_STORE_BYTE", atomic_store_8_global, i16>;
-defm : GlobalFLATStorePats_D16_t16<"GLOBAL_STORE_SHORT", atomic_store_16_global, i16>;
+  defm : GlobalFLATStorePats_D16_t16<"GLOBAL_STORE_BYTE", truncstorei8_global, i16>;
+  defm : GlobalFLATStorePats_D16_t16<"GLOBAL_STORE_SHORT", store_global, i16>;
+  defm : GlobalFLATStorePats_D16_t16<"GLOBAL_STORE_BYTE", atomic_store_8_global, i16>;
+  defm : GlobalFLATStorePats_D16_t16<"GLOBAL_STORE_SHORT", atomic_store_16_global, i16>;
 } // end OtherPredicates = [HasFlatGlobalInsts, D16PreservesUnusedBits], True16Predicate = UseRealTrue16Insts
 
+let OtherPredicates = [HasFlatGlobalInsts, D16PreservesUnusedBits, NotHas16bitD16HWBug], True16Predicate = UseRealTrue16Insts in {
+  defm : GlobalFLATLoadPats_D16_t16<"GLOBAL_LOAD_UBYTE_D16", extloadi8_global, i16>;
+  defm : GlobalFLATLoadPats_D16_t16<"GLOBAL_LOAD_UBYTE_D16", zextloadi8_global, i16>;
+  defm : GlobalFLATLoadPats_D16_t16<"GLOBAL_LOAD_SBYTE_D16", sextloadi8_global, i16>;
+  defm : GlobalFLATLoadPats_D16_t16<"GLOBAL_LOAD_SHORT_D16", load_global, i16>;
+  defm : GlobalFLATLoadPats_D16_t16<"GLOBAL_LOAD_UBYTE_D16", atomic_load_aext_8_global, i16>;
+  defm : GlobalFLATLoadPats_D16_t16<"GLOBAL_LOAD_UBYTE_D16", atomic_load_zext_8_global, i16>;
+  defm : GlobalFLATLoadPats_D16_t16<"GLOBAL_LOAD_SBYTE_D16", atomic_load_sext_8_global, i16>;
+  defm : GlobalFLATLoadPats_D16_t16<"GLOBAL_LOAD_SHORT_D16", atomic_load_nonext_16_global, i16>;
+  defm : GlobalFLATLoadPats_D16_t16<"GLOBAL_LOAD_SHORT_D16", atomic_load_zext_16_global, i16>;
+} // end OtherPredicates = [HasFlatGlobalInsts, D16PreservesUnusedBits, NotHas16bitD16HWBug], True16Predicate = UseRealTrue16Insts
+
+let OtherPredicates = [HasFlatGlobalInsts, D16PreservesUnusedBits, Has16bitD16HWBug], True16Predicate = UseRealTrue16Insts in {
+  defm : GlobalFLATLoadPats <GLOBAL_LOAD_UBYTE, extloadi8_global, i16, /*true16*/1>;
+  defm : GlobalFLATLoadPats <GLOBAL_LOAD_UBYTE, zextloadi8_global, i16, /*true16*/1>;
+  defm : GlobalFLATLoadPats <GLOBAL_LOAD_SBYTE, sextloadi8_global, i16, /*true16*/1>;
+  defm : GlobalFLATLoadPats <GLOBAL_LOAD_USHORT, load_global, i16, /*true16*/1>;
+  defm : GlobalFLATLoadPats <GLOBAL_LOAD_UBYTE, atomic_load_aext_8_global, i16, /*true16*/1>;
+  defm : GlobalFLATLoadPats <GLOBAL_LOAD_UBYTE, atomic_load_zext_8_global, i16, /*true16*/1>;
+  defm : GlobalFLATLoadPats <GLOBAL_LOAD_SBYTE, atomic_load_sext_8_global, i16, /*true16*/1>;
+  defm : GlobalFLATLoadPats <GLOBAL_LOAD_USHORT, atomic_load_nonext_16_global, i16, /*true16*/1>;
+  defm : GlobalFLATLoadPats <GLOBAL_LOAD_USHORT, atomic_load_zext_16_global, i16, /*true16*/1>;
+} // end OtherPredicates = [HasFlatGlobalInsts, D16PreservesUnusedBits, Has16bitD16HWBug], True16Predicate = UseRealTrue16Insts
+
 foreach vt = Reg32Types.types in {
 defm : GlobalFLATLoadPats <GLOBAL_LOAD_DWORD, load_global, vt>;
 defm : GlobalFLATStorePats <GLOBAL_STORE_DWORD, store_global, vt>;
@@ -2297,12 +2332,20 @@ defm : ScratchFLATStorePats <SCRATCH_STORE_BYTE, truncstorei8_private, i16>;
 }
 
 let True16Predicate = UseRealTrue16Insts in {
-defm : ScratchFLATLoadPats_D16_t16<"SCRATCH_LOAD_UBYTE_D16", extloadi8_private, i16>;
-defm : ScratchFLATLoadPats_D16_t16<"SCRATCH_LOAD_UBYTE_D16", zextloadi8_private, i16>;
-defm : ScratchFLATLoadPats_D16_t16<"SCRATCH_LOAD_SBYTE_D16", sextloadi8_private, i16>;
-defm : ScratchFLATLoadPats_D16_t16<"SCRATCH_LOAD_SHORT_D16", load_private, i16>;
-defm : ScratchFLATStorePats_t16 <"SCRATCH_STORE_SHORT", store_private, i16>;
-defm : ScratchFLATStorePats_t16 <"SCRATCH_STORE_BYTE", truncstorei8_private, i16>;
+  let OtherPredicates = [NotHas16bitD16HWBug] in {
+    defm : ScratchFLATLoadPats_D16_t16<"SCRATCH_LOAD_UBYTE_D16", extloadi8_private, i16>;
+    defm : ScratchFLATLoadPats_D16_t16<"SCRATCH_LOAD_UBYTE_D16", zextloadi8_private, i16>;
+    defm : ScratchFLATLoadPats_D16_t16<"SCRATCH_LOAD_SBYTE_D16", sextloadi8_private, i16>;
+    defm : ScratchFLATLoadPats_D16_t16<"SCRATCH_LOAD_SHORT_D16", load_private, i16>;
+  } // End OtherPredicates = [NotHas16bitD16HWBug]
+  let OtherPredicates = [Has16bitD16HWBug] in {
+    defm : ScratchFLATLoadPats <SCRATCH_LOAD_UBYTE, extloadi8_private, i16, /*true16*/1>;
+    defm : ScratchFLATLoadPats <SCRATCH_LOAD_UBYTE, zextloadi8_private, i16, /*true16*/1>;
+    defm : ScratchFLATLoadPats <SCRATCH_LOAD_SBYTE, sextloadi8_private, i16, /*true16*/1>;
+    defm : ScratchFLATLoadPats <SCRATCH_LOAD_USHORT, load_private, i16, /*true16*/1>;
+  } // End OtherPredicates = [Has16bitD16HWBug]
+  defm : ScratchFLATStorePats_t16 <"SCRATCH_STORE_SHORT", store_private, i16>;
+  defm : ScratchFLATStorePats_t16 <"SCRATCH_STORE_BYTE", truncstorei8_private, i16>;
 } // End True16Predicate = UseRealTrue16Insts
 
 foreach vt = Reg32Types.types in {
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll
index d03d6a8940b2f..1dc53cec8df85 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll
@@ -15369,876 +15369,913 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) {
 ; GFX11-TRUE16:       ; %bb.0:
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-TRUE16-NEXT:    s_clause 0x1f
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v31, off, s32 offset:384
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v32, off, s32 offset:380
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v31, off, s32 offset:376
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v32, off, s32 offset:372
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v50, off, s32 offset:368
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v33, off, s32 offset:364
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v51, off, s32 offset:360
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v33, off, s32 offset:356
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v51, off, s32 offset:352
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v34, off, s32 offset:348
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v52, off, s32 offset:344
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v34, off, s32 offset:340
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v52, off, s32 offset:336
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v35, off, s32 offset:332
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v53, off, s32 offset:328
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v35, off, s32 offset:324
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v53, off, s32 offset:320
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v36, off, s32 offset:316
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v54, off, s32 offset:312
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v36, off, s32 offset:308
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v54, off, s32 offset:304
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v37, off, s32 offset:300
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v55, off, s32 offset:296
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v37, off, s32 offset:292
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v64, off, s32 offset:288
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v38, off, s32 offset:284
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v64, off, s32 offset:280
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v38, off, s32 offset:276
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v66, off, s32 offset:272
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v39, off, s32 offset:268
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v66, off, s32 offset:264
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v39, off, s32 offset:260
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v40, s32 offset:536
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v41, s32 offset:532
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v42, s32 offset:528
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v43, s32 offset:524
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v44, s32 offset:520
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v45, s32 offset:516
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v46, s32 offset:512
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v47, s32 offset:508
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v56, s32 offset:504
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v57, s32 offset:500
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v58, s32 offset:496
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v59, s32 offset:492
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v60, s32 offset:488
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v61, s32 offset:484
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v62, s32 offset:480
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v63, s32 offset:476
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v72, s32 offset:472
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v73, s32 offset:468
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v74, s32 offset:464
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v75, s32 offset:460
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v76, s32 offset:456
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v77, s32 offset:452
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v78, s32 offset:448
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v79, s32 offset:444
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v88, s32 offset:440
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v89, s32 offset:436
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v90, s32 offset:432
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v91, s32 offset:428
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v92, s32 offset:424
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v93, s32 offset:420
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v94, s32 offset:416
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v95, s32 offset:412
+; GFX11-TRUE16-NEXT:    s_clause 0x4
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v104, s32 offset:408
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v105, s32 offset:404
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v106, s32 offset:400
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v107, s32 offset:396
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v108, s32 offset:392
 ; GFX11-TRUE16-NEXT:    s_clause 0x1f
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v67, off, s32 offset:256
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v48, off, s32 offset:252
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v68, off, s32 offset:248
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v48, off, s32 offset:244
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v70, off, s32 offset:240
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v49, off, s32 offset:236
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v70, off, s32 offset:232
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v49, off, s32 offset:228
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v71, off, s32 offset:224
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v50, off, s32 offset:220
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v71, off, s32 offset:216
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v114, off, s32 offset:388
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v81, off, s32
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v82, off, s32 offset:8
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v83, off, s32 offset:16
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v83, off, s32 offset:24
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v85, off, s32 offset:32
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v85, off, s32 offset:40
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v87, off, s32 offset:48
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v87, off, s32 offset:56
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v97, off, s32 offset:64
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v97, off, s32 offset:72
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v98, off, s32 offset:80
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v99, off, s32 offset:88
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v100, off, s32 offset:96
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v101, off, s32 offset:104
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v102, off, s32 offset:112
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v160, off, s32 offset:120
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v160, off, s32 offset:128
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v161, off, s32 offset:136
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v161, off, s32 offset:144
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v162, off, s32 offset:152
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v31, off, s32 offset:384
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v112, off, s32 offset:380
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v32, off, s32 offset:376
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v113, off, s32 offset:372
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v56, off, s32 offset:368
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v114, off, s32 offset:364
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v33, off, s32 offset:360
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v115, off, s32 offset:356
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v57, off, s32 offset:352
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v116, off, s32 offset:348
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v34, off, s32 offset:344
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v117, off, s32 offset:340
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v58, off, s32 offset:336
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v118, off, s32 offset:332
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v35, off, s32 offset:328
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v119, off, s32 offset:324
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v59, off, s32 offset:320
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v128, off, s32 offset:316
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v36, off, s32 offset:312
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v129, off, s32 offset:308
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v60, off, s32 offset:304
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v130, off, s32 offset:300
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v37, off, s32 offset:296
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v131, off, s32 offset:292
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v61, off, s32 offset:288
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v132, off, s32 offset:284
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v38, off, s32 offset:280
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v133, off, s32 offset:276
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v62, off, s32 offset:272
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v134, off, s32 offset:268
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v39, off, s32 offset:264
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v135, off, s32 offset:260
 ; GFX11-TRUE16-NEXT:    s_clause 0x1f
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v162, off, s32 offset:160
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v163, off, s32 offset:168
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v163, off, s32 offset:176
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v164, off, s32 offset:184
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v164, off, s32 offset:192
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v165, off, s32 offset:200
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v165, off, s32 offset:208
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v55, off, s32 offset:212
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v65, off, s32 offset:204
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v65, off, s32 offset:196
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v67, off, s32 offset:188
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v68, off, s32 offset:180
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v69, off, s32 offset:172
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v69, off, s32 offset:164
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v80, off, s32 offset:156
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v80, off, s32 offset:148
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v81, off, s32 offset:140
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v82, off, s32 offset:132
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v84, off, s32 offset:124
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v84, off, s32 offset:116
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v86, off, s32 offset:108
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v86, off, s32 offset:100
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v96, off, s32 offset:92
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v96, off, s32 offset:84
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v98, off, s32 offset:76
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v99, off, s32 offset:68
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v100, off, s32 offset:60
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v101, off, s32 offset:52
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v103, off, s32 offset:44
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v103, off, s32 offset:36
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v112, off, s32 offset:28
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v113, off, s32 offset:20
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v63, off, s32 offset:256
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v144, off, s32 offset:252
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v48, off, s32 offset:248
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v145, off, s32 offset:244
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v72, off, s32 offset:240
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v146, off, s32 offset:236
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v49, off, s32 offset:232
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v147, off, s32 offset:228
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v73, off, s32 offset:224
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v148, off, s32 offset:220
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v50, off, s32 offset:216
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v51, off, s32 offset:388
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v52, off, s32
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v53, off, s32 offset:8
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v54, off, s32 offset:16
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v55, off, s32 offset:24
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v64, off, s32 offset:32
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v65, off, s32 offset:40
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v66, off, s32 offset:48
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v67, off, s32 offset:56
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v74, off, s32 offset:64
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v75, off, s32 offset:72
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v76, off, s32 offset:80
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v77, off, s32 offset:88
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v78, off, s32 offset:96
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v79, off, s32 offset:104
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v88, off, s32 offset:112
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v89, off, s32 offset:120
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v90, off, s32 offset:128
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v91, off, s32 offset:136
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v92, off, s32 offset:144
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v93, off, s32 offset:152
+; GFX11-TRUE16-NEXT:    s_clause 0x1f
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v94, off, s32 offset:160
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v95, off, s32 offset:168
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v104, off, s32 offset:176
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v105, off, s32 offset:184
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v106, off, s32 offset:192
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v107, off, s32 offset:200
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v108, off, s32 offset:208
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v149, off, s32 offset:212
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v150, off, s32 offset:204
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v151, off, s32 offset:196
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v160, off, s32 offset:188
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v161, off, s32 offset:180
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v162, off, s32 offset:172
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v163, off, s32 offset:164
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v164, off, s32 offset:156
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v165, off, s32 offset:148
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v166, off, s32 offset:140
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v167, off, s32 offset:132
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v176, off, s32 offset:124
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v177, off, s32 offset:116
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v178, off, s32 offset:108
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v179, off, s32 offset:100
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v180, off, s32 offset:92
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v181, off, s32 offset:84
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v182, off, s32 offset:76
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v183, off, s32 offset:68
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v40, off, s32 offset:60
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v41, off, s32 offset:52
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v42, off, s32 offset:44
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v43, off, s32 offset:36
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v44, off, s32 offset:28
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v45, off, s32 offset:20
 ; GFX11-TRUE16-NEXT:    s_clause 0x1
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v115, off, s32 offset:12
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v115, off, s32 offset:4
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v117.l, v30.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v118.h, v28.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v119.l, v26.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v119.h, v24.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v130.l, v22.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v130.h, v20.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v131.l, v18.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v132.h, v16.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v134.l, v14.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v134.h, v12.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v144.l, v10.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v145.h, v8.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v146.h, v6.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v146.l, v4.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v149.l, v2.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v149.h, v0.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v151.l, 8, v1.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v151.h, 8, v3.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v150.l, 8, v5.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v150.h, 8, v7.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v148.l, 8, v9.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v148.h, 8, v11.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v147.l, 8, v13.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v147.h, 8, v15.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v144.h, 8, v17.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v145.l, 8, v19.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v135.l, 8, v21.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v135.h, 8, v23.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v133.l, 8, v25.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v133.h, 8, v27.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v131.h, 8, v29.l
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v46, off, s32 offset:12
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v47, off, s32 offset:4
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v69.l, v30.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v70.h, v28.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v71.l, v26.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v71.h, v24.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v82.l, v22.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v82.h, v20.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v83.l, v18.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v84.h, v16.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v86.l, v14.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v86.h, v12.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v96.l, v10.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v97.h, v8.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v98.h, v6.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v98.l, v4.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v101.l, v2.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v101.h, v0.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v103.l, 8, v1.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v103.h, 8, v3.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v102.l, 8, v5.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v102.h, 8, v7.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v100.l, 8, v9.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v100.h, 8, v11.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v99.l, 8, v13.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v99.h, 8, v15.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v96.h, 8, v17.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v97.l, 8, v19.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v87.l, 8, v21.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v87.h, 8, v23.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v85.l, 8, v25.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v85.h, 8, v27.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v83.h, 8, v29.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(62)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v32.h, 8, v31.l
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v32.l, 8, v32.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v33.h, 8, v56.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v33.l, 8, v33.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v34.h, 8, v57.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v34.l, 8, v34.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v35.h, 8, v58.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v35.l, 8, v35.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v36.h, 8, v59.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v36.l, 8, v36.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v37.h, 8, v60.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v37.l, 8, v37.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v38.h, 8, v61.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v38.l, 8, v38.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v39.h, 8, v62.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v39.l, 8, v39.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v48.h, 8, v63.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v48.l, 8, v48.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(59)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v49.h, 8, v72.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v49.l, 8, v49.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(55)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v50.h, 8, v73.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v50.l, 8, v50.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(54)
-; GFX11-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v114
+; GFX11-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v51
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(53)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v132.l, 8, v81.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v84.l, 8, v52.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(52)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v129.l, 8, v82.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v81.l, 8, v53.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(51)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v129.h, 8, v83.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v81.h, 8, v54.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(50)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v128.l, 8, v83.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v80.l, 8, v55.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(49)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v128.h, 8, v85.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v80.h, 8, v64.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(48)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v117.h, 8, v85.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v69.h, 8, v65.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(47)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v118.l, 8, v87.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v70.l, 8, v66.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(46)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v116.l, 8, v87.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v68.l, 8, v67.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(45)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v116.h, 8, v97.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v68.h, 8, v74.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(44)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v114.l, 8, v97.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v67.l, 8, v75.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(43)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v114.h, 8, v98.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v67.h, 8, v76.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(42)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v112.h, 8, v99.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v66.l, 8, v77.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(41)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v113.l, 8, v100.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v66.h, 8, v78.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(40)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v102.l, 8, v101.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v65.l, 8, v79.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(39)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v102.h, 8, v102.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v65.h, 8, v88.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(38)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v100.h, 8, v160.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v64.l, 8, v89.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(37)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v101.l, 8, v160.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v64.h, 8, v90.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(36)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v98.h, 8, v161.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v55.l, 8, v91.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(35)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v99.l, 8, v161.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v55.h, 8, v92.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(34)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v97.l, 8, v162.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v54.l, 8, v93.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(33)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v97.h, 8, v162.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v54.h, 8, v94.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(32)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v87.l, 8, v163.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v53.l, 8, v95.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(31)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v87.h, 8, v163.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v53.h, 8, v104.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(30)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v85.l, 8, v164.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v52.l, 8, v105.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(29)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v85.h, 8, v164.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v52.h, 8, v106.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(28)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v83.l, 8, v165.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v51.l, 8, v107.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(27)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v83.h, 8, v165.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v81.h, 8, v71.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v82.l, 8, v71.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v71.l, 8, v70.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v71.h, 8, v70.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v70.l, 8, v68.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v70.h, 8, v67.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v67.h, 8, v66.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v68.l, 8, v66.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v66.l, 8, v64.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v66.h, 8, v64.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v64.l, 8, v55.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v64.h, 8, v54.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v54.h, 8, v54.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v55.l, 8, v53.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v53.h, 8, v53.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v54.l, 8, v52.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v52.h, 8, v52.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v53.l, 8, v51.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v51.h, 8, v51.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v52.l, 8, v50.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v50.h, 8, v31.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v51.l, 8, v31.l
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v51.h, 8, v108.l
 ; GFX11-TRUE16-NEXT:    s_and_saveexec_b32 s0, vcc_lo
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-TRUE16-NEXT:    s_xor_b32 s0, exec_lo, s0
-; GFX11-TRUE16-NEXT:    s_cbranch_execnz .LBB14_3
-; GFX11-TRUE16-NEXT:  ; %bb.1: ; %Flow
-; GFX11-TRUE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX11-TRUE16-NEXT:    s_cbranch_execnz .LBB14_4
-; GFX11-TRUE16-NEXT:  .LBB14_2: ; %end
-; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
-; GFX11-TRUE16-NEXT:  .LBB14_3: ; %cmp.false
-; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v149.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v149.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v149.h, 0
-; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v146.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v146.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v149.l, v0.l, v151.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v0.h, v151.h
-; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v0.l, v149.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v3.h, v1.l, v150.h
-; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v3.l, v149.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v145.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v2.h, 0xff, v144.l
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v149, v0
-; GFX11-TRUE16-NEXT:    v_or_b16 v149.l, v1.h, v150.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v4.l, 0xff, v132.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v4.h, 0xff, v131.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v2.h, v2.h, v148.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v5.l, 0xff, v130.h
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v149, v3
-; GFX11-TRUE16-NEXT:    v_or_b16 v149.l, v2.l, v148.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v2.l, v149.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v3.l, 0xff, v134.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v3.h, 0xff, v134.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v4.h, v4.h, v145.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v5.h, 0xff, v130.l
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v2, v149, v2
-; GFX11-TRUE16-NEXT:    v_or_b16 v149.l, v3.l, v147.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v3.h, v3.h, v147.h
-; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v3.l, v149.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v5.h, v5.h, v135.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v6.l, 0xff, v119.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v6.h, 0xff, v119.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v7.l, 0xff, v118.h
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v3, v149, v3
-; GFX11-TRUE16-NEXT:    v_or_b16 v149.l, v4.l, v144.h
-; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v4.l, v149.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v6.h, v6.h, v133.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v7.h, 0xff, v117.l
+; GFX11-TRUE16-NEXT:    s_cbranch_execz .LBB14_2
+; GFX11-TRUE16-NEXT:  ; %bb.1: ; %cmp.false
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v101.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v101.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v101.h, 0
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v98.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v98.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v101.l, v0.l, v103.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v0.h, v103.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v101.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v3.h, v1.l, v102.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v101.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v97.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.h, 0xff, v96.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v101, v0
+; GFX11-TRUE16-NEXT:    v_or_b16 v101.l, v1.h, v102.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v4.l, 0xff, v84.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v4.h, 0xff, v83.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.h, v2.h, v100.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v5.l, 0xff, v82.h
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v101, v3
+; GFX11-TRUE16-NEXT:    v_or_b16 v101.l, v2.l, v100.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v101.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v3.l, 0xff, v86.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v3.h, 0xff, v86.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v4.h, v4.h, v97.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v5.h, 0xff, v82.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v2, v101, v2
+; GFX11-TRUE16-NEXT:    v_or_b16 v101.l, v3.l, v99.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v3.h, v3.h, v99.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v101.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v5.h, v5.h, v87.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v6.l, 0xff, v71.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v6.h, 0xff, v71.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v7.l, 0xff, v70.h
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v3, v101, v3
+; GFX11-TRUE16-NEXT:    v_or_b16 v101.l, v4.l, v96.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v4.l, v101.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v6.h, v6.h, v85.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v7.h, 0xff, v69.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    v_and_b16 v8.l, 0xff, v115.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v8.h, 0xff, v115.l
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v4, v149, v4
-; GFX11-TRUE16-NEXT:    v_or_b16 v149.l, v5.l, v135.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v5.l, v149.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v7.h, v7.h, v132.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v8.h, v8.h, v129.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v9.l, 0xff, v113.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v9.h, 0xff, v112.l
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v5, v149, v5
-; GFX11-TRUE16-NEXT:    v_or_b16 v149.l, v6.l, v133.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v6.l, v149.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v10.l, 0xff, v103.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v9.h, v9.h, v128.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v10.h, 0xff, v103.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v11.l, 0xff, v101.h
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v6, v149, v6
-; GFX11-TRUE16-NEXT:    v_or_b16 v149.l, v7.l, v131.h
-; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v7.l, v149.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v10.h, v10.h, v118.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v11.h, 0xff, v100.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v12.l, 0xff, v99.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v12.h, 0xff, v98.l
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v7, v149, v7
-; GFX11-TRUE16-NEXT:    v_or_b16 v149.l, v8.l, v129.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v8.l, v149.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v11.h, v11.h, v116.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v12.h, v12.h, v114.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v13.l, 0xff, v96.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v13.h, 0xff, v96.l
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v8, v149, v8
-; GFX11-TRUE16-NEXT:    v_or_b16 v149.l, v9.l, v128.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v9.l, v149.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v14.l, 0xff, v86.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v13.h, v13.h, v113.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v14.h, 0xff, v86.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v15.l, 0xff, v84.h
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v9, v149, v9
-; GFX11-TRUE16-NEXT:    v_or_b16 v149.l, v10.l, v117.h
-; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v10.l, v149.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v14.h, v14.h, v102.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v15.h, 0xff, v84.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v16.l, 0xff, v82.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v16.h, 0xff, v81.l
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v10, v149, v10
-; GFX11-TRUE16-NEXT:    v_or_b16 v149.l, v11.l, v116.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v11.l, v149.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v15.h, v15.h, v101.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v16.h, v16.h, v99.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v17.l, 0xff, v80.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v17.h, 0xff, v80.l
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v11, v149, v11
-; GFX11-TRUE16-NEXT:    v_or_b16 v149.l, v12.l, v114.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v12.l, v149.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v18.l, 0xff, v69.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v17.h, v17.h, v97.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v18.h, 0xff, v69.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v19.l, 0xff, v68.h
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v12, v149, v12
-; GFX11-TRUE16-NEXT:    v_or_b16 v149.l, v13.l, v112.h
-; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v13.l, v149.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v18.h, v18.h, v87.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v19.h, 0xff, v67.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v20.l, 0xff, v65.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v20.h, 0xff, v65.l
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v13, v149, v13
-; GFX11-TRUE16-NEXT:    v_or_b16 v149.l, v14.l, v102.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v14.l, v149.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v19.h, v19.h, v85.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v20.h, v20.h, v83.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v21.l, 0xff, v55.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v21.h, 0xff, v50.l
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v14, v149, v14
-; GFX11-TRUE16-NEXT:    v_or_b16 v149.l, v15.l, v100.h
-; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v15.l, v149.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v22.l, 0xff, v49.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v21.h, v21.h, v82.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v22.h, 0xff, v49.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v23.l, 0xff, v48.h
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v15, v149, v15
-; GFX11-TRUE16-NEXT:    v_or_b16 v149.l, v16.l, v98.h
-; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v16.l, v149.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v22.h, v22.h, v71.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v23.h, 0xff, v48.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v24.l, 0xff, v39.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v24.h, 0xff, v39.l
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v16, v149, v16
-; GFX11-TRUE16-NEXT:    v_or_b16 v149.l, v17.l, v97.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v17.l, v149.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v23.h, v23.h, v70.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v24.h, v24.h, v68.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v25.l, 0xff, v38.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v25.h, 0xff, v38.l
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v17, v149, v17
-; GFX11-TRUE16-NEXT:    v_or_b16 v149.l, v18.l, v87.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v18.l, v149.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v26.l, 0xff, v37.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v25.h, v25.h, v66.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v26.h, 0xff, v37.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v27.l, 0xff, v36.h
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v18, v149, v18
-; GFX11-TRUE16-NEXT:    v_or_b16 v149.l, v19.l, v85.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v19.l, v149.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v26.h, v26.h, v64.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v27.h, 0xff, v36.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v28.l, 0xff, v35.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v28.h, 0xff, v35.l
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v19, v149, v19
-; GFX11-TRUE16-NEXT:    v_or_b16 v149.l, v20.l, v83.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v20.l, v149.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v27.h, v27.h, v55.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v28.h, v28.h, v54.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v29.l, 0xff, v34.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v29.h, 0xff, v34.l
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v20, v149, v20
-; GFX11-TRUE16-NEXT:    v_or_b16 v149.l, v21.l, v81.h
-; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v21.l, v149.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v30.l, 0xff, v33.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v29.h, v29.h, v53.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v30.h, 0xff, v33.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v31.l, 0xff, v32.h
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v21, v149, v21
-; GFX11-TRUE16-NEXT:    v_or_b16 v149.l, v22.l, v71.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v22.l, v149.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v30.h, v30.h, v52.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v31.h, 0xff, v32.l
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr146_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr146_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr145_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr144_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr134_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr134_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr132_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr131_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr130_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr130_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr119_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr119_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr118_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr117_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr115_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr115_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr113_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr112_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr103_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr103_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr101_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr100_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr99_hi16
+; GFX11-TRUE16-NEXT:    v_and_b16 v8.l, 0xff, v47.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v8.h, 0xff, v46.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v4, v101, v4
+; GFX11-TRUE16-NEXT:    v_or_b16 v101.l, v5.l, v87.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v101.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v7.h, v7.h, v84.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v8.h, v8.h, v81.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v9.l, 0xff, v45.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v9.h, 0xff, v44.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v5, v101, v5
+; GFX11-TRUE16-NEXT:    v_or_b16 v101.l, v6.l, v85.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v6.l, v101.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v10.l, 0xff, v43.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v9.h, v9.h, v80.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v10.h, 0xff, v42.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v11.l, 0xff, v41.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v6, v101, v6
+; GFX11-TRUE16-NEXT:    v_or_b16 v101.l, v7.l, v83.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v7.l, v101.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v10.h, v10.h, v70.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v11.h, 0xff, v40.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v12.l, 0xff, v183.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v12.h, 0xff, v182.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v7, v101, v7
+; GFX11-TRUE16-NEXT:    v_or_b16 v101.l, v8.l, v81.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v8.l, v101.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v11.h, v11.h, v68.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v12.h, v12.h, v67.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v13.l, 0xff, v181.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v13.h, 0xff, v180.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v8, v101, v8
+; GFX11-TRUE16-NEXT:    v_or_b16 v101.l, v9.l, v80.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v9.l, v101.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v14.l, 0xff, v179.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v13.h, v13.h, v66.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v14.h, 0xff, v178.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v15.l, 0xff, v177.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v9, v101, v9
+; GFX11-TRUE16-NEXT:    v_or_b16 v101.l, v10.l, v69.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v10.l, v101.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v14.h, v14.h, v65.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v15.h, 0xff, v176.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v16.l, 0xff, v167.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v16.h, 0xff, v166.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v10, v101, v10
+; GFX11-TRUE16-NEXT:    v_or_b16 v101.l, v11.l, v68.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v11.l, v101.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v15.h, v15.h, v64.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v16.h, v16.h, v55.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v17.l, 0xff, v165.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v17.h, 0xff, v164.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v11, v101, v11
+; GFX11-TRUE16-NEXT:    v_or_b16 v101.l, v12.l, v67.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v12.l, v101.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v18.l, 0xff, v163.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v17.h, v17.h, v54.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v18.h, 0xff, v162.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v19.l, 0xff, v161.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v12, v101, v12
+; GFX11-TRUE16-NEXT:    v_or_b16 v101.l, v13.l, v66.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v13.l, v101.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v18.h, v18.h, v53.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v19.h, 0xff, v160.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v20.l, 0xff, v151.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v20.h, 0xff, v150.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v13, v101, v13
+; GFX11-TRUE16-NEXT:    v_or_b16 v101.l, v14.l, v65.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v14.l, v101.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v19.h, v19.h, v52.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v20.h, v20.h, v51.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v21.l, 0xff, v149.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v21.h, 0xff, v148.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v14, v101, v14
+; GFX11-TRUE16-NEXT:    v_or_b16 v101.l, v15.l, v64.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v15.l, v101.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v22.l, 0xff, v147.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v21.h, v21.h, v50.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v22.h, 0xff, v146.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v23.l, 0xff, v145.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v15, v101, v15
+; GFX11-TRUE16-NEXT:    v_or_b16 v101.l, v16.l, v55.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v16.l, v101.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v22.h, v22.h, v49.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v23.h, 0xff, v144.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v24.l, 0xff, v135.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v24.h, 0xff, v134.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v16, v101, v16
+; GFX11-TRUE16-NEXT:    v_or_b16 v101.l, v17.l, v54.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v17.l, v101.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v23.h, v23.h, v48.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v24.h, v24.h, v39.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v25.l, 0xff, v133.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v25.h, 0xff, v132.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v17, v101, v17
+; GFX11-TRUE16-NEXT:    v_or_b16 v101.l, v18.l, v53.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v18.l, v101.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v26.l, 0xff, v131.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v25.h, v25.h, v38.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v26.h, 0xff, v130.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v27.l, 0xff, v129.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v18, v101, v18
+; GFX11-TRUE16-NEXT:    v_or_b16 v101.l, v19.l, v52.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v19.l, v101.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v26.h, v26.h, v37.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v27.h, 0xff, v128.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v28.l, 0xff, v119.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v28.h, 0xff, v118.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v19, v101, v19
+; GFX11-TRUE16-NEXT:    v_or_b16 v101.l, v20.l, v51.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v20.l, v101.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v27.h, v27.h, v36.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v28.h, v28.h, v35.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v29.l, 0xff, v117.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v29.h, 0xff, v116.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v20, v101, v20
+; GFX11-TRUE16-NEXT:    v_or_b16 v101.l, v21.l, v50.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v21.l, v101.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v30.l, 0xff, v115.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v29.h, v29.h, v34.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v30.h, 0xff, v114.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v31.l, 0xff, v113.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v21, v101, v21
+; GFX11-TRUE16-NEXT:    v_or_b16 v101.l, v22.l, v49.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v22.l, v101.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v30.h, v30.h, v33.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v31.h, 0xff, v112.l
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr98_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr96_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr98_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr97_hi16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr96_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr86_hi16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr86_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr84_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr84_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr83_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr82_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr81_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr80_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr80_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr69_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr82_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr71_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr71_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr70_hi16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr69_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr68_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr67_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr65_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr65_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr55_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr50_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr49_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr49_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr48_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr48_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr39_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr39_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr38_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr38_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr37_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr37_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr36_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr36_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr35_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr35_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr34_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr34_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr33_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr33_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr32_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr32_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr151_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr151_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr150_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr150_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr148_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr148_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr147_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr147_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr144_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr145_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr135_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr135_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr133_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr133_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr131_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr132_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr129_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr129_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr128_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr128_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr117_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr118_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr116_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr116_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr114_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr114_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr112_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr113_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr112
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr113
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr114
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr115
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr116
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr117
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr118
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr119
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr128
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr129
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr130
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr131
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr132
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr133
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr134
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr135
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr144
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr145
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr146
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr147
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr148
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr149
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr150
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr151
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr160
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr161
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr162
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr163
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr164
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr165
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr166
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr167
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr176
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr177
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr178
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr179
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr180
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr181
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr182
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr183
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr40
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr41
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr42
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr43
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr44
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr45
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr46
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr47
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr103_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr103_hi16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr102_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr102_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr100_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr100_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr101_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr98_hi16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr99_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr99_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr96_hi16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr97_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr97_hi16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr87_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr87_hi16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr85_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr85_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr83_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr83_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr84_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr81_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr81_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr82_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr71_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr71_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr70_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr80_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr80_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr69_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr70_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr68_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr68_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr67_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr67_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr66_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr66_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr65_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr65_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr64_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr64_hi16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr55_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr55_hi16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr54_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr54_hi16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr53_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr53_hi16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr52_lo16
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v22, v149, v22
-; GFX11-TRUE16-NEXT:    v_or_b16 v149.l, v23.l, v70.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v23.l, v149.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v31.h, v31.h, v51.l
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr70_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr52_hi16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr51_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr51_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr50_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr50_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr49_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr49_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr48_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr39_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr38_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr37_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr36_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr35_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr34_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr33_hi16
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v22, v101, v22
+; GFX11-TRUE16-NEXT:    v_or_b16 v101.l, v23.l, v48.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v23.l, v101.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v31.h, v31.h, v32.h
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr48_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr32_hi16
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v23, v149, v23
-; GFX11-TRUE16-NEXT:    v_or_b16 v149.l, v24.l, v67.h
-; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v24.l, v149.h
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr67_hi16
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v24, v149, v24
-; GFX11-TRUE16-NEXT:    v_or_b16 v149.l, v25.l, v66.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v25.l, v149.h
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr66_lo16
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v23, v101, v23
+; GFX11-TRUE16-NEXT:    v_or_b16 v101.l, v24.l, v39.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v24.l, v101.h
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr39_lo16
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v24, v101, v24
+; GFX11-TRUE16-NEXT:    v_or_b16 v101.l, v25.l, v38.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v25.l, v101.h
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr38_lo16
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v25, v149, v25
-; GFX11-TRUE16-NEXT:    v_or_b16 v149.l, v26.l, v64.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v26.l, v149.h
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr64_lo16
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v26, v149, v26
-; GFX11-TRUE16-NEXT:    v_or_b16 v149.l, v27.l, v54.h
-; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v27.l, v149.h
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr54_hi16
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v25, v101, v25
+; GFX11-TRUE16-NEXT:    v_or_b16 v101.l, v26.l, v37.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v26.l, v101.h
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr37_lo16
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v26, v101, v26
+; GFX11-TRUE16-NEXT:    v_or_b16 v101.l, v27.l, v36.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v27.l, v101.h
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr36_lo16
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v27, v149, v27
-; GFX11-TRUE16-NEXT:    v_or_b16 v149.l, v28.l, v53.h
-; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v28.l, v149.h
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr53_hi16
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v28, v149, v28
-; GFX11-TRUE16-NEXT:    v_or_b16 v149.l, v29.l, v52.h
-; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v29.l, v149.h
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr52_hi16
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v27, v101, v27
+; GFX11-TRUE16-NEXT:    v_or_b16 v101.l, v28.l, v35.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v28.l, v101.h
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr35_lo16
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v28, v101, v28
+; GFX11-TRUE16-NEXT:    v_or_b16 v101.l, v29.l, v34.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v29.l, v101.h
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr34_lo16
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v29, v149, v29
-; GFX11-TRUE16-NEXT:    v_or_b16 v149.l, v30.l, v51.h
-; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v30.l, v149.h
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr51_hi16
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v30, v149, v30
-; GFX11-TRUE16-NEXT:    v_or_b16 v149.l, v31.l, v50.h
-; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v31.l, v149.h
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr50_hi16
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v29, v101, v29
+; GFX11-TRUE16-NEXT:    v_or_b16 v101.l, v30.l, v33.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v30.l, v101.h
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr33_lo16
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v30, v101, v30
+; GFX11-TRUE16-NEXT:    v_or_b16 v101.l, v31.l, v32.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v31.l, v101.h
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr32_lo16
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v31, v149, v31
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr149_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr149_lo16
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v31, v101, v31
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr101_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr101_lo16
+; GFX11-TRUE16-NEXT:  .LBB14_2: ; %Flow
 ; GFX11-TRUE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX11-TRUE16-NEXT:    s_cbranch_execz .LBB14_2
-; GFX11-TRUE16-NEXT:  .LBB14_4: ; %cmp.true
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v149.h, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, v149.l, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, v146.h, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.h, v146.l, 3
+; GFX11-TRUE16-NEXT:    s_cbranch_execz .LBB14_4
+; GFX11-TRUE16-NEXT:  ; %bb.3: ; %cmp.true
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v101.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, v101.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, v98.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.h, v98.l, 3
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v31.h, 0
 ; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v0.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v1.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v1.h
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v31.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v151.l, v0.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v151.h, v0.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v150.h, v1.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, v150.l, v1.h
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, v145.h, 3
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v103.l, v0.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v103.h, v0.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v102.h, v1.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, v102.l, v1.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, v97.h, 3
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v31.l, 0x300, v0.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v3.h, 0x300, v0.h
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.h, v144.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.h, v96.l, 3
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v4.h, 0x300, v1.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, v134.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, v86.h, 3
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v31, v3
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v31.l, 0x300, v1.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v2.h
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.h, v134.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.h, v86.l, 3
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v4.l, v31.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v3.l, v148.l, v1.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v3.l, v100.l, v1.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v3.h, v148.h, v1.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v3.h, v100.h, v1.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v2.h, 0xff, v2.h
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v31, v4
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v31.l, 0x300, v3.l
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v31.h
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v5.h, 0x300, v3.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v3.l, v147.l, v2.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v3.h, v147.h, v2.h
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v4.l, v132.h, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v4.h, v131.l, 3
+; GFX11-TRUE16-NEXT:    v_or_b16 v3.l, v99.l, v2.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v3.h, v99.h, v2.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v4.l, v84.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v4.h, v83.l, 3
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v2, v31, v5
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v31.l, 0x300, v3.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v6.h, 0x300, v3.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v3.l, 0xff, v4.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v3.h, 0xff, v4.h
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v4.l, v130.h, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v4.h, v130.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v4.l, v82.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v4.h, v82.l, 3
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v6.l, v31.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v5.l, v144.h, v3.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v5.h, v145.l, v3.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v5.l, v96.h, v3.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v5.h, v97.l, v3.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v4.l, 0xff, v4.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v4.h, 0xff, v4.h
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v3, v31, v6
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v31.l, 0x300, v5.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v7.h, 0x300, v5.h
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v7.l, v31.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v5.l, v135.l, v4.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v5.h, v135.h, v4.h
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v6.l, v119.h, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v6.h, v119.l, 3
+; GFX11-TRUE16-NEXT:    v_or_b16 v5.l, v87.l, v4.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v5.h, v87.h, v4.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v6.l, v71.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v6.h, v71.l, 3
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v4, v31, v7
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v31.l, 0x300, v5.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v8.h, 0x300, v5.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v5.l, 0xff, v6.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v5.h, 0xff, v6.h
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v6.l, v118.h, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v6.h, v117.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v6.l, v70.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v6.h, v69.l, 3
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v8.l, v31.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v7.l, v133.l, v5.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v7.h, v133.h, v5.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v7.l, v85.l, v5.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v7.h, v85.h, v5.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v6.l, 0xff, v6.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v6.h, 0xff, v6.h
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v5, v31, v8
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v31.l, 0x300, v7.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v9.h, 0x300, v7.h
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v9.l, v31.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v7.l, v131.h, v6.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v7.h, v132.l, v6.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v7.l, v83.h, v6.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v7.h, v84.l, v6.h
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v8.l, v115.h, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v8.h, v115.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v8.l, v47.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v8.h, v46.l, 3
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v6, v31, v9
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v31.l, 0x300, v7.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v10.h, 0x300, v7.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v7.l, 0xff, v8.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v7.h, 0xff, v8.h
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v8.l, v113.h, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v8.h, v112.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v8.l, v45.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v8.h, v44.l, 3
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v10.l, v31.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v9.l, v129.l, v7.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v9.h, v129.h, v7.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v9.l, v81.l, v7.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v9.h, v81.h, v7.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v8.l, 0xff, v8.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v8.h, 0xff, v8.h
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v7, v31, v10
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v31.l, 0x300, v9.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v11.h, 0x300, v9.h
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v11.l, v31.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v9.l, v128.l, v8.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v9.h, v128.h, v8.h
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v10.l, v103.h, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v10.h, v103.l, 3
+; GFX11-TRUE16-NEXT:    v_or_b16 v9.l, v80.l, v8.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v9.h, v80.h, v8.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v10.l, v43.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v10.h, v42.l, 3
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v8, v31, v11
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v31.l, 0x300, v9.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v12.h, 0x300, v9.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v9.l, 0xff, v10.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v9.h, 0xff, v10.h
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v10.l, v101.h, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v10.h, v100.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v10.l, v41.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v10.h, v40.l, 3
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v12.l, v31.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v11.l, v117.h, v9.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v11.h, v118.l, v9.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v11.l, v69.h, v9.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v11.h, v70.l, v9.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v10.l, 0xff, v10.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v10.h, 0xff, v10.h
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v9, v31, v12
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v31.l, 0x300, v11.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v13.h, 0x300, v11.h
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v13.l, v31.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v11.l, v116.l, v10.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v11.h, v116.h, v10.h
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v12.l, v99.h, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v12.h, v98.l, 3
+; GFX11-TRUE16-NEXT:    v_or_b16 v11.l, v68.l, v10.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v11.h, v68.h, v10.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v12.l, v183.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v12.h, v182.l, 3
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v10, v31, v13
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v31.l, 0x300, v11.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v14.h, 0x300, v11.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v11.l, 0xff, v12.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v11.h, 0xff, v12.h
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v12.l, v96.h, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v12.h, v96.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v12.l, v181.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v12.h, v180.l, 3
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v14.l, v31.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v13.l, v114.l, v11.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v13.h, v114.h, v11.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v13.l, v67.l, v11.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v13.h, v67.h, v11.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v12.l, 0xff, v12.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v12.h, 0xff, v12.h
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v11, v31, v14
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v31.l, 0x300, v13.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v15.h, 0x300, v13.h
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v15.l, v31.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v13.l, v112.h, v12.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v13.h, v113.l, v12.h
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v14.l, v86.h, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v14.h, v86.l, 3
+; GFX11-TRUE16-NEXT:    v_or_b16 v13.l, v66.l, v12.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v13.h, v66.h, v12.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v14.l, v179.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v14.h, v178.l, 3
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v12, v31, v15
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v31.l, 0x300, v13.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v16.h, 0x300, v13.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v13.l, 0xff, v14.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v13.h, 0xff, v14.h
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v14.l, v84.h, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v14.h, v84.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v14.l, v177.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v14.h, v176.l, 3
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v16.l, v31.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v15.l, v102.l, v13.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v15.h, v102.h, v13.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v15.l, v65.l, v13.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v15.h, v65.h, v13.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v14.l, 0xff, v14.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v14.h, 0xff, v14.h
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v13, v31, v16
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v31.l, 0x300, v15.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v17.h, 0x300, v15.h
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v17.l, v31.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v15.l, v100.h, v14.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v15.h, v101.l, v14.h
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v16.l, v82.h, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v16.h, v81.l, 3
+; GFX11-TRUE16-NEXT:    v_or_b16 v15.l, v64.l, v14.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v15.h, v64.h, v14.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v16.l, v167.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v16.h, v166.l, 3
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v14, v31, v17
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v31.l, 0x300, v15.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v18.h, 0x300, v15.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v15.l, 0xff, v16.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v15.h, 0xff, v16.h
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v16.l, v80.h, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v16.h, v80.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v16.l, v165.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v16.h, v164.l, 3
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v18.l, v31.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v17.l, v98.h, v15.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v17.h, v99.l, v15.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v17.l, v55.l, v15.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v17.h, v55.h, v15.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v16.l, 0xff, v16.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v16.h, 0xff, v16.h
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v15, v31, v18
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v31.l, 0x300, v17.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v19.h, 0x300, v17.h
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v19.l, v31.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v17.l, v97.l, v16.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v17.h, v97.h, v16.h
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v18.l, v69.h, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v18.h, v69.l, 3
+; GFX11-TRUE16-NEXT:    v_or_b16 v17.l, v54.l, v16.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v17.h, v54.h, v16.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v18.l, v163.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v18.h, v162.l, 3
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v16, v31, v19
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v31.l, 0x300, v17.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v20.h, 0x300, v17.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v17.l, 0xff, v18.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v17.h, 0xff, v18.h
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v18.l, v68.h, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v18.h, v67.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v18.l, v161.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v18.h, v160.l, 3
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v20.l, v31.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v19.l, v87.l, v17.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v19.h, v87.h, v17.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v19.l, v53.l, v17.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v19.h, v53.h, v17.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v18.l, 0xff, v18.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v18.h, 0xff, v18.h
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v17, v31, v20
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v31.l, 0x300, v19.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v21.h, 0x300, v19.h
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v21.l, v31.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v19.l, v85.l, v18.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v19.h, v85.h, v18.h
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v20.l, v65.h, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v20.h, v65.l, 3
+; GFX11-TRUE16-NEXT:    v_or_b16 v19.l, v52.l, v18.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v19.h, v52.h, v18.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v20.l, v151.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v20.h, v150.l, 3
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v18, v31, v21
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v31.l, 0x300, v19.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v22.h, 0x300, v19.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v19.l, 0xff, v20.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v19.h, 0xff, v20.h
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v20.l, v55.h, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v20.h, v50.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v20.l, v149.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v20.h, v148.l, 3
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v22.l, v31.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v21.l, v83.l, v19.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v21.h, v83.h, v19.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v21.l, v51.l, v19.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v21.h, v51.h, v19.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v20.l, 0xff, v20.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v20.h, 0xff, v20.h
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v19, v31, v22
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v31.l, 0x300, v21.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v23.h, 0x300, v21.h
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v23.l, v31.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v21.l, v81.h, v20.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v21.h, v82.l, v20.h
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v22.l, v49.h, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v22.h, v49.l, 3
+; GFX11-TRUE16-NEXT:    v_or_b16 v21.l, v50.l, v20.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v21.h, v50.h, v20.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v22.l, v147.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v22.h, v146.l, 3
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v20, v31, v23
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v31.l, 0x300, v21.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v24.h, 0x300, v21.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v21.l, 0xff, v22.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v21.h, 0xff, v22.h
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v22.l, v48.h, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v22.h, v48.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v22.l, v145.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v22.h, v144.l, 3
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v24.l, v31.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v23.l, v71.l, v21.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v23.h, v71.h, v21.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v23.l, v49.l, v21.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v23.h, v49.h, v21.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v22.l, 0xff, v22.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v22.h, 0xff, v22.h
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v21, v31, v24
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v31.l, 0x300, v23.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v25.h, 0x300, v23.h
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v25.l, v31.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v23.l, v70.l, v22.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v23.h, v70.h, v22.h
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v24.l, v39.h, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v24.h, v39.l, 3
+; GFX11-TRUE16-NEXT:    v_or_b16 v23.l, v48.l, v22.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v23.h, v48.h, v22.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v24.l, v135.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v24.h, v134.l, 3
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v22, v31, v25
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v31.l, 0x300, v23.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v26.h, 0x300, v23.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v23.l, 0xff, v24.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v23.h, 0xff, v24.h
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v24.l, v38.h, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v24.h, v38.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v24.l, v133.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v24.h, v132.l, 3
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v26.l, v31.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v25.l, v67.h, v23.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v25.h, v68.l, v23.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v25.l, v39.l, v23.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v25.h, v39.h, v23.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v24.l, 0xff, v24.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v24.h, 0xff, v24.h
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v23, v31, v26
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v31.l, 0x300, v25.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v27.h, 0x300, v25.h
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v27.l, v31.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v25.l, v66.l, v24.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v25.h, v66.h, v24.h
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v26.l, v37.h, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v26.h, v37.l, 3
+; GFX11-TRUE16-NEXT:    v_or_b16 v25.l, v38.l, v24.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v25.h, v38.h, v24.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v26.l, v131.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v26.h, v130.l, 3
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v24, v31, v27
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v31.l, 0x300, v25.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v28.h, 0x300, v25.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v25.l, 0xff, v26.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v25.h, 0xff, v26.h
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v26.l, v36.h, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v26.h, v36.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v26.l, v129.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v26.h, v128.l, 3
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v28.l, v31.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v27.l, v64.l, v25.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v27.h, v64.h, v25.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v27.l, v37.l, v25.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v27.h, v37.h, v25.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v26.l, 0xff, v26.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v26.h, 0xff, v26.h
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v25, v31, v28
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v31.l, 0x300, v27.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v29.h, 0x300, v27.h
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v29.l, v31.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v27.l, v54.h, v26.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v27.h, v55.l, v26.h
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v28.l, v35.h, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v28.h, v35.l, 3
+; GFX11-TRUE16-NEXT:    v_or_b16 v27.l, v36.l, v26.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v27.h, v36.h, v26.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v28.l, v119.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v28.h, v118.l, 3
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v26, v31, v29
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v31.l, 0x300, v27.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v30.h, 0x300, v27.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v27.l, 0xff, v28.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v27.h, 0xff, v28.h
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v28.l, v34.h, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v28.h, v34.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v28.l, v117.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v28.h, v116.l, 3
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v30.l, v31.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v29.l, v53.h, v27.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v29.h, v54.l, v27.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v29.l, v35.l, v27.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v29.h, v35.h, v27.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v28.l, 0xff, v28.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v28.h, 0xff, v28.h
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v27, v31, v30
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v31.l, 0x300, v29.l
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v34.h, 0x300, v29.h
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v34.l, v31.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v29.l, v52.h, v28.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v29.h, v53.l, v28.h
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v30.l, v33.h, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v30.h, v33.l, 3
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v28, v31, v34
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v35.h, 0x300, v29.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v35.l, v31.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v29.l, v34.l, v28.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v29.h, v34.h, v28.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v30.l, v115.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v30.h, v114.l, 3
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v28, v31, v35
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v31.l, 0x300, v29.l
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v33.h, 0x300, v29.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v34.h, 0x300, v29.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v29.l, 0xff, v30.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v29.h, 0xff, v30.h
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v30.l, v32.h, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v30.h, v32.l, 3
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v33.l, v31.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v32.l, v51.h, v29.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v32.h, v52.l, v29.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v30.l, v113.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v30.h, v112.l, 3
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v34.l, v31.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v33.l, v33.l, v29.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v33.h, v33.h, v29.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v30.l, 0xff, v30.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v30.h, 0xff, v30.h
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v29, v31, v33
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v31.l, 0x300, v32.l
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v33.h, 0x300, v32.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v32.l, v50.h, v30.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v32.h, v51.l, v30.h
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v29, v31, v34
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v31.l, 0x300, v33.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v33.h, 0x300, v33.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v33.l, v31.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v32.l, v32.l, v30.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v32.h, v32.h, v30.h
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v30, v31, v33
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v31.l, 0x300, v32.l
@@ -16246,7 +16283,48 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) {
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v32.h, 0x300, v32.h
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v32.l, v31.h
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v31, v31, v32
+; GFX11-TRUE16-NEXT:  .LBB14_4: ; %end
 ; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_clause 0x1f
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v108, off, s32 offset:392
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v107, off, s32 offset:396
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v106, off, s32 offset:400
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v105, off, s32 offset:404
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v104, off, s32 offset:408
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v95, off, s32 offset:412
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v94, off, s32 offset:416
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v93, off, s32 offset:420
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v92, off, s32 offset:424
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v91, off, s32 offset:428
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v90, off, s32 offset:432
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v89, off, s32 offset:436
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v88, off, s32 offset:440
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v79, off, s32 offset:444
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v78, off, s32 offset:448
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v77, off, s32 offset:452
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v76, off, s32 offset:456
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v75, off, s32 offset:460
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v74, off, s32 offset:464
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v73, off, s32 offset:468
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v72, off, s32 offset:472
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v63, off, s32 offset:476
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v62, off, s32 offset:480
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v61, off, s32 offset:484
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v60, off, s32 offset:488
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v59, off, s32 offset:492
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v58, off, s32 offset:496
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v57, off, s32 offset:500
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v56, off, s32 offset:504
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v47, off, s32 offset:508
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v46, off, s32 offset:512
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v45, off, s32 offset:516
+; GFX11-TRUE16-NEXT:    s_clause 0x4
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v44, off, s32 offset:520
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v43, off, s32 offset:524
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v42, off, s32 offset:528
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v41, off, s32 offset:532
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v40, off, s32 offset:536
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-FAKE16-LABEL: bitcast_v128i8_to_v32i32:
@@ -20512,1887 +20590,946 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-TRUE16-LABEL: bitcast_v128i8_to_v32i32_scalar:
-; GFX11-TRUE16:       ; %bb.0:
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT:    s_clause 0x1f
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v40, s32 offset:476
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v41, s32 offset:472
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v42, s32 offset:468
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v43, s32 offset:464
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v44, s32 offset:460
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v45, s32 offset:456
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v46, s32 offset:452
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v47, s32 offset:448
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v56, s32 offset:444
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v57, s32 offset:440
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v58, s32 offset:436
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v59, s32 offset:432
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v60, s32 offset:428
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v61, s32 offset:424
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v62, s32 offset:420
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v63, s32 offset:416
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v72, s32 offset:412
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v73, s32 offset:408
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v74, s32 offset:404
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v75, s32 offset:400
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v76, s32 offset:396
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v77, s32 offset:392
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v78, s32 offset:388
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v79, s32 offset:384
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v88, s32 offset:380
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v89, s32 offset:376
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v90, s32 offset:372
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v91, s32 offset:368
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v92, s32 offset:364
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v93, s32 offset:360
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v94, s32 offset:356
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v95, s32 offset:352
-; GFX11-TRUE16-NEXT:    s_clause 0x7
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v104, s32 offset:348
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v105, s32 offset:344
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v106, s32 offset:340
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v107, s32 offset:336
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v108, s32 offset:332
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v109, s32 offset:328
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v110, s32 offset:324
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v111, s32 offset:320
-; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v32, v30 :: v_dual_mov_b32 v33, v28
-; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v34, v26 :: v_dual_mov_b32 v35, v24
-; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v36, v22 :: v_dual_mov_b32 v37, v20
-; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v38, v18 :: v_dual_mov_b32 v39, v16
-; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v48, v14 :: v_dual_mov_b32 v49, v12
-; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v50, v10 :: v_dual_mov_b32 v51, v8
-; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v52, v6 :: v_dual_mov_b32 v53, v4
-; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v54, v2 :: v_dual_mov_b32 v55, v0
-; GFX11-TRUE16-NEXT:    s_clause 0x1f
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v0, off, s32 offset:316
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v2, off, s32
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v4, off, s32 offset:8
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v6, off, s32 offset:16
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v8, off, s32 offset:24
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v10, off, s32 offset:32
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v12, off, s32 offset:40
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v14, off, s32 offset:48
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v16, off, s32 offset:56
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v18, off, s32 offset:64
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v20, off, s32 offset:72
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v22, off, s32 offset:80
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v24, off, s32 offset:88
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v26, off, s32 offset:96
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v28, off, s32 offset:104
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v30, off, s32 offset:112
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v31, off, s32 offset:120
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v87, off, s32 offset:128
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v96, off, s32 offset:136
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v97, off, s32 offset:144
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v98, off, s32 offset:152
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v99, off, s32 offset:160
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v113, off, s32 offset:168
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v114, off, s32 offset:176
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v115, off, s32 offset:184
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v116, off, s32 offset:192
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v117, off, s32 offset:200
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v128, off, s32 offset:208
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v129, off, s32 offset:216
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v130, off, s32 offset:224
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v131, off, s32 offset:232
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v94, off, s32 offset:240
-; GFX11-TRUE16-NEXT:    s_clause 0x1f
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v95, off, s32 offset:248
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v104, off, s32 offset:256
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v105, off, s32 offset:264
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v106, off, s32 offset:272
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v107, off, s32 offset:280
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v108, off, s32 offset:288
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v109, off, s32 offset:296
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v110, off, s32 offset:304
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v111, off, s32 offset:312
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v64, off, s32 offset:308
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v65, off, s32 offset:300
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v66, off, s32 offset:292
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v67, off, s32 offset:284
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v68, off, s32 offset:276
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v69, off, s32 offset:268
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v70, off, s32 offset:260
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v71, off, s32 offset:252
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v80, off, s32 offset:244
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v81, off, s32 offset:236
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v82, off, s32 offset:228
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v83, off, s32 offset:220
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v84, off, s32 offset:212
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v85, off, s32 offset:204
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v86, off, s32 offset:196
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v100, off, s32 offset:188
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v101, off, s32 offset:180
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v102, off, s32 offset:172
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v103, off, s32 offset:164
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v112, off, s32 offset:156
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v118, off, s32 offset:148
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v145, off, s32 offset:140
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v146, off, s32 offset:132
-; GFX11-TRUE16-NEXT:    s_clause 0xf
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v147, off, s32 offset:124
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v148, off, s32 offset:116
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v162, off, s32 offset:108
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v163, off, s32 offset:100
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v164, off, s32 offset:92
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v165, off, s32 offset:84
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v166, off, s32 offset:76
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v180, off, s32 offset:68
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v181, off, s32 offset:60
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v182, off, s32 offset:52
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v183, off, s32 offset:44
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v45, off, s32 offset:36
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v46, off, s32 offset:28
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v47, off, s32 offset:20
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v56, off, s32 offset:12
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v62, off, s32 offset:4
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v89, 8, v1
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v90, 8, v3
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v91, 8, v5
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v92, 8, v7
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v93, 8, v9
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v76, 8, v11
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v77, 8, v13
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v78, 8, v15
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v79, 8, v17
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v88, 8, v19
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v63, 8, v21
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v72, 8, v23
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v73, 8, v25
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v74, 8, v27
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v75, 8, v29
-; GFX11-TRUE16-NEXT:    s_mov_b32 s4, 0
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(62)
-; GFX11-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v57, 8, v2
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v58, 8, v4
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v59, 8, v6
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v60, 8, v8
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v61, 8, v10
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v40, 8, v12
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v41, 8, v14
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v42, 8, v16
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v43, 8, v18
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v44, 8, v20
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v167, 8, v22
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v176, 8, v24
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v177, 8, v26
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v178, 8, v28
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v179, 8, v30
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v149, 8, v31
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v150, 8, v87
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(61)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v151, 8, v96
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(60)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v160, 8, v97
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(59)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v161, 8, v98
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(58)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v132, 8, v99
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(57)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v133, 8, v113
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(56)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v134, 8, v114
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(55)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v135, 8, v115
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(54)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v144, 8, v116
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(53)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v119, 8, v117
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(52)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v128, 8, v128
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(51)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v129, 8, v129
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(50)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v130, 8, v130
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(49)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v131, 8, v131
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(48)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v113, 8, v94
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(47)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v114, 8, v95
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(46)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v115, 8, v104
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(45)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v116, 8, v105
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(44)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v117, 8, v106
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(43)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v87, 8, v107
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(42)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v96, 8, v108
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(41)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v97, 8, v109
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(40)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v98, 8, v110
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(39)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v99, 8, v111
-; GFX11-TRUE16-NEXT:    s_and_b32 s5, vcc_lo, exec_lo
-; GFX11-TRUE16-NEXT:    s_cbranch_scc0 .LBB15_4
-; GFX11-TRUE16-NEXT:  ; %bb.1: ; %cmp.false
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v54
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff, v53
-; GFX11-TRUE16-NEXT:    s_and_b32 s5, s28, 0xff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s6, s29, 8
-; GFX11-TRUE16-NEXT:    s_and_b32 s7, s2, 0xff
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v0, v90
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v1, v91
-; GFX11-TRUE16-NEXT:    s_or_b32 s5, s5, s6
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s6, s1, 8
-; GFX11-TRUE16-NEXT:    s_and_b32 s5, s5, 0xffff
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s8, s3, 8
-; GFX11-TRUE16-NEXT:    s_and_b32 s9, s18, 0xff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s10, s19, 8
-; GFX11-TRUE16-NEXT:    s_and_b32 s11, s26, 0xff
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v5, v0, v1
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v50
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff, v49
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s12, s27, 8
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v0, v76
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v1, v77
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v7, v0, v1
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v48
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff, v39
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v0, v78
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v1, v79
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v8, v0, v1
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v38
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff, v37
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v0, v88
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v1, v63
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v9, v0, v1
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v36
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff, v35
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v0, v72
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v1, v73
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v10, v0, v1
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v34
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff, v33
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v0, v74
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v1, v75
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v11, v0, v1
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v32
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff, v62
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v0, v57
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v1, v58
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v12, v0, v1
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v56
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff, v47
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v0, v59
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v1, v60
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v13, v0, v1
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v46
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff, v45
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v0, v61
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v1, v40
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v14, v0, v1
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v183
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff, v182
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v0, v41
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v1, v42
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v15, v0, v1
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v181
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff, v180
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v0, v43
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v1, v44
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v16, v0, v1
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v166
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff, v165
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v0, v167
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v1, v176
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v17, v0, v1
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v164
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff, v163
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v0, v177
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v1, v178
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v18, v0, v1
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v162
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff, v148
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v0, v179
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v1, v149
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v19, v0, v1
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v147
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff, v146
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v0, v150
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v1, v151
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v20, v0, v1
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v145
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff, v118
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v0, v160
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v1, v161
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v21, v0, v1
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v112
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff, v103
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v0, v132
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v1, v133
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v22, v0, v1
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v102
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff, v101
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v0, v134
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v1, v135
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v23, v0, v1
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v100
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff, v86
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v0, v144
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v1, v119
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v24, v0, v1
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v85
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff, v84
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v0, v128
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v1, v129
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v25, v0, v1
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v83
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff, v82
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v0, v130
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v1, v131
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v26, v0, v1
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v81
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff, v80
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v0, v113
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v1, v114
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v27, v0, v1
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v71
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff, v70
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v0, v115
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v1, v116
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v28, v0, v1
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v69
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff, v68
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v0, v117
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v1, v87
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v29, v0, v1
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v67
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff, v66
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v0, v96
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v1, v97
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v30, v0, v1
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v65
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff, v64
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v0, v98
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v1, v99
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v31, v0, v1
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v55
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v0, v89
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v4, s5, v0
-; GFX11-TRUE16-NEXT:    s_and_b32 s5, s0, 0xff
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-TRUE16-NEXT:    s_or_b32 s5, s5, s6
-; GFX11-TRUE16-NEXT:    s_or_b32 s6, s7, s8
-; GFX11-TRUE16-NEXT:    s_and_b32 s7, s16, 0xff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s8, s17, 8
-; GFX11-TRUE16-NEXT:    s_and_b32 s5, s5, 0xffff
-; GFX11-TRUE16-NEXT:    s_or_b32 s7, s7, s8
-; GFX11-TRUE16-NEXT:    s_or_b32 s8, s9, s10
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s6, s6, 16
-; GFX11-TRUE16-NEXT:    s_and_b32 s7, s7, 0xffff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s8, s8, 16
-; GFX11-TRUE16-NEXT:    s_or_b32 s5, s5, s6
-; GFX11-TRUE16-NEXT:    s_or_b32 s6, s7, s8
-; GFX11-TRUE16-NEXT:    s_and_b32 s7, s20, 0xff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s8, s21, 8
-; GFX11-TRUE16-NEXT:    s_and_b32 s9, s22, 0xff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s10, s23, 8
-; GFX11-TRUE16-NEXT:    s_or_b32 s7, s7, s8
-; GFX11-TRUE16-NEXT:    s_or_b32 s8, s9, s10
-; GFX11-TRUE16-NEXT:    s_and_b32 s9, s24, 0xff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s10, s25, 8
-; GFX11-TRUE16-NEXT:    s_and_b32 s7, s7, 0xffff
-; GFX11-TRUE16-NEXT:    s_or_b32 s9, s9, s10
-; GFX11-TRUE16-NEXT:    s_or_b32 s10, s11, s12
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s8, s8, 16
-; GFX11-TRUE16-NEXT:    s_and_b32 s9, s9, 0xffff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s10, s10, 16
-; GFX11-TRUE16-NEXT:    s_or_b32 s7, s7, s8
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v3, 0xff, v51
-; GFX11-TRUE16-NEXT:    s_or_b32 s8, s9, s10
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v2, 0xff, v52
-; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, s6
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v3, v3, v93
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v2, v2, v92
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v6, v2, v3
-; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v2, s7 :: v_dual_mov_b32 v3, s8
-; GFX11-TRUE16-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s4
-; GFX11-TRUE16-NEXT:    s_cbranch_vccnz .LBB15_3
-; GFX11-TRUE16-NEXT:  .LBB15_2: ; %cmp.true
-; GFX11-TRUE16-NEXT:    s_add_i32 s0, s0, 3
-; GFX11-TRUE16-NEXT:    s_add_i32 s2, s2, 3
-; GFX11-TRUE16-NEXT:    s_and_b32 s0, s0, 0xff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s1, s1, 8
-; GFX11-TRUE16-NEXT:    s_and_b32 s2, s2, 0xff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s3, s3, 8
-; GFX11-TRUE16-NEXT:    s_or_b32 s0, s1, s0
-; GFX11-TRUE16-NEXT:    s_or_b32 s1, s3, s2
-; GFX11-TRUE16-NEXT:    s_addk_i32 s0, 0x300
-; GFX11-TRUE16-NEXT:    s_addk_i32 s1, 0x300
-; GFX11-TRUE16-NEXT:    s_and_b32 s0, s0, 0xffff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s1, s1, 16
-; GFX11-TRUE16-NEXT:    s_add_i32 s16, s16, 3
-; GFX11-TRUE16-NEXT:    s_or_b32 s0, s0, s1
-; GFX11-TRUE16-NEXT:    s_and_b32 s1, s16, 0xff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s2, s17, 8
-; GFX11-TRUE16-NEXT:    s_add_i32 s18, s18, 3
-; GFX11-TRUE16-NEXT:    s_or_b32 s1, s2, s1
-; GFX11-TRUE16-NEXT:    s_and_b32 s2, s18, 0xff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s3, s19, 8
-; GFX11-TRUE16-NEXT:    s_addk_i32 s1, 0x300
-; GFX11-TRUE16-NEXT:    s_or_b32 s2, s3, s2
-; GFX11-TRUE16-NEXT:    s_add_i32 s20, s20, 3
-; GFX11-TRUE16-NEXT:    s_addk_i32 s2, 0x300
-; GFX11-TRUE16-NEXT:    s_and_b32 s1, s1, 0xffff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s2, s2, 16
-; GFX11-TRUE16-NEXT:    s_and_b32 s3, s20, 0xff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s4, s21, 8
-; GFX11-TRUE16-NEXT:    s_add_i32 s22, s22, 3
-; GFX11-TRUE16-NEXT:    s_or_b32 s1, s1, s2
-; GFX11-TRUE16-NEXT:    s_or_b32 s2, s4, s3
-; GFX11-TRUE16-NEXT:    s_and_b32 s3, s22, 0xff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s4, s23, 8
-; GFX11-TRUE16-NEXT:    s_add_i32 s24, s24, 3
-; GFX11-TRUE16-NEXT:    s_or_b32 s3, s4, s3
-; GFX11-TRUE16-NEXT:    s_and_b32 s4, s24, 0xff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s5, s25, 8
-; GFX11-TRUE16-NEXT:    s_addk_i32 s2, 0x300
-; GFX11-TRUE16-NEXT:    s_addk_i32 s3, 0x300
-; GFX11-TRUE16-NEXT:    s_or_b32 s4, s5, s4
-; GFX11-TRUE16-NEXT:    s_and_b32 s2, s2, 0xffff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s3, s3, 16
-; GFX11-TRUE16-NEXT:    s_addk_i32 s4, 0x300
-; GFX11-TRUE16-NEXT:    s_add_i32 s26, s26, 3
-; GFX11-TRUE16-NEXT:    s_or_b32 s2, s2, s3
-; GFX11-TRUE16-NEXT:    s_and_b32 s3, s4, 0xffff
-; GFX11-TRUE16-NEXT:    s_and_b32 s4, s26, 0xff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s5, s27, 8
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v0, 3, v55
-; GFX11-TRUE16-NEXT:    s_or_b32 s4, s5, s4
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v1, 3, v54
-; GFX11-TRUE16-NEXT:    s_addk_i32 s4, 0x300
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v3, 3, v52
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s4, s4, 16
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v4, 3, v51
-; GFX11-TRUE16-NEXT:    s_or_b32 s3, s3, s4
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v2, 3, v53
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v7, 3, v38
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v11, 3, v34
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v12, 3, v33
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v89, v0
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v90, v1
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v2, v91, v2
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v3, v92, v3
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v4, v93, v4
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v7, 0xff, v7
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v11, 0xff, v11
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v12, 0xff, v12
-; GFX11-TRUE16-NEXT:    s_add_i32 s28, s28, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v0, 0x300, v0
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v1, 0x300, v1
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v2, 0x300, v2
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v3, 0x300, v3
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v4, 0x300, v4
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v7, v88, v7
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v11, v74, v11
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v12, v75, v12
-; GFX11-TRUE16-NEXT:    s_and_b32 s5, s28, 0xff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s6, s29, 8
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX11-TRUE16-NEXT:    s_or_b32 s5, s6, s5
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v3, 0xffff, v3
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v4
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v7, 0x300, v7
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v11, 0x300, v11
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v12, 0x300, v12
-; GFX11-TRUE16-NEXT:    s_addk_i32 s5, 0x300
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v5, v1, v2
-; GFX11-TRUE16-NEXT:    s_and_b32 s5, s5, 0xffff
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v6, v3, v6
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v4, s5, v0
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v0, 3, v50
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v1, 3, v49
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v2, 3, v48
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v3, 3, v39
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v9, 0xffff, v7
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v7, 3, v37
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v8, 3, v36
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v10, 3, v35
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v11, 0xffff, v11
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v7, 0xff, v7
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v8, 0xff, v8
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v10, 0xff, v10
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v11, v11, v12
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(3)
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v12, 3, v46
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v16, 3, v181
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v17, 3, v180
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v76, v0
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v77, v1
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v2, v78, v2
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v3, v79, v3
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v7, v63, v7
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v8, v72, v8
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v10, v73, v10
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v12, 0xff, v12
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v16, 0xff, v16
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v17, 0xff, v17
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v0, 0x300, v0
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v1, 0x300, v1
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v2, 0x300, v2
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v3, 0x300, v3
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v7, 0x300, v7
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v8, 0x300, v8
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v10, 0x300, v10
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v12, v61, v12
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v16, v43, v16
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v17, v44, v17
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v7
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v14, 0xffff, v8
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v12, 0x300, v12
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v16, 0x300, v16
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v17, 0x300, v17
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v7, v0, v1
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v8, v2, v3
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v9, v9, v13
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v10, v14, v10
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v0, 3, v32
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v1, 3, v62
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v2, 3, v56
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v3, 3, v47
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v14, 0xffff, v12
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v12, 3, v45
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v13, 3, v183
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v15, 3, v182
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v16, 0xffff, v16
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v17, 16, v17
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v12, 0xff, v12
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v13, 0xff, v13
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v15, 0xff, v15
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v16, v16, v17
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v17, 3, v162
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v21, 3, v145
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v22, 3, v118
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v57, v0
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v58, v1
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v2, v59, v2
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v3, v60, v3
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v12, v40, v12
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v13, v41, v13
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v15, v42, v15
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v17, 0xff, v17
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v21, 0xff, v21
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v22, 0xff, v22
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v0, 0x300, v0
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v1, 0x300, v1
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v2, 0x300, v2
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v3, 0x300, v3
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v12, 0x300, v12
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v13, 0x300, v13
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v15, 0x300, v15
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v17, v179, v17
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v21, v160, v21
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v22, v161, v22
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v18, 16, v12
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v19, 0xffff, v13
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v17, 0x300, v17
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v21, 0x300, v21
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v22, 0x300, v22
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v12, v0, v1
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v13, v2, v3
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v14, v14, v18
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v15, v19, v15
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v0, 3, v166
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v1, 3, v165
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v2, 3, v164
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v3, 3, v163
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v19, 0xffff, v17
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v17, 3, v148
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v18, 3, v147
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v20, 3, v146
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v21, 0xffff, v21
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v22
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v17, 0xff, v17
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v18, 0xff, v18
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v20, 0xff, v20
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v21, v21, v22
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v22, 3, v100
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v26, 3, v83
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v27, 3, v82
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v167, v0
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v176, v1
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v2, v177, v2
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v3, v178, v3
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v17, v149, v17
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v18, v150, v18
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v20, v151, v20
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v22, 0xff, v22
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v26, 0xff, v26
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v27, 0xff, v27
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v0, 0x300, v0
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v1, 0x300, v1
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v2, 0x300, v2
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v3, 0x300, v3
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v17, 0x300, v17
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v18, 0x300, v18
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v20, 0x300, v20
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v22, v144, v22
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v26, v130, v26
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v27, v131, v27
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v17
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v24, 0xffff, v18
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v20, 16, v20
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v22, 0x300, v22
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v26, 0x300, v26
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v27, 0x300, v27
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v17, v0, v1
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v18, v2, v3
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v19, v19, v23
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v20, v24, v20
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v0, 3, v112
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v1, 3, v103
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v2, 3, v102
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v3, 3, v101
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v24, 0xffff, v22
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v22, 3, v86
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v23, 3, v85
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v25, 3, v84
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v26, 0xffff, v26
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v27, 16, v27
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v22, 0xff, v22
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v23, 0xff, v23
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v25, 0xff, v25
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v26, v26, v27
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v27, 3, v69
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v132, v0
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v133, v1
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v2, v134, v2
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v3, v135, v3
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v22, v119, v22
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v23, v128, v23
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v25, v129, v25
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v27, 0xff, v27
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v0, 0x300, v0
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v1, 0x300, v1
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v2, 0x300, v2
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v3, 0x300, v3
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v22, 0x300, v22
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v23, 0x300, v23
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v25, 0x300, v25
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v27, v117, v27
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v28, 16, v22
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v29, 0xffff, v23
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v25
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v27, 0x300, v27
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v22, v0, v1
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v23, v2, v3
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v24, v24, v28
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v25, v29, v25
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v0, 3, v81
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v1, 3, v80
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v2, 3, v71
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v3, 3, v70
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v29, 0xffff, v27
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v27, 3, v68
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v28, 3, v67
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v30, 3, v66
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v31, 3, v65
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v32, 3, v64
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v27, 0xff, v27
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v28, 0xff, v28
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v30, 0xff, v30
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v31, 0xff, v31
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v32, 0xff, v32
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v113, v0
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v114, v1
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v2, v115, v2
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v3, v116, v3
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v27, v87, v27
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v28, v96, v28
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v30, v97, v30
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v31, v98, v31
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v32, v99, v32
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v0, 0x300, v0
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v1, 0x300, v1
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v2, 0x300, v2
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v3, 0x300, v3
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v27, 0x300, v27
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v28, 0x300, v28
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v30, 0x300, v30
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v31, 0x300, v31
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v32, 0x300, v32
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v33, 16, v27
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v34, 0xffff, v28
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v30, 16, v30
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v31, 0xffff, v31
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v32, 16, v32
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v27, v0, v1
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v28, v2, v3
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v29, v29, v33
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v30, v34, v30
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v31, v31, v32
-; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX11-TRUE16-NEXT:  .LBB15_3: ; %end
-; GFX11-TRUE16-NEXT:    s_clause 0x1f
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v111, off, s32 offset:320
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v110, off, s32 offset:324
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v109, off, s32 offset:328
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v108, off, s32 offset:332
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v107, off, s32 offset:336
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v106, off, s32 offset:340
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v105, off, s32 offset:344
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v104, off, s32 offset:348
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v95, off, s32 offset:352
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v94, off, s32 offset:356
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v93, off, s32 offset:360
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v92, off, s32 offset:364
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v91, off, s32 offset:368
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v90, off, s32 offset:372
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v89, off, s32 offset:376
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v88, off, s32 offset:380
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v79, off, s32 offset:384
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v78, off, s32 offset:388
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v77, off, s32 offset:392
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v76, off, s32 offset:396
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v75, off, s32 offset:400
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v74, off, s32 offset:404
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v73, off, s32 offset:408
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v72, off, s32 offset:412
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v63, off, s32 offset:416
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v62, off, s32 offset:420
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v61, off, s32 offset:424
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v60, off, s32 offset:428
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v59, off, s32 offset:432
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v58, off, s32 offset:436
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v57, off, s32 offset:440
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v56, off, s32 offset:444
-; GFX11-TRUE16-NEXT:    s_clause 0x7
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v47, off, s32 offset:448
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v46, off, s32 offset:452
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v45, off, s32 offset:456
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v44, off, s32 offset:460
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v43, off, s32 offset:464
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v42, off, s32 offset:468
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v41, off, s32 offset:472
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v40, off, s32 offset:476
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
-; GFX11-TRUE16-NEXT:  .LBB15_4:
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
-; GFX11-TRUE16-NEXT:    s_branch .LBB15_2
-;
-; GFX11-FAKE16-LABEL: bitcast_v128i8_to_v32i32_scalar:
-; GFX11-FAKE16:       ; %bb.0:
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-FAKE16-NEXT:    s_clause 0x1f
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v40, s32 offset:476
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v41, s32 offset:472
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v42, s32 offset:468
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v43, s32 offset:464
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v44, s32 offset:460
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v45, s32 offset:456
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v46, s32 offset:452
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v47, s32 offset:448
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v56, s32 offset:444
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v57, s32 offset:440
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v58, s32 offset:436
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v59, s32 offset:432
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v60, s32 offset:428
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v61, s32 offset:424
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v62, s32 offset:420
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v63, s32 offset:416
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v72, s32 offset:412
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v73, s32 offset:408
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v74, s32 offset:404
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v75, s32 offset:400
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v76, s32 offset:396
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v77, s32 offset:392
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v78, s32 offset:388
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v79, s32 offset:384
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v88, s32 offset:380
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v89, s32 offset:376
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v90, s32 offset:372
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v91, s32 offset:368
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v92, s32 offset:364
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v93, s32 offset:360
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v94, s32 offset:356
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v95, s32 offset:352
-; GFX11-FAKE16-NEXT:    s_clause 0x7
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v104, s32 offset:348
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v105, s32 offset:344
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v106, s32 offset:340
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v107, s32 offset:336
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v108, s32 offset:332
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v109, s32 offset:328
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v110, s32 offset:324
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v111, s32 offset:320
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v32, v30 :: v_dual_mov_b32 v33, v28
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v34, v26 :: v_dual_mov_b32 v35, v24
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v36, v22 :: v_dual_mov_b32 v37, v20
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v38, v18 :: v_dual_mov_b32 v39, v16
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v48, v14 :: v_dual_mov_b32 v49, v12
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v50, v10 :: v_dual_mov_b32 v51, v8
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v52, v6 :: v_dual_mov_b32 v53, v4
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v54, v2 :: v_dual_mov_b32 v55, v0
-; GFX11-FAKE16-NEXT:    s_clause 0x1f
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v0, off, s32 offset:316
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v2, off, s32
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v4, off, s32 offset:8
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v6, off, s32 offset:16
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v8, off, s32 offset:24
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v10, off, s32 offset:32
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v12, off, s32 offset:40
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v14, off, s32 offset:48
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v16, off, s32 offset:56
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v18, off, s32 offset:64
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v20, off, s32 offset:72
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v22, off, s32 offset:80
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v24, off, s32 offset:88
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v26, off, s32 offset:96
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v28, off, s32 offset:104
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v30, off, s32 offset:112
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v31, off, s32 offset:120
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v87, off, s32 offset:128
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v96, off, s32 offset:136
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v97, off, s32 offset:144
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v98, off, s32 offset:152
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v99, off, s32 offset:160
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v113, off, s32 offset:168
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v114, off, s32 offset:176
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v115, off, s32 offset:184
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v116, off, s32 offset:192
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v117, off, s32 offset:200
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v128, off, s32 offset:208
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v129, off, s32 offset:216
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v130, off, s32 offset:224
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v131, off, s32 offset:232
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v94, off, s32 offset:240
-; GFX11-FAKE16-NEXT:    s_clause 0x1f
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v95, off, s32 offset:248
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v104, off, s32 offset:256
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v105, off, s32 offset:264
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v106, off, s32 offset:272
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v107, off, s32 offset:280
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v108, off, s32 offset:288
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v109, off, s32 offset:296
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v110, off, s32 offset:304
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v111, off, s32 offset:312
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v64, off, s32 offset:308
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v65, off, s32 offset:300
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v66, off, s32 offset:292
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v67, off, s32 offset:284
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v68, off, s32 offset:276
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v69, off, s32 offset:268
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v70, off, s32 offset:260
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v71, off, s32 offset:252
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v80, off, s32 offset:244
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v81, off, s32 offset:236
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v82, off, s32 offset:228
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v83, off, s32 offset:220
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v84, off, s32 offset:212
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v85, off, s32 offset:204
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v86, off, s32 offset:196
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v100, off, s32 offset:188
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v101, off, s32 offset:180
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v102, off, s32 offset:172
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v103, off, s32 offset:164
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v112, off, s32 offset:156
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v118, off, s32 offset:148
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v145, off, s32 offset:140
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v146, off, s32 offset:132
-; GFX11-FAKE16-NEXT:    s_clause 0xf
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v147, off, s32 offset:124
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v148, off, s32 offset:116
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v162, off, s32 offset:108
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v163, off, s32 offset:100
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v164, off, s32 offset:92
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v165, off, s32 offset:84
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v166, off, s32 offset:76
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v180, off, s32 offset:68
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v181, off, s32 offset:60
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v182, off, s32 offset:52
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v183, off, s32 offset:44
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v45, off, s32 offset:36
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v46, off, s32 offset:28
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v47, off, s32 offset:20
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v56, off, s32 offset:12
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v62, off, s32 offset:4
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v89, 8, v1
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v90, 8, v3
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v91, 8, v5
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v92, 8, v7
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v93, 8, v9
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v76, 8, v11
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v77, 8, v13
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v78, 8, v15
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v79, 8, v17
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v88, 8, v19
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v63, 8, v21
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v72, 8, v23
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v73, 8, v25
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v74, 8, v27
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v75, 8, v29
-; GFX11-FAKE16-NEXT:    s_mov_b32 s4, 0
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(62)
-; GFX11-FAKE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v57, 8, v2
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v58, 8, v4
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v59, 8, v6
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v60, 8, v8
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v61, 8, v10
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v40, 8, v12
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v41, 8, v14
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v42, 8, v16
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v43, 8, v18
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v44, 8, v20
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v167, 8, v22
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v176, 8, v24
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v177, 8, v26
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v178, 8, v28
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v179, 8, v30
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v149, 8, v31
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v150, 8, v87
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(61)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v151, 8, v96
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(60)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v160, 8, v97
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(59)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v161, 8, v98
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(58)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v132, 8, v99
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(57)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v133, 8, v113
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(56)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v134, 8, v114
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(55)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v135, 8, v115
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(54)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v144, 8, v116
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(53)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v119, 8, v117
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(52)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v128, 8, v128
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(51)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v129, 8, v129
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(50)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v130, 8, v130
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(49)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v131, 8, v131
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(48)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v113, 8, v94
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(47)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v114, 8, v95
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(46)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v115, 8, v104
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(45)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v116, 8, v105
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(44)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v117, 8, v106
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(43)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v87, 8, v107
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(42)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v96, 8, v108
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(41)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v97, 8, v109
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(40)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v98, 8, v110
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(39)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v99, 8, v111
-; GFX11-FAKE16-NEXT:    s_and_b32 s5, vcc_lo, exec_lo
-; GFX11-FAKE16-NEXT:    s_cbranch_scc0 .LBB15_4
-; GFX11-FAKE16-NEXT:  ; %bb.1: ; %cmp.false
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v54
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v53
-; GFX11-FAKE16-NEXT:    s_and_b32 s5, s28, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s6, s29, 8
-; GFX11-FAKE16-NEXT:    s_and_b32 s7, s2, 0xff
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v90
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v91
-; GFX11-FAKE16-NEXT:    s_or_b32 s5, s5, s6
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s6, s1, 8
-; GFX11-FAKE16-NEXT:    s_and_b32 s5, s5, 0xffff
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s8, s3, 8
-; GFX11-FAKE16-NEXT:    s_and_b32 s9, s18, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s10, s19, 8
-; GFX11-FAKE16-NEXT:    s_and_b32 s11, s26, 0xff
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v0, v1
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v50
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v49
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s12, s27, 8
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v76
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v77
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v7, v0, v1
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v48
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v39
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v78
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v79
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v8, v0, v1
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v38
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v37
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v88
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v63
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v9, v0, v1
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v36
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v35
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v72
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v73
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v10, v0, v1
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v34
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v33
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v74
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v75
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v11, v0, v1
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v32
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v62
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v57
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v58
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v12, v0, v1
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v56
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v47
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v59
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v60
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v13, v0, v1
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v46
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v45
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v61
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v40
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v14, v0, v1
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v183
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v182
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v41
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v42
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v15, v0, v1
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v181
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v180
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v43
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v44
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v16, v0, v1
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v166
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v165
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v167
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v176
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v17, v0, v1
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v164
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v163
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v177
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v178
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v18, v0, v1
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v162
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v148
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v179
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v149
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v19, v0, v1
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v147
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v146
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v150
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v151
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v20, v0, v1
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v145
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v118
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v160
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v161
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v21, v0, v1
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v112
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v103
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v132
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v133
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v22, v0, v1
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v102
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v101
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v134
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v135
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v23, v0, v1
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v100
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v86
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v144
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v119
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v24, v0, v1
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v85
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v84
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v128
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v129
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v25, v0, v1
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v83
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v82
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v130
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v131
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v26, v0, v1
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v81
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v80
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v113
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v114
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v27, v0, v1
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v71
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v70
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v115
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v116
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v28, v0, v1
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v69
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v68
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v117
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v87
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v29, v0, v1
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v67
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v66
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v96
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v97
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v30, v0, v1
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v65
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v64
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v98
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v99
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v31, v0, v1
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v55
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v89
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, s5, v0
-; GFX11-FAKE16-NEXT:    s_and_b32 s5, s0, 0xff
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-FAKE16-NEXT:    s_or_b32 s5, s5, s6
-; GFX11-FAKE16-NEXT:    s_or_b32 s6, s7, s8
-; GFX11-FAKE16-NEXT:    s_and_b32 s7, s16, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s8, s17, 8
-; GFX11-FAKE16-NEXT:    s_and_b32 s5, s5, 0xffff
-; GFX11-FAKE16-NEXT:    s_or_b32 s7, s7, s8
-; GFX11-FAKE16-NEXT:    s_or_b32 s8, s9, s10
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s6, s6, 16
-; GFX11-FAKE16-NEXT:    s_and_b32 s7, s7, 0xffff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s8, s8, 16
-; GFX11-FAKE16-NEXT:    s_or_b32 s5, s5, s6
-; GFX11-FAKE16-NEXT:    s_or_b32 s6, s7, s8
-; GFX11-FAKE16-NEXT:    s_and_b32 s7, s20, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s8, s21, 8
-; GFX11-FAKE16-NEXT:    s_and_b32 s9, s22, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s10, s23, 8
-; GFX11-FAKE16-NEXT:    s_or_b32 s7, s7, s8
-; GFX11-FAKE16-NEXT:    s_or_b32 s8, s9, s10
-; GFX11-FAKE16-NEXT:    s_and_b32 s9, s24, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s10, s25, 8
-; GFX11-FAKE16-NEXT:    s_and_b32 s7, s7, 0xffff
-; GFX11-FAKE16-NEXT:    s_or_b32 s9, s9, s10
-; GFX11-FAKE16-NEXT:    s_or_b32 s10, s11, s12
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s8, s8, 16
-; GFX11-FAKE16-NEXT:    s_and_b32 s9, s9, 0xffff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s10, s10, 16
-; GFX11-FAKE16-NEXT:    s_or_b32 s7, s7, s8
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v51
-; GFX11-FAKE16-NEXT:    s_or_b32 s8, s9, s10
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v52
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, s6
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v3, v93
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v2, v92
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v6, v2, v3
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v2, s7 :: v_dual_mov_b32 v3, s8
-; GFX11-FAKE16-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s4
-; GFX11-FAKE16-NEXT:    s_cbranch_vccnz .LBB15_3
-; GFX11-FAKE16-NEXT:  .LBB15_2: ; %cmp.true
-; GFX11-FAKE16-NEXT:    s_add_i32 s0, s0, 3
-; GFX11-FAKE16-NEXT:    s_add_i32 s2, s2, 3
-; GFX11-FAKE16-NEXT:    s_and_b32 s0, s0, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s1, s1, 8
-; GFX11-FAKE16-NEXT:    s_and_b32 s2, s2, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s3, s3, 8
-; GFX11-FAKE16-NEXT:    s_or_b32 s0, s1, s0
-; GFX11-FAKE16-NEXT:    s_or_b32 s1, s3, s2
-; GFX11-FAKE16-NEXT:    s_addk_i32 s0, 0x300
-; GFX11-FAKE16-NEXT:    s_addk_i32 s1, 0x300
-; GFX11-FAKE16-NEXT:    s_and_b32 s0, s0, 0xffff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s1, s1, 16
-; GFX11-FAKE16-NEXT:    s_add_i32 s16, s16, 3
-; GFX11-FAKE16-NEXT:    s_or_b32 s0, s0, s1
-; GFX11-FAKE16-NEXT:    s_and_b32 s1, s16, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s2, s17, 8
-; GFX11-FAKE16-NEXT:    s_add_i32 s18, s18, 3
-; GFX11-FAKE16-NEXT:    s_or_b32 s1, s2, s1
-; GFX11-FAKE16-NEXT:    s_and_b32 s2, s18, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s3, s19, 8
-; GFX11-FAKE16-NEXT:    s_addk_i32 s1, 0x300
-; GFX11-FAKE16-NEXT:    s_or_b32 s2, s3, s2
-; GFX11-FAKE16-NEXT:    s_add_i32 s20, s20, 3
-; GFX11-FAKE16-NEXT:    s_addk_i32 s2, 0x300
-; GFX11-FAKE16-NEXT:    s_and_b32 s1, s1, 0xffff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s2, s2, 16
-; GFX11-FAKE16-NEXT:    s_and_b32 s3, s20, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s4, s21, 8
-; GFX11-FAKE16-NEXT:    s_add_i32 s22, s22, 3
-; GFX11-FAKE16-NEXT:    s_or_b32 s1, s1, s2
-; GFX11-FAKE16-NEXT:    s_or_b32 s2, s4, s3
-; GFX11-FAKE16-NEXT:    s_and_b32 s3, s22, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s4, s23, 8
-; GFX11-FAKE16-NEXT:    s_add_i32 s24, s24, 3
-; GFX11-FAKE16-NEXT:    s_or_b32 s3, s4, s3
-; GFX11-FAKE16-NEXT:    s_and_b32 s4, s24, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s5, s25, 8
-; GFX11-FAKE16-NEXT:    s_addk_i32 s2, 0x300
-; GFX11-FAKE16-NEXT:    s_addk_i32 s3, 0x300
-; GFX11-FAKE16-NEXT:    s_or_b32 s4, s5, s4
-; GFX11-FAKE16-NEXT:    s_and_b32 s2, s2, 0xffff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s3, s3, 16
-; GFX11-FAKE16-NEXT:    s_addk_i32 s4, 0x300
-; GFX11-FAKE16-NEXT:    s_add_i32 s26, s26, 3
-; GFX11-FAKE16-NEXT:    s_or_b32 s2, s2, s3
-; GFX11-FAKE16-NEXT:    s_and_b32 s3, s4, 0xffff
-; GFX11-FAKE16-NEXT:    s_and_b32 s4, s26, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s5, s27, 8
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v0, 3, v55
-; GFX11-FAKE16-NEXT:    s_or_b32 s4, s5, s4
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v1, 3, v54
-; GFX11-FAKE16-NEXT:    s_addk_i32 s4, 0x300
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v3, 3, v52
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s4, s4, 16
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v4, 3, v51
-; GFX11-FAKE16-NEXT:    s_or_b32 s3, s3, s4
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v2, 3, v53
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v7, 3, v38
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v11, 3, v34
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v12, 3, v33
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v89, v0
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v90, v1
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v91, v2
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v92, v3
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, v93, v4
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xff, v7
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v11, 0xff, v11
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v12, 0xff, v12
-; GFX11-FAKE16-NEXT:    s_add_i32 s28, s28, 3
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v0, 0x300, v0
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v1, 0x300, v1
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v2, 0x300, v2
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v3, 0x300, v3
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v4, 0x300, v4
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v7, v88, v7
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v11, v74, v11
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v12, v75, v12
-; GFX11-FAKE16-NEXT:    s_and_b32 s5, s28, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s6, s29, 8
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX11-FAKE16-NEXT:    s_or_b32 s5, s6, s5
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xffff, v3
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v4
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v7, 0x300, v7
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v11, 0x300, v11
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v12, 0x300, v12
-; GFX11-FAKE16-NEXT:    s_addk_i32 s5, 0x300
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v1, v2
-; GFX11-FAKE16-NEXT:    s_and_b32 s5, s5, 0xffff
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v6, v3, v6
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, s5, v0
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v0, 3, v50
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v1, 3, v49
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v2, 3, v48
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v3, 3, v39
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v9, 0xffff, v7
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v7, 3, v37
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v8, 3, v36
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v10, 3, v35
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v11, 0xffff, v11
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xff, v7
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 0xff, v8
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v10, 0xff, v10
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v11, v11, v12
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(3)
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v12, 3, v46
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v16, 3, v181
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v17, 3, v180
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v76, v0
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v77, v1
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v78, v2
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v79, v3
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v7, v63, v7
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v8, v72, v8
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v10, v73, v10
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v12, 0xff, v12
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v16, 0xff, v16
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v17, 0xff, v17
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v0, 0x300, v0
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v1, 0x300, v1
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v2, 0x300, v2
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v3, 0x300, v3
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v7, 0x300, v7
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v8, 0x300, v8
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v10, 0x300, v10
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v12, v61, v12
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v16, v43, v16
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v17, v44, v17
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v7
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v14, 0xffff, v8
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v12, 0x300, v12
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v16, 0x300, v16
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v17, 0x300, v17
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v7, v0, v1
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v8, v2, v3
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v9, v9, v13
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v10, v14, v10
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v0, 3, v32
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v1, 3, v62
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v2, 3, v56
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v3, 3, v47
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v14, 0xffff, v12
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v12, 3, v45
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v13, 3, v183
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v15, 3, v182
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v16, 0xffff, v16
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v17, 16, v17
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v12, 0xff, v12
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v13, 0xff, v13
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v15, 0xff, v15
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v16, v16, v17
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v17, 3, v162
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v21, 3, v145
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v22, 3, v118
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v57, v0
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v58, v1
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v59, v2
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v60, v3
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v12, v40, v12
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v13, v41, v13
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v15, v42, v15
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v17, 0xff, v17
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v21, 0xff, v21
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v22, 0xff, v22
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v0, 0x300, v0
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v1, 0x300, v1
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v2, 0x300, v2
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v3, 0x300, v3
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v12, 0x300, v12
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v13, 0x300, v13
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v15, 0x300, v15
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v17, v179, v17
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v21, v160, v21
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v22, v161, v22
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v18, 16, v12
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v19, 0xffff, v13
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v17, 0x300, v17
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v21, 0x300, v21
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v22, 0x300, v22
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v12, v0, v1
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v13, v2, v3
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v14, v14, v18
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v15, v19, v15
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v0, 3, v166
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v1, 3, v165
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v2, 3, v164
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v3, 3, v163
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v19, 0xffff, v17
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v17, 3, v148
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v18, 3, v147
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v20, 3, v146
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v21, 0xffff, v21
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v22
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v17, 0xff, v17
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v18, 0xff, v18
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v20, 0xff, v20
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v21, v21, v22
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v22, 3, v100
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v26, 3, v83
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v27, 3, v82
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v167, v0
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v176, v1
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v177, v2
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v178, v3
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v17, v149, v17
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v18, v150, v18
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v20, v151, v20
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v22, 0xff, v22
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v26, 0xff, v26
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v27, 0xff, v27
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v0, 0x300, v0
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v1, 0x300, v1
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v2, 0x300, v2
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v3, 0x300, v3
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v17, 0x300, v17
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v18, 0x300, v18
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v20, 0x300, v20
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v22, v144, v22
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v26, v130, v26
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v27, v131, v27
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v17
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v24, 0xffff, v18
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v20, 16, v20
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v22, 0x300, v22
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v26, 0x300, v26
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v27, 0x300, v27
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v17, v0, v1
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v18, v2, v3
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v19, v19, v23
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v20, v24, v20
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v0, 3, v112
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v1, 3, v103
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v2, 3, v102
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v3, 3, v101
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v24, 0xffff, v22
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v22, 3, v86
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v23, 3, v85
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v25, 3, v84
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v26, 0xffff, v26
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v27, 16, v27
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v22, 0xff, v22
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v23, 0xff, v23
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v25, 0xff, v25
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v26, v26, v27
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v27, 3, v69
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v132, v0
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v133, v1
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v134, v2
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v135, v3
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v22, v119, v22
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v23, v128, v23
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v25, v129, v25
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v27, 0xff, v27
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v0, 0x300, v0
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v1, 0x300, v1
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v2, 0x300, v2
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v3, 0x300, v3
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v22, 0x300, v22
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v23, 0x300, v23
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v25, 0x300, v25
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v27, v117, v27
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v28, 16, v22
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v29, 0xffff, v23
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v25
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v27, 0x300, v27
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v22, v0, v1
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v23, v2, v3
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v24, v24, v28
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v25, v29, v25
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v0, 3, v81
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v1, 3, v80
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v2, 3, v71
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v3, 3, v70
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v29, 0xffff, v27
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v27, 3, v68
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v28, 3, v67
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v30, 3, v66
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v31, 3, v65
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v32, 3, v64
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v27, 0xff, v27
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v28, 0xff, v28
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v30, 0xff, v30
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v31, 0xff, v31
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v32, 0xff, v32
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v113, v0
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v114, v1
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v115, v2
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v116, v3
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v27, v87, v27
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v28, v96, v28
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v30, v97, v30
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v31, v98, v31
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v32, v99, v32
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v0, 0x300, v0
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v1, 0x300, v1
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v2, 0x300, v2
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v3, 0x300, v3
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v27, 0x300, v27
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v28, 0x300, v28
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v30, 0x300, v30
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v31, 0x300, v31
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v32, 0x300, v32
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v33, 16, v27
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v34, 0xffff, v28
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v30, 16, v30
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v31, 0xffff, v31
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v32, 16, v32
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v27, v0, v1
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v28, v2, v3
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v29, v29, v33
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v30, v34, v30
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v31, v31, v32
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX11-FAKE16-NEXT:  .LBB15_3: ; %end
-; GFX11-FAKE16-NEXT:    s_clause 0x1f
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v111, off, s32 offset:320
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v110, off, s32 offset:324
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v109, off, s32 offset:328
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v108, off, s32 offset:332
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v107, off, s32 offset:336
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v106, off, s32 offset:340
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v105, off, s32 offset:344
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v104, off, s32 offset:348
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v95, off, s32 offset:352
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v94, off, s32 offset:356
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v93, off, s32 offset:360
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v92, off, s32 offset:364
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v91, off, s32 offset:368
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v90, off, s32 offset:372
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v89, off, s32 offset:376
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v88, off, s32 offset:380
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v79, off, s32 offset:384
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v78, off, s32 offset:388
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v77, off, s32 offset:392
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v76, off, s32 offset:396
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v75, off, s32 offset:400
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v74, off, s32 offset:404
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v73, off, s32 offset:408
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v72, off, s32 offset:412
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v63, off, s32 offset:416
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v62, off, s32 offset:420
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v61, off, s32 offset:424
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v60, off, s32 offset:428
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v59, off, s32 offset:432
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v58, off, s32 offset:436
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v57, off, s32 offset:440
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v56, off, s32 offset:444
-; GFX11-FAKE16-NEXT:    s_clause 0x7
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v47, off, s32 offset:448
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v46, off, s32 offset:452
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v45, off, s32 offset:456
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v44, off, s32 offset:460
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v43, off, s32 offset:464
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v42, off, s32 offset:468
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v41, off, s32 offset:472
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v40, off, s32 offset:476
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
-; GFX11-FAKE16-NEXT:  .LBB15_4:
-; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
-; GFX11-FAKE16-NEXT:    s_branch .LBB15_2
+; GFX11-LABEL: bitcast_v128i8_to_v32i32_scalar:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_clause 0x1f
+; GFX11-NEXT:    scratch_store_b32 off, v40, s32 offset:476
+; GFX11-NEXT:    scratch_store_b32 off, v41, s32 offset:472
+; GFX11-NEXT:    scratch_store_b32 off, v42, s32 offset:468
+; GFX11-NEXT:    scratch_store_b32 off, v43, s32 offset:464
+; GFX11-NEXT:    scratch_store_b32 off, v44, s32 offset:460
+; GFX11-NEXT:    scratch_store_b32 off, v45, s32 offset:456
+; GFX11-NEXT:    scratch_store_b32 off, v46, s32 offset:452
+; GFX11-NEXT:    scratch_store_b32 off, v47, s32 offset:448
+; GFX11-NEXT:    scratch_store_b32 off, v56, s32 offset:444
+; GFX11-NEXT:    scratch_store_b32 off, v57, s32 offset:440
+; GFX11-NEXT:    scratch_store_b32 off, v58, s32 offset:436
+; GFX11-NEXT:    scratch_store_b32 off, v59, s32 offset:432
+; GFX11-NEXT:    scratch_store_b32 off, v60, s32 offset:428
+; GFX11-NEXT:    scratch_store_b32 off, v61, s32 offset:424
+; GFX11-NEXT:    scratch_store_b32 off, v62, s32 offset:420
+; GFX11-NEXT:    scratch_store_b32 off, v63, s32 offset:416
+; GFX11-NEXT:    scratch_store_b32 off, v72, s32 offset:412
+; GFX11-NEXT:    scratch_store_b32 off, v73, s32 offset:408
+; GFX11-NEXT:    scratch_store_b32 off, v74, s32 offset:404
+; GFX11-NEXT:    scratch_store_b32 off, v75, s32 offset:400
+; GFX11-NEXT:    scratch_store_b32 off, v76, s32 offset:396
+; GFX11-NEXT:    scratch_store_b32 off, v77, s32 offset:392
+; GFX11-NEXT:    scratch_store_b32 off, v78, s32 offset:388
+; GFX11-NEXT:    scratch_store_b32 off, v79, s32 offset:384
+; GFX11-NEXT:    scratch_store_b32 off, v88, s32 offset:380
+; GFX11-NEXT:    scratch_store_b32 off, v89, s32 offset:376
+; GFX11-NEXT:    scratch_store_b32 off, v90, s32 offset:372
+; GFX11-NEXT:    scratch_store_b32 off, v91, s32 offset:368
+; GFX11-NEXT:    scratch_store_b32 off, v92, s32 offset:364
+; GFX11-NEXT:    scratch_store_b32 off, v93, s32 offset:360
+; GFX11-NEXT:    scratch_store_b32 off, v94, s32 offset:356
+; GFX11-NEXT:    scratch_store_b32 off, v95, s32 offset:352
+; GFX11-NEXT:    s_clause 0x7
+; GFX11-NEXT:    scratch_store_b32 off, v104, s32 offset:348
+; GFX11-NEXT:    scratch_store_b32 off, v105, s32 offset:344
+; GFX11-NEXT:    scratch_store_b32 off, v106, s32 offset:340
+; GFX11-NEXT:    scratch_store_b32 off, v107, s32 offset:336
+; GFX11-NEXT:    scratch_store_b32 off, v108, s32 offset:332
+; GFX11-NEXT:    scratch_store_b32 off, v109, s32 offset:328
+; GFX11-NEXT:    scratch_store_b32 off, v110, s32 offset:324
+; GFX11-NEXT:    scratch_store_b32 off, v111, s32 offset:320
+; GFX11-NEXT:    v_dual_mov_b32 v32, v30 :: v_dual_mov_b32 v33, v28
+; GFX11-NEXT:    v_dual_mov_b32 v34, v26 :: v_dual_mov_b32 v35, v24
+; GFX11-NEXT:    v_dual_mov_b32 v36, v22 :: v_dual_mov_b32 v37, v20
+; GFX11-NEXT:    v_dual_mov_b32 v38, v18 :: v_dual_mov_b32 v39, v16
+; GFX11-NEXT:    v_dual_mov_b32 v48, v14 :: v_dual_mov_b32 v49, v12
+; GFX11-NEXT:    v_dual_mov_b32 v50, v10 :: v_dual_mov_b32 v51, v8
+; GFX11-NEXT:    v_dual_mov_b32 v52, v6 :: v_dual_mov_b32 v53, v4
+; GFX11-NEXT:    v_dual_mov_b32 v54, v2 :: v_dual_mov_b32 v55, v0
+; GFX11-NEXT:    s_clause 0x1f
+; GFX11-NEXT:    scratch_load_b32 v0, off, s32 offset:316
+; GFX11-NEXT:    scratch_load_u16 v2, off, s32
+; GFX11-NEXT:    scratch_load_u16 v4, off, s32 offset:8
+; GFX11-NEXT:    scratch_load_u16 v6, off, s32 offset:16
+; GFX11-NEXT:    scratch_load_u16 v8, off, s32 offset:24
+; GFX11-NEXT:    scratch_load_u16 v10, off, s32 offset:32
+; GFX11-NEXT:    scratch_load_u16 v12, off, s32 offset:40
+; GFX11-NEXT:    scratch_load_u16 v14, off, s32 offset:48
+; GFX11-NEXT:    scratch_load_u16 v16, off, s32 offset:56
+; GFX11-NEXT:    scratch_load_u16 v18, off, s32 offset:64
+; GFX11-NEXT:    scratch_load_u16 v20, off, s32 offset:72
+; GFX11-NEXT:    scratch_load_u16 v22, off, s32 offset:80
+; GFX11-NEXT:    scratch_load_u16 v24, off, s32 offset:88
+; GFX11-NEXT:    scratch_load_u16 v26, off, s32 offset:96
+; GFX11-NEXT:    scratch_load_u16 v28, off, s32 offset:104
+; GFX11-NEXT:    scratch_load_u16 v30, off, s32 offset:112
+; GFX11-NEXT:    scratch_load_u16 v31, off, s32 offset:120
+; GFX11-NEXT:    scratch_load_u16 v87, off, s32 offset:128
+; GFX11-NEXT:    scratch_load_u16 v96, off, s32 offset:136
+; GFX11-NEXT:    scratch_load_u16 v97, off, s32 offset:144
+; GFX11-NEXT:    scratch_load_u16 v98, off, s32 offset:152
+; GFX11-NEXT:    scratch_load_u16 v99, off, s32 offset:160
+; GFX11-NEXT:    scratch_load_u16 v113, off, s32 offset:168
+; GFX11-NEXT:    scratch_load_u16 v114, off, s32 offset:176
+; GFX11-NEXT:    scratch_load_u16 v115, off, s32 offset:184
+; GFX11-NEXT:    scratch_load_u16 v116, off, s32 offset:192
+; GFX11-NEXT:    scratch_load_u16 v117, off, s32 offset:200
+; GFX11-NEXT:    scratch_load_u16 v128, off, s32 offset:208
+; GFX11-NEXT:    scratch_load_u16 v129, off, s32 offset:216
+; GFX11-NEXT:    scratch_load_u16 v130, off, s32 offset:224
+; GFX11-NEXT:    scratch_load_u16 v131, off, s32 offset:232
+; GFX11-NEXT:    scratch_load_u16 v94, off, s32 offset:240
+; GFX11-NEXT:    s_clause 0x1f
+; GFX11-NEXT:    scratch_load_u16 v95, off, s32 offset:248
+; GFX11-NEXT:    scratch_load_u16 v104, off, s32 offset:256
+; GFX11-NEXT:    scratch_load_u16 v105, off, s32 offset:264
+; GFX11-NEXT:    scratch_load_u16 v106, off, s32 offset:272
+; GFX11-NEXT:    scratch_load_u16 v107, off, s32 offset:280
+; GFX11-NEXT:    scratch_load_u16 v108, off, s32 offset:288
+; GFX11-NEXT:    scratch_load_u16 v109, off, s32 offset:296
+; GFX11-NEXT:    scratch_load_u16 v110, off, s32 offset:304
+; GFX11-NEXT:    scratch_load_u16 v111, off, s32 offset:312
+; GFX11-NEXT:    scratch_load_u16 v64, off, s32 offset:308
+; GFX11-NEXT:    scratch_load_u16 v65, off, s32 offset:300
+; GFX11-NEXT:    scratch_load_u16 v66, off, s32 offset:292
+; GFX11-NEXT:    scratch_load_u16 v67, off, s32 offset:284
+; GFX11-NEXT:    scratch_load_u16 v68, off, s32 offset:276
+; GFX11-NEXT:    scratch_load_u16 v69, off, s32 offset:268
+; GFX11-NEXT:    scratch_load_u16 v70, off, s32 offset:260
+; GFX11-NEXT:    scratch_load_u16 v71, off, s32 offset:252
+; GFX11-NEXT:    scratch_load_u16 v80, off, s32 offset:244
+; GFX11-NEXT:    scratch_load_u16 v81, off, s32 offset:236
+; GFX11-NEXT:    scratch_load_u16 v82, off, s32 offset:228
+; GFX11-NEXT:    scratch_load_u16 v83, off, s32 offset:220
+; GFX11-NEXT:    scratch_load_u16 v84, off, s32 offset:212
+; GFX11-NEXT:    scratch_load_u16 v85, off, s32 offset:204
+; GFX11-NEXT:    scratch_load_u16 v86, off, s32 offset:196
+; GFX11-NEXT:    scratch_load_u16 v100, off, s32 offset:188
+; GFX11-NEXT:    scratch_load_u16 v101, off, s32 offset:180
+; GFX11-NEXT:    scratch_load_u16 v102, off, s32 offset:172
+; GFX11-NEXT:    scratch_load_u16 v103, off, s32 offset:164
+; GFX11-NEXT:    scratch_load_u16 v112, off, s32 offset:156
+; GFX11-NEXT:    scratch_load_u16 v118, off, s32 offset:148
+; GFX11-NEXT:    scratch_load_u16 v145, off, s32 offset:140
+; GFX11-NEXT:    scratch_load_u16 v146, off, s32 offset:132
+; GFX11-NEXT:    s_clause 0xf
+; GFX11-NEXT:    scratch_load_u16 v147, off, s32 offset:124
+; GFX11-NEXT:    scratch_load_u16 v148, off, s32 offset:116
+; GFX11-NEXT:    scratch_load_u16 v162, off, s32 offset:108
+; GFX11-NEXT:    scratch_load_u16 v163, off, s32 offset:100
+; GFX11-NEXT:    scratch_load_u16 v164, off, s32 offset:92
+; GFX11-NEXT:    scratch_load_u16 v165, off, s32 offset:84
+; GFX11-NEXT:    scratch_load_u16 v166, off, s32 offset:76
+; GFX11-NEXT:    scratch_load_u16 v180, off, s32 offset:68
+; GFX11-NEXT:    scratch_load_u16 v181, off, s32 offset:60
+; GFX11-NEXT:    scratch_load_u16 v182, off, s32 offset:52
+; GFX11-NEXT:    scratch_load_u16 v183, off, s32 offset:44
+; GFX11-NEXT:    scratch_load_u16 v45, off, s32 offset:36
+; GFX11-NEXT:    scratch_load_u16 v46, off, s32 offset:28
+; GFX11-NEXT:    scratch_load_u16 v47, off, s32 offset:20
+; GFX11-NEXT:    scratch_load_u16 v56, off, s32 offset:12
+; GFX11-NEXT:    scratch_load_u16 v62, off, s32 offset:4
+; GFX11-NEXT:    v_lshlrev_b32_e32 v89, 8, v1
+; GFX11-NEXT:    v_lshlrev_b32_e32 v90, 8, v3
+; GFX11-NEXT:    v_lshlrev_b32_e32 v91, 8, v5
+; GFX11-NEXT:    v_lshlrev_b32_e32 v92, 8, v7
+; GFX11-NEXT:    v_lshlrev_b32_e32 v93, 8, v9
+; GFX11-NEXT:    v_lshlrev_b32_e32 v76, 8, v11
+; GFX11-NEXT:    v_lshlrev_b32_e32 v77, 8, v13
+; GFX11-NEXT:    v_lshlrev_b32_e32 v78, 8, v15
+; GFX11-NEXT:    v_lshlrev_b32_e32 v79, 8, v17
+; GFX11-NEXT:    v_lshlrev_b32_e32 v88, 8, v19
+; GFX11-NEXT:    v_lshlrev_b32_e32 v63, 8, v21
+; GFX11-NEXT:    v_lshlrev_b32_e32 v72, 8, v23
+; GFX11-NEXT:    v_lshlrev_b32_e32 v73, 8, v25
+; GFX11-NEXT:    v_lshlrev_b32_e32 v74, 8, v27
+; GFX11-NEXT:    v_lshlrev_b32_e32 v75, 8, v29
+; GFX11-NEXT:    s_mov_b32 s4, 0
+; GFX11-NEXT:    s_waitcnt vmcnt(62)
+; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v57, 8, v2
+; GFX11-NEXT:    v_lshlrev_b32_e32 v58, 8, v4
+; GFX11-NEXT:    v_lshlrev_b32_e32 v59, 8, v6
+; GFX11-NEXT:    v_lshlrev_b32_e32 v60, 8, v8
+; GFX11-NEXT:    v_lshlrev_b32_e32 v61, 8, v10
+; GFX11-NEXT:    v_lshlrev_b32_e32 v40, 8, v12
+; GFX11-NEXT:    v_lshlrev_b32_e32 v41, 8, v14
+; GFX11-NEXT:    v_lshlrev_b32_e32 v42, 8, v16
+; GFX11-NEXT:    v_lshlrev_b32_e32 v43, 8, v18
+; GFX11-NEXT:    v_lshlrev_b32_e32 v44, 8, v20
+; GFX11-NEXT:    v_lshlrev_b32_e32 v167, 8, v22
+; GFX11-NEXT:    v_lshlrev_b32_e32 v176, 8, v24
+; GFX11-NEXT:    v_lshlrev_b32_e32 v177, 8, v26
+; GFX11-NEXT:    v_lshlrev_b32_e32 v178, 8, v28
+; GFX11-NEXT:    v_lshlrev_b32_e32 v179, 8, v30
+; GFX11-NEXT:    v_lshlrev_b32_e32 v149, 8, v31
+; GFX11-NEXT:    v_lshlrev_b32_e32 v150, 8, v87
+; GFX11-NEXT:    s_waitcnt vmcnt(61)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v151, 8, v96
+; GFX11-NEXT:    s_waitcnt vmcnt(60)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v160, 8, v97
+; GFX11-NEXT:    s_waitcnt vmcnt(59)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v161, 8, v98
+; GFX11-NEXT:    s_waitcnt vmcnt(58)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v132, 8, v99
+; GFX11-NEXT:    s_waitcnt vmcnt(57)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v133, 8, v113
+; GFX11-NEXT:    s_waitcnt vmcnt(56)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v134, 8, v114
+; GFX11-NEXT:    s_waitcnt vmcnt(55)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v135, 8, v115
+; GFX11-NEXT:    s_waitcnt vmcnt(54)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v144, 8, v116
+; GFX11-NEXT:    s_waitcnt vmcnt(53)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v119, 8, v117
+; GFX11-NEXT:    s_waitcnt vmcnt(52)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v128, 8, v128
+; GFX11-NEXT:    s_waitcnt vmcnt(51)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v129, 8, v129
+; GFX11-NEXT:    s_waitcnt vmcnt(50)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v130, 8, v130
+; GFX11-NEXT:    s_waitcnt vmcnt(49)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v131, 8, v131
+; GFX11-NEXT:    s_waitcnt vmcnt(48)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v113, 8, v94
+; GFX11-NEXT:    s_waitcnt vmcnt(47)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v114, 8, v95
+; GFX11-NEXT:    s_waitcnt vmcnt(46)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v115, 8, v104
+; GFX11-NEXT:    s_waitcnt vmcnt(45)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v116, 8, v105
+; GFX11-NEXT:    s_waitcnt vmcnt(44)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v117, 8, v106
+; GFX11-NEXT:    s_waitcnt vmcnt(43)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v87, 8, v107
+; GFX11-NEXT:    s_waitcnt vmcnt(42)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v96, 8, v108
+; GFX11-NEXT:    s_waitcnt vmcnt(41)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v97, 8, v109
+; GFX11-NEXT:    s_waitcnt vmcnt(40)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v98, 8, v110
+; GFX11-NEXT:    s_waitcnt vmcnt(39)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v99, 8, v111
+; GFX11-NEXT:    s_and_b32 s5, vcc_lo, exec_lo
+; GFX11-NEXT:    s_cbranch_scc0 .LBB15_4
+; GFX11-NEXT:  ; %bb.1: ; %cmp.false
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v54
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v53
+; GFX11-NEXT:    s_and_b32 s5, s28, 0xff
+; GFX11-NEXT:    s_lshl_b32 s6, s29, 8
+; GFX11-NEXT:    s_and_b32 s7, s2, 0xff
+; GFX11-NEXT:    v_or_b32_e32 v0, v0, v90
+; GFX11-NEXT:    v_or_b32_e32 v1, v1, v91
+; GFX11-NEXT:    s_or_b32 s5, s5, s6
+; GFX11-NEXT:    s_lshl_b32 s6, s1, 8
+; GFX11-NEXT:    s_and_b32 s5, s5, 0xffff
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    s_lshl_b32 s8, s3, 8
+; GFX11-NEXT:    s_and_b32 s9, s18, 0xff
+; GFX11-NEXT:    s_lshl_b32 s10, s19, 8
+; GFX11-NEXT:    s_and_b32 s11, s26, 0xff
+; GFX11-NEXT:    v_or_b32_e32 v5, v0, v1
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v50
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v49
+; GFX11-NEXT:    s_lshl_b32 s12, s27, 8
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_or_b32_e32 v0, v0, v76
+; GFX11-NEXT:    v_or_b32_e32 v1, v1, v77
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_or_b32_e32 v7, v0, v1
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v48
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v39
+; GFX11-NEXT:    v_or_b32_e32 v0, v0, v78
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_or_b32_e32 v1, v1, v79
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    v_or_b32_e32 v8, v0, v1
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v38
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v37
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_or_b32_e32 v0, v0, v88
+; GFX11-NEXT:    v_or_b32_e32 v1, v1, v63
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_or_b32_e32 v9, v0, v1
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v36
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v35
+; GFX11-NEXT:    v_or_b32_e32 v0, v0, v72
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_or_b32_e32 v1, v1, v73
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    v_or_b32_e32 v10, v0, v1
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v34
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v33
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_or_b32_e32 v0, v0, v74
+; GFX11-NEXT:    v_or_b32_e32 v1, v1, v75
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_or_b32_e32 v11, v0, v1
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v32
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v62
+; GFX11-NEXT:    v_or_b32_e32 v0, v0, v57
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_or_b32_e32 v1, v1, v58
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    v_or_b32_e32 v12, v0, v1
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v56
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v47
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_or_b32_e32 v0, v0, v59
+; GFX11-NEXT:    v_or_b32_e32 v1, v1, v60
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_or_b32_e32 v13, v0, v1
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v46
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v45
+; GFX11-NEXT:    v_or_b32_e32 v0, v0, v61
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_or_b32_e32 v1, v1, v40
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    v_or_b32_e32 v14, v0, v1
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v183
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v182
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_or_b32_e32 v0, v0, v41
+; GFX11-NEXT:    v_or_b32_e32 v1, v1, v42
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_or_b32_e32 v15, v0, v1
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v181
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v180
+; GFX11-NEXT:    v_or_b32_e32 v0, v0, v43
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_or_b32_e32 v1, v1, v44
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    v_or_b32_e32 v16, v0, v1
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v166
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v165
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_or_b32_e32 v0, v0, v167
+; GFX11-NEXT:    v_or_b32_e32 v1, v1, v176
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_or_b32_e32 v17, v0, v1
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v164
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v163
+; GFX11-NEXT:    v_or_b32_e32 v0, v0, v177
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_or_b32_e32 v1, v1, v178
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    v_or_b32_e32 v18, v0, v1
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v162
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v148
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_or_b32_e32 v0, v0, v179
+; GFX11-NEXT:    v_or_b32_e32 v1, v1, v149
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_or_b32_e32 v19, v0, v1
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v147
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v146
+; GFX11-NEXT:    v_or_b32_e32 v0, v0, v150
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_or_b32_e32 v1, v1, v151
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    v_or_b32_e32 v20, v0, v1
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v145
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v118
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_or_b32_e32 v0, v0, v160
+; GFX11-NEXT:    v_or_b32_e32 v1, v1, v161
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_or_b32_e32 v21, v0, v1
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v112
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v103
+; GFX11-NEXT:    v_or_b32_e32 v0, v0, v132
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_or_b32_e32 v1, v1, v133
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    v_or_b32_e32 v22, v0, v1
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v102
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v101
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_or_b32_e32 v0, v0, v134
+; GFX11-NEXT:    v_or_b32_e32 v1, v1, v135
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_or_b32_e32 v23, v0, v1
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v100
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v86
+; GFX11-NEXT:    v_or_b32_e32 v0, v0, v144
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_or_b32_e32 v1, v1, v119
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    v_or_b32_e32 v24, v0, v1
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v85
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v84
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_or_b32_e32 v0, v0, v128
+; GFX11-NEXT:    v_or_b32_e32 v1, v1, v129
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_or_b32_e32 v25, v0, v1
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v83
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v82
+; GFX11-NEXT:    v_or_b32_e32 v0, v0, v130
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_or_b32_e32 v1, v1, v131
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    v_or_b32_e32 v26, v0, v1
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v81
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v80
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_or_b32_e32 v0, v0, v113
+; GFX11-NEXT:    v_or_b32_e32 v1, v1, v114
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_or_b32_e32 v27, v0, v1
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v71
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v70
+; GFX11-NEXT:    v_or_b32_e32 v0, v0, v115
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_or_b32_e32 v1, v1, v116
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    v_or_b32_e32 v28, v0, v1
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v69
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v68
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_or_b32_e32 v0, v0, v117
+; GFX11-NEXT:    v_or_b32_e32 v1, v1, v87
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_or_b32_e32 v29, v0, v1
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v67
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v66
+; GFX11-NEXT:    v_or_b32_e32 v0, v0, v96
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_or_b32_e32 v1, v1, v97
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    v_or_b32_e32 v30, v0, v1
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v65
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v64
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_or_b32_e32 v0, v0, v98
+; GFX11-NEXT:    v_or_b32_e32 v1, v1, v99
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_or_b32_e32 v31, v0, v1
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v55
+; GFX11-NEXT:    v_or_b32_e32 v0, v0, v89
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-NEXT:    v_or_b32_e32 v4, s5, v0
+; GFX11-NEXT:    s_and_b32 s5, s0, 0xff
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_or_b32 s5, s5, s6
+; GFX11-NEXT:    s_or_b32 s6, s7, s8
+; GFX11-NEXT:    s_and_b32 s7, s16, 0xff
+; GFX11-NEXT:    s_lshl_b32 s8, s17, 8
+; GFX11-NEXT:    s_and_b32 s5, s5, 0xffff
+; GFX11-NEXT:    s_or_b32 s7, s7, s8
+; GFX11-NEXT:    s_or_b32 s8, s9, s10
+; GFX11-NEXT:    s_lshl_b32 s6, s6, 16
+; GFX11-NEXT:    s_and_b32 s7, s7, 0xffff
+; GFX11-NEXT:    s_lshl_b32 s8, s8, 16
+; GFX11-NEXT:    s_or_b32 s5, s5, s6
+; GFX11-NEXT:    s_or_b32 s6, s7, s8
+; GFX11-NEXT:    s_and_b32 s7, s20, 0xff
+; GFX11-NEXT:    s_lshl_b32 s8, s21, 8
+; GFX11-NEXT:    s_and_b32 s9, s22, 0xff
+; GFX11-NEXT:    s_lshl_b32 s10, s23, 8
+; GFX11-NEXT:    s_or_b32 s7, s7, s8
+; GFX11-NEXT:    s_or_b32 s8, s9, s10
+; GFX11-NEXT:    s_and_b32 s9, s24, 0xff
+; GFX11-NEXT:    s_lshl_b32 s10, s25, 8
+; GFX11-NEXT:    s_and_b32 s7, s7, 0xffff
+; GFX11-NEXT:    s_or_b32 s9, s9, s10
+; GFX11-NEXT:    s_or_b32 s10, s11, s12
+; GFX11-NEXT:    s_lshl_b32 s8, s8, 16
+; GFX11-NEXT:    s_and_b32 s9, s9, 0xffff
+; GFX11-NEXT:    s_lshl_b32 s10, s10, 16
+; GFX11-NEXT:    s_or_b32 s7, s7, s8
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v51
+; GFX11-NEXT:    s_or_b32 s8, s9, s10
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v52
+; GFX11-NEXT:    v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, s6
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_or_b32_e32 v3, v3, v93
+; GFX11-NEXT:    v_or_b32_e32 v2, v2, v92
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_or_b32_e32 v6, v2, v3
+; GFX11-NEXT:    v_dual_mov_b32 v2, s7 :: v_dual_mov_b32 v3, s8
+; GFX11-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s4
+; GFX11-NEXT:    s_cbranch_vccnz .LBB15_3
+; GFX11-NEXT:  .LBB15_2: ; %cmp.true
+; GFX11-NEXT:    s_add_i32 s0, s0, 3
+; GFX11-NEXT:    s_add_i32 s2, s2, 3
+; GFX11-NEXT:    s_and_b32 s0, s0, 0xff
+; GFX11-NEXT:    s_lshl_b32 s1, s1, 8
+; GFX11-NEXT:    s_and_b32 s2, s2, 0xff
+; GFX11-NEXT:    s_lshl_b32 s3, s3, 8
+; GFX11-NEXT:    s_or_b32 s0, s1, s0
+; GFX11-NEXT:    s_or_b32 s1, s3, s2
+; GFX11-NEXT:    s_addk_i32 s0, 0x300
+; GFX11-NEXT:    s_addk_i32 s1, 0x300
+; GFX11-NEXT:    s_and_b32 s0, s0, 0xffff
+; GFX11-NEXT:    s_lshl_b32 s1, s1, 16
+; GFX11-NEXT:    s_add_i32 s16, s16, 3
+; GFX11-NEXT:    s_or_b32 s0, s0, s1
+; GFX11-NEXT:    s_and_b32 s1, s16, 0xff
+; GFX11-NEXT:    s_lshl_b32 s2, s17, 8
+; GFX11-NEXT:    s_add_i32 s18, s18, 3
+; GFX11-NEXT:    s_or_b32 s1, s2, s1
+; GFX11-NEXT:    s_and_b32 s2, s18, 0xff
+; GFX11-NEXT:    s_lshl_b32 s3, s19, 8
+; GFX11-NEXT:    s_addk_i32 s1, 0x300
+; GFX11-NEXT:    s_or_b32 s2, s3, s2
+; GFX11-NEXT:    s_add_i32 s20, s20, 3
+; GFX11-NEXT:    s_addk_i32 s2, 0x300
+; GFX11-NEXT:    s_and_b32 s1, s1, 0xffff
+; GFX11-NEXT:    s_lshl_b32 s2, s2, 16
+; GFX11-NEXT:    s_and_b32 s3, s20, 0xff
+; GFX11-NEXT:    s_lshl_b32 s4, s21, 8
+; GFX11-NEXT:    s_add_i32 s22, s22, 3
+; GFX11-NEXT:    s_or_b32 s1, s1, s2
+; GFX11-NEXT:    s_or_b32 s2, s4, s3
+; GFX11-NEXT:    s_and_b32 s3, s22, 0xff
+; GFX11-NEXT:    s_lshl_b32 s4, s23, 8
+; GFX11-NEXT:    s_add_i32 s24, s24, 3
+; GFX11-NEXT:    s_or_b32 s3, s4, s3
+; GFX11-NEXT:    s_and_b32 s4, s24, 0xff
+; GFX11-NEXT:    s_lshl_b32 s5, s25, 8
+; GFX11-NEXT:    s_addk_i32 s2, 0x300
+; GFX11-NEXT:    s_addk_i32 s3, 0x300
+; GFX11-NEXT:    s_or_b32 s4, s5, s4
+; GFX11-NEXT:    s_and_b32 s2, s2, 0xffff
+; GFX11-NEXT:    s_lshl_b32 s3, s3, 16
+; GFX11-NEXT:    s_addk_i32 s4, 0x300
+; GFX11-NEXT:    s_add_i32 s26, s26, 3
+; GFX11-NEXT:    s_or_b32 s2, s2, s3
+; GFX11-NEXT:    s_and_b32 s3, s4, 0xffff
+; GFX11-NEXT:    s_and_b32 s4, s26, 0xff
+; GFX11-NEXT:    s_lshl_b32 s5, s27, 8
+; GFX11-NEXT:    v_add_nc_u32_e32 v0, 3, v55
+; GFX11-NEXT:    s_or_b32 s4, s5, s4
+; GFX11-NEXT:    v_add_nc_u32_e32 v1, 3, v54
+; GFX11-NEXT:    s_addk_i32 s4, 0x300
+; GFX11-NEXT:    v_add_nc_u32_e32 v3, 3, v52
+; GFX11-NEXT:    s_lshl_b32 s4, s4, 16
+; GFX11-NEXT:    v_add_nc_u32_e32 v4, 3, v51
+; GFX11-NEXT:    s_or_b32 s3, s3, s4
+; GFX11-NEXT:    v_add_nc_u32_e32 v2, 3, v53
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v3
+; GFX11-NEXT:    v_and_b32_e32 v4, 0xff, v4
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GFX11-NEXT:    v_add_nc_u32_e32 v7, 3, v38
+; GFX11-NEXT:    v_add_nc_u32_e32 v11, 3, v34
+; GFX11-NEXT:    v_add_nc_u32_e32 v12, 3, v33
+; GFX11-NEXT:    v_or_b32_e32 v0, v89, v0
+; GFX11-NEXT:    v_or_b32_e32 v1, v90, v1
+; GFX11-NEXT:    v_or_b32_e32 v2, v91, v2
+; GFX11-NEXT:    v_or_b32_e32 v3, v92, v3
+; GFX11-NEXT:    v_or_b32_e32 v4, v93, v4
+; GFX11-NEXT:    v_and_b32_e32 v7, 0xff, v7
+; GFX11-NEXT:    v_and_b32_e32 v11, 0xff, v11
+; GFX11-NEXT:    v_and_b32_e32 v12, 0xff, v12
+; GFX11-NEXT:    s_add_i32 s28, s28, 3
+; GFX11-NEXT:    v_add_nc_u32_e32 v0, 0x300, v0
+; GFX11-NEXT:    v_add_nc_u32_e32 v1, 0x300, v1
+; GFX11-NEXT:    v_add_nc_u32_e32 v2, 0x300, v2
+; GFX11-NEXT:    v_add_nc_u32_e32 v3, 0x300, v3
+; GFX11-NEXT:    v_add_nc_u32_e32 v4, 0x300, v4
+; GFX11-NEXT:    v_or_b32_e32 v7, v88, v7
+; GFX11-NEXT:    v_or_b32_e32 v11, v74, v11
+; GFX11-NEXT:    v_or_b32_e32 v12, v75, v12
+; GFX11-NEXT:    s_and_b32 s5, s28, 0xff
+; GFX11-NEXT:    s_lshl_b32 s6, s29, 8
+; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-NEXT:    s_or_b32 s5, s6, s5
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff, v3
+; GFX11-NEXT:    v_lshlrev_b32_e32 v6, 16, v4
+; GFX11-NEXT:    v_add_nc_u32_e32 v7, 0x300, v7
+; GFX11-NEXT:    v_add_nc_u32_e32 v11, 0x300, v11
+; GFX11-NEXT:    v_add_nc_u32_e32 v12, 0x300, v12
+; GFX11-NEXT:    s_addk_i32 s5, 0x300
+; GFX11-NEXT:    v_or_b32_e32 v5, v1, v2
+; GFX11-NEXT:    s_and_b32 s5, s5, 0xffff
+; GFX11-NEXT:    v_or_b32_e32 v6, v3, v6
+; GFX11-NEXT:    v_or_b32_e32 v4, s5, v0
+; GFX11-NEXT:    v_add_nc_u32_e32 v0, 3, v50
+; GFX11-NEXT:    v_add_nc_u32_e32 v1, 3, v49
+; GFX11-NEXT:    v_add_nc_u32_e32 v2, 3, v48
+; GFX11-NEXT:    v_add_nc_u32_e32 v3, 3, v39
+; GFX11-NEXT:    v_and_b32_e32 v9, 0xffff, v7
+; GFX11-NEXT:    v_add_nc_u32_e32 v7, 3, v37
+; GFX11-NEXT:    v_add_nc_u32_e32 v8, 3, v36
+; GFX11-NEXT:    v_add_nc_u32_e32 v10, 3, v35
+; GFX11-NEXT:    v_and_b32_e32 v11, 0xffff, v11
+; GFX11-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v3
+; GFX11-NEXT:    v_and_b32_e32 v7, 0xff, v7
+; GFX11-NEXT:    v_and_b32_e32 v8, 0xff, v8
+; GFX11-NEXT:    v_and_b32_e32 v10, 0xff, v10
+; GFX11-NEXT:    v_or_b32_e32 v11, v11, v12
+; GFX11-NEXT:    s_waitcnt vmcnt(3)
+; GFX11-NEXT:    v_add_nc_u32_e32 v12, 3, v46
+; GFX11-NEXT:    v_add_nc_u32_e32 v16, 3, v181
+; GFX11-NEXT:    v_add_nc_u32_e32 v17, 3, v180
+; GFX11-NEXT:    v_or_b32_e32 v0, v76, v0
+; GFX11-NEXT:    v_or_b32_e32 v1, v77, v1
+; GFX11-NEXT:    v_or_b32_e32 v2, v78, v2
+; GFX11-NEXT:    v_or_b32_e32 v3, v79, v3
+; GFX11-NEXT:    v_or_b32_e32 v7, v63, v7
+; GFX11-NEXT:    v_or_b32_e32 v8, v72, v8
+; GFX11-NEXT:    v_or_b32_e32 v10, v73, v10
+; GFX11-NEXT:    v_and_b32_e32 v12, 0xff, v12
+; GFX11-NEXT:    v_and_b32_e32 v16, 0xff, v16
+; GFX11-NEXT:    v_and_b32_e32 v17, 0xff, v17
+; GFX11-NEXT:    v_add_nc_u32_e32 v0, 0x300, v0
+; GFX11-NEXT:    v_add_nc_u32_e32 v1, 0x300, v1
+; GFX11-NEXT:    v_add_nc_u32_e32 v2, 0x300, v2
+; GFX11-NEXT:    v_add_nc_u32_e32 v3, 0x300, v3
+; GFX11-NEXT:    v_add_nc_u32_e32 v7, 0x300, v7
+; GFX11-NEXT:    v_add_nc_u32_e32 v8, 0x300, v8
+; GFX11-NEXT:    v_add_nc_u32_e32 v10, 0x300, v10
+; GFX11-NEXT:    v_or_b32_e32 v12, v61, v12
+; GFX11-NEXT:    v_or_b32_e32 v16, v43, v16
+; GFX11-NEXT:    v_or_b32_e32 v17, v44, v17
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-NEXT:    v_lshlrev_b32_e32 v13, 16, v7
+; GFX11-NEXT:    v_and_b32_e32 v14, 0xffff, v8
+; GFX11-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
+; GFX11-NEXT:    v_add_nc_u32_e32 v12, 0x300, v12
+; GFX11-NEXT:    v_add_nc_u32_e32 v16, 0x300, v16
+; GFX11-NEXT:    v_add_nc_u32_e32 v17, 0x300, v17
+; GFX11-NEXT:    v_or_b32_e32 v7, v0, v1
+; GFX11-NEXT:    v_or_b32_e32 v8, v2, v3
+; GFX11-NEXT:    v_or_b32_e32 v9, v9, v13
+; GFX11-NEXT:    v_or_b32_e32 v10, v14, v10
+; GFX11-NEXT:    v_add_nc_u32_e32 v0, 3, v32
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_add_nc_u32_e32 v1, 3, v62
+; GFX11-NEXT:    v_add_nc_u32_e32 v2, 3, v56
+; GFX11-NEXT:    v_add_nc_u32_e32 v3, 3, v47
+; GFX11-NEXT:    v_and_b32_e32 v14, 0xffff, v12
+; GFX11-NEXT:    v_add_nc_u32_e32 v12, 3, v45
+; GFX11-NEXT:    v_add_nc_u32_e32 v13, 3, v183
+; GFX11-NEXT:    v_add_nc_u32_e32 v15, 3, v182
+; GFX11-NEXT:    v_and_b32_e32 v16, 0xffff, v16
+; GFX11-NEXT:    v_lshlrev_b32_e32 v17, 16, v17
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v3
+; GFX11-NEXT:    v_and_b32_e32 v12, 0xff, v12
+; GFX11-NEXT:    v_and_b32_e32 v13, 0xff, v13
+; GFX11-NEXT:    v_and_b32_e32 v15, 0xff, v15
+; GFX11-NEXT:    v_or_b32_e32 v16, v16, v17
+; GFX11-NEXT:    v_add_nc_u32_e32 v17, 3, v162
+; GFX11-NEXT:    v_add_nc_u32_e32 v21, 3, v145
+; GFX11-NEXT:    v_add_nc_u32_e32 v22, 3, v118
+; GFX11-NEXT:    v_or_b32_e32 v0, v57, v0
+; GFX11-NEXT:    v_or_b32_e32 v1, v58, v1
+; GFX11-NEXT:    v_or_b32_e32 v2, v59, v2
+; GFX11-NEXT:    v_or_b32_e32 v3, v60, v3
+; GFX11-NEXT:    v_or_b32_e32 v12, v40, v12
+; GFX11-NEXT:    v_or_b32_e32 v13, v41, v13
+; GFX11-NEXT:    v_or_b32_e32 v15, v42, v15
+; GFX11-NEXT:    v_and_b32_e32 v17, 0xff, v17
+; GFX11-NEXT:    v_and_b32_e32 v21, 0xff, v21
+; GFX11-NEXT:    v_and_b32_e32 v22, 0xff, v22
+; GFX11-NEXT:    v_add_nc_u32_e32 v0, 0x300, v0
+; GFX11-NEXT:    v_add_nc_u32_e32 v1, 0x300, v1
+; GFX11-NEXT:    v_add_nc_u32_e32 v2, 0x300, v2
+; GFX11-NEXT:    v_add_nc_u32_e32 v3, 0x300, v3
+; GFX11-NEXT:    v_add_nc_u32_e32 v12, 0x300, v12
+; GFX11-NEXT:    v_add_nc_u32_e32 v13, 0x300, v13
+; GFX11-NEXT:    v_add_nc_u32_e32 v15, 0x300, v15
+; GFX11-NEXT:    v_or_b32_e32 v17, v179, v17
+; GFX11-NEXT:    v_or_b32_e32 v21, v160, v21
+; GFX11-NEXT:    v_or_b32_e32 v22, v161, v22
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-NEXT:    v_lshlrev_b32_e32 v18, 16, v12
+; GFX11-NEXT:    v_and_b32_e32 v19, 0xffff, v13
+; GFX11-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
+; GFX11-NEXT:    v_add_nc_u32_e32 v17, 0x300, v17
+; GFX11-NEXT:    v_add_nc_u32_e32 v21, 0x300, v21
+; GFX11-NEXT:    v_add_nc_u32_e32 v22, 0x300, v22
+; GFX11-NEXT:    v_or_b32_e32 v12, v0, v1
+; GFX11-NEXT:    v_or_b32_e32 v13, v2, v3
+; GFX11-NEXT:    v_or_b32_e32 v14, v14, v18
+; GFX11-NEXT:    v_or_b32_e32 v15, v19, v15
+; GFX11-NEXT:    v_add_nc_u32_e32 v0, 3, v166
+; GFX11-NEXT:    v_add_nc_u32_e32 v1, 3, v165
+; GFX11-NEXT:    v_add_nc_u32_e32 v2, 3, v164
+; GFX11-NEXT:    v_add_nc_u32_e32 v3, 3, v163
+; GFX11-NEXT:    v_and_b32_e32 v19, 0xffff, v17
+; GFX11-NEXT:    v_add_nc_u32_e32 v17, 3, v148
+; GFX11-NEXT:    v_add_nc_u32_e32 v18, 3, v147
+; GFX11-NEXT:    v_add_nc_u32_e32 v20, 3, v146
+; GFX11-NEXT:    v_and_b32_e32 v21, 0xffff, v21
+; GFX11-NEXT:    v_lshlrev_b32_e32 v22, 16, v22
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v3
+; GFX11-NEXT:    v_and_b32_e32 v17, 0xff, v17
+; GFX11-NEXT:    v_and_b32_e32 v18, 0xff, v18
+; GFX11-NEXT:    v_and_b32_e32 v20, 0xff, v20
+; GFX11-NEXT:    v_or_b32_e32 v21, v21, v22
+; GFX11-NEXT:    v_add_nc_u32_e32 v22, 3, v100
+; GFX11-NEXT:    v_add_nc_u32_e32 v26, 3, v83
+; GFX11-NEXT:    v_add_nc_u32_e32 v27, 3, v82
+; GFX11-NEXT:    v_or_b32_e32 v0, v167, v0
+; GFX11-NEXT:    v_or_b32_e32 v1, v176, v1
+; GFX11-NEXT:    v_or_b32_e32 v2, v177, v2
+; GFX11-NEXT:    v_or_b32_e32 v3, v178, v3
+; GFX11-NEXT:    v_or_b32_e32 v17, v149, v17
+; GFX11-NEXT:    v_or_b32_e32 v18, v150, v18
+; GFX11-NEXT:    v_or_b32_e32 v20, v151, v20
+; GFX11-NEXT:    v_and_b32_e32 v22, 0xff, v22
+; GFX11-NEXT:    v_and_b32_e32 v26, 0xff, v26
+; GFX11-NEXT:    v_and_b32_e32 v27, 0xff, v27
+; GFX11-NEXT:    v_add_nc_u32_e32 v0, 0x300, v0
+; GFX11-NEXT:    v_add_nc_u32_e32 v1, 0x300, v1
+; GFX11-NEXT:    v_add_nc_u32_e32 v2, 0x300, v2
+; GFX11-NEXT:    v_add_nc_u32_e32 v3, 0x300, v3
+; GFX11-NEXT:    v_add_nc_u32_e32 v17, 0x300, v17
+; GFX11-NEXT:    v_add_nc_u32_e32 v18, 0x300, v18
+; GFX11-NEXT:    v_add_nc_u32_e32 v20, 0x300, v20
+; GFX11-NEXT:    v_or_b32_e32 v22, v144, v22
+; GFX11-NEXT:    v_or_b32_e32 v26, v130, v26
+; GFX11-NEXT:    v_or_b32_e32 v27, v131, v27
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-NEXT:    v_lshlrev_b32_e32 v23, 16, v17
+; GFX11-NEXT:    v_and_b32_e32 v24, 0xffff, v18
+; GFX11-NEXT:    v_lshlrev_b32_e32 v20, 16, v20
+; GFX11-NEXT:    v_add_nc_u32_e32 v22, 0x300, v22
+; GFX11-NEXT:    v_add_nc_u32_e32 v26, 0x300, v26
+; GFX11-NEXT:    v_add_nc_u32_e32 v27, 0x300, v27
+; GFX11-NEXT:    v_or_b32_e32 v17, v0, v1
+; GFX11-NEXT:    v_or_b32_e32 v18, v2, v3
+; GFX11-NEXT:    v_or_b32_e32 v19, v19, v23
+; GFX11-NEXT:    v_or_b32_e32 v20, v24, v20
+; GFX11-NEXT:    v_add_nc_u32_e32 v0, 3, v112
+; GFX11-NEXT:    v_add_nc_u32_e32 v1, 3, v103
+; GFX11-NEXT:    v_add_nc_u32_e32 v2, 3, v102
+; GFX11-NEXT:    v_add_nc_u32_e32 v3, 3, v101
+; GFX11-NEXT:    v_and_b32_e32 v24, 0xffff, v22
+; GFX11-NEXT:    v_add_nc_u32_e32 v22, 3, v86
+; GFX11-NEXT:    v_add_nc_u32_e32 v23, 3, v85
+; GFX11-NEXT:    v_add_nc_u32_e32 v25, 3, v84
+; GFX11-NEXT:    v_and_b32_e32 v26, 0xffff, v26
+; GFX11-NEXT:    v_lshlrev_b32_e32 v27, 16, v27
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v3
+; GFX11-NEXT:    v_and_b32_e32 v22, 0xff, v22
+; GFX11-NEXT:    v_and_b32_e32 v23, 0xff, v23
+; GFX11-NEXT:    v_and_b32_e32 v25, 0xff, v25
+; GFX11-NEXT:    v_or_b32_e32 v26, v26, v27
+; GFX11-NEXT:    v_add_nc_u32_e32 v27, 3, v69
+; GFX11-NEXT:    v_or_b32_e32 v0, v132, v0
+; GFX11-NEXT:    v_or_b32_e32 v1, v133, v1
+; GFX11-NEXT:    v_or_b32_e32 v2, v134, v2
+; GFX11-NEXT:    v_or_b32_e32 v3, v135, v3
+; GFX11-NEXT:    v_or_b32_e32 v22, v119, v22
+; GFX11-NEXT:    v_or_b32_e32 v23, v128, v23
+; GFX11-NEXT:    v_or_b32_e32 v25, v129, v25
+; GFX11-NEXT:    v_and_b32_e32 v27, 0xff, v27
+; GFX11-NEXT:    v_add_nc_u32_e32 v0, 0x300, v0
+; GFX11-NEXT:    v_add_nc_u32_e32 v1, 0x300, v1
+; GFX11-NEXT:    v_add_nc_u32_e32 v2, 0x300, v2
+; GFX11-NEXT:    v_add_nc_u32_e32 v3, 0x300, v3
+; GFX11-NEXT:    v_add_nc_u32_e32 v22, 0x300, v22
+; GFX11-NEXT:    v_add_nc_u32_e32 v23, 0x300, v23
+; GFX11-NEXT:    v_add_nc_u32_e32 v25, 0x300, v25
+; GFX11-NEXT:    v_or_b32_e32 v27, v117, v27
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-NEXT:    v_lshlrev_b32_e32 v28, 16, v22
+; GFX11-NEXT:    v_and_b32_e32 v29, 0xffff, v23
+; GFX11-NEXT:    v_lshlrev_b32_e32 v25, 16, v25
+; GFX11-NEXT:    v_add_nc_u32_e32 v27, 0x300, v27
+; GFX11-NEXT:    v_or_b32_e32 v22, v0, v1
+; GFX11-NEXT:    v_or_b32_e32 v23, v2, v3
+; GFX11-NEXT:    v_or_b32_e32 v24, v24, v28
+; GFX11-NEXT:    v_or_b32_e32 v25, v29, v25
+; GFX11-NEXT:    v_add_nc_u32_e32 v0, 3, v81
+; GFX11-NEXT:    v_add_nc_u32_e32 v1, 3, v80
+; GFX11-NEXT:    v_add_nc_u32_e32 v2, 3, v71
+; GFX11-NEXT:    v_add_nc_u32_e32 v3, 3, v70
+; GFX11-NEXT:    v_and_b32_e32 v29, 0xffff, v27
+; GFX11-NEXT:    v_add_nc_u32_e32 v27, 3, v68
+; GFX11-NEXT:    v_add_nc_u32_e32 v28, 3, v67
+; GFX11-NEXT:    v_add_nc_u32_e32 v30, 3, v66
+; GFX11-NEXT:    v_add_nc_u32_e32 v31, 3, v65
+; GFX11-NEXT:    v_add_nc_u32_e32 v32, 3, v64
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v3
+; GFX11-NEXT:    v_and_b32_e32 v27, 0xff, v27
+; GFX11-NEXT:    v_and_b32_e32 v28, 0xff, v28
+; GFX11-NEXT:    v_and_b32_e32 v30, 0xff, v30
+; GFX11-NEXT:    v_and_b32_e32 v31, 0xff, v31
+; GFX11-NEXT:    v_and_b32_e32 v32, 0xff, v32
+; GFX11-NEXT:    v_or_b32_e32 v0, v113, v0
+; GFX11-NEXT:    v_or_b32_e32 v1, v114, v1
+; GFX11-NEXT:    v_or_b32_e32 v2, v115, v2
+; GFX11-NEXT:    v_or_b32_e32 v3, v116, v3
+; GFX11-NEXT:    v_or_b32_e32 v27, v87, v27
+; GFX11-NEXT:    v_or_b32_e32 v28, v96, v28
+; GFX11-NEXT:    v_or_b32_e32 v30, v97, v30
+; GFX11-NEXT:    v_or_b32_e32 v31, v98, v31
+; GFX11-NEXT:    v_or_b32_e32 v32, v99, v32
+; GFX11-NEXT:    v_add_nc_u32_e32 v0, 0x300, v0
+; GFX11-NEXT:    v_add_nc_u32_e32 v1, 0x300, v1
+; GFX11-NEXT:    v_add_nc_u32_e32 v2, 0x300, v2
+; GFX11-NEXT:    v_add_nc_u32_e32 v3, 0x300, v3
+; GFX11-NEXT:    v_add_nc_u32_e32 v27, 0x300, v27
+; GFX11-NEXT:    v_add_nc_u32_e32 v28, 0x300, v28
+; GFX11-NEXT:    v_add_nc_u32_e32 v30, 0x300, v30
+; GFX11-NEXT:    v_add_nc_u32_e32 v31, 0x300, v31
+; GFX11-NEXT:    v_add_nc_u32_e32 v32, 0x300, v32
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-NEXT:    v_lshlrev_b32_e32 v33, 16, v27
+; GFX11-NEXT:    v_and_b32_e32 v34, 0xffff, v28
+; GFX11-NEXT:    v_lshlrev_b32_e32 v30, 16, v30
+; GFX11-NEXT:    v_and_b32_e32 v31, 0xffff, v31
+; GFX11-NEXT:    v_lshlrev_b32_e32 v32, 16, v32
+; GFX11-NEXT:    v_or_b32_e32 v27, v0, v1
+; GFX11-NEXT:    v_or_b32_e32 v28, v2, v3
+; GFX11-NEXT:    v_or_b32_e32 v29, v29, v33
+; GFX11-NEXT:    v_or_b32_e32 v30, v34, v30
+; GFX11-NEXT:    v_or_b32_e32 v31, v31, v32
+; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-NEXT:  .LBB15_3: ; %end
+; GFX11-NEXT:    s_clause 0x1f
+; GFX11-NEXT:    scratch_load_b32 v111, off, s32 offset:320
+; GFX11-NEXT:    scratch_load_b32 v110, off, s32 offset:324
+; GFX11-NEXT:    scratch_load_b32 v109, off, s32 offset:328
+; GFX11-NEXT:    scratch_load_b32 v108, off, s32 offset:332
+; GFX11-NEXT:    scratch_load_b32 v107, off, s32 offset:336
+; GFX11-NEXT:    scratch_load_b32 v106, off, s32 offset:340
+; GFX11-NEXT:    scratch_load_b32 v105, off, s32 offset:344
+; GFX11-NEXT:    scratch_load_b32 v104, off, s32 offset:348
+; GFX11-NEXT:    scratch_load_b32 v95, off, s32 offset:352
+; GFX11-NEXT:    scratch_load_b32 v94, off, s32 offset:356
+; GFX11-NEXT:    scratch_load_b32 v93, off, s32 offset:360
+; GFX11-NEXT:    scratch_load_b32 v92, off, s32 offset:364
+; GFX11-NEXT:    scratch_load_b32 v91, off, s32 offset:368
+; GFX11-NEXT:    scratch_load_b32 v90, off, s32 offset:372
+; GFX11-NEXT:    scratch_load_b32 v89, off, s32 offset:376
+; GFX11-NEXT:    scratch_load_b32 v88, off, s32 offset:380
+; GFX11-NEXT:    scratch_load_b32 v79, off, s32 offset:384
+; GFX11-NEXT:    scratch_load_b32 v78, off, s32 offset:388
+; GFX11-NEXT:    scratch_load_b32 v77, off, s32 offset:392
+; GFX11-NEXT:    scratch_load_b32 v76, off, s32 offset:396
+; GFX11-NEXT:    scratch_load_b32 v75, off, s32 offset:400
+; GFX11-NEXT:    scratch_load_b32 v74, off, s32 offset:404
+; GFX11-NEXT:    scratch_load_b32 v73, off, s32 offset:408
+; GFX11-NEXT:    scratch_load_b32 v72, off, s32 offset:412
+; GFX11-NEXT:    scratch_load_b32 v63, off, s32 offset:416
+; GFX11-NEXT:    scratch_load_b32 v62, off, s32 offset:420
+; GFX11-NEXT:    scratch_load_b32 v61, off, s32 offset:424
+; GFX11-NEXT:    scratch_load_b32 v60, off, s32 offset:428
+; GFX11-NEXT:    scratch_load_b32 v59, off, s32 offset:432
+; GFX11-NEXT:    scratch_load_b32 v58, off, s32 offset:436
+; GFX11-NEXT:    scratch_load_b32 v57, off, s32 offset:440
+; GFX11-NEXT:    scratch_load_b32 v56, off, s32 offset:444
+; GFX11-NEXT:    s_clause 0x7
+; GFX11-NEXT:    scratch_load_b32 v47, off, s32 offset:448
+; GFX11-NEXT:    scratch_load_b32 v46, off, s32 offset:452
+; GFX11-NEXT:    scratch_load_b32 v45, off, s32 offset:456
+; GFX11-NEXT:    scratch_load_b32 v44, off, s32 offset:460
+; GFX11-NEXT:    scratch_load_b32 v43, off, s32 offset:464
+; GFX11-NEXT:    scratch_load_b32 v42, off, s32 offset:468
+; GFX11-NEXT:    scratch_load_b32 v41, off, s32 offset:472
+; GFX11-NEXT:    scratch_load_b32 v40, off, s32 offset:476
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-NEXT:  .LBB15_4:
+; GFX11-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+; GFX11-NEXT:    s_branch .LBB15_2
   %cmp = icmp eq i32 %b, 0
   br i1 %cmp, label %cmp.true, label %cmp.false
 
@@ -52038,876 +51175,913 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) {
 ; GFX11-TRUE16:       ; %bb.0:
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-TRUE16-NEXT:    s_clause 0x1f
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v31, off, s32 offset:384
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v32, off, s32 offset:380
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v31, off, s32 offset:376
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v32, off, s32 offset:372
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v50, off, s32 offset:368
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v33, off, s32 offset:364
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v51, off, s32 offset:360
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v33, off, s32 offset:356
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v51, off, s32 offset:352
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v34, off, s32 offset:348
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v52, off, s32 offset:344
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v34, off, s32 offset:340
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v52, off, s32 offset:336
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v35, off, s32 offset:332
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v53, off, s32 offset:328
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v35, off, s32 offset:324
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v53, off, s32 offset:320
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v36, off, s32 offset:316
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v54, off, s32 offset:312
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v36, off, s32 offset:308
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v54, off, s32 offset:304
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v37, off, s32 offset:300
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v55, off, s32 offset:296
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v37, off, s32 offset:292
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v64, off, s32 offset:288
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v38, off, s32 offset:284
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v64, off, s32 offset:280
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v38, off, s32 offset:276
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v66, off, s32 offset:272
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v39, off, s32 offset:268
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v66, off, s32 offset:264
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v39, off, s32 offset:260
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v40, s32 offset:536
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v41, s32 offset:532
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v42, s32 offset:528
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v43, s32 offset:524
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v44, s32 offset:520
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v45, s32 offset:516
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v46, s32 offset:512
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v47, s32 offset:508
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v56, s32 offset:504
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v57, s32 offset:500
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v58, s32 offset:496
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v59, s32 offset:492
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v60, s32 offset:488
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v61, s32 offset:484
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v62, s32 offset:480
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v63, s32 offset:476
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v72, s32 offset:472
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v73, s32 offset:468
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v74, s32 offset:464
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v75, s32 offset:460
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v76, s32 offset:456
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v77, s32 offset:452
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v78, s32 offset:448
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v79, s32 offset:444
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v88, s32 offset:440
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v89, s32 offset:436
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v90, s32 offset:432
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v91, s32 offset:428
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v92, s32 offset:424
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v93, s32 offset:420
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v94, s32 offset:416
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v95, s32 offset:412
+; GFX11-TRUE16-NEXT:    s_clause 0x4
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v104, s32 offset:408
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v105, s32 offset:404
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v106, s32 offset:400
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v107, s32 offset:396
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v108, s32 offset:392
 ; GFX11-TRUE16-NEXT:    s_clause 0x1f
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v67, off, s32 offset:256
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v48, off, s32 offset:252
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v68, off, s32 offset:248
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v48, off, s32 offset:244
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v70, off, s32 offset:240
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v49, off, s32 offset:236
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v70, off, s32 offset:232
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v49, off, s32 offset:228
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v71, off, s32 offset:224
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v50, off, s32 offset:220
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v71, off, s32 offset:216
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v114, off, s32 offset:388
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v81, off, s32
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v82, off, s32 offset:8
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v83, off, s32 offset:16
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v83, off, s32 offset:24
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v85, off, s32 offset:32
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v85, off, s32 offset:40
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v87, off, s32 offset:48
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v87, off, s32 offset:56
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v97, off, s32 offset:64
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v97, off, s32 offset:72
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v98, off, s32 offset:80
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v99, off, s32 offset:88
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v100, off, s32 offset:96
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v101, off, s32 offset:104
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v102, off, s32 offset:112
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v160, off, s32 offset:120
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v160, off, s32 offset:128
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v161, off, s32 offset:136
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v161, off, s32 offset:144
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v162, off, s32 offset:152
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v31, off, s32 offset:384
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v112, off, s32 offset:380
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v32, off, s32 offset:376
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v113, off, s32 offset:372
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v56, off, s32 offset:368
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v114, off, s32 offset:364
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v33, off, s32 offset:360
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v115, off, s32 offset:356
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v57, off, s32 offset:352
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v116, off, s32 offset:348
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v34, off, s32 offset:344
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v117, off, s32 offset:340
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v58, off, s32 offset:336
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v118, off, s32 offset:332
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v35, off, s32 offset:328
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v119, off, s32 offset:324
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v59, off, s32 offset:320
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v128, off, s32 offset:316
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v36, off, s32 offset:312
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v129, off, s32 offset:308
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v60, off, s32 offset:304
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v130, off, s32 offset:300
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v37, off, s32 offset:296
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v131, off, s32 offset:292
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v61, off, s32 offset:288
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v132, off, s32 offset:284
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v38, off, s32 offset:280
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v133, off, s32 offset:276
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v62, off, s32 offset:272
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v134, off, s32 offset:268
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v39, off, s32 offset:264
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v135, off, s32 offset:260
 ; GFX11-TRUE16-NEXT:    s_clause 0x1f
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v162, off, s32 offset:160
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v163, off, s32 offset:168
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v163, off, s32 offset:176
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v164, off, s32 offset:184
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v164, off, s32 offset:192
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v165, off, s32 offset:200
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v165, off, s32 offset:208
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v55, off, s32 offset:212
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v65, off, s32 offset:204
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v65, off, s32 offset:196
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v67, off, s32 offset:188
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v68, off, s32 offset:180
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v69, off, s32 offset:172
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v69, off, s32 offset:164
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v80, off, s32 offset:156
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v80, off, s32 offset:148
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v81, off, s32 offset:140
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v82, off, s32 offset:132
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v84, off, s32 offset:124
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v84, off, s32 offset:116
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v86, off, s32 offset:108
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v86, off, s32 offset:100
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v96, off, s32 offset:92
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v96, off, s32 offset:84
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v98, off, s32 offset:76
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v99, off, s32 offset:68
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v100, off, s32 offset:60
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v101, off, s32 offset:52
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v103, off, s32 offset:44
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v103, off, s32 offset:36
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v112, off, s32 offset:28
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v113, off, s32 offset:20
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v63, off, s32 offset:256
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v144, off, s32 offset:252
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v48, off, s32 offset:248
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v145, off, s32 offset:244
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v72, off, s32 offset:240
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v146, off, s32 offset:236
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v49, off, s32 offset:232
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v147, off, s32 offset:228
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v73, off, s32 offset:224
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v148, off, s32 offset:220
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v50, off, s32 offset:216
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v51, off, s32 offset:388
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v52, off, s32
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v53, off, s32 offset:8
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v54, off, s32 offset:16
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v55, off, s32 offset:24
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v64, off, s32 offset:32
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v65, off, s32 offset:40
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v66, off, s32 offset:48
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v67, off, s32 offset:56
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v74, off, s32 offset:64
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v75, off, s32 offset:72
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v76, off, s32 offset:80
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v77, off, s32 offset:88
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v78, off, s32 offset:96
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v79, off, s32 offset:104
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v88, off, s32 offset:112
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v89, off, s32 offset:120
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v90, off, s32 offset:128
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v91, off, s32 offset:136
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v92, off, s32 offset:144
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v93, off, s32 offset:152
+; GFX11-TRUE16-NEXT:    s_clause 0x1f
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v94, off, s32 offset:160
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v95, off, s32 offset:168
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v104, off, s32 offset:176
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v105, off, s32 offset:184
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v106, off, s32 offset:192
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v107, off, s32 offset:200
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v108, off, s32 offset:208
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v149, off, s32 offset:212
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v150, off, s32 offset:204
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v151, off, s32 offset:196
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v160, off, s32 offset:188
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v161, off, s32 offset:180
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v162, off, s32 offset:172
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v163, off, s32 offset:164
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v164, off, s32 offset:156
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v165, off, s32 offset:148
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v166, off, s32 offset:140
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v167, off, s32 offset:132
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v176, off, s32 offset:124
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v177, off, s32 offset:116
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v178, off, s32 offset:108
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v179, off, s32 offset:100
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v180, off, s32 offset:92
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v181, off, s32 offset:84
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v182, off, s32 offset:76
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v183, off, s32 offset:68
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v40, off, s32 offset:60
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v41, off, s32 offset:52
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v42, off, s32 offset:44
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v43, off, s32 offset:36
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v44, off, s32 offset:28
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v45, off, s32 offset:20
 ; GFX11-TRUE16-NEXT:    s_clause 0x1
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v115, off, s32 offset:12
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v115, off, s32 offset:4
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v117.l, v30.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v118.h, v28.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v119.l, v26.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v119.h, v24.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v130.l, v22.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v130.h, v20.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v131.l, v18.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v132.h, v16.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v134.l, v14.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v134.h, v12.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v144.l, v10.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v145.h, v8.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v146.h, v6.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v146.l, v4.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v149.l, v2.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v149.h, v0.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v151.l, 8, v1.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v151.h, 8, v3.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v150.l, 8, v5.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v150.h, 8, v7.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v148.l, 8, v9.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v148.h, 8, v11.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v147.l, 8, v13.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v147.h, 8, v15.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v144.h, 8, v17.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v145.l, 8, v19.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v135.l, 8, v21.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v135.h, 8, v23.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v133.l, 8, v25.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v133.h, 8, v27.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v131.h, 8, v29.l
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v46, off, s32 offset:12
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v47, off, s32 offset:4
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v69.l, v30.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v70.h, v28.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v71.l, v26.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v71.h, v24.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v82.l, v22.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v82.h, v20.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v83.l, v18.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v84.h, v16.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v86.l, v14.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v86.h, v12.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v96.l, v10.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v97.h, v8.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v98.h, v6.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v98.l, v4.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v101.l, v2.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v101.h, v0.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v103.l, 8, v1.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v103.h, 8, v3.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v102.l, 8, v5.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v102.h, 8, v7.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v100.l, 8, v9.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v100.h, 8, v11.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v99.l, 8, v13.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v99.h, 8, v15.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v96.h, 8, v17.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v97.l, 8, v19.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v87.l, 8, v21.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v87.h, 8, v23.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v85.l, 8, v25.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v85.h, 8, v27.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v83.h, 8, v29.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(62)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v32.h, 8, v31.l
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v32.l, 8, v32.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v33.h, 8, v56.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v33.l, 8, v33.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v34.h, 8, v57.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v34.l, 8, v34.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v35.h, 8, v58.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v35.l, 8, v35.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v36.h, 8, v59.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v36.l, 8, v36.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v37.h, 8, v60.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v37.l, 8, v37.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v38.h, 8, v61.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v38.l, 8, v38.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v39.h, 8, v62.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v39.l, 8, v39.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v48.h, 8, v63.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v48.l, 8, v48.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(59)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v49.h, 8, v72.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v49.l, 8, v49.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(55)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v50.h, 8, v73.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v50.l, 8, v50.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(54)
-; GFX11-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v114
+; GFX11-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v51
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(53)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v132.l, 8, v81.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v84.l, 8, v52.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(52)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v129.l, 8, v82.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v81.l, 8, v53.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(51)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v129.h, 8, v83.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v81.h, 8, v54.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(50)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v128.l, 8, v83.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v80.l, 8, v55.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(49)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v128.h, 8, v85.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v80.h, 8, v64.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(48)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v117.h, 8, v85.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v69.h, 8, v65.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(47)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v118.l, 8, v87.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v70.l, 8, v66.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(46)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v116.l, 8, v87.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v68.l, 8, v67.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(45)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v116.h, 8, v97.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v68.h, 8, v74.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(44)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v114.l, 8, v97.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v67.l, 8, v75.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(43)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v114.h, 8, v98.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v67.h, 8, v76.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(42)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v112.h, 8, v99.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v66.l, 8, v77.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(41)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v113.l, 8, v100.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v66.h, 8, v78.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(40)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v102.l, 8, v101.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v65.l, 8, v79.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(39)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v102.h, 8, v102.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v65.h, 8, v88.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(38)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v100.h, 8, v160.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v64.l, 8, v89.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(37)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v101.l, 8, v160.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v64.h, 8, v90.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(36)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v98.h, 8, v161.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v55.l, 8, v91.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(35)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v99.l, 8, v161.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v55.h, 8, v92.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(34)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v97.l, 8, v162.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v54.l, 8, v93.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(33)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v97.h, 8, v162.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v54.h, 8, v94.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(32)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v87.l, 8, v163.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v53.l, 8, v95.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(31)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v87.h, 8, v163.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v53.h, 8, v104.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(30)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v85.l, 8, v164.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v52.l, 8, v105.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(29)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v85.h, 8, v164.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v52.h, 8, v106.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(28)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v83.l, 8, v165.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v51.l, 8, v107.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(27)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v83.h, 8, v165.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v81.h, 8, v71.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v82.l, 8, v71.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v71.l, 8, v70.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v71.h, 8, v70.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v70.l, 8, v68.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v70.h, 8, v67.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v67.h, 8, v66.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v68.l, 8, v66.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v66.l, 8, v64.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v66.h, 8, v64.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v64.l, 8, v55.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v64.h, 8, v54.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v54.h, 8, v54.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v55.l, 8, v53.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v53.h, 8, v53.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v54.l, 8, v52.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v52.h, 8, v52.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v53.l, 8, v51.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v51.h, 8, v51.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v52.l, 8, v50.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v50.h, 8, v31.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v51.l, 8, v31.l
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v51.h, 8, v108.l
 ; GFX11-TRUE16-NEXT:    s_and_saveexec_b32 s0, vcc_lo
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-TRUE16-NEXT:    s_xor_b32 s0, exec_lo, s0
-; GFX11-TRUE16-NEXT:    s_cbranch_execnz .LBB38_3
-; GFX11-TRUE16-NEXT:  ; %bb.1: ; %Flow
-; GFX11-TRUE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX11-TRUE16-NEXT:    s_cbranch_execnz .LBB38_4
-; GFX11-TRUE16-NEXT:  .LBB38_2: ; %end
-; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
-; GFX11-TRUE16-NEXT:  .LBB38_3: ; %cmp.false
-; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v149.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v149.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v149.h, 0
-; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v146.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v146.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v149.l, v0.l, v151.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v0.h, v151.h
-; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v0.l, v149.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v3.h, v1.l, v150.h
-; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v3.l, v149.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v145.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v2.h, 0xff, v144.l
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v149, v0
-; GFX11-TRUE16-NEXT:    v_or_b16 v149.l, v1.h, v150.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v4.l, 0xff, v132.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v4.h, 0xff, v131.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v2.h, v2.h, v148.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v5.l, 0xff, v130.h
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v149, v3
-; GFX11-TRUE16-NEXT:    v_or_b16 v149.l, v2.l, v148.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v2.l, v149.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v3.l, 0xff, v134.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v3.h, 0xff, v134.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v4.h, v4.h, v145.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v5.h, 0xff, v130.l
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v2, v149, v2
-; GFX11-TRUE16-NEXT:    v_or_b16 v149.l, v3.l, v147.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v3.h, v3.h, v147.h
-; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v3.l, v149.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v5.h, v5.h, v135.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v6.l, 0xff, v119.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v6.h, 0xff, v119.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v7.l, 0xff, v118.h
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v3, v149, v3
-; GFX11-TRUE16-NEXT:    v_or_b16 v149.l, v4.l, v144.h
-; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v4.l, v149.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v6.h, v6.h, v133.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v7.h, 0xff, v117.l
+; GFX11-TRUE16-NEXT:    s_cbranch_execz .LBB38_2
+; GFX11-TRUE16-NEXT:  ; %bb.1: ; %cmp.false
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v101.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v101.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v101.h, 0
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v98.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v98.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v101.l, v0.l, v103.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v0.h, v103.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v101.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v3.h, v1.l, v102.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v101.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v97.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.h, 0xff, v96.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v101, v0
+; GFX11-TRUE16-NEXT:    v_or_b16 v101.l, v1.h, v102.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v4.l, 0xff, v84.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v4.h, 0xff, v83.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.h, v2.h, v100.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v5.l, 0xff, v82.h
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v101, v3
+; GFX11-TRUE16-NEXT:    v_or_b16 v101.l, v2.l, v100.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v101.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v3.l, 0xff, v86.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v3.h, 0xff, v86.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v4.h, v4.h, v97.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v5.h, 0xff, v82.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v2, v101, v2
+; GFX11-TRUE16-NEXT:    v_or_b16 v101.l, v3.l, v99.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v3.h, v3.h, v99.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v101.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v5.h, v5.h, v87.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v6.l, 0xff, v71.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v6.h, 0xff, v71.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v7.l, 0xff, v70.h
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v3, v101, v3
+; GFX11-TRUE16-NEXT:    v_or_b16 v101.l, v4.l, v96.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v4.l, v101.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v6.h, v6.h, v85.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v7.h, 0xff, v69.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    v_and_b16 v8.l, 0xff, v115.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v8.h, 0xff, v115.l
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v4, v149, v4
-; GFX11-TRUE16-NEXT:    v_or_b16 v149.l, v5.l, v135.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v5.l, v149.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v7.h, v7.h, v132.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v8.h, v8.h, v129.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v9.l, 0xff, v113.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v9.h, 0xff, v112.l
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v5, v149, v5
-; GFX11-TRUE16-NEXT:    v_or_b16 v149.l, v6.l, v133.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v6.l, v149.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v10.l, 0xff, v103.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v9.h, v9.h, v128.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v10.h, 0xff, v103.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v11.l, 0xff, v101.h
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v6, v149, v6
-; GFX11-TRUE16-NEXT:    v_or_b16 v149.l, v7.l, v131.h
-; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v7.l, v149.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v10.h, v10.h, v118.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v11.h, 0xff, v100.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v12.l, 0xff, v99.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v12.h, 0xff, v98.l
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v7, v149, v7
-; GFX11-TRUE16-NEXT:    v_or_b16 v149.l, v8.l, v129.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v8.l, v149.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v11.h, v11.h, v116.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v12.h, v12.h, v114.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v13.l, 0xff, v96.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v13.h, 0xff, v96.l
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v8, v149, v8
-; GFX11-TRUE16-NEXT:    v_or_b16 v149.l, v9.l, v128.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v9.l, v149.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v14.l, 0xff, v86.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v13.h, v13.h, v113.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v14.h, 0xff, v86.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v15.l, 0xff, v84.h
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v9, v149, v9
-; GFX11-TRUE16-NEXT:    v_or_b16 v149.l, v10.l, v117.h
-; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v10.l, v149.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v14.h, v14.h, v102.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v15.h, 0xff, v84.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v16.l, 0xff, v82.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v16.h, 0xff, v81.l
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v10, v149, v10
-; GFX11-TRUE16-NEXT:    v_or_b16 v149.l, v11.l, v116.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v11.l, v149.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v15.h, v15.h, v101.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v16.h, v16.h, v99.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v17.l, 0xff, v80.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v17.h, 0xff, v80.l
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v11, v149, v11
-; GFX11-TRUE16-NEXT:    v_or_b16 v149.l, v12.l, v114.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v12.l, v149.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v18.l, 0xff, v69.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v17.h, v17.h, v97.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v18.h, 0xff, v69.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v19.l, 0xff, v68.h
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v12, v149, v12
-; GFX11-TRUE16-NEXT:    v_or_b16 v149.l, v13.l, v112.h
-; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v13.l, v149.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v18.h, v18.h, v87.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v19.h, 0xff, v67.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v20.l, 0xff, v65.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v20.h, 0xff, v65.l
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v13, v149, v13
-; GFX11-TRUE16-NEXT:    v_or_b16 v149.l, v14.l, v102.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v14.l, v149.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v19.h, v19.h, v85.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v20.h, v20.h, v83.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v21.l, 0xff, v55.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v21.h, 0xff, v50.l
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v14, v149, v14
-; GFX11-TRUE16-NEXT:    v_or_b16 v149.l, v15.l, v100.h
-; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v15.l, v149.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v22.l, 0xff, v49.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v21.h, v21.h, v82.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v22.h, 0xff, v49.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v23.l, 0xff, v48.h
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v15, v149, v15
-; GFX11-TRUE16-NEXT:    v_or_b16 v149.l, v16.l, v98.h
-; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v16.l, v149.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v22.h, v22.h, v71.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v23.h, 0xff, v48.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v24.l, 0xff, v39.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v24.h, 0xff, v39.l
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v16, v149, v16
-; GFX11-TRUE16-NEXT:    v_or_b16 v149.l, v17.l, v97.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v17.l, v149.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v23.h, v23.h, v70.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v24.h, v24.h, v68.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v25.l, 0xff, v38.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v25.h, 0xff, v38.l
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v17, v149, v17
-; GFX11-TRUE16-NEXT:    v_or_b16 v149.l, v18.l, v87.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v18.l, v149.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v26.l, 0xff, v37.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v25.h, v25.h, v66.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v26.h, 0xff, v37.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v27.l, 0xff, v36.h
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v18, v149, v18
-; GFX11-TRUE16-NEXT:    v_or_b16 v149.l, v19.l, v85.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v19.l, v149.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v26.h, v26.h, v64.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v27.h, 0xff, v36.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v28.l, 0xff, v35.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v28.h, 0xff, v35.l
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v19, v149, v19
-; GFX11-TRUE16-NEXT:    v_or_b16 v149.l, v20.l, v83.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v20.l, v149.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v27.h, v27.h, v55.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v28.h, v28.h, v54.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v29.l, 0xff, v34.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v29.h, 0xff, v34.l
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v20, v149, v20
-; GFX11-TRUE16-NEXT:    v_or_b16 v149.l, v21.l, v81.h
-; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v21.l, v149.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v30.l, 0xff, v33.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v29.h, v29.h, v53.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v30.h, 0xff, v33.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v31.l, 0xff, v32.h
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v21, v149, v21
-; GFX11-TRUE16-NEXT:    v_or_b16 v149.l, v22.l, v71.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v22.l, v149.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v30.h, v30.h, v52.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v31.h, 0xff, v32.l
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr146_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr146_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr145_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr144_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr134_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr134_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr132_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr131_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr130_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr130_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr119_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr119_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr118_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr117_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr115_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr115_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr113_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr112_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr103_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr103_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr101_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr100_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr99_hi16
+; GFX11-TRUE16-NEXT:    v_and_b16 v8.l, 0xff, v47.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v8.h, 0xff, v46.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v4, v101, v4
+; GFX11-TRUE16-NEXT:    v_or_b16 v101.l, v5.l, v87.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v101.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v7.h, v7.h, v84.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v8.h, v8.h, v81.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v9.l, 0xff, v45.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v9.h, 0xff, v44.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v5, v101, v5
+; GFX11-TRUE16-NEXT:    v_or_b16 v101.l, v6.l, v85.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v6.l, v101.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v10.l, 0xff, v43.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v9.h, v9.h, v80.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v10.h, 0xff, v42.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v11.l, 0xff, v41.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v6, v101, v6
+; GFX11-TRUE16-NEXT:    v_or_b16 v101.l, v7.l, v83.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v7.l, v101.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v10.h, v10.h, v70.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v11.h, 0xff, v40.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v12.l, 0xff, v183.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v12.h, 0xff, v182.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v7, v101, v7
+; GFX11-TRUE16-NEXT:    v_or_b16 v101.l, v8.l, v81.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v8.l, v101.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v11.h, v11.h, v68.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v12.h, v12.h, v67.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v13.l, 0xff, v181.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v13.h, 0xff, v180.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v8, v101, v8
+; GFX11-TRUE16-NEXT:    v_or_b16 v101.l, v9.l, v80.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v9.l, v101.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v14.l, 0xff, v179.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v13.h, v13.h, v66.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v14.h, 0xff, v178.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v15.l, 0xff, v177.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v9, v101, v9
+; GFX11-TRUE16-NEXT:    v_or_b16 v101.l, v10.l, v69.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v10.l, v101.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v14.h, v14.h, v65.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v15.h, 0xff, v176.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v16.l, 0xff, v167.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v16.h, 0xff, v166.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v10, v101, v10
+; GFX11-TRUE16-NEXT:    v_or_b16 v101.l, v11.l, v68.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v11.l, v101.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v15.h, v15.h, v64.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v16.h, v16.h, v55.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v17.l, 0xff, v165.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v17.h, 0xff, v164.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v11, v101, v11
+; GFX11-TRUE16-NEXT:    v_or_b16 v101.l, v12.l, v67.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v12.l, v101.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v18.l, 0xff, v163.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v17.h, v17.h, v54.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v18.h, 0xff, v162.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v19.l, 0xff, v161.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v12, v101, v12
+; GFX11-TRUE16-NEXT:    v_or_b16 v101.l, v13.l, v66.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v13.l, v101.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v18.h, v18.h, v53.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v19.h, 0xff, v160.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v20.l, 0xff, v151.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v20.h, 0xff, v150.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v13, v101, v13
+; GFX11-TRUE16-NEXT:    v_or_b16 v101.l, v14.l, v65.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v14.l, v101.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v19.h, v19.h, v52.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v20.h, v20.h, v51.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v21.l, 0xff, v149.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v21.h, 0xff, v148.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v14, v101, v14
+; GFX11-TRUE16-NEXT:    v_or_b16 v101.l, v15.l, v64.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v15.l, v101.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v22.l, 0xff, v147.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v21.h, v21.h, v50.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v22.h, 0xff, v146.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v23.l, 0xff, v145.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v15, v101, v15
+; GFX11-TRUE16-NEXT:    v_or_b16 v101.l, v16.l, v55.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v16.l, v101.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v22.h, v22.h, v49.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v23.h, 0xff, v144.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v24.l, 0xff, v135.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v24.h, 0xff, v134.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v16, v101, v16
+; GFX11-TRUE16-NEXT:    v_or_b16 v101.l, v17.l, v54.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v17.l, v101.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v23.h, v23.h, v48.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v24.h, v24.h, v39.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v25.l, 0xff, v133.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v25.h, 0xff, v132.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v17, v101, v17
+; GFX11-TRUE16-NEXT:    v_or_b16 v101.l, v18.l, v53.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v18.l, v101.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v26.l, 0xff, v131.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v25.h, v25.h, v38.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v26.h, 0xff, v130.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v27.l, 0xff, v129.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v18, v101, v18
+; GFX11-TRUE16-NEXT:    v_or_b16 v101.l, v19.l, v52.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v19.l, v101.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v26.h, v26.h, v37.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v27.h, 0xff, v128.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v28.l, 0xff, v119.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v28.h, 0xff, v118.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v19, v101, v19
+; GFX11-TRUE16-NEXT:    v_or_b16 v101.l, v20.l, v51.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v20.l, v101.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v27.h, v27.h, v36.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v28.h, v28.h, v35.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v29.l, 0xff, v117.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v29.h, 0xff, v116.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v20, v101, v20
+; GFX11-TRUE16-NEXT:    v_or_b16 v101.l, v21.l, v50.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v21.l, v101.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v30.l, 0xff, v115.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v29.h, v29.h, v34.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v30.h, 0xff, v114.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v31.l, 0xff, v113.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v21, v101, v21
+; GFX11-TRUE16-NEXT:    v_or_b16 v101.l, v22.l, v49.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v22.l, v101.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v30.h, v30.h, v33.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v31.h, 0xff, v112.l
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr98_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr96_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr98_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr97_hi16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr96_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr86_hi16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr86_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr84_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr84_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr83_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr82_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr81_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr80_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr80_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr69_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr82_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr71_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr71_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr70_hi16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr69_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr68_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr67_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr65_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr65_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr55_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr50_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr49_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr49_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr48_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr48_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr39_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr39_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr38_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr38_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr37_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr37_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr36_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr36_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr35_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr35_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr34_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr34_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr33_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr33_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr32_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr32_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr151_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr151_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr150_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr150_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr148_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr148_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr147_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr147_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr144_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr145_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr135_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr135_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr133_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr133_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr131_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr132_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr129_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr129_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr128_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr128_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr117_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr118_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr116_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr116_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr114_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr114_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr112_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr113_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr112
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr113
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr114
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr115
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr116
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr117
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr118
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr119
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr128
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr129
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr130
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr131
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr132
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr133
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr134
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr135
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr144
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr145
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr146
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr147
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr148
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr149
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr150
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr151
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr160
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr161
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr162
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr163
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr164
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr165
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr166
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr167
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr176
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr177
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr178
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr179
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr180
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr181
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr182
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr183
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr40
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr41
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr42
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr43
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr44
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr45
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr46
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr47
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr103_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr103_hi16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr102_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr102_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr100_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr100_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr101_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr98_hi16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr99_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr99_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr96_hi16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr97_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr97_hi16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr87_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr87_hi16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr85_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr85_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr83_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr83_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr84_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr81_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr81_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr82_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr71_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr71_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr70_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr80_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr80_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr69_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr70_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr68_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr68_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr67_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr67_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr66_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr66_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr65_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr65_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr64_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr64_hi16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr55_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr55_hi16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr54_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr54_hi16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr53_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr53_hi16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr52_lo16
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v22, v149, v22
-; GFX11-TRUE16-NEXT:    v_or_b16 v149.l, v23.l, v70.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v23.l, v149.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v31.h, v31.h, v51.l
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr70_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr52_hi16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr51_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr51_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr50_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr50_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr49_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr49_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr48_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr39_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr38_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr37_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr36_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr35_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr34_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr33_hi16
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v22, v101, v22
+; GFX11-TRUE16-NEXT:    v_or_b16 v101.l, v23.l, v48.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v23.l, v101.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v31.h, v31.h, v32.h
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr48_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr32_hi16
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v23, v149, v23
-; GFX11-TRUE16-NEXT:    v_or_b16 v149.l, v24.l, v67.h
-; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v24.l, v149.h
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr67_hi16
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v24, v149, v24
-; GFX11-TRUE16-NEXT:    v_or_b16 v149.l, v25.l, v66.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v25.l, v149.h
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr66_lo16
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v23, v101, v23
+; GFX11-TRUE16-NEXT:    v_or_b16 v101.l, v24.l, v39.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v24.l, v101.h
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr39_lo16
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v24, v101, v24
+; GFX11-TRUE16-NEXT:    v_or_b16 v101.l, v25.l, v38.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v25.l, v101.h
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr38_lo16
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v25, v149, v25
-; GFX11-TRUE16-NEXT:    v_or_b16 v149.l, v26.l, v64.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v26.l, v149.h
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr64_lo16
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v26, v149, v26
-; GFX11-TRUE16-NEXT:    v_or_b16 v149.l, v27.l, v54.h
-; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v27.l, v149.h
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr54_hi16
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v25, v101, v25
+; GFX11-TRUE16-NEXT:    v_or_b16 v101.l, v26.l, v37.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v26.l, v101.h
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr37_lo16
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v26, v101, v26
+; GFX11-TRUE16-NEXT:    v_or_b16 v101.l, v27.l, v36.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v27.l, v101.h
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr36_lo16
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v27, v149, v27
-; GFX11-TRUE16-NEXT:    v_or_b16 v149.l, v28.l, v53.h
-; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v28.l, v149.h
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr53_hi16
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v28, v149, v28
-; GFX11-TRUE16-NEXT:    v_or_b16 v149.l, v29.l, v52.h
-; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v29.l, v149.h
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr52_hi16
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v27, v101, v27
+; GFX11-TRUE16-NEXT:    v_or_b16 v101.l, v28.l, v35.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v28.l, v101.h
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr35_lo16
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v28, v101, v28
+; GFX11-TRUE16-NEXT:    v_or_b16 v101.l, v29.l, v34.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v29.l, v101.h
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr34_lo16
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v29, v149, v29
-; GFX11-TRUE16-NEXT:    v_or_b16 v149.l, v30.l, v51.h
-; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v30.l, v149.h
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr51_hi16
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v30, v149, v30
-; GFX11-TRUE16-NEXT:    v_or_b16 v149.l, v31.l, v50.h
-; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v31.l, v149.h
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr50_hi16
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v29, v101, v29
+; GFX11-TRUE16-NEXT:    v_or_b16 v101.l, v30.l, v33.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v30.l, v101.h
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr33_lo16
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v30, v101, v30
+; GFX11-TRUE16-NEXT:    v_or_b16 v101.l, v31.l, v32.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v31.l, v101.h
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr32_lo16
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v31, v149, v31
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr149_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr149_lo16
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v31, v101, v31
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr101_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr101_lo16
+; GFX11-TRUE16-NEXT:  .LBB38_2: ; %Flow
 ; GFX11-TRUE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX11-TRUE16-NEXT:    s_cbranch_execz .LBB38_2
-; GFX11-TRUE16-NEXT:  .LBB38_4: ; %cmp.true
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v149.h, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, v149.l, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, v146.h, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.h, v146.l, 3
+; GFX11-TRUE16-NEXT:    s_cbranch_execz .LBB38_4
+; GFX11-TRUE16-NEXT:  ; %bb.3: ; %cmp.true
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v101.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, v101.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, v98.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.h, v98.l, 3
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v31.h, 0
 ; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v0.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v1.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v1.h
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v31.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v151.l, v0.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v151.h, v0.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v150.h, v1.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, v150.l, v1.h
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, v145.h, 3
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v103.l, v0.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v103.h, v0.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v102.h, v1.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, v102.l, v1.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, v97.h, 3
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v31.l, 0x300, v0.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v3.h, 0x300, v0.h
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.h, v144.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.h, v96.l, 3
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v4.h, 0x300, v1.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, v134.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, v86.h, 3
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v31, v3
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v31.l, 0x300, v1.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v2.h
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.h, v134.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.h, v86.l, 3
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v4.l, v31.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v3.l, v148.l, v1.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v3.l, v100.l, v1.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v3.h, v148.h, v1.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v3.h, v100.h, v1.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v2.h, 0xff, v2.h
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v31, v4
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v31.l, 0x300, v3.l
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v31.h
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v5.h, 0x300, v3.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v3.l, v147.l, v2.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v3.h, v147.h, v2.h
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v4.l, v132.h, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v4.h, v131.l, 3
+; GFX11-TRUE16-NEXT:    v_or_b16 v3.l, v99.l, v2.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v3.h, v99.h, v2.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v4.l, v84.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v4.h, v83.l, 3
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v2, v31, v5
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v31.l, 0x300, v3.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v6.h, 0x300, v3.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v3.l, 0xff, v4.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v3.h, 0xff, v4.h
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v4.l, v130.h, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v4.h, v130.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v4.l, v82.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v4.h, v82.l, 3
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v6.l, v31.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v5.l, v144.h, v3.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v5.h, v145.l, v3.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v5.l, v96.h, v3.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v5.h, v97.l, v3.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v4.l, 0xff, v4.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v4.h, 0xff, v4.h
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v3, v31, v6
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v31.l, 0x300, v5.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v7.h, 0x300, v5.h
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v7.l, v31.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v5.l, v135.l, v4.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v5.h, v135.h, v4.h
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v6.l, v119.h, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v6.h, v119.l, 3
+; GFX11-TRUE16-NEXT:    v_or_b16 v5.l, v87.l, v4.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v5.h, v87.h, v4.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v6.l, v71.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v6.h, v71.l, 3
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v4, v31, v7
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v31.l, 0x300, v5.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v8.h, 0x300, v5.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v5.l, 0xff, v6.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v5.h, 0xff, v6.h
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v6.l, v118.h, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v6.h, v117.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v6.l, v70.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v6.h, v69.l, 3
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v8.l, v31.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v7.l, v133.l, v5.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v7.h, v133.h, v5.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v7.l, v85.l, v5.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v7.h, v85.h, v5.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v6.l, 0xff, v6.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v6.h, 0xff, v6.h
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v5, v31, v8
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v31.l, 0x300, v7.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v9.h, 0x300, v7.h
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v9.l, v31.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v7.l, v131.h, v6.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v7.h, v132.l, v6.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v7.l, v83.h, v6.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v7.h, v84.l, v6.h
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v8.l, v115.h, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v8.h, v115.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v8.l, v47.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v8.h, v46.l, 3
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v6, v31, v9
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v31.l, 0x300, v7.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v10.h, 0x300, v7.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v7.l, 0xff, v8.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v7.h, 0xff, v8.h
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v8.l, v113.h, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v8.h, v112.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v8.l, v45.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v8.h, v44.l, 3
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v10.l, v31.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v9.l, v129.l, v7.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v9.h, v129.h, v7.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v9.l, v81.l, v7.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v9.h, v81.h, v7.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v8.l, 0xff, v8.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v8.h, 0xff, v8.h
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v7, v31, v10
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v31.l, 0x300, v9.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v11.h, 0x300, v9.h
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v11.l, v31.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v9.l, v128.l, v8.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v9.h, v128.h, v8.h
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v10.l, v103.h, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v10.h, v103.l, 3
+; GFX11-TRUE16-NEXT:    v_or_b16 v9.l, v80.l, v8.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v9.h, v80.h, v8.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v10.l, v43.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v10.h, v42.l, 3
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v8, v31, v11
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v31.l, 0x300, v9.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v12.h, 0x300, v9.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v9.l, 0xff, v10.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v9.h, 0xff, v10.h
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v10.l, v101.h, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v10.h, v100.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v10.l, v41.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v10.h, v40.l, 3
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v12.l, v31.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v11.l, v117.h, v9.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v11.h, v118.l, v9.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v11.l, v69.h, v9.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v11.h, v70.l, v9.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v10.l, 0xff, v10.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v10.h, 0xff, v10.h
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v9, v31, v12
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v31.l, 0x300, v11.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v13.h, 0x300, v11.h
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v13.l, v31.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v11.l, v116.l, v10.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v11.h, v116.h, v10.h
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v12.l, v99.h, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v12.h, v98.l, 3
+; GFX11-TRUE16-NEXT:    v_or_b16 v11.l, v68.l, v10.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v11.h, v68.h, v10.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v12.l, v183.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v12.h, v182.l, 3
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v10, v31, v13
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v31.l, 0x300, v11.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v14.h, 0x300, v11.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v11.l, 0xff, v12.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v11.h, 0xff, v12.h
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v12.l, v96.h, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v12.h, v96.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v12.l, v181.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v12.h, v180.l, 3
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v14.l, v31.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v13.l, v114.l, v11.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v13.h, v114.h, v11.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v13.l, v67.l, v11.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v13.h, v67.h, v11.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v12.l, 0xff, v12.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v12.h, 0xff, v12.h
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v11, v31, v14
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v31.l, 0x300, v13.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v15.h, 0x300, v13.h
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v15.l, v31.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v13.l, v112.h, v12.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v13.h, v113.l, v12.h
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v14.l, v86.h, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v14.h, v86.l, 3
+; GFX11-TRUE16-NEXT:    v_or_b16 v13.l, v66.l, v12.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v13.h, v66.h, v12.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v14.l, v179.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v14.h, v178.l, 3
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v12, v31, v15
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v31.l, 0x300, v13.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v16.h, 0x300, v13.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v13.l, 0xff, v14.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v13.h, 0xff, v14.h
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v14.l, v84.h, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v14.h, v84.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v14.l, v177.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v14.h, v176.l, 3
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v16.l, v31.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v15.l, v102.l, v13.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v15.h, v102.h, v13.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v15.l, v65.l, v13.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v15.h, v65.h, v13.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v14.l, 0xff, v14.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v14.h, 0xff, v14.h
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v13, v31, v16
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v31.l, 0x300, v15.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v17.h, 0x300, v15.h
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v17.l, v31.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v15.l, v100.h, v14.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v15.h, v101.l, v14.h
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v16.l, v82.h, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v16.h, v81.l, 3
+; GFX11-TRUE16-NEXT:    v_or_b16 v15.l, v64.l, v14.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v15.h, v64.h, v14.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v16.l, v167.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v16.h, v166.l, 3
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v14, v31, v17
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v31.l, 0x300, v15.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v18.h, 0x300, v15.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v15.l, 0xff, v16.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v15.h, 0xff, v16.h
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v16.l, v80.h, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v16.h, v80.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v16.l, v165.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v16.h, v164.l, 3
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v18.l, v31.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v17.l, v98.h, v15.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v17.h, v99.l, v15.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v17.l, v55.l, v15.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v17.h, v55.h, v15.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v16.l, 0xff, v16.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v16.h, 0xff, v16.h
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v15, v31, v18
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v31.l, 0x300, v17.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v19.h, 0x300, v17.h
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v19.l, v31.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v17.l, v97.l, v16.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v17.h, v97.h, v16.h
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v18.l, v69.h, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v18.h, v69.l, 3
+; GFX11-TRUE16-NEXT:    v_or_b16 v17.l, v54.l, v16.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v17.h, v54.h, v16.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v18.l, v163.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v18.h, v162.l, 3
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v16, v31, v19
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v31.l, 0x300, v17.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v20.h, 0x300, v17.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v17.l, 0xff, v18.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v17.h, 0xff, v18.h
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v18.l, v68.h, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v18.h, v67.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v18.l, v161.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v18.h, v160.l, 3
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v20.l, v31.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v19.l, v87.l, v17.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v19.h, v87.h, v17.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v19.l, v53.l, v17.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v19.h, v53.h, v17.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v18.l, 0xff, v18.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v18.h, 0xff, v18.h
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v17, v31, v20
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v31.l, 0x300, v19.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v21.h, 0x300, v19.h
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v21.l, v31.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v19.l, v85.l, v18.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v19.h, v85.h, v18.h
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v20.l, v65.h, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v20.h, v65.l, 3
+; GFX11-TRUE16-NEXT:    v_or_b16 v19.l, v52.l, v18.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v19.h, v52.h, v18.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v20.l, v151.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v20.h, v150.l, 3
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v18, v31, v21
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v31.l, 0x300, v19.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v22.h, 0x300, v19.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v19.l, 0xff, v20.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v19.h, 0xff, v20.h
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v20.l, v55.h, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v20.h, v50.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v20.l, v149.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v20.h, v148.l, 3
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v22.l, v31.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v21.l, v83.l, v19.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v21.h, v83.h, v19.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v21.l, v51.l, v19.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v21.h, v51.h, v19.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v20.l, 0xff, v20.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v20.h, 0xff, v20.h
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v19, v31, v22
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v31.l, 0x300, v21.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v23.h, 0x300, v21.h
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v23.l, v31.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v21.l, v81.h, v20.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v21.h, v82.l, v20.h
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v22.l, v49.h, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v22.h, v49.l, 3
+; GFX11-TRUE16-NEXT:    v_or_b16 v21.l, v50.l, v20.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v21.h, v50.h, v20.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v22.l, v147.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v22.h, v146.l, 3
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v20, v31, v23
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v31.l, 0x300, v21.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v24.h, 0x300, v21.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v21.l, 0xff, v22.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v21.h, 0xff, v22.h
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v22.l, v48.h, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v22.h, v48.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v22.l, v145.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v22.h, v144.l, 3
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v24.l, v31.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v23.l, v71.l, v21.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v23.h, v71.h, v21.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v23.l, v49.l, v21.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v23.h, v49.h, v21.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v22.l, 0xff, v22.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v22.h, 0xff, v22.h
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v21, v31, v24
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v31.l, 0x300, v23.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v25.h, 0x300, v23.h
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v25.l, v31.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v23.l, v70.l, v22.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v23.h, v70.h, v22.h
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v24.l, v39.h, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v24.h, v39.l, 3
+; GFX11-TRUE16-NEXT:    v_or_b16 v23.l, v48.l, v22.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v23.h, v48.h, v22.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v24.l, v135.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v24.h, v134.l, 3
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v22, v31, v25
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v31.l, 0x300, v23.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v26.h, 0x300, v23.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v23.l, 0xff, v24.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v23.h, 0xff, v24.h
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v24.l, v38.h, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v24.h, v38.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v24.l, v133.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v24.h, v132.l, 3
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v26.l, v31.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v25.l, v67.h, v23.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v25.h, v68.l, v23.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v25.l, v39.l, v23.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v25.h, v39.h, v23.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v24.l, 0xff, v24.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v24.h, 0xff, v24.h
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v23, v31, v26
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v31.l, 0x300, v25.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v27.h, 0x300, v25.h
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v27.l, v31.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v25.l, v66.l, v24.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v25.h, v66.h, v24.h
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v26.l, v37.h, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v26.h, v37.l, 3
+; GFX11-TRUE16-NEXT:    v_or_b16 v25.l, v38.l, v24.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v25.h, v38.h, v24.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v26.l, v131.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v26.h, v130.l, 3
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v24, v31, v27
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v31.l, 0x300, v25.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v28.h, 0x300, v25.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v25.l, 0xff, v26.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v25.h, 0xff, v26.h
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v26.l, v36.h, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v26.h, v36.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v26.l, v129.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v26.h, v128.l, 3
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v28.l, v31.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v27.l, v64.l, v25.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v27.h, v64.h, v25.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v27.l, v37.l, v25.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v27.h, v37.h, v25.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v26.l, 0xff, v26.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v26.h, 0xff, v26.h
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v25, v31, v28
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v31.l, 0x300, v27.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v29.h, 0x300, v27.h
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v29.l, v31.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v27.l, v54.h, v26.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v27.h, v55.l, v26.h
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v28.l, v35.h, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v28.h, v35.l, 3
+; GFX11-TRUE16-NEXT:    v_or_b16 v27.l, v36.l, v26.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v27.h, v36.h, v26.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v28.l, v119.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v28.h, v118.l, 3
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v26, v31, v29
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v31.l, 0x300, v27.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v30.h, 0x300, v27.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v27.l, 0xff, v28.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v27.h, 0xff, v28.h
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v28.l, v34.h, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v28.h, v34.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v28.l, v117.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v28.h, v116.l, 3
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v30.l, v31.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v29.l, v53.h, v27.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v29.h, v54.l, v27.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v29.l, v35.l, v27.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v29.h, v35.h, v27.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v28.l, 0xff, v28.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v28.h, 0xff, v28.h
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v27, v31, v30
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v31.l, 0x300, v29.l
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v34.h, 0x300, v29.h
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v34.l, v31.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v29.l, v52.h, v28.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v29.h, v53.l, v28.h
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v30.l, v33.h, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v30.h, v33.l, 3
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v28, v31, v34
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v35.h, 0x300, v29.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v35.l, v31.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v29.l, v34.l, v28.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v29.h, v34.h, v28.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v30.l, v115.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v30.h, v114.l, 3
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v28, v31, v35
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v31.l, 0x300, v29.l
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v33.h, 0x300, v29.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v34.h, 0x300, v29.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v29.l, 0xff, v30.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v29.h, 0xff, v30.h
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v30.l, v32.h, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v30.h, v32.l, 3
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v33.l, v31.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v32.l, v51.h, v29.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v32.h, v52.l, v29.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v30.l, v113.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v30.h, v112.l, 3
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v34.l, v31.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v33.l, v33.l, v29.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v33.h, v33.h, v29.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v30.l, 0xff, v30.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v30.h, 0xff, v30.h
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v29, v31, v33
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v31.l, 0x300, v32.l
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v33.h, 0x300, v32.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v32.l, v50.h, v30.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v32.h, v51.l, v30.h
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v29, v31, v34
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v31.l, 0x300, v33.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v33.h, 0x300, v33.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v33.l, v31.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v32.l, v32.l, v30.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v32.h, v32.h, v30.h
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v30, v31, v33
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v31.l, 0x300, v32.l
@@ -52915,7 +52089,48 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) {
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v32.h, 0x300, v32.h
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v32.l, v31.h
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v31, v31, v32
+; GFX11-TRUE16-NEXT:  .LBB38_4: ; %end
 ; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_clause 0x1f
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v108, off, s32 offset:392
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v107, off, s32 offset:396
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v106, off, s32 offset:400
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v105, off, s32 offset:404
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v104, off, s32 offset:408
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v95, off, s32 offset:412
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v94, off, s32 offset:416
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v93, off, s32 offset:420
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v92, off, s32 offset:424
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v91, off, s32 offset:428
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v90, off, s32 offset:432
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v89, off, s32 offset:436
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v88, off, s32 offset:440
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v79, off, s32 offset:444
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v78, off, s32 offset:448
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v77, off, s32 offset:452
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v76, off, s32 offset:456
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v75, off, s32 offset:460
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v74, off, s32 offset:464
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v73, off, s32 offset:468
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v72, off, s32 offset:472
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v63, off, s32 offset:476
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v62, off, s32 offset:480
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v61, off, s32 offset:484
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v60, off, s32 offset:488
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v59, off, s32 offset:492
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v58, off, s32 offset:496
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v57, off, s32 offset:500
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v56, off, s32 offset:504
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v47, off, s32 offset:508
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v46, off, s32 offset:512
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v45, off, s32 offset:516
+; GFX11-TRUE16-NEXT:    s_clause 0x4
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v44, off, s32 offset:520
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v43, off, s32 offset:524
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v42, off, s32 offset:528
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v41, off, s32 offset:532
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v40, off, s32 offset:536
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-FAKE16-LABEL: bitcast_v128i8_to_v32f32:
@@ -57181,1887 +56396,946 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a,
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-TRUE16-LABEL: bitcast_v128i8_to_v32f32_scalar:
-; GFX11-TRUE16:       ; %bb.0:
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT:    s_clause 0x1f
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v40, s32 offset:476
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v41, s32 offset:472
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v42, s32 offset:468
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v43, s32 offset:464
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v44, s32 offset:460
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v45, s32 offset:456
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v46, s32 offset:452
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v47, s32 offset:448
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v56, s32 offset:444
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v57, s32 offset:440
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v58, s32 offset:436
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v59, s32 offset:432
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v60, s32 offset:428
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v61, s32 offset:424
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v62, s32 offset:420
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v63, s32 offset:416
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v72, s32 offset:412
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v73, s32 offset:408
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v74, s32 offset:404
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v75, s32 offset:400
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v76, s32 offset:396
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v77, s32 offset:392
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v78, s32 offset:388
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v79, s32 offset:384
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v88, s32 offset:380
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v89, s32 offset:376
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v90, s32 offset:372
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v91, s32 offset:368
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v92, s32 offset:364
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v93, s32 offset:360
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v94, s32 offset:356
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v95, s32 offset:352
-; GFX11-TRUE16-NEXT:    s_clause 0x7
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v104, s32 offset:348
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v105, s32 offset:344
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v106, s32 offset:340
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v107, s32 offset:336
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v108, s32 offset:332
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v109, s32 offset:328
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v110, s32 offset:324
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v111, s32 offset:320
-; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v32, v30 :: v_dual_mov_b32 v33, v28
-; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v34, v26 :: v_dual_mov_b32 v35, v24
-; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v36, v22 :: v_dual_mov_b32 v37, v20
-; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v38, v18 :: v_dual_mov_b32 v39, v16
-; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v48, v14 :: v_dual_mov_b32 v49, v12
-; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v50, v10 :: v_dual_mov_b32 v51, v8
-; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v52, v6 :: v_dual_mov_b32 v53, v4
-; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v54, v2 :: v_dual_mov_b32 v55, v0
-; GFX11-TRUE16-NEXT:    s_clause 0x1f
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v0, off, s32 offset:316
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v2, off, s32
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v4, off, s32 offset:8
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v6, off, s32 offset:16
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v8, off, s32 offset:24
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v10, off, s32 offset:32
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v12, off, s32 offset:40
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v14, off, s32 offset:48
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v16, off, s32 offset:56
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v18, off, s32 offset:64
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v20, off, s32 offset:72
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v22, off, s32 offset:80
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v24, off, s32 offset:88
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v26, off, s32 offset:96
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v28, off, s32 offset:104
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v30, off, s32 offset:112
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v31, off, s32 offset:120
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v87, off, s32 offset:128
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v96, off, s32 offset:136
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v97, off, s32 offset:144
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v98, off, s32 offset:152
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v99, off, s32 offset:160
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v113, off, s32 offset:168
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v114, off, s32 offset:176
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v115, off, s32 offset:184
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v116, off, s32 offset:192
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v117, off, s32 offset:200
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v128, off, s32 offset:208
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v129, off, s32 offset:216
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v130, off, s32 offset:224
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v131, off, s32 offset:232
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v94, off, s32 offset:240
-; GFX11-TRUE16-NEXT:    s_clause 0x1f
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v95, off, s32 offset:248
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v104, off, s32 offset:256
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v105, off, s32 offset:264
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v106, off, s32 offset:272
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v107, off, s32 offset:280
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v108, off, s32 offset:288
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v109, off, s32 offset:296
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v110, off, s32 offset:304
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v111, off, s32 offset:312
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v64, off, s32 offset:308
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v65, off, s32 offset:300
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v66, off, s32 offset:292
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v67, off, s32 offset:284
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v68, off, s32 offset:276
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v69, off, s32 offset:268
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v70, off, s32 offset:260
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v71, off, s32 offset:252
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v80, off, s32 offset:244
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v81, off, s32 offset:236
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v82, off, s32 offset:228
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v83, off, s32 offset:220
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v84, off, s32 offset:212
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v85, off, s32 offset:204
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v86, off, s32 offset:196
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v100, off, s32 offset:188
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v101, off, s32 offset:180
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v102, off, s32 offset:172
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v103, off, s32 offset:164
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v112, off, s32 offset:156
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v118, off, s32 offset:148
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v145, off, s32 offset:140
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v146, off, s32 offset:132
-; GFX11-TRUE16-NEXT:    s_clause 0xf
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v147, off, s32 offset:124
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v148, off, s32 offset:116
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v162, off, s32 offset:108
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v163, off, s32 offset:100
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v164, off, s32 offset:92
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v165, off, s32 offset:84
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v166, off, s32 offset:76
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v180, off, s32 offset:68
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v181, off, s32 offset:60
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v182, off, s32 offset:52
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v183, off, s32 offset:44
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v45, off, s32 offset:36
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v46, off, s32 offset:28
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v47, off, s32 offset:20
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v56, off, s32 offset:12
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v62, off, s32 offset:4
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v89, 8, v1
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v90, 8, v3
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v91, 8, v5
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v92, 8, v7
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v93, 8, v9
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v76, 8, v11
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v77, 8, v13
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v78, 8, v15
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v79, 8, v17
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v88, 8, v19
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v63, 8, v21
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v72, 8, v23
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v73, 8, v25
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v74, 8, v27
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v75, 8, v29
-; GFX11-TRUE16-NEXT:    s_mov_b32 s4, 0
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(62)
-; GFX11-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v57, 8, v2
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v58, 8, v4
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v59, 8, v6
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v60, 8, v8
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v61, 8, v10
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v40, 8, v12
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v41, 8, v14
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v42, 8, v16
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v43, 8, v18
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v44, 8, v20
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v167, 8, v22
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v176, 8, v24
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v177, 8, v26
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v178, 8, v28
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v179, 8, v30
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v149, 8, v31
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v150, 8, v87
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(61)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v151, 8, v96
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(60)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v160, 8, v97
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(59)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v161, 8, v98
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(58)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v132, 8, v99
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(57)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v133, 8, v113
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(56)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v134, 8, v114
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(55)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v135, 8, v115
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(54)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v144, 8, v116
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(53)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v119, 8, v117
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(52)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v128, 8, v128
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(51)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v129, 8, v129
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(50)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v130, 8, v130
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(49)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v131, 8, v131
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(48)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v113, 8, v94
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(47)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v114, 8, v95
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(46)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v115, 8, v104
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(45)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v116, 8, v105
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(44)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v117, 8, v106
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(43)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v87, 8, v107
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(42)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v96, 8, v108
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(41)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v97, 8, v109
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(40)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v98, 8, v110
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(39)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v99, 8, v111
-; GFX11-TRUE16-NEXT:    s_and_b32 s5, vcc_lo, exec_lo
-; GFX11-TRUE16-NEXT:    s_cbranch_scc0 .LBB39_4
-; GFX11-TRUE16-NEXT:  ; %bb.1: ; %cmp.false
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v54
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff, v53
-; GFX11-TRUE16-NEXT:    s_and_b32 s5, s28, 0xff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s6, s29, 8
-; GFX11-TRUE16-NEXT:    s_and_b32 s7, s2, 0xff
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v0, v90
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v1, v91
-; GFX11-TRUE16-NEXT:    s_or_b32 s5, s5, s6
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s6, s1, 8
-; GFX11-TRUE16-NEXT:    s_and_b32 s5, s5, 0xffff
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s8, s3, 8
-; GFX11-TRUE16-NEXT:    s_and_b32 s9, s18, 0xff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s10, s19, 8
-; GFX11-TRUE16-NEXT:    s_and_b32 s11, s26, 0xff
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v5, v0, v1
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v50
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff, v49
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s12, s27, 8
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v0, v76
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v1, v77
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v7, v0, v1
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v48
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff, v39
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v0, v78
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v1, v79
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v8, v0, v1
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v38
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff, v37
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v0, v88
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v1, v63
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v9, v0, v1
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v36
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff, v35
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v0, v72
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v1, v73
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v10, v0, v1
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v34
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff, v33
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v0, v74
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v1, v75
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v11, v0, v1
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v32
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff, v62
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v0, v57
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v1, v58
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v12, v0, v1
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v56
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff, v47
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v0, v59
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v1, v60
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v13, v0, v1
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v46
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff, v45
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v0, v61
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v1, v40
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v14, v0, v1
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v183
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff, v182
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v0, v41
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v1, v42
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v15, v0, v1
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v181
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff, v180
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v0, v43
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v1, v44
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v16, v0, v1
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v166
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff, v165
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v0, v167
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v1, v176
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v17, v0, v1
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v164
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff, v163
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v0, v177
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v1, v178
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v18, v0, v1
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v162
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff, v148
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v0, v179
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v1, v149
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v19, v0, v1
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v147
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff, v146
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v0, v150
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v1, v151
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v20, v0, v1
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v145
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff, v118
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v0, v160
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v1, v161
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v21, v0, v1
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v112
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff, v103
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v0, v132
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v1, v133
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v22, v0, v1
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v102
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff, v101
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v0, v134
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v1, v135
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v23, v0, v1
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v100
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff, v86
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v0, v144
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v1, v119
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v24, v0, v1
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v85
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff, v84
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v0, v128
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v1, v129
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v25, v0, v1
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v83
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff, v82
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v0, v130
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v1, v131
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v26, v0, v1
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v81
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff, v80
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v0, v113
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v1, v114
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v27, v0, v1
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v71
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff, v70
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v0, v115
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v1, v116
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v28, v0, v1
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v69
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff, v68
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v0, v117
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v1, v87
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v29, v0, v1
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v67
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff, v66
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v0, v96
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v1, v97
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v30, v0, v1
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v65
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff, v64
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v0, v98
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v1, v99
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v31, v0, v1
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v55
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v0, v89
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v4, s5, v0
-; GFX11-TRUE16-NEXT:    s_and_b32 s5, s0, 0xff
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-TRUE16-NEXT:    s_or_b32 s5, s5, s6
-; GFX11-TRUE16-NEXT:    s_or_b32 s6, s7, s8
-; GFX11-TRUE16-NEXT:    s_and_b32 s7, s16, 0xff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s8, s17, 8
-; GFX11-TRUE16-NEXT:    s_and_b32 s5, s5, 0xffff
-; GFX11-TRUE16-NEXT:    s_or_b32 s7, s7, s8
-; GFX11-TRUE16-NEXT:    s_or_b32 s8, s9, s10
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s6, s6, 16
-; GFX11-TRUE16-NEXT:    s_and_b32 s7, s7, 0xffff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s8, s8, 16
-; GFX11-TRUE16-NEXT:    s_or_b32 s5, s5, s6
-; GFX11-TRUE16-NEXT:    s_or_b32 s6, s7, s8
-; GFX11-TRUE16-NEXT:    s_and_b32 s7, s20, 0xff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s8, s21, 8
-; GFX11-TRUE16-NEXT:    s_and_b32 s9, s22, 0xff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s10, s23, 8
-; GFX11-TRUE16-NEXT:    s_or_b32 s7, s7, s8
-; GFX11-TRUE16-NEXT:    s_or_b32 s8, s9, s10
-; GFX11-TRUE16-NEXT:    s_and_b32 s9, s24, 0xff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s10, s25, 8
-; GFX11-TRUE16-NEXT:    s_and_b32 s7, s7, 0xffff
-; GFX11-TRUE16-NEXT:    s_or_b32 s9, s9, s10
-; GFX11-TRUE16-NEXT:    s_or_b32 s10, s11, s12
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s8, s8, 16
-; GFX11-TRUE16-NEXT:    s_and_b32 s9, s9, 0xffff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s10, s10, 16
-; GFX11-TRUE16-NEXT:    s_or_b32 s7, s7, s8
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v3, 0xff, v51
-; GFX11-TRUE16-NEXT:    s_or_b32 s8, s9, s10
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v2, 0xff, v52
-; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, s6
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v3, v3, v93
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v2, v2, v92
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v6, v2, v3
-; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v2, s7 :: v_dual_mov_b32 v3, s8
-; GFX11-TRUE16-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s4
-; GFX11-TRUE16-NEXT:    s_cbranch_vccnz .LBB39_3
-; GFX11-TRUE16-NEXT:  .LBB39_2: ; %cmp.true
-; GFX11-TRUE16-NEXT:    s_add_i32 s0, s0, 3
-; GFX11-TRUE16-NEXT:    s_add_i32 s2, s2, 3
-; GFX11-TRUE16-NEXT:    s_and_b32 s0, s0, 0xff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s1, s1, 8
-; GFX11-TRUE16-NEXT:    s_and_b32 s2, s2, 0xff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s3, s3, 8
-; GFX11-TRUE16-NEXT:    s_or_b32 s0, s1, s0
-; GFX11-TRUE16-NEXT:    s_or_b32 s1, s3, s2
-; GFX11-TRUE16-NEXT:    s_addk_i32 s0, 0x300
-; GFX11-TRUE16-NEXT:    s_addk_i32 s1, 0x300
-; GFX11-TRUE16-NEXT:    s_and_b32 s0, s0, 0xffff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s1, s1, 16
-; GFX11-TRUE16-NEXT:    s_add_i32 s16, s16, 3
-; GFX11-TRUE16-NEXT:    s_or_b32 s0, s0, s1
-; GFX11-TRUE16-NEXT:    s_and_b32 s1, s16, 0xff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s2, s17, 8
-; GFX11-TRUE16-NEXT:    s_add_i32 s18, s18, 3
-; GFX11-TRUE16-NEXT:    s_or_b32 s1, s2, s1
-; GFX11-TRUE16-NEXT:    s_and_b32 s2, s18, 0xff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s3, s19, 8
-; GFX11-TRUE16-NEXT:    s_addk_i32 s1, 0x300
-; GFX11-TRUE16-NEXT:    s_or_b32 s2, s3, s2
-; GFX11-TRUE16-NEXT:    s_add_i32 s20, s20, 3
-; GFX11-TRUE16-NEXT:    s_addk_i32 s2, 0x300
-; GFX11-TRUE16-NEXT:    s_and_b32 s1, s1, 0xffff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s2, s2, 16
-; GFX11-TRUE16-NEXT:    s_and_b32 s3, s20, 0xff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s4, s21, 8
-; GFX11-TRUE16-NEXT:    s_add_i32 s22, s22, 3
-; GFX11-TRUE16-NEXT:    s_or_b32 s1, s1, s2
-; GFX11-TRUE16-NEXT:    s_or_b32 s2, s4, s3
-; GFX11-TRUE16-NEXT:    s_and_b32 s3, s22, 0xff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s4, s23, 8
-; GFX11-TRUE16-NEXT:    s_add_i32 s24, s24, 3
-; GFX11-TRUE16-NEXT:    s_or_b32 s3, s4, s3
-; GFX11-TRUE16-NEXT:    s_and_b32 s4, s24, 0xff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s5, s25, 8
-; GFX11-TRUE16-NEXT:    s_addk_i32 s2, 0x300
-; GFX11-TRUE16-NEXT:    s_addk_i32 s3, 0x300
-; GFX11-TRUE16-NEXT:    s_or_b32 s4, s5, s4
-; GFX11-TRUE16-NEXT:    s_and_b32 s2, s2, 0xffff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s3, s3, 16
-; GFX11-TRUE16-NEXT:    s_addk_i32 s4, 0x300
-; GFX11-TRUE16-NEXT:    s_add_i32 s26, s26, 3
-; GFX11-TRUE16-NEXT:    s_or_b32 s2, s2, s3
-; GFX11-TRUE16-NEXT:    s_and_b32 s3, s4, 0xffff
-; GFX11-TRUE16-NEXT:    s_and_b32 s4, s26, 0xff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s5, s27, 8
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v0, 3, v55
-; GFX11-TRUE16-NEXT:    s_or_b32 s4, s5, s4
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v1, 3, v54
-; GFX11-TRUE16-NEXT:    s_addk_i32 s4, 0x300
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v3, 3, v52
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s4, s4, 16
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v4, 3, v51
-; GFX11-TRUE16-NEXT:    s_or_b32 s3, s3, s4
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v2, 3, v53
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v7, 3, v38
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v11, 3, v34
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v12, 3, v33
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v89, v0
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v90, v1
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v2, v91, v2
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v3, v92, v3
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v4, v93, v4
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v7, 0xff, v7
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v11, 0xff, v11
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v12, 0xff, v12
-; GFX11-TRUE16-NEXT:    s_add_i32 s28, s28, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v0, 0x300, v0
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v1, 0x300, v1
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v2, 0x300, v2
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v3, 0x300, v3
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v4, 0x300, v4
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v7, v88, v7
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v11, v74, v11
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v12, v75, v12
-; GFX11-TRUE16-NEXT:    s_and_b32 s5, s28, 0xff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s6, s29, 8
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX11-TRUE16-NEXT:    s_or_b32 s5, s6, s5
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v3, 0xffff, v3
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v4
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v7, 0x300, v7
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v11, 0x300, v11
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v12, 0x300, v12
-; GFX11-TRUE16-NEXT:    s_addk_i32 s5, 0x300
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v5, v1, v2
-; GFX11-TRUE16-NEXT:    s_and_b32 s5, s5, 0xffff
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v6, v3, v6
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v4, s5, v0
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v0, 3, v50
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v1, 3, v49
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v2, 3, v48
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v3, 3, v39
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v9, 0xffff, v7
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v7, 3, v37
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v8, 3, v36
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v10, 3, v35
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v11, 0xffff, v11
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v7, 0xff, v7
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v8, 0xff, v8
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v10, 0xff, v10
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v11, v11, v12
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(3)
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v12, 3, v46
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v16, 3, v181
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v17, 3, v180
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v76, v0
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v77, v1
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v2, v78, v2
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v3, v79, v3
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v7, v63, v7
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v8, v72, v8
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v10, v73, v10
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v12, 0xff, v12
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v16, 0xff, v16
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v17, 0xff, v17
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v0, 0x300, v0
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v1, 0x300, v1
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v2, 0x300, v2
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v3, 0x300, v3
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v7, 0x300, v7
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v8, 0x300, v8
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v10, 0x300, v10
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v12, v61, v12
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v16, v43, v16
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v17, v44, v17
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v7
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v14, 0xffff, v8
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v12, 0x300, v12
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v16, 0x300, v16
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v17, 0x300, v17
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v7, v0, v1
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v8, v2, v3
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v9, v9, v13
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v10, v14, v10
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v0, 3, v32
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v1, 3, v62
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v2, 3, v56
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v3, 3, v47
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v14, 0xffff, v12
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v12, 3, v45
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v13, 3, v183
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v15, 3, v182
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v16, 0xffff, v16
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v17, 16, v17
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v12, 0xff, v12
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v13, 0xff, v13
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v15, 0xff, v15
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v16, v16, v17
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v17, 3, v162
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v21, 3, v145
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v22, 3, v118
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v57, v0
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v58, v1
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v2, v59, v2
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v3, v60, v3
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v12, v40, v12
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v13, v41, v13
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v15, v42, v15
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v17, 0xff, v17
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v21, 0xff, v21
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v22, 0xff, v22
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v0, 0x300, v0
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v1, 0x300, v1
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v2, 0x300, v2
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v3, 0x300, v3
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v12, 0x300, v12
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v13, 0x300, v13
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v15, 0x300, v15
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v17, v179, v17
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v21, v160, v21
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v22, v161, v22
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v18, 16, v12
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v19, 0xffff, v13
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v17, 0x300, v17
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v21, 0x300, v21
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v22, 0x300, v22
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v12, v0, v1
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v13, v2, v3
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v14, v14, v18
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v15, v19, v15
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v0, 3, v166
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v1, 3, v165
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v2, 3, v164
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v3, 3, v163
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v19, 0xffff, v17
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v17, 3, v148
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v18, 3, v147
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v20, 3, v146
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v21, 0xffff, v21
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v22
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v17, 0xff, v17
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v18, 0xff, v18
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v20, 0xff, v20
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v21, v21, v22
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v22, 3, v100
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v26, 3, v83
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v27, 3, v82
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v167, v0
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v176, v1
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v2, v177, v2
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v3, v178, v3
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v17, v149, v17
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v18, v150, v18
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v20, v151, v20
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v22, 0xff, v22
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v26, 0xff, v26
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v27, 0xff, v27
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v0, 0x300, v0
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v1, 0x300, v1
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v2, 0x300, v2
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v3, 0x300, v3
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v17, 0x300, v17
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v18, 0x300, v18
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v20, 0x300, v20
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v22, v144, v22
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v26, v130, v26
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v27, v131, v27
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v17
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v24, 0xffff, v18
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v20, 16, v20
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v22, 0x300, v22
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v26, 0x300, v26
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v27, 0x300, v27
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v17, v0, v1
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v18, v2, v3
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v19, v19, v23
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v20, v24, v20
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v0, 3, v112
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v1, 3, v103
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v2, 3, v102
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v3, 3, v101
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v24, 0xffff, v22
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v22, 3, v86
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v23, 3, v85
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v25, 3, v84
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v26, 0xffff, v26
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v27, 16, v27
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v22, 0xff, v22
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v23, 0xff, v23
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v25, 0xff, v25
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v26, v26, v27
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v27, 3, v69
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v132, v0
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v133, v1
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v2, v134, v2
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v3, v135, v3
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v22, v119, v22
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v23, v128, v23
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v25, v129, v25
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v27, 0xff, v27
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v0, 0x300, v0
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v1, 0x300, v1
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v2, 0x300, v2
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v3, 0x300, v3
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v22, 0x300, v22
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v23, 0x300, v23
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v25, 0x300, v25
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v27, v117, v27
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v28, 16, v22
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v29, 0xffff, v23
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v25
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v27, 0x300, v27
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v22, v0, v1
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v23, v2, v3
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v24, v24, v28
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v25, v29, v25
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v0, 3, v81
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v1, 3, v80
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v2, 3, v71
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v3, 3, v70
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v29, 0xffff, v27
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v27, 3, v68
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v28, 3, v67
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v30, 3, v66
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v31, 3, v65
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v32, 3, v64
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v27, 0xff, v27
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v28, 0xff, v28
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v30, 0xff, v30
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v31, 0xff, v31
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v32, 0xff, v32
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v113, v0
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v114, v1
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v2, v115, v2
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v3, v116, v3
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v27, v87, v27
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v28, v96, v28
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v30, v97, v30
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v31, v98, v31
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v32, v99, v32
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v0, 0x300, v0
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v1, 0x300, v1
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v2, 0x300, v2
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v3, 0x300, v3
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v27, 0x300, v27
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v28, 0x300, v28
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v30, 0x300, v30
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v31, 0x300, v31
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v32, 0x300, v32
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v33, 16, v27
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v34, 0xffff, v28
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v30, 16, v30
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v31, 0xffff, v31
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v32, 16, v32
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v27, v0, v1
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v28, v2, v3
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v29, v29, v33
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v30, v34, v30
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v31, v31, v32
-; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX11-TRUE16-NEXT:  .LBB39_3: ; %end
-; GFX11-TRUE16-NEXT:    s_clause 0x1f
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v111, off, s32 offset:320
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v110, off, s32 offset:324
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v109, off, s32 offset:328
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v108, off, s32 offset:332
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v107, off, s32 offset:336
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v106, off, s32 offset:340
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v105, off, s32 offset:344
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v104, off, s32 offset:348
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v95, off, s32 offset:352
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v94, off, s32 offset:356
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v93, off, s32 offset:360
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v92, off, s32 offset:364
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v91, off, s32 offset:368
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v90, off, s32 offset:372
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v89, off, s32 offset:376
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v88, off, s32 offset:380
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v79, off, s32 offset:384
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v78, off, s32 offset:388
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v77, off, s32 offset:392
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v76, off, s32 offset:396
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v75, off, s32 offset:400
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v74, off, s32 offset:404
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v73, off, s32 offset:408
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v72, off, s32 offset:412
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v63, off, s32 offset:416
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v62, off, s32 offset:420
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v61, off, s32 offset:424
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v60, off, s32 offset:428
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v59, off, s32 offset:432
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v58, off, s32 offset:436
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v57, off, s32 offset:440
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v56, off, s32 offset:444
-; GFX11-TRUE16-NEXT:    s_clause 0x7
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v47, off, s32 offset:448
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v46, off, s32 offset:452
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v45, off, s32 offset:456
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v44, off, s32 offset:460
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v43, off, s32 offset:464
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v42, off, s32 offset:468
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v41, off, s32 offset:472
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v40, off, s32 offset:476
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
-; GFX11-TRUE16-NEXT:  .LBB39_4:
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
-; GFX11-TRUE16-NEXT:    s_branch .LBB39_2
-;
-; GFX11-FAKE16-LABEL: bitcast_v128i8_to_v32f32_scalar:
-; GFX11-FAKE16:       ; %bb.0:
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-FAKE16-NEXT:    s_clause 0x1f
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v40, s32 offset:476
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v41, s32 offset:472
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v42, s32 offset:468
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v43, s32 offset:464
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v44, s32 offset:460
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v45, s32 offset:456
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v46, s32 offset:452
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v47, s32 offset:448
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v56, s32 offset:444
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v57, s32 offset:440
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v58, s32 offset:436
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v59, s32 offset:432
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v60, s32 offset:428
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v61, s32 offset:424
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v62, s32 offset:420
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v63, s32 offset:416
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v72, s32 offset:412
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v73, s32 offset:408
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v74, s32 offset:404
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v75, s32 offset:400
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v76, s32 offset:396
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v77, s32 offset:392
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v78, s32 offset:388
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v79, s32 offset:384
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v88, s32 offset:380
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v89, s32 offset:376
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v90, s32 offset:372
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v91, s32 offset:368
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v92, s32 offset:364
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v93, s32 offset:360
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v94, s32 offset:356
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v95, s32 offset:352
-; GFX11-FAKE16-NEXT:    s_clause 0x7
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v104, s32 offset:348
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v105, s32 offset:344
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v106, s32 offset:340
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v107, s32 offset:336
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v108, s32 offset:332
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v109, s32 offset:328
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v110, s32 offset:324
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v111, s32 offset:320
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v32, v30 :: v_dual_mov_b32 v33, v28
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v34, v26 :: v_dual_mov_b32 v35, v24
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v36, v22 :: v_dual_mov_b32 v37, v20
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v38, v18 :: v_dual_mov_b32 v39, v16
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v48, v14 :: v_dual_mov_b32 v49, v12
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v50, v10 :: v_dual_mov_b32 v51, v8
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v52, v6 :: v_dual_mov_b32 v53, v4
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v54, v2 :: v_dual_mov_b32 v55, v0
-; GFX11-FAKE16-NEXT:    s_clause 0x1f
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v0, off, s32 offset:316
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v2, off, s32
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v4, off, s32 offset:8
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v6, off, s32 offset:16
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v8, off, s32 offset:24
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v10, off, s32 offset:32
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v12, off, s32 offset:40
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v14, off, s32 offset:48
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v16, off, s32 offset:56
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v18, off, s32 offset:64
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v20, off, s32 offset:72
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v22, off, s32 offset:80
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v24, off, s32 offset:88
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v26, off, s32 offset:96
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v28, off, s32 offset:104
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v30, off, s32 offset:112
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v31, off, s32 offset:120
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v87, off, s32 offset:128
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v96, off, s32 offset:136
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v97, off, s32 offset:144
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v98, off, s32 offset:152
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v99, off, s32 offset:160
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v113, off, s32 offset:168
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v114, off, s32 offset:176
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v115, off, s32 offset:184
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v116, off, s32 offset:192
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v117, off, s32 offset:200
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v128, off, s32 offset:208
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v129, off, s32 offset:216
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v130, off, s32 offset:224
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v131, off, s32 offset:232
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v94, off, s32 offset:240
-; GFX11-FAKE16-NEXT:    s_clause 0x1f
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v95, off, s32 offset:248
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v104, off, s32 offset:256
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v105, off, s32 offset:264
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v106, off, s32 offset:272
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v107, off, s32 offset:280
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v108, off, s32 offset:288
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v109, off, s32 offset:296
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v110, off, s32 offset:304
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v111, off, s32 offset:312
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v64, off, s32 offset:308
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v65, off, s32 offset:300
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v66, off, s32 offset:292
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v67, off, s32 offset:284
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v68, off, s32 offset:276
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v69, off, s32 offset:268
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v70, off, s32 offset:260
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v71, off, s32 offset:252
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v80, off, s32 offset:244
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v81, off, s32 offset:236
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v82, off, s32 offset:228
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v83, off, s32 offset:220
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v84, off, s32 offset:212
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v85, off, s32 offset:204
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v86, off, s32 offset:196
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v100, off, s32 offset:188
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v101, off, s32 offset:180
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v102, off, s32 offset:172
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v103, off, s32 offset:164
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v112, off, s32 offset:156
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v118, off, s32 offset:148
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v145, off, s32 offset:140
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v146, off, s32 offset:132
-; GFX11-FAKE16-NEXT:    s_clause 0xf
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v147, off, s32 offset:124
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v148, off, s32 offset:116
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v162, off, s32 offset:108
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v163, off, s32 offset:100
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v164, off, s32 offset:92
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v165, off, s32 offset:84
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v166, off, s32 offset:76
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v180, off, s32 offset:68
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v181, off, s32 offset:60
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v182, off, s32 offset:52
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v183, off, s32 offset:44
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v45, off, s32 offset:36
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v46, off, s32 offset:28
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v47, off, s32 offset:20
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v56, off, s32 offset:12
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v62, off, s32 offset:4
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v89, 8, v1
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v90, 8, v3
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v91, 8, v5
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v92, 8, v7
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v93, 8, v9
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v76, 8, v11
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v77, 8, v13
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v78, 8, v15
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v79, 8, v17
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v88, 8, v19
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v63, 8, v21
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v72, 8, v23
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v73, 8, v25
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v74, 8, v27
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v75, 8, v29
-; GFX11-FAKE16-NEXT:    s_mov_b32 s4, 0
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(62)
-; GFX11-FAKE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v57, 8, v2
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v58, 8, v4
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v59, 8, v6
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v60, 8, v8
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v61, 8, v10
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v40, 8, v12
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v41, 8, v14
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v42, 8, v16
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v43, 8, v18
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v44, 8, v20
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v167, 8, v22
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v176, 8, v24
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v177, 8, v26
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v178, 8, v28
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v179, 8, v30
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v149, 8, v31
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v150, 8, v87
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(61)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v151, 8, v96
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(60)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v160, 8, v97
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(59)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v161, 8, v98
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(58)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v132, 8, v99
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(57)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v133, 8, v113
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(56)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v134, 8, v114
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(55)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v135, 8, v115
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(54)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v144, 8, v116
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(53)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v119, 8, v117
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(52)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v128, 8, v128
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(51)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v129, 8, v129
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(50)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v130, 8, v130
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(49)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v131, 8, v131
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(48)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v113, 8, v94
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(47)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v114, 8, v95
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(46)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v115, 8, v104
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(45)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v116, 8, v105
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(44)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v117, 8, v106
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(43)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v87, 8, v107
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(42)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v96, 8, v108
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(41)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v97, 8, v109
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(40)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v98, 8, v110
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(39)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v99, 8, v111
-; GFX11-FAKE16-NEXT:    s_and_b32 s5, vcc_lo, exec_lo
-; GFX11-FAKE16-NEXT:    s_cbranch_scc0 .LBB39_4
-; GFX11-FAKE16-NEXT:  ; %bb.1: ; %cmp.false
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v54
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v53
-; GFX11-FAKE16-NEXT:    s_and_b32 s5, s28, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s6, s29, 8
-; GFX11-FAKE16-NEXT:    s_and_b32 s7, s2, 0xff
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v90
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v91
-; GFX11-FAKE16-NEXT:    s_or_b32 s5, s5, s6
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s6, s1, 8
-; GFX11-FAKE16-NEXT:    s_and_b32 s5, s5, 0xffff
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s8, s3, 8
-; GFX11-FAKE16-NEXT:    s_and_b32 s9, s18, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s10, s19, 8
-; GFX11-FAKE16-NEXT:    s_and_b32 s11, s26, 0xff
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v0, v1
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v50
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v49
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s12, s27, 8
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v76
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v77
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v7, v0, v1
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v48
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v39
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v78
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v79
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v8, v0, v1
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v38
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v37
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v88
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v63
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v9, v0, v1
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v36
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v35
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v72
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v73
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v10, v0, v1
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v34
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v33
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v74
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v75
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v11, v0, v1
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v32
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v62
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v57
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v58
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v12, v0, v1
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v56
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v47
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v59
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v60
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v13, v0, v1
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v46
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v45
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v61
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v40
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v14, v0, v1
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v183
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v182
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v41
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v42
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v15, v0, v1
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v181
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v180
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v43
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v44
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v16, v0, v1
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v166
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v165
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v167
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v176
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v17, v0, v1
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v164
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v163
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v177
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v178
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v18, v0, v1
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v162
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v148
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v179
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v149
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v19, v0, v1
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v147
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v146
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v150
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v151
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v20, v0, v1
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v145
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v118
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v160
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v161
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v21, v0, v1
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v112
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v103
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v132
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v133
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v22, v0, v1
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v102
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v101
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v134
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v135
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v23, v0, v1
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v100
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v86
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v144
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v119
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v24, v0, v1
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v85
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v84
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v128
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v129
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v25, v0, v1
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v83
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v82
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v130
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v131
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v26, v0, v1
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v81
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v80
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v113
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v114
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v27, v0, v1
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v71
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v70
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v115
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v116
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v28, v0, v1
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v69
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v68
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v117
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v87
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v29, v0, v1
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v67
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v66
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v96
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v97
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v30, v0, v1
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v65
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v64
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v98
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v99
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v31, v0, v1
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v55
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v89
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, s5, v0
-; GFX11-FAKE16-NEXT:    s_and_b32 s5, s0, 0xff
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-FAKE16-NEXT:    s_or_b32 s5, s5, s6
-; GFX11-FAKE16-NEXT:    s_or_b32 s6, s7, s8
-; GFX11-FAKE16-NEXT:    s_and_b32 s7, s16, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s8, s17, 8
-; GFX11-FAKE16-NEXT:    s_and_b32 s5, s5, 0xffff
-; GFX11-FAKE16-NEXT:    s_or_b32 s7, s7, s8
-; GFX11-FAKE16-NEXT:    s_or_b32 s8, s9, s10
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s6, s6, 16
-; GFX11-FAKE16-NEXT:    s_and_b32 s7, s7, 0xffff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s8, s8, 16
-; GFX11-FAKE16-NEXT:    s_or_b32 s5, s5, s6
-; GFX11-FAKE16-NEXT:    s_or_b32 s6, s7, s8
-; GFX11-FAKE16-NEXT:    s_and_b32 s7, s20, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s8, s21, 8
-; GFX11-FAKE16-NEXT:    s_and_b32 s9, s22, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s10, s23, 8
-; GFX11-FAKE16-NEXT:    s_or_b32 s7, s7, s8
-; GFX11-FAKE16-NEXT:    s_or_b32 s8, s9, s10
-; GFX11-FAKE16-NEXT:    s_and_b32 s9, s24, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s10, s25, 8
-; GFX11-FAKE16-NEXT:    s_and_b32 s7, s7, 0xffff
-; GFX11-FAKE16-NEXT:    s_or_b32 s9, s9, s10
-; GFX11-FAKE16-NEXT:    s_or_b32 s10, s11, s12
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s8, s8, 16
-; GFX11-FAKE16-NEXT:    s_and_b32 s9, s9, 0xffff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s10, s10, 16
-; GFX11-FAKE16-NEXT:    s_or_b32 s7, s7, s8
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v51
-; GFX11-FAKE16-NEXT:    s_or_b32 s8, s9, s10
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v52
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, s6
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v3, v93
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v2, v92
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v6, v2, v3
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v2, s7 :: v_dual_mov_b32 v3, s8
-; GFX11-FAKE16-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s4
-; GFX11-FAKE16-NEXT:    s_cbranch_vccnz .LBB39_3
-; GFX11-FAKE16-NEXT:  .LBB39_2: ; %cmp.true
-; GFX11-FAKE16-NEXT:    s_add_i32 s0, s0, 3
-; GFX11-FAKE16-NEXT:    s_add_i32 s2, s2, 3
-; GFX11-FAKE16-NEXT:    s_and_b32 s0, s0, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s1, s1, 8
-; GFX11-FAKE16-NEXT:    s_and_b32 s2, s2, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s3, s3, 8
-; GFX11-FAKE16-NEXT:    s_or_b32 s0, s1, s0
-; GFX11-FAKE16-NEXT:    s_or_b32 s1, s3, s2
-; GFX11-FAKE16-NEXT:    s_addk_i32 s0, 0x300
-; GFX11-FAKE16-NEXT:    s_addk_i32 s1, 0x300
-; GFX11-FAKE16-NEXT:    s_and_b32 s0, s0, 0xffff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s1, s1, 16
-; GFX11-FAKE16-NEXT:    s_add_i32 s16, s16, 3
-; GFX11-FAKE16-NEXT:    s_or_b32 s0, s0, s1
-; GFX11-FAKE16-NEXT:    s_and_b32 s1, s16, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s2, s17, 8
-; GFX11-FAKE16-NEXT:    s_add_i32 s18, s18, 3
-; GFX11-FAKE16-NEXT:    s_or_b32 s1, s2, s1
-; GFX11-FAKE16-NEXT:    s_and_b32 s2, s18, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s3, s19, 8
-; GFX11-FAKE16-NEXT:    s_addk_i32 s1, 0x300
-; GFX11-FAKE16-NEXT:    s_or_b32 s2, s3, s2
-; GFX11-FAKE16-NEXT:    s_add_i32 s20, s20, 3
-; GFX11-FAKE16-NEXT:    s_addk_i32 s2, 0x300
-; GFX11-FAKE16-NEXT:    s_and_b32 s1, s1, 0xffff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s2, s2, 16
-; GFX11-FAKE16-NEXT:    s_and_b32 s3, s20, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s4, s21, 8
-; GFX11-FAKE16-NEXT:    s_add_i32 s22, s22, 3
-; GFX11-FAKE16-NEXT:    s_or_b32 s1, s1, s2
-; GFX11-FAKE16-NEXT:    s_or_b32 s2, s4, s3
-; GFX11-FAKE16-NEXT:    s_and_b32 s3, s22, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s4, s23, 8
-; GFX11-FAKE16-NEXT:    s_add_i32 s24, s24, 3
-; GFX11-FAKE16-NEXT:    s_or_b32 s3, s4, s3
-; GFX11-FAKE16-NEXT:    s_and_b32 s4, s24, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s5, s25, 8
-; GFX11-FAKE16-NEXT:    s_addk_i32 s2, 0x300
-; GFX11-FAKE16-NEXT:    s_addk_i32 s3, 0x300
-; GFX11-FAKE16-NEXT:    s_or_b32 s4, s5, s4
-; GFX11-FAKE16-NEXT:    s_and_b32 s2, s2, 0xffff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s3, s3, 16
-; GFX11-FAKE16-NEXT:    s_addk_i32 s4, 0x300
-; GFX11-FAKE16-NEXT:    s_add_i32 s26, s26, 3
-; GFX11-FAKE16-NEXT:    s_or_b32 s2, s2, s3
-; GFX11-FAKE16-NEXT:    s_and_b32 s3, s4, 0xffff
-; GFX11-FAKE16-NEXT:    s_and_b32 s4, s26, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s5, s27, 8
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v0, 3, v55
-; GFX11-FAKE16-NEXT:    s_or_b32 s4, s5, s4
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v1, 3, v54
-; GFX11-FAKE16-NEXT:    s_addk_i32 s4, 0x300
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v3, 3, v52
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s4, s4, 16
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v4, 3, v51
-; GFX11-FAKE16-NEXT:    s_or_b32 s3, s3, s4
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v2, 3, v53
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v7, 3, v38
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v11, 3, v34
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v12, 3, v33
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v89, v0
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v90, v1
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v91, v2
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v92, v3
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, v93, v4
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xff, v7
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v11, 0xff, v11
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v12, 0xff, v12
-; GFX11-FAKE16-NEXT:    s_add_i32 s28, s28, 3
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v0, 0x300, v0
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v1, 0x300, v1
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v2, 0x300, v2
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v3, 0x300, v3
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v4, 0x300, v4
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v7, v88, v7
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v11, v74, v11
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v12, v75, v12
-; GFX11-FAKE16-NEXT:    s_and_b32 s5, s28, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s6, s29, 8
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX11-FAKE16-NEXT:    s_or_b32 s5, s6, s5
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xffff, v3
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v4
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v7, 0x300, v7
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v11, 0x300, v11
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v12, 0x300, v12
-; GFX11-FAKE16-NEXT:    s_addk_i32 s5, 0x300
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v1, v2
-; GFX11-FAKE16-NEXT:    s_and_b32 s5, s5, 0xffff
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v6, v3, v6
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, s5, v0
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v0, 3, v50
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v1, 3, v49
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v2, 3, v48
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v3, 3, v39
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v9, 0xffff, v7
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v7, 3, v37
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v8, 3, v36
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v10, 3, v35
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v11, 0xffff, v11
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xff, v7
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 0xff, v8
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v10, 0xff, v10
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v11, v11, v12
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(3)
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v12, 3, v46
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v16, 3, v181
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v17, 3, v180
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v76, v0
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v77, v1
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v78, v2
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v79, v3
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v7, v63, v7
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v8, v72, v8
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v10, v73, v10
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v12, 0xff, v12
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v16, 0xff, v16
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v17, 0xff, v17
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v0, 0x300, v0
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v1, 0x300, v1
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v2, 0x300, v2
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v3, 0x300, v3
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v7, 0x300, v7
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v8, 0x300, v8
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v10, 0x300, v10
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v12, v61, v12
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v16, v43, v16
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v17, v44, v17
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v7
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v14, 0xffff, v8
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v12, 0x300, v12
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v16, 0x300, v16
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v17, 0x300, v17
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v7, v0, v1
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v8, v2, v3
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v9, v9, v13
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v10, v14, v10
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v0, 3, v32
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v1, 3, v62
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v2, 3, v56
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v3, 3, v47
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v14, 0xffff, v12
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v12, 3, v45
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v13, 3, v183
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v15, 3, v182
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v16, 0xffff, v16
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v17, 16, v17
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v12, 0xff, v12
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v13, 0xff, v13
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v15, 0xff, v15
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v16, v16, v17
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v17, 3, v162
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v21, 3, v145
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v22, 3, v118
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v57, v0
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v58, v1
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v59, v2
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v60, v3
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v12, v40, v12
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v13, v41, v13
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v15, v42, v15
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v17, 0xff, v17
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v21, 0xff, v21
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v22, 0xff, v22
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v0, 0x300, v0
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v1, 0x300, v1
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v2, 0x300, v2
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v3, 0x300, v3
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v12, 0x300, v12
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v13, 0x300, v13
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v15, 0x300, v15
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v17, v179, v17
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v21, v160, v21
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v22, v161, v22
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v18, 16, v12
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v19, 0xffff, v13
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v17, 0x300, v17
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v21, 0x300, v21
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v22, 0x300, v22
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v12, v0, v1
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v13, v2, v3
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v14, v14, v18
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v15, v19, v15
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v0, 3, v166
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v1, 3, v165
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v2, 3, v164
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v3, 3, v163
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v19, 0xffff, v17
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v17, 3, v148
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v18, 3, v147
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v20, 3, v146
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v21, 0xffff, v21
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v22
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v17, 0xff, v17
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v18, 0xff, v18
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v20, 0xff, v20
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v21, v21, v22
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v22, 3, v100
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v26, 3, v83
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v27, 3, v82
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v167, v0
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v176, v1
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v177, v2
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v178, v3
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v17, v149, v17
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v18, v150, v18
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v20, v151, v20
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v22, 0xff, v22
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v26, 0xff, v26
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v27, 0xff, v27
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v0, 0x300, v0
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v1, 0x300, v1
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v2, 0x300, v2
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v3, 0x300, v3
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v17, 0x300, v17
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v18, 0x300, v18
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v20, 0x300, v20
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v22, v144, v22
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v26, v130, v26
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v27, v131, v27
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v17
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v24, 0xffff, v18
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v20, 16, v20
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v22, 0x300, v22
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v26, 0x300, v26
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v27, 0x300, v27
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v17, v0, v1
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v18, v2, v3
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v19, v19, v23
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v20, v24, v20
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v0, 3, v112
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v1, 3, v103
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v2, 3, v102
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v3, 3, v101
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v24, 0xffff, v22
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v22, 3, v86
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v23, 3, v85
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v25, 3, v84
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v26, 0xffff, v26
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v27, 16, v27
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v22, 0xff, v22
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v23, 0xff, v23
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v25, 0xff, v25
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v26, v26, v27
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v27, 3, v69
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v132, v0
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v133, v1
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v134, v2
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v135, v3
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v22, v119, v22
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v23, v128, v23
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v25, v129, v25
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v27, 0xff, v27
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v0, 0x300, v0
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v1, 0x300, v1
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v2, 0x300, v2
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v3, 0x300, v3
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v22, 0x300, v22
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v23, 0x300, v23
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v25, 0x300, v25
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v27, v117, v27
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v28, 16, v22
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v29, 0xffff, v23
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v25
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v27, 0x300, v27
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v22, v0, v1
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v23, v2, v3
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v24, v24, v28
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v25, v29, v25
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v0, 3, v81
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v1, 3, v80
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v2, 3, v71
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v3, 3, v70
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v29, 0xffff, v27
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v27, 3, v68
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v28, 3, v67
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v30, 3, v66
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v31, 3, v65
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v32, 3, v64
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v27, 0xff, v27
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v28, 0xff, v28
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v30, 0xff, v30
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v31, 0xff, v31
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v32, 0xff, v32
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v113, v0
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v114, v1
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v115, v2
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v116, v3
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v27, v87, v27
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v28, v96, v28
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v30, v97, v30
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v31, v98, v31
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v32, v99, v32
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v0, 0x300, v0
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v1, 0x300, v1
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v2, 0x300, v2
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v3, 0x300, v3
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v27, 0x300, v27
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v28, 0x300, v28
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v30, 0x300, v30
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v31, 0x300, v31
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v32, 0x300, v32
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v33, 16, v27
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v34, 0xffff, v28
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v30, 16, v30
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v31, 0xffff, v31
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v32, 16, v32
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v27, v0, v1
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v28, v2, v3
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v29, v29, v33
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v30, v34, v30
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v31, v31, v32
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX11-FAKE16-NEXT:  .LBB39_3: ; %end
-; GFX11-FAKE16-NEXT:    s_clause 0x1f
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v111, off, s32 offset:320
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v110, off, s32 offset:324
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v109, off, s32 offset:328
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v108, off, s32 offset:332
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v107, off, s32 offset:336
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v106, off, s32 offset:340
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v105, off, s32 offset:344
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v104, off, s32 offset:348
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v95, off, s32 offset:352
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v94, off, s32 offset:356
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v93, off, s32 offset:360
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v92, off, s32 offset:364
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v91, off, s32 offset:368
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v90, off, s32 offset:372
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v89, off, s32 offset:376
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v88, off, s32 offset:380
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v79, off, s32 offset:384
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v78, off, s32 offset:388
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v77, off, s32 offset:392
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v76, off, s32 offset:396
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v75, off, s32 offset:400
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v74, off, s32 offset:404
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v73, off, s32 offset:408
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v72, off, s32 offset:412
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v63, off, s32 offset:416
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v62, off, s32 offset:420
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v61, off, s32 offset:424
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v60, off, s32 offset:428
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v59, off, s32 offset:432
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v58, off, s32 offset:436
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v57, off, s32 offset:440
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v56, off, s32 offset:444
-; GFX11-FAKE16-NEXT:    s_clause 0x7
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v47, off, s32 offset:448
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v46, off, s32 offset:452
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v45, off, s32 offset:456
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v44, off, s32 offset:460
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v43, off, s32 offset:464
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v42, off, s32 offset:468
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v41, off, s32 offset:472
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v40, off, s32 offset:476
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
-; GFX11-FAKE16-NEXT:  .LBB39_4:
-; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
-; GFX11-FAKE16-NEXT:    s_branch .LBB39_2
+; GFX11-LABEL: bitcast_v128i8_to_v32f32_scalar:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_clause 0x1f
+; GFX11-NEXT:    scratch_store_b32 off, v40, s32 offset:476
+; GFX11-NEXT:    scratch_store_b32 off, v41, s32 offset:472
+; GFX11-NEXT:    scratch_store_b32 off, v42, s32 offset:468
+; GFX11-NEXT:    scratch_store_b32 off, v43, s32 offset:464
+; GFX11-NEXT:    scratch_store_b32 off, v44, s32 offset:460
+; GFX11-NEXT:    scratch_store_b32 off, v45, s32 offset:456
+; GFX11-NEXT:    scratch_store_b32 off, v46, s32 offset:452
+; GFX11-NEXT:    scratch_store_b32 off, v47, s32 offset:448
+; GFX11-NEXT:    scratch_store_b32 off, v56, s32 offset:444
+; GFX11-NEXT:    scratch_store_b32 off, v57, s32 offset:440
+; GFX11-NEXT:    scratch_store_b32 off, v58, s32 offset:436
+; GFX11-NEXT:    scratch_store_b32 off, v59, s32 offset:432
+; GFX11-NEXT:    scratch_store_b32 off, v60, s32 offset:428
+; GFX11-NEXT:    scratch_store_b32 off, v61, s32 offset:424
+; GFX11-NEXT:    scratch_store_b32 off, v62, s32 offset:420
+; GFX11-NEXT:    scratch_store_b32 off, v63, s32 offset:416
+; GFX11-NEXT:    scratch_store_b32 off, v72, s32 offset:412
+; GFX11-NEXT:    scratch_store_b32 off, v73, s32 offset:408
+; GFX11-NEXT:    scratch_store_b32 off, v74, s32 offset:404
+; GFX11-NEXT:    scratch_store_b32 off, v75, s32 offset:400
+; GFX11-NEXT:    scratch_store_b32 off, v76, s32 offset:396
+; GFX11-NEXT:    scratch_store_b32 off, v77, s32 offset:392
+; GFX11-NEXT:    scratch_store_b32 off, v78, s32 offset:388
+; GFX11-NEXT:    scratch_store_b32 off, v79, s32 offset:384
+; GFX11-NEXT:    scratch_store_b32 off, v88, s32 offset:380
+; GFX11-NEXT:    scratch_store_b32 off, v89, s32 offset:376
+; GFX11-NEXT:    scratch_store_b32 off, v90, s32 offset:372
+; GFX11-NEXT:    scratch_store_b32 off, v91, s32 offset:368
+; GFX11-NEXT:    scratch_store_b32 off, v92, s32 offset:364
+; GFX11-NEXT:    scratch_store_b32 off, v93, s32 offset:360
+; GFX11-NEXT:    scratch_store_b32 off, v94, s32 offset:356
+; GFX11-NEXT:    scratch_store_b32 off, v95, s32 offset:352
+; GFX11-NEXT:    s_clause 0x7
+; GFX11-NEXT:    scratch_store_b32 off, v104, s32 offset:348
+; GFX11-NEXT:    scratch_store_b32 off, v105, s32 offset:344
+; GFX11-NEXT:    scratch_store_b32 off, v106, s32 offset:340
+; GFX11-NEXT:    scratch_store_b32 off, v107, s32 offset:336
+; GFX11-NEXT:    scratch_store_b32 off, v108, s32 offset:332
+; GFX11-NEXT:    scratch_store_b32 off, v109, s32 offset:328
+; GFX11-NEXT:    scratch_store_b32 off, v110, s32 offset:324
+; GFX11-NEXT:    scratch_store_b32 off, v111, s32 offset:320
+; GFX11-NEXT:    v_dual_mov_b32 v32, v30 :: v_dual_mov_b32 v33, v28
+; GFX11-NEXT:    v_dual_mov_b32 v34, v26 :: v_dual_mov_b32 v35, v24
+; GFX11-NEXT:    v_dual_mov_b32 v36, v22 :: v_dual_mov_b32 v37, v20
+; GFX11-NEXT:    v_dual_mov_b32 v38, v18 :: v_dual_mov_b32 v39, v16
+; GFX11-NEXT:    v_dual_mov_b32 v48, v14 :: v_dual_mov_b32 v49, v12
+; GFX11-NEXT:    v_dual_mov_b32 v50, v10 :: v_dual_mov_b32 v51, v8
+; GFX11-NEXT:    v_dual_mov_b32 v52, v6 :: v_dual_mov_b32 v53, v4
+; GFX11-NEXT:    v_dual_mov_b32 v54, v2 :: v_dual_mov_b32 v55, v0
+; GFX11-NEXT:    s_clause 0x1f
+; GFX11-NEXT:    scratch_load_b32 v0, off, s32 offset:316
+; GFX11-NEXT:    scratch_load_u16 v2, off, s32
+; GFX11-NEXT:    scratch_load_u16 v4, off, s32 offset:8
+; GFX11-NEXT:    scratch_load_u16 v6, off, s32 offset:16
+; GFX11-NEXT:    scratch_load_u16 v8, off, s32 offset:24
+; GFX11-NEXT:    scratch_load_u16 v10, off, s32 offset:32
+; GFX11-NEXT:    scratch_load_u16 v12, off, s32 offset:40
+; GFX11-NEXT:    scratch_load_u16 v14, off, s32 offset:48
+; GFX11-NEXT:    scratch_load_u16 v16, off, s32 offset:56
+; GFX11-NEXT:    scratch_load_u16 v18, off, s32 offset:64
+; GFX11-NEXT:    scratch_load_u16 v20, off, s32 offset:72
+; GFX11-NEXT:    scratch_load_u16 v22, off, s32 offset:80
+; GFX11-NEXT:    scratch_load_u16 v24, off, s32 offset:88
+; GFX11-NEXT:    scratch_load_u16 v26, off, s32 offset:96
+; GFX11-NEXT:    scratch_load_u16 v28, off, s32 offset:104
+; GFX11-NEXT:    scratch_load_u16 v30, off, s32 offset:112
+; GFX11-NEXT:    scratch_load_u16 v31, off, s32 offset:120
+; GFX11-NEXT:    scratch_load_u16 v87, off, s32 offset:128
+; GFX11-NEXT:    scratch_load_u16 v96, off, s32 offset:136
+; GFX11-NEXT:    scratch_load_u16 v97, off, s32 offset:144
+; GFX11-NEXT:    scratch_load_u16 v98, off, s32 offset:152
+; GFX11-NEXT:    scratch_load_u16 v99, off, s32 offset:160
+; GFX11-NEXT:    scratch_load_u16 v113, off, s32 offset:168
+; GFX11-NEXT:    scratch_load_u16 v114, off, s32 offset:176
+; GFX11-NEXT:    scratch_load_u16 v115, off, s32 offset:184
+; GFX11-NEXT:    scratch_load_u16 v116, off, s32 offset:192
+; GFX11-NEXT:    scratch_load_u16 v117, off, s32 offset:200
+; GFX11-NEXT:    scratch_load_u16 v128, off, s32 offset:208
+; GFX11-NEXT:    scratch_load_u16 v129, off, s32 offset:216
+; GFX11-NEXT:    scratch_load_u16 v130, off, s32 offset:224
+; GFX11-NEXT:    scratch_load_u16 v131, off, s32 offset:232
+; GFX11-NEXT:    scratch_load_u16 v94, off, s32 offset:240
+; GFX11-NEXT:    s_clause 0x1f
+; GFX11-NEXT:    scratch_load_u16 v95, off, s32 offset:248
+; GFX11-NEXT:    scratch_load_u16 v104, off, s32 offset:256
+; GFX11-NEXT:    scratch_load_u16 v105, off, s32 offset:264
+; GFX11-NEXT:    scratch_load_u16 v106, off, s32 offset:272
+; GFX11-NEXT:    scratch_load_u16 v107, off, s32 offset:280
+; GFX11-NEXT:    scratch_load_u16 v108, off, s32 offset:288
+; GFX11-NEXT:    scratch_load_u16 v109, off, s32 offset:296
+; GFX11-NEXT:    scratch_load_u16 v110, off, s32 offset:304
+; GFX11-NEXT:    scratch_load_u16 v111, off, s32 offset:312
+; GFX11-NEXT:    scratch_load_u16 v64, off, s32 offset:308
+; GFX11-NEXT:    scratch_load_u16 v65, off, s32 offset:300
+; GFX11-NEXT:    scratch_load_u16 v66, off, s32 offset:292
+; GFX11-NEXT:    scratch_load_u16 v67, off, s32 offset:284
+; GFX11-NEXT:    scratch_load_u16 v68, off, s32 offset:276
+; GFX11-NEXT:    scratch_load_u16 v69, off, s32 offset:268
+; GFX11-NEXT:    scratch_load_u16 v70, off, s32 offset:260
+; GFX11-NEXT:    scratch_load_u16 v71, off, s32 offset:252
+; GFX11-NEXT:    scratch_load_u16 v80, off, s32 offset:244
+; GFX11-NEXT:    scratch_load_u16 v81, off, s32 offset:236
+; GFX11-NEXT:    scratch_load_u16 v82, off, s32 offset:228
+; GFX11-NEXT:    scratch_load_u16 v83, off, s32 offset:220
+; GFX11-NEXT:    scratch_load_u16 v84, off, s32 offset:212
+; GFX11-NEXT:    scratch_load_u16 v85, off, s32 offset:204
+; GFX11-NEXT:    scratch_load_u16 v86, off, s32 offset:196
+; GFX11-NEXT:    scratch_load_u16 v100, off, s32 offset:188
+; GFX11-NEXT:    scratch_load_u16 v101, off, s32 offset:180
+; GFX11-NEXT:    scratch_load_u16 v102, off, s32 offset:172
+; GFX11-NEXT:    scratch_load_u16 v103, off, s32 offset:164
+; GFX11-NEXT:    scratch_load_u16 v112, off, s32 offset:156
+; GFX11-NEXT:    scratch_load_u16 v118, off, s32 offset:148
+; GFX11-NEXT:    scratch_load_u16 v145, off, s32 offset:140
+; GFX11-NEXT:    scratch_load_u16 v146, off, s32 offset:132
+; GFX11-NEXT:    s_clause 0xf
+; GFX11-NEXT:    scratch_load_u16 v147, off, s32 offset:124
+; GFX11-NEXT:    scratch_load_u16 v148, off, s32 offset:116
+; GFX11-NEXT:    scratch_load_u16 v162, off, s32 offset:108
+; GFX11-NEXT:    scratch_load_u16 v163, off, s32 offset:100
+; GFX11-NEXT:    scratch_load_u16 v164, off, s32 offset:92
+; GFX11-NEXT:    scratch_load_u16 v165, off, s32 offset:84
+; GFX11-NEXT:    scratch_load_u16 v166, off, s32 offset:76
+; GFX11-NEXT:    scratch_load_u16 v180, off, s32 offset:68
+; GFX11-NEXT:    scratch_load_u16 v181, off, s32 offset:60
+; GFX11-NEXT:    scratch_load_u16 v182, off, s32 offset:52
+; GFX11-NEXT:    scratch_load_u16 v183, off, s32 offset:44
+; GFX11-NEXT:    scratch_load_u16 v45, off, s32 offset:36
+; GFX11-NEXT:    scratch_load_u16 v46, off, s32 offset:28
+; GFX11-NEXT:    scratch_load_u16 v47, off, s32 offset:20
+; GFX11-NEXT:    scratch_load_u16 v56, off, s32 offset:12
+; GFX11-NEXT:    scratch_load_u16 v62, off, s32 offset:4
+; GFX11-NEXT:    v_lshlrev_b32_e32 v89, 8, v1
+; GFX11-NEXT:    v_lshlrev_b32_e32 v90, 8, v3
+; GFX11-NEXT:    v_lshlrev_b32_e32 v91, 8, v5
+; GFX11-NEXT:    v_lshlrev_b32_e32 v92, 8, v7
+; GFX11-NEXT:    v_lshlrev_b32_e32 v93, 8, v9
+; GFX11-NEXT:    v_lshlrev_b32_e32 v76, 8, v11
+; GFX11-NEXT:    v_lshlrev_b32_e32 v77, 8, v13
+; GFX11-NEXT:    v_lshlrev_b32_e32 v78, 8, v15
+; GFX11-NEXT:    v_lshlrev_b32_e32 v79, 8, v17
+; GFX11-NEXT:    v_lshlrev_b32_e32 v88, 8, v19
+; GFX11-NEXT:    v_lshlrev_b32_e32 v63, 8, v21
+; GFX11-NEXT:    v_lshlrev_b32_e32 v72, 8, v23
+; GFX11-NEXT:    v_lshlrev_b32_e32 v73, 8, v25
+; GFX11-NEXT:    v_lshlrev_b32_e32 v74, 8, v27
+; GFX11-NEXT:    v_lshlrev_b32_e32 v75, 8, v29
+; GFX11-NEXT:    s_mov_b32 s4, 0
+; GFX11-NEXT:    s_waitcnt vmcnt(62)
+; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v57, 8, v2
+; GFX11-NEXT:    v_lshlrev_b32_e32 v58, 8, v4
+; GFX11-NEXT:    v_lshlrev_b32_e32 v59, 8, v6
+; GFX11-NEXT:    v_lshlrev_b32_e32 v60, 8, v8
+; GFX11-NEXT:    v_lshlrev_b32_e32 v61, 8, v10
+; GFX11-NEXT:    v_lshlrev_b32_e32 v40, 8, v12
+; GFX11-NEXT:    v_lshlrev_b32_e32 v41, 8, v14
+; GFX11-NEXT:    v_lshlrev_b32_e32 v42, 8, v16
+; GFX11-NEXT:    v_lshlrev_b32_e32 v43, 8, v18
+; GFX11-NEXT:    v_lshlrev_b32_e32 v44, 8, v20
+; GFX11-NEXT:    v_lshlrev_b32_e32 v167, 8, v22
+; GFX11-NEXT:    v_lshlrev_b32_e32 v176, 8, v24
+; GFX11-NEXT:    v_lshlrev_b32_e32 v177, 8, v26
+; GFX11-NEXT:    v_lshlrev_b32_e32 v178, 8, v28
+; GFX11-NEXT:    v_lshlrev_b32_e32 v179, 8, v30
+; GFX11-NEXT:    v_lshlrev_b32_e32 v149, 8, v31
+; GFX11-NEXT:    v_lshlrev_b32_e32 v150, 8, v87
+; GFX11-NEXT:    s_waitcnt vmcnt(61)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v151, 8, v96
+; GFX11-NEXT:    s_waitcnt vmcnt(60)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v160, 8, v97
+; GFX11-NEXT:    s_waitcnt vmcnt(59)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v161, 8, v98
+; GFX11-NEXT:    s_waitcnt vmcnt(58)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v132, 8, v99
+; GFX11-NEXT:    s_waitcnt vmcnt(57)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v133, 8, v113
+; GFX11-NEXT:    s_waitcnt vmcnt(56)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v134, 8, v114
+; GFX11-NEXT:    s_waitcnt vmcnt(55)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v135, 8, v115
+; GFX11-NEXT:    s_waitcnt vmcnt(54)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v144, 8, v116
+; GFX11-NEXT:    s_waitcnt vmcnt(53)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v119, 8, v117
+; GFX11-NEXT:    s_waitcnt vmcnt(52)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v128, 8, v128
+; GFX11-NEXT:    s_waitcnt vmcnt(51)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v129, 8, v129
+; GFX11-NEXT:    s_waitcnt vmcnt(50)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v130, 8, v130
+; GFX11-NEXT:    s_waitcnt vmcnt(49)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v131, 8, v131
+; GFX11-NEXT:    s_waitcnt vmcnt(48)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v113, 8, v94
+; GFX11-NEXT:    s_waitcnt vmcnt(47)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v114, 8, v95
+; GFX11-NEXT:    s_waitcnt vmcnt(46)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v115, 8, v104
+; GFX11-NEXT:    s_waitcnt vmcnt(45)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v116, 8, v105
+; GFX11-NEXT:    s_waitcnt vmcnt(44)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v117, 8, v106
+; GFX11-NEXT:    s_waitcnt vmcnt(43)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v87, 8, v107
+; GFX11-NEXT:    s_waitcnt vmcnt(42)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v96, 8, v108
+; GFX11-NEXT:    s_waitcnt vmcnt(41)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v97, 8, v109
+; GFX11-NEXT:    s_waitcnt vmcnt(40)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v98, 8, v110
+; GFX11-NEXT:    s_waitcnt vmcnt(39)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v99, 8, v111
+; GFX11-NEXT:    s_and_b32 s5, vcc_lo, exec_lo
+; GFX11-NEXT:    s_cbranch_scc0 .LBB39_4
+; GFX11-NEXT:  ; %bb.1: ; %cmp.false
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v54
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v53
+; GFX11-NEXT:    s_and_b32 s5, s28, 0xff
+; GFX11-NEXT:    s_lshl_b32 s6, s29, 8
+; GFX11-NEXT:    s_and_b32 s7, s2, 0xff
+; GFX11-NEXT:    v_or_b32_e32 v0, v0, v90
+; GFX11-NEXT:    v_or_b32_e32 v1, v1, v91
+; GFX11-NEXT:    s_or_b32 s5, s5, s6
+; GFX11-NEXT:    s_lshl_b32 s6, s1, 8
+; GFX11-NEXT:    s_and_b32 s5, s5, 0xffff
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    s_lshl_b32 s8, s3, 8
+; GFX11-NEXT:    s_and_b32 s9, s18, 0xff
+; GFX11-NEXT:    s_lshl_b32 s10, s19, 8
+; GFX11-NEXT:    s_and_b32 s11, s26, 0xff
+; GFX11-NEXT:    v_or_b32_e32 v5, v0, v1
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v50
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v49
+; GFX11-NEXT:    s_lshl_b32 s12, s27, 8
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_or_b32_e32 v0, v0, v76
+; GFX11-NEXT:    v_or_b32_e32 v1, v1, v77
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_or_b32_e32 v7, v0, v1
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v48
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v39
+; GFX11-NEXT:    v_or_b32_e32 v0, v0, v78
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_or_b32_e32 v1, v1, v79
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    v_or_b32_e32 v8, v0, v1
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v38
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v37
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_or_b32_e32 v0, v0, v88
+; GFX11-NEXT:    v_or_b32_e32 v1, v1, v63
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_or_b32_e32 v9, v0, v1
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v36
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v35
+; GFX11-NEXT:    v_or_b32_e32 v0, v0, v72
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_or_b32_e32 v1, v1, v73
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    v_or_b32_e32 v10, v0, v1
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v34
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v33
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_or_b32_e32 v0, v0, v74
+; GFX11-NEXT:    v_or_b32_e32 v1, v1, v75
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_or_b32_e32 v11, v0, v1
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v32
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v62
+; GFX11-NEXT:    v_or_b32_e32 v0, v0, v57
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_or_b32_e32 v1, v1, v58
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    v_or_b32_e32 v12, v0, v1
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v56
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v47
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_or_b32_e32 v0, v0, v59
+; GFX11-NEXT:    v_or_b32_e32 v1, v1, v60
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_or_b32_e32 v13, v0, v1
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v46
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v45
+; GFX11-NEXT:    v_or_b32_e32 v0, v0, v61
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_or_b32_e32 v1, v1, v40
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    v_or_b32_e32 v14, v0, v1
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v183
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v182
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_or_b32_e32 v0, v0, v41
+; GFX11-NEXT:    v_or_b32_e32 v1, v1, v42
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_or_b32_e32 v15, v0, v1
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v181
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v180
+; GFX11-NEXT:    v_or_b32_e32 v0, v0, v43
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_or_b32_e32 v1, v1, v44
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    v_or_b32_e32 v16, v0, v1
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v166
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v165
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_or_b32_e32 v0, v0, v167
+; GFX11-NEXT:    v_or_b32_e32 v1, v1, v176
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_or_b32_e32 v17, v0, v1
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v164
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v163
+; GFX11-NEXT:    v_or_b32_e32 v0, v0, v177
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_or_b32_e32 v1, v1, v178
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    v_or_b32_e32 v18, v0, v1
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v162
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v148
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_or_b32_e32 v0, v0, v179
+; GFX11-NEXT:    v_or_b32_e32 v1, v1, v149
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_or_b32_e32 v19, v0, v1
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v147
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v146
+; GFX11-NEXT:    v_or_b32_e32 v0, v0, v150
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_or_b32_e32 v1, v1, v151
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    v_or_b32_e32 v20, v0, v1
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v145
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v118
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_or_b32_e32 v0, v0, v160
+; GFX11-NEXT:    v_or_b32_e32 v1, v1, v161
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_or_b32_e32 v21, v0, v1
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v112
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v103
+; GFX11-NEXT:    v_or_b32_e32 v0, v0, v132
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_or_b32_e32 v1, v1, v133
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    v_or_b32_e32 v22, v0, v1
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v102
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v101
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_or_b32_e32 v0, v0, v134
+; GFX11-NEXT:    v_or_b32_e32 v1, v1, v135
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_or_b32_e32 v23, v0, v1
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v100
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v86
+; GFX11-NEXT:    v_or_b32_e32 v0, v0, v144
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_or_b32_e32 v1, v1, v119
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    v_or_b32_e32 v24, v0, v1
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v85
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v84
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_or_b32_e32 v0, v0, v128
+; GFX11-NEXT:    v_or_b32_e32 v1, v1, v129
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_or_b32_e32 v25, v0, v1
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v83
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v82
+; GFX11-NEXT:    v_or_b32_e32 v0, v0, v130
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_or_b32_e32 v1, v1, v131
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    v_or_b32_e32 v26, v0, v1
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v81
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v80
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_or_b32_e32 v0, v0, v113
+; GFX11-NEXT:    v_or_b32_e32 v1, v1, v114
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_or_b32_e32 v27, v0, v1
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v71
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v70
+; GFX11-NEXT:    v_or_b32_e32 v0, v0, v115
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_or_b32_e32 v1, v1, v116
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    v_or_b32_e32 v28, v0, v1
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v69
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v68
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_or_b32_e32 v0, v0, v117
+; GFX11-NEXT:    v_or_b32_e32 v1, v1, v87
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_or_b32_e32 v29, v0, v1
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v67
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v66
+; GFX11-NEXT:    v_or_b32_e32 v0, v0, v96
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_or_b32_e32 v1, v1, v97
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    v_or_b32_e32 v30, v0, v1
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v65
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v64
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_or_b32_e32 v0, v0, v98
+; GFX11-NEXT:    v_or_b32_e32 v1, v1, v99
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_or_b32_e32 v31, v0, v1
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v55
+; GFX11-NEXT:    v_or_b32_e32 v0, v0, v89
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-NEXT:    v_or_b32_e32 v4, s5, v0
+; GFX11-NEXT:    s_and_b32 s5, s0, 0xff
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_or_b32 s5, s5, s6
+; GFX11-NEXT:    s_or_b32 s6, s7, s8
+; GFX11-NEXT:    s_and_b32 s7, s16, 0xff
+; GFX11-NEXT:    s_lshl_b32 s8, s17, 8
+; GFX11-NEXT:    s_and_b32 s5, s5, 0xffff
+; GFX11-NEXT:    s_or_b32 s7, s7, s8
+; GFX11-NEXT:    s_or_b32 s8, s9, s10
+; GFX11-NEXT:    s_lshl_b32 s6, s6, 16
+; GFX11-NEXT:    s_and_b32 s7, s7, 0xffff
+; GFX11-NEXT:    s_lshl_b32 s8, s8, 16
+; GFX11-NEXT:    s_or_b32 s5, s5, s6
+; GFX11-NEXT:    s_or_b32 s6, s7, s8
+; GFX11-NEXT:    s_and_b32 s7, s20, 0xff
+; GFX11-NEXT:    s_lshl_b32 s8, s21, 8
+; GFX11-NEXT:    s_and_b32 s9, s22, 0xff
+; GFX11-NEXT:    s_lshl_b32 s10, s23, 8
+; GFX11-NEXT:    s_or_b32 s7, s7, s8
+; GFX11-NEXT:    s_or_b32 s8, s9, s10
+; GFX11-NEXT:    s_and_b32 s9, s24, 0xff
+; GFX11-NEXT:    s_lshl_b32 s10, s25, 8
+; GFX11-NEXT:    s_and_b32 s7, s7, 0xffff
+; GFX11-NEXT:    s_or_b32 s9, s9, s10
+; GFX11-NEXT:    s_or_b32 s10, s11, s12
+; GFX11-NEXT:    s_lshl_b32 s8, s8, 16
+; GFX11-NEXT:    s_and_b32 s9, s9, 0xffff
+; GFX11-NEXT:    s_lshl_b32 s10, s10, 16
+; GFX11-NEXT:    s_or_b32 s7, s7, s8
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v51
+; GFX11-NEXT:    s_or_b32 s8, s9, s10
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v52
+; GFX11-NEXT:    v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, s6
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_or_b32_e32 v3, v3, v93
+; GFX11-NEXT:    v_or_b32_e32 v2, v2, v92
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_or_b32_e32 v6, v2, v3
+; GFX11-NEXT:    v_dual_mov_b32 v2, s7 :: v_dual_mov_b32 v3, s8
+; GFX11-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s4
+; GFX11-NEXT:    s_cbranch_vccnz .LBB39_3
+; GFX11-NEXT:  .LBB39_2: ; %cmp.true
+; GFX11-NEXT:    s_add_i32 s0, s0, 3
+; GFX11-NEXT:    s_add_i32 s2, s2, 3
+; GFX11-NEXT:    s_and_b32 s0, s0, 0xff
+; GFX11-NEXT:    s_lshl_b32 s1, s1, 8
+; GFX11-NEXT:    s_and_b32 s2, s2, 0xff
+; GFX11-NEXT:    s_lshl_b32 s3, s3, 8
+; GFX11-NEXT:    s_or_b32 s0, s1, s0
+; GFX11-NEXT:    s_or_b32 s1, s3, s2
+; GFX11-NEXT:    s_addk_i32 s0, 0x300
+; GFX11-NEXT:    s_addk_i32 s1, 0x300
+; GFX11-NEXT:    s_and_b32 s0, s0, 0xffff
+; GFX11-NEXT:    s_lshl_b32 s1, s1, 16
+; GFX11-NEXT:    s_add_i32 s16, s16, 3
+; GFX11-NEXT:    s_or_b32 s0, s0, s1
+; GFX11-NEXT:    s_and_b32 s1, s16, 0xff
+; GFX11-NEXT:    s_lshl_b32 s2, s17, 8
+; GFX11-NEXT:    s_add_i32 s18, s18, 3
+; GFX11-NEXT:    s_or_b32 s1, s2, s1
+; GFX11-NEXT:    s_and_b32 s2, s18, 0xff
+; GFX11-NEXT:    s_lshl_b32 s3, s19, 8
+; GFX11-NEXT:    s_addk_i32 s1, 0x300
+; GFX11-NEXT:    s_or_b32 s2, s3, s2
+; GFX11-NEXT:    s_add_i32 s20, s20, 3
+; GFX11-NEXT:    s_addk_i32 s2, 0x300
+; GFX11-NEXT:    s_and_b32 s1, s1, 0xffff
+; GFX11-NEXT:    s_lshl_b32 s2, s2, 16
+; GFX11-NEXT:    s_and_b32 s3, s20, 0xff
+; GFX11-NEXT:    s_lshl_b32 s4, s21, 8
+; GFX11-NEXT:    s_add_i32 s22, s22, 3
+; GFX11-NEXT:    s_or_b32 s1, s1, s2
+; GFX11-NEXT:    s_or_b32 s2, s4, s3
+; GFX11-NEXT:    s_and_b32 s3, s22, 0xff
+; GFX11-NEXT:    s_lshl_b32 s4, s23, 8
+; GFX11-NEXT:    s_add_i32 s24, s24, 3
+; GFX11-NEXT:    s_or_b32 s3, s4, s3
+; GFX11-NEXT:    s_and_b32 s4, s24, 0xff
+; GFX11-NEXT:    s_lshl_b32 s5, s25, 8
+; GFX11-NEXT:    s_addk_i32 s2, 0x300
+; GFX11-NEXT:    s_addk_i32 s3, 0x300
+; GFX11-NEXT:    s_or_b32 s4, s5, s4
+; GFX11-NEXT:    s_and_b32 s2, s2, 0xffff
+; GFX11-NEXT:    s_lshl_b32 s3, s3, 16
+; GFX11-NEXT:    s_addk_i32 s4, 0x300
+; GFX11-NEXT:    s_add_i32 s26, s26, 3
+; GFX11-NEXT:    s_or_b32 s2, s2, s3
+; GFX11-NEXT:    s_and_b32 s3, s4, 0xffff
+; GFX11-NEXT:    s_and_b32 s4, s26, 0xff
+; GFX11-NEXT:    s_lshl_b32 s5, s27, 8
+; GFX11-NEXT:    v_add_nc_u32_e32 v0, 3, v55
+; GFX11-NEXT:    s_or_b32 s4, s5, s4
+; GFX11-NEXT:    v_add_nc_u32_e32 v1, 3, v54
+; GFX11-NEXT:    s_addk_i32 s4, 0x300
+; GFX11-NEXT:    v_add_nc_u32_e32 v3, 3, v52
+; GFX11-NEXT:    s_lshl_b32 s4, s4, 16
+; GFX11-NEXT:    v_add_nc_u32_e32 v4, 3, v51
+; GFX11-NEXT:    s_or_b32 s3, s3, s4
+; GFX11-NEXT:    v_add_nc_u32_e32 v2, 3, v53
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v3
+; GFX11-NEXT:    v_and_b32_e32 v4, 0xff, v4
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GFX11-NEXT:    v_add_nc_u32_e32 v7, 3, v38
+; GFX11-NEXT:    v_add_nc_u32_e32 v11, 3, v34
+; GFX11-NEXT:    v_add_nc_u32_e32 v12, 3, v33
+; GFX11-NEXT:    v_or_b32_e32 v0, v89, v0
+; GFX11-NEXT:    v_or_b32_e32 v1, v90, v1
+; GFX11-NEXT:    v_or_b32_e32 v2, v91, v2
+; GFX11-NEXT:    v_or_b32_e32 v3, v92, v3
+; GFX11-NEXT:    v_or_b32_e32 v4, v93, v4
+; GFX11-NEXT:    v_and_b32_e32 v7, 0xff, v7
+; GFX11-NEXT:    v_and_b32_e32 v11, 0xff, v11
+; GFX11-NEXT:    v_and_b32_e32 v12, 0xff, v12
+; GFX11-NEXT:    s_add_i32 s28, s28, 3
+; GFX11-NEXT:    v_add_nc_u32_e32 v0, 0x300, v0
+; GFX11-NEXT:    v_add_nc_u32_e32 v1, 0x300, v1
+; GFX11-NEXT:    v_add_nc_u32_e32 v2, 0x300, v2
+; GFX11-NEXT:    v_add_nc_u32_e32 v3, 0x300, v3
+; GFX11-NEXT:    v_add_nc_u32_e32 v4, 0x300, v4
+; GFX11-NEXT:    v_or_b32_e32 v7, v88, v7
+; GFX11-NEXT:    v_or_b32_e32 v11, v74, v11
+; GFX11-NEXT:    v_or_b32_e32 v12, v75, v12
+; GFX11-NEXT:    s_and_b32 s5, s28, 0xff
+; GFX11-NEXT:    s_lshl_b32 s6, s29, 8
+; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-NEXT:    s_or_b32 s5, s6, s5
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff, v3
+; GFX11-NEXT:    v_lshlrev_b32_e32 v6, 16, v4
+; GFX11-NEXT:    v_add_nc_u32_e32 v7, 0x300, v7
+; GFX11-NEXT:    v_add_nc_u32_e32 v11, 0x300, v11
+; GFX11-NEXT:    v_add_nc_u32_e32 v12, 0x300, v12
+; GFX11-NEXT:    s_addk_i32 s5, 0x300
+; GFX11-NEXT:    v_or_b32_e32 v5, v1, v2
+; GFX11-NEXT:    s_and_b32 s5, s5, 0xffff
+; GFX11-NEXT:    v_or_b32_e32 v6, v3, v6
+; GFX11-NEXT:    v_or_b32_e32 v4, s5, v0
+; GFX11-NEXT:    v_add_nc_u32_e32 v0, 3, v50
+; GFX11-NEXT:    v_add_nc_u32_e32 v1, 3, v49
+; GFX11-NEXT:    v_add_nc_u32_e32 v2, 3, v48
+; GFX11-NEXT:    v_add_nc_u32_e32 v3, 3, v39
+; GFX11-NEXT:    v_and_b32_e32 v9, 0xffff, v7
+; GFX11-NEXT:    v_add_nc_u32_e32 v7, 3, v37
+; GFX11-NEXT:    v_add_nc_u32_e32 v8, 3, v36
+; GFX11-NEXT:    v_add_nc_u32_e32 v10, 3, v35
+; GFX11-NEXT:    v_and_b32_e32 v11, 0xffff, v11
+; GFX11-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v3
+; GFX11-NEXT:    v_and_b32_e32 v7, 0xff, v7
+; GFX11-NEXT:    v_and_b32_e32 v8, 0xff, v8
+; GFX11-NEXT:    v_and_b32_e32 v10, 0xff, v10
+; GFX11-NEXT:    v_or_b32_e32 v11, v11, v12
+; GFX11-NEXT:    s_waitcnt vmcnt(3)
+; GFX11-NEXT:    v_add_nc_u32_e32 v12, 3, v46
+; GFX11-NEXT:    v_add_nc_u32_e32 v16, 3, v181
+; GFX11-NEXT:    v_add_nc_u32_e32 v17, 3, v180
+; GFX11-NEXT:    v_or_b32_e32 v0, v76, v0
+; GFX11-NEXT:    v_or_b32_e32 v1, v77, v1
+; GFX11-NEXT:    v_or_b32_e32 v2, v78, v2
+; GFX11-NEXT:    v_or_b32_e32 v3, v79, v3
+; GFX11-NEXT:    v_or_b32_e32 v7, v63, v7
+; GFX11-NEXT:    v_or_b32_e32 v8, v72, v8
+; GFX11-NEXT:    v_or_b32_e32 v10, v73, v10
+; GFX11-NEXT:    v_and_b32_e32 v12, 0xff, v12
+; GFX11-NEXT:    v_and_b32_e32 v16, 0xff, v16
+; GFX11-NEXT:    v_and_b32_e32 v17, 0xff, v17
+; GFX11-NEXT:    v_add_nc_u32_e32 v0, 0x300, v0
+; GFX11-NEXT:    v_add_nc_u32_e32 v1, 0x300, v1
+; GFX11-NEXT:    v_add_nc_u32_e32 v2, 0x300, v2
+; GFX11-NEXT:    v_add_nc_u32_e32 v3, 0x300, v3
+; GFX11-NEXT:    v_add_nc_u32_e32 v7, 0x300, v7
+; GFX11-NEXT:    v_add_nc_u32_e32 v8, 0x300, v8
+; GFX11-NEXT:    v_add_nc_u32_e32 v10, 0x300, v10
+; GFX11-NEXT:    v_or_b32_e32 v12, v61, v12
+; GFX11-NEXT:    v_or_b32_e32 v16, v43, v16
+; GFX11-NEXT:    v_or_b32_e32 v17, v44, v17
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-NEXT:    v_lshlrev_b32_e32 v13, 16, v7
+; GFX11-NEXT:    v_and_b32_e32 v14, 0xffff, v8
+; GFX11-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
+; GFX11-NEXT:    v_add_nc_u32_e32 v12, 0x300, v12
+; GFX11-NEXT:    v_add_nc_u32_e32 v16, 0x300, v16
+; GFX11-NEXT:    v_add_nc_u32_e32 v17, 0x300, v17
+; GFX11-NEXT:    v_or_b32_e32 v7, v0, v1
+; GFX11-NEXT:    v_or_b32_e32 v8, v2, v3
+; GFX11-NEXT:    v_or_b32_e32 v9, v9, v13
+; GFX11-NEXT:    v_or_b32_e32 v10, v14, v10
+; GFX11-NEXT:    v_add_nc_u32_e32 v0, 3, v32
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_add_nc_u32_e32 v1, 3, v62
+; GFX11-NEXT:    v_add_nc_u32_e32 v2, 3, v56
+; GFX11-NEXT:    v_add_nc_u32_e32 v3, 3, v47
+; GFX11-NEXT:    v_and_b32_e32 v14, 0xffff, v12
+; GFX11-NEXT:    v_add_nc_u32_e32 v12, 3, v45
+; GFX11-NEXT:    v_add_nc_u32_e32 v13, 3, v183
+; GFX11-NEXT:    v_add_nc_u32_e32 v15, 3, v182
+; GFX11-NEXT:    v_and_b32_e32 v16, 0xffff, v16
+; GFX11-NEXT:    v_lshlrev_b32_e32 v17, 16, v17
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v3
+; GFX11-NEXT:    v_and_b32_e32 v12, 0xff, v12
+; GFX11-NEXT:    v_and_b32_e32 v13, 0xff, v13
+; GFX11-NEXT:    v_and_b32_e32 v15, 0xff, v15
+; GFX11-NEXT:    v_or_b32_e32 v16, v16, v17
+; GFX11-NEXT:    v_add_nc_u32_e32 v17, 3, v162
+; GFX11-NEXT:    v_add_nc_u32_e32 v21, 3, v145
+; GFX11-NEXT:    v_add_nc_u32_e32 v22, 3, v118
+; GFX11-NEXT:    v_or_b32_e32 v0, v57, v0
+; GFX11-NEXT:    v_or_b32_e32 v1, v58, v1
+; GFX11-NEXT:    v_or_b32_e32 v2, v59, v2
+; GFX11-NEXT:    v_or_b32_e32 v3, v60, v3
+; GFX11-NEXT:    v_or_b32_e32 v12, v40, v12
+; GFX11-NEXT:    v_or_b32_e32 v13, v41, v13
+; GFX11-NEXT:    v_or_b32_e32 v15, v42, v15
+; GFX11-NEXT:    v_and_b32_e32 v17, 0xff, v17
+; GFX11-NEXT:    v_and_b32_e32 v21, 0xff, v21
+; GFX11-NEXT:    v_and_b32_e32 v22, 0xff, v22
+; GFX11-NEXT:    v_add_nc_u32_e32 v0, 0x300, v0
+; GFX11-NEXT:    v_add_nc_u32_e32 v1, 0x300, v1
+; GFX11-NEXT:    v_add_nc_u32_e32 v2, 0x300, v2
+; GFX11-NEXT:    v_add_nc_u32_e32 v3, 0x300, v3
+; GFX11-NEXT:    v_add_nc_u32_e32 v12, 0x300, v12
+; GFX11-NEXT:    v_add_nc_u32_e32 v13, 0x300, v13
+; GFX11-NEXT:    v_add_nc_u32_e32 v15, 0x300, v15
+; GFX11-NEXT:    v_or_b32_e32 v17, v179, v17
+; GFX11-NEXT:    v_or_b32_e32 v21, v160, v21
+; GFX11-NEXT:    v_or_b32_e32 v22, v161, v22
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-NEXT:    v_lshlrev_b32_e32 v18, 16, v12
+; GFX11-NEXT:    v_and_b32_e32 v19, 0xffff, v13
+; GFX11-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
+; GFX11-NEXT:    v_add_nc_u32_e32 v17, 0x300, v17
+; GFX11-NEXT:    v_add_nc_u32_e32 v21, 0x300, v21
+; GFX11-NEXT:    v_add_nc_u32_e32 v22, 0x300, v22
+; GFX11-NEXT:    v_or_b32_e32 v12, v0, v1
+; GFX11-NEXT:    v_or_b32_e32 v13, v2, v3
+; GFX11-NEXT:    v_or_b32_e32 v14, v14, v18
+; GFX11-NEXT:    v_or_b32_e32 v15, v19, v15
+; GFX11-NEXT:    v_add_nc_u32_e32 v0, 3, v166
+; GFX11-NEXT:    v_add_nc_u32_e32 v1, 3, v165
+; GFX11-NEXT:    v_add_nc_u32_e32 v2, 3, v164
+; GFX11-NEXT:    v_add_nc_u32_e32 v3, 3, v163
+; GFX11-NEXT:    v_and_b32_e32 v19, 0xffff, v17
+; GFX11-NEXT:    v_add_nc_u32_e32 v17, 3, v148
+; GFX11-NEXT:    v_add_nc_u32_e32 v18, 3, v147
+; GFX11-NEXT:    v_add_nc_u32_e32 v20, 3, v146
+; GFX11-NEXT:    v_and_b32_e32 v21, 0xffff, v21
+; GFX11-NEXT:    v_lshlrev_b32_e32 v22, 16, v22
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v3
+; GFX11-NEXT:    v_and_b32_e32 v17, 0xff, v17
+; GFX11-NEXT:    v_and_b32_e32 v18, 0xff, v18
+; GFX11-NEXT:    v_and_b32_e32 v20, 0xff, v20
+; GFX11-NEXT:    v_or_b32_e32 v21, v21, v22
+; GFX11-NEXT:    v_add_nc_u32_e32 v22, 3, v100
+; GFX11-NEXT:    v_add_nc_u32_e32 v26, 3, v83
+; GFX11-NEXT:    v_add_nc_u32_e32 v27, 3, v82
+; GFX11-NEXT:    v_or_b32_e32 v0, v167, v0
+; GFX11-NEXT:    v_or_b32_e32 v1, v176, v1
+; GFX11-NEXT:    v_or_b32_e32 v2, v177, v2
+; GFX11-NEXT:    v_or_b32_e32 v3, v178, v3
+; GFX11-NEXT:    v_or_b32_e32 v17, v149, v17
+; GFX11-NEXT:    v_or_b32_e32 v18, v150, v18
+; GFX11-NEXT:    v_or_b32_e32 v20, v151, v20
+; GFX11-NEXT:    v_and_b32_e32 v22, 0xff, v22
+; GFX11-NEXT:    v_and_b32_e32 v26, 0xff, v26
+; GFX11-NEXT:    v_and_b32_e32 v27, 0xff, v27
+; GFX11-NEXT:    v_add_nc_u32_e32 v0, 0x300, v0
+; GFX11-NEXT:    v_add_nc_u32_e32 v1, 0x300, v1
+; GFX11-NEXT:    v_add_nc_u32_e32 v2, 0x300, v2
+; GFX11-NEXT:    v_add_nc_u32_e32 v3, 0x300, v3
+; GFX11-NEXT:    v_add_nc_u32_e32 v17, 0x300, v17
+; GFX11-NEXT:    v_add_nc_u32_e32 v18, 0x300, v18
+; GFX11-NEXT:    v_add_nc_u32_e32 v20, 0x300, v20
+; GFX11-NEXT:    v_or_b32_e32 v22, v144, v22
+; GFX11-NEXT:    v_or_b32_e32 v26, v130, v26
+; GFX11-NEXT:    v_or_b32_e32 v27, v131, v27
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-NEXT:    v_lshlrev_b32_e32 v23, 16, v17
+; GFX11-NEXT:    v_and_b32_e32 v24, 0xffff, v18
+; GFX11-NEXT:    v_lshlrev_b32_e32 v20, 16, v20
+; GFX11-NEXT:    v_add_nc_u32_e32 v22, 0x300, v22
+; GFX11-NEXT:    v_add_nc_u32_e32 v26, 0x300, v26
+; GFX11-NEXT:    v_add_nc_u32_e32 v27, 0x300, v27
+; GFX11-NEXT:    v_or_b32_e32 v17, v0, v1
+; GFX11-NEXT:    v_or_b32_e32 v18, v2, v3
+; GFX11-NEXT:    v_or_b32_e32 v19, v19, v23
+; GFX11-NEXT:    v_or_b32_e32 v20, v24, v20
+; GFX11-NEXT:    v_add_nc_u32_e32 v0, 3, v112
+; GFX11-NEXT:    v_add_nc_u32_e32 v1, 3, v103
+; GFX11-NEXT:    v_add_nc_u32_e32 v2, 3, v102
+; GFX11-NEXT:    v_add_nc_u32_e32 v3, 3, v101
+; GFX11-NEXT:    v_and_b32_e32 v24, 0xffff, v22
+; GFX11-NEXT:    v_add_nc_u32_e32 v22, 3, v86
+; GFX11-NEXT:    v_add_nc_u32_e32 v23, 3, v85
+; GFX11-NEXT:    v_add_nc_u32_e32 v25, 3, v84
+; GFX11-NEXT:    v_and_b32_e32 v26, 0xffff, v26
+; GFX11-NEXT:    v_lshlrev_b32_e32 v27, 16, v27
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v3
+; GFX11-NEXT:    v_and_b32_e32 v22, 0xff, v22
+; GFX11-NEXT:    v_and_b32_e32 v23, 0xff, v23
+; GFX11-NEXT:    v_and_b32_e32 v25, 0xff, v25
+; GFX11-NEXT:    v_or_b32_e32 v26, v26, v27
+; GFX11-NEXT:    v_add_nc_u32_e32 v27, 3, v69
+; GFX11-NEXT:    v_or_b32_e32 v0, v132, v0
+; GFX11-NEXT:    v_or_b32_e32 v1, v133, v1
+; GFX11-NEXT:    v_or_b32_e32 v2, v134, v2
+; GFX11-NEXT:    v_or_b32_e32 v3, v135, v3
+; GFX11-NEXT:    v_or_b32_e32 v22, v119, v22
+; GFX11-NEXT:    v_or_b32_e32 v23, v128, v23
+; GFX11-NEXT:    v_or_b32_e32 v25, v129, v25
+; GFX11-NEXT:    v_and_b32_e32 v27, 0xff, v27
+; GFX11-NEXT:    v_add_nc_u32_e32 v0, 0x300, v0
+; GFX11-NEXT:    v_add_nc_u32_e32 v1, 0x300, v1
+; GFX11-NEXT:    v_add_nc_u32_e32 v2, 0x300, v2
+; GFX11-NEXT:    v_add_nc_u32_e32 v3, 0x300, v3
+; GFX11-NEXT:    v_add_nc_u32_e32 v22, 0x300, v22
+; GFX11-NEXT:    v_add_nc_u32_e32 v23, 0x300, v23
+; GFX11-NEXT:    v_add_nc_u32_e32 v25, 0x300, v25
+; GFX11-NEXT:    v_or_b32_e32 v27, v117, v27
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-NEXT:    v_lshlrev_b32_e32 v28, 16, v22
+; GFX11-NEXT:    v_and_b32_e32 v29, 0xffff, v23
+; GFX11-NEXT:    v_lshlrev_b32_e32 v25, 16, v25
+; GFX11-NEXT:    v_add_nc_u32_e32 v27, 0x300, v27
+; GFX11-NEXT:    v_or_b32_e32 v22, v0, v1
+; GFX11-NEXT:    v_or_b32_e32 v23, v2, v3
+; GFX11-NEXT:    v_or_b32_e32 v24, v24, v28
+; GFX11-NEXT:    v_or_b32_e32 v25, v29, v25
+; GFX11-NEXT:    v_add_nc_u32_e32 v0, 3, v81
+; GFX11-NEXT:    v_add_nc_u32_e32 v1, 3, v80
+; GFX11-NEXT:    v_add_nc_u32_e32 v2, 3, v71
+; GFX11-NEXT:    v_add_nc_u32_e32 v3, 3, v70
+; GFX11-NEXT:    v_and_b32_e32 v29, 0xffff, v27
+; GFX11-NEXT:    v_add_nc_u32_e32 v27, 3, v68
+; GFX11-NEXT:    v_add_nc_u32_e32 v28, 3, v67
+; GFX11-NEXT:    v_add_nc_u32_e32 v30, 3, v66
+; GFX11-NEXT:    v_add_nc_u32_e32 v31, 3, v65
+; GFX11-NEXT:    v_add_nc_u32_e32 v32, 3, v64
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v3
+; GFX11-NEXT:    v_and_b32_e32 v27, 0xff, v27
+; GFX11-NEXT:    v_and_b32_e32 v28, 0xff, v28
+; GFX11-NEXT:    v_and_b32_e32 v30, 0xff, v30
+; GFX11-NEXT:    v_and_b32_e32 v31, 0xff, v31
+; GFX11-NEXT:    v_and_b32_e32 v32, 0xff, v32
+; GFX11-NEXT:    v_or_b32_e32 v0, v113, v0
+; GFX11-NEXT:    v_or_b32_e32 v1, v114, v1
+; GFX11-NEXT:    v_or_b32_e32 v2, v115, v2
+; GFX11-NEXT:    v_or_b32_e32 v3, v116, v3
+; GFX11-NEXT:    v_or_b32_e32 v27, v87, v27
+; GFX11-NEXT:    v_or_b32_e32 v28, v96, v28
+; GFX11-NEXT:    v_or_b32_e32 v30, v97, v30
+; GFX11-NEXT:    v_or_b32_e32 v31, v98, v31
+; GFX11-NEXT:    v_or_b32_e32 v32, v99, v32
+; GFX11-NEXT:    v_add_nc_u32_e32 v0, 0x300, v0
+; GFX11-NEXT:    v_add_nc_u32_e32 v1, 0x300, v1
+; GFX11-NEXT:    v_add_nc_u32_e32 v2, 0x300, v2
+; GFX11-NEXT:    v_add_nc_u32_e32 v3, 0x300, v3
+; GFX11-NEXT:    v_add_nc_u32_e32 v27, 0x300, v27
+; GFX11-NEXT:    v_add_nc_u32_e32 v28, 0x300, v28
+; GFX11-NEXT:    v_add_nc_u32_e32 v30, 0x300, v30
+; GFX11-NEXT:    v_add_nc_u32_e32 v31, 0x300, v31
+; GFX11-NEXT:    v_add_nc_u32_e32 v32, 0x300, v32
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-NEXT:    v_lshlrev_b32_e32 v33, 16, v27
+; GFX11-NEXT:    v_and_b32_e32 v34, 0xffff, v28
+; GFX11-NEXT:    v_lshlrev_b32_e32 v30, 16, v30
+; GFX11-NEXT:    v_and_b32_e32 v31, 0xffff, v31
+; GFX11-NEXT:    v_lshlrev_b32_e32 v32, 16, v32
+; GFX11-NEXT:    v_or_b32_e32 v27, v0, v1
+; GFX11-NEXT:    v_or_b32_e32 v28, v2, v3
+; GFX11-NEXT:    v_or_b32_e32 v29, v29, v33
+; GFX11-NEXT:    v_or_b32_e32 v30, v34, v30
+; GFX11-NEXT:    v_or_b32_e32 v31, v31, v32
+; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-NEXT:  .LBB39_3: ; %end
+; GFX11-NEXT:    s_clause 0x1f
+; GFX11-NEXT:    scratch_load_b32 v111, off, s32 offset:320
+; GFX11-NEXT:    scratch_load_b32 v110, off, s32 offset:324
+; GFX11-NEXT:    scratch_load_b32 v109, off, s32 offset:328
+; GFX11-NEXT:    scratch_load_b32 v108, off, s32 offset:332
+; GFX11-NEXT:    scratch_load_b32 v107, off, s32 offset:336
+; GFX11-NEXT:    scratch_load_b32 v106, off, s32 offset:340
+; GFX11-NEXT:    scratch_load_b32 v105, off, s32 offset:344
+; GFX11-NEXT:    scratch_load_b32 v104, off, s32 offset:348
+; GFX11-NEXT:    scratch_load_b32 v95, off, s32 offset:352
+; GFX11-NEXT:    scratch_load_b32 v94, off, s32 offset:356
+; GFX11-NEXT:    scratch_load_b32 v93, off, s32 offset:360
+; GFX11-NEXT:    scratch_load_b32 v92, off, s32 offset:364
+; GFX11-NEXT:    scratch_load_b32 v91, off, s32 offset:368
+; GFX11-NEXT:    scratch_load_b32 v90, off, s32 offset:372
+; GFX11-NEXT:    scratch_load_b32 v89, off, s32 offset:376
+; GFX11-NEXT:    scratch_load_b32 v88, off, s32 offset:380
+; GFX11-NEXT:    scratch_load_b32 v79, off, s32 offset:384
+; GFX11-NEXT:    scratch_load_b32 v78, off, s32 offset:388
+; GFX11-NEXT:    scratch_load_b32 v77, off, s32 offset:392
+; GFX11-NEXT:    scratch_load_b32 v76, off, s32 offset:396
+; GFX11-NEXT:    scratch_load_b32 v75, off, s32 offset:400
+; GFX11-NEXT:    scratch_load_b32 v74, off, s32 offset:404
+; GFX11-NEXT:    scratch_load_b32 v73, off, s32 offset:408
+; GFX11-NEXT:    scratch_load_b32 v72, off, s32 offset:412
+; GFX11-NEXT:    scratch_load_b32 v63, off, s32 offset:416
+; GFX11-NEXT:    scratch_load_b32 v62, off, s32 offset:420
+; GFX11-NEXT:    scratch_load_b32 v61, off, s32 offset:424
+; GFX11-NEXT:    scratch_load_b32 v60, off, s32 offset:428
+; GFX11-NEXT:    scratch_load_b32 v59, off, s32 offset:432
+; GFX11-NEXT:    scratch_load_b32 v58, off, s32 offset:436
+; GFX11-NEXT:    scratch_load_b32 v57, off, s32 offset:440
+; GFX11-NEXT:    scratch_load_b32 v56, off, s32 offset:444
+; GFX11-NEXT:    s_clause 0x7
+; GFX11-NEXT:    scratch_load_b32 v47, off, s32 offset:448
+; GFX11-NEXT:    scratch_load_b32 v46, off, s32 offset:452
+; GFX11-NEXT:    scratch_load_b32 v45, off, s32 offset:456
+; GFX11-NEXT:    scratch_load_b32 v44, off, s32 offset:460
+; GFX11-NEXT:    scratch_load_b32 v43, off, s32 offset:464
+; GFX11-NEXT:    scratch_load_b32 v42, off, s32 offset:468
+; GFX11-NEXT:    scratch_load_b32 v41, off, s32 offset:472
+; GFX11-NEXT:    scratch_load_b32 v40, off, s32 offset:476
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-NEXT:  .LBB39_4:
+; GFX11-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+; GFX11-NEXT:    s_branch .LBB39_2
   %cmp = icmp eq i32 %b, 0
   br i1 %cmp, label %cmp.true, label %cmp.false
 
@@ -86760,876 +85034,913 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) {
 ; GFX11-TRUE16:       ; %bb.0:
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-TRUE16-NEXT:    s_clause 0x1f
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v31, off, s32 offset:384
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v32, off, s32 offset:380
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v31, off, s32 offset:376
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v32, off, s32 offset:372
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v50, off, s32 offset:368
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v33, off, s32 offset:364
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v51, off, s32 offset:360
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v33, off, s32 offset:356
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v51, off, s32 offset:352
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v34, off, s32 offset:348
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v52, off, s32 offset:344
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v34, off, s32 offset:340
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v52, off, s32 offset:336
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v35, off, s32 offset:332
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v53, off, s32 offset:328
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v35, off, s32 offset:324
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v53, off, s32 offset:320
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v36, off, s32 offset:316
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v54, off, s32 offset:312
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v36, off, s32 offset:308
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v54, off, s32 offset:304
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v37, off, s32 offset:300
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v55, off, s32 offset:296
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v37, off, s32 offset:292
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v64, off, s32 offset:288
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v38, off, s32 offset:284
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v64, off, s32 offset:280
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v38, off, s32 offset:276
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v66, off, s32 offset:272
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v39, off, s32 offset:268
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v66, off, s32 offset:264
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v39, off, s32 offset:260
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v40, s32 offset:536
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v41, s32 offset:532
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v42, s32 offset:528
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v43, s32 offset:524
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v44, s32 offset:520
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v45, s32 offset:516
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v46, s32 offset:512
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v47, s32 offset:508
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v56, s32 offset:504
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v57, s32 offset:500
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v58, s32 offset:496
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v59, s32 offset:492
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v60, s32 offset:488
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v61, s32 offset:484
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v62, s32 offset:480
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v63, s32 offset:476
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v72, s32 offset:472
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v73, s32 offset:468
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v74, s32 offset:464
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v75, s32 offset:460
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v76, s32 offset:456
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v77, s32 offset:452
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v78, s32 offset:448
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v79, s32 offset:444
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v88, s32 offset:440
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v89, s32 offset:436
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v90, s32 offset:432
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v91, s32 offset:428
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v92, s32 offset:424
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v93, s32 offset:420
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v94, s32 offset:416
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v95, s32 offset:412
+; GFX11-TRUE16-NEXT:    s_clause 0x4
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v104, s32 offset:408
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v105, s32 offset:404
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v106, s32 offset:400
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v107, s32 offset:396
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v108, s32 offset:392
 ; GFX11-TRUE16-NEXT:    s_clause 0x1f
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v67, off, s32 offset:256
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v48, off, s32 offset:252
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v68, off, s32 offset:248
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v48, off, s32 offset:244
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v70, off, s32 offset:240
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v49, off, s32 offset:236
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v70, off, s32 offset:232
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v49, off, s32 offset:228
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v71, off, s32 offset:224
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v50, off, s32 offset:220
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v71, off, s32 offset:216
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v114, off, s32 offset:388
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v81, off, s32
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v82, off, s32 offset:8
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v83, off, s32 offset:16
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v83, off, s32 offset:24
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v85, off, s32 offset:32
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v85, off, s32 offset:40
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v87, off, s32 offset:48
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v87, off, s32 offset:56
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v97, off, s32 offset:64
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v97, off, s32 offset:72
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v98, off, s32 offset:80
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v99, off, s32 offset:88
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v100, off, s32 offset:96
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v101, off, s32 offset:104
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v102, off, s32 offset:112
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v160, off, s32 offset:120
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v160, off, s32 offset:128
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v161, off, s32 offset:136
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v161, off, s32 offset:144
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v162, off, s32 offset:152
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v31, off, s32 offset:384
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v112, off, s32 offset:380
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v32, off, s32 offset:376
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v113, off, s32 offset:372
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v56, off, s32 offset:368
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v114, off, s32 offset:364
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v33, off, s32 offset:360
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v115, off, s32 offset:356
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v57, off, s32 offset:352
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v116, off, s32 offset:348
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v34, off, s32 offset:344
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v117, off, s32 offset:340
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v58, off, s32 offset:336
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v118, off, s32 offset:332
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v35, off, s32 offset:328
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v119, off, s32 offset:324
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v59, off, s32 offset:320
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v128, off, s32 offset:316
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v36, off, s32 offset:312
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v129, off, s32 offset:308
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v60, off, s32 offset:304
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v130, off, s32 offset:300
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v37, off, s32 offset:296
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v131, off, s32 offset:292
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v61, off, s32 offset:288
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v132, off, s32 offset:284
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v38, off, s32 offset:280
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v133, off, s32 offset:276
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v62, off, s32 offset:272
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v134, off, s32 offset:268
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v39, off, s32 offset:264
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v135, off, s32 offset:260
 ; GFX11-TRUE16-NEXT:    s_clause 0x1f
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v162, off, s32 offset:160
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v163, off, s32 offset:168
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v163, off, s32 offset:176
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v164, off, s32 offset:184
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v164, off, s32 offset:192
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v165, off, s32 offset:200
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v165, off, s32 offset:208
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v55, off, s32 offset:212
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v65, off, s32 offset:204
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v65, off, s32 offset:196
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v67, off, s32 offset:188
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v68, off, s32 offset:180
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v69, off, s32 offset:172
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v69, off, s32 offset:164
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v80, off, s32 offset:156
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v80, off, s32 offset:148
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v81, off, s32 offset:140
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v82, off, s32 offset:132
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v84, off, s32 offset:124
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v84, off, s32 offset:116
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v86, off, s32 offset:108
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v86, off, s32 offset:100
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v96, off, s32 offset:92
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v96, off, s32 offset:84
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v98, off, s32 offset:76
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v99, off, s32 offset:68
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v100, off, s32 offset:60
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v101, off, s32 offset:52
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v103, off, s32 offset:44
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v103, off, s32 offset:36
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v112, off, s32 offset:28
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v113, off, s32 offset:20
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v63, off, s32 offset:256
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v144, off, s32 offset:252
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v48, off, s32 offset:248
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v145, off, s32 offset:244
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v72, off, s32 offset:240
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v146, off, s32 offset:236
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v49, off, s32 offset:232
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v147, off, s32 offset:228
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v73, off, s32 offset:224
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v148, off, s32 offset:220
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v50, off, s32 offset:216
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v51, off, s32 offset:388
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v52, off, s32
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v53, off, s32 offset:8
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v54, off, s32 offset:16
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v55, off, s32 offset:24
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v64, off, s32 offset:32
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v65, off, s32 offset:40
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v66, off, s32 offset:48
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v67, off, s32 offset:56
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v74, off, s32 offset:64
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v75, off, s32 offset:72
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v76, off, s32 offset:80
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v77, off, s32 offset:88
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v78, off, s32 offset:96
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v79, off, s32 offset:104
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v88, off, s32 offset:112
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v89, off, s32 offset:120
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v90, off, s32 offset:128
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v91, off, s32 offset:136
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v92, off, s32 offset:144
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v93, off, s32 offset:152
+; GFX11-TRUE16-NEXT:    s_clause 0x1f
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v94, off, s32 offset:160
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v95, off, s32 offset:168
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v104, off, s32 offset:176
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v105, off, s32 offset:184
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v106, off, s32 offset:192
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v107, off, s32 offset:200
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v108, off, s32 offset:208
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v149, off, s32 offset:212
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v150, off, s32 offset:204
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v151, off, s32 offset:196
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v160, off, s32 offset:188
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v161, off, s32 offset:180
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v162, off, s32 offset:172
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v163, off, s32 offset:164
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v164, off, s32 offset:156
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v165, off, s32 offset:148
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v166, off, s32 offset:140
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v167, off, s32 offset:132
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v176, off, s32 offset:124
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v177, off, s32 offset:116
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v178, off, s32 offset:108
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v179, off, s32 offset:100
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v180, off, s32 offset:92
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v181, off, s32 offset:84
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v182, off, s32 offset:76
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v183, off, s32 offset:68
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v40, off, s32 offset:60
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v41, off, s32 offset:52
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v42, off, s32 offset:44
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v43, off, s32 offset:36
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v44, off, s32 offset:28
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v45, off, s32 offset:20
 ; GFX11-TRUE16-NEXT:    s_clause 0x1
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v115, off, s32 offset:12
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v115, off, s32 offset:4
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v117.l, v30.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v118.h, v28.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v119.l, v26.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v119.h, v24.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v130.l, v22.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v130.h, v20.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v131.l, v18.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v132.h, v16.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v134.l, v14.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v134.h, v12.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v144.l, v10.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v145.h, v8.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v146.h, v6.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v146.l, v4.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v149.l, v2.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v149.h, v0.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v151.l, 8, v1.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v151.h, 8, v3.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v150.l, 8, v5.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v150.h, 8, v7.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v148.l, 8, v9.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v148.h, 8, v11.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v147.l, 8, v13.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v147.h, 8, v15.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v144.h, 8, v17.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v145.l, 8, v19.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v135.l, 8, v21.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v135.h, 8, v23.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v133.l, 8, v25.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v133.h, 8, v27.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v131.h, 8, v29.l
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v46, off, s32 offset:12
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v47, off, s32 offset:4
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v69.l, v30.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v70.h, v28.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v71.l, v26.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v71.h, v24.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v82.l, v22.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v82.h, v20.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v83.l, v18.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v84.h, v16.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v86.l, v14.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v86.h, v12.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v96.l, v10.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v97.h, v8.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v98.h, v6.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v98.l, v4.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v101.l, v2.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v101.h, v0.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v103.l, 8, v1.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v103.h, 8, v3.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v102.l, 8, v5.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v102.h, 8, v7.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v100.l, 8, v9.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v100.h, 8, v11.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v99.l, 8, v13.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v99.h, 8, v15.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v96.h, 8, v17.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v97.l, 8, v19.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v87.l, 8, v21.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v87.h, 8, v23.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v85.l, 8, v25.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v85.h, 8, v27.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v83.h, 8, v29.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(62)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v32.h, 8, v31.l
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v32.l, 8, v32.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v33.h, 8, v56.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v33.l, 8, v33.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v34.h, 8, v57.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v34.l, 8, v34.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v35.h, 8, v58.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v35.l, 8, v35.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v36.h, 8, v59.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v36.l, 8, v36.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v37.h, 8, v60.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v37.l, 8, v37.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v38.h, 8, v61.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v38.l, 8, v38.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v39.h, 8, v62.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v39.l, 8, v39.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v48.h, 8, v63.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v48.l, 8, v48.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(59)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v49.h, 8, v72.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v49.l, 8, v49.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(55)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v50.h, 8, v73.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v50.l, 8, v50.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(54)
-; GFX11-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v114
+; GFX11-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v51
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(53)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v132.l, 8, v81.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v84.l, 8, v52.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(52)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v129.l, 8, v82.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v81.l, 8, v53.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(51)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v129.h, 8, v83.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v81.h, 8, v54.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(50)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v128.l, 8, v83.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v80.l, 8, v55.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(49)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v128.h, 8, v85.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v80.h, 8, v64.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(48)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v117.h, 8, v85.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v69.h, 8, v65.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(47)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v118.l, 8, v87.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v70.l, 8, v66.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(46)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v116.l, 8, v87.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v68.l, 8, v67.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(45)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v116.h, 8, v97.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v68.h, 8, v74.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(44)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v114.l, 8, v97.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v67.l, 8, v75.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(43)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v114.h, 8, v98.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v67.h, 8, v76.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(42)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v112.h, 8, v99.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v66.l, 8, v77.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(41)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v113.l, 8, v100.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v66.h, 8, v78.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(40)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v102.l, 8, v101.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v65.l, 8, v79.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(39)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v102.h, 8, v102.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v65.h, 8, v88.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(38)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v100.h, 8, v160.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v64.l, 8, v89.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(37)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v101.l, 8, v160.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v64.h, 8, v90.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(36)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v98.h, 8, v161.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v55.l, 8, v91.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(35)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v99.l, 8, v161.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v55.h, 8, v92.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(34)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v97.l, 8, v162.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v54.l, 8, v93.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(33)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v97.h, 8, v162.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v54.h, 8, v94.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(32)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v87.l, 8, v163.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v53.l, 8, v95.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(31)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v87.h, 8, v163.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v53.h, 8, v104.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(30)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v85.l, 8, v164.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v52.l, 8, v105.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(29)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v85.h, 8, v164.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v52.h, 8, v106.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(28)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v83.l, 8, v165.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v51.l, 8, v107.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(27)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v83.h, 8, v165.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v81.h, 8, v71.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v82.l, 8, v71.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v71.l, 8, v70.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v71.h, 8, v70.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v70.l, 8, v68.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v70.h, 8, v67.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v67.h, 8, v66.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v68.l, 8, v66.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v66.l, 8, v64.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v66.h, 8, v64.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v64.l, 8, v55.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v64.h, 8, v54.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v54.h, 8, v54.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v55.l, 8, v53.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v53.h, 8, v53.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v54.l, 8, v52.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v52.h, 8, v52.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v53.l, 8, v51.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v51.h, 8, v51.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v52.l, 8, v50.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v50.h, 8, v31.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v51.l, 8, v31.l
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v51.h, 8, v108.l
 ; GFX11-TRUE16-NEXT:    s_and_saveexec_b32 s0, vcc_lo
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-TRUE16-NEXT:    s_xor_b32 s0, exec_lo, s0
-; GFX11-TRUE16-NEXT:    s_cbranch_execnz .LBB58_3
-; GFX11-TRUE16-NEXT:  ; %bb.1: ; %Flow
-; GFX11-TRUE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX11-TRUE16-NEXT:    s_cbranch_execnz .LBB58_4
-; GFX11-TRUE16-NEXT:  .LBB58_2: ; %end
-; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
-; GFX11-TRUE16-NEXT:  .LBB58_3: ; %cmp.false
-; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v149.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v149.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v149.h, 0
-; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v146.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v146.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v149.l, v0.l, v151.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v0.h, v151.h
-; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v0.l, v149.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v3.h, v1.l, v150.h
-; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v3.l, v149.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v145.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v2.h, 0xff, v144.l
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v149, v0
-; GFX11-TRUE16-NEXT:    v_or_b16 v149.l, v1.h, v150.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v4.l, 0xff, v132.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v4.h, 0xff, v131.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v2.h, v2.h, v148.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v5.l, 0xff, v130.h
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v149, v3
-; GFX11-TRUE16-NEXT:    v_or_b16 v149.l, v2.l, v148.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v2.l, v149.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v3.l, 0xff, v134.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v3.h, 0xff, v134.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v4.h, v4.h, v145.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v5.h, 0xff, v130.l
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v2, v149, v2
-; GFX11-TRUE16-NEXT:    v_or_b16 v149.l, v3.l, v147.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v3.h, v3.h, v147.h
-; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v3.l, v149.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v5.h, v5.h, v135.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v6.l, 0xff, v119.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v6.h, 0xff, v119.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v7.l, 0xff, v118.h
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v3, v149, v3
-; GFX11-TRUE16-NEXT:    v_or_b16 v149.l, v4.l, v144.h
-; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v4.l, v149.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v6.h, v6.h, v133.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v7.h, 0xff, v117.l
+; GFX11-TRUE16-NEXT:    s_cbranch_execz .LBB58_2
+; GFX11-TRUE16-NEXT:  ; %bb.1: ; %cmp.false
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v101.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v101.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v101.h, 0
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v98.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v98.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v101.l, v0.l, v103.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v0.h, v103.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v101.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v3.h, v1.l, v102.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v101.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v97.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.h, 0xff, v96.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v101, v0
+; GFX11-TRUE16-NEXT:    v_or_b16 v101.l, v1.h, v102.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v4.l, 0xff, v84.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v4.h, 0xff, v83.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.h, v2.h, v100.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v5.l, 0xff, v82.h
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v101, v3
+; GFX11-TRUE16-NEXT:    v_or_b16 v101.l, v2.l, v100.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v101.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v3.l, 0xff, v86.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v3.h, 0xff, v86.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v4.h, v4.h, v97.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v5.h, 0xff, v82.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v2, v101, v2
+; GFX11-TRUE16-NEXT:    v_or_b16 v101.l, v3.l, v99.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v3.h, v3.h, v99.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v101.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v5.h, v5.h, v87.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v6.l, 0xff, v71.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v6.h, 0xff, v71.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v7.l, 0xff, v70.h
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v3, v101, v3
+; GFX11-TRUE16-NEXT:    v_or_b16 v101.l, v4.l, v96.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v4.l, v101.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v6.h, v6.h, v85.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v7.h, 0xff, v69.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    v_and_b16 v8.l, 0xff, v115.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v8.h, 0xff, v115.l
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v4, v149, v4
-; GFX11-TRUE16-NEXT:    v_or_b16 v149.l, v5.l, v135.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v5.l, v149.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v7.h, v7.h, v132.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v8.h, v8.h, v129.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v9.l, 0xff, v113.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v9.h, 0xff, v112.l
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v5, v149, v5
-; GFX11-TRUE16-NEXT:    v_or_b16 v149.l, v6.l, v133.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v6.l, v149.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v10.l, 0xff, v103.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v9.h, v9.h, v128.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v10.h, 0xff, v103.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v11.l, 0xff, v101.h
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v6, v149, v6
-; GFX11-TRUE16-NEXT:    v_or_b16 v149.l, v7.l, v131.h
-; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v7.l, v149.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v10.h, v10.h, v118.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v11.h, 0xff, v100.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v12.l, 0xff, v99.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v12.h, 0xff, v98.l
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v7, v149, v7
-; GFX11-TRUE16-NEXT:    v_or_b16 v149.l, v8.l, v129.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v8.l, v149.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v11.h, v11.h, v116.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v12.h, v12.h, v114.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v13.l, 0xff, v96.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v13.h, 0xff, v96.l
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v8, v149, v8
-; GFX11-TRUE16-NEXT:    v_or_b16 v149.l, v9.l, v128.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v9.l, v149.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v14.l, 0xff, v86.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v13.h, v13.h, v113.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v14.h, 0xff, v86.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v15.l, 0xff, v84.h
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v9, v149, v9
-; GFX11-TRUE16-NEXT:    v_or_b16 v149.l, v10.l, v117.h
-; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v10.l, v149.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v14.h, v14.h, v102.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v15.h, 0xff, v84.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v16.l, 0xff, v82.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v16.h, 0xff, v81.l
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v10, v149, v10
-; GFX11-TRUE16-NEXT:    v_or_b16 v149.l, v11.l, v116.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v11.l, v149.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v15.h, v15.h, v101.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v16.h, v16.h, v99.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v17.l, 0xff, v80.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v17.h, 0xff, v80.l
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v11, v149, v11
-; GFX11-TRUE16-NEXT:    v_or_b16 v149.l, v12.l, v114.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v12.l, v149.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v18.l, 0xff, v69.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v17.h, v17.h, v97.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v18.h, 0xff, v69.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v19.l, 0xff, v68.h
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v12, v149, v12
-; GFX11-TRUE16-NEXT:    v_or_b16 v149.l, v13.l, v112.h
-; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v13.l, v149.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v18.h, v18.h, v87.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v19.h, 0xff, v67.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v20.l, 0xff, v65.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v20.h, 0xff, v65.l
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v13, v149, v13
-; GFX11-TRUE16-NEXT:    v_or_b16 v149.l, v14.l, v102.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v14.l, v149.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v19.h, v19.h, v85.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v20.h, v20.h, v83.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v21.l, 0xff, v55.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v21.h, 0xff, v50.l
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v14, v149, v14
-; GFX11-TRUE16-NEXT:    v_or_b16 v149.l, v15.l, v100.h
-; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v15.l, v149.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v22.l, 0xff, v49.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v21.h, v21.h, v82.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v22.h, 0xff, v49.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v23.l, 0xff, v48.h
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v15, v149, v15
-; GFX11-TRUE16-NEXT:    v_or_b16 v149.l, v16.l, v98.h
-; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v16.l, v149.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v22.h, v22.h, v71.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v23.h, 0xff, v48.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v24.l, 0xff, v39.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v24.h, 0xff, v39.l
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v16, v149, v16
-; GFX11-TRUE16-NEXT:    v_or_b16 v149.l, v17.l, v97.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v17.l, v149.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v23.h, v23.h, v70.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v24.h, v24.h, v68.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v25.l, 0xff, v38.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v25.h, 0xff, v38.l
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v17, v149, v17
-; GFX11-TRUE16-NEXT:    v_or_b16 v149.l, v18.l, v87.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v18.l, v149.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v26.l, 0xff, v37.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v25.h, v25.h, v66.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v26.h, 0xff, v37.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v27.l, 0xff, v36.h
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v18, v149, v18
-; GFX11-TRUE16-NEXT:    v_or_b16 v149.l, v19.l, v85.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v19.l, v149.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v26.h, v26.h, v64.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v27.h, 0xff, v36.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v28.l, 0xff, v35.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v28.h, 0xff, v35.l
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v19, v149, v19
-; GFX11-TRUE16-NEXT:    v_or_b16 v149.l, v20.l, v83.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v20.l, v149.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v27.h, v27.h, v55.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v28.h, v28.h, v54.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v29.l, 0xff, v34.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v29.h, 0xff, v34.l
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v20, v149, v20
-; GFX11-TRUE16-NEXT:    v_or_b16 v149.l, v21.l, v81.h
-; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v21.l, v149.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v30.l, 0xff, v33.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v29.h, v29.h, v53.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v30.h, 0xff, v33.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v31.l, 0xff, v32.h
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v21, v149, v21
-; GFX11-TRUE16-NEXT:    v_or_b16 v149.l, v22.l, v71.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v22.l, v149.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v30.h, v30.h, v52.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v31.h, 0xff, v32.l
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr146_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr146_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr145_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr144_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr134_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr134_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr132_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr131_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr130_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr130_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr119_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr119_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr118_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr117_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr115_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr115_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr113_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr112_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr103_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr103_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr101_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr100_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr99_hi16
+; GFX11-TRUE16-NEXT:    v_and_b16 v8.l, 0xff, v47.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v8.h, 0xff, v46.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v4, v101, v4
+; GFX11-TRUE16-NEXT:    v_or_b16 v101.l, v5.l, v87.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v101.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v7.h, v7.h, v84.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v8.h, v8.h, v81.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v9.l, 0xff, v45.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v9.h, 0xff, v44.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v5, v101, v5
+; GFX11-TRUE16-NEXT:    v_or_b16 v101.l, v6.l, v85.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v6.l, v101.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v10.l, 0xff, v43.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v9.h, v9.h, v80.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v10.h, 0xff, v42.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v11.l, 0xff, v41.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v6, v101, v6
+; GFX11-TRUE16-NEXT:    v_or_b16 v101.l, v7.l, v83.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v7.l, v101.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v10.h, v10.h, v70.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v11.h, 0xff, v40.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v12.l, 0xff, v183.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v12.h, 0xff, v182.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v7, v101, v7
+; GFX11-TRUE16-NEXT:    v_or_b16 v101.l, v8.l, v81.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v8.l, v101.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v11.h, v11.h, v68.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v12.h, v12.h, v67.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v13.l, 0xff, v181.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v13.h, 0xff, v180.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v8, v101, v8
+; GFX11-TRUE16-NEXT:    v_or_b16 v101.l, v9.l, v80.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v9.l, v101.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v14.l, 0xff, v179.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v13.h, v13.h, v66.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v14.h, 0xff, v178.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v15.l, 0xff, v177.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v9, v101, v9
+; GFX11-TRUE16-NEXT:    v_or_b16 v101.l, v10.l, v69.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v10.l, v101.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v14.h, v14.h, v65.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v15.h, 0xff, v176.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v16.l, 0xff, v167.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v16.h, 0xff, v166.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v10, v101, v10
+; GFX11-TRUE16-NEXT:    v_or_b16 v101.l, v11.l, v68.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v11.l, v101.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v15.h, v15.h, v64.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v16.h, v16.h, v55.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v17.l, 0xff, v165.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v17.h, 0xff, v164.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v11, v101, v11
+; GFX11-TRUE16-NEXT:    v_or_b16 v101.l, v12.l, v67.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v12.l, v101.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v18.l, 0xff, v163.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v17.h, v17.h, v54.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v18.h, 0xff, v162.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v19.l, 0xff, v161.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v12, v101, v12
+; GFX11-TRUE16-NEXT:    v_or_b16 v101.l, v13.l, v66.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v13.l, v101.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v18.h, v18.h, v53.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v19.h, 0xff, v160.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v20.l, 0xff, v151.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v20.h, 0xff, v150.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v13, v101, v13
+; GFX11-TRUE16-NEXT:    v_or_b16 v101.l, v14.l, v65.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v14.l, v101.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v19.h, v19.h, v52.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v20.h, v20.h, v51.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v21.l, 0xff, v149.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v21.h, 0xff, v148.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v14, v101, v14
+; GFX11-TRUE16-NEXT:    v_or_b16 v101.l, v15.l, v64.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v15.l, v101.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v22.l, 0xff, v147.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v21.h, v21.h, v50.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v22.h, 0xff, v146.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v23.l, 0xff, v145.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v15, v101, v15
+; GFX11-TRUE16-NEXT:    v_or_b16 v101.l, v16.l, v55.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v16.l, v101.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v22.h, v22.h, v49.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v23.h, 0xff, v144.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v24.l, 0xff, v135.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v24.h, 0xff, v134.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v16, v101, v16
+; GFX11-TRUE16-NEXT:    v_or_b16 v101.l, v17.l, v54.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v17.l, v101.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v23.h, v23.h, v48.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v24.h, v24.h, v39.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v25.l, 0xff, v133.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v25.h, 0xff, v132.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v17, v101, v17
+; GFX11-TRUE16-NEXT:    v_or_b16 v101.l, v18.l, v53.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v18.l, v101.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v26.l, 0xff, v131.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v25.h, v25.h, v38.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v26.h, 0xff, v130.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v27.l, 0xff, v129.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v18, v101, v18
+; GFX11-TRUE16-NEXT:    v_or_b16 v101.l, v19.l, v52.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v19.l, v101.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v26.h, v26.h, v37.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v27.h, 0xff, v128.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v28.l, 0xff, v119.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v28.h, 0xff, v118.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v19, v101, v19
+; GFX11-TRUE16-NEXT:    v_or_b16 v101.l, v20.l, v51.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v20.l, v101.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v27.h, v27.h, v36.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v28.h, v28.h, v35.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v29.l, 0xff, v117.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v29.h, 0xff, v116.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v20, v101, v20
+; GFX11-TRUE16-NEXT:    v_or_b16 v101.l, v21.l, v50.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v21.l, v101.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v30.l, 0xff, v115.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v29.h, v29.h, v34.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v30.h, 0xff, v114.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v31.l, 0xff, v113.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v21, v101, v21
+; GFX11-TRUE16-NEXT:    v_or_b16 v101.l, v22.l, v49.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v22.l, v101.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v30.h, v30.h, v33.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v31.h, 0xff, v112.l
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr98_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr96_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr98_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr97_hi16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr96_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr86_hi16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr86_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr84_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr84_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr83_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr82_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr81_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr80_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr80_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr69_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr82_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr71_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr71_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr70_hi16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr69_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr68_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr67_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr65_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr65_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr55_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr50_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr49_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr49_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr48_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr48_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr39_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr39_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr38_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr38_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr37_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr37_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr36_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr36_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr35_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr35_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr34_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr34_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr33_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr33_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr32_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr32_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr151_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr151_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr150_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr150_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr148_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr148_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr147_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr147_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr144_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr145_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr135_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr135_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr133_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr133_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr131_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr132_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr129_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr129_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr128_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr128_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr117_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr118_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr116_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr116_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr114_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr114_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr112_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr113_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr112
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr113
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr114
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr115
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr116
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr117
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr118
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr119
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr128
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr129
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr130
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr131
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr132
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr133
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr134
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr135
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr144
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr145
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr146
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr147
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr148
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr149
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr150
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr151
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr160
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr161
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr162
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr163
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr164
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr165
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr166
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr167
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr176
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr177
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr178
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr179
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr180
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr181
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr182
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr183
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr40
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr41
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr42
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr43
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr44
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr45
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr46
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr47
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr103_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr103_hi16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr102_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr102_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr100_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr100_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr101_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr98_hi16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr99_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr99_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr96_hi16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr97_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr97_hi16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr87_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr87_hi16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr85_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr85_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr83_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr83_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr84_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr81_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr81_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr82_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr71_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr71_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr70_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr80_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr80_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr69_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr70_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr68_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr68_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr67_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr67_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr66_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr66_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr65_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr65_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr64_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr64_hi16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr55_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr55_hi16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr54_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr54_hi16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr53_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr53_hi16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr52_lo16
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v22, v149, v22
-; GFX11-TRUE16-NEXT:    v_or_b16 v149.l, v23.l, v70.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v23.l, v149.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v31.h, v31.h, v51.l
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr70_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr52_hi16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr51_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr51_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr50_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr50_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr49_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr49_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr48_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr39_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr38_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr37_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr36_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr35_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr34_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr33_hi16
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v22, v101, v22
+; GFX11-TRUE16-NEXT:    v_or_b16 v101.l, v23.l, v48.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v23.l, v101.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v31.h, v31.h, v32.h
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr48_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr32_hi16
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v23, v149, v23
-; GFX11-TRUE16-NEXT:    v_or_b16 v149.l, v24.l, v67.h
-; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v24.l, v149.h
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr67_hi16
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v24, v149, v24
-; GFX11-TRUE16-NEXT:    v_or_b16 v149.l, v25.l, v66.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v25.l, v149.h
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr66_lo16
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v23, v101, v23
+; GFX11-TRUE16-NEXT:    v_or_b16 v101.l, v24.l, v39.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v24.l, v101.h
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr39_lo16
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v24, v101, v24
+; GFX11-TRUE16-NEXT:    v_or_b16 v101.l, v25.l, v38.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v25.l, v101.h
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr38_lo16
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v25, v149, v25
-; GFX11-TRUE16-NEXT:    v_or_b16 v149.l, v26.l, v64.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v26.l, v149.h
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr64_lo16
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v26, v149, v26
-; GFX11-TRUE16-NEXT:    v_or_b16 v149.l, v27.l, v54.h
-; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v27.l, v149.h
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr54_hi16
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v25, v101, v25
+; GFX11-TRUE16-NEXT:    v_or_b16 v101.l, v26.l, v37.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v26.l, v101.h
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr37_lo16
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v26, v101, v26
+; GFX11-TRUE16-NEXT:    v_or_b16 v101.l, v27.l, v36.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v27.l, v101.h
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr36_lo16
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v27, v149, v27
-; GFX11-TRUE16-NEXT:    v_or_b16 v149.l, v28.l, v53.h
-; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v28.l, v149.h
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr53_hi16
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v28, v149, v28
-; GFX11-TRUE16-NEXT:    v_or_b16 v149.l, v29.l, v52.h
-; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v29.l, v149.h
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr52_hi16
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v27, v101, v27
+; GFX11-TRUE16-NEXT:    v_or_b16 v101.l, v28.l, v35.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v28.l, v101.h
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr35_lo16
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v28, v101, v28
+; GFX11-TRUE16-NEXT:    v_or_b16 v101.l, v29.l, v34.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v29.l, v101.h
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr34_lo16
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v29, v149, v29
-; GFX11-TRUE16-NEXT:    v_or_b16 v149.l, v30.l, v51.h
-; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v30.l, v149.h
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr51_hi16
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v30, v149, v30
-; GFX11-TRUE16-NEXT:    v_or_b16 v149.l, v31.l, v50.h
-; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v31.l, v149.h
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr50_hi16
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v29, v101, v29
+; GFX11-TRUE16-NEXT:    v_or_b16 v101.l, v30.l, v33.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v30.l, v101.h
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr33_lo16
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v30, v101, v30
+; GFX11-TRUE16-NEXT:    v_or_b16 v101.l, v31.l, v32.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v31.l, v101.h
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr32_lo16
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v31, v149, v31
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr149_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr149_lo16
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v31, v101, v31
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr101_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr101_lo16
+; GFX11-TRUE16-NEXT:  .LBB58_2: ; %Flow
 ; GFX11-TRUE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX11-TRUE16-NEXT:    s_cbranch_execz .LBB58_2
-; GFX11-TRUE16-NEXT:  .LBB58_4: ; %cmp.true
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v149.h, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, v149.l, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, v146.h, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.h, v146.l, 3
+; GFX11-TRUE16-NEXT:    s_cbranch_execz .LBB58_4
+; GFX11-TRUE16-NEXT:  ; %bb.3: ; %cmp.true
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v101.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, v101.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, v98.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.h, v98.l, 3
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v31.h, 0
 ; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v0.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v1.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v1.h
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v31.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v151.l, v0.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v151.h, v0.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v150.h, v1.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, v150.l, v1.h
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, v145.h, 3
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v103.l, v0.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v103.h, v0.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v102.h, v1.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, v102.l, v1.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, v97.h, 3
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v31.l, 0x300, v0.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v3.h, 0x300, v0.h
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.h, v144.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.h, v96.l, 3
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v4.h, 0x300, v1.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, v134.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, v86.h, 3
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v31, v3
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v31.l, 0x300, v1.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v2.h
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.h, v134.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.h, v86.l, 3
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v4.l, v31.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v3.l, v148.l, v1.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v3.l, v100.l, v1.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v3.h, v148.h, v1.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v3.h, v100.h, v1.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v2.h, 0xff, v2.h
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v31, v4
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v31.l, 0x300, v3.l
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v31.h
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v5.h, 0x300, v3.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v3.l, v147.l, v2.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v3.h, v147.h, v2.h
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v4.l, v132.h, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v4.h, v131.l, 3
+; GFX11-TRUE16-NEXT:    v_or_b16 v3.l, v99.l, v2.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v3.h, v99.h, v2.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v4.l, v84.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v4.h, v83.l, 3
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v2, v31, v5
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v31.l, 0x300, v3.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v6.h, 0x300, v3.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v3.l, 0xff, v4.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v3.h, 0xff, v4.h
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v4.l, v130.h, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v4.h, v130.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v4.l, v82.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v4.h, v82.l, 3
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v6.l, v31.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v5.l, v144.h, v3.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v5.h, v145.l, v3.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v5.l, v96.h, v3.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v5.h, v97.l, v3.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v4.l, 0xff, v4.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v4.h, 0xff, v4.h
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v3, v31, v6
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v31.l, 0x300, v5.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v7.h, 0x300, v5.h
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v7.l, v31.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v5.l, v135.l, v4.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v5.h, v135.h, v4.h
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v6.l, v119.h, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v6.h, v119.l, 3
+; GFX11-TRUE16-NEXT:    v_or_b16 v5.l, v87.l, v4.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v5.h, v87.h, v4.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v6.l, v71.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v6.h, v71.l, 3
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v4, v31, v7
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v31.l, 0x300, v5.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v8.h, 0x300, v5.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v5.l, 0xff, v6.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v5.h, 0xff, v6.h
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v6.l, v118.h, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v6.h, v117.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v6.l, v70.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v6.h, v69.l, 3
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v8.l, v31.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v7.l, v133.l, v5.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v7.h, v133.h, v5.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v7.l, v85.l, v5.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v7.h, v85.h, v5.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v6.l, 0xff, v6.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v6.h, 0xff, v6.h
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v5, v31, v8
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v31.l, 0x300, v7.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v9.h, 0x300, v7.h
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v9.l, v31.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v7.l, v131.h, v6.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v7.h, v132.l, v6.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v7.l, v83.h, v6.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v7.h, v84.l, v6.h
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v8.l, v115.h, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v8.h, v115.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v8.l, v47.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v8.h, v46.l, 3
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v6, v31, v9
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v31.l, 0x300, v7.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v10.h, 0x300, v7.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v7.l, 0xff, v8.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v7.h, 0xff, v8.h
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v8.l, v113.h, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v8.h, v112.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v8.l, v45.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v8.h, v44.l, 3
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v10.l, v31.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v9.l, v129.l, v7.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v9.h, v129.h, v7.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v9.l, v81.l, v7.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v9.h, v81.h, v7.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v8.l, 0xff, v8.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v8.h, 0xff, v8.h
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v7, v31, v10
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v31.l, 0x300, v9.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v11.h, 0x300, v9.h
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v11.l, v31.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v9.l, v128.l, v8.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v9.h, v128.h, v8.h
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v10.l, v103.h, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v10.h, v103.l, 3
+; GFX11-TRUE16-NEXT:    v_or_b16 v9.l, v80.l, v8.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v9.h, v80.h, v8.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v10.l, v43.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v10.h, v42.l, 3
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v8, v31, v11
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v31.l, 0x300, v9.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v12.h, 0x300, v9.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v9.l, 0xff, v10.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v9.h, 0xff, v10.h
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v10.l, v101.h, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v10.h, v100.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v10.l, v41.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v10.h, v40.l, 3
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v12.l, v31.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v11.l, v117.h, v9.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v11.h, v118.l, v9.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v11.l, v69.h, v9.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v11.h, v70.l, v9.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v10.l, 0xff, v10.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v10.h, 0xff, v10.h
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v9, v31, v12
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v31.l, 0x300, v11.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v13.h, 0x300, v11.h
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v13.l, v31.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v11.l, v116.l, v10.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v11.h, v116.h, v10.h
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v12.l, v99.h, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v12.h, v98.l, 3
+; GFX11-TRUE16-NEXT:    v_or_b16 v11.l, v68.l, v10.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v11.h, v68.h, v10.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v12.l, v183.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v12.h, v182.l, 3
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v10, v31, v13
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v31.l, 0x300, v11.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v14.h, 0x300, v11.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v11.l, 0xff, v12.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v11.h, 0xff, v12.h
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v12.l, v96.h, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v12.h, v96.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v12.l, v181.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v12.h, v180.l, 3
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v14.l, v31.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v13.l, v114.l, v11.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v13.h, v114.h, v11.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v13.l, v67.l, v11.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v13.h, v67.h, v11.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v12.l, 0xff, v12.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v12.h, 0xff, v12.h
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v11, v31, v14
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v31.l, 0x300, v13.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v15.h, 0x300, v13.h
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v15.l, v31.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v13.l, v112.h, v12.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v13.h, v113.l, v12.h
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v14.l, v86.h, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v14.h, v86.l, 3
+; GFX11-TRUE16-NEXT:    v_or_b16 v13.l, v66.l, v12.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v13.h, v66.h, v12.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v14.l, v179.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v14.h, v178.l, 3
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v12, v31, v15
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v31.l, 0x300, v13.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v16.h, 0x300, v13.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v13.l, 0xff, v14.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v13.h, 0xff, v14.h
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v14.l, v84.h, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v14.h, v84.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v14.l, v177.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v14.h, v176.l, 3
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v16.l, v31.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v15.l, v102.l, v13.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v15.h, v102.h, v13.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v15.l, v65.l, v13.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v15.h, v65.h, v13.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v14.l, 0xff, v14.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v14.h, 0xff, v14.h
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v13, v31, v16
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v31.l, 0x300, v15.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v17.h, 0x300, v15.h
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v17.l, v31.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v15.l, v100.h, v14.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v15.h, v101.l, v14.h
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v16.l, v82.h, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v16.h, v81.l, 3
+; GFX11-TRUE16-NEXT:    v_or_b16 v15.l, v64.l, v14.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v15.h, v64.h, v14.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v16.l, v167.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v16.h, v166.l, 3
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v14, v31, v17
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v31.l, 0x300, v15.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v18.h, 0x300, v15.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v15.l, 0xff, v16.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v15.h, 0xff, v16.h
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v16.l, v80.h, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v16.h, v80.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v16.l, v165.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v16.h, v164.l, 3
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v18.l, v31.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v17.l, v98.h, v15.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v17.h, v99.l, v15.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v17.l, v55.l, v15.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v17.h, v55.h, v15.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v16.l, 0xff, v16.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v16.h, 0xff, v16.h
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v15, v31, v18
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v31.l, 0x300, v17.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v19.h, 0x300, v17.h
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v19.l, v31.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v17.l, v97.l, v16.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v17.h, v97.h, v16.h
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v18.l, v69.h, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v18.h, v69.l, 3
+; GFX11-TRUE16-NEXT:    v_or_b16 v17.l, v54.l, v16.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v17.h, v54.h, v16.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v18.l, v163.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v18.h, v162.l, 3
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v16, v31, v19
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v31.l, 0x300, v17.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v20.h, 0x300, v17.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v17.l, 0xff, v18.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v17.h, 0xff, v18.h
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v18.l, v68.h, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v18.h, v67.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v18.l, v161.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v18.h, v160.l, 3
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v20.l, v31.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v19.l, v87.l, v17.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v19.h, v87.h, v17.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v19.l, v53.l, v17.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v19.h, v53.h, v17.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v18.l, 0xff, v18.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v18.h, 0xff, v18.h
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v17, v31, v20
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v31.l, 0x300, v19.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v21.h, 0x300, v19.h
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v21.l, v31.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v19.l, v85.l, v18.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v19.h, v85.h, v18.h
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v20.l, v65.h, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v20.h, v65.l, 3
+; GFX11-TRUE16-NEXT:    v_or_b16 v19.l, v52.l, v18.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v19.h, v52.h, v18.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v20.l, v151.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v20.h, v150.l, 3
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v18, v31, v21
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v31.l, 0x300, v19.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v22.h, 0x300, v19.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v19.l, 0xff, v20.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v19.h, 0xff, v20.h
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v20.l, v55.h, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v20.h, v50.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v20.l, v149.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v20.h, v148.l, 3
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v22.l, v31.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v21.l, v83.l, v19.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v21.h, v83.h, v19.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v21.l, v51.l, v19.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v21.h, v51.h, v19.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v20.l, 0xff, v20.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v20.h, 0xff, v20.h
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v19, v31, v22
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v31.l, 0x300, v21.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v23.h, 0x300, v21.h
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v23.l, v31.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v21.l, v81.h, v20.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v21.h, v82.l, v20.h
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v22.l, v49.h, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v22.h, v49.l, 3
+; GFX11-TRUE16-NEXT:    v_or_b16 v21.l, v50.l, v20.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v21.h, v50.h, v20.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v22.l, v147.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v22.h, v146.l, 3
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v20, v31, v23
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v31.l, 0x300, v21.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v24.h, 0x300, v21.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v21.l, 0xff, v22.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v21.h, 0xff, v22.h
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v22.l, v48.h, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v22.h, v48.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v22.l, v145.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v22.h, v144.l, 3
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v24.l, v31.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v23.l, v71.l, v21.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v23.h, v71.h, v21.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v23.l, v49.l, v21.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v23.h, v49.h, v21.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v22.l, 0xff, v22.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v22.h, 0xff, v22.h
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v21, v31, v24
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v31.l, 0x300, v23.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v25.h, 0x300, v23.h
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v25.l, v31.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v23.l, v70.l, v22.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v23.h, v70.h, v22.h
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v24.l, v39.h, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v24.h, v39.l, 3
+; GFX11-TRUE16-NEXT:    v_or_b16 v23.l, v48.l, v22.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v23.h, v48.h, v22.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v24.l, v135.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v24.h, v134.l, 3
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v22, v31, v25
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v31.l, 0x300, v23.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v26.h, 0x300, v23.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v23.l, 0xff, v24.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v23.h, 0xff, v24.h
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v24.l, v38.h, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v24.h, v38.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v24.l, v133.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v24.h, v132.l, 3
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v26.l, v31.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v25.l, v67.h, v23.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v25.h, v68.l, v23.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v25.l, v39.l, v23.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v25.h, v39.h, v23.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v24.l, 0xff, v24.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v24.h, 0xff, v24.h
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v23, v31, v26
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v31.l, 0x300, v25.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v27.h, 0x300, v25.h
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v27.l, v31.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v25.l, v66.l, v24.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v25.h, v66.h, v24.h
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v26.l, v37.h, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v26.h, v37.l, 3
+; GFX11-TRUE16-NEXT:    v_or_b16 v25.l, v38.l, v24.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v25.h, v38.h, v24.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v26.l, v131.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v26.h, v130.l, 3
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v24, v31, v27
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v31.l, 0x300, v25.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v28.h, 0x300, v25.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v25.l, 0xff, v26.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v25.h, 0xff, v26.h
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v26.l, v36.h, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v26.h, v36.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v26.l, v129.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v26.h, v128.l, 3
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v28.l, v31.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v27.l, v64.l, v25.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v27.h, v64.h, v25.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v27.l, v37.l, v25.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v27.h, v37.h, v25.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v26.l, 0xff, v26.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v26.h, 0xff, v26.h
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v25, v31, v28
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v31.l, 0x300, v27.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v29.h, 0x300, v27.h
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v29.l, v31.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v27.l, v54.h, v26.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v27.h, v55.l, v26.h
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v28.l, v35.h, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v28.h, v35.l, 3
+; GFX11-TRUE16-NEXT:    v_or_b16 v27.l, v36.l, v26.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v27.h, v36.h, v26.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v28.l, v119.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v28.h, v118.l, 3
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v26, v31, v29
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v31.l, 0x300, v27.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v30.h, 0x300, v27.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v27.l, 0xff, v28.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v27.h, 0xff, v28.h
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v28.l, v34.h, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v28.h, v34.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v28.l, v117.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v28.h, v116.l, 3
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v30.l, v31.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v29.l, v53.h, v27.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v29.h, v54.l, v27.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v29.l, v35.l, v27.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v29.h, v35.h, v27.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v28.l, 0xff, v28.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v28.h, 0xff, v28.h
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v27, v31, v30
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v31.l, 0x300, v29.l
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v34.h, 0x300, v29.h
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v34.l, v31.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v29.l, v52.h, v28.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v29.h, v53.l, v28.h
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v30.l, v33.h, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v30.h, v33.l, 3
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v28, v31, v34
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v35.h, 0x300, v29.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v35.l, v31.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v29.l, v34.l, v28.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v29.h, v34.h, v28.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v30.l, v115.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v30.h, v114.l, 3
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v28, v31, v35
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v31.l, 0x300, v29.l
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v33.h, 0x300, v29.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v34.h, 0x300, v29.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v29.l, 0xff, v30.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v29.h, 0xff, v30.h
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v30.l, v32.h, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v30.h, v32.l, 3
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v33.l, v31.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v32.l, v51.h, v29.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v32.h, v52.l, v29.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v30.l, v113.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v30.h, v112.l, 3
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v34.l, v31.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v33.l, v33.l, v29.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v33.h, v33.h, v29.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v30.l, 0xff, v30.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v30.h, 0xff, v30.h
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v29, v31, v33
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v31.l, 0x300, v32.l
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v33.h, 0x300, v32.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v32.l, v50.h, v30.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v32.h, v51.l, v30.h
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v29, v31, v34
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v31.l, 0x300, v33.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v33.h, 0x300, v33.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v33.l, v31.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v32.l, v32.l, v30.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v32.h, v32.h, v30.h
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v30, v31, v33
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v31.l, 0x300, v32.l
@@ -87637,7 +85948,48 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) {
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v32.h, 0x300, v32.h
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v32.l, v31.h
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v31, v31, v32
+; GFX11-TRUE16-NEXT:  .LBB58_4: ; %end
 ; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_clause 0x1f
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v108, off, s32 offset:392
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v107, off, s32 offset:396
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v106, off, s32 offset:400
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v105, off, s32 offset:404
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v104, off, s32 offset:408
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v95, off, s32 offset:412
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v94, off, s32 offset:416
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v93, off, s32 offset:420
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v92, off, s32 offset:424
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v91, off, s32 offset:428
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v90, off, s32 offset:432
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v89, off, s32 offset:436
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v88, off, s32 offset:440
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v79, off, s32 offset:444
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v78, off, s32 offset:448
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v77, off, s32 offset:452
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v76, off, s32 offset:456
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v75, off, s32 offset:460
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v74, off, s32 offset:464
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v73, off, s32 offset:468
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v72, off, s32 offset:472
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v63, off, s32 offset:476
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v62, off, s32 offset:480
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v61, off, s32 offset:484
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v60, off, s32 offset:488
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v59, off, s32 offset:492
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v58, off, s32 offset:496
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v57, off, s32 offset:500
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v56, off, s32 offset:504
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v47, off, s32 offset:508
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v46, off, s32 offset:512
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v45, off, s32 offset:516
+; GFX11-TRUE16-NEXT:    s_clause 0x4
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v44, off, s32 offset:520
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v43, off, s32 offset:524
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v42, off, s32 offset:528
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v41, off, s32 offset:532
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v40, off, s32 offset:536
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-FAKE16-LABEL: bitcast_v128i8_to_v16i64:
@@ -91903,1887 +90255,946 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-TRUE16-LABEL: bitcast_v128i8_to_v16i64_scalar:
-; GFX11-TRUE16:       ; %bb.0:
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT:    s_clause 0x1f
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v40, s32 offset:476
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v41, s32 offset:472
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v42, s32 offset:468
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v43, s32 offset:464
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v44, s32 offset:460
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v45, s32 offset:456
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v46, s32 offset:452
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v47, s32 offset:448
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v56, s32 offset:444
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v57, s32 offset:440
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v58, s32 offset:436
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v59, s32 offset:432
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v60, s32 offset:428
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v61, s32 offset:424
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v62, s32 offset:420
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v63, s32 offset:416
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v72, s32 offset:412
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v73, s32 offset:408
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v74, s32 offset:404
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v75, s32 offset:400
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v76, s32 offset:396
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v77, s32 offset:392
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v78, s32 offset:388
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v79, s32 offset:384
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v88, s32 offset:380
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v89, s32 offset:376
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v90, s32 offset:372
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v91, s32 offset:368
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v92, s32 offset:364
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v93, s32 offset:360
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v94, s32 offset:356
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v95, s32 offset:352
-; GFX11-TRUE16-NEXT:    s_clause 0x7
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v104, s32 offset:348
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v105, s32 offset:344
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v106, s32 offset:340
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v107, s32 offset:336
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v108, s32 offset:332
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v109, s32 offset:328
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v110, s32 offset:324
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v111, s32 offset:320
-; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v32, v30 :: v_dual_mov_b32 v33, v28
-; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v34, v26 :: v_dual_mov_b32 v35, v24
-; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v36, v22 :: v_dual_mov_b32 v37, v20
-; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v38, v18 :: v_dual_mov_b32 v39, v16
-; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v48, v14 :: v_dual_mov_b32 v49, v12
-; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v50, v10 :: v_dual_mov_b32 v51, v8
-; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v52, v6 :: v_dual_mov_b32 v53, v4
-; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v54, v2 :: v_dual_mov_b32 v55, v0
-; GFX11-TRUE16-NEXT:    s_clause 0x1f
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v0, off, s32 offset:316
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v2, off, s32
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v4, off, s32 offset:8
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v6, off, s32 offset:16
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v8, off, s32 offset:24
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v10, off, s32 offset:32
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v12, off, s32 offset:40
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v14, off, s32 offset:48
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v16, off, s32 offset:56
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v18, off, s32 offset:64
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v20, off, s32 offset:72
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v22, off, s32 offset:80
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v24, off, s32 offset:88
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v26, off, s32 offset:96
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v28, off, s32 offset:104
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v30, off, s32 offset:112
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v31, off, s32 offset:120
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v87, off, s32 offset:128
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v96, off, s32 offset:136
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v97, off, s32 offset:144
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v98, off, s32 offset:152
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v99, off, s32 offset:160
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v113, off, s32 offset:168
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v114, off, s32 offset:176
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v115, off, s32 offset:184
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v116, off, s32 offset:192
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v117, off, s32 offset:200
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v128, off, s32 offset:208
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v129, off, s32 offset:216
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v130, off, s32 offset:224
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v131, off, s32 offset:232
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v94, off, s32 offset:240
-; GFX11-TRUE16-NEXT:    s_clause 0x1f
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v95, off, s32 offset:248
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v104, off, s32 offset:256
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v105, off, s32 offset:264
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v106, off, s32 offset:272
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v107, off, s32 offset:280
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v108, off, s32 offset:288
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v109, off, s32 offset:296
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v110, off, s32 offset:304
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v111, off, s32 offset:312
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v64, off, s32 offset:308
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v65, off, s32 offset:300
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v66, off, s32 offset:292
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v67, off, s32 offset:284
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v68, off, s32 offset:276
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v69, off, s32 offset:268
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v70, off, s32 offset:260
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v71, off, s32 offset:252
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v80, off, s32 offset:244
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v81, off, s32 offset:236
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v82, off, s32 offset:228
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v83, off, s32 offset:220
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v84, off, s32 offset:212
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v85, off, s32 offset:204
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v86, off, s32 offset:196
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v100, off, s32 offset:188
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v101, off, s32 offset:180
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v102, off, s32 offset:172
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v103, off, s32 offset:164
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v112, off, s32 offset:156
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v118, off, s32 offset:148
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v145, off, s32 offset:140
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v146, off, s32 offset:132
-; GFX11-TRUE16-NEXT:    s_clause 0xf
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v147, off, s32 offset:124
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v148, off, s32 offset:116
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v162, off, s32 offset:108
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v163, off, s32 offset:100
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v164, off, s32 offset:92
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v165, off, s32 offset:84
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v166, off, s32 offset:76
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v180, off, s32 offset:68
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v181, off, s32 offset:60
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v182, off, s32 offset:52
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v183, off, s32 offset:44
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v45, off, s32 offset:36
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v46, off, s32 offset:28
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v47, off, s32 offset:20
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v56, off, s32 offset:12
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v62, off, s32 offset:4
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v89, 8, v1
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v90, 8, v3
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v91, 8, v5
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v92, 8, v7
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v93, 8, v9
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v76, 8, v11
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v77, 8, v13
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v78, 8, v15
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v79, 8, v17
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v88, 8, v19
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v63, 8, v21
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v72, 8, v23
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v73, 8, v25
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v74, 8, v27
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v75, 8, v29
-; GFX11-TRUE16-NEXT:    s_mov_b32 s4, 0
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(62)
-; GFX11-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v57, 8, v2
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v58, 8, v4
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v59, 8, v6
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v60, 8, v8
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v61, 8, v10
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v40, 8, v12
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v41, 8, v14
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v42, 8, v16
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v43, 8, v18
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v44, 8, v20
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v167, 8, v22
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v176, 8, v24
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v177, 8, v26
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v178, 8, v28
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v179, 8, v30
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v149, 8, v31
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v150, 8, v87
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(61)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v151, 8, v96
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(60)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v160, 8, v97
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(59)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v161, 8, v98
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(58)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v132, 8, v99
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(57)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v133, 8, v113
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(56)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v134, 8, v114
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(55)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v135, 8, v115
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(54)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v144, 8, v116
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(53)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v119, 8, v117
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(52)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v128, 8, v128
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(51)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v129, 8, v129
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(50)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v130, 8, v130
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(49)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v131, 8, v131
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(48)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v113, 8, v94
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(47)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v114, 8, v95
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(46)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v115, 8, v104
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(45)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v116, 8, v105
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(44)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v117, 8, v106
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(43)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v87, 8, v107
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(42)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v96, 8, v108
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(41)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v97, 8, v109
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(40)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v98, 8, v110
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(39)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v99, 8, v111
-; GFX11-TRUE16-NEXT:    s_and_b32 s5, vcc_lo, exec_lo
-; GFX11-TRUE16-NEXT:    s_cbranch_scc0 .LBB59_4
-; GFX11-TRUE16-NEXT:  ; %bb.1: ; %cmp.false
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v54
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff, v53
-; GFX11-TRUE16-NEXT:    s_and_b32 s5, s28, 0xff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s6, s29, 8
-; GFX11-TRUE16-NEXT:    s_and_b32 s7, s2, 0xff
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v0, v90
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v1, v91
-; GFX11-TRUE16-NEXT:    s_or_b32 s5, s5, s6
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s6, s1, 8
-; GFX11-TRUE16-NEXT:    s_and_b32 s5, s5, 0xffff
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s8, s3, 8
-; GFX11-TRUE16-NEXT:    s_and_b32 s9, s18, 0xff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s10, s19, 8
-; GFX11-TRUE16-NEXT:    s_and_b32 s11, s26, 0xff
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v5, v0, v1
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v50
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff, v49
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s12, s27, 8
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v0, v76
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v1, v77
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v7, v0, v1
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v48
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff, v39
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v0, v78
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v1, v79
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v8, v0, v1
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v38
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff, v37
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v0, v88
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v1, v63
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v9, v0, v1
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v36
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff, v35
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v0, v72
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v1, v73
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v10, v0, v1
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v34
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff, v33
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v0, v74
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v1, v75
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v11, v0, v1
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v32
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff, v62
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v0, v57
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v1, v58
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v12, v0, v1
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v56
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff, v47
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v0, v59
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v1, v60
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v13, v0, v1
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v46
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff, v45
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v0, v61
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v1, v40
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v14, v0, v1
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v183
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff, v182
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v0, v41
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v1, v42
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v15, v0, v1
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v181
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff, v180
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v0, v43
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v1, v44
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v16, v0, v1
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v166
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff, v165
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v0, v167
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v1, v176
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v17, v0, v1
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v164
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff, v163
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v0, v177
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v1, v178
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v18, v0, v1
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v162
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff, v148
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v0, v179
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v1, v149
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v19, v0, v1
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v147
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff, v146
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v0, v150
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v1, v151
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v20, v0, v1
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v145
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff, v118
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v0, v160
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v1, v161
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v21, v0, v1
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v112
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff, v103
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v0, v132
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v1, v133
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v22, v0, v1
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v102
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff, v101
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v0, v134
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v1, v135
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v23, v0, v1
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v100
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff, v86
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v0, v144
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v1, v119
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v24, v0, v1
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v85
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff, v84
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v0, v128
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v1, v129
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v25, v0, v1
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v83
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff, v82
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v0, v130
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v1, v131
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v26, v0, v1
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v81
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff, v80
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v0, v113
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v1, v114
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v27, v0, v1
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v71
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff, v70
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v0, v115
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v1, v116
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v28, v0, v1
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v69
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff, v68
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v0, v117
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v1, v87
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v29, v0, v1
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v67
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff, v66
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v0, v96
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v1, v97
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v30, v0, v1
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v65
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff, v64
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v0, v98
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v1, v99
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v31, v0, v1
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v55
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v0, v89
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v4, s5, v0
-; GFX11-TRUE16-NEXT:    s_and_b32 s5, s0, 0xff
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-TRUE16-NEXT:    s_or_b32 s5, s5, s6
-; GFX11-TRUE16-NEXT:    s_or_b32 s6, s7, s8
-; GFX11-TRUE16-NEXT:    s_and_b32 s7, s16, 0xff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s8, s17, 8
-; GFX11-TRUE16-NEXT:    s_and_b32 s5, s5, 0xffff
-; GFX11-TRUE16-NEXT:    s_or_b32 s7, s7, s8
-; GFX11-TRUE16-NEXT:    s_or_b32 s8, s9, s10
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s6, s6, 16
-; GFX11-TRUE16-NEXT:    s_and_b32 s7, s7, 0xffff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s8, s8, 16
-; GFX11-TRUE16-NEXT:    s_or_b32 s5, s5, s6
-; GFX11-TRUE16-NEXT:    s_or_b32 s6, s7, s8
-; GFX11-TRUE16-NEXT:    s_and_b32 s7, s20, 0xff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s8, s21, 8
-; GFX11-TRUE16-NEXT:    s_and_b32 s9, s22, 0xff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s10, s23, 8
-; GFX11-TRUE16-NEXT:    s_or_b32 s7, s7, s8
-; GFX11-TRUE16-NEXT:    s_or_b32 s8, s9, s10
-; GFX11-TRUE16-NEXT:    s_and_b32 s9, s24, 0xff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s10, s25, 8
-; GFX11-TRUE16-NEXT:    s_and_b32 s7, s7, 0xffff
-; GFX11-TRUE16-NEXT:    s_or_b32 s9, s9, s10
-; GFX11-TRUE16-NEXT:    s_or_b32 s10, s11, s12
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s8, s8, 16
-; GFX11-TRUE16-NEXT:    s_and_b32 s9, s9, 0xffff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s10, s10, 16
-; GFX11-TRUE16-NEXT:    s_or_b32 s7, s7, s8
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v3, 0xff, v51
-; GFX11-TRUE16-NEXT:    s_or_b32 s8, s9, s10
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v2, 0xff, v52
-; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, s6
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v3, v3, v93
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v2, v2, v92
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v6, v2, v3
-; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v2, s7 :: v_dual_mov_b32 v3, s8
-; GFX11-TRUE16-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s4
-; GFX11-TRUE16-NEXT:    s_cbranch_vccnz .LBB59_3
-; GFX11-TRUE16-NEXT:  .LBB59_2: ; %cmp.true
-; GFX11-TRUE16-NEXT:    s_add_i32 s0, s0, 3
-; GFX11-TRUE16-NEXT:    s_add_i32 s2, s2, 3
-; GFX11-TRUE16-NEXT:    s_and_b32 s0, s0, 0xff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s1, s1, 8
-; GFX11-TRUE16-NEXT:    s_and_b32 s2, s2, 0xff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s3, s3, 8
-; GFX11-TRUE16-NEXT:    s_or_b32 s0, s1, s0
-; GFX11-TRUE16-NEXT:    s_or_b32 s1, s3, s2
-; GFX11-TRUE16-NEXT:    s_addk_i32 s0, 0x300
-; GFX11-TRUE16-NEXT:    s_addk_i32 s1, 0x300
-; GFX11-TRUE16-NEXT:    s_and_b32 s0, s0, 0xffff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s1, s1, 16
-; GFX11-TRUE16-NEXT:    s_add_i32 s16, s16, 3
-; GFX11-TRUE16-NEXT:    s_or_b32 s0, s0, s1
-; GFX11-TRUE16-NEXT:    s_and_b32 s1, s16, 0xff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s2, s17, 8
-; GFX11-TRUE16-NEXT:    s_add_i32 s18, s18, 3
-; GFX11-TRUE16-NEXT:    s_or_b32 s1, s2, s1
-; GFX11-TRUE16-NEXT:    s_and_b32 s2, s18, 0xff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s3, s19, 8
-; GFX11-TRUE16-NEXT:    s_addk_i32 s1, 0x300
-; GFX11-TRUE16-NEXT:    s_or_b32 s2, s3, s2
-; GFX11-TRUE16-NEXT:    s_add_i32 s20, s20, 3
-; GFX11-TRUE16-NEXT:    s_addk_i32 s2, 0x300
-; GFX11-TRUE16-NEXT:    s_and_b32 s1, s1, 0xffff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s2, s2, 16
-; GFX11-TRUE16-NEXT:    s_and_b32 s3, s20, 0xff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s4, s21, 8
-; GFX11-TRUE16-NEXT:    s_add_i32 s22, s22, 3
-; GFX11-TRUE16-NEXT:    s_or_b32 s1, s1, s2
-; GFX11-TRUE16-NEXT:    s_or_b32 s2, s4, s3
-; GFX11-TRUE16-NEXT:    s_and_b32 s3, s22, 0xff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s4, s23, 8
-; GFX11-TRUE16-NEXT:    s_add_i32 s24, s24, 3
-; GFX11-TRUE16-NEXT:    s_or_b32 s3, s4, s3
-; GFX11-TRUE16-NEXT:    s_and_b32 s4, s24, 0xff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s5, s25, 8
-; GFX11-TRUE16-NEXT:    s_addk_i32 s2, 0x300
-; GFX11-TRUE16-NEXT:    s_addk_i32 s3, 0x300
-; GFX11-TRUE16-NEXT:    s_or_b32 s4, s5, s4
-; GFX11-TRUE16-NEXT:    s_and_b32 s2, s2, 0xffff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s3, s3, 16
-; GFX11-TRUE16-NEXT:    s_addk_i32 s4, 0x300
-; GFX11-TRUE16-NEXT:    s_add_i32 s26, s26, 3
-; GFX11-TRUE16-NEXT:    s_or_b32 s2, s2, s3
-; GFX11-TRUE16-NEXT:    s_and_b32 s3, s4, 0xffff
-; GFX11-TRUE16-NEXT:    s_and_b32 s4, s26, 0xff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s5, s27, 8
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v0, 3, v55
-; GFX11-TRUE16-NEXT:    s_or_b32 s4, s5, s4
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v1, 3, v54
-; GFX11-TRUE16-NEXT:    s_addk_i32 s4, 0x300
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v3, 3, v52
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s4, s4, 16
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v4, 3, v51
-; GFX11-TRUE16-NEXT:    s_or_b32 s3, s3, s4
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v2, 3, v53
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v7, 3, v38
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v11, 3, v34
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v12, 3, v33
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v89, v0
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v90, v1
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v2, v91, v2
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v3, v92, v3
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v4, v93, v4
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v7, 0xff, v7
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v11, 0xff, v11
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v12, 0xff, v12
-; GFX11-TRUE16-NEXT:    s_add_i32 s28, s28, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v0, 0x300, v0
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v1, 0x300, v1
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v2, 0x300, v2
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v3, 0x300, v3
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v4, 0x300, v4
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v7, v88, v7
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v11, v74, v11
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v12, v75, v12
-; GFX11-TRUE16-NEXT:    s_and_b32 s5, s28, 0xff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s6, s29, 8
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX11-TRUE16-NEXT:    s_or_b32 s5, s6, s5
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v3, 0xffff, v3
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v4
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v7, 0x300, v7
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v11, 0x300, v11
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v12, 0x300, v12
-; GFX11-TRUE16-NEXT:    s_addk_i32 s5, 0x300
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v5, v1, v2
-; GFX11-TRUE16-NEXT:    s_and_b32 s5, s5, 0xffff
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v6, v3, v6
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v4, s5, v0
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v0, 3, v50
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v1, 3, v49
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v2, 3, v48
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v3, 3, v39
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v9, 0xffff, v7
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v7, 3, v37
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v8, 3, v36
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v10, 3, v35
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v11, 0xffff, v11
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v7, 0xff, v7
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v8, 0xff, v8
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v10, 0xff, v10
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v11, v11, v12
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(3)
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v12, 3, v46
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v16, 3, v181
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v17, 3, v180
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v76, v0
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v77, v1
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v2, v78, v2
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v3, v79, v3
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v7, v63, v7
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v8, v72, v8
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v10, v73, v10
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v12, 0xff, v12
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v16, 0xff, v16
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v17, 0xff, v17
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v0, 0x300, v0
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v1, 0x300, v1
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v2, 0x300, v2
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v3, 0x300, v3
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v7, 0x300, v7
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v8, 0x300, v8
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v10, 0x300, v10
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v12, v61, v12
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v16, v43, v16
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v17, v44, v17
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v7
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v14, 0xffff, v8
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v12, 0x300, v12
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v16, 0x300, v16
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v17, 0x300, v17
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v7, v0, v1
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v8, v2, v3
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v9, v9, v13
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v10, v14, v10
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v0, 3, v32
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v1, 3, v62
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v2, 3, v56
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v3, 3, v47
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v14, 0xffff, v12
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v12, 3, v45
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v13, 3, v183
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v15, 3, v182
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v16, 0xffff, v16
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v17, 16, v17
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v12, 0xff, v12
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v13, 0xff, v13
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v15, 0xff, v15
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v16, v16, v17
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v17, 3, v162
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v21, 3, v145
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v22, 3, v118
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v57, v0
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v58, v1
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v2, v59, v2
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v3, v60, v3
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v12, v40, v12
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v13, v41, v13
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v15, v42, v15
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v17, 0xff, v17
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v21, 0xff, v21
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v22, 0xff, v22
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v0, 0x300, v0
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v1, 0x300, v1
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v2, 0x300, v2
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v3, 0x300, v3
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v12, 0x300, v12
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v13, 0x300, v13
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v15, 0x300, v15
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v17, v179, v17
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v21, v160, v21
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v22, v161, v22
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v18, 16, v12
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v19, 0xffff, v13
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v17, 0x300, v17
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v21, 0x300, v21
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v22, 0x300, v22
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v12, v0, v1
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v13, v2, v3
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v14, v14, v18
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v15, v19, v15
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v0, 3, v166
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v1, 3, v165
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v2, 3, v164
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v3, 3, v163
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v19, 0xffff, v17
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v17, 3, v148
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v18, 3, v147
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v20, 3, v146
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v21, 0xffff, v21
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v22
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v17, 0xff, v17
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v18, 0xff, v18
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v20, 0xff, v20
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v21, v21, v22
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v22, 3, v100
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v26, 3, v83
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v27, 3, v82
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v167, v0
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v176, v1
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v2, v177, v2
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v3, v178, v3
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v17, v149, v17
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v18, v150, v18
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v20, v151, v20
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v22, 0xff, v22
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v26, 0xff, v26
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v27, 0xff, v27
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v0, 0x300, v0
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v1, 0x300, v1
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v2, 0x300, v2
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v3, 0x300, v3
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v17, 0x300, v17
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v18, 0x300, v18
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v20, 0x300, v20
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v22, v144, v22
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v26, v130, v26
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v27, v131, v27
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v17
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v24, 0xffff, v18
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v20, 16, v20
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v22, 0x300, v22
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v26, 0x300, v26
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v27, 0x300, v27
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v17, v0, v1
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v18, v2, v3
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v19, v19, v23
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v20, v24, v20
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v0, 3, v112
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v1, 3, v103
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v2, 3, v102
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v3, 3, v101
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v24, 0xffff, v22
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v22, 3, v86
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v23, 3, v85
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v25, 3, v84
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v26, 0xffff, v26
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v27, 16, v27
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v22, 0xff, v22
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v23, 0xff, v23
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v25, 0xff, v25
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v26, v26, v27
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v27, 3, v69
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v132, v0
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v133, v1
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v2, v134, v2
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v3, v135, v3
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v22, v119, v22
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v23, v128, v23
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v25, v129, v25
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v27, 0xff, v27
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v0, 0x300, v0
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v1, 0x300, v1
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v2, 0x300, v2
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v3, 0x300, v3
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v22, 0x300, v22
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v23, 0x300, v23
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v25, 0x300, v25
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v27, v117, v27
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v28, 16, v22
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v29, 0xffff, v23
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v25
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v27, 0x300, v27
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v22, v0, v1
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v23, v2, v3
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v24, v24, v28
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v25, v29, v25
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v0, 3, v81
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v1, 3, v80
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v2, 3, v71
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v3, 3, v70
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v29, 0xffff, v27
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v27, 3, v68
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v28, 3, v67
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v30, 3, v66
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v31, 3, v65
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v32, 3, v64
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v27, 0xff, v27
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v28, 0xff, v28
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v30, 0xff, v30
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v31, 0xff, v31
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v32, 0xff, v32
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v113, v0
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v114, v1
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v2, v115, v2
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v3, v116, v3
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v27, v87, v27
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v28, v96, v28
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v30, v97, v30
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v31, v98, v31
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v32, v99, v32
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v0, 0x300, v0
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v1, 0x300, v1
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v2, 0x300, v2
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v3, 0x300, v3
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v27, 0x300, v27
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v28, 0x300, v28
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v30, 0x300, v30
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v31, 0x300, v31
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v32, 0x300, v32
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v33, 16, v27
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v34, 0xffff, v28
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v30, 16, v30
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v31, 0xffff, v31
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v32, 16, v32
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v27, v0, v1
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v28, v2, v3
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v29, v29, v33
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v30, v34, v30
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v31, v31, v32
-; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX11-TRUE16-NEXT:  .LBB59_3: ; %end
-; GFX11-TRUE16-NEXT:    s_clause 0x1f
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v111, off, s32 offset:320
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v110, off, s32 offset:324
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v109, off, s32 offset:328
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v108, off, s32 offset:332
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v107, off, s32 offset:336
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v106, off, s32 offset:340
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v105, off, s32 offset:344
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v104, off, s32 offset:348
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v95, off, s32 offset:352
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v94, off, s32 offset:356
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v93, off, s32 offset:360
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v92, off, s32 offset:364
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v91, off, s32 offset:368
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v90, off, s32 offset:372
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v89, off, s32 offset:376
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v88, off, s32 offset:380
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v79, off, s32 offset:384
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v78, off, s32 offset:388
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v77, off, s32 offset:392
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v76, off, s32 offset:396
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v75, off, s32 offset:400
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v74, off, s32 offset:404
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v73, off, s32 offset:408
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v72, off, s32 offset:412
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v63, off, s32 offset:416
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v62, off, s32 offset:420
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v61, off, s32 offset:424
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v60, off, s32 offset:428
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v59, off, s32 offset:432
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v58, off, s32 offset:436
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v57, off, s32 offset:440
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v56, off, s32 offset:444
-; GFX11-TRUE16-NEXT:    s_clause 0x7
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v47, off, s32 offset:448
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v46, off, s32 offset:452
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v45, off, s32 offset:456
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v44, off, s32 offset:460
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v43, off, s32 offset:464
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v42, off, s32 offset:468
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v41, off, s32 offset:472
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v40, off, s32 offset:476
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
-; GFX11-TRUE16-NEXT:  .LBB59_4:
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
-; GFX11-TRUE16-NEXT:    s_branch .LBB59_2
-;
-; GFX11-FAKE16-LABEL: bitcast_v128i8_to_v16i64_scalar:
-; GFX11-FAKE16:       ; %bb.0:
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-FAKE16-NEXT:    s_clause 0x1f
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v40, s32 offset:476
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v41, s32 offset:472
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v42, s32 offset:468
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v43, s32 offset:464
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v44, s32 offset:460
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v45, s32 offset:456
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v46, s32 offset:452
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v47, s32 offset:448
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v56, s32 offset:444
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v57, s32 offset:440
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v58, s32 offset:436
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v59, s32 offset:432
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v60, s32 offset:428
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v61, s32 offset:424
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v62, s32 offset:420
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v63, s32 offset:416
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v72, s32 offset:412
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v73, s32 offset:408
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v74, s32 offset:404
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v75, s32 offset:400
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v76, s32 offset:396
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v77, s32 offset:392
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v78, s32 offset:388
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v79, s32 offset:384
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v88, s32 offset:380
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v89, s32 offset:376
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v90, s32 offset:372
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v91, s32 offset:368
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v92, s32 offset:364
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v93, s32 offset:360
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v94, s32 offset:356
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v95, s32 offset:352
-; GFX11-FAKE16-NEXT:    s_clause 0x7
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v104, s32 offset:348
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v105, s32 offset:344
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v106, s32 offset:340
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v107, s32 offset:336
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v108, s32 offset:332
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v109, s32 offset:328
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v110, s32 offset:324
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v111, s32 offset:320
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v32, v30 :: v_dual_mov_b32 v33, v28
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v34, v26 :: v_dual_mov_b32 v35, v24
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v36, v22 :: v_dual_mov_b32 v37, v20
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v38, v18 :: v_dual_mov_b32 v39, v16
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v48, v14 :: v_dual_mov_b32 v49, v12
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v50, v10 :: v_dual_mov_b32 v51, v8
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v52, v6 :: v_dual_mov_b32 v53, v4
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v54, v2 :: v_dual_mov_b32 v55, v0
-; GFX11-FAKE16-NEXT:    s_clause 0x1f
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v0, off, s32 offset:316
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v2, off, s32
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v4, off, s32 offset:8
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v6, off, s32 offset:16
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v8, off, s32 offset:24
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v10, off, s32 offset:32
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v12, off, s32 offset:40
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v14, off, s32 offset:48
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v16, off, s32 offset:56
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v18, off, s32 offset:64
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v20, off, s32 offset:72
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v22, off, s32 offset:80
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v24, off, s32 offset:88
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v26, off, s32 offset:96
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v28, off, s32 offset:104
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v30, off, s32 offset:112
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v31, off, s32 offset:120
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v87, off, s32 offset:128
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v96, off, s32 offset:136
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v97, off, s32 offset:144
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v98, off, s32 offset:152
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v99, off, s32 offset:160
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v113, off, s32 offset:168
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v114, off, s32 offset:176
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v115, off, s32 offset:184
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v116, off, s32 offset:192
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v117, off, s32 offset:200
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v128, off, s32 offset:208
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v129, off, s32 offset:216
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v130, off, s32 offset:224
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v131, off, s32 offset:232
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v94, off, s32 offset:240
-; GFX11-FAKE16-NEXT:    s_clause 0x1f
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v95, off, s32 offset:248
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v104, off, s32 offset:256
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v105, off, s32 offset:264
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v106, off, s32 offset:272
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v107, off, s32 offset:280
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v108, off, s32 offset:288
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v109, off, s32 offset:296
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v110, off, s32 offset:304
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v111, off, s32 offset:312
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v64, off, s32 offset:308
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v65, off, s32 offset:300
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v66, off, s32 offset:292
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v67, off, s32 offset:284
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v68, off, s32 offset:276
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v69, off, s32 offset:268
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v70, off, s32 offset:260
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v71, off, s32 offset:252
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v80, off, s32 offset:244
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v81, off, s32 offset:236
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v82, off, s32 offset:228
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v83, off, s32 offset:220
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v84, off, s32 offset:212
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v85, off, s32 offset:204
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v86, off, s32 offset:196
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v100, off, s32 offset:188
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v101, off, s32 offset:180
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v102, off, s32 offset:172
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v103, off, s32 offset:164
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v112, off, s32 offset:156
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v118, off, s32 offset:148
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v145, off, s32 offset:140
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v146, off, s32 offset:132
-; GFX11-FAKE16-NEXT:    s_clause 0xf
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v147, off, s32 offset:124
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v148, off, s32 offset:116
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v162, off, s32 offset:108
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v163, off, s32 offset:100
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v164, off, s32 offset:92
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v165, off, s32 offset:84
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v166, off, s32 offset:76
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v180, off, s32 offset:68
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v181, off, s32 offset:60
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v182, off, s32 offset:52
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v183, off, s32 offset:44
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v45, off, s32 offset:36
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v46, off, s32 offset:28
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v47, off, s32 offset:20
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v56, off, s32 offset:12
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v62, off, s32 offset:4
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v89, 8, v1
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v90, 8, v3
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v91, 8, v5
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v92, 8, v7
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v93, 8, v9
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v76, 8, v11
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v77, 8, v13
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v78, 8, v15
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v79, 8, v17
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v88, 8, v19
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v63, 8, v21
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v72, 8, v23
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v73, 8, v25
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v74, 8, v27
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v75, 8, v29
-; GFX11-FAKE16-NEXT:    s_mov_b32 s4, 0
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(62)
-; GFX11-FAKE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v57, 8, v2
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v58, 8, v4
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v59, 8, v6
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v60, 8, v8
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v61, 8, v10
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v40, 8, v12
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v41, 8, v14
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v42, 8, v16
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v43, 8, v18
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v44, 8, v20
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v167, 8, v22
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v176, 8, v24
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v177, 8, v26
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v178, 8, v28
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v179, 8, v30
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v149, 8, v31
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v150, 8, v87
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(61)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v151, 8, v96
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(60)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v160, 8, v97
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(59)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v161, 8, v98
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(58)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v132, 8, v99
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(57)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v133, 8, v113
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(56)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v134, 8, v114
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(55)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v135, 8, v115
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(54)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v144, 8, v116
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(53)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v119, 8, v117
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(52)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v128, 8, v128
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(51)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v129, 8, v129
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(50)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v130, 8, v130
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(49)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v131, 8, v131
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(48)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v113, 8, v94
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(47)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v114, 8, v95
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(46)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v115, 8, v104
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(45)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v116, 8, v105
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(44)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v117, 8, v106
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(43)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v87, 8, v107
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(42)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v96, 8, v108
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(41)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v97, 8, v109
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(40)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v98, 8, v110
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(39)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v99, 8, v111
-; GFX11-FAKE16-NEXT:    s_and_b32 s5, vcc_lo, exec_lo
-; GFX11-FAKE16-NEXT:    s_cbranch_scc0 .LBB59_4
-; GFX11-FAKE16-NEXT:  ; %bb.1: ; %cmp.false
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v54
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v53
-; GFX11-FAKE16-NEXT:    s_and_b32 s5, s28, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s6, s29, 8
-; GFX11-FAKE16-NEXT:    s_and_b32 s7, s2, 0xff
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v90
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v91
-; GFX11-FAKE16-NEXT:    s_or_b32 s5, s5, s6
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s6, s1, 8
-; GFX11-FAKE16-NEXT:    s_and_b32 s5, s5, 0xffff
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s8, s3, 8
-; GFX11-FAKE16-NEXT:    s_and_b32 s9, s18, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s10, s19, 8
-; GFX11-FAKE16-NEXT:    s_and_b32 s11, s26, 0xff
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v0, v1
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v50
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v49
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s12, s27, 8
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v76
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v77
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v7, v0, v1
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v48
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v39
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v78
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v79
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v8, v0, v1
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v38
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v37
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v88
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v63
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v9, v0, v1
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v36
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v35
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v72
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v73
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v10, v0, v1
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v34
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v33
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v74
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v75
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v11, v0, v1
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v32
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v62
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v57
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v58
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v12, v0, v1
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v56
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v47
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v59
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v60
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v13, v0, v1
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v46
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v45
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v61
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v40
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v14, v0, v1
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v183
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v182
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v41
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v42
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v15, v0, v1
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v181
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v180
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v43
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v44
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v16, v0, v1
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v166
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v165
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v167
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v176
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v17, v0, v1
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v164
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v163
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v177
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v178
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v18, v0, v1
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v162
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v148
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v179
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v149
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v19, v0, v1
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v147
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v146
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v150
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v151
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v20, v0, v1
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v145
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v118
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v160
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v161
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v21, v0, v1
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v112
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v103
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v132
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v133
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v22, v0, v1
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v102
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v101
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v134
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v135
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v23, v0, v1
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v100
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v86
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v144
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v119
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v24, v0, v1
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v85
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v84
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v128
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v129
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v25, v0, v1
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v83
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v82
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v130
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v131
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v26, v0, v1
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v81
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v80
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v113
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v114
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v27, v0, v1
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v71
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v70
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v115
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v116
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v28, v0, v1
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v69
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v68
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v117
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v87
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v29, v0, v1
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v67
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v66
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v96
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v97
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v30, v0, v1
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v65
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v64
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v98
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v99
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v31, v0, v1
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v55
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v89
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, s5, v0
-; GFX11-FAKE16-NEXT:    s_and_b32 s5, s0, 0xff
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-FAKE16-NEXT:    s_or_b32 s5, s5, s6
-; GFX11-FAKE16-NEXT:    s_or_b32 s6, s7, s8
-; GFX11-FAKE16-NEXT:    s_and_b32 s7, s16, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s8, s17, 8
-; GFX11-FAKE16-NEXT:    s_and_b32 s5, s5, 0xffff
-; GFX11-FAKE16-NEXT:    s_or_b32 s7, s7, s8
-; GFX11-FAKE16-NEXT:    s_or_b32 s8, s9, s10
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s6, s6, 16
-; GFX11-FAKE16-NEXT:    s_and_b32 s7, s7, 0xffff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s8, s8, 16
-; GFX11-FAKE16-NEXT:    s_or_b32 s5, s5, s6
-; GFX11-FAKE16-NEXT:    s_or_b32 s6, s7, s8
-; GFX11-FAKE16-NEXT:    s_and_b32 s7, s20, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s8, s21, 8
-; GFX11-FAKE16-NEXT:    s_and_b32 s9, s22, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s10, s23, 8
-; GFX11-FAKE16-NEXT:    s_or_b32 s7, s7, s8
-; GFX11-FAKE16-NEXT:    s_or_b32 s8, s9, s10
-; GFX11-FAKE16-NEXT:    s_and_b32 s9, s24, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s10, s25, 8
-; GFX11-FAKE16-NEXT:    s_and_b32 s7, s7, 0xffff
-; GFX11-FAKE16-NEXT:    s_or_b32 s9, s9, s10
-; GFX11-FAKE16-NEXT:    s_or_b32 s10, s11, s12
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s8, s8, 16
-; GFX11-FAKE16-NEXT:    s_and_b32 s9, s9, 0xffff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s10, s10, 16
-; GFX11-FAKE16-NEXT:    s_or_b32 s7, s7, s8
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v51
-; GFX11-FAKE16-NEXT:    s_or_b32 s8, s9, s10
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v52
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, s6
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v3, v93
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v2, v92
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v6, v2, v3
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v2, s7 :: v_dual_mov_b32 v3, s8
-; GFX11-FAKE16-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s4
-; GFX11-FAKE16-NEXT:    s_cbranch_vccnz .LBB59_3
-; GFX11-FAKE16-NEXT:  .LBB59_2: ; %cmp.true
-; GFX11-FAKE16-NEXT:    s_add_i32 s0, s0, 3
-; GFX11-FAKE16-NEXT:    s_add_i32 s2, s2, 3
-; GFX11-FAKE16-NEXT:    s_and_b32 s0, s0, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s1, s1, 8
-; GFX11-FAKE16-NEXT:    s_and_b32 s2, s2, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s3, s3, 8
-; GFX11-FAKE16-NEXT:    s_or_b32 s0, s1, s0
-; GFX11-FAKE16-NEXT:    s_or_b32 s1, s3, s2
-; GFX11-FAKE16-NEXT:    s_addk_i32 s0, 0x300
-; GFX11-FAKE16-NEXT:    s_addk_i32 s1, 0x300
-; GFX11-FAKE16-NEXT:    s_and_b32 s0, s0, 0xffff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s1, s1, 16
-; GFX11-FAKE16-NEXT:    s_add_i32 s16, s16, 3
-; GFX11-FAKE16-NEXT:    s_or_b32 s0, s0, s1
-; GFX11-FAKE16-NEXT:    s_and_b32 s1, s16, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s2, s17, 8
-; GFX11-FAKE16-NEXT:    s_add_i32 s18, s18, 3
-; GFX11-FAKE16-NEXT:    s_or_b32 s1, s2, s1
-; GFX11-FAKE16-NEXT:    s_and_b32 s2, s18, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s3, s19, 8
-; GFX11-FAKE16-NEXT:    s_addk_i32 s1, 0x300
-; GFX11-FAKE16-NEXT:    s_or_b32 s2, s3, s2
-; GFX11-FAKE16-NEXT:    s_add_i32 s20, s20, 3
-; GFX11-FAKE16-NEXT:    s_addk_i32 s2, 0x300
-; GFX11-FAKE16-NEXT:    s_and_b32 s1, s1, 0xffff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s2, s2, 16
-; GFX11-FAKE16-NEXT:    s_and_b32 s3, s20, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s4, s21, 8
-; GFX11-FAKE16-NEXT:    s_add_i32 s22, s22, 3
-; GFX11-FAKE16-NEXT:    s_or_b32 s1, s1, s2
-; GFX11-FAKE16-NEXT:    s_or_b32 s2, s4, s3
-; GFX11-FAKE16-NEXT:    s_and_b32 s3, s22, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s4, s23, 8
-; GFX11-FAKE16-NEXT:    s_add_i32 s24, s24, 3
-; GFX11-FAKE16-NEXT:    s_or_b32 s3, s4, s3
-; GFX11-FAKE16-NEXT:    s_and_b32 s4, s24, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s5, s25, 8
-; GFX11-FAKE16-NEXT:    s_addk_i32 s2, 0x300
-; GFX11-FAKE16-NEXT:    s_addk_i32 s3, 0x300
-; GFX11-FAKE16-NEXT:    s_or_b32 s4, s5, s4
-; GFX11-FAKE16-NEXT:    s_and_b32 s2, s2, 0xffff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s3, s3, 16
-; GFX11-FAKE16-NEXT:    s_addk_i32 s4, 0x300
-; GFX11-FAKE16-NEXT:    s_add_i32 s26, s26, 3
-; GFX11-FAKE16-NEXT:    s_or_b32 s2, s2, s3
-; GFX11-FAKE16-NEXT:    s_and_b32 s3, s4, 0xffff
-; GFX11-FAKE16-NEXT:    s_and_b32 s4, s26, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s5, s27, 8
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v0, 3, v55
-; GFX11-FAKE16-NEXT:    s_or_b32 s4, s5, s4
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v1, 3, v54
-; GFX11-FAKE16-NEXT:    s_addk_i32 s4, 0x300
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v3, 3, v52
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s4, s4, 16
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v4, 3, v51
-; GFX11-FAKE16-NEXT:    s_or_b32 s3, s3, s4
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v2, 3, v53
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v7, 3, v38
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v11, 3, v34
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v12, 3, v33
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v89, v0
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v90, v1
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v91, v2
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v92, v3
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, v93, v4
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xff, v7
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v11, 0xff, v11
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v12, 0xff, v12
-; GFX11-FAKE16-NEXT:    s_add_i32 s28, s28, 3
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v0, 0x300, v0
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v1, 0x300, v1
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v2, 0x300, v2
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v3, 0x300, v3
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v4, 0x300, v4
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v7, v88, v7
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v11, v74, v11
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v12, v75, v12
-; GFX11-FAKE16-NEXT:    s_and_b32 s5, s28, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s6, s29, 8
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX11-FAKE16-NEXT:    s_or_b32 s5, s6, s5
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xffff, v3
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v4
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v7, 0x300, v7
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v11, 0x300, v11
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v12, 0x300, v12
-; GFX11-FAKE16-NEXT:    s_addk_i32 s5, 0x300
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v1, v2
-; GFX11-FAKE16-NEXT:    s_and_b32 s5, s5, 0xffff
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v6, v3, v6
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, s5, v0
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v0, 3, v50
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v1, 3, v49
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v2, 3, v48
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v3, 3, v39
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v9, 0xffff, v7
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v7, 3, v37
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v8, 3, v36
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v10, 3, v35
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v11, 0xffff, v11
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xff, v7
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 0xff, v8
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v10, 0xff, v10
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v11, v11, v12
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(3)
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v12, 3, v46
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v16, 3, v181
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v17, 3, v180
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v76, v0
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v77, v1
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v78, v2
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v79, v3
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v7, v63, v7
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v8, v72, v8
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v10, v73, v10
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v12, 0xff, v12
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v16, 0xff, v16
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v17, 0xff, v17
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v0, 0x300, v0
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v1, 0x300, v1
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v2, 0x300, v2
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v3, 0x300, v3
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v7, 0x300, v7
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v8, 0x300, v8
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v10, 0x300, v10
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v12, v61, v12
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v16, v43, v16
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v17, v44, v17
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v7
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v14, 0xffff, v8
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v12, 0x300, v12
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v16, 0x300, v16
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v17, 0x300, v17
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v7, v0, v1
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v8, v2, v3
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v9, v9, v13
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v10, v14, v10
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v0, 3, v32
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v1, 3, v62
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v2, 3, v56
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v3, 3, v47
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v14, 0xffff, v12
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v12, 3, v45
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v13, 3, v183
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v15, 3, v182
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v16, 0xffff, v16
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v17, 16, v17
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v12, 0xff, v12
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v13, 0xff, v13
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v15, 0xff, v15
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v16, v16, v17
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v17, 3, v162
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v21, 3, v145
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v22, 3, v118
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v57, v0
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v58, v1
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v59, v2
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v60, v3
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v12, v40, v12
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v13, v41, v13
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v15, v42, v15
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v17, 0xff, v17
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v21, 0xff, v21
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v22, 0xff, v22
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v0, 0x300, v0
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v1, 0x300, v1
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v2, 0x300, v2
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v3, 0x300, v3
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v12, 0x300, v12
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v13, 0x300, v13
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v15, 0x300, v15
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v17, v179, v17
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v21, v160, v21
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v22, v161, v22
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v18, 16, v12
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v19, 0xffff, v13
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v17, 0x300, v17
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v21, 0x300, v21
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v22, 0x300, v22
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v12, v0, v1
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v13, v2, v3
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v14, v14, v18
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v15, v19, v15
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v0, 3, v166
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v1, 3, v165
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v2, 3, v164
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v3, 3, v163
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v19, 0xffff, v17
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v17, 3, v148
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v18, 3, v147
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v20, 3, v146
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v21, 0xffff, v21
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v22
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v17, 0xff, v17
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v18, 0xff, v18
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v20, 0xff, v20
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v21, v21, v22
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v22, 3, v100
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v26, 3, v83
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v27, 3, v82
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v167, v0
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v176, v1
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v177, v2
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v178, v3
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v17, v149, v17
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v18, v150, v18
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v20, v151, v20
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v22, 0xff, v22
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v26, 0xff, v26
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v27, 0xff, v27
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v0, 0x300, v0
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v1, 0x300, v1
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v2, 0x300, v2
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v3, 0x300, v3
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v17, 0x300, v17
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v18, 0x300, v18
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v20, 0x300, v20
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v22, v144, v22
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v26, v130, v26
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v27, v131, v27
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v17
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v24, 0xffff, v18
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v20, 16, v20
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v22, 0x300, v22
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v26, 0x300, v26
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v27, 0x300, v27
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v17, v0, v1
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v18, v2, v3
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v19, v19, v23
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v20, v24, v20
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v0, 3, v112
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v1, 3, v103
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v2, 3, v102
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v3, 3, v101
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v24, 0xffff, v22
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v22, 3, v86
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v23, 3, v85
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v25, 3, v84
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v26, 0xffff, v26
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v27, 16, v27
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v22, 0xff, v22
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v23, 0xff, v23
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v25, 0xff, v25
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v26, v26, v27
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v27, 3, v69
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v132, v0
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v133, v1
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v134, v2
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v135, v3
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v22, v119, v22
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v23, v128, v23
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v25, v129, v25
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v27, 0xff, v27
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v0, 0x300, v0
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v1, 0x300, v1
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v2, 0x300, v2
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v3, 0x300, v3
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v22, 0x300, v22
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v23, 0x300, v23
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v25, 0x300, v25
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v27, v117, v27
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v28, 16, v22
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v29, 0xffff, v23
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v25
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v27, 0x300, v27
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v22, v0, v1
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v23, v2, v3
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v24, v24, v28
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v25, v29, v25
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v0, 3, v81
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v1, 3, v80
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v2, 3, v71
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v3, 3, v70
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v29, 0xffff, v27
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v27, 3, v68
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v28, 3, v67
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v30, 3, v66
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v31, 3, v65
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v32, 3, v64
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v27, 0xff, v27
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v28, 0xff, v28
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v30, 0xff, v30
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v31, 0xff, v31
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v32, 0xff, v32
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v113, v0
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v114, v1
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v115, v2
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v116, v3
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v27, v87, v27
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v28, v96, v28
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v30, v97, v30
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v31, v98, v31
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v32, v99, v32
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v0, 0x300, v0
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v1, 0x300, v1
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v2, 0x300, v2
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v3, 0x300, v3
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v27, 0x300, v27
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v28, 0x300, v28
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v30, 0x300, v30
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v31, 0x300, v31
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v32, 0x300, v32
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v33, 16, v27
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v34, 0xffff, v28
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v30, 16, v30
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v31, 0xffff, v31
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v32, 16, v32
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v27, v0, v1
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v28, v2, v3
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v29, v29, v33
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v30, v34, v30
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v31, v31, v32
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX11-FAKE16-NEXT:  .LBB59_3: ; %end
-; GFX11-FAKE16-NEXT:    s_clause 0x1f
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v111, off, s32 offset:320
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v110, off, s32 offset:324
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v109, off, s32 offset:328
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v108, off, s32 offset:332
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v107, off, s32 offset:336
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v106, off, s32 offset:340
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v105, off, s32 offset:344
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v104, off, s32 offset:348
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v95, off, s32 offset:352
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v94, off, s32 offset:356
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v93, off, s32 offset:360
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v92, off, s32 offset:364
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v91, off, s32 offset:368
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v90, off, s32 offset:372
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v89, off, s32 offset:376
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v88, off, s32 offset:380
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v79, off, s32 offset:384
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v78, off, s32 offset:388
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v77, off, s32 offset:392
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v76, off, s32 offset:396
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v75, off, s32 offset:400
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v74, off, s32 offset:404
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v73, off, s32 offset:408
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v72, off, s32 offset:412
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v63, off, s32 offset:416
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v62, off, s32 offset:420
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v61, off, s32 offset:424
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v60, off, s32 offset:428
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v59, off, s32 offset:432
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v58, off, s32 offset:436
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v57, off, s32 offset:440
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v56, off, s32 offset:444
-; GFX11-FAKE16-NEXT:    s_clause 0x7
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v47, off, s32 offset:448
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v46, off, s32 offset:452
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v45, off, s32 offset:456
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v44, off, s32 offset:460
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v43, off, s32 offset:464
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v42, off, s32 offset:468
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v41, off, s32 offset:472
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v40, off, s32 offset:476
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
-; GFX11-FAKE16-NEXT:  .LBB59_4:
-; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
-; GFX11-FAKE16-NEXT:    s_branch .LBB59_2
+; GFX11-LABEL: bitcast_v128i8_to_v16i64_scalar:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_clause 0x1f
+; GFX11-NEXT:    scratch_store_b32 off, v40, s32 offset:476
+; GFX11-NEXT:    scratch_store_b32 off, v41, s32 offset:472
+; GFX11-NEXT:    scratch_store_b32 off, v42, s32 offset:468
+; GFX11-NEXT:    scratch_store_b32 off, v43, s32 offset:464
+; GFX11-NEXT:    scratch_store_b32 off, v44, s32 offset:460
+; GFX11-NEXT:    scratch_store_b32 off, v45, s32 offset:456
+; GFX11-NEXT:    scratch_store_b32 off, v46, s32 offset:452
+; GFX11-NEXT:    scratch_store_b32 off, v47, s32 offset:448
+; GFX11-NEXT:    scratch_store_b32 off, v56, s32 offset:444
+; GFX11-NEXT:    scratch_store_b32 off, v57, s32 offset:440
+; GFX11-NEXT:    scratch_store_b32 off, v58, s32 offset:436
+; GFX11-NEXT:    scratch_store_b32 off, v59, s32 offset:432
+; GFX11-NEXT:    scratch_store_b32 off, v60, s32 offset:428
+; GFX11-NEXT:    scratch_store_b32 off, v61, s32 offset:424
+; GFX11-NEXT:    scratch_store_b32 off, v62, s32 offset:420
+; GFX11-NEXT:    scratch_store_b32 off, v63, s32 offset:416
+; GFX11-NEXT:    scratch_store_b32 off, v72, s32 offset:412
+; GFX11-NEXT:    scratch_store_b32 off, v73, s32 offset:408
+; GFX11-NEXT:    scratch_store_b32 off, v74, s32 offset:404
+; GFX11-NEXT:    scratch_store_b32 off, v75, s32 offset:400
+; GFX11-NEXT:    scratch_store_b32 off, v76, s32 offset:396
+; GFX11-NEXT:    scratch_store_b32 off, v77, s32 offset:392
+; GFX11-NEXT:    scratch_store_b32 off, v78, s32 offset:388
+; GFX11-NEXT:    scratch_store_b32 off, v79, s32 offset:384
+; GFX11-NEXT:    scratch_store_b32 off, v88, s32 offset:380
+; GFX11-NEXT:    scratch_store_b32 off, v89, s32 offset:376
+; GFX11-NEXT:    scratch_store_b32 off, v90, s32 offset:372
+; GFX11-NEXT:    scratch_store_b32 off, v91, s32 offset:368
+; GFX11-NEXT:    scratch_store_b32 off, v92, s32 offset:364
+; GFX11-NEXT:    scratch_store_b32 off, v93, s32 offset:360
+; GFX11-NEXT:    scratch_store_b32 off, v94, s32 offset:356
+; GFX11-NEXT:    scratch_store_b32 off, v95, s32 offset:352
+; GFX11-NEXT:    s_clause 0x7
+; GFX11-NEXT:    scratch_store_b32 off, v104, s32 offset:348
+; GFX11-NEXT:    scratch_store_b32 off, v105, s32 offset:344
+; GFX11-NEXT:    scratch_store_b32 off, v106, s32 offset:340
+; GFX11-NEXT:    scratch_store_b32 off, v107, s32 offset:336
+; GFX11-NEXT:    scratch_store_b32 off, v108, s32 offset:332
+; GFX11-NEXT:    scratch_store_b32 off, v109, s32 offset:328
+; GFX11-NEXT:    scratch_store_b32 off, v110, s32 offset:324
+; GFX11-NEXT:    scratch_store_b32 off, v111, s32 offset:320
+; GFX11-NEXT:    v_dual_mov_b32 v32, v30 :: v_dual_mov_b32 v33, v28
+; GFX11-NEXT:    v_dual_mov_b32 v34, v26 :: v_dual_mov_b32 v35, v24
+; GFX11-NEXT:    v_dual_mov_b32 v36, v22 :: v_dual_mov_b32 v37, v20
+; GFX11-NEXT:    v_dual_mov_b32 v38, v18 :: v_dual_mov_b32 v39, v16
+; GFX11-NEXT:    v_dual_mov_b32 v48, v14 :: v_dual_mov_b32 v49, v12
+; GFX11-NEXT:    v_dual_mov_b32 v50, v10 :: v_dual_mov_b32 v51, v8
+; GFX11-NEXT:    v_dual_mov_b32 v52, v6 :: v_dual_mov_b32 v53, v4
+; GFX11-NEXT:    v_dual_mov_b32 v54, v2 :: v_dual_mov_b32 v55, v0
+; GFX11-NEXT:    s_clause 0x1f
+; GFX11-NEXT:    scratch_load_b32 v0, off, s32 offset:316
+; GFX11-NEXT:    scratch_load_u16 v2, off, s32
+; GFX11-NEXT:    scratch_load_u16 v4, off, s32 offset:8
+; GFX11-NEXT:    scratch_load_u16 v6, off, s32 offset:16
+; GFX11-NEXT:    scratch_load_u16 v8, off, s32 offset:24
+; GFX11-NEXT:    scratch_load_u16 v10, off, s32 offset:32
+; GFX11-NEXT:    scratch_load_u16 v12, off, s32 offset:40
+; GFX11-NEXT:    scratch_load_u16 v14, off, s32 offset:48
+; GFX11-NEXT:    scratch_load_u16 v16, off, s32 offset:56
+; GFX11-NEXT:    scratch_load_u16 v18, off, s32 offset:64
+; GFX11-NEXT:    scratch_load_u16 v20, off, s32 offset:72
+; GFX11-NEXT:    scratch_load_u16 v22, off, s32 offset:80
+; GFX11-NEXT:    scratch_load_u16 v24, off, s32 offset:88
+; GFX11-NEXT:    scratch_load_u16 v26, off, s32 offset:96
+; GFX11-NEXT:    scratch_load_u16 v28, off, s32 offset:104
+; GFX11-NEXT:    scratch_load_u16 v30, off, s32 offset:112
+; GFX11-NEXT:    scratch_load_u16 v31, off, s32 offset:120
+; GFX11-NEXT:    scratch_load_u16 v87, off, s32 offset:128
+; GFX11-NEXT:    scratch_load_u16 v96, off, s32 offset:136
+; GFX11-NEXT:    scratch_load_u16 v97, off, s32 offset:144
+; GFX11-NEXT:    scratch_load_u16 v98, off, s32 offset:152
+; GFX11-NEXT:    scratch_load_u16 v99, off, s32 offset:160
+; GFX11-NEXT:    scratch_load_u16 v113, off, s32 offset:168
+; GFX11-NEXT:    scratch_load_u16 v114, off, s32 offset:176
+; GFX11-NEXT:    scratch_load_u16 v115, off, s32 offset:184
+; GFX11-NEXT:    scratch_load_u16 v116, off, s32 offset:192
+; GFX11-NEXT:    scratch_load_u16 v117, off, s32 offset:200
+; GFX11-NEXT:    scratch_load_u16 v128, off, s32 offset:208
+; GFX11-NEXT:    scratch_load_u16 v129, off, s32 offset:216
+; GFX11-NEXT:    scratch_load_u16 v130, off, s32 offset:224
+; GFX11-NEXT:    scratch_load_u16 v131, off, s32 offset:232
+; GFX11-NEXT:    scratch_load_u16 v94, off, s32 offset:240
+; GFX11-NEXT:    s_clause 0x1f
+; GFX11-NEXT:    scratch_load_u16 v95, off, s32 offset:248
+; GFX11-NEXT:    scratch_load_u16 v104, off, s32 offset:256
+; GFX11-NEXT:    scratch_load_u16 v105, off, s32 offset:264
+; GFX11-NEXT:    scratch_load_u16 v106, off, s32 offset:272
+; GFX11-NEXT:    scratch_load_u16 v107, off, s32 offset:280
+; GFX11-NEXT:    scratch_load_u16 v108, off, s32 offset:288
+; GFX11-NEXT:    scratch_load_u16 v109, off, s32 offset:296
+; GFX11-NEXT:    scratch_load_u16 v110, off, s32 offset:304
+; GFX11-NEXT:    scratch_load_u16 v111, off, s32 offset:312
+; GFX11-NEXT:    scratch_load_u16 v64, off, s32 offset:308
+; GFX11-NEXT:    scratch_load_u16 v65, off, s32 offset:300
+; GFX11-NEXT:    scratch_load_u16 v66, off, s32 offset:292
+; GFX11-NEXT:    scratch_load_u16 v67, off, s32 offset:284
+; GFX11-NEXT:    scratch_load_u16 v68, off, s32 offset:276
+; GFX11-NEXT:    scratch_load_u16 v69, off, s32 offset:268
+; GFX11-NEXT:    scratch_load_u16 v70, off, s32 offset:260
+; GFX11-NEXT:    scratch_load_u16 v71, off, s32 offset:252
+; GFX11-NEXT:    scratch_load_u16 v80, off, s32 offset:244
+; GFX11-NEXT:    scratch_load_u16 v81, off, s32 offset:236
+; GFX11-NEXT:    scratch_load_u16 v82, off, s32 offset:228
+; GFX11-NEXT:    scratch_load_u16 v83, off, s32 offset:220
+; GFX11-NEXT:    scratch_load_u16 v84, off, s32 offset:212
+; GFX11-NEXT:    scratch_load_u16 v85, off, s32 offset:204
+; GFX11-NEXT:    scratch_load_u16 v86, off, s32 offset:196
+; GFX11-NEXT:    scratch_load_u16 v100, off, s32 offset:188
+; GFX11-NEXT:    scratch_load_u16 v101, off, s32 offset:180
+; GFX11-NEXT:    scratch_load_u16 v102, off, s32 offset:172
+; GFX11-NEXT:    scratch_load_u16 v103, off, s32 offset:164
+; GFX11-NEXT:    scratch_load_u16 v112, off, s32 offset:156
+; GFX11-NEXT:    scratch_load_u16 v118, off, s32 offset:148
+; GFX11-NEXT:    scratch_load_u16 v145, off, s32 offset:140
+; GFX11-NEXT:    scratch_load_u16 v146, off, s32 offset:132
+; GFX11-NEXT:    s_clause 0xf
+; GFX11-NEXT:    scratch_load_u16 v147, off, s32 offset:124
+; GFX11-NEXT:    scratch_load_u16 v148, off, s32 offset:116
+; GFX11-NEXT:    scratch_load_u16 v162, off, s32 offset:108
+; GFX11-NEXT:    scratch_load_u16 v163, off, s32 offset:100
+; GFX11-NEXT:    scratch_load_u16 v164, off, s32 offset:92
+; GFX11-NEXT:    scratch_load_u16 v165, off, s32 offset:84
+; GFX11-NEXT:    scratch_load_u16 v166, off, s32 offset:76
+; GFX11-NEXT:    scratch_load_u16 v180, off, s32 offset:68
+; GFX11-NEXT:    scratch_load_u16 v181, off, s32 offset:60
+; GFX11-NEXT:    scratch_load_u16 v182, off, s32 offset:52
+; GFX11-NEXT:    scratch_load_u16 v183, off, s32 offset:44
+; GFX11-NEXT:    scratch_load_u16 v45, off, s32 offset:36
+; GFX11-NEXT:    scratch_load_u16 v46, off, s32 offset:28
+; GFX11-NEXT:    scratch_load_u16 v47, off, s32 offset:20
+; GFX11-NEXT:    scratch_load_u16 v56, off, s32 offset:12
+; GFX11-NEXT:    scratch_load_u16 v62, off, s32 offset:4
+; GFX11-NEXT:    v_lshlrev_b32_e32 v89, 8, v1
+; GFX11-NEXT:    v_lshlrev_b32_e32 v90, 8, v3
+; GFX11-NEXT:    v_lshlrev_b32_e32 v91, 8, v5
+; GFX11-NEXT:    v_lshlrev_b32_e32 v92, 8, v7
+; GFX11-NEXT:    v_lshlrev_b32_e32 v93, 8, v9
+; GFX11-NEXT:    v_lshlrev_b32_e32 v76, 8, v11
+; GFX11-NEXT:    v_lshlrev_b32_e32 v77, 8, v13
+; GFX11-NEXT:    v_lshlrev_b32_e32 v78, 8, v15
+; GFX11-NEXT:    v_lshlrev_b32_e32 v79, 8, v17
+; GFX11-NEXT:    v_lshlrev_b32_e32 v88, 8, v19
+; GFX11-NEXT:    v_lshlrev_b32_e32 v63, 8, v21
+; GFX11-NEXT:    v_lshlrev_b32_e32 v72, 8, v23
+; GFX11-NEXT:    v_lshlrev_b32_e32 v73, 8, v25
+; GFX11-NEXT:    v_lshlrev_b32_e32 v74, 8, v27
+; GFX11-NEXT:    v_lshlrev_b32_e32 v75, 8, v29
+; GFX11-NEXT:    s_mov_b32 s4, 0
+; GFX11-NEXT:    s_waitcnt vmcnt(62)
+; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v57, 8, v2
+; GFX11-NEXT:    v_lshlrev_b32_e32 v58, 8, v4
+; GFX11-NEXT:    v_lshlrev_b32_e32 v59, 8, v6
+; GFX11-NEXT:    v_lshlrev_b32_e32 v60, 8, v8
+; GFX11-NEXT:    v_lshlrev_b32_e32 v61, 8, v10
+; GFX11-NEXT:    v_lshlrev_b32_e32 v40, 8, v12
+; GFX11-NEXT:    v_lshlrev_b32_e32 v41, 8, v14
+; GFX11-NEXT:    v_lshlrev_b32_e32 v42, 8, v16
+; GFX11-NEXT:    v_lshlrev_b32_e32 v43, 8, v18
+; GFX11-NEXT:    v_lshlrev_b32_e32 v44, 8, v20
+; GFX11-NEXT:    v_lshlrev_b32_e32 v167, 8, v22
+; GFX11-NEXT:    v_lshlrev_b32_e32 v176, 8, v24
+; GFX11-NEXT:    v_lshlrev_b32_e32 v177, 8, v26
+; GFX11-NEXT:    v_lshlrev_b32_e32 v178, 8, v28
+; GFX11-NEXT:    v_lshlrev_b32_e32 v179, 8, v30
+; GFX11-NEXT:    v_lshlrev_b32_e32 v149, 8, v31
+; GFX11-NEXT:    v_lshlrev_b32_e32 v150, 8, v87
+; GFX11-NEXT:    s_waitcnt vmcnt(61)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v151, 8, v96
+; GFX11-NEXT:    s_waitcnt vmcnt(60)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v160, 8, v97
+; GFX11-NEXT:    s_waitcnt vmcnt(59)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v161, 8, v98
+; GFX11-NEXT:    s_waitcnt vmcnt(58)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v132, 8, v99
+; GFX11-NEXT:    s_waitcnt vmcnt(57)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v133, 8, v113
+; GFX11-NEXT:    s_waitcnt vmcnt(56)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v134, 8, v114
+; GFX11-NEXT:    s_waitcnt vmcnt(55)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v135, 8, v115
+; GFX11-NEXT:    s_waitcnt vmcnt(54)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v144, 8, v116
+; GFX11-NEXT:    s_waitcnt vmcnt(53)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v119, 8, v117
+; GFX11-NEXT:    s_waitcnt vmcnt(52)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v128, 8, v128
+; GFX11-NEXT:    s_waitcnt vmcnt(51)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v129, 8, v129
+; GFX11-NEXT:    s_waitcnt vmcnt(50)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v130, 8, v130
+; GFX11-NEXT:    s_waitcnt vmcnt(49)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v131, 8, v131
+; GFX11-NEXT:    s_waitcnt vmcnt(48)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v113, 8, v94
+; GFX11-NEXT:    s_waitcnt vmcnt(47)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v114, 8, v95
+; GFX11-NEXT:    s_waitcnt vmcnt(46)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v115, 8, v104
+; GFX11-NEXT:    s_waitcnt vmcnt(45)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v116, 8, v105
+; GFX11-NEXT:    s_waitcnt vmcnt(44)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v117, 8, v106
+; GFX11-NEXT:    s_waitcnt vmcnt(43)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v87, 8, v107
+; GFX11-NEXT:    s_waitcnt vmcnt(42)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v96, 8, v108
+; GFX11-NEXT:    s_waitcnt vmcnt(41)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v97, 8, v109
+; GFX11-NEXT:    s_waitcnt vmcnt(40)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v98, 8, v110
+; GFX11-NEXT:    s_waitcnt vmcnt(39)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v99, 8, v111
+; GFX11-NEXT:    s_and_b32 s5, vcc_lo, exec_lo
+; GFX11-NEXT:    s_cbranch_scc0 .LBB59_4
+; GFX11-NEXT:  ; %bb.1: ; %cmp.false
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v54
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v53
+; GFX11-NEXT:    s_and_b32 s5, s28, 0xff
+; GFX11-NEXT:    s_lshl_b32 s6, s29, 8
+; GFX11-NEXT:    s_and_b32 s7, s2, 0xff
+; GFX11-NEXT:    v_or_b32_e32 v0, v0, v90
+; GFX11-NEXT:    v_or_b32_e32 v1, v1, v91
+; GFX11-NEXT:    s_or_b32 s5, s5, s6
+; GFX11-NEXT:    s_lshl_b32 s6, s1, 8
+; GFX11-NEXT:    s_and_b32 s5, s5, 0xffff
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    s_lshl_b32 s8, s3, 8
+; GFX11-NEXT:    s_and_b32 s9, s18, 0xff
+; GFX11-NEXT:    s_lshl_b32 s10, s19, 8
+; GFX11-NEXT:    s_and_b32 s11, s26, 0xff
+; GFX11-NEXT:    v_or_b32_e32 v5, v0, v1
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v50
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v49
+; GFX11-NEXT:    s_lshl_b32 s12, s27, 8
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_or_b32_e32 v0, v0, v76
+; GFX11-NEXT:    v_or_b32_e32 v1, v1, v77
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_or_b32_e32 v7, v0, v1
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v48
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v39
+; GFX11-NEXT:    v_or_b32_e32 v0, v0, v78
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_or_b32_e32 v1, v1, v79
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    v_or_b32_e32 v8, v0, v1
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v38
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v37
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_or_b32_e32 v0, v0, v88
+; GFX11-NEXT:    v_or_b32_e32 v1, v1, v63
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_or_b32_e32 v9, v0, v1
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v36
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v35
+; GFX11-NEXT:    v_or_b32_e32 v0, v0, v72
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_or_b32_e32 v1, v1, v73
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    v_or_b32_e32 v10, v0, v1
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v34
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v33
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_or_b32_e32 v0, v0, v74
+; GFX11-NEXT:    v_or_b32_e32 v1, v1, v75
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_or_b32_e32 v11, v0, v1
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v32
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v62
+; GFX11-NEXT:    v_or_b32_e32 v0, v0, v57
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_or_b32_e32 v1, v1, v58
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    v_or_b32_e32 v12, v0, v1
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v56
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v47
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_or_b32_e32 v0, v0, v59
+; GFX11-NEXT:    v_or_b32_e32 v1, v1, v60
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_or_b32_e32 v13, v0, v1
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v46
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v45
+; GFX11-NEXT:    v_or_b32_e32 v0, v0, v61
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_or_b32_e32 v1, v1, v40
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    v_or_b32_e32 v14, v0, v1
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v183
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v182
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_or_b32_e32 v0, v0, v41
+; GFX11-NEXT:    v_or_b32_e32 v1, v1, v42
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_or_b32_e32 v15, v0, v1
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v181
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v180
+; GFX11-NEXT:    v_or_b32_e32 v0, v0, v43
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_or_b32_e32 v1, v1, v44
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    v_or_b32_e32 v16, v0, v1
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v166
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v165
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_or_b32_e32 v0, v0, v167
+; GFX11-NEXT:    v_or_b32_e32 v1, v1, v176
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_or_b32_e32 v17, v0, v1
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v164
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v163
+; GFX11-NEXT:    v_or_b32_e32 v0, v0, v177
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_or_b32_e32 v1, v1, v178
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    v_or_b32_e32 v18, v0, v1
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v162
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v148
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_or_b32_e32 v0, v0, v179
+; GFX11-NEXT:    v_or_b32_e32 v1, v1, v149
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_or_b32_e32 v19, v0, v1
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v147
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v146
+; GFX11-NEXT:    v_or_b32_e32 v0, v0, v150
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_or_b32_e32 v1, v1, v151
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    v_or_b32_e32 v20, v0, v1
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v145
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v118
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_or_b32_e32 v0, v0, v160
+; GFX11-NEXT:    v_or_b32_e32 v1, v1, v161
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_or_b32_e32 v21, v0, v1
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v112
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v103
+; GFX11-NEXT:    v_or_b32_e32 v0, v0, v132
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_or_b32_e32 v1, v1, v133
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    v_or_b32_e32 v22, v0, v1
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v102
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v101
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_or_b32_e32 v0, v0, v134
+; GFX11-NEXT:    v_or_b32_e32 v1, v1, v135
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_or_b32_e32 v23, v0, v1
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v100
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v86
+; GFX11-NEXT:    v_or_b32_e32 v0, v0, v144
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_or_b32_e32 v1, v1, v119
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    v_or_b32_e32 v24, v0, v1
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v85
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v84
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_or_b32_e32 v0, v0, v128
+; GFX11-NEXT:    v_or_b32_e32 v1, v1, v129
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_or_b32_e32 v25, v0, v1
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v83
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v82
+; GFX11-NEXT:    v_or_b32_e32 v0, v0, v130
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_or_b32_e32 v1, v1, v131
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    v_or_b32_e32 v26, v0, v1
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v81
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v80
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_or_b32_e32 v0, v0, v113
+; GFX11-NEXT:    v_or_b32_e32 v1, v1, v114
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_or_b32_e32 v27, v0, v1
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v71
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v70
+; GFX11-NEXT:    v_or_b32_e32 v0, v0, v115
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_or_b32_e32 v1, v1, v116
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    v_or_b32_e32 v28, v0, v1
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v69
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v68
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_or_b32_e32 v0, v0, v117
+; GFX11-NEXT:    v_or_b32_e32 v1, v1, v87
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_or_b32_e32 v29, v0, v1
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v67
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v66
+; GFX11-NEXT:    v_or_b32_e32 v0, v0, v96
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_or_b32_e32 v1, v1, v97
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    v_or_b32_e32 v30, v0, v1
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v65
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v64
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_or_b32_e32 v0, v0, v98
+; GFX11-NEXT:    v_or_b32_e32 v1, v1, v99
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_or_b32_e32 v31, v0, v1
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v55
+; GFX11-NEXT:    v_or_b32_e32 v0, v0, v89
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-NEXT:    v_or_b32_e32 v4, s5, v0
+; GFX11-NEXT:    s_and_b32 s5, s0, 0xff
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_or_b32 s5, s5, s6
+; GFX11-NEXT:    s_or_b32 s6, s7, s8
+; GFX11-NEXT:    s_and_b32 s7, s16, 0xff
+; GFX11-NEXT:    s_lshl_b32 s8, s17, 8
+; GFX11-NEXT:    s_and_b32 s5, s5, 0xffff
+; GFX11-NEXT:    s_or_b32 s7, s7, s8
+; GFX11-NEXT:    s_or_b32 s8, s9, s10
+; GFX11-NEXT:    s_lshl_b32 s6, s6, 16
+; GFX11-NEXT:    s_and_b32 s7, s7, 0xffff
+; GFX11-NEXT:    s_lshl_b32 s8, s8, 16
+; GFX11-NEXT:    s_or_b32 s5, s5, s6
+; GFX11-NEXT:    s_or_b32 s6, s7, s8
+; GFX11-NEXT:    s_and_b32 s7, s20, 0xff
+; GFX11-NEXT:    s_lshl_b32 s8, s21, 8
+; GFX11-NEXT:    s_and_b32 s9, s22, 0xff
+; GFX11-NEXT:    s_lshl_b32 s10, s23, 8
+; GFX11-NEXT:    s_or_b32 s7, s7, s8
+; GFX11-NEXT:    s_or_b32 s8, s9, s10
+; GFX11-NEXT:    s_and_b32 s9, s24, 0xff
+; GFX11-NEXT:    s_lshl_b32 s10, s25, 8
+; GFX11-NEXT:    s_and_b32 s7, s7, 0xffff
+; GFX11-NEXT:    s_or_b32 s9, s9, s10
+; GFX11-NEXT:    s_or_b32 s10, s11, s12
+; GFX11-NEXT:    s_lshl_b32 s8, s8, 16
+; GFX11-NEXT:    s_and_b32 s9, s9, 0xffff
+; GFX11-NEXT:    s_lshl_b32 s10, s10, 16
+; GFX11-NEXT:    s_or_b32 s7, s7, s8
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v51
+; GFX11-NEXT:    s_or_b32 s8, s9, s10
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v52
+; GFX11-NEXT:    v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, s6
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_or_b32_e32 v3, v3, v93
+; GFX11-NEXT:    v_or_b32_e32 v2, v2, v92
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_or_b32_e32 v6, v2, v3
+; GFX11-NEXT:    v_dual_mov_b32 v2, s7 :: v_dual_mov_b32 v3, s8
+; GFX11-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s4
+; GFX11-NEXT:    s_cbranch_vccnz .LBB59_3
+; GFX11-NEXT:  .LBB59_2: ; %cmp.true
+; GFX11-NEXT:    s_add_i32 s0, s0, 3
+; GFX11-NEXT:    s_add_i32 s2, s2, 3
+; GFX11-NEXT:    s_and_b32 s0, s0, 0xff
+; GFX11-NEXT:    s_lshl_b32 s1, s1, 8
+; GFX11-NEXT:    s_and_b32 s2, s2, 0xff
+; GFX11-NEXT:    s_lshl_b32 s3, s3, 8
+; GFX11-NEXT:    s_or_b32 s0, s1, s0
+; GFX11-NEXT:    s_or_b32 s1, s3, s2
+; GFX11-NEXT:    s_addk_i32 s0, 0x300
+; GFX11-NEXT:    s_addk_i32 s1, 0x300
+; GFX11-NEXT:    s_and_b32 s0, s0, 0xffff
+; GFX11-NEXT:    s_lshl_b32 s1, s1, 16
+; GFX11-NEXT:    s_add_i32 s16, s16, 3
+; GFX11-NEXT:    s_or_b32 s0, s0, s1
+; GFX11-NEXT:    s_and_b32 s1, s16, 0xff
+; GFX11-NEXT:    s_lshl_b32 s2, s17, 8
+; GFX11-NEXT:    s_add_i32 s18, s18, 3
+; GFX11-NEXT:    s_or_b32 s1, s2, s1
+; GFX11-NEXT:    s_and_b32 s2, s18, 0xff
+; GFX11-NEXT:    s_lshl_b32 s3, s19, 8
+; GFX11-NEXT:    s_addk_i32 s1, 0x300
+; GFX11-NEXT:    s_or_b32 s2, s3, s2
+; GFX11-NEXT:    s_add_i32 s20, s20, 3
+; GFX11-NEXT:    s_addk_i32 s2, 0x300
+; GFX11-NEXT:    s_and_b32 s1, s1, 0xffff
+; GFX11-NEXT:    s_lshl_b32 s2, s2, 16
+; GFX11-NEXT:    s_and_b32 s3, s20, 0xff
+; GFX11-NEXT:    s_lshl_b32 s4, s21, 8
+; GFX11-NEXT:    s_add_i32 s22, s22, 3
+; GFX11-NEXT:    s_or_b32 s1, s1, s2
+; GFX11-NEXT:    s_or_b32 s2, s4, s3
+; GFX11-NEXT:    s_and_b32 s3, s22, 0xff
+; GFX11-NEXT:    s_lshl_b32 s4, s23, 8
+; GFX11-NEXT:    s_add_i32 s24, s24, 3
+; GFX11-NEXT:    s_or_b32 s3, s4, s3
+; GFX11-NEXT:    s_and_b32 s4, s24, 0xff
+; GFX11-NEXT:    s_lshl_b32 s5, s25, 8
+; GFX11-NEXT:    s_addk_i32 s2, 0x300
+; GFX11-NEXT:    s_addk_i32 s3, 0x300
+; GFX11-NEXT:    s_or_b32 s4, s5, s4
+; GFX11-NEXT:    s_and_b32 s2, s2, 0xffff
+; GFX11-NEXT:    s_lshl_b32 s3, s3, 16
+; GFX11-NEXT:    s_addk_i32 s4, 0x300
+; GFX11-NEXT:    s_add_i32 s26, s26, 3
+; GFX11-NEXT:    s_or_b32 s2, s2, s3
+; GFX11-NEXT:    s_and_b32 s3, s4, 0xffff
+; GFX11-NEXT:    s_and_b32 s4, s26, 0xff
+; GFX11-NEXT:    s_lshl_b32 s5, s27, 8
+; GFX11-NEXT:    v_add_nc_u32_e32 v0, 3, v55
+; GFX11-NEXT:    s_or_b32 s4, s5, s4
+; GFX11-NEXT:    v_add_nc_u32_e32 v1, 3, v54
+; GFX11-NEXT:    s_addk_i32 s4, 0x300
+; GFX11-NEXT:    v_add_nc_u32_e32 v3, 3, v52
+; GFX11-NEXT:    s_lshl_b32 s4, s4, 16
+; GFX11-NEXT:    v_add_nc_u32_e32 v4, 3, v51
+; GFX11-NEXT:    s_or_b32 s3, s3, s4
+; GFX11-NEXT:    v_add_nc_u32_e32 v2, 3, v53
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v3
+; GFX11-NEXT:    v_and_b32_e32 v4, 0xff, v4
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GFX11-NEXT:    v_add_nc_u32_e32 v7, 3, v38
+; GFX11-NEXT:    v_add_nc_u32_e32 v11, 3, v34
+; GFX11-NEXT:    v_add_nc_u32_e32 v12, 3, v33
+; GFX11-NEXT:    v_or_b32_e32 v0, v89, v0
+; GFX11-NEXT:    v_or_b32_e32 v1, v90, v1
+; GFX11-NEXT:    v_or_b32_e32 v2, v91, v2
+; GFX11-NEXT:    v_or_b32_e32 v3, v92, v3
+; GFX11-NEXT:    v_or_b32_e32 v4, v93, v4
+; GFX11-NEXT:    v_and_b32_e32 v7, 0xff, v7
+; GFX11-NEXT:    v_and_b32_e32 v11, 0xff, v11
+; GFX11-NEXT:    v_and_b32_e32 v12, 0xff, v12
+; GFX11-NEXT:    s_add_i32 s28, s28, 3
+; GFX11-NEXT:    v_add_nc_u32_e32 v0, 0x300, v0
+; GFX11-NEXT:    v_add_nc_u32_e32 v1, 0x300, v1
+; GFX11-NEXT:    v_add_nc_u32_e32 v2, 0x300, v2
+; GFX11-NEXT:    v_add_nc_u32_e32 v3, 0x300, v3
+; GFX11-NEXT:    v_add_nc_u32_e32 v4, 0x300, v4
+; GFX11-NEXT:    v_or_b32_e32 v7, v88, v7
+; GFX11-NEXT:    v_or_b32_e32 v11, v74, v11
+; GFX11-NEXT:    v_or_b32_e32 v12, v75, v12
+; GFX11-NEXT:    s_and_b32 s5, s28, 0xff
+; GFX11-NEXT:    s_lshl_b32 s6, s29, 8
+; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-NEXT:    s_or_b32 s5, s6, s5
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff, v3
+; GFX11-NEXT:    v_lshlrev_b32_e32 v6, 16, v4
+; GFX11-NEXT:    v_add_nc_u32_e32 v7, 0x300, v7
+; GFX11-NEXT:    v_add_nc_u32_e32 v11, 0x300, v11
+; GFX11-NEXT:    v_add_nc_u32_e32 v12, 0x300, v12
+; GFX11-NEXT:    s_addk_i32 s5, 0x300
+; GFX11-NEXT:    v_or_b32_e32 v5, v1, v2
+; GFX11-NEXT:    s_and_b32 s5, s5, 0xffff
+; GFX11-NEXT:    v_or_b32_e32 v6, v3, v6
+; GFX11-NEXT:    v_or_b32_e32 v4, s5, v0
+; GFX11-NEXT:    v_add_nc_u32_e32 v0, 3, v50
+; GFX11-NEXT:    v_add_nc_u32_e32 v1, 3, v49
+; GFX11-NEXT:    v_add_nc_u32_e32 v2, 3, v48
+; GFX11-NEXT:    v_add_nc_u32_e32 v3, 3, v39
+; GFX11-NEXT:    v_and_b32_e32 v9, 0xffff, v7
+; GFX11-NEXT:    v_add_nc_u32_e32 v7, 3, v37
+; GFX11-NEXT:    v_add_nc_u32_e32 v8, 3, v36
+; GFX11-NEXT:    v_add_nc_u32_e32 v10, 3, v35
+; GFX11-NEXT:    v_and_b32_e32 v11, 0xffff, v11
+; GFX11-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v3
+; GFX11-NEXT:    v_and_b32_e32 v7, 0xff, v7
+; GFX11-NEXT:    v_and_b32_e32 v8, 0xff, v8
+; GFX11-NEXT:    v_and_b32_e32 v10, 0xff, v10
+; GFX11-NEXT:    v_or_b32_e32 v11, v11, v12
+; GFX11-NEXT:    s_waitcnt vmcnt(3)
+; GFX11-NEXT:    v_add_nc_u32_e32 v12, 3, v46
+; GFX11-NEXT:    v_add_nc_u32_e32 v16, 3, v181
+; GFX11-NEXT:    v_add_nc_u32_e32 v17, 3, v180
+; GFX11-NEXT:    v_or_b32_e32 v0, v76, v0
+; GFX11-NEXT:    v_or_b32_e32 v1, v77, v1
+; GFX11-NEXT:    v_or_b32_e32 v2, v78, v2
+; GFX11-NEXT:    v_or_b32_e32 v3, v79, v3
+; GFX11-NEXT:    v_or_b32_e32 v7, v63, v7
+; GFX11-NEXT:    v_or_b32_e32 v8, v72, v8
+; GFX11-NEXT:    v_or_b32_e32 v10, v73, v10
+; GFX11-NEXT:    v_and_b32_e32 v12, 0xff, v12
+; GFX11-NEXT:    v_and_b32_e32 v16, 0xff, v16
+; GFX11-NEXT:    v_and_b32_e32 v17, 0xff, v17
+; GFX11-NEXT:    v_add_nc_u32_e32 v0, 0x300, v0
+; GFX11-NEXT:    v_add_nc_u32_e32 v1, 0x300, v1
+; GFX11-NEXT:    v_add_nc_u32_e32 v2, 0x300, v2
+; GFX11-NEXT:    v_add_nc_u32_e32 v3, 0x300, v3
+; GFX11-NEXT:    v_add_nc_u32_e32 v7, 0x300, v7
+; GFX11-NEXT:    v_add_nc_u32_e32 v8, 0x300, v8
+; GFX11-NEXT:    v_add_nc_u32_e32 v10, 0x300, v10
+; GFX11-NEXT:    v_or_b32_e32 v12, v61, v12
+; GFX11-NEXT:    v_or_b32_e32 v16, v43, v16
+; GFX11-NEXT:    v_or_b32_e32 v17, v44, v17
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-NEXT:    v_lshlrev_b32_e32 v13, 16, v7
+; GFX11-NEXT:    v_and_b32_e32 v14, 0xffff, v8
+; GFX11-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
+; GFX11-NEXT:    v_add_nc_u32_e32 v12, 0x300, v12
+; GFX11-NEXT:    v_add_nc_u32_e32 v16, 0x300, v16
+; GFX11-NEXT:    v_add_nc_u32_e32 v17, 0x300, v17
+; GFX11-NEXT:    v_or_b32_e32 v7, v0, v1
+; GFX11-NEXT:    v_or_b32_e32 v8, v2, v3
+; GFX11-NEXT:    v_or_b32_e32 v9, v9, v13
+; GFX11-NEXT:    v_or_b32_e32 v10, v14, v10
+; GFX11-NEXT:    v_add_nc_u32_e32 v0, 3, v32
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_add_nc_u32_e32 v1, 3, v62
+; GFX11-NEXT:    v_add_nc_u32_e32 v2, 3, v56
+; GFX11-NEXT:    v_add_nc_u32_e32 v3, 3, v47
+; GFX11-NEXT:    v_and_b32_e32 v14, 0xffff, v12
+; GFX11-NEXT:    v_add_nc_u32_e32 v12, 3, v45
+; GFX11-NEXT:    v_add_nc_u32_e32 v13, 3, v183
+; GFX11-NEXT:    v_add_nc_u32_e32 v15, 3, v182
+; GFX11-NEXT:    v_and_b32_e32 v16, 0xffff, v16
+; GFX11-NEXT:    v_lshlrev_b32_e32 v17, 16, v17
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v3
+; GFX11-NEXT:    v_and_b32_e32 v12, 0xff, v12
+; GFX11-NEXT:    v_and_b32_e32 v13, 0xff, v13
+; GFX11-NEXT:    v_and_b32_e32 v15, 0xff, v15
+; GFX11-NEXT:    v_or_b32_e32 v16, v16, v17
+; GFX11-NEXT:    v_add_nc_u32_e32 v17, 3, v162
+; GFX11-NEXT:    v_add_nc_u32_e32 v21, 3, v145
+; GFX11-NEXT:    v_add_nc_u32_e32 v22, 3, v118
+; GFX11-NEXT:    v_or_b32_e32 v0, v57, v0
+; GFX11-NEXT:    v_or_b32_e32 v1, v58, v1
+; GFX11-NEXT:    v_or_b32_e32 v2, v59, v2
+; GFX11-NEXT:    v_or_b32_e32 v3, v60, v3
+; GFX11-NEXT:    v_or_b32_e32 v12, v40, v12
+; GFX11-NEXT:    v_or_b32_e32 v13, v41, v13
+; GFX11-NEXT:    v_or_b32_e32 v15, v42, v15
+; GFX11-NEXT:    v_and_b32_e32 v17, 0xff, v17
+; GFX11-NEXT:    v_and_b32_e32 v21, 0xff, v21
+; GFX11-NEXT:    v_and_b32_e32 v22, 0xff, v22
+; GFX11-NEXT:    v_add_nc_u32_e32 v0, 0x300, v0
+; GFX11-NEXT:    v_add_nc_u32_e32 v1, 0x300, v1
+; GFX11-NEXT:    v_add_nc_u32_e32 v2, 0x300, v2
+; GFX11-NEXT:    v_add_nc_u32_e32 v3, 0x300, v3
+; GFX11-NEXT:    v_add_nc_u32_e32 v12, 0x300, v12
+; GFX11-NEXT:    v_add_nc_u32_e32 v13, 0x300, v13
+; GFX11-NEXT:    v_add_nc_u32_e32 v15, 0x300, v15
+; GFX11-NEXT:    v_or_b32_e32 v17, v179, v17
+; GFX11-NEXT:    v_or_b32_e32 v21, v160, v21
+; GFX11-NEXT:    v_or_b32_e32 v22, v161, v22
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-NEXT:    v_lshlrev_b32_e32 v18, 16, v12
+; GFX11-NEXT:    v_and_b32_e32 v19, 0xffff, v13
+; GFX11-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
+; GFX11-NEXT:    v_add_nc_u32_e32 v17, 0x300, v17
+; GFX11-NEXT:    v_add_nc_u32_e32 v21, 0x300, v21
+; GFX11-NEXT:    v_add_nc_u32_e32 v22, 0x300, v22
+; GFX11-NEXT:    v_or_b32_e32 v12, v0, v1
+; GFX11-NEXT:    v_or_b32_e32 v13, v2, v3
+; GFX11-NEXT:    v_or_b32_e32 v14, v14, v18
+; GFX11-NEXT:    v_or_b32_e32 v15, v19, v15
+; GFX11-NEXT:    v_add_nc_u32_e32 v0, 3, v166
+; GFX11-NEXT:    v_add_nc_u32_e32 v1, 3, v165
+; GFX11-NEXT:    v_add_nc_u32_e32 v2, 3, v164
+; GFX11-NEXT:    v_add_nc_u32_e32 v3, 3, v163
+; GFX11-NEXT:    v_and_b32_e32 v19, 0xffff, v17
+; GFX11-NEXT:    v_add_nc_u32_e32 v17, 3, v148
+; GFX11-NEXT:    v_add_nc_u32_e32 v18, 3, v147
+; GFX11-NEXT:    v_add_nc_u32_e32 v20, 3, v146
+; GFX11-NEXT:    v_and_b32_e32 v21, 0xffff, v21
+; GFX11-NEXT:    v_lshlrev_b32_e32 v22, 16, v22
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v3
+; GFX11-NEXT:    v_and_b32_e32 v17, 0xff, v17
+; GFX11-NEXT:    v_and_b32_e32 v18, 0xff, v18
+; GFX11-NEXT:    v_and_b32_e32 v20, 0xff, v20
+; GFX11-NEXT:    v_or_b32_e32 v21, v21, v22
+; GFX11-NEXT:    v_add_nc_u32_e32 v22, 3, v100
+; GFX11-NEXT:    v_add_nc_u32_e32 v26, 3, v83
+; GFX11-NEXT:    v_add_nc_u32_e32 v27, 3, v82
+; GFX11-NEXT:    v_or_b32_e32 v0, v167, v0
+; GFX11-NEXT:    v_or_b32_e32 v1, v176, v1
+; GFX11-NEXT:    v_or_b32_e32 v2, v177, v2
+; GFX11-NEXT:    v_or_b32_e32 v3, v178, v3
+; GFX11-NEXT:    v_or_b32_e32 v17, v149, v17
+; GFX11-NEXT:    v_or_b32_e32 v18, v150, v18
+; GFX11-NEXT:    v_or_b32_e32 v20, v151, v20
+; GFX11-NEXT:    v_and_b32_e32 v22, 0xff, v22
+; GFX11-NEXT:    v_and_b32_e32 v26, 0xff, v26
+; GFX11-NEXT:    v_and_b32_e32 v27, 0xff, v27
+; GFX11-NEXT:    v_add_nc_u32_e32 v0, 0x300, v0
+; GFX11-NEXT:    v_add_nc_u32_e32 v1, 0x300, v1
+; GFX11-NEXT:    v_add_nc_u32_e32 v2, 0x300, v2
+; GFX11-NEXT:    v_add_nc_u32_e32 v3, 0x300, v3
+; GFX11-NEXT:    v_add_nc_u32_e32 v17, 0x300, v17
+; GFX11-NEXT:    v_add_nc_u32_e32 v18, 0x300, v18
+; GFX11-NEXT:    v_add_nc_u32_e32 v20, 0x300, v20
+; GFX11-NEXT:    v_or_b32_e32 v22, v144, v22
+; GFX11-NEXT:    v_or_b32_e32 v26, v130, v26
+; GFX11-NEXT:    v_or_b32_e32 v27, v131, v27
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-NEXT:    v_lshlrev_b32_e32 v23, 16, v17
+; GFX11-NEXT:    v_and_b32_e32 v24, 0xffff, v18
+; GFX11-NEXT:    v_lshlrev_b32_e32 v20, 16, v20
+; GFX11-NEXT:    v_add_nc_u32_e32 v22, 0x300, v22
+; GFX11-NEXT:    v_add_nc_u32_e32 v26, 0x300, v26
+; GFX11-NEXT:    v_add_nc_u32_e32 v27, 0x300, v27
+; GFX11-NEXT:    v_or_b32_e32 v17, v0, v1
+; GFX11-NEXT:    v_or_b32_e32 v18, v2, v3
+; GFX11-NEXT:    v_or_b32_e32 v19, v19, v23
+; GFX11-NEXT:    v_or_b32_e32 v20, v24, v20
+; GFX11-NEXT:    v_add_nc_u32_e32 v0, 3, v112
+; GFX11-NEXT:    v_add_nc_u32_e32 v1, 3, v103
+; GFX11-NEXT:    v_add_nc_u32_e32 v2, 3, v102
+; GFX11-NEXT:    v_add_nc_u32_e32 v3, 3, v101
+; GFX11-NEXT:    v_and_b32_e32 v24, 0xffff, v22
+; GFX11-NEXT:    v_add_nc_u32_e32 v22, 3, v86
+; GFX11-NEXT:    v_add_nc_u32_e32 v23, 3, v85
+; GFX11-NEXT:    v_add_nc_u32_e32 v25, 3, v84
+; GFX11-NEXT:    v_and_b32_e32 v26, 0xffff, v26
+; GFX11-NEXT:    v_lshlrev_b32_e32 v27, 16, v27
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v3
+; GFX11-NEXT:    v_and_b32_e32 v22, 0xff, v22
+; GFX11-NEXT:    v_and_b32_e32 v23, 0xff, v23
+; GFX11-NEXT:    v_and_b32_e32 v25, 0xff, v25
+; GFX11-NEXT:    v_or_b32_e32 v26, v26, v27
+; GFX11-NEXT:    v_add_nc_u32_e32 v27, 3, v69
+; GFX11-NEXT:    v_or_b32_e32 v0, v132, v0
+; GFX11-NEXT:    v_or_b32_e32 v1, v133, v1
+; GFX11-NEXT:    v_or_b32_e32 v2, v134, v2
+; GFX11-NEXT:    v_or_b32_e32 v3, v135, v3
+; GFX11-NEXT:    v_or_b32_e32 v22, v119, v22
+; GFX11-NEXT:    v_or_b32_e32 v23, v128, v23
+; GFX11-NEXT:    v_or_b32_e32 v25, v129, v25
+; GFX11-NEXT:    v_and_b32_e32 v27, 0xff, v27
+; GFX11-NEXT:    v_add_nc_u32_e32 v0, 0x300, v0
+; GFX11-NEXT:    v_add_nc_u32_e32 v1, 0x300, v1
+; GFX11-NEXT:    v_add_nc_u32_e32 v2, 0x300, v2
+; GFX11-NEXT:    v_add_nc_u32_e32 v3, 0x300, v3
+; GFX11-NEXT:    v_add_nc_u32_e32 v22, 0x300, v22
+; GFX11-NEXT:    v_add_nc_u32_e32 v23, 0x300, v23
+; GFX11-NEXT:    v_add_nc_u32_e32 v25, 0x300, v25
+; GFX11-NEXT:    v_or_b32_e32 v27, v117, v27
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-NEXT:    v_lshlrev_b32_e32 v28, 16, v22
+; GFX11-NEXT:    v_and_b32_e32 v29, 0xffff, v23
+; GFX11-NEXT:    v_lshlrev_b32_e32 v25, 16, v25
+; GFX11-NEXT:    v_add_nc_u32_e32 v27, 0x300, v27
+; GFX11-NEXT:    v_or_b32_e32 v22, v0, v1
+; GFX11-NEXT:    v_or_b32_e32 v23, v2, v3
+; GFX11-NEXT:    v_or_b32_e32 v24, v24, v28
+; GFX11-NEXT:    v_or_b32_e32 v25, v29, v25
+; GFX11-NEXT:    v_add_nc_u32_e32 v0, 3, v81
+; GFX11-NEXT:    v_add_nc_u32_e32 v1, 3, v80
+; GFX11-NEXT:    v_add_nc_u32_e32 v2, 3, v71
+; GFX11-NEXT:    v_add_nc_u32_e32 v3, 3, v70
+; GFX11-NEXT:    v_and_b32_e32 v29, 0xffff, v27
+; GFX11-NEXT:    v_add_nc_u32_e32 v27, 3, v68
+; GFX11-NEXT:    v_add_nc_u32_e32 v28, 3, v67
+; GFX11-NEXT:    v_add_nc_u32_e32 v30, 3, v66
+; GFX11-NEXT:    v_add_nc_u32_e32 v31, 3, v65
+; GFX11-NEXT:    v_add_nc_u32_e32 v32, 3, v64
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v3
+; GFX11-NEXT:    v_and_b32_e32 v27, 0xff, v27
+; GFX11-NEXT:    v_and_b32_e32 v28, 0xff, v28
+; GFX11-NEXT:    v_and_b32_e32 v30, 0xff, v30
+; GFX11-NEXT:    v_and_b32_e32 v31, 0xff, v31
+; GFX11-NEXT:    v_and_b32_e32 v32, 0xff, v32
+; GFX11-NEXT:    v_or_b32_e32 v0, v113, v0
+; GFX11-NEXT:    v_or_b32_e32 v1, v114, v1
+; GFX11-NEXT:    v_or_b32_e32 v2, v115, v2
+; GFX11-NEXT:    v_or_b32_e32 v3, v116, v3
+; GFX11-NEXT:    v_or_b32_e32 v27, v87, v27
+; GFX11-NEXT:    v_or_b32_e32 v28, v96, v28
+; GFX11-NEXT:    v_or_b32_e32 v30, v97, v30
+; GFX11-NEXT:    v_or_b32_e32 v31, v98, v31
+; GFX11-NEXT:    v_or_b32_e32 v32, v99, v32
+; GFX11-NEXT:    v_add_nc_u32_e32 v0, 0x300, v0
+; GFX11-NEXT:    v_add_nc_u32_e32 v1, 0x300, v1
+; GFX11-NEXT:    v_add_nc_u32_e32 v2, 0x300, v2
+; GFX11-NEXT:    v_add_nc_u32_e32 v3, 0x300, v3
+; GFX11-NEXT:    v_add_nc_u32_e32 v27, 0x300, v27
+; GFX11-NEXT:    v_add_nc_u32_e32 v28, 0x300, v28
+; GFX11-NEXT:    v_add_nc_u32_e32 v30, 0x300, v30
+; GFX11-NEXT:    v_add_nc_u32_e32 v31, 0x300, v31
+; GFX11-NEXT:    v_add_nc_u32_e32 v32, 0x300, v32
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-NEXT:    v_lshlrev_b32_e32 v33, 16, v27
+; GFX11-NEXT:    v_and_b32_e32 v34, 0xffff, v28
+; GFX11-NEXT:    v_lshlrev_b32_e32 v30, 16, v30
+; GFX11-NEXT:    v_and_b32_e32 v31, 0xffff, v31
+; GFX11-NEXT:    v_lshlrev_b32_e32 v32, 16, v32
+; GFX11-NEXT:    v_or_b32_e32 v27, v0, v1
+; GFX11-NEXT:    v_or_b32_e32 v28, v2, v3
+; GFX11-NEXT:    v_or_b32_e32 v29, v29, v33
+; GFX11-NEXT:    v_or_b32_e32 v30, v34, v30
+; GFX11-NEXT:    v_or_b32_e32 v31, v31, v32
+; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-NEXT:  .LBB59_3: ; %end
+; GFX11-NEXT:    s_clause 0x1f
+; GFX11-NEXT:    scratch_load_b32 v111, off, s32 offset:320
+; GFX11-NEXT:    scratch_load_b32 v110, off, s32 offset:324
+; GFX11-NEXT:    scratch_load_b32 v109, off, s32 offset:328
+; GFX11-NEXT:    scratch_load_b32 v108, off, s32 offset:332
+; GFX11-NEXT:    scratch_load_b32 v107, off, s32 offset:336
+; GFX11-NEXT:    scratch_load_b32 v106, off, s32 offset:340
+; GFX11-NEXT:    scratch_load_b32 v105, off, s32 offset:344
+; GFX11-NEXT:    scratch_load_b32 v104, off, s32 offset:348
+; GFX11-NEXT:    scratch_load_b32 v95, off, s32 offset:352
+; GFX11-NEXT:    scratch_load_b32 v94, off, s32 offset:356
+; GFX11-NEXT:    scratch_load_b32 v93, off, s32 offset:360
+; GFX11-NEXT:    scratch_load_b32 v92, off, s32 offset:364
+; GFX11-NEXT:    scratch_load_b32 v91, off, s32 offset:368
+; GFX11-NEXT:    scratch_load_b32 v90, off, s32 offset:372
+; GFX11-NEXT:    scratch_load_b32 v89, off, s32 offset:376
+; GFX11-NEXT:    scratch_load_b32 v88, off, s32 offset:380
+; GFX11-NEXT:    scratch_load_b32 v79, off, s32 offset:384
+; GFX11-NEXT:    scratch_load_b32 v78, off, s32 offset:388
+; GFX11-NEXT:    scratch_load_b32 v77, off, s32 offset:392
+; GFX11-NEXT:    scratch_load_b32 v76, off, s32 offset:396
+; GFX11-NEXT:    scratch_load_b32 v75, off, s32 offset:400
+; GFX11-NEXT:    scratch_load_b32 v74, off, s32 offset:404
+; GFX11-NEXT:    scratch_load_b32 v73, off, s32 offset:408
+; GFX11-NEXT:    scratch_load_b32 v72, off, s32 offset:412
+; GFX11-NEXT:    scratch_load_b32 v63, off, s32 offset:416
+; GFX11-NEXT:    scratch_load_b32 v62, off, s32 offset:420
+; GFX11-NEXT:    scratch_load_b32 v61, off, s32 offset:424
+; GFX11-NEXT:    scratch_load_b32 v60, off, s32 offset:428
+; GFX11-NEXT:    scratch_load_b32 v59, off, s32 offset:432
+; GFX11-NEXT:    scratch_load_b32 v58, off, s32 offset:436
+; GFX11-NEXT:    scratch_load_b32 v57, off, s32 offset:440
+; GFX11-NEXT:    scratch_load_b32 v56, off, s32 offset:444
+; GFX11-NEXT:    s_clause 0x7
+; GFX11-NEXT:    scratch_load_b32 v47, off, s32 offset:448
+; GFX11-NEXT:    scratch_load_b32 v46, off, s32 offset:452
+; GFX11-NEXT:    scratch_load_b32 v45, off, s32 offset:456
+; GFX11-NEXT:    scratch_load_b32 v44, off, s32 offset:460
+; GFX11-NEXT:    scratch_load_b32 v43, off, s32 offset:464
+; GFX11-NEXT:    scratch_load_b32 v42, off, s32 offset:468
+; GFX11-NEXT:    scratch_load_b32 v41, off, s32 offset:472
+; GFX11-NEXT:    scratch_load_b32 v40, off, s32 offset:476
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-NEXT:  .LBB59_4:
+; GFX11-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+; GFX11-NEXT:    s_branch .LBB59_2
   %cmp = icmp eq i32 %b, 0
   br i1 %cmp, label %cmp.true, label %cmp.false
 
@@ -121411,876 +118822,913 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) {
 ; GFX11-TRUE16:       ; %bb.0:
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-TRUE16-NEXT:    s_clause 0x1f
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v31, off, s32 offset:384
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v32, off, s32 offset:380
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v31, off, s32 offset:376
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v32, off, s32 offset:372
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v50, off, s32 offset:368
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v33, off, s32 offset:364
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v51, off, s32 offset:360
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v33, off, s32 offset:356
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v51, off, s32 offset:352
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v34, off, s32 offset:348
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v52, off, s32 offset:344
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v34, off, s32 offset:340
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v52, off, s32 offset:336
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v35, off, s32 offset:332
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v53, off, s32 offset:328
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v35, off, s32 offset:324
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v53, off, s32 offset:320
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v36, off, s32 offset:316
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v54, off, s32 offset:312
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v36, off, s32 offset:308
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v54, off, s32 offset:304
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v37, off, s32 offset:300
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v55, off, s32 offset:296
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v37, off, s32 offset:292
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v64, off, s32 offset:288
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v38, off, s32 offset:284
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v64, off, s32 offset:280
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v38, off, s32 offset:276
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v66, off, s32 offset:272
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v39, off, s32 offset:268
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v66, off, s32 offset:264
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v39, off, s32 offset:260
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v40, s32 offset:536
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v41, s32 offset:532
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v42, s32 offset:528
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v43, s32 offset:524
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v44, s32 offset:520
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v45, s32 offset:516
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v46, s32 offset:512
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v47, s32 offset:508
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v56, s32 offset:504
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v57, s32 offset:500
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v58, s32 offset:496
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v59, s32 offset:492
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v60, s32 offset:488
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v61, s32 offset:484
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v62, s32 offset:480
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v63, s32 offset:476
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v72, s32 offset:472
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v73, s32 offset:468
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v74, s32 offset:464
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v75, s32 offset:460
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v76, s32 offset:456
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v77, s32 offset:452
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v78, s32 offset:448
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v79, s32 offset:444
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v88, s32 offset:440
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v89, s32 offset:436
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v90, s32 offset:432
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v91, s32 offset:428
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v92, s32 offset:424
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v93, s32 offset:420
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v94, s32 offset:416
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v95, s32 offset:412
+; GFX11-TRUE16-NEXT:    s_clause 0x4
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v104, s32 offset:408
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v105, s32 offset:404
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v106, s32 offset:400
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v107, s32 offset:396
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v108, s32 offset:392
 ; GFX11-TRUE16-NEXT:    s_clause 0x1f
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v67, off, s32 offset:256
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v48, off, s32 offset:252
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v68, off, s32 offset:248
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v48, off, s32 offset:244
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v70, off, s32 offset:240
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v49, off, s32 offset:236
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v70, off, s32 offset:232
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v49, off, s32 offset:228
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v71, off, s32 offset:224
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v50, off, s32 offset:220
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v71, off, s32 offset:216
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v114, off, s32 offset:388
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v81, off, s32
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v82, off, s32 offset:8
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v83, off, s32 offset:16
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v83, off, s32 offset:24
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v85, off, s32 offset:32
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v85, off, s32 offset:40
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v87, off, s32 offset:48
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v87, off, s32 offset:56
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v97, off, s32 offset:64
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v97, off, s32 offset:72
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v98, off, s32 offset:80
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v99, off, s32 offset:88
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v100, off, s32 offset:96
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v101, off, s32 offset:104
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v102, off, s32 offset:112
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v160, off, s32 offset:120
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v160, off, s32 offset:128
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v161, off, s32 offset:136
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v161, off, s32 offset:144
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v162, off, s32 offset:152
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v31, off, s32 offset:384
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v112, off, s32 offset:380
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v32, off, s32 offset:376
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v113, off, s32 offset:372
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v56, off, s32 offset:368
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v114, off, s32 offset:364
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v33, off, s32 offset:360
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v115, off, s32 offset:356
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v57, off, s32 offset:352
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v116, off, s32 offset:348
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v34, off, s32 offset:344
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v117, off, s32 offset:340
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v58, off, s32 offset:336
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v118, off, s32 offset:332
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v35, off, s32 offset:328
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v119, off, s32 offset:324
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v59, off, s32 offset:320
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v128, off, s32 offset:316
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v36, off, s32 offset:312
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v129, off, s32 offset:308
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v60, off, s32 offset:304
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v130, off, s32 offset:300
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v37, off, s32 offset:296
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v131, off, s32 offset:292
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v61, off, s32 offset:288
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v132, off, s32 offset:284
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v38, off, s32 offset:280
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v133, off, s32 offset:276
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v62, off, s32 offset:272
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v134, off, s32 offset:268
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v39, off, s32 offset:264
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v135, off, s32 offset:260
 ; GFX11-TRUE16-NEXT:    s_clause 0x1f
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v162, off, s32 offset:160
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v163, off, s32 offset:168
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v163, off, s32 offset:176
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v164, off, s32 offset:184
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v164, off, s32 offset:192
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v165, off, s32 offset:200
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v165, off, s32 offset:208
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v55, off, s32 offset:212
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v65, off, s32 offset:204
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v65, off, s32 offset:196
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v67, off, s32 offset:188
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v68, off, s32 offset:180
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v69, off, s32 offset:172
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v69, off, s32 offset:164
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v80, off, s32 offset:156
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v80, off, s32 offset:148
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v81, off, s32 offset:140
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v82, off, s32 offset:132
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v84, off, s32 offset:124
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v84, off, s32 offset:116
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v86, off, s32 offset:108
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v86, off, s32 offset:100
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v96, off, s32 offset:92
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v96, off, s32 offset:84
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v98, off, s32 offset:76
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v99, off, s32 offset:68
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v100, off, s32 offset:60
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v101, off, s32 offset:52
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v103, off, s32 offset:44
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v103, off, s32 offset:36
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v112, off, s32 offset:28
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v113, off, s32 offset:20
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v63, off, s32 offset:256
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v144, off, s32 offset:252
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v48, off, s32 offset:248
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v145, off, s32 offset:244
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v72, off, s32 offset:240
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v146, off, s32 offset:236
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v49, off, s32 offset:232
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v147, off, s32 offset:228
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v73, off, s32 offset:224
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v148, off, s32 offset:220
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v50, off, s32 offset:216
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v51, off, s32 offset:388
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v52, off, s32
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v53, off, s32 offset:8
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v54, off, s32 offset:16
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v55, off, s32 offset:24
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v64, off, s32 offset:32
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v65, off, s32 offset:40
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v66, off, s32 offset:48
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v67, off, s32 offset:56
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v74, off, s32 offset:64
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v75, off, s32 offset:72
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v76, off, s32 offset:80
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v77, off, s32 offset:88
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v78, off, s32 offset:96
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v79, off, s32 offset:104
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v88, off, s32 offset:112
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v89, off, s32 offset:120
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v90, off, s32 offset:128
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v91, off, s32 offset:136
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v92, off, s32 offset:144
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v93, off, s32 offset:152
+; GFX11-TRUE16-NEXT:    s_clause 0x1f
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v94, off, s32 offset:160
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v95, off, s32 offset:168
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v104, off, s32 offset:176
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v105, off, s32 offset:184
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v106, off, s32 offset:192
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v107, off, s32 offset:200
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v108, off, s32 offset:208
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v149, off, s32 offset:212
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v150, off, s32 offset:204
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v151, off, s32 offset:196
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v160, off, s32 offset:188
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v161, off, s32 offset:180
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v162, off, s32 offset:172
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v163, off, s32 offset:164
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v164, off, s32 offset:156
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v165, off, s32 offset:148
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v166, off, s32 offset:140
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v167, off, s32 offset:132
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v176, off, s32 offset:124
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v177, off, s32 offset:116
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v178, off, s32 offset:108
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v179, off, s32 offset:100
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v180, off, s32 offset:92
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v181, off, s32 offset:84
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v182, off, s32 offset:76
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v183, off, s32 offset:68
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v40, off, s32 offset:60
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v41, off, s32 offset:52
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v42, off, s32 offset:44
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v43, off, s32 offset:36
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v44, off, s32 offset:28
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v45, off, s32 offset:20
 ; GFX11-TRUE16-NEXT:    s_clause 0x1
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v115, off, s32 offset:12
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v115, off, s32 offset:4
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v117.l, v30.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v118.h, v28.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v119.l, v26.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v119.h, v24.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v130.l, v22.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v130.h, v20.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v131.l, v18.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v132.h, v16.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v134.l, v14.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v134.h, v12.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v144.l, v10.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v145.h, v8.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v146.h, v6.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v146.l, v4.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v149.l, v2.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v149.h, v0.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v151.l, 8, v1.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v151.h, 8, v3.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v150.l, 8, v5.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v150.h, 8, v7.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v148.l, 8, v9.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v148.h, 8, v11.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v147.l, 8, v13.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v147.h, 8, v15.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v144.h, 8, v17.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v145.l, 8, v19.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v135.l, 8, v21.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v135.h, 8, v23.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v133.l, 8, v25.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v133.h, 8, v27.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v131.h, 8, v29.l
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v46, off, s32 offset:12
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v47, off, s32 offset:4
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v69.l, v30.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v70.h, v28.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v71.l, v26.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v71.h, v24.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v82.l, v22.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v82.h, v20.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v83.l, v18.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v84.h, v16.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v86.l, v14.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v86.h, v12.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v96.l, v10.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v97.h, v8.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v98.h, v6.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v98.l, v4.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v101.l, v2.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v101.h, v0.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v103.l, 8, v1.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v103.h, 8, v3.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v102.l, 8, v5.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v102.h, 8, v7.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v100.l, 8, v9.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v100.h, 8, v11.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v99.l, 8, v13.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v99.h, 8, v15.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v96.h, 8, v17.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v97.l, 8, v19.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v87.l, 8, v21.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v87.h, 8, v23.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v85.l, 8, v25.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v85.h, 8, v27.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v83.h, 8, v29.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(62)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v32.h, 8, v31.l
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v32.l, 8, v32.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v33.h, 8, v56.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v33.l, 8, v33.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v34.h, 8, v57.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v34.l, 8, v34.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v35.h, 8, v58.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v35.l, 8, v35.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v36.h, 8, v59.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v36.l, 8, v36.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v37.h, 8, v60.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v37.l, 8, v37.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v38.h, 8, v61.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v38.l, 8, v38.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v39.h, 8, v62.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v39.l, 8, v39.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v48.h, 8, v63.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v48.l, 8, v48.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(59)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v49.h, 8, v72.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v49.l, 8, v49.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(55)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v50.h, 8, v73.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v50.l, 8, v50.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(54)
-; GFX11-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v114
+; GFX11-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v51
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(53)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v132.l, 8, v81.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v84.l, 8, v52.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(52)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v129.l, 8, v82.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v81.l, 8, v53.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(51)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v129.h, 8, v83.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v81.h, 8, v54.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(50)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v128.l, 8, v83.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v80.l, 8, v55.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(49)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v128.h, 8, v85.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v80.h, 8, v64.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(48)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v117.h, 8, v85.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v69.h, 8, v65.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(47)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v118.l, 8, v87.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v70.l, 8, v66.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(46)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v116.l, 8, v87.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v68.l, 8, v67.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(45)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v116.h, 8, v97.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v68.h, 8, v74.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(44)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v114.l, 8, v97.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v67.l, 8, v75.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(43)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v114.h, 8, v98.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v67.h, 8, v76.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(42)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v112.h, 8, v99.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v66.l, 8, v77.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(41)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v113.l, 8, v100.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v66.h, 8, v78.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(40)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v102.l, 8, v101.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v65.l, 8, v79.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(39)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v102.h, 8, v102.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v65.h, 8, v88.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(38)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v100.h, 8, v160.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v64.l, 8, v89.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(37)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v101.l, 8, v160.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v64.h, 8, v90.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(36)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v98.h, 8, v161.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v55.l, 8, v91.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(35)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v99.l, 8, v161.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v55.h, 8, v92.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(34)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v97.l, 8, v162.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v54.l, 8, v93.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(33)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v97.h, 8, v162.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v54.h, 8, v94.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(32)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v87.l, 8, v163.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v53.l, 8, v95.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(31)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v87.h, 8, v163.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v53.h, 8, v104.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(30)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v85.l, 8, v164.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v52.l, 8, v105.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(29)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v85.h, 8, v164.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v52.h, 8, v106.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(28)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v83.l, 8, v165.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v51.l, 8, v107.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(27)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v83.h, 8, v165.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v81.h, 8, v71.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v82.l, 8, v71.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v71.l, 8, v70.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v71.h, 8, v70.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v70.l, 8, v68.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v70.h, 8, v67.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v67.h, 8, v66.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v68.l, 8, v66.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v66.l, 8, v64.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v66.h, 8, v64.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v64.l, 8, v55.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v64.h, 8, v54.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v54.h, 8, v54.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v55.l, 8, v53.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v53.h, 8, v53.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v54.l, 8, v52.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v52.h, 8, v52.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v53.l, 8, v51.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v51.h, 8, v51.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v52.l, 8, v50.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v50.h, 8, v31.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v51.l, 8, v31.l
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v51.h, 8, v108.l
 ; GFX11-TRUE16-NEXT:    s_and_saveexec_b32 s0, vcc_lo
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-TRUE16-NEXT:    s_xor_b32 s0, exec_lo, s0
-; GFX11-TRUE16-NEXT:    s_cbranch_execnz .LBB74_3
-; GFX11-TRUE16-NEXT:  ; %bb.1: ; %Flow
-; GFX11-TRUE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX11-TRUE16-NEXT:    s_cbranch_execnz .LBB74_4
-; GFX11-TRUE16-NEXT:  .LBB74_2: ; %end
-; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
-; GFX11-TRUE16-NEXT:  .LBB74_3: ; %cmp.false
-; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v149.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v149.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v149.h, 0
-; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v146.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v146.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v149.l, v0.l, v151.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v0.h, v151.h
-; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v0.l, v149.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v3.h, v1.l, v150.h
-; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v3.l, v149.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v145.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v2.h, 0xff, v144.l
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v149, v0
-; GFX11-TRUE16-NEXT:    v_or_b16 v149.l, v1.h, v150.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v4.l, 0xff, v132.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v4.h, 0xff, v131.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v2.h, v2.h, v148.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v5.l, 0xff, v130.h
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v149, v3
-; GFX11-TRUE16-NEXT:    v_or_b16 v149.l, v2.l, v148.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v2.l, v149.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v3.l, 0xff, v134.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v3.h, 0xff, v134.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v4.h, v4.h, v145.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v5.h, 0xff, v130.l
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v2, v149, v2
-; GFX11-TRUE16-NEXT:    v_or_b16 v149.l, v3.l, v147.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v3.h, v3.h, v147.h
-; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v3.l, v149.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v5.h, v5.h, v135.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v6.l, 0xff, v119.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v6.h, 0xff, v119.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v7.l, 0xff, v118.h
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v3, v149, v3
-; GFX11-TRUE16-NEXT:    v_or_b16 v149.l, v4.l, v144.h
-; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v4.l, v149.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v6.h, v6.h, v133.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v7.h, 0xff, v117.l
+; GFX11-TRUE16-NEXT:    s_cbranch_execz .LBB74_2
+; GFX11-TRUE16-NEXT:  ; %bb.1: ; %cmp.false
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v101.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v101.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v101.h, 0
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v98.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v98.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v101.l, v0.l, v103.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v0.h, v103.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v101.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v3.h, v1.l, v102.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v101.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v97.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.h, 0xff, v96.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v101, v0
+; GFX11-TRUE16-NEXT:    v_or_b16 v101.l, v1.h, v102.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v4.l, 0xff, v84.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v4.h, 0xff, v83.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.h, v2.h, v100.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v5.l, 0xff, v82.h
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v101, v3
+; GFX11-TRUE16-NEXT:    v_or_b16 v101.l, v2.l, v100.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v101.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v3.l, 0xff, v86.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v3.h, 0xff, v86.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v4.h, v4.h, v97.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v5.h, 0xff, v82.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v2, v101, v2
+; GFX11-TRUE16-NEXT:    v_or_b16 v101.l, v3.l, v99.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v3.h, v3.h, v99.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v101.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v5.h, v5.h, v87.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v6.l, 0xff, v71.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v6.h, 0xff, v71.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v7.l, 0xff, v70.h
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v3, v101, v3
+; GFX11-TRUE16-NEXT:    v_or_b16 v101.l, v4.l, v96.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v4.l, v101.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v6.h, v6.h, v85.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v7.h, 0xff, v69.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    v_and_b16 v8.l, 0xff, v115.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v8.h, 0xff, v115.l
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v4, v149, v4
-; GFX11-TRUE16-NEXT:    v_or_b16 v149.l, v5.l, v135.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v5.l, v149.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v7.h, v7.h, v132.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v8.h, v8.h, v129.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v9.l, 0xff, v113.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v9.h, 0xff, v112.l
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v5, v149, v5
-; GFX11-TRUE16-NEXT:    v_or_b16 v149.l, v6.l, v133.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v6.l, v149.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v10.l, 0xff, v103.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v9.h, v9.h, v128.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v10.h, 0xff, v103.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v11.l, 0xff, v101.h
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v6, v149, v6
-; GFX11-TRUE16-NEXT:    v_or_b16 v149.l, v7.l, v131.h
-; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v7.l, v149.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v10.h, v10.h, v118.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v11.h, 0xff, v100.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v12.l, 0xff, v99.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v12.h, 0xff, v98.l
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v7, v149, v7
-; GFX11-TRUE16-NEXT:    v_or_b16 v149.l, v8.l, v129.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v8.l, v149.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v11.h, v11.h, v116.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v12.h, v12.h, v114.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v13.l, 0xff, v96.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v13.h, 0xff, v96.l
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v8, v149, v8
-; GFX11-TRUE16-NEXT:    v_or_b16 v149.l, v9.l, v128.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v9.l, v149.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v14.l, 0xff, v86.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v13.h, v13.h, v113.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v14.h, 0xff, v86.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v15.l, 0xff, v84.h
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v9, v149, v9
-; GFX11-TRUE16-NEXT:    v_or_b16 v149.l, v10.l, v117.h
-; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v10.l, v149.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v14.h, v14.h, v102.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v15.h, 0xff, v84.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v16.l, 0xff, v82.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v16.h, 0xff, v81.l
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v10, v149, v10
-; GFX11-TRUE16-NEXT:    v_or_b16 v149.l, v11.l, v116.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v11.l, v149.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v15.h, v15.h, v101.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v16.h, v16.h, v99.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v17.l, 0xff, v80.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v17.h, 0xff, v80.l
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v11, v149, v11
-; GFX11-TRUE16-NEXT:    v_or_b16 v149.l, v12.l, v114.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v12.l, v149.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v18.l, 0xff, v69.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v17.h, v17.h, v97.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v18.h, 0xff, v69.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v19.l, 0xff, v68.h
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v12, v149, v12
-; GFX11-TRUE16-NEXT:    v_or_b16 v149.l, v13.l, v112.h
-; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v13.l, v149.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v18.h, v18.h, v87.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v19.h, 0xff, v67.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v20.l, 0xff, v65.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v20.h, 0xff, v65.l
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v13, v149, v13
-; GFX11-TRUE16-NEXT:    v_or_b16 v149.l, v14.l, v102.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v14.l, v149.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v19.h, v19.h, v85.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v20.h, v20.h, v83.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v21.l, 0xff, v55.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v21.h, 0xff, v50.l
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v14, v149, v14
-; GFX11-TRUE16-NEXT:    v_or_b16 v149.l, v15.l, v100.h
-; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v15.l, v149.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v22.l, 0xff, v49.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v21.h, v21.h, v82.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v22.h, 0xff, v49.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v23.l, 0xff, v48.h
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v15, v149, v15
-; GFX11-TRUE16-NEXT:    v_or_b16 v149.l, v16.l, v98.h
-; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v16.l, v149.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v22.h, v22.h, v71.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v23.h, 0xff, v48.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v24.l, 0xff, v39.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v24.h, 0xff, v39.l
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v16, v149, v16
-; GFX11-TRUE16-NEXT:    v_or_b16 v149.l, v17.l, v97.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v17.l, v149.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v23.h, v23.h, v70.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v24.h, v24.h, v68.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v25.l, 0xff, v38.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v25.h, 0xff, v38.l
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v17, v149, v17
-; GFX11-TRUE16-NEXT:    v_or_b16 v149.l, v18.l, v87.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v18.l, v149.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v26.l, 0xff, v37.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v25.h, v25.h, v66.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v26.h, 0xff, v37.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v27.l, 0xff, v36.h
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v18, v149, v18
-; GFX11-TRUE16-NEXT:    v_or_b16 v149.l, v19.l, v85.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v19.l, v149.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v26.h, v26.h, v64.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v27.h, 0xff, v36.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v28.l, 0xff, v35.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v28.h, 0xff, v35.l
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v19, v149, v19
-; GFX11-TRUE16-NEXT:    v_or_b16 v149.l, v20.l, v83.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v20.l, v149.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v27.h, v27.h, v55.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v28.h, v28.h, v54.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v29.l, 0xff, v34.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v29.h, 0xff, v34.l
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v20, v149, v20
-; GFX11-TRUE16-NEXT:    v_or_b16 v149.l, v21.l, v81.h
-; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v21.l, v149.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v30.l, 0xff, v33.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v29.h, v29.h, v53.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v30.h, 0xff, v33.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v31.l, 0xff, v32.h
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v21, v149, v21
-; GFX11-TRUE16-NEXT:    v_or_b16 v149.l, v22.l, v71.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v22.l, v149.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v30.h, v30.h, v52.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v31.h, 0xff, v32.l
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr146_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr146_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr145_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr144_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr134_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr134_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr132_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr131_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr130_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr130_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr119_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr119_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr118_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr117_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr115_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr115_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr113_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr112_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr103_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr103_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr101_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr100_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr99_hi16
+; GFX11-TRUE16-NEXT:    v_and_b16 v8.l, 0xff, v47.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v8.h, 0xff, v46.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v4, v101, v4
+; GFX11-TRUE16-NEXT:    v_or_b16 v101.l, v5.l, v87.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v101.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v7.h, v7.h, v84.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v8.h, v8.h, v81.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v9.l, 0xff, v45.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v9.h, 0xff, v44.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v5, v101, v5
+; GFX11-TRUE16-NEXT:    v_or_b16 v101.l, v6.l, v85.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v6.l, v101.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v10.l, 0xff, v43.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v9.h, v9.h, v80.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v10.h, 0xff, v42.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v11.l, 0xff, v41.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v6, v101, v6
+; GFX11-TRUE16-NEXT:    v_or_b16 v101.l, v7.l, v83.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v7.l, v101.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v10.h, v10.h, v70.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v11.h, 0xff, v40.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v12.l, 0xff, v183.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v12.h, 0xff, v182.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v7, v101, v7
+; GFX11-TRUE16-NEXT:    v_or_b16 v101.l, v8.l, v81.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v8.l, v101.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v11.h, v11.h, v68.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v12.h, v12.h, v67.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v13.l, 0xff, v181.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v13.h, 0xff, v180.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v8, v101, v8
+; GFX11-TRUE16-NEXT:    v_or_b16 v101.l, v9.l, v80.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v9.l, v101.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v14.l, 0xff, v179.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v13.h, v13.h, v66.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v14.h, 0xff, v178.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v15.l, 0xff, v177.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v9, v101, v9
+; GFX11-TRUE16-NEXT:    v_or_b16 v101.l, v10.l, v69.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v10.l, v101.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v14.h, v14.h, v65.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v15.h, 0xff, v176.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v16.l, 0xff, v167.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v16.h, 0xff, v166.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v10, v101, v10
+; GFX11-TRUE16-NEXT:    v_or_b16 v101.l, v11.l, v68.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v11.l, v101.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v15.h, v15.h, v64.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v16.h, v16.h, v55.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v17.l, 0xff, v165.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v17.h, 0xff, v164.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v11, v101, v11
+; GFX11-TRUE16-NEXT:    v_or_b16 v101.l, v12.l, v67.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v12.l, v101.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v18.l, 0xff, v163.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v17.h, v17.h, v54.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v18.h, 0xff, v162.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v19.l, 0xff, v161.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v12, v101, v12
+; GFX11-TRUE16-NEXT:    v_or_b16 v101.l, v13.l, v66.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v13.l, v101.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v18.h, v18.h, v53.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v19.h, 0xff, v160.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v20.l, 0xff, v151.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v20.h, 0xff, v150.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v13, v101, v13
+; GFX11-TRUE16-NEXT:    v_or_b16 v101.l, v14.l, v65.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v14.l, v101.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v19.h, v19.h, v52.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v20.h, v20.h, v51.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v21.l, 0xff, v149.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v21.h, 0xff, v148.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v14, v101, v14
+; GFX11-TRUE16-NEXT:    v_or_b16 v101.l, v15.l, v64.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v15.l, v101.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v22.l, 0xff, v147.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v21.h, v21.h, v50.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v22.h, 0xff, v146.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v23.l, 0xff, v145.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v15, v101, v15
+; GFX11-TRUE16-NEXT:    v_or_b16 v101.l, v16.l, v55.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v16.l, v101.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v22.h, v22.h, v49.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v23.h, 0xff, v144.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v24.l, 0xff, v135.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v24.h, 0xff, v134.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v16, v101, v16
+; GFX11-TRUE16-NEXT:    v_or_b16 v101.l, v17.l, v54.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v17.l, v101.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v23.h, v23.h, v48.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v24.h, v24.h, v39.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v25.l, 0xff, v133.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v25.h, 0xff, v132.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v17, v101, v17
+; GFX11-TRUE16-NEXT:    v_or_b16 v101.l, v18.l, v53.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v18.l, v101.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v26.l, 0xff, v131.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v25.h, v25.h, v38.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v26.h, 0xff, v130.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v27.l, 0xff, v129.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v18, v101, v18
+; GFX11-TRUE16-NEXT:    v_or_b16 v101.l, v19.l, v52.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v19.l, v101.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v26.h, v26.h, v37.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v27.h, 0xff, v128.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v28.l, 0xff, v119.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v28.h, 0xff, v118.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v19, v101, v19
+; GFX11-TRUE16-NEXT:    v_or_b16 v101.l, v20.l, v51.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v20.l, v101.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v27.h, v27.h, v36.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v28.h, v28.h, v35.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v29.l, 0xff, v117.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v29.h, 0xff, v116.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v20, v101, v20
+; GFX11-TRUE16-NEXT:    v_or_b16 v101.l, v21.l, v50.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v21.l, v101.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v30.l, 0xff, v115.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v29.h, v29.h, v34.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v30.h, 0xff, v114.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v31.l, 0xff, v113.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v21, v101, v21
+; GFX11-TRUE16-NEXT:    v_or_b16 v101.l, v22.l, v49.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v22.l, v101.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v30.h, v30.h, v33.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v31.h, 0xff, v112.l
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr98_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr96_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr98_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr97_hi16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr96_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr86_hi16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr86_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr84_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr84_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr83_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr82_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr81_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr80_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr80_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr69_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr82_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr71_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr71_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr70_hi16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr69_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr68_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr67_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr65_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr65_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr55_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr50_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr49_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr49_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr48_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr48_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr39_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr39_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr38_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr38_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr37_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr37_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr36_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr36_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr35_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr35_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr34_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr34_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr33_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr33_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr32_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr32_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr151_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr151_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr150_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr150_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr148_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr148_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr147_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr147_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr144_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr145_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr135_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr135_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr133_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr133_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr131_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr132_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr129_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr129_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr128_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr128_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr117_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr118_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr116_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr116_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr114_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr114_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr112_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr113_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr112
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr113
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr114
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr115
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr116
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr117
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr118
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr119
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr128
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr129
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr130
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr131
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr132
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr133
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr134
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr135
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr144
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr145
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr146
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr147
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr148
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr149
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr150
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr151
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr160
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr161
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr162
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr163
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr164
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr165
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr166
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr167
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr176
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr177
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr178
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr179
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr180
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr181
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr182
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr183
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr40
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr41
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr42
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr43
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr44
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr45
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr46
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr47
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr103_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr103_hi16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr102_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr102_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr100_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr100_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr101_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr98_hi16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr99_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr99_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr96_hi16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr97_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr97_hi16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr87_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr87_hi16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr85_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr85_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr83_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr83_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr84_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr81_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr81_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr82_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr71_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr71_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr70_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr80_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr80_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr69_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr70_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr68_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr68_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr67_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr67_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr66_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr66_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr65_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr65_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr64_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr64_hi16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr55_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr55_hi16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr54_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr54_hi16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr53_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr53_hi16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr52_lo16
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v22, v149, v22
-; GFX11-TRUE16-NEXT:    v_or_b16 v149.l, v23.l, v70.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v23.l, v149.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v31.h, v31.h, v51.l
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr70_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr52_hi16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr51_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr51_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr50_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr50_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr49_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr49_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr48_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr39_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr38_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr37_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr36_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr35_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr34_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr33_hi16
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v22, v101, v22
+; GFX11-TRUE16-NEXT:    v_or_b16 v101.l, v23.l, v48.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v23.l, v101.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v31.h, v31.h, v32.h
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr48_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr32_hi16
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v23, v149, v23
-; GFX11-TRUE16-NEXT:    v_or_b16 v149.l, v24.l, v67.h
-; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v24.l, v149.h
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr67_hi16
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v24, v149, v24
-; GFX11-TRUE16-NEXT:    v_or_b16 v149.l, v25.l, v66.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v25.l, v149.h
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr66_lo16
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v23, v101, v23
+; GFX11-TRUE16-NEXT:    v_or_b16 v101.l, v24.l, v39.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v24.l, v101.h
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr39_lo16
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v24, v101, v24
+; GFX11-TRUE16-NEXT:    v_or_b16 v101.l, v25.l, v38.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v25.l, v101.h
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr38_lo16
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v25, v149, v25
-; GFX11-TRUE16-NEXT:    v_or_b16 v149.l, v26.l, v64.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v26.l, v149.h
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr64_lo16
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v26, v149, v26
-; GFX11-TRUE16-NEXT:    v_or_b16 v149.l, v27.l, v54.h
-; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v27.l, v149.h
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr54_hi16
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v25, v101, v25
+; GFX11-TRUE16-NEXT:    v_or_b16 v101.l, v26.l, v37.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v26.l, v101.h
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr37_lo16
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v26, v101, v26
+; GFX11-TRUE16-NEXT:    v_or_b16 v101.l, v27.l, v36.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v27.l, v101.h
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr36_lo16
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v27, v149, v27
-; GFX11-TRUE16-NEXT:    v_or_b16 v149.l, v28.l, v53.h
-; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v28.l, v149.h
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr53_hi16
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v28, v149, v28
-; GFX11-TRUE16-NEXT:    v_or_b16 v149.l, v29.l, v52.h
-; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v29.l, v149.h
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr52_hi16
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v27, v101, v27
+; GFX11-TRUE16-NEXT:    v_or_b16 v101.l, v28.l, v35.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v28.l, v101.h
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr35_lo16
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v28, v101, v28
+; GFX11-TRUE16-NEXT:    v_or_b16 v101.l, v29.l, v34.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v29.l, v101.h
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr34_lo16
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v29, v149, v29
-; GFX11-TRUE16-NEXT:    v_or_b16 v149.l, v30.l, v51.h
-; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v30.l, v149.h
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr51_hi16
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v30, v149, v30
-; GFX11-TRUE16-NEXT:    v_or_b16 v149.l, v31.l, v50.h
-; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v31.l, v149.h
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr50_hi16
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v29, v101, v29
+; GFX11-TRUE16-NEXT:    v_or_b16 v101.l, v30.l, v33.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v30.l, v101.h
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr33_lo16
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v30, v101, v30
+; GFX11-TRUE16-NEXT:    v_or_b16 v101.l, v31.l, v32.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v31.l, v101.h
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr32_lo16
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v31, v149, v31
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr149_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr149_lo16
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v31, v101, v31
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr101_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr101_lo16
+; GFX11-TRUE16-NEXT:  .LBB74_2: ; %Flow
 ; GFX11-TRUE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX11-TRUE16-NEXT:    s_cbranch_execz .LBB74_2
-; GFX11-TRUE16-NEXT:  .LBB74_4: ; %cmp.true
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v149.h, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, v149.l, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, v146.h, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.h, v146.l, 3
+; GFX11-TRUE16-NEXT:    s_cbranch_execz .LBB74_4
+; GFX11-TRUE16-NEXT:  ; %bb.3: ; %cmp.true
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v101.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, v101.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, v98.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.h, v98.l, 3
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v31.h, 0
 ; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v0.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v1.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v1.h
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v31.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v151.l, v0.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v151.h, v0.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v150.h, v1.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, v150.l, v1.h
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, v145.h, 3
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v103.l, v0.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v103.h, v0.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v102.h, v1.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, v102.l, v1.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, v97.h, 3
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v31.l, 0x300, v0.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v3.h, 0x300, v0.h
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.h, v144.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.h, v96.l, 3
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v4.h, 0x300, v1.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, v134.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, v86.h, 3
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v31, v3
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v31.l, 0x300, v1.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v2.h
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.h, v134.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.h, v86.l, 3
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v4.l, v31.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v3.l, v148.l, v1.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v3.l, v100.l, v1.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v3.h, v148.h, v1.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v3.h, v100.h, v1.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v2.h, 0xff, v2.h
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v31, v4
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v31.l, 0x300, v3.l
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v31.h
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v5.h, 0x300, v3.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v3.l, v147.l, v2.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v3.h, v147.h, v2.h
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v4.l, v132.h, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v4.h, v131.l, 3
+; GFX11-TRUE16-NEXT:    v_or_b16 v3.l, v99.l, v2.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v3.h, v99.h, v2.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v4.l, v84.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v4.h, v83.l, 3
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v2, v31, v5
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v31.l, 0x300, v3.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v6.h, 0x300, v3.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v3.l, 0xff, v4.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v3.h, 0xff, v4.h
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v4.l, v130.h, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v4.h, v130.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v4.l, v82.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v4.h, v82.l, 3
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v6.l, v31.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v5.l, v144.h, v3.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v5.h, v145.l, v3.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v5.l, v96.h, v3.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v5.h, v97.l, v3.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v4.l, 0xff, v4.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v4.h, 0xff, v4.h
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v3, v31, v6
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v31.l, 0x300, v5.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v7.h, 0x300, v5.h
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v7.l, v31.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v5.l, v135.l, v4.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v5.h, v135.h, v4.h
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v6.l, v119.h, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v6.h, v119.l, 3
+; GFX11-TRUE16-NEXT:    v_or_b16 v5.l, v87.l, v4.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v5.h, v87.h, v4.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v6.l, v71.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v6.h, v71.l, 3
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v4, v31, v7
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v31.l, 0x300, v5.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v8.h, 0x300, v5.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v5.l, 0xff, v6.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v5.h, 0xff, v6.h
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v6.l, v118.h, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v6.h, v117.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v6.l, v70.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v6.h, v69.l, 3
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v8.l, v31.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v7.l, v133.l, v5.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v7.h, v133.h, v5.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v7.l, v85.l, v5.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v7.h, v85.h, v5.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v6.l, 0xff, v6.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v6.h, 0xff, v6.h
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v5, v31, v8
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v31.l, 0x300, v7.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v9.h, 0x300, v7.h
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v9.l, v31.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v7.l, v131.h, v6.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v7.h, v132.l, v6.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v7.l, v83.h, v6.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v7.h, v84.l, v6.h
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v8.l, v115.h, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v8.h, v115.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v8.l, v47.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v8.h, v46.l, 3
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v6, v31, v9
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v31.l, 0x300, v7.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v10.h, 0x300, v7.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v7.l, 0xff, v8.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v7.h, 0xff, v8.h
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v8.l, v113.h, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v8.h, v112.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v8.l, v45.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v8.h, v44.l, 3
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v10.l, v31.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v9.l, v129.l, v7.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v9.h, v129.h, v7.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v9.l, v81.l, v7.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v9.h, v81.h, v7.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v8.l, 0xff, v8.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v8.h, 0xff, v8.h
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v7, v31, v10
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v31.l, 0x300, v9.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v11.h, 0x300, v9.h
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v11.l, v31.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v9.l, v128.l, v8.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v9.h, v128.h, v8.h
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v10.l, v103.h, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v10.h, v103.l, 3
+; GFX11-TRUE16-NEXT:    v_or_b16 v9.l, v80.l, v8.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v9.h, v80.h, v8.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v10.l, v43.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v10.h, v42.l, 3
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v8, v31, v11
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v31.l, 0x300, v9.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v12.h, 0x300, v9.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v9.l, 0xff, v10.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v9.h, 0xff, v10.h
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v10.l, v101.h, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v10.h, v100.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v10.l, v41.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v10.h, v40.l, 3
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v12.l, v31.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v11.l, v117.h, v9.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v11.h, v118.l, v9.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v11.l, v69.h, v9.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v11.h, v70.l, v9.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v10.l, 0xff, v10.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v10.h, 0xff, v10.h
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v9, v31, v12
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v31.l, 0x300, v11.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v13.h, 0x300, v11.h
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v13.l, v31.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v11.l, v116.l, v10.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v11.h, v116.h, v10.h
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v12.l, v99.h, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v12.h, v98.l, 3
+; GFX11-TRUE16-NEXT:    v_or_b16 v11.l, v68.l, v10.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v11.h, v68.h, v10.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v12.l, v183.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v12.h, v182.l, 3
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v10, v31, v13
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v31.l, 0x300, v11.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v14.h, 0x300, v11.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v11.l, 0xff, v12.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v11.h, 0xff, v12.h
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v12.l, v96.h, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v12.h, v96.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v12.l, v181.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v12.h, v180.l, 3
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v14.l, v31.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v13.l, v114.l, v11.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v13.h, v114.h, v11.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v13.l, v67.l, v11.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v13.h, v67.h, v11.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v12.l, 0xff, v12.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v12.h, 0xff, v12.h
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v11, v31, v14
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v31.l, 0x300, v13.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v15.h, 0x300, v13.h
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v15.l, v31.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v13.l, v112.h, v12.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v13.h, v113.l, v12.h
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v14.l, v86.h, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v14.h, v86.l, 3
+; GFX11-TRUE16-NEXT:    v_or_b16 v13.l, v66.l, v12.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v13.h, v66.h, v12.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v14.l, v179.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v14.h, v178.l, 3
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v12, v31, v15
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v31.l, 0x300, v13.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v16.h, 0x300, v13.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v13.l, 0xff, v14.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v13.h, 0xff, v14.h
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v14.l, v84.h, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v14.h, v84.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v14.l, v177.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v14.h, v176.l, 3
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v16.l, v31.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v15.l, v102.l, v13.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v15.h, v102.h, v13.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v15.l, v65.l, v13.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v15.h, v65.h, v13.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v14.l, 0xff, v14.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v14.h, 0xff, v14.h
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v13, v31, v16
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v31.l, 0x300, v15.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v17.h, 0x300, v15.h
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v17.l, v31.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v15.l, v100.h, v14.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v15.h, v101.l, v14.h
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v16.l, v82.h, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v16.h, v81.l, 3
+; GFX11-TRUE16-NEXT:    v_or_b16 v15.l, v64.l, v14.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v15.h, v64.h, v14.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v16.l, v167.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v16.h, v166.l, 3
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v14, v31, v17
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v31.l, 0x300, v15.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v18.h, 0x300, v15.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v15.l, 0xff, v16.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v15.h, 0xff, v16.h
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v16.l, v80.h, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v16.h, v80.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v16.l, v165.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v16.h, v164.l, 3
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v18.l, v31.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v17.l, v98.h, v15.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v17.h, v99.l, v15.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v17.l, v55.l, v15.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v17.h, v55.h, v15.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v16.l, 0xff, v16.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v16.h, 0xff, v16.h
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v15, v31, v18
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v31.l, 0x300, v17.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v19.h, 0x300, v17.h
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v19.l, v31.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v17.l, v97.l, v16.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v17.h, v97.h, v16.h
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v18.l, v69.h, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v18.h, v69.l, 3
+; GFX11-TRUE16-NEXT:    v_or_b16 v17.l, v54.l, v16.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v17.h, v54.h, v16.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v18.l, v163.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v18.h, v162.l, 3
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v16, v31, v19
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v31.l, 0x300, v17.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v20.h, 0x300, v17.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v17.l, 0xff, v18.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v17.h, 0xff, v18.h
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v18.l, v68.h, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v18.h, v67.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v18.l, v161.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v18.h, v160.l, 3
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v20.l, v31.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v19.l, v87.l, v17.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v19.h, v87.h, v17.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v19.l, v53.l, v17.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v19.h, v53.h, v17.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v18.l, 0xff, v18.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v18.h, 0xff, v18.h
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v17, v31, v20
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v31.l, 0x300, v19.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v21.h, 0x300, v19.h
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v21.l, v31.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v19.l, v85.l, v18.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v19.h, v85.h, v18.h
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v20.l, v65.h, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v20.h, v65.l, 3
+; GFX11-TRUE16-NEXT:    v_or_b16 v19.l, v52.l, v18.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v19.h, v52.h, v18.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v20.l, v151.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v20.h, v150.l, 3
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v18, v31, v21
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v31.l, 0x300, v19.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v22.h, 0x300, v19.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v19.l, 0xff, v20.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v19.h, 0xff, v20.h
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v20.l, v55.h, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v20.h, v50.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v20.l, v149.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v20.h, v148.l, 3
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v22.l, v31.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v21.l, v83.l, v19.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v21.h, v83.h, v19.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v21.l, v51.l, v19.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v21.h, v51.h, v19.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v20.l, 0xff, v20.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v20.h, 0xff, v20.h
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v19, v31, v22
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v31.l, 0x300, v21.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v23.h, 0x300, v21.h
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v23.l, v31.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v21.l, v81.h, v20.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v21.h, v82.l, v20.h
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v22.l, v49.h, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v22.h, v49.l, 3
+; GFX11-TRUE16-NEXT:    v_or_b16 v21.l, v50.l, v20.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v21.h, v50.h, v20.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v22.l, v147.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v22.h, v146.l, 3
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v20, v31, v23
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v31.l, 0x300, v21.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v24.h, 0x300, v21.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v21.l, 0xff, v22.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v21.h, 0xff, v22.h
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v22.l, v48.h, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v22.h, v48.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v22.l, v145.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v22.h, v144.l, 3
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v24.l, v31.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v23.l, v71.l, v21.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v23.h, v71.h, v21.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v23.l, v49.l, v21.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v23.h, v49.h, v21.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v22.l, 0xff, v22.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v22.h, 0xff, v22.h
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v21, v31, v24
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v31.l, 0x300, v23.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v25.h, 0x300, v23.h
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v25.l, v31.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v23.l, v70.l, v22.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v23.h, v70.h, v22.h
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v24.l, v39.h, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v24.h, v39.l, 3
+; GFX11-TRUE16-NEXT:    v_or_b16 v23.l, v48.l, v22.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v23.h, v48.h, v22.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v24.l, v135.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v24.h, v134.l, 3
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v22, v31, v25
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v31.l, 0x300, v23.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v26.h, 0x300, v23.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v23.l, 0xff, v24.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v23.h, 0xff, v24.h
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v24.l, v38.h, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v24.h, v38.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v24.l, v133.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v24.h, v132.l, 3
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v26.l, v31.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v25.l, v67.h, v23.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v25.h, v68.l, v23.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v25.l, v39.l, v23.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v25.h, v39.h, v23.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v24.l, 0xff, v24.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v24.h, 0xff, v24.h
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v23, v31, v26
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v31.l, 0x300, v25.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v27.h, 0x300, v25.h
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v27.l, v31.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v25.l, v66.l, v24.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v25.h, v66.h, v24.h
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v26.l, v37.h, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v26.h, v37.l, 3
+; GFX11-TRUE16-NEXT:    v_or_b16 v25.l, v38.l, v24.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v25.h, v38.h, v24.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v26.l, v131.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v26.h, v130.l, 3
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v24, v31, v27
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v31.l, 0x300, v25.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v28.h, 0x300, v25.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v25.l, 0xff, v26.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v25.h, 0xff, v26.h
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v26.l, v36.h, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v26.h, v36.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v26.l, v129.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v26.h, v128.l, 3
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v28.l, v31.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v27.l, v64.l, v25.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v27.h, v64.h, v25.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v27.l, v37.l, v25.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v27.h, v37.h, v25.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v26.l, 0xff, v26.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v26.h, 0xff, v26.h
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v25, v31, v28
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v31.l, 0x300, v27.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v29.h, 0x300, v27.h
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v29.l, v31.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v27.l, v54.h, v26.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v27.h, v55.l, v26.h
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v28.l, v35.h, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v28.h, v35.l, 3
+; GFX11-TRUE16-NEXT:    v_or_b16 v27.l, v36.l, v26.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v27.h, v36.h, v26.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v28.l, v119.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v28.h, v118.l, 3
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v26, v31, v29
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v31.l, 0x300, v27.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v30.h, 0x300, v27.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v27.l, 0xff, v28.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v27.h, 0xff, v28.h
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v28.l, v34.h, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v28.h, v34.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v28.l, v117.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v28.h, v116.l, 3
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v30.l, v31.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v29.l, v53.h, v27.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v29.h, v54.l, v27.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v29.l, v35.l, v27.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v29.h, v35.h, v27.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v28.l, 0xff, v28.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v28.h, 0xff, v28.h
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v27, v31, v30
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v31.l, 0x300, v29.l
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v34.h, 0x300, v29.h
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v34.l, v31.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v29.l, v52.h, v28.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v29.h, v53.l, v28.h
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v30.l, v33.h, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v30.h, v33.l, 3
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v28, v31, v34
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v35.h, 0x300, v29.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v35.l, v31.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v29.l, v34.l, v28.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v29.h, v34.h, v28.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v30.l, v115.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v30.h, v114.l, 3
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v28, v31, v35
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v31.l, 0x300, v29.l
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v33.h, 0x300, v29.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v34.h, 0x300, v29.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v29.l, 0xff, v30.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v29.h, 0xff, v30.h
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v30.l, v32.h, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v30.h, v32.l, 3
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v33.l, v31.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v32.l, v51.h, v29.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v32.h, v52.l, v29.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v30.l, v113.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v30.h, v112.l, 3
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v34.l, v31.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v33.l, v33.l, v29.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v33.h, v33.h, v29.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v30.l, 0xff, v30.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v30.h, 0xff, v30.h
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v29, v31, v33
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v31.l, 0x300, v32.l
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v33.h, 0x300, v32.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v32.l, v50.h, v30.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v32.h, v51.l, v30.h
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v29, v31, v34
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v31.l, 0x300, v33.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v33.h, 0x300, v33.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v33.l, v31.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v32.l, v32.l, v30.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v32.h, v32.h, v30.h
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v30, v31, v33
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v31.l, 0x300, v32.l
@@ -122288,7 +119736,48 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) {
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v32.h, 0x300, v32.h
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v32.l, v31.h
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v31, v31, v32
+; GFX11-TRUE16-NEXT:  .LBB74_4: ; %end
 ; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_clause 0x1f
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v108, off, s32 offset:392
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v107, off, s32 offset:396
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v106, off, s32 offset:400
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v105, off, s32 offset:404
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v104, off, s32 offset:408
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v95, off, s32 offset:412
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v94, off, s32 offset:416
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v93, off, s32 offset:420
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v92, off, s32 offset:424
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v91, off, s32 offset:428
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v90, off, s32 offset:432
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v89, off, s32 offset:436
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v88, off, s32 offset:440
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v79, off, s32 offset:444
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v78, off, s32 offset:448
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v77, off, s32 offset:452
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v76, off, s32 offset:456
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v75, off, s32 offset:460
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v74, off, s32 offset:464
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v73, off, s32 offset:468
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v72, off, s32 offset:472
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v63, off, s32 offset:476
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v62, off, s32 offset:480
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v61, off, s32 offset:484
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v60, off, s32 offset:488
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v59, off, s32 offset:492
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v58, off, s32 offset:496
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v57, off, s32 offset:500
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v56, off, s32 offset:504
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v47, off, s32 offset:508
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v46, off, s32 offset:512
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v45, off, s32 offset:516
+; GFX11-TRUE16-NEXT:    s_clause 0x4
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v44, off, s32 offset:520
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v43, off, s32 offset:524
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v42, off, s32 offset:528
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v41, off, s32 offset:532
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v40, off, s32 offset:536
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-FAKE16-LABEL: bitcast_v128i8_to_v16f64:
@@ -126554,1887 +124043,946 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a,
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-TRUE16-LABEL: bitcast_v128i8_to_v16f64_scalar:
-; GFX11-TRUE16:       ; %bb.0:
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT:    s_clause 0x1f
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v40, s32 offset:476
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v41, s32 offset:472
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v42, s32 offset:468
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v43, s32 offset:464
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v44, s32 offset:460
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v45, s32 offset:456
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v46, s32 offset:452
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v47, s32 offset:448
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v56, s32 offset:444
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v57, s32 offset:440
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v58, s32 offset:436
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v59, s32 offset:432
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v60, s32 offset:428
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v61, s32 offset:424
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v62, s32 offset:420
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v63, s32 offset:416
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v72, s32 offset:412
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v73, s32 offset:408
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v74, s32 offset:404
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v75, s32 offset:400
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v76, s32 offset:396
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v77, s32 offset:392
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v78, s32 offset:388
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v79, s32 offset:384
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v88, s32 offset:380
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v89, s32 offset:376
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v90, s32 offset:372
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v91, s32 offset:368
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v92, s32 offset:364
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v93, s32 offset:360
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v94, s32 offset:356
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v95, s32 offset:352
-; GFX11-TRUE16-NEXT:    s_clause 0x7
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v104, s32 offset:348
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v105, s32 offset:344
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v106, s32 offset:340
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v107, s32 offset:336
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v108, s32 offset:332
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v109, s32 offset:328
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v110, s32 offset:324
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v111, s32 offset:320
-; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v32, v30 :: v_dual_mov_b32 v33, v28
-; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v34, v26 :: v_dual_mov_b32 v35, v24
-; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v36, v22 :: v_dual_mov_b32 v37, v20
-; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v38, v18 :: v_dual_mov_b32 v39, v16
-; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v48, v14 :: v_dual_mov_b32 v49, v12
-; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v50, v10 :: v_dual_mov_b32 v51, v8
-; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v52, v6 :: v_dual_mov_b32 v53, v4
-; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v54, v2 :: v_dual_mov_b32 v55, v0
-; GFX11-TRUE16-NEXT:    s_clause 0x1f
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v0, off, s32 offset:316
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v2, off, s32
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v4, off, s32 offset:8
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v6, off, s32 offset:16
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v8, off, s32 offset:24
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v10, off, s32 offset:32
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v12, off, s32 offset:40
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v14, off, s32 offset:48
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v16, off, s32 offset:56
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v18, off, s32 offset:64
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v20, off, s32 offset:72
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v22, off, s32 offset:80
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v24, off, s32 offset:88
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v26, off, s32 offset:96
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v28, off, s32 offset:104
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v30, off, s32 offset:112
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v31, off, s32 offset:120
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v87, off, s32 offset:128
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v96, off, s32 offset:136
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v97, off, s32 offset:144
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v98, off, s32 offset:152
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v99, off, s32 offset:160
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v113, off, s32 offset:168
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v114, off, s32 offset:176
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v115, off, s32 offset:184
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v116, off, s32 offset:192
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v117, off, s32 offset:200
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v128, off, s32 offset:208
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v129, off, s32 offset:216
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v130, off, s32 offset:224
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v131, off, s32 offset:232
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v94, off, s32 offset:240
-; GFX11-TRUE16-NEXT:    s_clause 0x1f
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v95, off, s32 offset:248
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v104, off, s32 offset:256
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v105, off, s32 offset:264
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v106, off, s32 offset:272
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v107, off, s32 offset:280
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v108, off, s32 offset:288
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v109, off, s32 offset:296
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v110, off, s32 offset:304
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v111, off, s32 offset:312
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v64, off, s32 offset:308
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v65, off, s32 offset:300
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v66, off, s32 offset:292
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v67, off, s32 offset:284
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v68, off, s32 offset:276
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v69, off, s32 offset:268
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v70, off, s32 offset:260
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v71, off, s32 offset:252
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v80, off, s32 offset:244
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v81, off, s32 offset:236
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v82, off, s32 offset:228
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v83, off, s32 offset:220
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v84, off, s32 offset:212
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v85, off, s32 offset:204
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v86, off, s32 offset:196
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v100, off, s32 offset:188
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v101, off, s32 offset:180
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v102, off, s32 offset:172
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v103, off, s32 offset:164
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v112, off, s32 offset:156
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v118, off, s32 offset:148
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v145, off, s32 offset:140
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v146, off, s32 offset:132
-; GFX11-TRUE16-NEXT:    s_clause 0xf
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v147, off, s32 offset:124
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v148, off, s32 offset:116
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v162, off, s32 offset:108
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v163, off, s32 offset:100
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v164, off, s32 offset:92
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v165, off, s32 offset:84
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v166, off, s32 offset:76
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v180, off, s32 offset:68
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v181, off, s32 offset:60
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v182, off, s32 offset:52
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v183, off, s32 offset:44
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v45, off, s32 offset:36
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v46, off, s32 offset:28
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v47, off, s32 offset:20
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v56, off, s32 offset:12
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v62, off, s32 offset:4
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v89, 8, v1
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v90, 8, v3
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v91, 8, v5
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v92, 8, v7
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v93, 8, v9
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v76, 8, v11
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v77, 8, v13
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v78, 8, v15
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v79, 8, v17
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v88, 8, v19
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v63, 8, v21
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v72, 8, v23
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v73, 8, v25
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v74, 8, v27
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v75, 8, v29
-; GFX11-TRUE16-NEXT:    s_mov_b32 s4, 0
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(62)
-; GFX11-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v57, 8, v2
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v58, 8, v4
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v59, 8, v6
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v60, 8, v8
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v61, 8, v10
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v40, 8, v12
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v41, 8, v14
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v42, 8, v16
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v43, 8, v18
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v44, 8, v20
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v167, 8, v22
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v176, 8, v24
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v177, 8, v26
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v178, 8, v28
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v179, 8, v30
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v149, 8, v31
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v150, 8, v87
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(61)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v151, 8, v96
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(60)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v160, 8, v97
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(59)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v161, 8, v98
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(58)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v132, 8, v99
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(57)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v133, 8, v113
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(56)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v134, 8, v114
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(55)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v135, 8, v115
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(54)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v144, 8, v116
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(53)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v119, 8, v117
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(52)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v128, 8, v128
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(51)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v129, 8, v129
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(50)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v130, 8, v130
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(49)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v131, 8, v131
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(48)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v113, 8, v94
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(47)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v114, 8, v95
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(46)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v115, 8, v104
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(45)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v116, 8, v105
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(44)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v117, 8, v106
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(43)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v87, 8, v107
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(42)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v96, 8, v108
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(41)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v97, 8, v109
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(40)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v98, 8, v110
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(39)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v99, 8, v111
-; GFX11-TRUE16-NEXT:    s_and_b32 s5, vcc_lo, exec_lo
-; GFX11-TRUE16-NEXT:    s_cbranch_scc0 .LBB75_4
-; GFX11-TRUE16-NEXT:  ; %bb.1: ; %cmp.false
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v54
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff, v53
-; GFX11-TRUE16-NEXT:    s_and_b32 s5, s28, 0xff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s6, s29, 8
-; GFX11-TRUE16-NEXT:    s_and_b32 s7, s2, 0xff
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v0, v90
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v1, v91
-; GFX11-TRUE16-NEXT:    s_or_b32 s5, s5, s6
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s6, s1, 8
-; GFX11-TRUE16-NEXT:    s_and_b32 s5, s5, 0xffff
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s8, s3, 8
-; GFX11-TRUE16-NEXT:    s_and_b32 s9, s18, 0xff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s10, s19, 8
-; GFX11-TRUE16-NEXT:    s_and_b32 s11, s26, 0xff
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v5, v0, v1
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v50
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff, v49
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s12, s27, 8
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v0, v76
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v1, v77
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v7, v0, v1
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v48
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff, v39
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v0, v78
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v1, v79
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v8, v0, v1
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v38
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff, v37
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v0, v88
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v1, v63
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v9, v0, v1
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v36
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff, v35
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v0, v72
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v1, v73
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v10, v0, v1
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v34
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff, v33
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v0, v74
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v1, v75
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v11, v0, v1
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v32
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff, v62
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v0, v57
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v1, v58
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v12, v0, v1
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v56
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff, v47
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v0, v59
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v1, v60
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v13, v0, v1
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v46
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff, v45
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v0, v61
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v1, v40
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v14, v0, v1
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v183
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff, v182
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v0, v41
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v1, v42
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v15, v0, v1
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v181
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff, v180
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v0, v43
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v1, v44
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v16, v0, v1
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v166
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff, v165
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v0, v167
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v1, v176
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v17, v0, v1
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v164
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff, v163
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v0, v177
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v1, v178
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v18, v0, v1
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v162
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff, v148
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v0, v179
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v1, v149
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v19, v0, v1
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v147
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff, v146
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v0, v150
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v1, v151
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v20, v0, v1
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v145
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff, v118
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v0, v160
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v1, v161
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v21, v0, v1
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v112
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff, v103
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v0, v132
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v1, v133
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v22, v0, v1
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v102
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff, v101
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v0, v134
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v1, v135
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v23, v0, v1
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v100
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff, v86
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v0, v144
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v1, v119
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v24, v0, v1
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v85
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff, v84
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v0, v128
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v1, v129
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v25, v0, v1
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v83
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff, v82
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v0, v130
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v1, v131
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v26, v0, v1
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v81
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff, v80
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v0, v113
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v1, v114
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v27, v0, v1
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v71
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff, v70
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v0, v115
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v1, v116
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v28, v0, v1
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v69
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff, v68
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v0, v117
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v1, v87
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v29, v0, v1
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v67
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff, v66
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v0, v96
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v1, v97
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v30, v0, v1
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v65
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff, v64
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v0, v98
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v1, v99
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v31, v0, v1
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v55
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v0, v89
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v4, s5, v0
-; GFX11-TRUE16-NEXT:    s_and_b32 s5, s0, 0xff
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-TRUE16-NEXT:    s_or_b32 s5, s5, s6
-; GFX11-TRUE16-NEXT:    s_or_b32 s6, s7, s8
-; GFX11-TRUE16-NEXT:    s_and_b32 s7, s16, 0xff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s8, s17, 8
-; GFX11-TRUE16-NEXT:    s_and_b32 s5, s5, 0xffff
-; GFX11-TRUE16-NEXT:    s_or_b32 s7, s7, s8
-; GFX11-TRUE16-NEXT:    s_or_b32 s8, s9, s10
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s6, s6, 16
-; GFX11-TRUE16-NEXT:    s_and_b32 s7, s7, 0xffff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s8, s8, 16
-; GFX11-TRUE16-NEXT:    s_or_b32 s5, s5, s6
-; GFX11-TRUE16-NEXT:    s_or_b32 s6, s7, s8
-; GFX11-TRUE16-NEXT:    s_and_b32 s7, s20, 0xff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s8, s21, 8
-; GFX11-TRUE16-NEXT:    s_and_b32 s9, s22, 0xff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s10, s23, 8
-; GFX11-TRUE16-NEXT:    s_or_b32 s7, s7, s8
-; GFX11-TRUE16-NEXT:    s_or_b32 s8, s9, s10
-; GFX11-TRUE16-NEXT:    s_and_b32 s9, s24, 0xff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s10, s25, 8
-; GFX11-TRUE16-NEXT:    s_and_b32 s7, s7, 0xffff
-; GFX11-TRUE16-NEXT:    s_or_b32 s9, s9, s10
-; GFX11-TRUE16-NEXT:    s_or_b32 s10, s11, s12
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s8, s8, 16
-; GFX11-TRUE16-NEXT:    s_and_b32 s9, s9, 0xffff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s10, s10, 16
-; GFX11-TRUE16-NEXT:    s_or_b32 s7, s7, s8
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v3, 0xff, v51
-; GFX11-TRUE16-NEXT:    s_or_b32 s8, s9, s10
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v2, 0xff, v52
-; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, s6
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v3, v3, v93
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v2, v2, v92
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v6, v2, v3
-; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v2, s7 :: v_dual_mov_b32 v3, s8
-; GFX11-TRUE16-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s4
-; GFX11-TRUE16-NEXT:    s_cbranch_vccnz .LBB75_3
-; GFX11-TRUE16-NEXT:  .LBB75_2: ; %cmp.true
-; GFX11-TRUE16-NEXT:    s_add_i32 s0, s0, 3
-; GFX11-TRUE16-NEXT:    s_add_i32 s2, s2, 3
-; GFX11-TRUE16-NEXT:    s_and_b32 s0, s0, 0xff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s1, s1, 8
-; GFX11-TRUE16-NEXT:    s_and_b32 s2, s2, 0xff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s3, s3, 8
-; GFX11-TRUE16-NEXT:    s_or_b32 s0, s1, s0
-; GFX11-TRUE16-NEXT:    s_or_b32 s1, s3, s2
-; GFX11-TRUE16-NEXT:    s_addk_i32 s0, 0x300
-; GFX11-TRUE16-NEXT:    s_addk_i32 s1, 0x300
-; GFX11-TRUE16-NEXT:    s_and_b32 s0, s0, 0xffff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s1, s1, 16
-; GFX11-TRUE16-NEXT:    s_add_i32 s16, s16, 3
-; GFX11-TRUE16-NEXT:    s_or_b32 s0, s0, s1
-; GFX11-TRUE16-NEXT:    s_and_b32 s1, s16, 0xff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s2, s17, 8
-; GFX11-TRUE16-NEXT:    s_add_i32 s18, s18, 3
-; GFX11-TRUE16-NEXT:    s_or_b32 s1, s2, s1
-; GFX11-TRUE16-NEXT:    s_and_b32 s2, s18, 0xff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s3, s19, 8
-; GFX11-TRUE16-NEXT:    s_addk_i32 s1, 0x300
-; GFX11-TRUE16-NEXT:    s_or_b32 s2, s3, s2
-; GFX11-TRUE16-NEXT:    s_add_i32 s20, s20, 3
-; GFX11-TRUE16-NEXT:    s_addk_i32 s2, 0x300
-; GFX11-TRUE16-NEXT:    s_and_b32 s1, s1, 0xffff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s2, s2, 16
-; GFX11-TRUE16-NEXT:    s_and_b32 s3, s20, 0xff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s4, s21, 8
-; GFX11-TRUE16-NEXT:    s_add_i32 s22, s22, 3
-; GFX11-TRUE16-NEXT:    s_or_b32 s1, s1, s2
-; GFX11-TRUE16-NEXT:    s_or_b32 s2, s4, s3
-; GFX11-TRUE16-NEXT:    s_and_b32 s3, s22, 0xff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s4, s23, 8
-; GFX11-TRUE16-NEXT:    s_add_i32 s24, s24, 3
-; GFX11-TRUE16-NEXT:    s_or_b32 s3, s4, s3
-; GFX11-TRUE16-NEXT:    s_and_b32 s4, s24, 0xff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s5, s25, 8
-; GFX11-TRUE16-NEXT:    s_addk_i32 s2, 0x300
-; GFX11-TRUE16-NEXT:    s_addk_i32 s3, 0x300
-; GFX11-TRUE16-NEXT:    s_or_b32 s4, s5, s4
-; GFX11-TRUE16-NEXT:    s_and_b32 s2, s2, 0xffff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s3, s3, 16
-; GFX11-TRUE16-NEXT:    s_addk_i32 s4, 0x300
-; GFX11-TRUE16-NEXT:    s_add_i32 s26, s26, 3
-; GFX11-TRUE16-NEXT:    s_or_b32 s2, s2, s3
-; GFX11-TRUE16-NEXT:    s_and_b32 s3, s4, 0xffff
-; GFX11-TRUE16-NEXT:    s_and_b32 s4, s26, 0xff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s5, s27, 8
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v0, 3, v55
-; GFX11-TRUE16-NEXT:    s_or_b32 s4, s5, s4
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v1, 3, v54
-; GFX11-TRUE16-NEXT:    s_addk_i32 s4, 0x300
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v3, 3, v52
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s4, s4, 16
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v4, 3, v51
-; GFX11-TRUE16-NEXT:    s_or_b32 s3, s3, s4
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v2, 3, v53
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v7, 3, v38
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v11, 3, v34
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v12, 3, v33
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v89, v0
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v90, v1
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v2, v91, v2
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v3, v92, v3
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v4, v93, v4
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v7, 0xff, v7
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v11, 0xff, v11
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v12, 0xff, v12
-; GFX11-TRUE16-NEXT:    s_add_i32 s28, s28, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v0, 0x300, v0
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v1, 0x300, v1
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v2, 0x300, v2
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v3, 0x300, v3
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v4, 0x300, v4
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v7, v88, v7
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v11, v74, v11
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v12, v75, v12
-; GFX11-TRUE16-NEXT:    s_and_b32 s5, s28, 0xff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s6, s29, 8
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX11-TRUE16-NEXT:    s_or_b32 s5, s6, s5
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v3, 0xffff, v3
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v4
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v7, 0x300, v7
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v11, 0x300, v11
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v12, 0x300, v12
-; GFX11-TRUE16-NEXT:    s_addk_i32 s5, 0x300
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v5, v1, v2
-; GFX11-TRUE16-NEXT:    s_and_b32 s5, s5, 0xffff
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v6, v3, v6
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v4, s5, v0
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v0, 3, v50
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v1, 3, v49
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v2, 3, v48
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v3, 3, v39
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v9, 0xffff, v7
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v7, 3, v37
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v8, 3, v36
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v10, 3, v35
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v11, 0xffff, v11
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v7, 0xff, v7
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v8, 0xff, v8
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v10, 0xff, v10
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v11, v11, v12
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(3)
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v12, 3, v46
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v16, 3, v181
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v17, 3, v180
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v76, v0
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v77, v1
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v2, v78, v2
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v3, v79, v3
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v7, v63, v7
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v8, v72, v8
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v10, v73, v10
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v12, 0xff, v12
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v16, 0xff, v16
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v17, 0xff, v17
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v0, 0x300, v0
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v1, 0x300, v1
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v2, 0x300, v2
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v3, 0x300, v3
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v7, 0x300, v7
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v8, 0x300, v8
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v10, 0x300, v10
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v12, v61, v12
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v16, v43, v16
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v17, v44, v17
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v7
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v14, 0xffff, v8
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v12, 0x300, v12
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v16, 0x300, v16
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v17, 0x300, v17
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v7, v0, v1
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v8, v2, v3
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v9, v9, v13
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v10, v14, v10
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v0, 3, v32
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v1, 3, v62
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v2, 3, v56
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v3, 3, v47
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v14, 0xffff, v12
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v12, 3, v45
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v13, 3, v183
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v15, 3, v182
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v16, 0xffff, v16
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v17, 16, v17
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v12, 0xff, v12
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v13, 0xff, v13
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v15, 0xff, v15
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v16, v16, v17
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v17, 3, v162
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v21, 3, v145
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v22, 3, v118
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v57, v0
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v58, v1
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v2, v59, v2
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v3, v60, v3
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v12, v40, v12
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v13, v41, v13
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v15, v42, v15
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v17, 0xff, v17
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v21, 0xff, v21
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v22, 0xff, v22
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v0, 0x300, v0
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v1, 0x300, v1
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v2, 0x300, v2
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v3, 0x300, v3
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v12, 0x300, v12
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v13, 0x300, v13
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v15, 0x300, v15
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v17, v179, v17
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v21, v160, v21
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v22, v161, v22
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v18, 16, v12
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v19, 0xffff, v13
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v17, 0x300, v17
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v21, 0x300, v21
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v22, 0x300, v22
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v12, v0, v1
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v13, v2, v3
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v14, v14, v18
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v15, v19, v15
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v0, 3, v166
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v1, 3, v165
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v2, 3, v164
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v3, 3, v163
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v19, 0xffff, v17
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v17, 3, v148
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v18, 3, v147
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v20, 3, v146
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v21, 0xffff, v21
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v22
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v17, 0xff, v17
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v18, 0xff, v18
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v20, 0xff, v20
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v21, v21, v22
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v22, 3, v100
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v26, 3, v83
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v27, 3, v82
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v167, v0
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v176, v1
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v2, v177, v2
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v3, v178, v3
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v17, v149, v17
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v18, v150, v18
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v20, v151, v20
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v22, 0xff, v22
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v26, 0xff, v26
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v27, 0xff, v27
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v0, 0x300, v0
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v1, 0x300, v1
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v2, 0x300, v2
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v3, 0x300, v3
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v17, 0x300, v17
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v18, 0x300, v18
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v20, 0x300, v20
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v22, v144, v22
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v26, v130, v26
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v27, v131, v27
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v17
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v24, 0xffff, v18
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v20, 16, v20
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v22, 0x300, v22
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v26, 0x300, v26
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v27, 0x300, v27
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v17, v0, v1
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v18, v2, v3
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v19, v19, v23
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v20, v24, v20
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v0, 3, v112
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v1, 3, v103
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v2, 3, v102
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v3, 3, v101
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v24, 0xffff, v22
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v22, 3, v86
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v23, 3, v85
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v25, 3, v84
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v26, 0xffff, v26
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v27, 16, v27
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v22, 0xff, v22
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v23, 0xff, v23
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v25, 0xff, v25
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v26, v26, v27
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v27, 3, v69
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v132, v0
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v133, v1
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v2, v134, v2
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v3, v135, v3
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v22, v119, v22
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v23, v128, v23
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v25, v129, v25
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v27, 0xff, v27
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v0, 0x300, v0
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v1, 0x300, v1
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v2, 0x300, v2
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v3, 0x300, v3
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v22, 0x300, v22
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v23, 0x300, v23
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v25, 0x300, v25
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v27, v117, v27
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v28, 16, v22
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v29, 0xffff, v23
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v25
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v27, 0x300, v27
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v22, v0, v1
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v23, v2, v3
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v24, v24, v28
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v25, v29, v25
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v0, 3, v81
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v1, 3, v80
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v2, 3, v71
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v3, 3, v70
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v29, 0xffff, v27
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v27, 3, v68
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v28, 3, v67
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v30, 3, v66
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v31, 3, v65
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v32, 3, v64
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v27, 0xff, v27
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v28, 0xff, v28
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v30, 0xff, v30
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v31, 0xff, v31
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v32, 0xff, v32
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v113, v0
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v114, v1
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v2, v115, v2
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v3, v116, v3
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v27, v87, v27
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v28, v96, v28
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v30, v97, v30
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v31, v98, v31
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v32, v99, v32
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v0, 0x300, v0
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v1, 0x300, v1
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v2, 0x300, v2
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v3, 0x300, v3
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v27, 0x300, v27
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v28, 0x300, v28
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v30, 0x300, v30
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v31, 0x300, v31
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v32, 0x300, v32
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v33, 16, v27
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v34, 0xffff, v28
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v30, 16, v30
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v31, 0xffff, v31
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v32, 16, v32
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v27, v0, v1
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v28, v2, v3
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v29, v29, v33
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v30, v34, v30
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v31, v31, v32
-; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX11-TRUE16-NEXT:  .LBB75_3: ; %end
-; GFX11-TRUE16-NEXT:    s_clause 0x1f
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v111, off, s32 offset:320
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v110, off, s32 offset:324
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v109, off, s32 offset:328
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v108, off, s32 offset:332
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v107, off, s32 offset:336
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v106, off, s32 offset:340
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v105, off, s32 offset:344
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v104, off, s32 offset:348
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v95, off, s32 offset:352
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v94, off, s32 offset:356
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v93, off, s32 offset:360
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v92, off, s32 offset:364
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v91, off, s32 offset:368
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v90, off, s32 offset:372
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v89, off, s32 offset:376
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v88, off, s32 offset:380
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v79, off, s32 offset:384
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v78, off, s32 offset:388
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v77, off, s32 offset:392
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v76, off, s32 offset:396
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v75, off, s32 offset:400
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v74, off, s32 offset:404
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v73, off, s32 offset:408
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v72, off, s32 offset:412
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v63, off, s32 offset:416
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v62, off, s32 offset:420
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v61, off, s32 offset:424
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v60, off, s32 offset:428
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v59, off, s32 offset:432
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v58, off, s32 offset:436
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v57, off, s32 offset:440
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v56, off, s32 offset:444
-; GFX11-TRUE16-NEXT:    s_clause 0x7
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v47, off, s32 offset:448
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v46, off, s32 offset:452
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v45, off, s32 offset:456
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v44, off, s32 offset:460
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v43, off, s32 offset:464
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v42, off, s32 offset:468
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v41, off, s32 offset:472
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v40, off, s32 offset:476
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
-; GFX11-TRUE16-NEXT:  .LBB75_4:
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
-; GFX11-TRUE16-NEXT:    s_branch .LBB75_2
-;
-; GFX11-FAKE16-LABEL: bitcast_v128i8_to_v16f64_scalar:
-; GFX11-FAKE16:       ; %bb.0:
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-FAKE16-NEXT:    s_clause 0x1f
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v40, s32 offset:476
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v41, s32 offset:472
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v42, s32 offset:468
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v43, s32 offset:464
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v44, s32 offset:460
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v45, s32 offset:456
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v46, s32 offset:452
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v47, s32 offset:448
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v56, s32 offset:444
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v57, s32 offset:440
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v58, s32 offset:436
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v59, s32 offset:432
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v60, s32 offset:428
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v61, s32 offset:424
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v62, s32 offset:420
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v63, s32 offset:416
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v72, s32 offset:412
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v73, s32 offset:408
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v74, s32 offset:404
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v75, s32 offset:400
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v76, s32 offset:396
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v77, s32 offset:392
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v78, s32 offset:388
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v79, s32 offset:384
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v88, s32 offset:380
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v89, s32 offset:376
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v90, s32 offset:372
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v91, s32 offset:368
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v92, s32 offset:364
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v93, s32 offset:360
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v94, s32 offset:356
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v95, s32 offset:352
-; GFX11-FAKE16-NEXT:    s_clause 0x7
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v104, s32 offset:348
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v105, s32 offset:344
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v106, s32 offset:340
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v107, s32 offset:336
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v108, s32 offset:332
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v109, s32 offset:328
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v110, s32 offset:324
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v111, s32 offset:320
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v32, v30 :: v_dual_mov_b32 v33, v28
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v34, v26 :: v_dual_mov_b32 v35, v24
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v36, v22 :: v_dual_mov_b32 v37, v20
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v38, v18 :: v_dual_mov_b32 v39, v16
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v48, v14 :: v_dual_mov_b32 v49, v12
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v50, v10 :: v_dual_mov_b32 v51, v8
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v52, v6 :: v_dual_mov_b32 v53, v4
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v54, v2 :: v_dual_mov_b32 v55, v0
-; GFX11-FAKE16-NEXT:    s_clause 0x1f
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v0, off, s32 offset:316
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v2, off, s32
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v4, off, s32 offset:8
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v6, off, s32 offset:16
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v8, off, s32 offset:24
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v10, off, s32 offset:32
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v12, off, s32 offset:40
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v14, off, s32 offset:48
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v16, off, s32 offset:56
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v18, off, s32 offset:64
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v20, off, s32 offset:72
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v22, off, s32 offset:80
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v24, off, s32 offset:88
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v26, off, s32 offset:96
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v28, off, s32 offset:104
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v30, off, s32 offset:112
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v31, off, s32 offset:120
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v87, off, s32 offset:128
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v96, off, s32 offset:136
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v97, off, s32 offset:144
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v98, off, s32 offset:152
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v99, off, s32 offset:160
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v113, off, s32 offset:168
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v114, off, s32 offset:176
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v115, off, s32 offset:184
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v116, off, s32 offset:192
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v117, off, s32 offset:200
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v128, off, s32 offset:208
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v129, off, s32 offset:216
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v130, off, s32 offset:224
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v131, off, s32 offset:232
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v94, off, s32 offset:240
-; GFX11-FAKE16-NEXT:    s_clause 0x1f
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v95, off, s32 offset:248
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v104, off, s32 offset:256
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v105, off, s32 offset:264
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v106, off, s32 offset:272
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v107, off, s32 offset:280
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v108, off, s32 offset:288
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v109, off, s32 offset:296
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v110, off, s32 offset:304
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v111, off, s32 offset:312
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v64, off, s32 offset:308
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v65, off, s32 offset:300
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v66, off, s32 offset:292
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v67, off, s32 offset:284
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v68, off, s32 offset:276
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v69, off, s32 offset:268
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v70, off, s32 offset:260
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v71, off, s32 offset:252
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v80, off, s32 offset:244
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v81, off, s32 offset:236
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v82, off, s32 offset:228
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v83, off, s32 offset:220
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v84, off, s32 offset:212
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v85, off, s32 offset:204
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v86, off, s32 offset:196
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v100, off, s32 offset:188
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v101, off, s32 offset:180
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v102, off, s32 offset:172
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v103, off, s32 offset:164
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v112, off, s32 offset:156
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v118, off, s32 offset:148
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v145, off, s32 offset:140
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v146, off, s32 offset:132
-; GFX11-FAKE16-NEXT:    s_clause 0xf
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v147, off, s32 offset:124
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v148, off, s32 offset:116
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v162, off, s32 offset:108
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v163, off, s32 offset:100
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v164, off, s32 offset:92
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v165, off, s32 offset:84
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v166, off, s32 offset:76
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v180, off, s32 offset:68
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v181, off, s32 offset:60
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v182, off, s32 offset:52
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v183, off, s32 offset:44
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v45, off, s32 offset:36
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v46, off, s32 offset:28
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v47, off, s32 offset:20
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v56, off, s32 offset:12
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v62, off, s32 offset:4
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v89, 8, v1
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v90, 8, v3
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v91, 8, v5
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v92, 8, v7
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v93, 8, v9
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v76, 8, v11
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v77, 8, v13
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v78, 8, v15
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v79, 8, v17
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v88, 8, v19
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v63, 8, v21
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v72, 8, v23
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v73, 8, v25
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v74, 8, v27
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v75, 8, v29
-; GFX11-FAKE16-NEXT:    s_mov_b32 s4, 0
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(62)
-; GFX11-FAKE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v57, 8, v2
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v58, 8, v4
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v59, 8, v6
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v60, 8, v8
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v61, 8, v10
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v40, 8, v12
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v41, 8, v14
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v42, 8, v16
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v43, 8, v18
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v44, 8, v20
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v167, 8, v22
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v176, 8, v24
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v177, 8, v26
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v178, 8, v28
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v179, 8, v30
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v149, 8, v31
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v150, 8, v87
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(61)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v151, 8, v96
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(60)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v160, 8, v97
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(59)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v161, 8, v98
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(58)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v132, 8, v99
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(57)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v133, 8, v113
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(56)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v134, 8, v114
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(55)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v135, 8, v115
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(54)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v144, 8, v116
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(53)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v119, 8, v117
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(52)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v128, 8, v128
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(51)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v129, 8, v129
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(50)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v130, 8, v130
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(49)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v131, 8, v131
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(48)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v113, 8, v94
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(47)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v114, 8, v95
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(46)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v115, 8, v104
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(45)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v116, 8, v105
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(44)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v117, 8, v106
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(43)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v87, 8, v107
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(42)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v96, 8, v108
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(41)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v97, 8, v109
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(40)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v98, 8, v110
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(39)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v99, 8, v111
-; GFX11-FAKE16-NEXT:    s_and_b32 s5, vcc_lo, exec_lo
-; GFX11-FAKE16-NEXT:    s_cbranch_scc0 .LBB75_4
-; GFX11-FAKE16-NEXT:  ; %bb.1: ; %cmp.false
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v54
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v53
-; GFX11-FAKE16-NEXT:    s_and_b32 s5, s28, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s6, s29, 8
-; GFX11-FAKE16-NEXT:    s_and_b32 s7, s2, 0xff
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v90
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v91
-; GFX11-FAKE16-NEXT:    s_or_b32 s5, s5, s6
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s6, s1, 8
-; GFX11-FAKE16-NEXT:    s_and_b32 s5, s5, 0xffff
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s8, s3, 8
-; GFX11-FAKE16-NEXT:    s_and_b32 s9, s18, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s10, s19, 8
-; GFX11-FAKE16-NEXT:    s_and_b32 s11, s26, 0xff
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v0, v1
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v50
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v49
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s12, s27, 8
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v76
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v77
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v7, v0, v1
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v48
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v39
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v78
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v79
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v8, v0, v1
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v38
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v37
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v88
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v63
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v9, v0, v1
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v36
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v35
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v72
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v73
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v10, v0, v1
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v34
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v33
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v74
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v75
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v11, v0, v1
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v32
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v62
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v57
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v58
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v12, v0, v1
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v56
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v47
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v59
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v60
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v13, v0, v1
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v46
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v45
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v61
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v40
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v14, v0, v1
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v183
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v182
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v41
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v42
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v15, v0, v1
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v181
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v180
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v43
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v44
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v16, v0, v1
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v166
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v165
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v167
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v176
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v17, v0, v1
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v164
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v163
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v177
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v178
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v18, v0, v1
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v162
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v148
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v179
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v149
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v19, v0, v1
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v147
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v146
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v150
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v151
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v20, v0, v1
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v145
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v118
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v160
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v161
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v21, v0, v1
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v112
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v103
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v132
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v133
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v22, v0, v1
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v102
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v101
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v134
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v135
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v23, v0, v1
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v100
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v86
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v144
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v119
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v24, v0, v1
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v85
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v84
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v128
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v129
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v25, v0, v1
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v83
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v82
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v130
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v131
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v26, v0, v1
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v81
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v80
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v113
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v114
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v27, v0, v1
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v71
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v70
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v115
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v116
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v28, v0, v1
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v69
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v68
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v117
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v87
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v29, v0, v1
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v67
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v66
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v96
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v97
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v30, v0, v1
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v65
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v64
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v98
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v99
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v31, v0, v1
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v55
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v89
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, s5, v0
-; GFX11-FAKE16-NEXT:    s_and_b32 s5, s0, 0xff
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-FAKE16-NEXT:    s_or_b32 s5, s5, s6
-; GFX11-FAKE16-NEXT:    s_or_b32 s6, s7, s8
-; GFX11-FAKE16-NEXT:    s_and_b32 s7, s16, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s8, s17, 8
-; GFX11-FAKE16-NEXT:    s_and_b32 s5, s5, 0xffff
-; GFX11-FAKE16-NEXT:    s_or_b32 s7, s7, s8
-; GFX11-FAKE16-NEXT:    s_or_b32 s8, s9, s10
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s6, s6, 16
-; GFX11-FAKE16-NEXT:    s_and_b32 s7, s7, 0xffff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s8, s8, 16
-; GFX11-FAKE16-NEXT:    s_or_b32 s5, s5, s6
-; GFX11-FAKE16-NEXT:    s_or_b32 s6, s7, s8
-; GFX11-FAKE16-NEXT:    s_and_b32 s7, s20, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s8, s21, 8
-; GFX11-FAKE16-NEXT:    s_and_b32 s9, s22, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s10, s23, 8
-; GFX11-FAKE16-NEXT:    s_or_b32 s7, s7, s8
-; GFX11-FAKE16-NEXT:    s_or_b32 s8, s9, s10
-; GFX11-FAKE16-NEXT:    s_and_b32 s9, s24, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s10, s25, 8
-; GFX11-FAKE16-NEXT:    s_and_b32 s7, s7, 0xffff
-; GFX11-FAKE16-NEXT:    s_or_b32 s9, s9, s10
-; GFX11-FAKE16-NEXT:    s_or_b32 s10, s11, s12
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s8, s8, 16
-; GFX11-FAKE16-NEXT:    s_and_b32 s9, s9, 0xffff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s10, s10, 16
-; GFX11-FAKE16-NEXT:    s_or_b32 s7, s7, s8
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v51
-; GFX11-FAKE16-NEXT:    s_or_b32 s8, s9, s10
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v52
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, s6
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v3, v93
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v2, v92
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v6, v2, v3
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v2, s7 :: v_dual_mov_b32 v3, s8
-; GFX11-FAKE16-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s4
-; GFX11-FAKE16-NEXT:    s_cbranch_vccnz .LBB75_3
-; GFX11-FAKE16-NEXT:  .LBB75_2: ; %cmp.true
-; GFX11-FAKE16-NEXT:    s_add_i32 s0, s0, 3
-; GFX11-FAKE16-NEXT:    s_add_i32 s2, s2, 3
-; GFX11-FAKE16-NEXT:    s_and_b32 s0, s0, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s1, s1, 8
-; GFX11-FAKE16-NEXT:    s_and_b32 s2, s2, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s3, s3, 8
-; GFX11-FAKE16-NEXT:    s_or_b32 s0, s1, s0
-; GFX11-FAKE16-NEXT:    s_or_b32 s1, s3, s2
-; GFX11-FAKE16-NEXT:    s_addk_i32 s0, 0x300
-; GFX11-FAKE16-NEXT:    s_addk_i32 s1, 0x300
-; GFX11-FAKE16-NEXT:    s_and_b32 s0, s0, 0xffff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s1, s1, 16
-; GFX11-FAKE16-NEXT:    s_add_i32 s16, s16, 3
-; GFX11-FAKE16-NEXT:    s_or_b32 s0, s0, s1
-; GFX11-FAKE16-NEXT:    s_and_b32 s1, s16, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s2, s17, 8
-; GFX11-FAKE16-NEXT:    s_add_i32 s18, s18, 3
-; GFX11-FAKE16-NEXT:    s_or_b32 s1, s2, s1
-; GFX11-FAKE16-NEXT:    s_and_b32 s2, s18, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s3, s19, 8
-; GFX11-FAKE16-NEXT:    s_addk_i32 s1, 0x300
-; GFX11-FAKE16-NEXT:    s_or_b32 s2, s3, s2
-; GFX11-FAKE16-NEXT:    s_add_i32 s20, s20, 3
-; GFX11-FAKE16-NEXT:    s_addk_i32 s2, 0x300
-; GFX11-FAKE16-NEXT:    s_and_b32 s1, s1, 0xffff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s2, s2, 16
-; GFX11-FAKE16-NEXT:    s_and_b32 s3, s20, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s4, s21, 8
-; GFX11-FAKE16-NEXT:    s_add_i32 s22, s22, 3
-; GFX11-FAKE16-NEXT:    s_or_b32 s1, s1, s2
-; GFX11-FAKE16-NEXT:    s_or_b32 s2, s4, s3
-; GFX11-FAKE16-NEXT:    s_and_b32 s3, s22, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s4, s23, 8
-; GFX11-FAKE16-NEXT:    s_add_i32 s24, s24, 3
-; GFX11-FAKE16-NEXT:    s_or_b32 s3, s4, s3
-; GFX11-FAKE16-NEXT:    s_and_b32 s4, s24, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s5, s25, 8
-; GFX11-FAKE16-NEXT:    s_addk_i32 s2, 0x300
-; GFX11-FAKE16-NEXT:    s_addk_i32 s3, 0x300
-; GFX11-FAKE16-NEXT:    s_or_b32 s4, s5, s4
-; GFX11-FAKE16-NEXT:    s_and_b32 s2, s2, 0xffff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s3, s3, 16
-; GFX11-FAKE16-NEXT:    s_addk_i32 s4, 0x300
-; GFX11-FAKE16-NEXT:    s_add_i32 s26, s26, 3
-; GFX11-FAKE16-NEXT:    s_or_b32 s2, s2, s3
-; GFX11-FAKE16-NEXT:    s_and_b32 s3, s4, 0xffff
-; GFX11-FAKE16-NEXT:    s_and_b32 s4, s26, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s5, s27, 8
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v0, 3, v55
-; GFX11-FAKE16-NEXT:    s_or_b32 s4, s5, s4
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v1, 3, v54
-; GFX11-FAKE16-NEXT:    s_addk_i32 s4, 0x300
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v3, 3, v52
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s4, s4, 16
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v4, 3, v51
-; GFX11-FAKE16-NEXT:    s_or_b32 s3, s3, s4
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v2, 3, v53
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v7, 3, v38
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v11, 3, v34
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v12, 3, v33
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v89, v0
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v90, v1
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v91, v2
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v92, v3
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, v93, v4
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xff, v7
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v11, 0xff, v11
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v12, 0xff, v12
-; GFX11-FAKE16-NEXT:    s_add_i32 s28, s28, 3
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v0, 0x300, v0
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v1, 0x300, v1
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v2, 0x300, v2
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v3, 0x300, v3
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v4, 0x300, v4
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v7, v88, v7
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v11, v74, v11
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v12, v75, v12
-; GFX11-FAKE16-NEXT:    s_and_b32 s5, s28, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s6, s29, 8
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX11-FAKE16-NEXT:    s_or_b32 s5, s6, s5
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xffff, v3
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v4
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v7, 0x300, v7
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v11, 0x300, v11
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v12, 0x300, v12
-; GFX11-FAKE16-NEXT:    s_addk_i32 s5, 0x300
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v1, v2
-; GFX11-FAKE16-NEXT:    s_and_b32 s5, s5, 0xffff
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v6, v3, v6
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, s5, v0
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v0, 3, v50
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v1, 3, v49
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v2, 3, v48
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v3, 3, v39
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v9, 0xffff, v7
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v7, 3, v37
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v8, 3, v36
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v10, 3, v35
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v11, 0xffff, v11
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xff, v7
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 0xff, v8
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v10, 0xff, v10
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v11, v11, v12
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(3)
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v12, 3, v46
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v16, 3, v181
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v17, 3, v180
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v76, v0
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v77, v1
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v78, v2
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v79, v3
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v7, v63, v7
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v8, v72, v8
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v10, v73, v10
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v12, 0xff, v12
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v16, 0xff, v16
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v17, 0xff, v17
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v0, 0x300, v0
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v1, 0x300, v1
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v2, 0x300, v2
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v3, 0x300, v3
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v7, 0x300, v7
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v8, 0x300, v8
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v10, 0x300, v10
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v12, v61, v12
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v16, v43, v16
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v17, v44, v17
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v7
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v14, 0xffff, v8
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v12, 0x300, v12
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v16, 0x300, v16
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v17, 0x300, v17
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v7, v0, v1
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v8, v2, v3
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v9, v9, v13
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v10, v14, v10
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v0, 3, v32
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v1, 3, v62
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v2, 3, v56
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v3, 3, v47
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v14, 0xffff, v12
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v12, 3, v45
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v13, 3, v183
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v15, 3, v182
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v16, 0xffff, v16
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v17, 16, v17
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v12, 0xff, v12
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v13, 0xff, v13
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v15, 0xff, v15
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v16, v16, v17
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v17, 3, v162
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v21, 3, v145
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v22, 3, v118
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v57, v0
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v58, v1
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v59, v2
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v60, v3
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v12, v40, v12
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v13, v41, v13
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v15, v42, v15
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v17, 0xff, v17
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v21, 0xff, v21
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v22, 0xff, v22
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v0, 0x300, v0
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v1, 0x300, v1
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v2, 0x300, v2
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v3, 0x300, v3
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v12, 0x300, v12
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v13, 0x300, v13
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v15, 0x300, v15
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v17, v179, v17
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v21, v160, v21
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v22, v161, v22
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v18, 16, v12
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v19, 0xffff, v13
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v17, 0x300, v17
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v21, 0x300, v21
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v22, 0x300, v22
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v12, v0, v1
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v13, v2, v3
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v14, v14, v18
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v15, v19, v15
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v0, 3, v166
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v1, 3, v165
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v2, 3, v164
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v3, 3, v163
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v19, 0xffff, v17
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v17, 3, v148
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v18, 3, v147
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v20, 3, v146
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v21, 0xffff, v21
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v22
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v17, 0xff, v17
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v18, 0xff, v18
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v20, 0xff, v20
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v21, v21, v22
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v22, 3, v100
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v26, 3, v83
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v27, 3, v82
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v167, v0
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v176, v1
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v177, v2
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v178, v3
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v17, v149, v17
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v18, v150, v18
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v20, v151, v20
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v22, 0xff, v22
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v26, 0xff, v26
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v27, 0xff, v27
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v0, 0x300, v0
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v1, 0x300, v1
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v2, 0x300, v2
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v3, 0x300, v3
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v17, 0x300, v17
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v18, 0x300, v18
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v20, 0x300, v20
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v22, v144, v22
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v26, v130, v26
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v27, v131, v27
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v17
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v24, 0xffff, v18
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v20, 16, v20
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v22, 0x300, v22
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v26, 0x300, v26
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v27, 0x300, v27
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v17, v0, v1
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v18, v2, v3
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v19, v19, v23
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v20, v24, v20
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v0, 3, v112
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v1, 3, v103
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v2, 3, v102
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v3, 3, v101
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v24, 0xffff, v22
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v22, 3, v86
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v23, 3, v85
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v25, 3, v84
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v26, 0xffff, v26
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v27, 16, v27
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v22, 0xff, v22
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v23, 0xff, v23
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v25, 0xff, v25
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v26, v26, v27
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v27, 3, v69
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v132, v0
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v133, v1
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v134, v2
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v135, v3
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v22, v119, v22
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v23, v128, v23
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v25, v129, v25
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v27, 0xff, v27
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v0, 0x300, v0
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v1, 0x300, v1
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v2, 0x300, v2
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v3, 0x300, v3
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v22, 0x300, v22
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v23, 0x300, v23
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v25, 0x300, v25
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v27, v117, v27
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v28, 16, v22
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v29, 0xffff, v23
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v25
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v27, 0x300, v27
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v22, v0, v1
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v23, v2, v3
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v24, v24, v28
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v25, v29, v25
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v0, 3, v81
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v1, 3, v80
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v2, 3, v71
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v3, 3, v70
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v29, 0xffff, v27
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v27, 3, v68
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v28, 3, v67
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v30, 3, v66
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v31, 3, v65
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v32, 3, v64
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v27, 0xff, v27
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v28, 0xff, v28
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v30, 0xff, v30
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v31, 0xff, v31
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v32, 0xff, v32
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v113, v0
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v114, v1
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v115, v2
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v116, v3
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v27, v87, v27
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v28, v96, v28
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v30, v97, v30
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v31, v98, v31
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v32, v99, v32
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v0, 0x300, v0
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v1, 0x300, v1
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v2, 0x300, v2
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v3, 0x300, v3
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v27, 0x300, v27
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v28, 0x300, v28
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v30, 0x300, v30
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v31, 0x300, v31
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v32, 0x300, v32
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v33, 16, v27
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v34, 0xffff, v28
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v30, 16, v30
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v31, 0xffff, v31
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v32, 16, v32
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v27, v0, v1
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v28, v2, v3
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v29, v29, v33
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v30, v34, v30
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v31, v31, v32
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX11-FAKE16-NEXT:  .LBB75_3: ; %end
-; GFX11-FAKE16-NEXT:    s_clause 0x1f
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v111, off, s32 offset:320
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v110, off, s32 offset:324
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v109, off, s32 offset:328
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v108, off, s32 offset:332
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v107, off, s32 offset:336
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v106, off, s32 offset:340
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v105, off, s32 offset:344
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v104, off, s32 offset:348
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v95, off, s32 offset:352
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v94, off, s32 offset:356
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v93, off, s32 offset:360
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v92, off, s32 offset:364
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v91, off, s32 offset:368
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v90, off, s32 offset:372
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v89, off, s32 offset:376
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v88, off, s32 offset:380
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v79, off, s32 offset:384
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v78, off, s32 offset:388
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v77, off, s32 offset:392
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v76, off, s32 offset:396
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v75, off, s32 offset:400
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v74, off, s32 offset:404
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v73, off, s32 offset:408
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v72, off, s32 offset:412
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v63, off, s32 offset:416
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v62, off, s32 offset:420
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v61, off, s32 offset:424
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v60, off, s32 offset:428
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v59, off, s32 offset:432
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v58, off, s32 offset:436
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v57, off, s32 offset:440
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v56, off, s32 offset:444
-; GFX11-FAKE16-NEXT:    s_clause 0x7
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v47, off, s32 offset:448
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v46, off, s32 offset:452
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v45, off, s32 offset:456
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v44, off, s32 offset:460
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v43, off, s32 offset:464
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v42, off, s32 offset:468
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v41, off, s32 offset:472
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v40, off, s32 offset:476
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
-; GFX11-FAKE16-NEXT:  .LBB75_4:
-; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
-; GFX11-FAKE16-NEXT:    s_branch .LBB75_2
+; GFX11-LABEL: bitcast_v128i8_to_v16f64_scalar:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_clause 0x1f
+; GFX11-NEXT:    scratch_store_b32 off, v40, s32 offset:476
+; GFX11-NEXT:    scratch_store_b32 off, v41, s32 offset:472
+; GFX11-NEXT:    scratch_store_b32 off, v42, s32 offset:468
+; GFX11-NEXT:    scratch_store_b32 off, v43, s32 offset:464
+; GFX11-NEXT:    scratch_store_b32 off, v44, s32 offset:460
+; GFX11-NEXT:    scratch_store_b32 off, v45, s32 offset:456
+; GFX11-NEXT:    scratch_store_b32 off, v46, s32 offset:452
+; GFX11-NEXT:    scratch_store_b32 off, v47, s32 offset:448
+; GFX11-NEXT:    scratch_store_b32 off, v56, s32 offset:444
+; GFX11-NEXT:    scratch_store_b32 off, v57, s32 offset:440
+; GFX11-NEXT:    scratch_store_b32 off, v58, s32 offset:436
+; GFX11-NEXT:    scratch_store_b32 off, v59, s32 offset:432
+; GFX11-NEXT:    scratch_store_b32 off, v60, s32 offset:428
+; GFX11-NEXT:    scratch_store_b32 off, v61, s32 offset:424
+; GFX11-NEXT:    scratch_store_b32 off, v62, s32 offset:420
+; GFX11-NEXT:    scratch_store_b32 off, v63, s32 offset:416
+; GFX11-NEXT:    scratch_store_b32 off, v72, s32 offset:412
+; GFX11-NEXT:    scratch_store_b32 off, v73, s32 offset:408
+; GFX11-NEXT:    scratch_store_b32 off, v74, s32 offset:404
+; GFX11-NEXT:    scratch_store_b32 off, v75, s32 offset:400
+; GFX11-NEXT:    scratch_store_b32 off, v76, s32 offset:396
+; GFX11-NEXT:    scratch_store_b32 off, v77, s32 offset:392
+; GFX11-NEXT:    scratch_store_b32 off, v78, s32 offset:388
+; GFX11-NEXT:    scratch_store_b32 off, v79, s32 offset:384
+; GFX11-NEXT:    scratch_store_b32 off, v88, s32 offset:380
+; GFX11-NEXT:    scratch_store_b32 off, v89, s32 offset:376
+; GFX11-NEXT:    scratch_store_b32 off, v90, s32 offset:372
+; GFX11-NEXT:    scratch_store_b32 off, v91, s32 offset:368
+; GFX11-NEXT:    scratch_store_b32 off, v92, s32 offset:364
+; GFX11-NEXT:    scratch_store_b32 off, v93, s32 offset:360
+; GFX11-NEXT:    scratch_store_b32 off, v94, s32 offset:356
+; GFX11-NEXT:    scratch_store_b32 off, v95, s32 offset:352
+; GFX11-NEXT:    s_clause 0x7
+; GFX11-NEXT:    scratch_store_b32 off, v104, s32 offset:348
+; GFX11-NEXT:    scratch_store_b32 off, v105, s32 offset:344
+; GFX11-NEXT:    scratch_store_b32 off, v106, s32 offset:340
+; GFX11-NEXT:    scratch_store_b32 off, v107, s32 offset:336
+; GFX11-NEXT:    scratch_store_b32 off, v108, s32 offset:332
+; GFX11-NEXT:    scratch_store_b32 off, v109, s32 offset:328
+; GFX11-NEXT:    scratch_store_b32 off, v110, s32 offset:324
+; GFX11-NEXT:    scratch_store_b32 off, v111, s32 offset:320
+; GFX11-NEXT:    v_dual_mov_b32 v32, v30 :: v_dual_mov_b32 v33, v28
+; GFX11-NEXT:    v_dual_mov_b32 v34, v26 :: v_dual_mov_b32 v35, v24
+; GFX11-NEXT:    v_dual_mov_b32 v36, v22 :: v_dual_mov_b32 v37, v20
+; GFX11-NEXT:    v_dual_mov_b32 v38, v18 :: v_dual_mov_b32 v39, v16
+; GFX11-NEXT:    v_dual_mov_b32 v48, v14 :: v_dual_mov_b32 v49, v12
+; GFX11-NEXT:    v_dual_mov_b32 v50, v10 :: v_dual_mov_b32 v51, v8
+; GFX11-NEXT:    v_dual_mov_b32 v52, v6 :: v_dual_mov_b32 v53, v4
+; GFX11-NEXT:    v_dual_mov_b32 v54, v2 :: v_dual_mov_b32 v55, v0
+; GFX11-NEXT:    s_clause 0x1f
+; GFX11-NEXT:    scratch_load_b32 v0, off, s32 offset:316
+; GFX11-NEXT:    scratch_load_u16 v2, off, s32
+; GFX11-NEXT:    scratch_load_u16 v4, off, s32 offset:8
+; GFX11-NEXT:    scratch_load_u16 v6, off, s32 offset:16
+; GFX11-NEXT:    scratch_load_u16 v8, off, s32 offset:24
+; GFX11-NEXT:    scratch_load_u16 v10, off, s32 offset:32
+; GFX11-NEXT:    scratch_load_u16 v12, off, s32 offset:40
+; GFX11-NEXT:    scratch_load_u16 v14, off, s32 offset:48
+; GFX11-NEXT:    scratch_load_u16 v16, off, s32 offset:56
+; GFX11-NEXT:    scratch_load_u16 v18, off, s32 offset:64
+; GFX11-NEXT:    scratch_load_u16 v20, off, s32 offset:72
+; GFX11-NEXT:    scratch_load_u16 v22, off, s32 offset:80
+; GFX11-NEXT:    scratch_load_u16 v24, off, s32 offset:88
+; GFX11-NEXT:    scratch_load_u16 v26, off, s32 offset:96
+; GFX11-NEXT:    scratch_load_u16 v28, off, s32 offset:104
+; GFX11-NEXT:    scratch_load_u16 v30, off, s32 offset:112
+; GFX11-NEXT:    scratch_load_u16 v31, off, s32 offset:120
+; GFX11-NEXT:    scratch_load_u16 v87, off, s32 offset:128
+; GFX11-NEXT:    scratch_load_u16 v96, off, s32 offset:136
+; GFX11-NEXT:    scratch_load_u16 v97, off, s32 offset:144
+; GFX11-NEXT:    scratch_load_u16 v98, off, s32 offset:152
+; GFX11-NEXT:    scratch_load_u16 v99, off, s32 offset:160
+; GFX11-NEXT:    scratch_load_u16 v113, off, s32 offset:168
+; GFX11-NEXT:    scratch_load_u16 v114, off, s32 offset:176
+; GFX11-NEXT:    scratch_load_u16 v115, off, s32 offset:184
+; GFX11-NEXT:    scratch_load_u16 v116, off, s32 offset:192
+; GFX11-NEXT:    scratch_load_u16 v117, off, s32 offset:200
+; GFX11-NEXT:    scratch_load_u16 v128, off, s32 offset:208
+; GFX11-NEXT:    scratch_load_u16 v129, off, s32 offset:216
+; GFX11-NEXT:    scratch_load_u16 v130, off, s32 offset:224
+; GFX11-NEXT:    scratch_load_u16 v131, off, s32 offset:232
+; GFX11-NEXT:    scratch_load_u16 v94, off, s32 offset:240
+; GFX11-NEXT:    s_clause 0x1f
+; GFX11-NEXT:    scratch_load_u16 v95, off, s32 offset:248
+; GFX11-NEXT:    scratch_load_u16 v104, off, s32 offset:256
+; GFX11-NEXT:    scratch_load_u16 v105, off, s32 offset:264
+; GFX11-NEXT:    scratch_load_u16 v106, off, s32 offset:272
+; GFX11-NEXT:    scratch_load_u16 v107, off, s32 offset:280
+; GFX11-NEXT:    scratch_load_u16 v108, off, s32 offset:288
+; GFX11-NEXT:    scratch_load_u16 v109, off, s32 offset:296
+; GFX11-NEXT:    scratch_load_u16 v110, off, s32 offset:304
+; GFX11-NEXT:    scratch_load_u16 v111, off, s32 offset:312
+; GFX11-NEXT:    scratch_load_u16 v64, off, s32 offset:308
+; GFX11-NEXT:    scratch_load_u16 v65, off, s32 offset:300
+; GFX11-NEXT:    scratch_load_u16 v66, off, s32 offset:292
+; GFX11-NEXT:    scratch_load_u16 v67, off, s32 offset:284
+; GFX11-NEXT:    scratch_load_u16 v68, off, s32 offset:276
+; GFX11-NEXT:    scratch_load_u16 v69, off, s32 offset:268
+; GFX11-NEXT:    scratch_load_u16 v70, off, s32 offset:260
+; GFX11-NEXT:    scratch_load_u16 v71, off, s32 offset:252
+; GFX11-NEXT:    scratch_load_u16 v80, off, s32 offset:244
+; GFX11-NEXT:    scratch_load_u16 v81, off, s32 offset:236
+; GFX11-NEXT:    scratch_load_u16 v82, off, s32 offset:228
+; GFX11-NEXT:    scratch_load_u16 v83, off, s32 offset:220
+; GFX11-NEXT:    scratch_load_u16 v84, off, s32 offset:212
+; GFX11-NEXT:    scratch_load_u16 v85, off, s32 offset:204
+; GFX11-NEXT:    scratch_load_u16 v86, off, s32 offset:196
+; GFX11-NEXT:    scratch_load_u16 v100, off, s32 offset:188
+; GFX11-NEXT:    scratch_load_u16 v101, off, s32 offset:180
+; GFX11-NEXT:    scratch_load_u16 v102, off, s32 offset:172
+; GFX11-NEXT:    scratch_load_u16 v103, off, s32 offset:164
+; GFX11-NEXT:    scratch_load_u16 v112, off, s32 offset:156
+; GFX11-NEXT:    scratch_load_u16 v118, off, s32 offset:148
+; GFX11-NEXT:    scratch_load_u16 v145, off, s32 offset:140
+; GFX11-NEXT:    scratch_load_u16 v146, off, s32 offset:132
+; GFX11-NEXT:    s_clause 0xf
+; GFX11-NEXT:    scratch_load_u16 v147, off, s32 offset:124
+; GFX11-NEXT:    scratch_load_u16 v148, off, s32 offset:116
+; GFX11-NEXT:    scratch_load_u16 v162, off, s32 offset:108
+; GFX11-NEXT:    scratch_load_u16 v163, off, s32 offset:100
+; GFX11-NEXT:    scratch_load_u16 v164, off, s32 offset:92
+; GFX11-NEXT:    scratch_load_u16 v165, off, s32 offset:84
+; GFX11-NEXT:    scratch_load_u16 v166, off, s32 offset:76
+; GFX11-NEXT:    scratch_load_u16 v180, off, s32 offset:68
+; GFX11-NEXT:    scratch_load_u16 v181, off, s32 offset:60
+; GFX11-NEXT:    scratch_load_u16 v182, off, s32 offset:52
+; GFX11-NEXT:    scratch_load_u16 v183, off, s32 offset:44
+; GFX11-NEXT:    scratch_load_u16 v45, off, s32 offset:36
+; GFX11-NEXT:    scratch_load_u16 v46, off, s32 offset:28
+; GFX11-NEXT:    scratch_load_u16 v47, off, s32 offset:20
+; GFX11-NEXT:    scratch_load_u16 v56, off, s32 offset:12
+; GFX11-NEXT:    scratch_load_u16 v62, off, s32 offset:4
+; GFX11-NEXT:    v_lshlrev_b32_e32 v89, 8, v1
+; GFX11-NEXT:    v_lshlrev_b32_e32 v90, 8, v3
+; GFX11-NEXT:    v_lshlrev_b32_e32 v91, 8, v5
+; GFX11-NEXT:    v_lshlrev_b32_e32 v92, 8, v7
+; GFX11-NEXT:    v_lshlrev_b32_e32 v93, 8, v9
+; GFX11-NEXT:    v_lshlrev_b32_e32 v76, 8, v11
+; GFX11-NEXT:    v_lshlrev_b32_e32 v77, 8, v13
+; GFX11-NEXT:    v_lshlrev_b32_e32 v78, 8, v15
+; GFX11-NEXT:    v_lshlrev_b32_e32 v79, 8, v17
+; GFX11-NEXT:    v_lshlrev_b32_e32 v88, 8, v19
+; GFX11-NEXT:    v_lshlrev_b32_e32 v63, 8, v21
+; GFX11-NEXT:    v_lshlrev_b32_e32 v72, 8, v23
+; GFX11-NEXT:    v_lshlrev_b32_e32 v73, 8, v25
+; GFX11-NEXT:    v_lshlrev_b32_e32 v74, 8, v27
+; GFX11-NEXT:    v_lshlrev_b32_e32 v75, 8, v29
+; GFX11-NEXT:    s_mov_b32 s4, 0
+; GFX11-NEXT:    s_waitcnt vmcnt(62)
+; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v57, 8, v2
+; GFX11-NEXT:    v_lshlrev_b32_e32 v58, 8, v4
+; GFX11-NEXT:    v_lshlrev_b32_e32 v59, 8, v6
+; GFX11-NEXT:    v_lshlrev_b32_e32 v60, 8, v8
+; GFX11-NEXT:    v_lshlrev_b32_e32 v61, 8, v10
+; GFX11-NEXT:    v_lshlrev_b32_e32 v40, 8, v12
+; GFX11-NEXT:    v_lshlrev_b32_e32 v41, 8, v14
+; GFX11-NEXT:    v_lshlrev_b32_e32 v42, 8, v16
+; GFX11-NEXT:    v_lshlrev_b32_e32 v43, 8, v18
+; GFX11-NEXT:    v_lshlrev_b32_e32 v44, 8, v20
+; GFX11-NEXT:    v_lshlrev_b32_e32 v167, 8, v22
+; GFX11-NEXT:    v_lshlrev_b32_e32 v176, 8, v24
+; GFX11-NEXT:    v_lshlrev_b32_e32 v177, 8, v26
+; GFX11-NEXT:    v_lshlrev_b32_e32 v178, 8, v28
+; GFX11-NEXT:    v_lshlrev_b32_e32 v179, 8, v30
+; GFX11-NEXT:    v_lshlrev_b32_e32 v149, 8, v31
+; GFX11-NEXT:    v_lshlrev_b32_e32 v150, 8, v87
+; GFX11-NEXT:    s_waitcnt vmcnt(61)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v151, 8, v96
+; GFX11-NEXT:    s_waitcnt vmcnt(60)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v160, 8, v97
+; GFX11-NEXT:    s_waitcnt vmcnt(59)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v161, 8, v98
+; GFX11-NEXT:    s_waitcnt vmcnt(58)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v132, 8, v99
+; GFX11-NEXT:    s_waitcnt vmcnt(57)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v133, 8, v113
+; GFX11-NEXT:    s_waitcnt vmcnt(56)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v134, 8, v114
+; GFX11-NEXT:    s_waitcnt vmcnt(55)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v135, 8, v115
+; GFX11-NEXT:    s_waitcnt vmcnt(54)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v144, 8, v116
+; GFX11-NEXT:    s_waitcnt vmcnt(53)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v119, 8, v117
+; GFX11-NEXT:    s_waitcnt vmcnt(52)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v128, 8, v128
+; GFX11-NEXT:    s_waitcnt vmcnt(51)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v129, 8, v129
+; GFX11-NEXT:    s_waitcnt vmcnt(50)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v130, 8, v130
+; GFX11-NEXT:    s_waitcnt vmcnt(49)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v131, 8, v131
+; GFX11-NEXT:    s_waitcnt vmcnt(48)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v113, 8, v94
+; GFX11-NEXT:    s_waitcnt vmcnt(47)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v114, 8, v95
+; GFX11-NEXT:    s_waitcnt vmcnt(46)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v115, 8, v104
+; GFX11-NEXT:    s_waitcnt vmcnt(45)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v116, 8, v105
+; GFX11-NEXT:    s_waitcnt vmcnt(44)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v117, 8, v106
+; GFX11-NEXT:    s_waitcnt vmcnt(43)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v87, 8, v107
+; GFX11-NEXT:    s_waitcnt vmcnt(42)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v96, 8, v108
+; GFX11-NEXT:    s_waitcnt vmcnt(41)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v97, 8, v109
+; GFX11-NEXT:    s_waitcnt vmcnt(40)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v98, 8, v110
+; GFX11-NEXT:    s_waitcnt vmcnt(39)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v99, 8, v111
+; GFX11-NEXT:    s_and_b32 s5, vcc_lo, exec_lo
+; GFX11-NEXT:    s_cbranch_scc0 .LBB75_4
+; GFX11-NEXT:  ; %bb.1: ; %cmp.false
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v54
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v53
+; GFX11-NEXT:    s_and_b32 s5, s28, 0xff
+; GFX11-NEXT:    s_lshl_b32 s6, s29, 8
+; GFX11-NEXT:    s_and_b32 s7, s2, 0xff
+; GFX11-NEXT:    v_or_b32_e32 v0, v0, v90
+; GFX11-NEXT:    v_or_b32_e32 v1, v1, v91
+; GFX11-NEXT:    s_or_b32 s5, s5, s6
+; GFX11-NEXT:    s_lshl_b32 s6, s1, 8
+; GFX11-NEXT:    s_and_b32 s5, s5, 0xffff
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    s_lshl_b32 s8, s3, 8
+; GFX11-NEXT:    s_and_b32 s9, s18, 0xff
+; GFX11-NEXT:    s_lshl_b32 s10, s19, 8
+; GFX11-NEXT:    s_and_b32 s11, s26, 0xff
+; GFX11-NEXT:    v_or_b32_e32 v5, v0, v1
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v50
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v49
+; GFX11-NEXT:    s_lshl_b32 s12, s27, 8
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_or_b32_e32 v0, v0, v76
+; GFX11-NEXT:    v_or_b32_e32 v1, v1, v77
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_or_b32_e32 v7, v0, v1
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v48
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v39
+; GFX11-NEXT:    v_or_b32_e32 v0, v0, v78
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_or_b32_e32 v1, v1, v79
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    v_or_b32_e32 v8, v0, v1
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v38
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v37
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_or_b32_e32 v0, v0, v88
+; GFX11-NEXT:    v_or_b32_e32 v1, v1, v63
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_or_b32_e32 v9, v0, v1
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v36
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v35
+; GFX11-NEXT:    v_or_b32_e32 v0, v0, v72
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_or_b32_e32 v1, v1, v73
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    v_or_b32_e32 v10, v0, v1
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v34
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v33
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_or_b32_e32 v0, v0, v74
+; GFX11-NEXT:    v_or_b32_e32 v1, v1, v75
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_or_b32_e32 v11, v0, v1
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v32
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v62
+; GFX11-NEXT:    v_or_b32_e32 v0, v0, v57
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_or_b32_e32 v1, v1, v58
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    v_or_b32_e32 v12, v0, v1
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v56
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v47
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_or_b32_e32 v0, v0, v59
+; GFX11-NEXT:    v_or_b32_e32 v1, v1, v60
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_or_b32_e32 v13, v0, v1
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v46
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v45
+; GFX11-NEXT:    v_or_b32_e32 v0, v0, v61
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_or_b32_e32 v1, v1, v40
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    v_or_b32_e32 v14, v0, v1
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v183
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v182
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_or_b32_e32 v0, v0, v41
+; GFX11-NEXT:    v_or_b32_e32 v1, v1, v42
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_or_b32_e32 v15, v0, v1
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v181
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v180
+; GFX11-NEXT:    v_or_b32_e32 v0, v0, v43
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_or_b32_e32 v1, v1, v44
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    v_or_b32_e32 v16, v0, v1
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v166
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v165
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_or_b32_e32 v0, v0, v167
+; GFX11-NEXT:    v_or_b32_e32 v1, v1, v176
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_or_b32_e32 v17, v0, v1
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v164
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v163
+; GFX11-NEXT:    v_or_b32_e32 v0, v0, v177
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_or_b32_e32 v1, v1, v178
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    v_or_b32_e32 v18, v0, v1
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v162
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v148
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_or_b32_e32 v0, v0, v179
+; GFX11-NEXT:    v_or_b32_e32 v1, v1, v149
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_or_b32_e32 v19, v0, v1
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v147
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v146
+; GFX11-NEXT:    v_or_b32_e32 v0, v0, v150
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_or_b32_e32 v1, v1, v151
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    v_or_b32_e32 v20, v0, v1
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v145
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v118
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_or_b32_e32 v0, v0, v160
+; GFX11-NEXT:    v_or_b32_e32 v1, v1, v161
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_or_b32_e32 v21, v0, v1
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v112
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v103
+; GFX11-NEXT:    v_or_b32_e32 v0, v0, v132
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_or_b32_e32 v1, v1, v133
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    v_or_b32_e32 v22, v0, v1
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v102
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v101
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_or_b32_e32 v0, v0, v134
+; GFX11-NEXT:    v_or_b32_e32 v1, v1, v135
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_or_b32_e32 v23, v0, v1
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v100
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v86
+; GFX11-NEXT:    v_or_b32_e32 v0, v0, v144
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_or_b32_e32 v1, v1, v119
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    v_or_b32_e32 v24, v0, v1
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v85
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v84
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_or_b32_e32 v0, v0, v128
+; GFX11-NEXT:    v_or_b32_e32 v1, v1, v129
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_or_b32_e32 v25, v0, v1
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v83
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v82
+; GFX11-NEXT:    v_or_b32_e32 v0, v0, v130
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_or_b32_e32 v1, v1, v131
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    v_or_b32_e32 v26, v0, v1
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v81
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v80
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_or_b32_e32 v0, v0, v113
+; GFX11-NEXT:    v_or_b32_e32 v1, v1, v114
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_or_b32_e32 v27, v0, v1
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v71
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v70
+; GFX11-NEXT:    v_or_b32_e32 v0, v0, v115
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_or_b32_e32 v1, v1, v116
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    v_or_b32_e32 v28, v0, v1
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v69
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v68
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_or_b32_e32 v0, v0, v117
+; GFX11-NEXT:    v_or_b32_e32 v1, v1, v87
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_or_b32_e32 v29, v0, v1
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v67
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v66
+; GFX11-NEXT:    v_or_b32_e32 v0, v0, v96
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_or_b32_e32 v1, v1, v97
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    v_or_b32_e32 v30, v0, v1
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v65
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v64
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_or_b32_e32 v0, v0, v98
+; GFX11-NEXT:    v_or_b32_e32 v1, v1, v99
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_or_b32_e32 v31, v0, v1
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v55
+; GFX11-NEXT:    v_or_b32_e32 v0, v0, v89
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-NEXT:    v_or_b32_e32 v4, s5, v0
+; GFX11-NEXT:    s_and_b32 s5, s0, 0xff
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_or_b32 s5, s5, s6
+; GFX11-NEXT:    s_or_b32 s6, s7, s8
+; GFX11-NEXT:    s_and_b32 s7, s16, 0xff
+; GFX11-NEXT:    s_lshl_b32 s8, s17, 8
+; GFX11-NEXT:    s_and_b32 s5, s5, 0xffff
+; GFX11-NEXT:    s_or_b32 s7, s7, s8
+; GFX11-NEXT:    s_or_b32 s8, s9, s10
+; GFX11-NEXT:    s_lshl_b32 s6, s6, 16
+; GFX11-NEXT:    s_and_b32 s7, s7, 0xffff
+; GFX11-NEXT:    s_lshl_b32 s8, s8, 16
+; GFX11-NEXT:    s_or_b32 s5, s5, s6
+; GFX11-NEXT:    s_or_b32 s6, s7, s8
+; GFX11-NEXT:    s_and_b32 s7, s20, 0xff
+; GFX11-NEXT:    s_lshl_b32 s8, s21, 8
+; GFX11-NEXT:    s_and_b32 s9, s22, 0xff
+; GFX11-NEXT:    s_lshl_b32 s10, s23, 8
+; GFX11-NEXT:    s_or_b32 s7, s7, s8
+; GFX11-NEXT:    s_or_b32 s8, s9, s10
+; GFX11-NEXT:    s_and_b32 s9, s24, 0xff
+; GFX11-NEXT:    s_lshl_b32 s10, s25, 8
+; GFX11-NEXT:    s_and_b32 s7, s7, 0xffff
+; GFX11-NEXT:    s_or_b32 s9, s9, s10
+; GFX11-NEXT:    s_or_b32 s10, s11, s12
+; GFX11-NEXT:    s_lshl_b32 s8, s8, 16
+; GFX11-NEXT:    s_and_b32 s9, s9, 0xffff
+; GFX11-NEXT:    s_lshl_b32 s10, s10, 16
+; GFX11-NEXT:    s_or_b32 s7, s7, s8
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v51
+; GFX11-NEXT:    s_or_b32 s8, s9, s10
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v52
+; GFX11-NEXT:    v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, s6
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_or_b32_e32 v3, v3, v93
+; GFX11-NEXT:    v_or_b32_e32 v2, v2, v92
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_or_b32_e32 v6, v2, v3
+; GFX11-NEXT:    v_dual_mov_b32 v2, s7 :: v_dual_mov_b32 v3, s8
+; GFX11-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s4
+; GFX11-NEXT:    s_cbranch_vccnz .LBB75_3
+; GFX11-NEXT:  .LBB75_2: ; %cmp.true
+; GFX11-NEXT:    s_add_i32 s0, s0, 3
+; GFX11-NEXT:    s_add_i32 s2, s2, 3
+; GFX11-NEXT:    s_and_b32 s0, s0, 0xff
+; GFX11-NEXT:    s_lshl_b32 s1, s1, 8
+; GFX11-NEXT:    s_and_b32 s2, s2, 0xff
+; GFX11-NEXT:    s_lshl_b32 s3, s3, 8
+; GFX11-NEXT:    s_or_b32 s0, s1, s0
+; GFX11-NEXT:    s_or_b32 s1, s3, s2
+; GFX11-NEXT:    s_addk_i32 s0, 0x300
+; GFX11-NEXT:    s_addk_i32 s1, 0x300
+; GFX11-NEXT:    s_and_b32 s0, s0, 0xffff
+; GFX11-NEXT:    s_lshl_b32 s1, s1, 16
+; GFX11-NEXT:    s_add_i32 s16, s16, 3
+; GFX11-NEXT:    s_or_b32 s0, s0, s1
+; GFX11-NEXT:    s_and_b32 s1, s16, 0xff
+; GFX11-NEXT:    s_lshl_b32 s2, s17, 8
+; GFX11-NEXT:    s_add_i32 s18, s18, 3
+; GFX11-NEXT:    s_or_b32 s1, s2, s1
+; GFX11-NEXT:    s_and_b32 s2, s18, 0xff
+; GFX11-NEXT:    s_lshl_b32 s3, s19, 8
+; GFX11-NEXT:    s_addk_i32 s1, 0x300
+; GFX11-NEXT:    s_or_b32 s2, s3, s2
+; GFX11-NEXT:    s_add_i32 s20, s20, 3
+; GFX11-NEXT:    s_addk_i32 s2, 0x300
+; GFX11-NEXT:    s_and_b32 s1, s1, 0xffff
+; GFX11-NEXT:    s_lshl_b32 s2, s2, 16
+; GFX11-NEXT:    s_and_b32 s3, s20, 0xff
+; GFX11-NEXT:    s_lshl_b32 s4, s21, 8
+; GFX11-NEXT:    s_add_i32 s22, s22, 3
+; GFX11-NEXT:    s_or_b32 s1, s1, s2
+; GFX11-NEXT:    s_or_b32 s2, s4, s3
+; GFX11-NEXT:    s_and_b32 s3, s22, 0xff
+; GFX11-NEXT:    s_lshl_b32 s4, s23, 8
+; GFX11-NEXT:    s_add_i32 s24, s24, 3
+; GFX11-NEXT:    s_or_b32 s3, s4, s3
+; GFX11-NEXT:    s_and_b32 s4, s24, 0xff
+; GFX11-NEXT:    s_lshl_b32 s5, s25, 8
+; GFX11-NEXT:    s_addk_i32 s2, 0x300
+; GFX11-NEXT:    s_addk_i32 s3, 0x300
+; GFX11-NEXT:    s_or_b32 s4, s5, s4
+; GFX11-NEXT:    s_and_b32 s2, s2, 0xffff
+; GFX11-NEXT:    s_lshl_b32 s3, s3, 16
+; GFX11-NEXT:    s_addk_i32 s4, 0x300
+; GFX11-NEXT:    s_add_i32 s26, s26, 3
+; GFX11-NEXT:    s_or_b32 s2, s2, s3
+; GFX11-NEXT:    s_and_b32 s3, s4, 0xffff
+; GFX11-NEXT:    s_and_b32 s4, s26, 0xff
+; GFX11-NEXT:    s_lshl_b32 s5, s27, 8
+; GFX11-NEXT:    v_add_nc_u32_e32 v0, 3, v55
+; GFX11-NEXT:    s_or_b32 s4, s5, s4
+; GFX11-NEXT:    v_add_nc_u32_e32 v1, 3, v54
+; GFX11-NEXT:    s_addk_i32 s4, 0x300
+; GFX11-NEXT:    v_add_nc_u32_e32 v3, 3, v52
+; GFX11-NEXT:    s_lshl_b32 s4, s4, 16
+; GFX11-NEXT:    v_add_nc_u32_e32 v4, 3, v51
+; GFX11-NEXT:    s_or_b32 s3, s3, s4
+; GFX11-NEXT:    v_add_nc_u32_e32 v2, 3, v53
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v3
+; GFX11-NEXT:    v_and_b32_e32 v4, 0xff, v4
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GFX11-NEXT:    v_add_nc_u32_e32 v7, 3, v38
+; GFX11-NEXT:    v_add_nc_u32_e32 v11, 3, v34
+; GFX11-NEXT:    v_add_nc_u32_e32 v12, 3, v33
+; GFX11-NEXT:    v_or_b32_e32 v0, v89, v0
+; GFX11-NEXT:    v_or_b32_e32 v1, v90, v1
+; GFX11-NEXT:    v_or_b32_e32 v2, v91, v2
+; GFX11-NEXT:    v_or_b32_e32 v3, v92, v3
+; GFX11-NEXT:    v_or_b32_e32 v4, v93, v4
+; GFX11-NEXT:    v_and_b32_e32 v7, 0xff, v7
+; GFX11-NEXT:    v_and_b32_e32 v11, 0xff, v11
+; GFX11-NEXT:    v_and_b32_e32 v12, 0xff, v12
+; GFX11-NEXT:    s_add_i32 s28, s28, 3
+; GFX11-NEXT:    v_add_nc_u32_e32 v0, 0x300, v0
+; GFX11-NEXT:    v_add_nc_u32_e32 v1, 0x300, v1
+; GFX11-NEXT:    v_add_nc_u32_e32 v2, 0x300, v2
+; GFX11-NEXT:    v_add_nc_u32_e32 v3, 0x300, v3
+; GFX11-NEXT:    v_add_nc_u32_e32 v4, 0x300, v4
+; GFX11-NEXT:    v_or_b32_e32 v7, v88, v7
+; GFX11-NEXT:    v_or_b32_e32 v11, v74, v11
+; GFX11-NEXT:    v_or_b32_e32 v12, v75, v12
+; GFX11-NEXT:    s_and_b32 s5, s28, 0xff
+; GFX11-NEXT:    s_lshl_b32 s6, s29, 8
+; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-NEXT:    s_or_b32 s5, s6, s5
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff, v3
+; GFX11-NEXT:    v_lshlrev_b32_e32 v6, 16, v4
+; GFX11-NEXT:    v_add_nc_u32_e32 v7, 0x300, v7
+; GFX11-NEXT:    v_add_nc_u32_e32 v11, 0x300, v11
+; GFX11-NEXT:    v_add_nc_u32_e32 v12, 0x300, v12
+; GFX11-NEXT:    s_addk_i32 s5, 0x300
+; GFX11-NEXT:    v_or_b32_e32 v5, v1, v2
+; GFX11-NEXT:    s_and_b32 s5, s5, 0xffff
+; GFX11-NEXT:    v_or_b32_e32 v6, v3, v6
+; GFX11-NEXT:    v_or_b32_e32 v4, s5, v0
+; GFX11-NEXT:    v_add_nc_u32_e32 v0, 3, v50
+; GFX11-NEXT:    v_add_nc_u32_e32 v1, 3, v49
+; GFX11-NEXT:    v_add_nc_u32_e32 v2, 3, v48
+; GFX11-NEXT:    v_add_nc_u32_e32 v3, 3, v39
+; GFX11-NEXT:    v_and_b32_e32 v9, 0xffff, v7
+; GFX11-NEXT:    v_add_nc_u32_e32 v7, 3, v37
+; GFX11-NEXT:    v_add_nc_u32_e32 v8, 3, v36
+; GFX11-NEXT:    v_add_nc_u32_e32 v10, 3, v35
+; GFX11-NEXT:    v_and_b32_e32 v11, 0xffff, v11
+; GFX11-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v3
+; GFX11-NEXT:    v_and_b32_e32 v7, 0xff, v7
+; GFX11-NEXT:    v_and_b32_e32 v8, 0xff, v8
+; GFX11-NEXT:    v_and_b32_e32 v10, 0xff, v10
+; GFX11-NEXT:    v_or_b32_e32 v11, v11, v12
+; GFX11-NEXT:    s_waitcnt vmcnt(3)
+; GFX11-NEXT:    v_add_nc_u32_e32 v12, 3, v46
+; GFX11-NEXT:    v_add_nc_u32_e32 v16, 3, v181
+; GFX11-NEXT:    v_add_nc_u32_e32 v17, 3, v180
+; GFX11-NEXT:    v_or_b32_e32 v0, v76, v0
+; GFX11-NEXT:    v_or_b32_e32 v1, v77, v1
+; GFX11-NEXT:    v_or_b32_e32 v2, v78, v2
+; GFX11-NEXT:    v_or_b32_e32 v3, v79, v3
+; GFX11-NEXT:    v_or_b32_e32 v7, v63, v7
+; GFX11-NEXT:    v_or_b32_e32 v8, v72, v8
+; GFX11-NEXT:    v_or_b32_e32 v10, v73, v10
+; GFX11-NEXT:    v_and_b32_e32 v12, 0xff, v12
+; GFX11-NEXT:    v_and_b32_e32 v16, 0xff, v16
+; GFX11-NEXT:    v_and_b32_e32 v17, 0xff, v17
+; GFX11-NEXT:    v_add_nc_u32_e32 v0, 0x300, v0
+; GFX11-NEXT:    v_add_nc_u32_e32 v1, 0x300, v1
+; GFX11-NEXT:    v_add_nc_u32_e32 v2, 0x300, v2
+; GFX11-NEXT:    v_add_nc_u32_e32 v3, 0x300, v3
+; GFX11-NEXT:    v_add_nc_u32_e32 v7, 0x300, v7
+; GFX11-NEXT:    v_add_nc_u32_e32 v8, 0x300, v8
+; GFX11-NEXT:    v_add_nc_u32_e32 v10, 0x300, v10
+; GFX11-NEXT:    v_or_b32_e32 v12, v61, v12
+; GFX11-NEXT:    v_or_b32_e32 v16, v43, v16
+; GFX11-NEXT:    v_or_b32_e32 v17, v44, v17
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-NEXT:    v_lshlrev_b32_e32 v13, 16, v7
+; GFX11-NEXT:    v_and_b32_e32 v14, 0xffff, v8
+; GFX11-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
+; GFX11-NEXT:    v_add_nc_u32_e32 v12, 0x300, v12
+; GFX11-NEXT:    v_add_nc_u32_e32 v16, 0x300, v16
+; GFX11-NEXT:    v_add_nc_u32_e32 v17, 0x300, v17
+; GFX11-NEXT:    v_or_b32_e32 v7, v0, v1
+; GFX11-NEXT:    v_or_b32_e32 v8, v2, v3
+; GFX11-NEXT:    v_or_b32_e32 v9, v9, v13
+; GFX11-NEXT:    v_or_b32_e32 v10, v14, v10
+; GFX11-NEXT:    v_add_nc_u32_e32 v0, 3, v32
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_add_nc_u32_e32 v1, 3, v62
+; GFX11-NEXT:    v_add_nc_u32_e32 v2, 3, v56
+; GFX11-NEXT:    v_add_nc_u32_e32 v3, 3, v47
+; GFX11-NEXT:    v_and_b32_e32 v14, 0xffff, v12
+; GFX11-NEXT:    v_add_nc_u32_e32 v12, 3, v45
+; GFX11-NEXT:    v_add_nc_u32_e32 v13, 3, v183
+; GFX11-NEXT:    v_add_nc_u32_e32 v15, 3, v182
+; GFX11-NEXT:    v_and_b32_e32 v16, 0xffff, v16
+; GFX11-NEXT:    v_lshlrev_b32_e32 v17, 16, v17
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v3
+; GFX11-NEXT:    v_and_b32_e32 v12, 0xff, v12
+; GFX11-NEXT:    v_and_b32_e32 v13, 0xff, v13
+; GFX11-NEXT:    v_and_b32_e32 v15, 0xff, v15
+; GFX11-NEXT:    v_or_b32_e32 v16, v16, v17
+; GFX11-NEXT:    v_add_nc_u32_e32 v17, 3, v162
+; GFX11-NEXT:    v_add_nc_u32_e32 v21, 3, v145
+; GFX11-NEXT:    v_add_nc_u32_e32 v22, 3, v118
+; GFX11-NEXT:    v_or_b32_e32 v0, v57, v0
+; GFX11-NEXT:    v_or_b32_e32 v1, v58, v1
+; GFX11-NEXT:    v_or_b32_e32 v2, v59, v2
+; GFX11-NEXT:    v_or_b32_e32 v3, v60, v3
+; GFX11-NEXT:    v_or_b32_e32 v12, v40, v12
+; GFX11-NEXT:    v_or_b32_e32 v13, v41, v13
+; GFX11-NEXT:    v_or_b32_e32 v15, v42, v15
+; GFX11-NEXT:    v_and_b32_e32 v17, 0xff, v17
+; GFX11-NEXT:    v_and_b32_e32 v21, 0xff, v21
+; GFX11-NEXT:    v_and_b32_e32 v22, 0xff, v22
+; GFX11-NEXT:    v_add_nc_u32_e32 v0, 0x300, v0
+; GFX11-NEXT:    v_add_nc_u32_e32 v1, 0x300, v1
+; GFX11-NEXT:    v_add_nc_u32_e32 v2, 0x300, v2
+; GFX11-NEXT:    v_add_nc_u32_e32 v3, 0x300, v3
+; GFX11-NEXT:    v_add_nc_u32_e32 v12, 0x300, v12
+; GFX11-NEXT:    v_add_nc_u32_e32 v13, 0x300, v13
+; GFX11-NEXT:    v_add_nc_u32_e32 v15, 0x300, v15
+; GFX11-NEXT:    v_or_b32_e32 v17, v179, v17
+; GFX11-NEXT:    v_or_b32_e32 v21, v160, v21
+; GFX11-NEXT:    v_or_b32_e32 v22, v161, v22
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-NEXT:    v_lshlrev_b32_e32 v18, 16, v12
+; GFX11-NEXT:    v_and_b32_e32 v19, 0xffff, v13
+; GFX11-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
+; GFX11-NEXT:    v_add_nc_u32_e32 v17, 0x300, v17
+; GFX11-NEXT:    v_add_nc_u32_e32 v21, 0x300, v21
+; GFX11-NEXT:    v_add_nc_u32_e32 v22, 0x300, v22
+; GFX11-NEXT:    v_or_b32_e32 v12, v0, v1
+; GFX11-NEXT:    v_or_b32_e32 v13, v2, v3
+; GFX11-NEXT:    v_or_b32_e32 v14, v14, v18
+; GFX11-NEXT:    v_or_b32_e32 v15, v19, v15
+; GFX11-NEXT:    v_add_nc_u32_e32 v0, 3, v166
+; GFX11-NEXT:    v_add_nc_u32_e32 v1, 3, v165
+; GFX11-NEXT:    v_add_nc_u32_e32 v2, 3, v164
+; GFX11-NEXT:    v_add_nc_u32_e32 v3, 3, v163
+; GFX11-NEXT:    v_and_b32_e32 v19, 0xffff, v17
+; GFX11-NEXT:    v_add_nc_u32_e32 v17, 3, v148
+; GFX11-NEXT:    v_add_nc_u32_e32 v18, 3, v147
+; GFX11-NEXT:    v_add_nc_u32_e32 v20, 3, v146
+; GFX11-NEXT:    v_and_b32_e32 v21, 0xffff, v21
+; GFX11-NEXT:    v_lshlrev_b32_e32 v22, 16, v22
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v3
+; GFX11-NEXT:    v_and_b32_e32 v17, 0xff, v17
+; GFX11-NEXT:    v_and_b32_e32 v18, 0xff, v18
+; GFX11-NEXT:    v_and_b32_e32 v20, 0xff, v20
+; GFX11-NEXT:    v_or_b32_e32 v21, v21, v22
+; GFX11-NEXT:    v_add_nc_u32_e32 v22, 3, v100
+; GFX11-NEXT:    v_add_nc_u32_e32 v26, 3, v83
+; GFX11-NEXT:    v_add_nc_u32_e32 v27, 3, v82
+; GFX11-NEXT:    v_or_b32_e32 v0, v167, v0
+; GFX11-NEXT:    v_or_b32_e32 v1, v176, v1
+; GFX11-NEXT:    v_or_b32_e32 v2, v177, v2
+; GFX11-NEXT:    v_or_b32_e32 v3, v178, v3
+; GFX11-NEXT:    v_or_b32_e32 v17, v149, v17
+; GFX11-NEXT:    v_or_b32_e32 v18, v150, v18
+; GFX11-NEXT:    v_or_b32_e32 v20, v151, v20
+; GFX11-NEXT:    v_and_b32_e32 v22, 0xff, v22
+; GFX11-NEXT:    v_and_b32_e32 v26, 0xff, v26
+; GFX11-NEXT:    v_and_b32_e32 v27, 0xff, v27
+; GFX11-NEXT:    v_add_nc_u32_e32 v0, 0x300, v0
+; GFX11-NEXT:    v_add_nc_u32_e32 v1, 0x300, v1
+; GFX11-NEXT:    v_add_nc_u32_e32 v2, 0x300, v2
+; GFX11-NEXT:    v_add_nc_u32_e32 v3, 0x300, v3
+; GFX11-NEXT:    v_add_nc_u32_e32 v17, 0x300, v17
+; GFX11-NEXT:    v_add_nc_u32_e32 v18, 0x300, v18
+; GFX11-NEXT:    v_add_nc_u32_e32 v20, 0x300, v20
+; GFX11-NEXT:    v_or_b32_e32 v22, v144, v22
+; GFX11-NEXT:    v_or_b32_e32 v26, v130, v26
+; GFX11-NEXT:    v_or_b32_e32 v27, v131, v27
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-NEXT:    v_lshlrev_b32_e32 v23, 16, v17
+; GFX11-NEXT:    v_and_b32_e32 v24, 0xffff, v18
+; GFX11-NEXT:    v_lshlrev_b32_e32 v20, 16, v20
+; GFX11-NEXT:    v_add_nc_u32_e32 v22, 0x300, v22
+; GFX11-NEXT:    v_add_nc_u32_e32 v26, 0x300, v26
+; GFX11-NEXT:    v_add_nc_u32_e32 v27, 0x300, v27
+; GFX11-NEXT:    v_or_b32_e32 v17, v0, v1
+; GFX11-NEXT:    v_or_b32_e32 v18, v2, v3
+; GFX11-NEXT:    v_or_b32_e32 v19, v19, v23
+; GFX11-NEXT:    v_or_b32_e32 v20, v24, v20
+; GFX11-NEXT:    v_add_nc_u32_e32 v0, 3, v112
+; GFX11-NEXT:    v_add_nc_u32_e32 v1, 3, v103
+; GFX11-NEXT:    v_add_nc_u32_e32 v2, 3, v102
+; GFX11-NEXT:    v_add_nc_u32_e32 v3, 3, v101
+; GFX11-NEXT:    v_and_b32_e32 v24, 0xffff, v22
+; GFX11-NEXT:    v_add_nc_u32_e32 v22, 3, v86
+; GFX11-NEXT:    v_add_nc_u32_e32 v23, 3, v85
+; GFX11-NEXT:    v_add_nc_u32_e32 v25, 3, v84
+; GFX11-NEXT:    v_and_b32_e32 v26, 0xffff, v26
+; GFX11-NEXT:    v_lshlrev_b32_e32 v27, 16, v27
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v3
+; GFX11-NEXT:    v_and_b32_e32 v22, 0xff, v22
+; GFX11-NEXT:    v_and_b32_e32 v23, 0xff, v23
+; GFX11-NEXT:    v_and_b32_e32 v25, 0xff, v25
+; GFX11-NEXT:    v_or_b32_e32 v26, v26, v27
+; GFX11-NEXT:    v_add_nc_u32_e32 v27, 3, v69
+; GFX11-NEXT:    v_or_b32_e32 v0, v132, v0
+; GFX11-NEXT:    v_or_b32_e32 v1, v133, v1
+; GFX11-NEXT:    v_or_b32_e32 v2, v134, v2
+; GFX11-NEXT:    v_or_b32_e32 v3, v135, v3
+; GFX11-NEXT:    v_or_b32_e32 v22, v119, v22
+; GFX11-NEXT:    v_or_b32_e32 v23, v128, v23
+; GFX11-NEXT:    v_or_b32_e32 v25, v129, v25
+; GFX11-NEXT:    v_and_b32_e32 v27, 0xff, v27
+; GFX11-NEXT:    v_add_nc_u32_e32 v0, 0x300, v0
+; GFX11-NEXT:    v_add_nc_u32_e32 v1, 0x300, v1
+; GFX11-NEXT:    v_add_nc_u32_e32 v2, 0x300, v2
+; GFX11-NEXT:    v_add_nc_u32_e32 v3, 0x300, v3
+; GFX11-NEXT:    v_add_nc_u32_e32 v22, 0x300, v22
+; GFX11-NEXT:    v_add_nc_u32_e32 v23, 0x300, v23
+; GFX11-NEXT:    v_add_nc_u32_e32 v25, 0x300, v25
+; GFX11-NEXT:    v_or_b32_e32 v27, v117, v27
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-NEXT:    v_lshlrev_b32_e32 v28, 16, v22
+; GFX11-NEXT:    v_and_b32_e32 v29, 0xffff, v23
+; GFX11-NEXT:    v_lshlrev_b32_e32 v25, 16, v25
+; GFX11-NEXT:    v_add_nc_u32_e32 v27, 0x300, v27
+; GFX11-NEXT:    v_or_b32_e32 v22, v0, v1
+; GFX11-NEXT:    v_or_b32_e32 v23, v2, v3
+; GFX11-NEXT:    v_or_b32_e32 v24, v24, v28
+; GFX11-NEXT:    v_or_b32_e32 v25, v29, v25
+; GFX11-NEXT:    v_add_nc_u32_e32 v0, 3, v81
+; GFX11-NEXT:    v_add_nc_u32_e32 v1, 3, v80
+; GFX11-NEXT:    v_add_nc_u32_e32 v2, 3, v71
+; GFX11-NEXT:    v_add_nc_u32_e32 v3, 3, v70
+; GFX11-NEXT:    v_and_b32_e32 v29, 0xffff, v27
+; GFX11-NEXT:    v_add_nc_u32_e32 v27, 3, v68
+; GFX11-NEXT:    v_add_nc_u32_e32 v28, 3, v67
+; GFX11-NEXT:    v_add_nc_u32_e32 v30, 3, v66
+; GFX11-NEXT:    v_add_nc_u32_e32 v31, 3, v65
+; GFX11-NEXT:    v_add_nc_u32_e32 v32, 3, v64
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v3
+; GFX11-NEXT:    v_and_b32_e32 v27, 0xff, v27
+; GFX11-NEXT:    v_and_b32_e32 v28, 0xff, v28
+; GFX11-NEXT:    v_and_b32_e32 v30, 0xff, v30
+; GFX11-NEXT:    v_and_b32_e32 v31, 0xff, v31
+; GFX11-NEXT:    v_and_b32_e32 v32, 0xff, v32
+; GFX11-NEXT:    v_or_b32_e32 v0, v113, v0
+; GFX11-NEXT:    v_or_b32_e32 v1, v114, v1
+; GFX11-NEXT:    v_or_b32_e32 v2, v115, v2
+; GFX11-NEXT:    v_or_b32_e32 v3, v116, v3
+; GFX11-NEXT:    v_or_b32_e32 v27, v87, v27
+; GFX11-NEXT:    v_or_b32_e32 v28, v96, v28
+; GFX11-NEXT:    v_or_b32_e32 v30, v97, v30
+; GFX11-NEXT:    v_or_b32_e32 v31, v98, v31
+; GFX11-NEXT:    v_or_b32_e32 v32, v99, v32
+; GFX11-NEXT:    v_add_nc_u32_e32 v0, 0x300, v0
+; GFX11-NEXT:    v_add_nc_u32_e32 v1, 0x300, v1
+; GFX11-NEXT:    v_add_nc_u32_e32 v2, 0x300, v2
+; GFX11-NEXT:    v_add_nc_u32_e32 v3, 0x300, v3
+; GFX11-NEXT:    v_add_nc_u32_e32 v27, 0x300, v27
+; GFX11-NEXT:    v_add_nc_u32_e32 v28, 0x300, v28
+; GFX11-NEXT:    v_add_nc_u32_e32 v30, 0x300, v30
+; GFX11-NEXT:    v_add_nc_u32_e32 v31, 0x300, v31
+; GFX11-NEXT:    v_add_nc_u32_e32 v32, 0x300, v32
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-NEXT:    v_lshlrev_b32_e32 v33, 16, v27
+; GFX11-NEXT:    v_and_b32_e32 v34, 0xffff, v28
+; GFX11-NEXT:    v_lshlrev_b32_e32 v30, 16, v30
+; GFX11-NEXT:    v_and_b32_e32 v31, 0xffff, v31
+; GFX11-NEXT:    v_lshlrev_b32_e32 v32, 16, v32
+; GFX11-NEXT:    v_or_b32_e32 v27, v0, v1
+; GFX11-NEXT:    v_or_b32_e32 v28, v2, v3
+; GFX11-NEXT:    v_or_b32_e32 v29, v29, v33
+; GFX11-NEXT:    v_or_b32_e32 v30, v34, v30
+; GFX11-NEXT:    v_or_b32_e32 v31, v31, v32
+; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-NEXT:  .LBB75_3: ; %end
+; GFX11-NEXT:    s_clause 0x1f
+; GFX11-NEXT:    scratch_load_b32 v111, off, s32 offset:320
+; GFX11-NEXT:    scratch_load_b32 v110, off, s32 offset:324
+; GFX11-NEXT:    scratch_load_b32 v109, off, s32 offset:328
+; GFX11-NEXT:    scratch_load_b32 v108, off, s32 offset:332
+; GFX11-NEXT:    scratch_load_b32 v107, off, s32 offset:336
+; GFX11-NEXT:    scratch_load_b32 v106, off, s32 offset:340
+; GFX11-NEXT:    scratch_load_b32 v105, off, s32 offset:344
+; GFX11-NEXT:    scratch_load_b32 v104, off, s32 offset:348
+; GFX11-NEXT:    scratch_load_b32 v95, off, s32 offset:352
+; GFX11-NEXT:    scratch_load_b32 v94, off, s32 offset:356
+; GFX11-NEXT:    scratch_load_b32 v93, off, s32 offset:360
+; GFX11-NEXT:    scratch_load_b32 v92, off, s32 offset:364
+; GFX11-NEXT:    scratch_load_b32 v91, off, s32 offset:368
+; GFX11-NEXT:    scratch_load_b32 v90, off, s32 offset:372
+; GFX11-NEXT:    scratch_load_b32 v89, off, s32 offset:376
+; GFX11-NEXT:    scratch_load_b32 v88, off, s32 offset:380
+; GFX11-NEXT:    scratch_load_b32 v79, off, s32 offset:384
+; GFX11-NEXT:    scratch_load_b32 v78, off, s32 offset:388
+; GFX11-NEXT:    scratch_load_b32 v77, off, s32 offset:392
+; GFX11-NEXT:    scratch_load_b32 v76, off, s32 offset:396
+; GFX11-NEXT:    scratch_load_b32 v75, off, s32 offset:400
+; GFX11-NEXT:    scratch_load_b32 v74, off, s32 offset:404
+; GFX11-NEXT:    scratch_load_b32 v73, off, s32 offset:408
+; GFX11-NEXT:    scratch_load_b32 v72, off, s32 offset:412
+; GFX11-NEXT:    scratch_load_b32 v63, off, s32 offset:416
+; GFX11-NEXT:    scratch_load_b32 v62, off, s32 offset:420
+; GFX11-NEXT:    scratch_load_b32 v61, off, s32 offset:424
+; GFX11-NEXT:    scratch_load_b32 v60, off, s32 offset:428
+; GFX11-NEXT:    scratch_load_b32 v59, off, s32 offset:432
+; GFX11-NEXT:    scratch_load_b32 v58, off, s32 offset:436
+; GFX11-NEXT:    scratch_load_b32 v57, off, s32 offset:440
+; GFX11-NEXT:    scratch_load_b32 v56, off, s32 offset:444
+; GFX11-NEXT:    s_clause 0x7
+; GFX11-NEXT:    scratch_load_b32 v47, off, s32 offset:448
+; GFX11-NEXT:    scratch_load_b32 v46, off, s32 offset:452
+; GFX11-NEXT:    scratch_load_b32 v45, off, s32 offset:456
+; GFX11-NEXT:    scratch_load_b32 v44, off, s32 offset:460
+; GFX11-NEXT:    scratch_load_b32 v43, off, s32 offset:464
+; GFX11-NEXT:    scratch_load_b32 v42, off, s32 offset:468
+; GFX11-NEXT:    scratch_load_b32 v41, off, s32 offset:472
+; GFX11-NEXT:    scratch_load_b32 v40, off, s32 offset:476
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-NEXT:  .LBB75_4:
+; GFX11-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+; GFX11-NEXT:    s_branch .LBB75_2
   %cmp = icmp eq i32 %b, 0
   br i1 %cmp, label %cmp.true, label %cmp.false
 
@@ -147206,766 +143754,814 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) {
 ; GFX11-TRUE16:       ; %bb.0:
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-TRUE16-NEXT:    s_clause 0x1f
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v31, off, s32 offset:384
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v39, off, s32 offset:380
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v31, off, s32 offset:376
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v50, off, s32 offset:372
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v150, off, s32 offset:368
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v39, off, s32 offset:364
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v150, off, s32 offset:360
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v50, off, s32 offset:356
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v147, off, s32 offset:352
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v37, off, s32 offset:348
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v149, off, s32 offset:344
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v49, off, s32 offset:340
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v149, off, s32 offset:336
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v38, off, s32 offset:332
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v148, off, s32 offset:328
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v48, off, s32 offset:324
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v145, off, s32 offset:320
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v34, off, s32 offset:316
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v148, off, s32 offset:312
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v48, off, s32 offset:308
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v147, off, s32 offset:304
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v36, off, s32 offset:300
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v146, off, s32 offset:296
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v38, off, s32 offset:292
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v146, off, s32 offset:288
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v36, off, s32 offset:284
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v145, off, s32 offset:280
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v37, off, s32 offset:276
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v134, off, s32 offset:272
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v33, off, s32 offset:268
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v144, off, s32 offset:264
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v35, off, s32 offset:260
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v40, s32 offset:520
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v41, s32 offset:516
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v42, s32 offset:512
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v43, s32 offset:508
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v44, s32 offset:504
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v45, s32 offset:500
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v46, s32 offset:496
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v47, s32 offset:492
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v56, s32 offset:488
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v57, s32 offset:484
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v58, s32 offset:480
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v59, s32 offset:476
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v60, s32 offset:472
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v61, s32 offset:468
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v62, s32 offset:464
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v63, s32 offset:460
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v72, s32 offset:456
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v73, s32 offset:452
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v74, s32 offset:448
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v75, s32 offset:444
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v76, s32 offset:440
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v77, s32 offset:436
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v78, s32 offset:432
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v79, s32 offset:428
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v88, s32 offset:424
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v89, s32 offset:420
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v90, s32 offset:416
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v91, s32 offset:412
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v92, s32 offset:408
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v93, s32 offset:404
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v94, s32 offset:400
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v95, s32 offset:396
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v104, s32 offset:392 ; 4-byte Folded Spill
 ; GFX11-TRUE16-NEXT:    s_clause 0x1f
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v144, off, s32 offset:256
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v33, off, s32 offset:252
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v135, off, s32 offset:248
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v35, off, s32 offset:244
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v132, off, s32 offset:240
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v32, off, s32 offset:236
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v135, off, s32 offset:232
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v34, off, s32 offset:228
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v134, off, s32 offset:224
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v32, off, s32 offset:220
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v133, off, s32 offset:216
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v160, off, s32 offset:388
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v101, off, s32
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v103, off, s32 offset:8
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v113, off, s32 offset:16
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v113, off, s32 offset:24
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v114, off, s32 offset:32
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v114, off, s32 offset:40
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v115, off, s32 offset:48
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v116, off, s32 offset:56
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v116, off, s32 offset:64
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v117, off, s32 offset:72
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v117, off, s32 offset:80
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v118, off, s32 offset:88
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v118, off, s32 offset:96
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v119, off, s32 offset:104
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v119, off, s32 offset:112
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v128, off, s32 offset:120
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v128, off, s32 offset:128
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v129, off, s32 offset:136
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v129, off, s32 offset:144
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v130, off, s32 offset:152
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v31, off, s32 offset:384
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v114, off, s32 offset:380
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v103, off, s32 offset:376
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v128, off, s32 offset:372
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v56, off, s32 offset:368
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v117, off, s32 offset:364
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v102, off, s32 offset:360
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v133, off, s32 offset:356
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v57, off, s32 offset:352
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v112, off, s32 offset:348
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v58, off, s32 offset:344
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v132, off, s32 offset:340
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v101, off, s32 offset:336
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v118, off, s32 offset:332
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v59, off, s32 offset:328
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v134, off, s32 offset:324
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v97, off, s32 offset:320
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v113, off, s32 offset:316
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v100, off, s32 offset:312
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v145, off, s32 offset:308
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v99, off, s32 offset:304
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v119, off, s32 offset:300
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v60, off, s32 offset:296
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v135, off, s32 offset:292
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v98, off, s32 offset:288
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v129, off, s32 offset:284
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v61, off, s32 offset:280
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v146, off, s32 offset:276
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v62, off, s32 offset:272
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v115, off, s32 offset:268
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v63, off, s32 offset:264
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v144, off, s32 offset:260
 ; GFX11-TRUE16-NEXT:    s_clause 0x1f
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v130, off, s32 offset:160
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v131, off, s32 offset:168
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v131, off, s32 offset:176
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v132, off, s32 offset:184
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v133, off, s32 offset:192
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v151, off, s32 offset:200
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v151, off, s32 offset:208
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v100, off, s32 offset:212
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v98, off, s32 offset:204
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v100, off, s32 offset:196
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v85, off, s32 offset:188
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v97, off, s32 offset:180
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v86, off, s32 offset:172
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v96, off, s32 offset:164
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v71, off, s32 offset:156
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v87, off, s32 offset:148
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v80, off, s32 offset:140
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v84, off, s32 offset:132
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v86, off, s32 offset:124
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v96, off, s32 offset:116
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v80, off, s32 offset:108
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v85, off, s32 offset:100
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v82, off, s32 offset:92
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v84, off, s32 offset:84
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v69, off, s32 offset:76
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v83, off, s32 offset:68
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v69, off, s32 offset:60
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v71, off, s32 offset:52
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v68, off, s32 offset:44
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v70, off, s32 offset:36
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v66, off, s32 offset:28
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v68, off, s32 offset:20
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v96, off, s32 offset:256
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v130, off, s32 offset:252
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v72, off, s32 offset:248
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v147, off, s32 offset:244
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v84, off, s32 offset:240
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v116, off, s32 offset:236
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v87, off, s32 offset:232
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v148, off, s32 offset:228
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v86, off, s32 offset:224
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v131, off, s32 offset:220
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v73, off, s32 offset:216
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v65, off, s32 offset:388
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v53, off, s32
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v66, off, s32 offset:8
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v67, off, s32 offset:16
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v68, off, s32 offset:24
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v69, off, s32 offset:32
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v70, off, s32 offset:40
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v71, off, s32 offset:48
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v80, off, s32 offset:56
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v81, off, s32 offset:64
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v82, off, s32 offset:72
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v83, off, s32 offset:80
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v85, off, s32 offset:88
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v74, off, s32 offset:96
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v75, off, s32 offset:104
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v76, off, s32 offset:112
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v77, off, s32 offset:120
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v78, off, s32 offset:128
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v79, off, s32 offset:136
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v88, off, s32 offset:144
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v89, off, s32 offset:152
+; GFX11-TRUE16-NEXT:    s_clause 0x1f
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v90, off, s32 offset:160
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v91, off, s32 offset:168
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v92, off, s32 offset:176
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v93, off, s32 offset:184
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v94, off, s32 offset:192
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v95, off, s32 offset:200
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v104, off, s32 offset:208
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v179, off, s32 offset:212
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v163, off, s32 offset:204
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v183, off, s32 offset:196
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v150, off, s32 offset:188
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v167, off, s32 offset:180
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v161, off, s32 offset:172
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v177, off, s32 offset:164
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v149, off, s32 offset:156
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v180, off, s32 offset:148
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v151, off, s32 offset:140
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v164, off, s32 offset:132
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v41, off, s32 offset:124
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v47, off, s32 offset:116
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v165, off, s32 offset:108
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v43, off, s32 offset:100
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v181, off, s32 offset:92
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v45, off, s32 offset:84
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v162, off, s32 offset:76
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v46, off, s32 offset:68
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v176, off, s32 offset:60
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v42, off, s32 offset:52
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v178, off, s32 offset:44
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v44, off, s32 offset:36
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v160, off, s32 offset:28
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v182, off, s32 offset:20
 ; GFX11-TRUE16-NEXT:    s_clause 0x1
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v66, off, s32 offset:12
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v67, off, s32 offset:4
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v54.l, v30.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v67.l, v28.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v64.h, v26.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v65.h, v24.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v64.l, v22.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v65.l, v20.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v52.l, v18.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v55.h, v16.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v53.h, v14.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v55.l, v12.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v49.h, v10.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v54.h, v8.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v51.l, v6.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v53.l, v4.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v51.h, v2.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v52.h, v0.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v81.l, 8, v1.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v81.h, 8, v3.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v82.h, 8, v5.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v83.l, 8, v7.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v97.h, 8, v9.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v70.l, 8, v11.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v98.h, 8, v13.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v99.l, 8, v15.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v99.h, 8, v17.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v87.l, 8, v19.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v101.h, 8, v21.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v102.l, 8, v23.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v102.h, 8, v25.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v103.l, 8, v27.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v112.l, 8, v29.l
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(62)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v150.h, 8, v150.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v150.l, 8, v150.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v147.h, 8, v147.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v149.h, 8, v149.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v149.l, 8, v149.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v148.h, 8, v148.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v145.l, 8, v145.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v148.l, 8, v148.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v147.l, 8, v147.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v146.h, 8, v146.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v146.l, 8, v146.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v145.h, 8, v145.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v134.h, 8, v134.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v144.h, 8, v144.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v144.l, 8, v144.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v135.h, 8, v135.h
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(61)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v132.l, 8, v132.l
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(59)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v135.l, 8, v135.l
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(57)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v134.l, 8, v134.l
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(55)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v133.h, 8, v133.h
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(54)
-; GFX11-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v160
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v166, off, s32 offset:12
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v40, off, s32 offset:4
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v35.h, v30.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v39.h, v28.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v38.l, v26.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v39.l, v24.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v37.h, v22.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v38.h, v20.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v33.h, v18.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v37.l, v16.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v35.l, v14.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v36.h, v12.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v32.l, v10.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v36.l, v8.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v32.h, v6.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v34.h, v4.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v33.l, v2.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v34.l, v0.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v48.h, 8, v1.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v49.l, 8, v3.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v49.h, 8, v5.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v50.l, 8, v7.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v51.l, 8, v9.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v48.l, 8, v11.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v51.h, 8, v13.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v52.l, 8, v15.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v52.h, 8, v17.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v50.h, 8, v19.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(53)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v53.h, 8, v21.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v54.l, 8, v23.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v54.h, 8, v25.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v55.l, 8, v27.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v64.l, 8, v29.l
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v103.h, 8, v31.l
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v103.l, 8, v103.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v102.h, 8, v56.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v102.l, 8, v102.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v99.h, 8, v57.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v101.h, 8, v58.l
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v101.l, 8, v101.l
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(52)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v112.h, 8, v103.h
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(51)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v113.l, 8, v113.l
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(50)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v113.h, 8, v113.h
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(49)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v103.h, 8, v114.l
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(48)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v115.l, 8, v114.h
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(47)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v115.h, 8, v115.h
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(46)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v116.l, 8, v116.l
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(45)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v116.h, 8, v116.h
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(44)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v117.l, 8, v117.l
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(43)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v114.l, 8, v117.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v100.h, 8, v59.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v97.l, 8, v97.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v100.l, 8, v100.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v99.l, 8, v99.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v98.h, 8, v60.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v98.l, 8, v98.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v97.h, 8, v61.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v86.h, 8, v62.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v96.h, 8, v63.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v96.l, 8, v96.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v87.h, 8, v72.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v84.l, 8, v84.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v87.l, 8, v87.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v86.l, 8, v86.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(42)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v118.l, 8, v118.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v85.h, 8, v73.l
+; GFX11-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v65
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v53.l, 8, v53.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v64.h, 8, v66.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v65.l, 8, v67.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v65.h, 8, v68.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v55.h, 8, v69.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v67.l, 8, v70.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v67.h, 8, v71.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v68.l, 8, v80.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v68.h, 8, v81.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v69.l, 8, v82.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v66.l, 8, v83.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v70.l, 8, v85.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(41)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v118.h, 8, v118.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v70.h, 8, v74.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(40)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v119.l, 8, v119.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v71.l, 8, v75.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(39)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v114.h, 8, v119.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v66.h, 8, v76.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(38)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v119.h, 8, v128.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v71.h, 8, v77.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(37)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v128.l, 8, v128.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v80.l, 8, v78.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(36)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v128.h, 8, v129.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v80.h, 8, v79.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(35)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v129.l, 8, v129.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v81.l, 8, v88.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(34)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v130.l, 8, v130.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v82.l, 8, v89.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(33)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v117.h, 8, v130.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v69.h, 8, v90.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(32)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v130.h, 8, v131.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v82.h, 8, v91.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(31)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v131.l, 8, v131.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v83.l, 8, v92.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(30)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v131.h, 8, v132.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v83.h, 8, v93.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(29)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v129.h, 8, v133.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v81.h, 8, v94.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(28)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v132.h, 8, v151.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v84.h, 8, v95.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(27)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v133.l, 8, v151.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v151.l, 8, v31.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v151.h, 8, v31.l
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v85.l, 8, v104.l
 ; GFX11-TRUE16-NEXT:    s_and_saveexec_b32 s0, vcc_lo
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-TRUE16-NEXT:    s_xor_b32 s0, exec_lo, s0
-; GFX11-TRUE16-NEXT:    s_cbranch_execnz .LBB88_3
-; GFX11-TRUE16-NEXT:  ; %bb.1: ; %Flow
-; GFX11-TRUE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX11-TRUE16-NEXT:    s_cbranch_execnz .LBB88_4
-; GFX11-TRUE16-NEXT:  .LBB88_2: ; %end
-; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
-; GFX11-TRUE16-NEXT:  .LBB88_3: ; %cmp.false
-; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v52.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v51.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v53.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v51.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v54.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v2.h, 0xff, v49.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v3.l, 0xff, v55.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v3.h, 0xff, v53.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v4.l, 0xff, v55.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v4.h, 0xff, v52.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v5.l, 0xff, v65.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v5.h, 0xff, v64.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v6.l, 0xff, v65.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v6.h, 0xff, v64.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v7.l, 0xff, v67.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v7.h, 0xff, v54.l
+; GFX11-TRUE16-NEXT:    s_cbranch_execz .LBB88_2
+; GFX11-TRUE16-NEXT:  ; %bb.1: ; %cmp.false
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v34.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v33.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v34.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v32.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v36.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.h, 0xff, v32.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v3.l, 0xff, v36.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v3.h, 0xff, v35.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v4.l, 0xff, v37.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v4.h, 0xff, v33.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v5.l, 0xff, v38.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v5.h, 0xff, v37.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v6.l, 0xff, v39.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v6.h, 0xff, v38.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v7.l, 0xff, v39.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v7.h, 0xff, v35.h
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    v_and_b16 v8.l, 0xff, v67.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v8.h, 0xff, v66.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v9.l, 0xff, v68.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v9.h, 0xff, v66.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v10.l, 0xff, v70.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v10.h, 0xff, v68.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v11.l, 0xff, v71.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v11.h, 0xff, v69.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v12.l, 0xff, v83.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v12.h, 0xff, v69.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v13.l, 0xff, v84.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v13.h, 0xff, v82.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v14.l, 0xff, v85.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v14.h, 0xff, v80.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v15.l, 0xff, v96.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v15.h, 0xff, v86.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v16.l, 0xff, v84.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v16.h, 0xff, v80.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v17.l, 0xff, v87.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v17.h, 0xff, v71.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v18.l, 0xff, v96.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v18.h, 0xff, v86.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v19.l, 0xff, v97.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v19.h, 0xff, v85.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v20.l, 0xff, v100.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v20.h, 0xff, v98.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v21.l, 0xff, v100.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v21.h, 0xff, v32.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v22.l, 0xff, v34.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v22.h, 0xff, v32.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v23.l, 0xff, v35.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v23.h, 0xff, v33.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v24.l, 0xff, v35.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v24.h, 0xff, v33.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v25.l, 0xff, v37.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v25.h, 0xff, v36.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v26.l, 0xff, v38.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v26.h, 0xff, v36.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v27.l, 0xff, v48.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v27.h, 0xff, v34.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v28.l, 0xff, v48.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v28.h, 0xff, v38.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v29.l, 0xff, v49.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v29.h, 0xff, v37.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v30.l, 0xff, v50.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v30.h, 0xff, v39.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v31.l, 0xff, v50.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v31.h, 0xff, v39.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v0.l, v81.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v0.h, v81.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v1.l, v82.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, v1.h, v83.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v2.l, v97.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v2.h, v2.h, v70.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v3.l, v3.l, v98.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v3.h, v3.h, v99.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v4.l, v4.l, v99.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v4.h, v4.h, v87.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v5.l, v5.l, v101.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v5.h, v5.h, v102.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v6.l, v6.l, v102.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v6.h, v6.h, v103.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v7.l, v7.l, v112.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v7.h, v7.h, v101.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v8.l, v8.l, v112.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v8.h, v8.h, v113.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v9.l, v9.l, v113.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v9.h, v9.h, v103.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v10.l, v10.l, v115.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v10.h, v10.h, v115.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v11.l, v11.l, v116.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v11.h, v11.h, v116.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v12.l, v12.l, v117.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v12.h, v12.h, v114.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v13.l, v13.l, v118.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v13.h, v13.h, v118.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v14.l, v14.l, v119.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v14.h, v14.h, v114.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v15.l, v15.l, v119.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v15.h, v15.h, v128.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v16.l, v16.l, v128.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v16.h, v16.h, v129.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v17.l, v17.l, v130.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v17.h, v17.h, v117.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v18.l, v18.l, v130.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v18.h, v18.h, v131.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v19.l, v19.l, v131.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v19.h, v19.h, v129.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v20.l, v20.l, v132.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v20.h, v20.h, v133.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v21.l, v21.l, v133.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v21.h, v21.h, v134.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v22.l, v22.l, v135.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v22.h, v22.h, v132.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v23.l, v23.l, v135.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v23.h, v23.h, v144.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v24.l, v24.l, v144.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v24.h, v24.h, v134.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v25.l, v25.l, v145.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v25.h, v25.h, v146.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v26.l, v26.l, v146.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v26.h, v26.h, v147.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v27.l, v27.l, v148.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v27.h, v27.h, v145.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v28.l, v28.l, v148.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v28.h, v28.h, v149.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v29.l, v29.l, v149.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v29.h, v29.h, v147.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v30.l, v30.l, v150.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v30.h, v30.h, v150.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v31.l, v31.l, v151.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v31.h, v31.h, v151.h
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr52_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr51_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr53_lo16
+; GFX11-TRUE16-NEXT:    v_and_b16 v8.l, 0xff, v40.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v8.h, 0xff, v166.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v9.l, 0xff, v182.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v9.h, 0xff, v160.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v10.l, 0xff, v44.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v10.h, 0xff, v178.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v11.l, 0xff, v42.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v11.h, 0xff, v176.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v12.l, 0xff, v46.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v12.h, 0xff, v162.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v13.l, 0xff, v45.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v13.h, 0xff, v181.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v14.l, 0xff, v43.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v14.h, 0xff, v165.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v15.l, 0xff, v47.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v15.h, 0xff, v41.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v16.l, 0xff, v164.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v16.h, 0xff, v151.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v17.l, 0xff, v180.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v17.h, 0xff, v149.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v18.l, 0xff, v177.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v18.h, 0xff, v161.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v19.l, 0xff, v167.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v19.h, 0xff, v150.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v20.l, 0xff, v183.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v20.h, 0xff, v163.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v21.l, 0xff, v179.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v21.h, 0xff, v131.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v22.l, 0xff, v148.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v22.h, 0xff, v116.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v23.l, 0xff, v147.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v23.h, 0xff, v130.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v24.l, 0xff, v144.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v24.h, 0xff, v115.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v25.l, 0xff, v146.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v25.h, 0xff, v129.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v26.l, 0xff, v135.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v26.h, 0xff, v119.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v27.l, 0xff, v145.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v27.h, 0xff, v113.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v28.l, 0xff, v134.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v28.h, 0xff, v118.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v29.l, 0xff, v132.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v29.h, 0xff, v112.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v30.l, 0xff, v133.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v30.h, 0xff, v117.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v31.l, 0xff, v128.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v31.h, 0xff, v114.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v0.l, v48.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v0.h, v49.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v1.l, v49.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, v1.h, v50.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v2.l, v51.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.h, v2.h, v48.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v3.l, v3.l, v51.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v3.h, v3.h, v52.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v4.l, v4.l, v52.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v4.h, v4.h, v50.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v5.l, v5.l, v53.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v5.h, v5.h, v54.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v6.l, v6.l, v54.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v6.h, v6.h, v55.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v7.l, v7.l, v64.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v7.h, v7.h, v53.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v8.l, v8.l, v64.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v8.h, v8.h, v65.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v9.l, v9.l, v65.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v9.h, v9.h, v55.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v10.l, v10.l, v67.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v10.h, v10.h, v67.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v11.l, v11.l, v68.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v11.h, v11.h, v68.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v12.l, v12.l, v69.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v12.h, v12.h, v66.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v13.l, v13.l, v70.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v13.h, v13.h, v70.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v14.l, v14.l, v71.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v14.h, v14.h, v66.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v15.l, v15.l, v71.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v15.h, v15.h, v80.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v16.l, v16.l, v80.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v16.h, v16.h, v81.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v17.l, v17.l, v82.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v17.h, v17.h, v69.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v18.l, v18.l, v82.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v18.h, v18.h, v83.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v19.l, v19.l, v83.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v19.h, v19.h, v81.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v20.l, v20.l, v84.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v20.h, v20.h, v85.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v21.l, v21.l, v85.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v21.h, v21.h, v86.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v22.l, v22.l, v87.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v22.h, v22.h, v84.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v23.l, v23.l, v87.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v23.h, v23.h, v96.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v24.l, v24.l, v96.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v24.h, v24.h, v86.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v25.l, v25.l, v97.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v25.h, v25.h, v98.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v26.l, v26.l, v98.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v26.h, v26.h, v99.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v27.l, v27.l, v100.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v27.h, v27.h, v97.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v28.l, v28.l, v100.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v28.h, v28.h, v101.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v29.l, v29.l, v101.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v29.h, v29.h, v99.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v30.l, v30.l, v102.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v30.h, v30.h, v102.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v31.l, v31.l, v103.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v31.h, v31.h, v103.h
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr34_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr33_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr34_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr32_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr36_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr32_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr36_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr35_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr37_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr33_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr38_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr37_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr39_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr38_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr39_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr35_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr114
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr128
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr117
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr133
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr112
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr132
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr118
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr134
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr113
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr145
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr119
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr135
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr129
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr146
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr115
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr144
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr130
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr147
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr116
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr148
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr131
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr179
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr163
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr183
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr150
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr167
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr161
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr177
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr149
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr180
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr151
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr164
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr41
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr47
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr165
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr43
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr181
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr45
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr162
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr46
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr176
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr42
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr178
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr44
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr160
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr182
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr166
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr40
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr48_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr49_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr49_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr50_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr51_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr48_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr51_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr52_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr52_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr50_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr53_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr54_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr54_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr49_hi16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr55_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr53_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr55_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr52_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr65_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr64_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr65_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr53_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr64_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr65_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr65_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr55_hi16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr67_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr54_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr67_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr66_hi16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr68_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr68_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr69_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr66_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr70_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr70_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr68_hi16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr71_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr66_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr71_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr80_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr80_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr81_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr82_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr69_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr82_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr83_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr83_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr69_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr81_hi16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr84_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr82_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr85_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr80_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr96_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr86_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr85_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr86_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr87_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr84_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr80_hi16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr87_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr71_hi16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr96_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr86_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr97_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr85_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr100_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr98_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr100_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr32_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr34_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr32_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr35_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr33_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr35_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr33_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr37_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr36_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr38_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr36_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr48_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr34_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr48_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr38_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr49_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr37_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr50_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr39_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr50_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr39_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr81_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr81_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr82_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr83_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr96_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr86_hi16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr97_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr70_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr98_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr98_hi16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr99_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr99_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr87_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr100_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr97_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr100_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr101_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr101_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr99_hi16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr102_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr102_hi16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr103_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr112_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr101_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr112_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr113_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr113_hi16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr103_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr115_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr115_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr116_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr116_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr117_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr114_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr118_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr118_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr119_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr114_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr119_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr128_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr128_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr129_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr130_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr117_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr130_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr131_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr131_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr129_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr132_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr133_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr133_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr134_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr135_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr132_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr135_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr144_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr144_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr134_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr145_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr146_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr146_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr147_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr148_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr145_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr148_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr149_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr149_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr147_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr150_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr150_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr151_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr151_hi16
+; GFX11-TRUE16-NEXT:  .LBB88_2: ; %Flow
 ; GFX11-TRUE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX11-TRUE16-NEXT:    s_cbranch_execz .LBB88_2
-; GFX11-TRUE16-NEXT:  .LBB88_4: ; %cmp.true
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v50.h, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, v39.h, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, v50.l, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.h, v39.l, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, v48.l, 3
+; GFX11-TRUE16-NEXT:    s_cbranch_execz .LBB88_4
+; GFX11-TRUE16-NEXT:  ; %bb.3: ; %cmp.true
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v128.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, v114.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, v133.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.h, v117.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, v145.l, 3
 ; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v0.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v1.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v1.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v151.l, v0.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v151.h, v0.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v150.l, v1.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, v150.h, v1.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v148.l, v2.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v103.l, v0.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v103.h, v0.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v102.l, v1.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, v102.h, v1.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v100.l, v2.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v31.l, 0x300, v0.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v31.h, 0x300, v0.h
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v30.l, 0x300, v1.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v30.h, 0x300, v1.h
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v49.l, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, v37.l, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, v48.h, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.h, v38.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v132.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, v112.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, v134.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.h, v118.l, 3
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v27.l, 0x300, v2.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v0.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v1.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v1.h
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, v36.l, 3
-; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v149.h, v0.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v147.h, v0.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v148.h, v1.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, v149.l, v1.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, v129.l, 3
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v101.h, v0.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v99.h, v0.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v100.h, v1.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, v101.l, v1.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v2.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v29.l, 0x300, v0.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v29.h, 0x300, v0.h
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v28.l, 0x300, v1.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v28.h, 0x300, v1.h
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v34.l, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, v38.l, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, v36.h, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.h, v37.h, 3
-; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v146.l, v2.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v113.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, v135.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, v119.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.h, v146.l, 3
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v98.l, v2.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v0.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v1.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v1.h
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v25.h, 0x300, v2.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v145.l, v0.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v146.h, v0.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v147.l, v1.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, v145.h, v1.h
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, v34.h, 3
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v97.l, v0.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v98.h, v0.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v99.l, v1.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, v97.h, v1.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, v148.l, 3
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v27.h, 0x300, v0.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v26.l, 0x300, v0.h
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v26.h, 0x300, v1.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v25.l, 0x300, v1.h
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v35.h, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, v33.l, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, v35.l, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.h, v33.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v144.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, v115.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, v147.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.h, v130.l, 3
 ; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v2.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v0.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v1.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v1.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v135.l, v2.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v144.h, v0.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v134.h, v0.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v135.h, v1.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, v144.l, v1.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v87.l, v2.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v96.h, v0.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v86.h, v0.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v87.h, v1.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, v96.l, v1.h
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v22.l, 0x300, v2.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v24.l, 0x300, v0.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v24.h, 0x300, v0.h
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v23.l, 0x300, v1.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v23.h, 0x300, v1.h
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v32.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v116.l, 3
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(26)
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, v100.h, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, v32.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, v179.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, v131.l, 3
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(24)
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.h, v100.l, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, v98.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.h, v183.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, v163.l, 3
 ; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v0.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v1.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v1.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v132.l, v0.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v133.h, v0.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v134.l, v1.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, v132.h, v1.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v133.l, v2.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v84.l, v0.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v85.h, v0.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v86.l, v1.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, v84.h, v1.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v85.l, v2.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v22.h, 0x300, v0.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v21.l, 0x300, v0.h
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v21.h, 0x300, v1.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v20.l, 0x300, v1.h
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(22)
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v97.l, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, v85.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v167.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, v150.l, 3
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(20)
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, v96.l, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.h, v86.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, v177.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.h, v161.l, 3
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v20.h, 0x300, v2.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v0.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v1.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v1.h
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(18)
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, v87.h, 3
-; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v131.h, v0.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v129.h, v0.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v130.h, v1.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, v131.l, v1.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, v180.l, 3
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v83.h, v0.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v81.h, v0.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v82.h, v1.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, v83.l, v1.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v2.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v19.l, 0x300, v0.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v19.h, 0x300, v0.h
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v18.l, 0x300, v1.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v18.h, 0x300, v1.h
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v71.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v149.l, 3
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(16)
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, v84.l, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, v80.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, v164.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, v151.l, 3
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(14)
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.h, v96.h, 3
-; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v130.l, v2.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.h, v47.l, 3
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v82.l, v2.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v0.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v1.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v1.h
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v17.l, 0x300, v2.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v117.h, v0.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v128.h, v0.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v129.l, v1.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, v119.h, v1.h
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, v86.h, 3
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v69.h, v0.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v80.h, v0.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v81.l, v1.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, v71.h, v1.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, v41.l, 3
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v17.h, 0x300, v0.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v16.l, 0x300, v0.h
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v16.h, 0x300, v1.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v15.l, 0x300, v1.h
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(12)
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v85.l, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, v80.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v43.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, v165.l, 3
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(10)
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, v84.h, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.h, v82.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, v45.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.h, v181.l, 3
 ; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v2.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v0.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v1.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v1.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v128.l, v2.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v119.l, v0.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v114.h, v0.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v118.l, v1.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, v118.h, v1.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v80.l, v2.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v71.l, v0.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v66.h, v0.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v70.l, v1.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, v70.h, v1.h
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v15.h, 0x300, v2.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(8)
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, v83.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, v46.l, 3
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v14.l, 0x300, v0.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v14.h, 0x300, v0.h
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v13.l, 0x300, v1.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v13.h, 0x300, v1.h
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v69.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v162.l, 3
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(6)
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, v71.l, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, v69.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, v42.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, v176.l, 3
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(4)
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.h, v70.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.h, v44.l, 3
 ; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v2.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v0.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v1.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v1.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v117.l, v2.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v114.l, v0.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v116.l, v0.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v116.h, v1.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, v115.l, v1.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v69.l, v2.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v66.l, v0.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v68.l, v0.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v68.h, v1.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, v67.l, v1.h
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v12.l, 0x300, v2.l
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, v68.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, v178.l, 3
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v12.h, 0x300, v0.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v11.l, 0x300, v0.h
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v11.h, 0x300, v1.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v10.l, 0x300, v1.h
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(2)
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v68.l, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, v66.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v182.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, v160.l, 3
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, v67.h, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.h, v66.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, v40.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.h, v166.l, 3
 ; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v2.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v0.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v1.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v1.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v115.h, v2.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v113.h, v0.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v103.h, v0.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v112.h, v1.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, v113.l, v1.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v67.h, v2.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v65.h, v0.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v55.h, v0.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v64.h, v1.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, v65.l, v1.h
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v10.h, 0x300, v2.l
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, v67.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, v39.h, 3
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v9.l, 0x300, v0.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v9.h, 0x300, v0.h
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v8.l, 0x300, v1.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v8.h, 0x300, v1.h
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v54.l, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, v65.h, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, v64.h, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.h, v65.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v35.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, v39.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, v38.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.h, v38.h, 3
 ; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v2.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v0.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v1.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v1.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v112.l, v2.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v101.l, v0.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v102.h, v0.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v103.l, v1.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, v101.h, v1.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v64.l, v2.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v53.l, v0.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v54.h, v0.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v55.l, v1.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, v53.h, v1.h
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v7.l, 0x300, v2.l
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, v64.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, v37.h, 3
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v7.h, 0x300, v0.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v6.l, 0x300, v0.h
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v6.h, 0x300, v1.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v5.l, 0x300, v1.h
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v55.h, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, v52.l, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, v55.l, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.h, v53.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v37.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, v33.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, v36.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.h, v35.l, 3
 ; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v2.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v0.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v1.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v1.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v102.l, v2.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v99.h, v0.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v87.l, v0.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v98.h, v1.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, v99.l, v1.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v54.l, v2.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v52.h, v0.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v50.h, v0.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v51.h, v1.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, v52.l, v1.h
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v5.h, 0x300, v2.l
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, v54.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, v36.l, 3
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v4.l, 0x300, v0.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v4.h, 0x300, v0.h
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v3.l, 0x300, v1.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v3.h, 0x300, v1.h
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v49.h, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, v53.l, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, v51.l, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.h, v52.h, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.h, v51.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v32.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, v34.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, v32.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.h, v34.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.h, v33.l, 3
 ; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v2.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v0.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v1.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v1.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v2.h, 0xff, v2.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v97.h, v2.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v70.l, v0.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v82.h, v0.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v32.l, v83.l, v1.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v32.h, v81.l, v1.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v33.l, v81.h, v2.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v51.l, v2.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v48.l, v0.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v49.h, v0.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v32.l, v50.l, v1.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v32.h, v48.h, v1.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v33.l, v49.l, v2.h
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, 0x300, v2.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.h, 0x300, v0.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, 0x300, v0.h
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.h, 0x300, v32.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, 0x300, v32.h
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, 0x300, v33.l
+; GFX11-TRUE16-NEXT:  .LBB88_4: ; %end
 ; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_clause 0x1f
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v104, off, s32 offset:392
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v95, off, s32 offset:396
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v94, off, s32 offset:400
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v93, off, s32 offset:404
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v92, off, s32 offset:408
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v91, off, s32 offset:412
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v90, off, s32 offset:416
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v89, off, s32 offset:420
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v88, off, s32 offset:424
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v79, off, s32 offset:428
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v78, off, s32 offset:432
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v77, off, s32 offset:436
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v76, off, s32 offset:440
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v75, off, s32 offset:444
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v74, off, s32 offset:448
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v73, off, s32 offset:452
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v72, off, s32 offset:456
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v63, off, s32 offset:460
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v62, off, s32 offset:464
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v61, off, s32 offset:468
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v60, off, s32 offset:472
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v59, off, s32 offset:476
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v58, off, s32 offset:480
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v57, off, s32 offset:484
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v56, off, s32 offset:488
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v47, off, s32 offset:492
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v46, off, s32 offset:496
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v45, off, s32 offset:500
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v44, off, s32 offset:504
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v43, off, s32 offset:508
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v42, off, s32 offset:512
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v41, off, s32 offset:516
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v40, off, s32 offset:520 ; 4-byte Folded Reload
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-FAKE16-LABEL: bitcast_v128i8_to_v64bf16:
@@ -152736,1657 +149332,831 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-TRUE16-LABEL: bitcast_v128i8_to_v64bf16_scalar:
-; GFX11-TRUE16:       ; %bb.0:
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT:    s_clause 0x1e
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v40, s32 offset:440
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v41, s32 offset:436
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v42, s32 offset:432
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v43, s32 offset:428
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v44, s32 offset:424
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v45, s32 offset:420
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v46, s32 offset:416
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v47, s32 offset:412
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v56, s32 offset:408
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v57, s32 offset:404
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v58, s32 offset:400
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v59, s32 offset:396
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v60, s32 offset:392
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v61, s32 offset:388
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v62, s32 offset:384
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v63, s32 offset:380
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v72, s32 offset:376
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v73, s32 offset:372
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v74, s32 offset:368
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v75, s32 offset:364
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v76, s32 offset:360
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v77, s32 offset:356
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v78, s32 offset:352
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v79, s32 offset:348
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v88, s32 offset:344
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v89, s32 offset:340
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v90, s32 offset:336
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v91, s32 offset:332
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v92, s32 offset:328
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v93, s32 offset:324
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v94, s32 offset:320
-; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v54, v30 :: v_dual_mov_b32 v51, v24
-; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v52, v28 :: v_dual_mov_b32 v55, v26
-; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v53, v22 :: v_dual_mov_b32 v48, v20
-; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v50, v18 :: v_dual_mov_b32 v39, v16
-; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v49, v14 :: v_dual_mov_b32 v34, v8
-; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v37, v12 :: v_dual_mov_b32 v38, v10
-; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v36, v6 :: v_dual_mov_b32 v33, v4
-; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v35, v2 :: v_dual_mov_b32 v32, v0
-; GFX11-TRUE16-NEXT:    s_clause 0x1f
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v0, off, s32 offset:316
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v2, off, s32
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v4, off, s32 offset:8
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v6, off, s32 offset:16
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v8, off, s32 offset:24
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v10, off, s32 offset:32
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v12, off, s32 offset:40
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v14, off, s32 offset:48
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v16, off, s32 offset:56
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v18, off, s32 offset:64
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v20, off, s32 offset:72
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v22, off, s32 offset:80
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v24, off, s32 offset:88
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v26, off, s32 offset:96
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v28, off, s32 offset:104
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v30, off, s32 offset:112
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v31, off, s32 offset:120
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v41, off, s32 offset:128
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v44, off, s32 offset:136
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v45, off, s32 offset:144
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v56, off, s32 offset:152
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v59, off, s32 offset:160
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v60, off, s32 offset:168
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v61, off, s32 offset:176
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v62, off, s32 offset:184
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v63, off, s32 offset:192
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v72, off, s32 offset:200
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v73, off, s32 offset:208
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v74, off, s32 offset:216
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v75, off, s32 offset:224
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v76, off, s32 offset:232
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v77, off, s32 offset:240
-; GFX11-TRUE16-NEXT:    s_clause 0x1f
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v78, off, s32 offset:248
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v79, off, s32 offset:256
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v88, off, s32 offset:264
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v89, off, s32 offset:272
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v90, off, s32 offset:280
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v91, off, s32 offset:288
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v92, off, s32 offset:296
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v93, off, s32 offset:304
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v94, off, s32 offset:312
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v57, off, s32 offset:308
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v58, off, s32 offset:300
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v46, off, s32 offset:292
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v47, off, s32 offset:284
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v40, off, s32 offset:276
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v43, off, s32 offset:268
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v182, off, s32 offset:260
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v183, off, s32 offset:252
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v179, off, s32 offset:244
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v181, off, s32 offset:236
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v164, off, s32 offset:228
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v176, off, s32 offset:220
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v160, off, s32 offset:212
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v163, off, s32 offset:204
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v146, off, s32 offset:196
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v150, off, s32 offset:188
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v135, off, s32 offset:180
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v145, off, s32 offset:172
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v119, off, s32 offset:164
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v131, off, s32 offset:156
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v115, off, s32 offset:148
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v178, off, s32 offset:140
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v162, off, s32 offset:132
-; GFX11-TRUE16-NEXT:    s_clause 0xf
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v165, off, s32 offset:124
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v149, off, s32 offset:116
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v151, off, s32 offset:108
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v144, off, s32 offset:100
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v148, off, s32 offset:92
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v129, off, s32 offset:84
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v133, off, s32 offset:76
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v117, off, s32 offset:68
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v118, off, s32 offset:60
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v114, off, s32 offset:52
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v116, off, s32 offset:44
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v100, off, s32 offset:36
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v103, off, s32 offset:28
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v98, off, s32 offset:20
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v99, off, s32 offset:12
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v96, off, s32 offset:4
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v64, 8, v1
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v66, 8, v3
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v65, 8, v5
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v68, 8, v7
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v67, 8, v9
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v70, 8, v11
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v69, 8, v13
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v71, 8, v15
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v80, 8, v17
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v82, 8, v19
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v81, 8, v21
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v84, 8, v23
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v83, 8, v25
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v86, 8, v27
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v85, 8, v29
-; GFX11-TRUE16-NEXT:    s_mov_b32 s4, 0
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(62)
-; GFX11-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v97, 8, v2
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v87, 8, v4
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v101, 8, v6
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v102, 8, v8
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v113, 8, v10
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v112, 8, v12
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v130, 8, v14
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v128, 8, v16
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v134, 8, v18
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v132, 8, v20
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v161, 8, v22
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v147, 8, v24
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v166, 8, v26
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v167, 8, v28
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v180, 8, v30
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v177, 8, v31
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v42, 8, v41
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(61)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v41, 8, v44
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(60)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v45, 8, v45
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(59)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v44, 8, v56
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(58)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v59, 8, v59
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(57)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v56, 8, v60
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(56)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v60, 8, v61
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(55)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v61, 8, v62
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(54)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v63, 8, v63
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(53)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v62, 8, v72
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(52)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v73, 8, v73
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(51)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v72, 8, v74
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(50)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v75, 8, v75
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(49)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v74, 8, v76
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(48)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v77, 8, v77
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(47)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v76, 8, v78
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(46)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v78, 8, v79
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(45)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v79, 8, v88
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(44)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v89, 8, v89
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(43)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v88, 8, v90
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(42)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v91, 8, v91
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(41)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v90, 8, v92
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(40)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v92, 8, v93
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(39)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v93, 8, v94
-; GFX11-TRUE16-NEXT:    s_and_b32 s5, vcc_lo, exec_lo
-; GFX11-TRUE16-NEXT:    s_cbranch_scc0 .LBB89_4
-; GFX11-TRUE16-NEXT:  ; %bb.1: ; %cmp.false
-; GFX11-TRUE16-NEXT:    s_and_b32 s5, s28, 0xff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s6, s29, 8
-; GFX11-TRUE16-NEXT:    s_and_b32 s7, s2, 0xff
-; GFX11-TRUE16-NEXT:    s_or_b32 s5, s5, s6
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s6, s1, 8
-; GFX11-TRUE16-NEXT:    v_and_b32_e64 v5, 0xffff, s5
-; GFX11-TRUE16-NEXT:    s_and_b32 s5, s0, 0xff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s8, s3, 8
-; GFX11-TRUE16-NEXT:    s_or_b32 s5, s5, s6
-; GFX11-TRUE16-NEXT:    s_or_b32 s6, s7, s8
-; GFX11-TRUE16-NEXT:    s_and_b32 s7, s16, 0xff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s8, s17, 8
-; GFX11-TRUE16-NEXT:    s_and_b32 s9, s18, 0xff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s10, s19, 8
-; GFX11-TRUE16-NEXT:    s_or_b32 s7, s7, s8
-; GFX11-TRUE16-NEXT:    s_or_b32 s8, s9, s10
-; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s5, s5, s6
-; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s6, s7, s8
-; GFX11-TRUE16-NEXT:    s_and_b32 s7, s20, 0xff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s8, s21, 8
-; GFX11-TRUE16-NEXT:    s_and_b32 s9, s22, 0xff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s10, s23, 8
-; GFX11-TRUE16-NEXT:    s_or_b32 s7, s7, s8
-; GFX11-TRUE16-NEXT:    s_or_b32 s8, s9, s10
-; GFX11-TRUE16-NEXT:    s_and_b32 s9, s24, 0xff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s10, s25, 8
-; GFX11-TRUE16-NEXT:    s_and_b32 s11, s26, 0xff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s12, s27, 8
-; GFX11-TRUE16-NEXT:    s_or_b32 s9, s9, s10
-; GFX11-TRUE16-NEXT:    s_or_b32 s10, s11, s12
-; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s7, s7, s8
-; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s8, s9, s10
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v2, 0xff, v36
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v32
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff, v35
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v4, 0xff, v34
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v3, 0xff, v33
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v2, v2, v68
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v0, v64
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v1, v66
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v6, v4, v67
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v3, v3, v65
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v4, v0, 16, v5
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v38
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v7, 0xff, v39
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v6, v6, 16, v2
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v2, 0xff, v49
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v5, v3, 16, v1
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff, v37
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v0, v70
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v3, 0xff, v50
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v2, v2, v71
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v8, 0xff, v48
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v1, v69
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v3, v3, v82
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v9, v7, v80
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v10, v8, v81
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v7, v1, 16, v0
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v3, 0xffff, v3
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v53
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v8, v9, 16, v2
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v2, 0xff, v55
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff, v51
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v9, v10, 16, v3
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v0, v84
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v3, 0xff, v52
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v10, 0xff, v54
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v2, v2, v86
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v1, v83
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v11, 0xff, v96
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v3, v3, v85
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v12, v10, v97
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v10, v1, 16, v0
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v11, v87
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff, v99
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v12, 0xffff, v12
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v11, v3, 16, v2
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v3, 0xff, v103
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v14, 0xff, v114
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v2, 0xff, v98
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v12, v0, 16, v12
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v100
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v3, v3, v113
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v1, v101
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v13, 0xff, v116
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v17, v14, v128
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v0, v112
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v3, 0xffff, v3
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v14, 0xff, v117
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v2, v2, v102
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v13, v13, v130
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v16, 0xff, v133
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v20, v14, v132
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v14, v0, 16, v3
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v148
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v15, 0xff, v118
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v18, 0xff, v129
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v16, v16, v161
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v19, 0xffff, v13
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v13, v2, 16, v1
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v0, v166
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff, v144
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v15, v15, v134
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v18, v18, v147
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v22, 0xffff, v16
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v1, v167
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v21, 0xffff, v15
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v15, v17, 16, v19
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v17, v18, 16, v22
-; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v2, s7
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v18, v1, 16, v0
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v151
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff, v149
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v16, v20, 16, v21
-; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v3, s8
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v0, v180
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v1, v177
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v19, v1, 16, v0
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v165
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff, v162
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v0, v42
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v1, v41
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v20, v1, 16, v0
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v178
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff, v115
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v0, v45
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v1, v44
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v21, v1, 16, v0
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v131
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff, v119
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v0, v59
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v1, v56
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v22, v1, 16, v0
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v145
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff, v135
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v0, v60
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v1, v61
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v23, v1, 16, v0
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v150
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff, v146
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v0, v63
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v1, v62
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v24, v1, 16, v0
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v163
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff, v160
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v0, v73
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v1, v72
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v25, v1, 16, v0
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v176
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff, v164
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v0, v75
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v1, v74
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v26, v1, 16, v0
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v181
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff, v179
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v0, v77
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v1, v76
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v27, v1, 16, v0
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v183
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff, v182
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v0, v78
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v1, v79
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v28, v1, 16, v0
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v43
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff, v40
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v0, v89
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v1, v88
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v29, v1, 16, v0
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v47
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff, v46
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v0, v91
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v1, v90
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v30, v1, 16, v0
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v58
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff, v57
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v0, v92
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v1, v93
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v31, v1, 16, v0
-; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, s6
-; GFX11-TRUE16-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s4
-; GFX11-TRUE16-NEXT:    s_cbranch_vccnz .LBB89_3
-; GFX11-TRUE16-NEXT:  .LBB89_2: ; %cmp.true
-; GFX11-TRUE16-NEXT:    s_add_i32 s28, s28, 3
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s5, s29, 8
-; GFX11-TRUE16-NEXT:    s_and_b32 s4, s28, 0xff
-; GFX11-TRUE16-NEXT:    s_add_i32 s24, s24, 3
-; GFX11-TRUE16-NEXT:    s_or_b32 s4, s5, s4
-; GFX11-TRUE16-NEXT:    s_and_b32 s5, s24, 0xff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s6, s25, 8
-; GFX11-TRUE16-NEXT:    s_add_i32 s26, s26, 3
-; GFX11-TRUE16-NEXT:    s_or_b32 s5, s6, s5
-; GFX11-TRUE16-NEXT:    s_and_b32 s6, s26, 0xff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s7, s27, 8
-; GFX11-TRUE16-NEXT:    s_add_i32 s20, s20, 3
-; GFX11-TRUE16-NEXT:    s_or_b32 s6, s7, s6
-; GFX11-TRUE16-NEXT:    s_and_b32 s7, s20, 0xff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s8, s21, 8
-; GFX11-TRUE16-NEXT:    s_add_i32 s22, s22, 3
-; GFX11-TRUE16-NEXT:    s_or_b32 s7, s8, s7
-; GFX11-TRUE16-NEXT:    s_and_b32 s8, s22, 0xff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s9, s23, 8
-; GFX11-TRUE16-NEXT:    s_add_i32 s16, s16, 3
-; GFX11-TRUE16-NEXT:    s_or_b32 s8, s9, s8
-; GFX11-TRUE16-NEXT:    s_and_b32 s9, s16, 0xff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s10, s17, 8
-; GFX11-TRUE16-NEXT:    s_add_i32 s18, s18, 3
-; GFX11-TRUE16-NEXT:    s_add_i32 s0, s0, 3
-; GFX11-TRUE16-NEXT:    s_add_i32 s2, s2, 3
-; GFX11-TRUE16-NEXT:    s_or_b32 s9, s10, s9
-; GFX11-TRUE16-NEXT:    s_and_b32 s10, s18, 0xff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s11, s19, 8
-; GFX11-TRUE16-NEXT:    s_and_b32 s0, s0, 0xff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s1, s1, 8
-; GFX11-TRUE16-NEXT:    s_and_b32 s2, s2, 0xff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s3, s3, 8
-; GFX11-TRUE16-NEXT:    s_or_b32 s10, s11, s10
-; GFX11-TRUE16-NEXT:    s_or_b32 s0, s1, s0
-; GFX11-TRUE16-NEXT:    s_or_b32 s1, s3, s2
-; GFX11-TRUE16-NEXT:    s_addk_i32 s9, 0x300
-; GFX11-TRUE16-NEXT:    s_addk_i32 s0, 0x300
-; GFX11-TRUE16-NEXT:    s_addk_i32 s1, 0x300
-; GFX11-TRUE16-NEXT:    s_addk_i32 s10, 0x300
-; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s0, s0, s1
-; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s1, s9, s10
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(37)
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v0, 3, v58
-; GFX11-TRUE16-NEXT:    s_addk_i32 s5, 0x300
-; GFX11-TRUE16-NEXT:    s_addk_i32 s6, 0x300
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v1, 3, v57
-; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s3, s5, s6
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(35)
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v2, 3, v47
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT:    s_addk_i32 s7, 0x300
-; GFX11-TRUE16-NEXT:    s_addk_i32 s8, 0x300
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v92, v0
-; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s2, s7, s8
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v3, 3, v46
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v93, v1
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v2, v91, v2
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(33)
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v4, 3, v43
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v27, 0x300, v0
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v0, 3, v40
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v31, 0x300, v1
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v28, 0x300, v2
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff, v4
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v3, v90, v3
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(31)
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v2, 3, v183
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v4, 3, v182
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v89, v1
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v30, 0x300, v3
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v3, 0xff, v4
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(29)
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v4, 3, v181
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v88, v0
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v181, 0x300, v1
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v78, v2
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v2, v79, v3
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v3, 0xff, v4
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v29, 0x300, v0
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v0, 3, v179
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v182, 0x300, v1
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v179, 0x300, v2
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v77, v3
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(27)
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v2, 3, v176
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v3, 3, v164
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(25)
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v4, 3, v163
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v163, 0x300, v1
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff, v2
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v76, v0
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v2, 0xff, v3
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v3, 0xff, v4
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v4, 3, v160
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v75, v1
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v160, 0x300, v0
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v74, v2
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v2, v73, v3
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v3, 0xff, v4
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v22, 0x300, v1
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(23)
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v1, 3, v150
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v26, 0x300, v0
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v23, 0x300, v2
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v72, v3
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v2, 3, v146
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(21)
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v3, 3, v145
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v4, 3, v135
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v25, 0x300, v0
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v2
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v63, v1
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v2, 0xff, v3
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v3, 0xff, v4
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(19)
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v4, 3, v131
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v62, v0
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v131, 0x300, v1
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v60, v2
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v2, v61, v3
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v3, 0xff, v4
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v24, 0x300, v0
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v0, 3, v119
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v135, 0x300, v1
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v119, 0x300, v2
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v59, v3
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(17)
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v2, 3, v178
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v3, 3, v115
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(15)
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v4, 3, v165
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v115, 0x300, v1
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff, v2
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v56, v0
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v2, 0xff, v3
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v3, 0xff, v4
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v4, 3, v162
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v45, v1
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v145, 0x300, v0
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v44, v2
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v2, v42, v3
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v3, 0xff, v4
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v17, 0x300, v1
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(13)
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v1, 3, v151
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v21, 0x300, v0
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v18, 0x300, v2
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v41, v3
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v2, 3, v149
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(11)
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v3, 3, v148
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v4, 3, v144
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v20, 0x300, v0
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v2
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v180, v1
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v2, 0xff, v3
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v3, 0xff, v4
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(9)
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v4, 3, v133
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v177, v0
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v19, 0x300, v1
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v166, v2
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v2, v167, v3
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v3, 0xff, v4
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v133, 0x300, v0
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v0, 3, v129
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v129, 0x300, v1
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v144, 0x300, v2
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v161, v3
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(7)
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v2, 3, v118
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v3, 3, v117
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(5)
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v4, 3, v116
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v116, 0x300, v1
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff, v2
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v147, v0
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v2, 0xff, v3
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v3, 0xff, v4
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v4, 3, v114
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(1)
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v5, 3, v99
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v114, 0x300, v0
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v132, v2
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v2, v130, v3
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v3, 0xff, v4
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v4, 3, v103
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v6, 3, v98
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v5, 0xff, v5
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v7, 3, v54
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v8, 3, v53
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v6, 0xff, v6
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v9, 3, v39
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v33, 3, v33
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v16, 0x300, v0
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v4, v113, v4
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v0, 0x300, v2
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v2, v128, v3
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v3, 3, v100
-; GFX11-TRUE16-NEXT:    s_addk_i32 s4, 0x300
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v11, 0x300, v4
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v4, v101, v5
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v5, v102, v6
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v6, 0xff, v7
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v7, 3, v96
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v134, v1
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v12, 0x300, v4
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v13, 0x300, v5
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v4, v97, v6
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v5, 3, v55
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v6, 0xff, v7
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v7, 3, v52
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v14, 0x300, v4
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v4, 0xff, v5
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v5, v87, v6
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v6, 0xff, v7
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v7, 0xff, v8
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v8, 3, v51
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v4, v86, v4
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v15, 0x300, v5
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v5, v85, v6
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v6, v84, v7
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v7, 0xff, v8
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v51, 0x300, v4
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v4, 3, v50
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v50, 0x300, v5
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v8, 3, v49
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v5, v83, v7
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v7, 3, v48
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v6, 0x300, v6
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v1, 0x300, v1
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v10, 0x300, v5
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v5, 0xff, v7
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v4, v82, v4
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v7, 0xff, v8
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v8, 0xff, v9
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v9, 3, v38
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v5, v81, v5
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v38, 0x300, v4
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v4, v71, v7
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v7, v80, v8
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v8, 0xff, v9
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v9, 0x300, v5
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v5, 3, v37
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v37, 0x300, v4
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v39, 0x300, v7
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v4, v70, v8
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v7, 3, v36
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v5, 0xff, v5
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v8, 3, v34
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v34, 3, v35
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v35, 0x300, v4
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v4, 0xff, v7
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v5, v69, v5
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v7, 0xff, v8
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v8, 0xff, v34
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v3, v112, v3
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v4, v68, v4
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v34, 0x300, v5
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v5, v67, v7
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v7, v66, v8
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v8, 0xff, v33
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v33, 0x300, v4
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v4, 3, v32
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v32, 0x300, v5
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v5, 0x300, v7
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v7, v65, v8
-; GFX11-TRUE16-NEXT:    v_and_b32_e64 v8, 0xffff, s4
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v36, 0xffff, v6
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v5, 0xffff, v5
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v7, 0x300, v7
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v3, 0x300, v3
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v4, v64, v4
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v14, 0xffff, v14
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v10, v10, 16, v36
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v5, v7, 16, v5
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v7, 0xffff, v33
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v4, 0x300, v4
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v33, 0xffff, v37
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v36, 0xffff, v22
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v2, 0x300, v2
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v6, v32, 16, v7
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v4, v4, 16, v8
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v8, 0xffff, v35
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v32, 0xffff, v51
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v35, 0xffff, v38
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v19, 0xffff, v19
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v7, v34, 16, v8
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v8, v39, 16, v33
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v33, 0xffff, v12
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v34, 0xffff, v11
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v11, v50, 16, v32
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v32, 0xffff, v1
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v9, v9, 16, v35
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v12, v15, 16, v14
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v13, v13, 16, v33
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v14, v3, 16, v34
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v16, v16, 16, v32
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v32, 0xffff, v116
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v33, 0xffff, v129
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v34, 0xffff, v18
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v35, 0xffff, v17
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v26, v26, 16, v36
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v17, v114, 16, v32
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v18, v144, 16, v33
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v20, v20, 16, v34
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v21, v21, 16, v35
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v32, 0xffff, v115
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v33, 0xffff, v135
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v34, 0xffff, v131
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v35, 0xffff, v23
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v36, 0xffff, v27
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v22, v145, 16, v32
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v23, v119, 16, v33
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v24, v24, 16, v34
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v25, v25, 16, v35
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v32, 0xffff, v163
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v33, 0xffff, v182
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v34, 0xffff, v181
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v35, 0xffff, v28
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v15, v2, 16, v0
-; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v19, v133, 16, v19
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v27, v160, 16, v32
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v28, v179, 16, v33
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v29, v29, 16, v34
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v30, v30, 16, v35
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v31, v31, 16, v36
-; GFX11-TRUE16-NEXT:  .LBB89_3: ; %end
-; GFX11-TRUE16-NEXT:    s_clause 0x1e
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v94, off, s32 offset:320
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v93, off, s32 offset:324
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v92, off, s32 offset:328
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v91, off, s32 offset:332
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v90, off, s32 offset:336
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v89, off, s32 offset:340
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v88, off, s32 offset:344
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v79, off, s32 offset:348
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v78, off, s32 offset:352
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v77, off, s32 offset:356
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v76, off, s32 offset:360
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v75, off, s32 offset:364
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v74, off, s32 offset:368
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v73, off, s32 offset:372
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v72, off, s32 offset:376
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v63, off, s32 offset:380
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v62, off, s32 offset:384
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v61, off, s32 offset:388
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v60, off, s32 offset:392
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v59, off, s32 offset:396
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v58, off, s32 offset:400
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v57, off, s32 offset:404
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v56, off, s32 offset:408
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v47, off, s32 offset:412
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v46, off, s32 offset:416
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v45, off, s32 offset:420
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v44, off, s32 offset:424
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v43, off, s32 offset:428
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v42, off, s32 offset:432
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v41, off, s32 offset:436
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v40, off, s32 offset:440
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
-; GFX11-TRUE16-NEXT:  .LBB89_4:
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
-; GFX11-TRUE16-NEXT:    s_branch .LBB89_2
-;
-; GFX11-FAKE16-LABEL: bitcast_v128i8_to_v64bf16_scalar:
-; GFX11-FAKE16:       ; %bb.0:
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-FAKE16-NEXT:    s_clause 0x1e
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v40, s32 offset:440
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v41, s32 offset:436
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v42, s32 offset:432
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v43, s32 offset:428
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v44, s32 offset:424
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v45, s32 offset:420
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v46, s32 offset:416
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v47, s32 offset:412
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v56, s32 offset:408
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v57, s32 offset:404
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v58, s32 offset:400
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v59, s32 offset:396
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v60, s32 offset:392
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v61, s32 offset:388
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v62, s32 offset:384
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v63, s32 offset:380
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v72, s32 offset:376
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v73, s32 offset:372
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v74, s32 offset:368
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v75, s32 offset:364
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v76, s32 offset:360
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v77, s32 offset:356
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v78, s32 offset:352
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v79, s32 offset:348
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v88, s32 offset:344
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v89, s32 offset:340
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v90, s32 offset:336
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v91, s32 offset:332
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v92, s32 offset:328
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v93, s32 offset:324
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v94, s32 offset:320
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v54, v30 :: v_dual_mov_b32 v51, v24
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v52, v28 :: v_dual_mov_b32 v55, v26
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v53, v22 :: v_dual_mov_b32 v48, v20
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v50, v18 :: v_dual_mov_b32 v39, v16
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v49, v14 :: v_dual_mov_b32 v34, v8
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v37, v12 :: v_dual_mov_b32 v38, v10
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v36, v6 :: v_dual_mov_b32 v33, v4
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v35, v2 :: v_dual_mov_b32 v32, v0
-; GFX11-FAKE16-NEXT:    s_clause 0x1f
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v0, off, s32 offset:316
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v2, off, s32
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v4, off, s32 offset:8
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v6, off, s32 offset:16
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v8, off, s32 offset:24
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v10, off, s32 offset:32
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v12, off, s32 offset:40
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v14, off, s32 offset:48
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v16, off, s32 offset:56
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v18, off, s32 offset:64
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v20, off, s32 offset:72
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v22, off, s32 offset:80
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v24, off, s32 offset:88
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v26, off, s32 offset:96
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v28, off, s32 offset:104
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v30, off, s32 offset:112
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v31, off, s32 offset:120
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v41, off, s32 offset:128
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v44, off, s32 offset:136
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v45, off, s32 offset:144
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v56, off, s32 offset:152
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v59, off, s32 offset:160
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v60, off, s32 offset:168
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v61, off, s32 offset:176
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v62, off, s32 offset:184
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v63, off, s32 offset:192
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v72, off, s32 offset:200
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v73, off, s32 offset:208
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v74, off, s32 offset:216
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v75, off, s32 offset:224
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v76, off, s32 offset:232
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v77, off, s32 offset:240
-; GFX11-FAKE16-NEXT:    s_clause 0x1f
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v78, off, s32 offset:248
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v79, off, s32 offset:256
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v88, off, s32 offset:264
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v89, off, s32 offset:272
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v90, off, s32 offset:280
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v91, off, s32 offset:288
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v92, off, s32 offset:296
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v93, off, s32 offset:304
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v94, off, s32 offset:312
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v57, off, s32 offset:308
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v58, off, s32 offset:300
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v46, off, s32 offset:292
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v47, off, s32 offset:284
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v40, off, s32 offset:276
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v43, off, s32 offset:268
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v182, off, s32 offset:260
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v183, off, s32 offset:252
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v178, off, s32 offset:244
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v181, off, s32 offset:236
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v164, off, s32 offset:228
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v176, off, s32 offset:220
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v160, off, s32 offset:212
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v163, off, s32 offset:204
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v146, off, s32 offset:196
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v150, off, s32 offset:188
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v135, off, s32 offset:180
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v145, off, s32 offset:172
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v118, off, s32 offset:164
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v131, off, s32 offset:156
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v115, off, s32 offset:148
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v179, off, s32 offset:140
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v162, off, s32 offset:132
-; GFX11-FAKE16-NEXT:    s_clause 0xf
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v165, off, s32 offset:124
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v149, off, s32 offset:116
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v151, off, s32 offset:108
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v144, off, s32 offset:100
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v148, off, s32 offset:92
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v129, off, s32 offset:84
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v133, off, s32 offset:76
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v117, off, s32 offset:68
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v119, off, s32 offset:60
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v114, off, s32 offset:52
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v116, off, s32 offset:44
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v100, off, s32 offset:36
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v103, off, s32 offset:28
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v98, off, s32 offset:20
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v99, off, s32 offset:12
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v96, off, s32 offset:4
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v64, 8, v1
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v66, 8, v3
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v65, 8, v5
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v68, 8, v7
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v67, 8, v9
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v70, 8, v11
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v69, 8, v13
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v71, 8, v15
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v80, 8, v17
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v82, 8, v19
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v81, 8, v21
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v84, 8, v23
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v83, 8, v25
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v86, 8, v27
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v85, 8, v29
-; GFX11-FAKE16-NEXT:    s_mov_b32 s4, 0
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(62)
-; GFX11-FAKE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v97, 8, v2
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v87, 8, v4
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v101, 8, v6
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v102, 8, v8
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v113, 8, v10
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v112, 8, v12
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v130, 8, v14
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v128, 8, v16
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v134, 8, v18
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v132, 8, v20
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v161, 8, v22
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v147, 8, v24
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v166, 8, v26
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v167, 8, v28
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v180, 8, v30
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v177, 8, v31
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v42, 8, v41
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(61)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v41, 8, v44
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(60)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v45, 8, v45
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(59)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v44, 8, v56
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(58)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v59, 8, v59
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(57)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v56, 8, v60
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(56)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v60, 8, v61
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(55)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v61, 8, v62
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(54)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v63, 8, v63
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(53)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v62, 8, v72
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(52)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v73, 8, v73
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(51)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v72, 8, v74
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(50)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v75, 8, v75
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(49)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v74, 8, v76
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(48)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v77, 8, v77
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(47)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v76, 8, v78
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(46)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v78, 8, v79
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(45)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v79, 8, v88
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(44)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v89, 8, v89
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(43)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v88, 8, v90
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(42)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v91, 8, v91
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(41)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v90, 8, v92
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(40)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v92, 8, v93
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(39)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v93, 8, v94
-; GFX11-FAKE16-NEXT:    s_and_b32 s5, vcc_lo, exec_lo
-; GFX11-FAKE16-NEXT:    s_cbranch_scc0 .LBB89_4
-; GFX11-FAKE16-NEXT:  ; %bb.1: ; %cmp.false
-; GFX11-FAKE16-NEXT:    s_and_b32 s5, s28, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s6, s29, 8
-; GFX11-FAKE16-NEXT:    s_and_b32 s7, s2, 0xff
-; GFX11-FAKE16-NEXT:    s_or_b32 s5, s5, s6
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s6, s1, 8
-; GFX11-FAKE16-NEXT:    v_and_b32_e64 v5, 0xffff, s5
-; GFX11-FAKE16-NEXT:    s_and_b32 s5, s0, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s8, s3, 8
-; GFX11-FAKE16-NEXT:    s_or_b32 s5, s5, s6
-; GFX11-FAKE16-NEXT:    s_or_b32 s6, s7, s8
-; GFX11-FAKE16-NEXT:    s_and_b32 s7, s16, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s8, s17, 8
-; GFX11-FAKE16-NEXT:    s_and_b32 s9, s18, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s10, s19, 8
-; GFX11-FAKE16-NEXT:    s_or_b32 s7, s7, s8
-; GFX11-FAKE16-NEXT:    s_or_b32 s8, s9, s10
-; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s5, s5, s6
-; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s6, s7, s8
-; GFX11-FAKE16-NEXT:    s_and_b32 s7, s20, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s8, s21, 8
-; GFX11-FAKE16-NEXT:    s_and_b32 s9, s22, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s10, s23, 8
-; GFX11-FAKE16-NEXT:    s_or_b32 s7, s7, s8
-; GFX11-FAKE16-NEXT:    s_or_b32 s8, s9, s10
-; GFX11-FAKE16-NEXT:    s_and_b32 s9, s24, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s10, s25, 8
-; GFX11-FAKE16-NEXT:    s_and_b32 s11, s26, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s12, s27, 8
-; GFX11-FAKE16-NEXT:    s_or_b32 s9, s9, s10
-; GFX11-FAKE16-NEXT:    s_or_b32 s10, s11, s12
-; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s7, s7, s8
-; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s8, s9, s10
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v36
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v32
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v35
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xff, v34
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v33
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v2, v68
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v64
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v66
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v6, v4, v67
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v3, v65
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v4, v0, 16, v5
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v38
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xff, v39
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v6, v6, 16, v2
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v49
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v5, v3, 16, v1
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v37
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v70
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v50
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v2, v71
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 0xff, v48
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v69
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v3, v82
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v9, v7, v80
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v10, v8, v81
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v7, v1, 16, v0
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xffff, v3
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v53
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v8, v9, 16, v2
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v55
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v51
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v9, v10, 16, v3
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v84
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v52
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v10, 0xff, v54
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v2, v86
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v83
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v11, 0xff, v96
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v3, v85
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v12, v10, v97
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v10, v1, 16, v0
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v11, v87
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v99
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v12, 0xffff, v12
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v11, v3, 16, v2
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v103
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v14, 0xff, v114
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v98
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v12, v0, 16, v12
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v100
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v3, v113
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v101
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v13, 0xff, v116
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v17, v14, v128
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v112
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xffff, v3
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v14, 0xff, v117
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v2, v102
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v13, v13, v130
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v16, 0xff, v133
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v20, v14, v132
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v14, v0, 16, v3
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v148
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v15, 0xff, v119
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v18, 0xff, v129
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v16, v16, v161
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v19, 0xffff, v13
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v13, v2, 16, v1
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v166
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v144
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v15, v15, v134
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v18, v18, v147
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v22, 0xffff, v16
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v167
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v21, 0xffff, v15
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v15, v17, 16, v19
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v17, v18, 16, v22
-; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v2, s7
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v18, v1, 16, v0
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v151
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v149
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v16, v20, 16, v21
-; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v3, s8
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v180
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v177
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v19, v1, 16, v0
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v165
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v162
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v42
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v41
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v20, v1, 16, v0
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v179
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v115
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v45
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v44
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v21, v1, 16, v0
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v131
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v118
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v59
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v56
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v22, v1, 16, v0
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v145
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v135
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v60
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v61
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v23, v1, 16, v0
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v150
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v146
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v63
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v62
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v24, v1, 16, v0
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v163
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v160
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v73
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v72
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v25, v1, 16, v0
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v176
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v164
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v75
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v74
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v26, v1, 16, v0
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v181
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v178
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v77
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v76
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v27, v1, 16, v0
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v183
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v182
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v78
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v79
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v28, v1, 16, v0
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v43
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v40
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v89
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v88
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v29, v1, 16, v0
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v47
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v46
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v91
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v90
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v30, v1, 16, v0
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v58
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v57
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v92
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v93
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v31, v1, 16, v0
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, s6
-; GFX11-FAKE16-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s4
-; GFX11-FAKE16-NEXT:    s_cbranch_vccnz .LBB89_3
-; GFX11-FAKE16-NEXT:  .LBB89_2: ; %cmp.true
-; GFX11-FAKE16-NEXT:    s_add_i32 s28, s28, 3
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s5, s29, 8
-; GFX11-FAKE16-NEXT:    s_and_b32 s4, s28, 0xff
-; GFX11-FAKE16-NEXT:    s_add_i32 s24, s24, 3
-; GFX11-FAKE16-NEXT:    s_or_b32 s4, s5, s4
-; GFX11-FAKE16-NEXT:    s_and_b32 s5, s24, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s6, s25, 8
-; GFX11-FAKE16-NEXT:    s_add_i32 s26, s26, 3
-; GFX11-FAKE16-NEXT:    s_or_b32 s5, s6, s5
-; GFX11-FAKE16-NEXT:    s_and_b32 s6, s26, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s7, s27, 8
-; GFX11-FAKE16-NEXT:    s_add_i32 s20, s20, 3
-; GFX11-FAKE16-NEXT:    s_or_b32 s6, s7, s6
-; GFX11-FAKE16-NEXT:    s_and_b32 s7, s20, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s8, s21, 8
-; GFX11-FAKE16-NEXT:    s_add_i32 s22, s22, 3
-; GFX11-FAKE16-NEXT:    s_or_b32 s7, s8, s7
-; GFX11-FAKE16-NEXT:    s_and_b32 s8, s22, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s9, s23, 8
-; GFX11-FAKE16-NEXT:    s_add_i32 s16, s16, 3
-; GFX11-FAKE16-NEXT:    s_or_b32 s8, s9, s8
-; GFX11-FAKE16-NEXT:    s_and_b32 s9, s16, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s10, s17, 8
-; GFX11-FAKE16-NEXT:    s_add_i32 s18, s18, 3
-; GFX11-FAKE16-NEXT:    s_add_i32 s0, s0, 3
-; GFX11-FAKE16-NEXT:    s_add_i32 s2, s2, 3
-; GFX11-FAKE16-NEXT:    s_or_b32 s9, s10, s9
-; GFX11-FAKE16-NEXT:    s_and_b32 s10, s18, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s11, s19, 8
-; GFX11-FAKE16-NEXT:    s_and_b32 s0, s0, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s1, s1, 8
-; GFX11-FAKE16-NEXT:    s_and_b32 s2, s2, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s3, s3, 8
-; GFX11-FAKE16-NEXT:    s_or_b32 s10, s11, s10
-; GFX11-FAKE16-NEXT:    s_or_b32 s0, s1, s0
-; GFX11-FAKE16-NEXT:    s_or_b32 s1, s3, s2
-; GFX11-FAKE16-NEXT:    s_addk_i32 s9, 0x300
-; GFX11-FAKE16-NEXT:    s_addk_i32 s0, 0x300
-; GFX11-FAKE16-NEXT:    s_addk_i32 s1, 0x300
-; GFX11-FAKE16-NEXT:    s_addk_i32 s10, 0x300
-; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s0, s0, s1
-; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s1, s9, s10
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(37)
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v0, 3, v58
-; GFX11-FAKE16-NEXT:    s_addk_i32 s5, 0x300
-; GFX11-FAKE16-NEXT:    s_addk_i32 s6, 0x300
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v1, 3, v57
-; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s3, s5, s6
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(35)
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v2, 3, v47
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-FAKE16-NEXT:    s_addk_i32 s7, 0x300
-; GFX11-FAKE16-NEXT:    s_addk_i32 s8, 0x300
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v92, v0
-; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s2, s7, s8
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v3, 3, v46
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v93, v1
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v91, v2
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(33)
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v4, 3, v43
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v27, 0x300, v0
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v0, 3, v40
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v31, 0x300, v1
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v28, 0x300, v2
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v4
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v90, v3
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(31)
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v2, 3, v183
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v4, 3, v182
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v89, v1
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v30, 0x300, v3
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v4
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(29)
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v4, 3, v181
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v88, v0
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v181, 0x300, v1
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v78, v2
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v79, v3
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v4
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v29, 0x300, v0
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v0, 3, v178
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v182, 0x300, v1
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v178, 0x300, v2
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v77, v3
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(27)
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v2, 3, v176
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v3, 3, v164
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(25)
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v4, 3, v163
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v163, 0x300, v1
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v2
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v76, v0
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v3
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v4
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v4, 3, v160
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v75, v1
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v160, 0x300, v0
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v74, v2
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v73, v3
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v4
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v22, 0x300, v1
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(23)
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v1, 3, v150
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v26, 0x300, v0
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v23, 0x300, v2
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v72, v3
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v2, 3, v146
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(21)
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v3, 3, v145
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v4, 3, v135
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v25, 0x300, v0
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v2
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v63, v1
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v3
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v4
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(19)
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v4, 3, v131
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v62, v0
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v131, 0x300, v1
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v60, v2
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v61, v3
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v4
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v24, 0x300, v0
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v0, 3, v118
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v135, 0x300, v1
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v118, 0x300, v2
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v59, v3
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(17)
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v2, 3, v179
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v3, 3, v115
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(15)
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v4, 3, v165
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v115, 0x300, v1
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v2
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v56, v0
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v3
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v4
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v4, 3, v162
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v45, v1
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v145, 0x300, v0
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v44, v2
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v42, v3
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v4
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v17, 0x300, v1
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(13)
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v1, 3, v151
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v21, 0x300, v0
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v18, 0x300, v2
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v41, v3
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v2, 3, v149
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(11)
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v3, 3, v148
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v4, 3, v144
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v20, 0x300, v0
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v2
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v180, v1
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v3
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v4
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(9)
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v4, 3, v133
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v177, v0
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v19, 0x300, v1
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v166, v2
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v167, v3
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v4
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v133, 0x300, v0
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v0, 3, v129
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v129, 0x300, v1
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v144, 0x300, v2
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v161, v3
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(7)
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v2, 3, v119
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v3, 3, v117
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(5)
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v4, 3, v116
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v116, 0x300, v1
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v2
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v147, v0
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v3
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v4
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v4, 3, v114
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(1)
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v5, 3, v99
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v114, 0x300, v0
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v132, v2
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v130, v3
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v4
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v4, 3, v103
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v6, 3, v98
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v5, 0xff, v5
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v7, 3, v54
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v8, 3, v53
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xff, v6
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v9, 3, v39
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v33, 3, v33
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v16, 0x300, v0
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, v113, v4
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v0, 0x300, v2
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v128, v3
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v3, 3, v100
-; GFX11-FAKE16-NEXT:    s_addk_i32 s4, 0x300
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v11, 0x300, v4
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, v101, v5
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v102, v6
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xff, v7
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v7, 3, v96
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v134, v1
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v12, 0x300, v4
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v13, 0x300, v5
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, v97, v6
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v5, 3, v55
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xff, v7
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v7, 3, v52
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v14, 0x300, v4
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xff, v5
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v87, v6
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xff, v7
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xff, v8
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v8, 3, v51
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, v86, v4
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v15, 0x300, v5
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v85, v6
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v6, v84, v7
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xff, v8
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v51, 0x300, v4
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v4, 3, v50
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v50, 0x300, v5
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v8, 3, v49
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v83, v7
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v7, 3, v48
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v6, 0x300, v6
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v1, 0x300, v1
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v10, 0x300, v5
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v5, 0xff, v7
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, v82, v4
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xff, v8
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 0xff, v9
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v9, 3, v38
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v81, v5
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v38, 0x300, v4
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, v71, v7
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v7, v80, v8
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 0xff, v9
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v9, 0x300, v5
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v5, 3, v37
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v37, 0x300, v4
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v39, 0x300, v7
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, v70, v8
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v7, 3, v36
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v5, 0xff, v5
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v8, 3, v34
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v34, 3, v35
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v35, 0x300, v4
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xff, v7
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v69, v5
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xff, v8
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 0xff, v34
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v112, v3
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, v68, v4
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v34, 0x300, v5
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v67, v7
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v7, v66, v8
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 0xff, v33
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v33, 0x300, v4
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v4, 3, v32
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v32, 0x300, v5
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v5, 0x300, v7
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v7, v65, v8
-; GFX11-FAKE16-NEXT:    v_and_b32_e64 v8, 0xffff, s4
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v36, 0xffff, v6
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v5, 0xffff, v5
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v7, 0x300, v7
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v3, 0x300, v3
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, v64, v4
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v14, 0xffff, v14
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v10, v10, 16, v36
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v5, v7, 16, v5
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xffff, v33
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v4, 0x300, v4
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v33, 0xffff, v37
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v36, 0xffff, v22
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v2, 0x300, v2
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v6, v32, 16, v7
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v4, v4, 16, v8
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 0xffff, v35
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v32, 0xffff, v51
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v35, 0xffff, v38
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v19, 0xffff, v19
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v7, v34, 16, v8
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v8, v39, 16, v33
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v33, 0xffff, v12
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v34, 0xffff, v11
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v11, v50, 16, v32
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v32, 0xffff, v1
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v9, v9, 16, v35
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v12, v15, 16, v14
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v13, v13, 16, v33
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v14, v3, 16, v34
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v16, v16, 16, v32
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v32, 0xffff, v116
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v33, 0xffff, v129
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v34, 0xffff, v18
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v35, 0xffff, v17
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v26, v26, 16, v36
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v17, v114, 16, v32
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v18, v144, 16, v33
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v20, v20, 16, v34
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v21, v21, 16, v35
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v32, 0xffff, v115
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v33, 0xffff, v135
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v34, 0xffff, v131
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v35, 0xffff, v23
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v36, 0xffff, v27
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v22, v145, 16, v32
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v23, v118, 16, v33
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v24, v24, 16, v34
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v25, v25, 16, v35
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v32, 0xffff, v163
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v33, 0xffff, v182
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v34, 0xffff, v181
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v35, 0xffff, v28
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v15, v2, 16, v0
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v19, v133, 16, v19
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v27, v160, 16, v32
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v28, v178, 16, v33
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v29, v29, 16, v34
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v30, v30, 16, v35
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v31, v31, 16, v36
-; GFX11-FAKE16-NEXT:  .LBB89_3: ; %end
-; GFX11-FAKE16-NEXT:    s_clause 0x1e
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v94, off, s32 offset:320
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v93, off, s32 offset:324
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v92, off, s32 offset:328
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v91, off, s32 offset:332
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v90, off, s32 offset:336
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v89, off, s32 offset:340
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v88, off, s32 offset:344
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v79, off, s32 offset:348
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v78, off, s32 offset:352
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v77, off, s32 offset:356
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v76, off, s32 offset:360
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v75, off, s32 offset:364
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v74, off, s32 offset:368
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v73, off, s32 offset:372
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v72, off, s32 offset:376
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v63, off, s32 offset:380
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v62, off, s32 offset:384
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v61, off, s32 offset:388
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v60, off, s32 offset:392
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v59, off, s32 offset:396
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v58, off, s32 offset:400
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v57, off, s32 offset:404
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v56, off, s32 offset:408
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v47, off, s32 offset:412
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v46, off, s32 offset:416
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v45, off, s32 offset:420
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v44, off, s32 offset:424
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v43, off, s32 offset:428
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v42, off, s32 offset:432
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v41, off, s32 offset:436
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v40, off, s32 offset:440
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
-; GFX11-FAKE16-NEXT:  .LBB89_4:
-; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
-; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
-; GFX11-FAKE16-NEXT:    s_branch .LBB89_2
+; GFX11-LABEL: bitcast_v128i8_to_v64bf16_scalar:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_clause 0x1e
+; GFX11-NEXT:    scratch_store_b32 off, v40, s32 offset:440
+; GFX11-NEXT:    scratch_store_b32 off, v41, s32 offset:436
+; GFX11-NEXT:    scratch_store_b32 off, v42, s32 offset:432
+; GFX11-NEXT:    scratch_store_b32 off, v43, s32 offset:428
+; GFX11-NEXT:    scratch_store_b32 off, v44, s32 offset:424
+; GFX11-NEXT:    scratch_store_b32 off, v45, s32 offset:420
+; GFX11-NEXT:    scratch_store_b32 off, v46, s32 offset:416
+; GFX11-NEXT:    scratch_store_b32 off, v47, s32 offset:412
+; GFX11-NEXT:    scratch_store_b32 off, v56, s32 offset:408
+; GFX11-NEXT:    scratch_store_b32 off, v57, s32 offset:404
+; GFX11-NEXT:    scratch_store_b32 off, v58, s32 offset:400
+; GFX11-NEXT:    scratch_store_b32 off, v59, s32 offset:396
+; GFX11-NEXT:    scratch_store_b32 off, v60, s32 offset:392
+; GFX11-NEXT:    scratch_store_b32 off, v61, s32 offset:388
+; GFX11-NEXT:    scratch_store_b32 off, v62, s32 offset:384
+; GFX11-NEXT:    scratch_store_b32 off, v63, s32 offset:380
+; GFX11-NEXT:    scratch_store_b32 off, v72, s32 offset:376
+; GFX11-NEXT:    scratch_store_b32 off, v73, s32 offset:372
+; GFX11-NEXT:    scratch_store_b32 off, v74, s32 offset:368
+; GFX11-NEXT:    scratch_store_b32 off, v75, s32 offset:364
+; GFX11-NEXT:    scratch_store_b32 off, v76, s32 offset:360
+; GFX11-NEXT:    scratch_store_b32 off, v77, s32 offset:356
+; GFX11-NEXT:    scratch_store_b32 off, v78, s32 offset:352
+; GFX11-NEXT:    scratch_store_b32 off, v79, s32 offset:348
+; GFX11-NEXT:    scratch_store_b32 off, v88, s32 offset:344
+; GFX11-NEXT:    scratch_store_b32 off, v89, s32 offset:340
+; GFX11-NEXT:    scratch_store_b32 off, v90, s32 offset:336
+; GFX11-NEXT:    scratch_store_b32 off, v91, s32 offset:332
+; GFX11-NEXT:    scratch_store_b32 off, v92, s32 offset:328
+; GFX11-NEXT:    scratch_store_b32 off, v93, s32 offset:324
+; GFX11-NEXT:    scratch_store_b32 off, v94, s32 offset:320
+; GFX11-NEXT:    v_dual_mov_b32 v54, v30 :: v_dual_mov_b32 v51, v24
+; GFX11-NEXT:    v_dual_mov_b32 v52, v28 :: v_dual_mov_b32 v55, v26
+; GFX11-NEXT:    v_dual_mov_b32 v53, v22 :: v_dual_mov_b32 v48, v20
+; GFX11-NEXT:    v_dual_mov_b32 v50, v18 :: v_dual_mov_b32 v39, v16
+; GFX11-NEXT:    v_dual_mov_b32 v49, v14 :: v_dual_mov_b32 v34, v8
+; GFX11-NEXT:    v_dual_mov_b32 v37, v12 :: v_dual_mov_b32 v38, v10
+; GFX11-NEXT:    v_dual_mov_b32 v36, v6 :: v_dual_mov_b32 v33, v4
+; GFX11-NEXT:    v_dual_mov_b32 v35, v2 :: v_dual_mov_b32 v32, v0
+; GFX11-NEXT:    s_clause 0x1f
+; GFX11-NEXT:    scratch_load_b32 v0, off, s32 offset:316
+; GFX11-NEXT:    scratch_load_u16 v2, off, s32
+; GFX11-NEXT:    scratch_load_u16 v4, off, s32 offset:8
+; GFX11-NEXT:    scratch_load_u16 v6, off, s32 offset:16
+; GFX11-NEXT:    scratch_load_u16 v8, off, s32 offset:24
+; GFX11-NEXT:    scratch_load_u16 v10, off, s32 offset:32
+; GFX11-NEXT:    scratch_load_u16 v12, off, s32 offset:40
+; GFX11-NEXT:    scratch_load_u16 v14, off, s32 offset:48
+; GFX11-NEXT:    scratch_load_u16 v16, off, s32 offset:56
+; GFX11-NEXT:    scratch_load_u16 v18, off, s32 offset:64
+; GFX11-NEXT:    scratch_load_u16 v20, off, s32 offset:72
+; GFX11-NEXT:    scratch_load_u16 v22, off, s32 offset:80
+; GFX11-NEXT:    scratch_load_u16 v24, off, s32 offset:88
+; GFX11-NEXT:    scratch_load_u16 v26, off, s32 offset:96
+; GFX11-NEXT:    scratch_load_u16 v28, off, s32 offset:104
+; GFX11-NEXT:    scratch_load_u16 v30, off, s32 offset:112
+; GFX11-NEXT:    scratch_load_u16 v31, off, s32 offset:120
+; GFX11-NEXT:    scratch_load_u16 v41, off, s32 offset:128
+; GFX11-NEXT:    scratch_load_u16 v44, off, s32 offset:136
+; GFX11-NEXT:    scratch_load_u16 v45, off, s32 offset:144
+; GFX11-NEXT:    scratch_load_u16 v56, off, s32 offset:152
+; GFX11-NEXT:    scratch_load_u16 v59, off, s32 offset:160
+; GFX11-NEXT:    scratch_load_u16 v60, off, s32 offset:168
+; GFX11-NEXT:    scratch_load_u16 v61, off, s32 offset:176
+; GFX11-NEXT:    scratch_load_u16 v62, off, s32 offset:184
+; GFX11-NEXT:    scratch_load_u16 v63, off, s32 offset:192
+; GFX11-NEXT:    scratch_load_u16 v72, off, s32 offset:200
+; GFX11-NEXT:    scratch_load_u16 v73, off, s32 offset:208
+; GFX11-NEXT:    scratch_load_u16 v74, off, s32 offset:216
+; GFX11-NEXT:    scratch_load_u16 v75, off, s32 offset:224
+; GFX11-NEXT:    scratch_load_u16 v76, off, s32 offset:232
+; GFX11-NEXT:    scratch_load_u16 v77, off, s32 offset:240
+; GFX11-NEXT:    s_clause 0x1f
+; GFX11-NEXT:    scratch_load_u16 v78, off, s32 offset:248
+; GFX11-NEXT:    scratch_load_u16 v79, off, s32 offset:256
+; GFX11-NEXT:    scratch_load_u16 v88, off, s32 offset:264
+; GFX11-NEXT:    scratch_load_u16 v89, off, s32 offset:272
+; GFX11-NEXT:    scratch_load_u16 v90, off, s32 offset:280
+; GFX11-NEXT:    scratch_load_u16 v91, off, s32 offset:288
+; GFX11-NEXT:    scratch_load_u16 v92, off, s32 offset:296
+; GFX11-NEXT:    scratch_load_u16 v93, off, s32 offset:304
+; GFX11-NEXT:    scratch_load_u16 v94, off, s32 offset:312
+; GFX11-NEXT:    scratch_load_u16 v57, off, s32 offset:308
+; GFX11-NEXT:    scratch_load_u16 v58, off, s32 offset:300
+; GFX11-NEXT:    scratch_load_u16 v46, off, s32 offset:292
+; GFX11-NEXT:    scratch_load_u16 v47, off, s32 offset:284
+; GFX11-NEXT:    scratch_load_u16 v40, off, s32 offset:276
+; GFX11-NEXT:    scratch_load_u16 v43, off, s32 offset:268
+; GFX11-NEXT:    scratch_load_u16 v182, off, s32 offset:260
+; GFX11-NEXT:    scratch_load_u16 v183, off, s32 offset:252
+; GFX11-NEXT:    scratch_load_u16 v178, off, s32 offset:244
+; GFX11-NEXT:    scratch_load_u16 v181, off, s32 offset:236
+; GFX11-NEXT:    scratch_load_u16 v164, off, s32 offset:228
+; GFX11-NEXT:    scratch_load_u16 v176, off, s32 offset:220
+; GFX11-NEXT:    scratch_load_u16 v160, off, s32 offset:212
+; GFX11-NEXT:    scratch_load_u16 v163, off, s32 offset:204
+; GFX11-NEXT:    scratch_load_u16 v146, off, s32 offset:196
+; GFX11-NEXT:    scratch_load_u16 v150, off, s32 offset:188
+; GFX11-NEXT:    scratch_load_u16 v135, off, s32 offset:180
+; GFX11-NEXT:    scratch_load_u16 v145, off, s32 offset:172
+; GFX11-NEXT:    scratch_load_u16 v118, off, s32 offset:164
+; GFX11-NEXT:    scratch_load_u16 v131, off, s32 offset:156
+; GFX11-NEXT:    scratch_load_u16 v115, off, s32 offset:148
+; GFX11-NEXT:    scratch_load_u16 v179, off, s32 offset:140
+; GFX11-NEXT:    scratch_load_u16 v162, off, s32 offset:132
+; GFX11-NEXT:    s_clause 0xf
+; GFX11-NEXT:    scratch_load_u16 v165, off, s32 offset:124
+; GFX11-NEXT:    scratch_load_u16 v149, off, s32 offset:116
+; GFX11-NEXT:    scratch_load_u16 v151, off, s32 offset:108
+; GFX11-NEXT:    scratch_load_u16 v144, off, s32 offset:100
+; GFX11-NEXT:    scratch_load_u16 v148, off, s32 offset:92
+; GFX11-NEXT:    scratch_load_u16 v129, off, s32 offset:84
+; GFX11-NEXT:    scratch_load_u16 v133, off, s32 offset:76
+; GFX11-NEXT:    scratch_load_u16 v117, off, s32 offset:68
+; GFX11-NEXT:    scratch_load_u16 v119, off, s32 offset:60
+; GFX11-NEXT:    scratch_load_u16 v114, off, s32 offset:52
+; GFX11-NEXT:    scratch_load_u16 v116, off, s32 offset:44
+; GFX11-NEXT:    scratch_load_u16 v100, off, s32 offset:36
+; GFX11-NEXT:    scratch_load_u16 v103, off, s32 offset:28
+; GFX11-NEXT:    scratch_load_u16 v98, off, s32 offset:20
+; GFX11-NEXT:    scratch_load_u16 v99, off, s32 offset:12
+; GFX11-NEXT:    scratch_load_u16 v96, off, s32 offset:4
+; GFX11-NEXT:    v_lshlrev_b32_e32 v64, 8, v1
+; GFX11-NEXT:    v_lshlrev_b32_e32 v66, 8, v3
+; GFX11-NEXT:    v_lshlrev_b32_e32 v65, 8, v5
+; GFX11-NEXT:    v_lshlrev_b32_e32 v68, 8, v7
+; GFX11-NEXT:    v_lshlrev_b32_e32 v67, 8, v9
+; GFX11-NEXT:    v_lshlrev_b32_e32 v70, 8, v11
+; GFX11-NEXT:    v_lshlrev_b32_e32 v69, 8, v13
+; GFX11-NEXT:    v_lshlrev_b32_e32 v71, 8, v15
+; GFX11-NEXT:    v_lshlrev_b32_e32 v80, 8, v17
+; GFX11-NEXT:    v_lshlrev_b32_e32 v82, 8, v19
+; GFX11-NEXT:    v_lshlrev_b32_e32 v81, 8, v21
+; GFX11-NEXT:    v_lshlrev_b32_e32 v84, 8, v23
+; GFX11-NEXT:    v_lshlrev_b32_e32 v83, 8, v25
+; GFX11-NEXT:    v_lshlrev_b32_e32 v86, 8, v27
+; GFX11-NEXT:    v_lshlrev_b32_e32 v85, 8, v29
+; GFX11-NEXT:    s_mov_b32 s4, 0
+; GFX11-NEXT:    s_waitcnt vmcnt(62)
+; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v97, 8, v2
+; GFX11-NEXT:    v_lshlrev_b32_e32 v87, 8, v4
+; GFX11-NEXT:    v_lshlrev_b32_e32 v101, 8, v6
+; GFX11-NEXT:    v_lshlrev_b32_e32 v102, 8, v8
+; GFX11-NEXT:    v_lshlrev_b32_e32 v113, 8, v10
+; GFX11-NEXT:    v_lshlrev_b32_e32 v112, 8, v12
+; GFX11-NEXT:    v_lshlrev_b32_e32 v130, 8, v14
+; GFX11-NEXT:    v_lshlrev_b32_e32 v128, 8, v16
+; GFX11-NEXT:    v_lshlrev_b32_e32 v134, 8, v18
+; GFX11-NEXT:    v_lshlrev_b32_e32 v132, 8, v20
+; GFX11-NEXT:    v_lshlrev_b32_e32 v161, 8, v22
+; GFX11-NEXT:    v_lshlrev_b32_e32 v147, 8, v24
+; GFX11-NEXT:    v_lshlrev_b32_e32 v166, 8, v26
+; GFX11-NEXT:    v_lshlrev_b32_e32 v167, 8, v28
+; GFX11-NEXT:    v_lshlrev_b32_e32 v180, 8, v30
+; GFX11-NEXT:    v_lshlrev_b32_e32 v177, 8, v31
+; GFX11-NEXT:    v_lshlrev_b32_e32 v42, 8, v41
+; GFX11-NEXT:    s_waitcnt vmcnt(61)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v41, 8, v44
+; GFX11-NEXT:    s_waitcnt vmcnt(60)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v45, 8, v45
+; GFX11-NEXT:    s_waitcnt vmcnt(59)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v44, 8, v56
+; GFX11-NEXT:    s_waitcnt vmcnt(58)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v59, 8, v59
+; GFX11-NEXT:    s_waitcnt vmcnt(57)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v56, 8, v60
+; GFX11-NEXT:    s_waitcnt vmcnt(56)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v60, 8, v61
+; GFX11-NEXT:    s_waitcnt vmcnt(55)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v61, 8, v62
+; GFX11-NEXT:    s_waitcnt vmcnt(54)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v63, 8, v63
+; GFX11-NEXT:    s_waitcnt vmcnt(53)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v62, 8, v72
+; GFX11-NEXT:    s_waitcnt vmcnt(52)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v73, 8, v73
+; GFX11-NEXT:    s_waitcnt vmcnt(51)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v72, 8, v74
+; GFX11-NEXT:    s_waitcnt vmcnt(50)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v75, 8, v75
+; GFX11-NEXT:    s_waitcnt vmcnt(49)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v74, 8, v76
+; GFX11-NEXT:    s_waitcnt vmcnt(48)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v77, 8, v77
+; GFX11-NEXT:    s_waitcnt vmcnt(47)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v76, 8, v78
+; GFX11-NEXT:    s_waitcnt vmcnt(46)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v78, 8, v79
+; GFX11-NEXT:    s_waitcnt vmcnt(45)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v79, 8, v88
+; GFX11-NEXT:    s_waitcnt vmcnt(44)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v89, 8, v89
+; GFX11-NEXT:    s_waitcnt vmcnt(43)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v88, 8, v90
+; GFX11-NEXT:    s_waitcnt vmcnt(42)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v91, 8, v91
+; GFX11-NEXT:    s_waitcnt vmcnt(41)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v90, 8, v92
+; GFX11-NEXT:    s_waitcnt vmcnt(40)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v92, 8, v93
+; GFX11-NEXT:    s_waitcnt vmcnt(39)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v93, 8, v94
+; GFX11-NEXT:    s_and_b32 s5, vcc_lo, exec_lo
+; GFX11-NEXT:    s_cbranch_scc0 .LBB89_4
+; GFX11-NEXT:  ; %bb.1: ; %cmp.false
+; GFX11-NEXT:    s_and_b32 s5, s28, 0xff
+; GFX11-NEXT:    s_lshl_b32 s6, s29, 8
+; GFX11-NEXT:    s_and_b32 s7, s2, 0xff
+; GFX11-NEXT:    s_or_b32 s5, s5, s6
+; GFX11-NEXT:    s_lshl_b32 s6, s1, 8
+; GFX11-NEXT:    v_and_b32_e64 v5, 0xffff, s5
+; GFX11-NEXT:    s_and_b32 s5, s0, 0xff
+; GFX11-NEXT:    s_lshl_b32 s8, s3, 8
+; GFX11-NEXT:    s_or_b32 s5, s5, s6
+; GFX11-NEXT:    s_or_b32 s6, s7, s8
+; GFX11-NEXT:    s_and_b32 s7, s16, 0xff
+; GFX11-NEXT:    s_lshl_b32 s8, s17, 8
+; GFX11-NEXT:    s_and_b32 s9, s18, 0xff
+; GFX11-NEXT:    s_lshl_b32 s10, s19, 8
+; GFX11-NEXT:    s_or_b32 s7, s7, s8
+; GFX11-NEXT:    s_or_b32 s8, s9, s10
+; GFX11-NEXT:    s_pack_ll_b32_b16 s5, s5, s6
+; GFX11-NEXT:    s_pack_ll_b32_b16 s6, s7, s8
+; GFX11-NEXT:    s_and_b32 s7, s20, 0xff
+; GFX11-NEXT:    s_lshl_b32 s8, s21, 8
+; GFX11-NEXT:    s_and_b32 s9, s22, 0xff
+; GFX11-NEXT:    s_lshl_b32 s10, s23, 8
+; GFX11-NEXT:    s_or_b32 s7, s7, s8
+; GFX11-NEXT:    s_or_b32 s8, s9, s10
+; GFX11-NEXT:    s_and_b32 s9, s24, 0xff
+; GFX11-NEXT:    s_lshl_b32 s10, s25, 8
+; GFX11-NEXT:    s_and_b32 s11, s26, 0xff
+; GFX11-NEXT:    s_lshl_b32 s12, s27, 8
+; GFX11-NEXT:    s_or_b32 s9, s9, s10
+; GFX11-NEXT:    s_or_b32 s10, s11, s12
+; GFX11-NEXT:    s_pack_ll_b32_b16 s7, s7, s8
+; GFX11-NEXT:    s_pack_ll_b32_b16 s8, s9, s10
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v36
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v32
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v35
+; GFX11-NEXT:    v_and_b32_e32 v4, 0xff, v34
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v33
+; GFX11-NEXT:    v_or_b32_e32 v2, v2, v68
+; GFX11-NEXT:    v_or_b32_e32 v0, v0, v64
+; GFX11-NEXT:    v_or_b32_e32 v1, v1, v66
+; GFX11-NEXT:    v_or_b32_e32 v6, v4, v67
+; GFX11-NEXT:    v_or_b32_e32 v3, v3, v65
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX11-NEXT:    v_lshl_or_b32 v4, v0, 16, v5
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v38
+; GFX11-NEXT:    v_and_b32_e32 v7, 0xff, v39
+; GFX11-NEXT:    v_lshl_or_b32 v6, v6, 16, v2
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v49
+; GFX11-NEXT:    v_lshl_or_b32 v5, v3, 16, v1
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v37
+; GFX11-NEXT:    v_or_b32_e32 v0, v0, v70
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v50
+; GFX11-NEXT:    v_or_b32_e32 v2, v2, v71
+; GFX11-NEXT:    v_and_b32_e32 v8, 0xff, v48
+; GFX11-NEXT:    v_or_b32_e32 v1, v1, v69
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    v_or_b32_e32 v3, v3, v82
+; GFX11-NEXT:    v_or_b32_e32 v9, v7, v80
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX11-NEXT:    v_or_b32_e32 v10, v8, v81
+; GFX11-NEXT:    v_lshl_or_b32 v7, v1, 16, v0
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff, v3
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v53
+; GFX11-NEXT:    v_lshl_or_b32 v8, v9, 16, v2
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v55
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v51
+; GFX11-NEXT:    v_lshl_or_b32 v9, v10, 16, v3
+; GFX11-NEXT:    v_or_b32_e32 v0, v0, v84
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v52
+; GFX11-NEXT:    v_and_b32_e32 v10, 0xff, v54
+; GFX11-NEXT:    v_or_b32_e32 v2, v2, v86
+; GFX11-NEXT:    v_or_b32_e32 v1, v1, v83
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_and_b32_e32 v11, 0xff, v96
+; GFX11-NEXT:    v_or_b32_e32 v3, v3, v85
+; GFX11-NEXT:    v_or_b32_e32 v12, v10, v97
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX11-NEXT:    v_lshl_or_b32 v10, v1, 16, v0
+; GFX11-NEXT:    v_or_b32_e32 v0, v11, v87
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v99
+; GFX11-NEXT:    v_and_b32_e32 v12, 0xffff, v12
+; GFX11-NEXT:    v_lshl_or_b32 v11, v3, 16, v2
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v103
+; GFX11-NEXT:    v_and_b32_e32 v14, 0xff, v114
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v98
+; GFX11-NEXT:    v_lshl_or_b32 v12, v0, 16, v12
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v100
+; GFX11-NEXT:    v_or_b32_e32 v3, v3, v113
+; GFX11-NEXT:    v_or_b32_e32 v1, v1, v101
+; GFX11-NEXT:    v_and_b32_e32 v13, 0xff, v116
+; GFX11-NEXT:    v_or_b32_e32 v17, v14, v128
+; GFX11-NEXT:    v_or_b32_e32 v0, v0, v112
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff, v3
+; GFX11-NEXT:    v_and_b32_e32 v14, 0xff, v117
+; GFX11-NEXT:    v_or_b32_e32 v2, v2, v102
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX11-NEXT:    v_or_b32_e32 v13, v13, v130
+; GFX11-NEXT:    v_and_b32_e32 v16, 0xff, v133
+; GFX11-NEXT:    v_or_b32_e32 v20, v14, v132
+; GFX11-NEXT:    v_lshl_or_b32 v14, v0, 16, v3
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v148
+; GFX11-NEXT:    v_and_b32_e32 v15, 0xff, v119
+; GFX11-NEXT:    v_and_b32_e32 v18, 0xff, v129
+; GFX11-NEXT:    v_or_b32_e32 v16, v16, v161
+; GFX11-NEXT:    v_and_b32_e32 v19, 0xffff, v13
+; GFX11-NEXT:    v_lshl_or_b32 v13, v2, 16, v1
+; GFX11-NEXT:    v_or_b32_e32 v0, v0, v166
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v144
+; GFX11-NEXT:    v_or_b32_e32 v15, v15, v134
+; GFX11-NEXT:    v_or_b32_e32 v18, v18, v147
+; GFX11-NEXT:    v_and_b32_e32 v22, 0xffff, v16
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    v_or_b32_e32 v1, v1, v167
+; GFX11-NEXT:    v_and_b32_e32 v21, 0xffff, v15
+; GFX11-NEXT:    v_lshl_or_b32 v15, v17, 16, v19
+; GFX11-NEXT:    v_lshl_or_b32 v17, v18, 16, v22
+; GFX11-NEXT:    v_mov_b32_e32 v2, s7
+; GFX11-NEXT:    v_lshl_or_b32 v18, v1, 16, v0
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v151
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v149
+; GFX11-NEXT:    v_lshl_or_b32 v16, v20, 16, v21
+; GFX11-NEXT:    v_mov_b32_e32 v3, s8
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_or_b32_e32 v0, v0, v180
+; GFX11-NEXT:    v_or_b32_e32 v1, v1, v177
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    v_lshl_or_b32 v19, v1, 16, v0
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v165
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v162
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_or_b32_e32 v0, v0, v42
+; GFX11-NEXT:    v_or_b32_e32 v1, v1, v41
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    v_lshl_or_b32 v20, v1, 16, v0
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v179
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v115
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_or_b32_e32 v0, v0, v45
+; GFX11-NEXT:    v_or_b32_e32 v1, v1, v44
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    v_lshl_or_b32 v21, v1, 16, v0
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v131
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v118
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_or_b32_e32 v0, v0, v59
+; GFX11-NEXT:    v_or_b32_e32 v1, v1, v56
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    v_lshl_or_b32 v22, v1, 16, v0
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v145
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v135
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_or_b32_e32 v0, v0, v60
+; GFX11-NEXT:    v_or_b32_e32 v1, v1, v61
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    v_lshl_or_b32 v23, v1, 16, v0
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v150
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v146
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_or_b32_e32 v0, v0, v63
+; GFX11-NEXT:    v_or_b32_e32 v1, v1, v62
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    v_lshl_or_b32 v24, v1, 16, v0
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v163
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v160
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_or_b32_e32 v0, v0, v73
+; GFX11-NEXT:    v_or_b32_e32 v1, v1, v72
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    v_lshl_or_b32 v25, v1, 16, v0
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v176
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v164
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_or_b32_e32 v0, v0, v75
+; GFX11-NEXT:    v_or_b32_e32 v1, v1, v74
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    v_lshl_or_b32 v26, v1, 16, v0
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v181
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v178
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_or_b32_e32 v0, v0, v77
+; GFX11-NEXT:    v_or_b32_e32 v1, v1, v76
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    v_lshl_or_b32 v27, v1, 16, v0
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v183
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v182
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_or_b32_e32 v0, v0, v78
+; GFX11-NEXT:    v_or_b32_e32 v1, v1, v79
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    v_lshl_or_b32 v28, v1, 16, v0
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v43
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v40
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_or_b32_e32 v0, v0, v89
+; GFX11-NEXT:    v_or_b32_e32 v1, v1, v88
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    v_lshl_or_b32 v29, v1, 16, v0
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v47
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v46
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_or_b32_e32 v0, v0, v91
+; GFX11-NEXT:    v_or_b32_e32 v1, v1, v90
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    v_lshl_or_b32 v30, v1, 16, v0
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v58
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v57
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_or_b32_e32 v0, v0, v92
+; GFX11-NEXT:    v_or_b32_e32 v1, v1, v93
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    v_lshl_or_b32 v31, v1, 16, v0
+; GFX11-NEXT:    v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, s6
+; GFX11-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s4
+; GFX11-NEXT:    s_cbranch_vccnz .LBB89_3
+; GFX11-NEXT:  .LBB89_2: ; %cmp.true
+; GFX11-NEXT:    s_add_i32 s28, s28, 3
+; GFX11-NEXT:    s_lshl_b32 s5, s29, 8
+; GFX11-NEXT:    s_and_b32 s4, s28, 0xff
+; GFX11-NEXT:    s_add_i32 s24, s24, 3
+; GFX11-NEXT:    s_or_b32 s4, s5, s4
+; GFX11-NEXT:    s_and_b32 s5, s24, 0xff
+; GFX11-NEXT:    s_lshl_b32 s6, s25, 8
+; GFX11-NEXT:    s_add_i32 s26, s26, 3
+; GFX11-NEXT:    s_or_b32 s5, s6, s5
+; GFX11-NEXT:    s_and_b32 s6, s26, 0xff
+; GFX11-NEXT:    s_lshl_b32 s7, s27, 8
+; GFX11-NEXT:    s_add_i32 s20, s20, 3
+; GFX11-NEXT:    s_or_b32 s6, s7, s6
+; GFX11-NEXT:    s_and_b32 s7, s20, 0xff
+; GFX11-NEXT:    s_lshl_b32 s8, s21, 8
+; GFX11-NEXT:    s_add_i32 s22, s22, 3
+; GFX11-NEXT:    s_or_b32 s7, s8, s7
+; GFX11-NEXT:    s_and_b32 s8, s22, 0xff
+; GFX11-NEXT:    s_lshl_b32 s9, s23, 8
+; GFX11-NEXT:    s_add_i32 s16, s16, 3
+; GFX11-NEXT:    s_or_b32 s8, s9, s8
+; GFX11-NEXT:    s_and_b32 s9, s16, 0xff
+; GFX11-NEXT:    s_lshl_b32 s10, s17, 8
+; GFX11-NEXT:    s_add_i32 s18, s18, 3
+; GFX11-NEXT:    s_add_i32 s0, s0, 3
+; GFX11-NEXT:    s_add_i32 s2, s2, 3
+; GFX11-NEXT:    s_or_b32 s9, s10, s9
+; GFX11-NEXT:    s_and_b32 s10, s18, 0xff
+; GFX11-NEXT:    s_lshl_b32 s11, s19, 8
+; GFX11-NEXT:    s_and_b32 s0, s0, 0xff
+; GFX11-NEXT:    s_lshl_b32 s1, s1, 8
+; GFX11-NEXT:    s_and_b32 s2, s2, 0xff
+; GFX11-NEXT:    s_lshl_b32 s3, s3, 8
+; GFX11-NEXT:    s_or_b32 s10, s11, s10
+; GFX11-NEXT:    s_or_b32 s0, s1, s0
+; GFX11-NEXT:    s_or_b32 s1, s3, s2
+; GFX11-NEXT:    s_addk_i32 s9, 0x300
+; GFX11-NEXT:    s_addk_i32 s0, 0x300
+; GFX11-NEXT:    s_addk_i32 s1, 0x300
+; GFX11-NEXT:    s_addk_i32 s10, 0x300
+; GFX11-NEXT:    s_pack_ll_b32_b16 s0, s0, s1
+; GFX11-NEXT:    s_pack_ll_b32_b16 s1, s9, s10
+; GFX11-NEXT:    s_waitcnt vmcnt(37)
+; GFX11-NEXT:    v_add_nc_u32_e32 v0, 3, v58
+; GFX11-NEXT:    s_addk_i32 s5, 0x300
+; GFX11-NEXT:    s_addk_i32 s6, 0x300
+; GFX11-NEXT:    v_add_nc_u32_e32 v1, 3, v57
+; GFX11-NEXT:    s_pack_ll_b32_b16 s3, s5, s6
+; GFX11-NEXT:    s_waitcnt vmcnt(35)
+; GFX11-NEXT:    v_add_nc_u32_e32 v2, 3, v47
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX11-NEXT:    s_addk_i32 s7, 0x300
+; GFX11-NEXT:    s_addk_i32 s8, 0x300
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GFX11-NEXT:    v_or_b32_e32 v0, v92, v0
+; GFX11-NEXT:    s_pack_ll_b32_b16 s2, s7, s8
+; GFX11-NEXT:    v_add_nc_u32_e32 v3, 3, v46
+; GFX11-NEXT:    v_or_b32_e32 v1, v93, v1
+; GFX11-NEXT:    v_or_b32_e32 v2, v91, v2
+; GFX11-NEXT:    s_waitcnt vmcnt(33)
+; GFX11-NEXT:    v_add_nc_u32_e32 v4, 3, v43
+; GFX11-NEXT:    v_add_nc_u32_e32 v27, 0x300, v0
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v3
+; GFX11-NEXT:    v_add_nc_u32_e32 v0, 3, v40
+; GFX11-NEXT:    v_add_nc_u32_e32 v31, 0x300, v1
+; GFX11-NEXT:    v_add_nc_u32_e32 v28, 0x300, v2
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v4
+; GFX11-NEXT:    v_or_b32_e32 v3, v90, v3
+; GFX11-NEXT:    s_waitcnt vmcnt(31)
+; GFX11-NEXT:    v_add_nc_u32_e32 v2, 3, v183
+; GFX11-NEXT:    v_add_nc_u32_e32 v4, 3, v182
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX11-NEXT:    v_or_b32_e32 v1, v89, v1
+; GFX11-NEXT:    v_add_nc_u32_e32 v30, 0x300, v3
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v4
+; GFX11-NEXT:    s_waitcnt vmcnt(29)
+; GFX11-NEXT:    v_add_nc_u32_e32 v4, 3, v181
+; GFX11-NEXT:    v_or_b32_e32 v0, v88, v0
+; GFX11-NEXT:    v_add_nc_u32_e32 v181, 0x300, v1
+; GFX11-NEXT:    v_or_b32_e32 v1, v78, v2
+; GFX11-NEXT:    v_or_b32_e32 v2, v79, v3
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v4
+; GFX11-NEXT:    v_add_nc_u32_e32 v29, 0x300, v0
+; GFX11-NEXT:    v_add_nc_u32_e32 v0, 3, v178
+; GFX11-NEXT:    v_add_nc_u32_e32 v182, 0x300, v1
+; GFX11-NEXT:    v_add_nc_u32_e32 v178, 0x300, v2
+; GFX11-NEXT:    v_or_b32_e32 v1, v77, v3
+; GFX11-NEXT:    s_waitcnt vmcnt(27)
+; GFX11-NEXT:    v_add_nc_u32_e32 v2, 3, v176
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX11-NEXT:    v_add_nc_u32_e32 v3, 3, v164
+; GFX11-NEXT:    s_waitcnt vmcnt(25)
+; GFX11-NEXT:    v_add_nc_u32_e32 v4, 3, v163
+; GFX11-NEXT:    v_add_nc_u32_e32 v163, 0x300, v1
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v2
+; GFX11-NEXT:    v_or_b32_e32 v0, v76, v0
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v3
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v4
+; GFX11-NEXT:    v_add_nc_u32_e32 v4, 3, v160
+; GFX11-NEXT:    v_or_b32_e32 v1, v75, v1
+; GFX11-NEXT:    v_add_nc_u32_e32 v160, 0x300, v0
+; GFX11-NEXT:    v_or_b32_e32 v0, v74, v2
+; GFX11-NEXT:    v_or_b32_e32 v2, v73, v3
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v4
+; GFX11-NEXT:    v_add_nc_u32_e32 v22, 0x300, v1
+; GFX11-NEXT:    s_waitcnt vmcnt(23)
+; GFX11-NEXT:    v_add_nc_u32_e32 v1, 3, v150
+; GFX11-NEXT:    v_add_nc_u32_e32 v26, 0x300, v0
+; GFX11-NEXT:    v_add_nc_u32_e32 v23, 0x300, v2
+; GFX11-NEXT:    v_or_b32_e32 v0, v72, v3
+; GFX11-NEXT:    v_add_nc_u32_e32 v2, 3, v146
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX11-NEXT:    s_waitcnt vmcnt(21)
+; GFX11-NEXT:    v_add_nc_u32_e32 v3, 3, v145
+; GFX11-NEXT:    v_add_nc_u32_e32 v4, 3, v135
+; GFX11-NEXT:    v_add_nc_u32_e32 v25, 0x300, v0
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v2
+; GFX11-NEXT:    v_or_b32_e32 v1, v63, v1
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v3
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v4
+; GFX11-NEXT:    s_waitcnt vmcnt(19)
+; GFX11-NEXT:    v_add_nc_u32_e32 v4, 3, v131
+; GFX11-NEXT:    v_or_b32_e32 v0, v62, v0
+; GFX11-NEXT:    v_add_nc_u32_e32 v131, 0x300, v1
+; GFX11-NEXT:    v_or_b32_e32 v1, v60, v2
+; GFX11-NEXT:    v_or_b32_e32 v2, v61, v3
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v4
+; GFX11-NEXT:    v_add_nc_u32_e32 v24, 0x300, v0
+; GFX11-NEXT:    v_add_nc_u32_e32 v0, 3, v118
+; GFX11-NEXT:    v_add_nc_u32_e32 v135, 0x300, v1
+; GFX11-NEXT:    v_add_nc_u32_e32 v118, 0x300, v2
+; GFX11-NEXT:    v_or_b32_e32 v1, v59, v3
+; GFX11-NEXT:    s_waitcnt vmcnt(17)
+; GFX11-NEXT:    v_add_nc_u32_e32 v2, 3, v179
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX11-NEXT:    v_add_nc_u32_e32 v3, 3, v115
+; GFX11-NEXT:    s_waitcnt vmcnt(15)
+; GFX11-NEXT:    v_add_nc_u32_e32 v4, 3, v165
+; GFX11-NEXT:    v_add_nc_u32_e32 v115, 0x300, v1
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v2
+; GFX11-NEXT:    v_or_b32_e32 v0, v56, v0
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v3
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v4
+; GFX11-NEXT:    v_add_nc_u32_e32 v4, 3, v162
+; GFX11-NEXT:    v_or_b32_e32 v1, v45, v1
+; GFX11-NEXT:    v_add_nc_u32_e32 v145, 0x300, v0
+; GFX11-NEXT:    v_or_b32_e32 v0, v44, v2
+; GFX11-NEXT:    v_or_b32_e32 v2, v42, v3
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v4
+; GFX11-NEXT:    v_add_nc_u32_e32 v17, 0x300, v1
+; GFX11-NEXT:    s_waitcnt vmcnt(13)
+; GFX11-NEXT:    v_add_nc_u32_e32 v1, 3, v151
+; GFX11-NEXT:    v_add_nc_u32_e32 v21, 0x300, v0
+; GFX11-NEXT:    v_add_nc_u32_e32 v18, 0x300, v2
+; GFX11-NEXT:    v_or_b32_e32 v0, v41, v3
+; GFX11-NEXT:    v_add_nc_u32_e32 v2, 3, v149
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX11-NEXT:    s_waitcnt vmcnt(11)
+; GFX11-NEXT:    v_add_nc_u32_e32 v3, 3, v148
+; GFX11-NEXT:    v_add_nc_u32_e32 v4, 3, v144
+; GFX11-NEXT:    v_add_nc_u32_e32 v20, 0x300, v0
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v2
+; GFX11-NEXT:    v_or_b32_e32 v1, v180, v1
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v3
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v4
+; GFX11-NEXT:    s_waitcnt vmcnt(9)
+; GFX11-NEXT:    v_add_nc_u32_e32 v4, 3, v133
+; GFX11-NEXT:    v_or_b32_e32 v0, v177, v0
+; GFX11-NEXT:    v_add_nc_u32_e32 v19, 0x300, v1
+; GFX11-NEXT:    v_or_b32_e32 v1, v166, v2
+; GFX11-NEXT:    v_or_b32_e32 v2, v167, v3
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v4
+; GFX11-NEXT:    v_add_nc_u32_e32 v133, 0x300, v0
+; GFX11-NEXT:    v_add_nc_u32_e32 v0, 3, v129
+; GFX11-NEXT:    v_add_nc_u32_e32 v129, 0x300, v1
+; GFX11-NEXT:    v_add_nc_u32_e32 v144, 0x300, v2
+; GFX11-NEXT:    v_or_b32_e32 v1, v161, v3
+; GFX11-NEXT:    s_waitcnt vmcnt(7)
+; GFX11-NEXT:    v_add_nc_u32_e32 v2, 3, v119
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX11-NEXT:    v_add_nc_u32_e32 v3, 3, v117
+; GFX11-NEXT:    s_waitcnt vmcnt(5)
+; GFX11-NEXT:    v_add_nc_u32_e32 v4, 3, v116
+; GFX11-NEXT:    v_add_nc_u32_e32 v116, 0x300, v1
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v2
+; GFX11-NEXT:    v_or_b32_e32 v0, v147, v0
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v3
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v4
+; GFX11-NEXT:    v_add_nc_u32_e32 v4, 3, v114
+; GFX11-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-NEXT:    v_add_nc_u32_e32 v5, 3, v99
+; GFX11-NEXT:    v_add_nc_u32_e32 v114, 0x300, v0
+; GFX11-NEXT:    v_or_b32_e32 v0, v132, v2
+; GFX11-NEXT:    v_or_b32_e32 v2, v130, v3
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v4
+; GFX11-NEXT:    v_add_nc_u32_e32 v4, 3, v103
+; GFX11-NEXT:    v_add_nc_u32_e32 v6, 3, v98
+; GFX11-NEXT:    v_and_b32_e32 v5, 0xff, v5
+; GFX11-NEXT:    v_add_nc_u32_e32 v7, 3, v54
+; GFX11-NEXT:    v_add_nc_u32_e32 v8, 3, v53
+; GFX11-NEXT:    v_and_b32_e32 v4, 0xff, v4
+; GFX11-NEXT:    v_and_b32_e32 v6, 0xff, v6
+; GFX11-NEXT:    v_add_nc_u32_e32 v9, 3, v39
+; GFX11-NEXT:    v_add_nc_u32_e32 v33, 3, v33
+; GFX11-NEXT:    v_add_nc_u32_e32 v16, 0x300, v0
+; GFX11-NEXT:    v_or_b32_e32 v4, v113, v4
+; GFX11-NEXT:    v_add_nc_u32_e32 v0, 0x300, v2
+; GFX11-NEXT:    v_or_b32_e32 v2, v128, v3
+; GFX11-NEXT:    v_add_nc_u32_e32 v3, 3, v100
+; GFX11-NEXT:    s_addk_i32 s4, 0x300
+; GFX11-NEXT:    v_add_nc_u32_e32 v11, 0x300, v4
+; GFX11-NEXT:    v_or_b32_e32 v4, v101, v5
+; GFX11-NEXT:    v_or_b32_e32 v5, v102, v6
+; GFX11-NEXT:    v_and_b32_e32 v6, 0xff, v7
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_add_nc_u32_e32 v7, 3, v96
+; GFX11-NEXT:    v_or_b32_e32 v1, v134, v1
+; GFX11-NEXT:    v_add_nc_u32_e32 v12, 0x300, v4
+; GFX11-NEXT:    v_add_nc_u32_e32 v13, 0x300, v5
+; GFX11-NEXT:    v_or_b32_e32 v4, v97, v6
+; GFX11-NEXT:    v_add_nc_u32_e32 v5, 3, v55
+; GFX11-NEXT:    v_and_b32_e32 v6, 0xff, v7
+; GFX11-NEXT:    v_add_nc_u32_e32 v7, 3, v52
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v3
+; GFX11-NEXT:    v_add_nc_u32_e32 v14, 0x300, v4
+; GFX11-NEXT:    v_and_b32_e32 v4, 0xff, v5
+; GFX11-NEXT:    v_or_b32_e32 v5, v87, v6
+; GFX11-NEXT:    v_and_b32_e32 v6, 0xff, v7
+; GFX11-NEXT:    v_and_b32_e32 v7, 0xff, v8
+; GFX11-NEXT:    v_add_nc_u32_e32 v8, 3, v51
+; GFX11-NEXT:    v_or_b32_e32 v4, v86, v4
+; GFX11-NEXT:    v_add_nc_u32_e32 v15, 0x300, v5
+; GFX11-NEXT:    v_or_b32_e32 v5, v85, v6
+; GFX11-NEXT:    v_or_b32_e32 v6, v84, v7
+; GFX11-NEXT:    v_and_b32_e32 v7, 0xff, v8
+; GFX11-NEXT:    v_add_nc_u32_e32 v51, 0x300, v4
+; GFX11-NEXT:    v_add_nc_u32_e32 v4, 3, v50
+; GFX11-NEXT:    v_add_nc_u32_e32 v50, 0x300, v5
+; GFX11-NEXT:    v_add_nc_u32_e32 v8, 3, v49
+; GFX11-NEXT:    v_or_b32_e32 v5, v83, v7
+; GFX11-NEXT:    v_add_nc_u32_e32 v7, 3, v48
+; GFX11-NEXT:    v_and_b32_e32 v4, 0xff, v4
+; GFX11-NEXT:    v_add_nc_u32_e32 v6, 0x300, v6
+; GFX11-NEXT:    v_add_nc_u32_e32 v1, 0x300, v1
+; GFX11-NEXT:    v_add_nc_u32_e32 v10, 0x300, v5
+; GFX11-NEXT:    v_and_b32_e32 v5, 0xff, v7
+; GFX11-NEXT:    v_or_b32_e32 v4, v82, v4
+; GFX11-NEXT:    v_and_b32_e32 v7, 0xff, v8
+; GFX11-NEXT:    v_and_b32_e32 v8, 0xff, v9
+; GFX11-NEXT:    v_add_nc_u32_e32 v9, 3, v38
+; GFX11-NEXT:    v_or_b32_e32 v5, v81, v5
+; GFX11-NEXT:    v_add_nc_u32_e32 v38, 0x300, v4
+; GFX11-NEXT:    v_or_b32_e32 v4, v71, v7
+; GFX11-NEXT:    v_or_b32_e32 v7, v80, v8
+; GFX11-NEXT:    v_and_b32_e32 v8, 0xff, v9
+; GFX11-NEXT:    v_add_nc_u32_e32 v9, 0x300, v5
+; GFX11-NEXT:    v_add_nc_u32_e32 v5, 3, v37
+; GFX11-NEXT:    v_add_nc_u32_e32 v37, 0x300, v4
+; GFX11-NEXT:    v_add_nc_u32_e32 v39, 0x300, v7
+; GFX11-NEXT:    v_or_b32_e32 v4, v70, v8
+; GFX11-NEXT:    v_add_nc_u32_e32 v7, 3, v36
+; GFX11-NEXT:    v_and_b32_e32 v5, 0xff, v5
+; GFX11-NEXT:    v_add_nc_u32_e32 v8, 3, v34
+; GFX11-NEXT:    v_add_nc_u32_e32 v34, 3, v35
+; GFX11-NEXT:    v_add_nc_u32_e32 v35, 0x300, v4
+; GFX11-NEXT:    v_and_b32_e32 v4, 0xff, v7
+; GFX11-NEXT:    v_or_b32_e32 v5, v69, v5
+; GFX11-NEXT:    v_and_b32_e32 v7, 0xff, v8
+; GFX11-NEXT:    v_and_b32_e32 v8, 0xff, v34
+; GFX11-NEXT:    v_or_b32_e32 v3, v112, v3
+; GFX11-NEXT:    v_or_b32_e32 v4, v68, v4
+; GFX11-NEXT:    v_add_nc_u32_e32 v34, 0x300, v5
+; GFX11-NEXT:    v_or_b32_e32 v5, v67, v7
+; GFX11-NEXT:    v_or_b32_e32 v7, v66, v8
+; GFX11-NEXT:    v_and_b32_e32 v8, 0xff, v33
+; GFX11-NEXT:    v_add_nc_u32_e32 v33, 0x300, v4
+; GFX11-NEXT:    v_add_nc_u32_e32 v4, 3, v32
+; GFX11-NEXT:    v_add_nc_u32_e32 v32, 0x300, v5
+; GFX11-NEXT:    v_add_nc_u32_e32 v5, 0x300, v7
+; GFX11-NEXT:    v_or_b32_e32 v7, v65, v8
+; GFX11-NEXT:    v_and_b32_e64 v8, 0xffff, s4
+; GFX11-NEXT:    v_and_b32_e32 v4, 0xff, v4
+; GFX11-NEXT:    v_and_b32_e32 v36, 0xffff, v6
+; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff, v5
+; GFX11-NEXT:    v_add_nc_u32_e32 v7, 0x300, v7
+; GFX11-NEXT:    v_add_nc_u32_e32 v3, 0x300, v3
+; GFX11-NEXT:    v_or_b32_e32 v4, v64, v4
+; GFX11-NEXT:    v_and_b32_e32 v14, 0xffff, v14
+; GFX11-NEXT:    v_lshl_or_b32 v10, v10, 16, v36
+; GFX11-NEXT:    v_lshl_or_b32 v5, v7, 16, v5
+; GFX11-NEXT:    v_and_b32_e32 v7, 0xffff, v33
+; GFX11-NEXT:    v_add_nc_u32_e32 v4, 0x300, v4
+; GFX11-NEXT:    v_and_b32_e32 v33, 0xffff, v37
+; GFX11-NEXT:    v_and_b32_e32 v36, 0xffff, v22
+; GFX11-NEXT:    v_add_nc_u32_e32 v2, 0x300, v2
+; GFX11-NEXT:    v_lshl_or_b32 v6, v32, 16, v7
+; GFX11-NEXT:    v_lshl_or_b32 v4, v4, 16, v8
+; GFX11-NEXT:    v_and_b32_e32 v8, 0xffff, v35
+; GFX11-NEXT:    v_and_b32_e32 v32, 0xffff, v51
+; GFX11-NEXT:    v_and_b32_e32 v35, 0xffff, v38
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    v_and_b32_e32 v19, 0xffff, v19
+; GFX11-NEXT:    v_lshl_or_b32 v7, v34, 16, v8
+; GFX11-NEXT:    v_lshl_or_b32 v8, v39, 16, v33
+; GFX11-NEXT:    v_and_b32_e32 v33, 0xffff, v12
+; GFX11-NEXT:    v_and_b32_e32 v34, 0xffff, v11
+; GFX11-NEXT:    v_lshl_or_b32 v11, v50, 16, v32
+; GFX11-NEXT:    v_and_b32_e32 v32, 0xffff, v1
+; GFX11-NEXT:    v_lshl_or_b32 v9, v9, 16, v35
+; GFX11-NEXT:    v_lshl_or_b32 v12, v15, 16, v14
+; GFX11-NEXT:    v_lshl_or_b32 v13, v13, 16, v33
+; GFX11-NEXT:    v_lshl_or_b32 v14, v3, 16, v34
+; GFX11-NEXT:    v_lshl_or_b32 v16, v16, 16, v32
+; GFX11-NEXT:    v_and_b32_e32 v32, 0xffff, v116
+; GFX11-NEXT:    v_and_b32_e32 v33, 0xffff, v129
+; GFX11-NEXT:    v_and_b32_e32 v34, 0xffff, v18
+; GFX11-NEXT:    v_and_b32_e32 v35, 0xffff, v17
+; GFX11-NEXT:    v_lshl_or_b32 v26, v26, 16, v36
+; GFX11-NEXT:    v_lshl_or_b32 v17, v114, 16, v32
+; GFX11-NEXT:    v_lshl_or_b32 v18, v144, 16, v33
+; GFX11-NEXT:    v_lshl_or_b32 v20, v20, 16, v34
+; GFX11-NEXT:    v_lshl_or_b32 v21, v21, 16, v35
+; GFX11-NEXT:    v_and_b32_e32 v32, 0xffff, v115
+; GFX11-NEXT:    v_and_b32_e32 v33, 0xffff, v135
+; GFX11-NEXT:    v_and_b32_e32 v34, 0xffff, v131
+; GFX11-NEXT:    v_and_b32_e32 v35, 0xffff, v23
+; GFX11-NEXT:    v_and_b32_e32 v36, 0xffff, v27
+; GFX11-NEXT:    v_lshl_or_b32 v22, v145, 16, v32
+; GFX11-NEXT:    v_lshl_or_b32 v23, v118, 16, v33
+; GFX11-NEXT:    v_lshl_or_b32 v24, v24, 16, v34
+; GFX11-NEXT:    v_lshl_or_b32 v25, v25, 16, v35
+; GFX11-NEXT:    v_and_b32_e32 v32, 0xffff, v163
+; GFX11-NEXT:    v_and_b32_e32 v33, 0xffff, v182
+; GFX11-NEXT:    v_and_b32_e32 v34, 0xffff, v181
+; GFX11-NEXT:    v_and_b32_e32 v35, 0xffff, v28
+; GFX11-NEXT:    v_lshl_or_b32 v15, v2, 16, v0
+; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-NEXT:    v_lshl_or_b32 v19, v133, 16, v19
+; GFX11-NEXT:    v_lshl_or_b32 v27, v160, 16, v32
+; GFX11-NEXT:    v_lshl_or_b32 v28, v178, 16, v33
+; GFX11-NEXT:    v_lshl_or_b32 v29, v29, 16, v34
+; GFX11-NEXT:    v_lshl_or_b32 v30, v30, 16, v35
+; GFX11-NEXT:    v_lshl_or_b32 v31, v31, 16, v36
+; GFX11-NEXT:  .LBB89_3: ; %end
+; GFX11-NEXT:    s_clause 0x1e
+; GFX11-NEXT:    scratch_load_b32 v94, off, s32 offset:320
+; GFX11-NEXT:    scratch_load_b32 v93, off, s32 offset:324
+; GFX11-NEXT:    scratch_load_b32 v92, off, s32 offset:328
+; GFX11-NEXT:    scratch_load_b32 v91, off, s32 offset:332
+; GFX11-NEXT:    scratch_load_b32 v90, off, s32 offset:336
+; GFX11-NEXT:    scratch_load_b32 v89, off, s32 offset:340
+; GFX11-NEXT:    scratch_load_b32 v88, off, s32 offset:344
+; GFX11-NEXT:    scratch_load_b32 v79, off, s32 offset:348
+; GFX11-NEXT:    scratch_load_b32 v78, off, s32 offset:352
+; GFX11-NEXT:    scratch_load_b32 v77, off, s32 offset:356
+; GFX11-NEXT:    scratch_load_b32 v76, off, s32 offset:360
+; GFX11-NEXT:    scratch_load_b32 v75, off, s32 offset:364
+; GFX11-NEXT:    scratch_load_b32 v74, off, s32 offset:368
+; GFX11-NEXT:    scratch_load_b32 v73, off, s32 offset:372
+; GFX11-NEXT:    scratch_load_b32 v72, off, s32 offset:376
+; GFX11-NEXT:    scratch_load_b32 v63, off, s32 offset:380
+; GFX11-NEXT:    scratch_load_b32 v62, off, s32 offset:384
+; GFX11-NEXT:    scratch_load_b32 v61, off, s32 offset:388
+; GFX11-NEXT:    scratch_load_b32 v60, off, s32 offset:392
+; GFX11-NEXT:    scratch_load_b32 v59, off, s32 offset:396
+; GFX11-NEXT:    scratch_load_b32 v58, off, s32 offset:400
+; GFX11-NEXT:    scratch_load_b32 v57, off, s32 offset:404
+; GFX11-NEXT:    scratch_load_b32 v56, off, s32 offset:408
+; GFX11-NEXT:    scratch_load_b32 v47, off, s32 offset:412
+; GFX11-NEXT:    scratch_load_b32 v46, off, s32 offset:416
+; GFX11-NEXT:    scratch_load_b32 v45, off, s32 offset:420
+; GFX11-NEXT:    scratch_load_b32 v44, off, s32 offset:424
+; GFX11-NEXT:    scratch_load_b32 v43, off, s32 offset:428
+; GFX11-NEXT:    scratch_load_b32 v42, off, s32 offset:432
+; GFX11-NEXT:    scratch_load_b32 v41, off, s32 offset:436
+; GFX11-NEXT:    scratch_load_b32 v40, off, s32 offset:440
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-NEXT:  .LBB89_4:
+; GFX11-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+; GFX11-NEXT:    ; implicit-def: $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+; GFX11-NEXT:    s_branch .LBB89_2
   %cmp = icmp eq i32 %b, 0
   br i1 %cmp, label %cmp.true, label %cmp.false
 
@@ -173615,766 +169385,814 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
 ; GFX11-TRUE16:       ; %bb.0:
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-TRUE16-NEXT:    s_clause 0x1f
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v31, off, s32 offset:384
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v39, off, s32 offset:380
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v31, off, s32 offset:376
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v50, off, s32 offset:372
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v150, off, s32 offset:368
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v39, off, s32 offset:364
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v150, off, s32 offset:360
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v50, off, s32 offset:356
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v147, off, s32 offset:352
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v37, off, s32 offset:348
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v149, off, s32 offset:344
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v49, off, s32 offset:340
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v149, off, s32 offset:336
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v38, off, s32 offset:332
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v148, off, s32 offset:328
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v48, off, s32 offset:324
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v145, off, s32 offset:320
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v34, off, s32 offset:316
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v148, off, s32 offset:312
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v48, off, s32 offset:308
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v147, off, s32 offset:304
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v36, off, s32 offset:300
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v146, off, s32 offset:296
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v38, off, s32 offset:292
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v146, off, s32 offset:288
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v36, off, s32 offset:284
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v145, off, s32 offset:280
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v37, off, s32 offset:276
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v134, off, s32 offset:272
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v33, off, s32 offset:268
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v144, off, s32 offset:264
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v35, off, s32 offset:260
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v40, s32 offset:520
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v41, s32 offset:516
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v42, s32 offset:512
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v43, s32 offset:508
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v44, s32 offset:504
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v45, s32 offset:500
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v46, s32 offset:496
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v47, s32 offset:492
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v56, s32 offset:488
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v57, s32 offset:484
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v58, s32 offset:480
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v59, s32 offset:476
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v60, s32 offset:472
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v61, s32 offset:468
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v62, s32 offset:464
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v63, s32 offset:460
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v72, s32 offset:456
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v73, s32 offset:452
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v74, s32 offset:448
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v75, s32 offset:444
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v76, s32 offset:440
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v77, s32 offset:436
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v78, s32 offset:432
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v79, s32 offset:428
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v88, s32 offset:424
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v89, s32 offset:420
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v90, s32 offset:416
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v91, s32 offset:412
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v92, s32 offset:408
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v93, s32 offset:404
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v94, s32 offset:400
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v95, s32 offset:396
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v104, s32 offset:392 ; 4-byte Folded Spill
 ; GFX11-TRUE16-NEXT:    s_clause 0x1f
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v144, off, s32 offset:256
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v33, off, s32 offset:252
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v135, off, s32 offset:248
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v35, off, s32 offset:244
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v132, off, s32 offset:240
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v32, off, s32 offset:236
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v135, off, s32 offset:232
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v34, off, s32 offset:228
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v134, off, s32 offset:224
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v32, off, s32 offset:220
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v133, off, s32 offset:216
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v160, off, s32 offset:388
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v101, off, s32
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v103, off, s32 offset:8
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v113, off, s32 offset:16
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v113, off, s32 offset:24
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v114, off, s32 offset:32
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v114, off, s32 offset:40
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v115, off, s32 offset:48
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v116, off, s32 offset:56
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v116, off, s32 offset:64
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v117, off, s32 offset:72
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v117, off, s32 offset:80
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v118, off, s32 offset:88
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v118, off, s32 offset:96
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v119, off, s32 offset:104
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v119, off, s32 offset:112
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v128, off, s32 offset:120
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v128, off, s32 offset:128
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v129, off, s32 offset:136
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v129, off, s32 offset:144
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v130, off, s32 offset:152
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v31, off, s32 offset:384
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v114, off, s32 offset:380
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v103, off, s32 offset:376
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v128, off, s32 offset:372
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v56, off, s32 offset:368
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v117, off, s32 offset:364
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v102, off, s32 offset:360
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v133, off, s32 offset:356
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v57, off, s32 offset:352
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v112, off, s32 offset:348
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v58, off, s32 offset:344
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v132, off, s32 offset:340
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v101, off, s32 offset:336
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v118, off, s32 offset:332
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v59, off, s32 offset:328
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v134, off, s32 offset:324
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v97, off, s32 offset:320
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v113, off, s32 offset:316
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v100, off, s32 offset:312
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v145, off, s32 offset:308
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v99, off, s32 offset:304
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v119, off, s32 offset:300
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v60, off, s32 offset:296
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v135, off, s32 offset:292
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v98, off, s32 offset:288
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v129, off, s32 offset:284
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v61, off, s32 offset:280
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v146, off, s32 offset:276
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v62, off, s32 offset:272
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v115, off, s32 offset:268
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v63, off, s32 offset:264
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v144, off, s32 offset:260
 ; GFX11-TRUE16-NEXT:    s_clause 0x1f
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v130, off, s32 offset:160
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v131, off, s32 offset:168
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v131, off, s32 offset:176
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v132, off, s32 offset:184
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v133, off, s32 offset:192
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v151, off, s32 offset:200
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v151, off, s32 offset:208
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v100, off, s32 offset:212
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v98, off, s32 offset:204
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v100, off, s32 offset:196
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v85, off, s32 offset:188
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v97, off, s32 offset:180
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v86, off, s32 offset:172
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v96, off, s32 offset:164
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v71, off, s32 offset:156
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v87, off, s32 offset:148
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v80, off, s32 offset:140
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v84, off, s32 offset:132
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v86, off, s32 offset:124
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v96, off, s32 offset:116
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v80, off, s32 offset:108
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v85, off, s32 offset:100
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v82, off, s32 offset:92
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v84, off, s32 offset:84
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v69, off, s32 offset:76
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v83, off, s32 offset:68
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v69, off, s32 offset:60
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v71, off, s32 offset:52
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v68, off, s32 offset:44
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v70, off, s32 offset:36
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v66, off, s32 offset:28
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v68, off, s32 offset:20
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v96, off, s32 offset:256
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v130, off, s32 offset:252
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v72, off, s32 offset:248
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v147, off, s32 offset:244
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v84, off, s32 offset:240
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v116, off, s32 offset:236
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v87, off, s32 offset:232
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v148, off, s32 offset:228
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v86, off, s32 offset:224
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v131, off, s32 offset:220
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v73, off, s32 offset:216
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v65, off, s32 offset:388
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v53, off, s32
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v66, off, s32 offset:8
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v67, off, s32 offset:16
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v68, off, s32 offset:24
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v69, off, s32 offset:32
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v70, off, s32 offset:40
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v71, off, s32 offset:48
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v80, off, s32 offset:56
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v81, off, s32 offset:64
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v82, off, s32 offset:72
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v83, off, s32 offset:80
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v85, off, s32 offset:88
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v74, off, s32 offset:96
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v75, off, s32 offset:104
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v76, off, s32 offset:112
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v77, off, s32 offset:120
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v78, off, s32 offset:128
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v79, off, s32 offset:136
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v88, off, s32 offset:144
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v89, off, s32 offset:152
+; GFX11-TRUE16-NEXT:    s_clause 0x1f
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v90, off, s32 offset:160
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v91, off, s32 offset:168
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v92, off, s32 offset:176
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v93, off, s32 offset:184
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v94, off, s32 offset:192
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v95, off, s32 offset:200
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v104, off, s32 offset:208
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v179, off, s32 offset:212
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v163, off, s32 offset:204
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v183, off, s32 offset:196
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v150, off, s32 offset:188
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v167, off, s32 offset:180
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v161, off, s32 offset:172
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v177, off, s32 offset:164
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v149, off, s32 offset:156
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v180, off, s32 offset:148
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v151, off, s32 offset:140
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v164, off, s32 offset:132
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v41, off, s32 offset:124
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v47, off, s32 offset:116
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v165, off, s32 offset:108
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v43, off, s32 offset:100
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v181, off, s32 offset:92
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v45, off, s32 offset:84
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v162, off, s32 offset:76
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v46, off, s32 offset:68
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v176, off, s32 offset:60
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v42, off, s32 offset:52
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v178, off, s32 offset:44
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v44, off, s32 offset:36
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v160, off, s32 offset:28
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v182, off, s32 offset:20
 ; GFX11-TRUE16-NEXT:    s_clause 0x1
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v66, off, s32 offset:12
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v67, off, s32 offset:4
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v54.l, v30.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v67.l, v28.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v64.h, v26.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v65.h, v24.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v64.l, v22.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v65.l, v20.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v52.l, v18.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v55.h, v16.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v53.h, v14.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v55.l, v12.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v49.h, v10.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v54.h, v8.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v51.l, v6.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v53.l, v4.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v51.h, v2.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v52.h, v0.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v81.l, 8, v1.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v81.h, 8, v3.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v82.h, 8, v5.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v83.l, 8, v7.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v97.h, 8, v9.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v70.l, 8, v11.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v98.h, 8, v13.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v99.l, 8, v15.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v99.h, 8, v17.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v87.l, 8, v19.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v101.h, 8, v21.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v102.l, 8, v23.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v102.h, 8, v25.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v103.l, 8, v27.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v112.l, 8, v29.l
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(62)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v150.h, 8, v150.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v150.l, 8, v150.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v147.h, 8, v147.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v149.h, 8, v149.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v149.l, 8, v149.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v148.h, 8, v148.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v145.l, 8, v145.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v148.l, 8, v148.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v147.l, 8, v147.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v146.h, 8, v146.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v146.l, 8, v146.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v145.h, 8, v145.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v134.h, 8, v134.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v144.h, 8, v144.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v144.l, 8, v144.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v135.h, 8, v135.h
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(61)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v132.l, 8, v132.l
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(59)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v135.l, 8, v135.l
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(57)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v134.l, 8, v134.l
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(55)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v133.h, 8, v133.h
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(54)
-; GFX11-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v160
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v166, off, s32 offset:12
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v40, off, s32 offset:4
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v35.h, v30.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v39.h, v28.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v38.l, v26.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v39.l, v24.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v37.h, v22.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v38.h, v20.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v33.h, v18.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v37.l, v16.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v35.l, v14.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v36.h, v12.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v32.l, v10.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v36.l, v8.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v32.h, v6.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v34.h, v4.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v33.l, v2.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v34.l, v0.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v48.h, 8, v1.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v49.l, 8, v3.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v49.h, 8, v5.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v50.l, 8, v7.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v51.l, 8, v9.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v48.l, 8, v11.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v51.h, 8, v13.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v52.l, 8, v15.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v52.h, 8, v17.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v50.h, 8, v19.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(53)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v53.h, 8, v21.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v54.l, 8, v23.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v54.h, 8, v25.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v55.l, 8, v27.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v64.l, 8, v29.l
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v103.h, 8, v31.l
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v103.l, 8, v103.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v102.h, 8, v56.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v102.l, 8, v102.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v99.h, 8, v57.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v101.h, 8, v58.l
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v101.l, 8, v101.l
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(52)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v112.h, 8, v103.h
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(51)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v113.l, 8, v113.l
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(50)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v113.h, 8, v113.h
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(49)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v103.h, 8, v114.l
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(48)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v115.l, 8, v114.h
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(47)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v115.h, 8, v115.h
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(46)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v116.l, 8, v116.l
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(45)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v116.h, 8, v116.h
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(44)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v117.l, 8, v117.l
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(43)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v114.l, 8, v117.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v100.h, 8, v59.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v97.l, 8, v97.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v100.l, 8, v100.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v99.l, 8, v99.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v98.h, 8, v60.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v98.l, 8, v98.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v97.h, 8, v61.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v86.h, 8, v62.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v96.h, 8, v63.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v96.l, 8, v96.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v87.h, 8, v72.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v84.l, 8, v84.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v87.l, 8, v87.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v86.l, 8, v86.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(42)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v118.l, 8, v118.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v85.h, 8, v73.l
+; GFX11-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v65
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v53.l, 8, v53.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v64.h, 8, v66.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v65.l, 8, v67.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v65.h, 8, v68.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v55.h, 8, v69.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v67.l, 8, v70.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v67.h, 8, v71.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v68.l, 8, v80.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v68.h, 8, v81.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v69.l, 8, v82.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v66.l, 8, v83.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v70.l, 8, v85.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(41)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v118.h, 8, v118.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v70.h, 8, v74.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(40)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v119.l, 8, v119.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v71.l, 8, v75.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(39)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v114.h, 8, v119.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v66.h, 8, v76.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(38)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v119.h, 8, v128.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v71.h, 8, v77.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(37)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v128.l, 8, v128.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v80.l, 8, v78.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(36)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v128.h, 8, v129.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v80.h, 8, v79.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(35)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v129.l, 8, v129.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v81.l, 8, v88.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(34)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v130.l, 8, v130.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v82.l, 8, v89.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(33)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v117.h, 8, v130.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v69.h, 8, v90.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(32)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v130.h, 8, v131.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v82.h, 8, v91.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(31)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v131.l, 8, v131.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v83.l, 8, v92.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(30)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v131.h, 8, v132.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v83.h, 8, v93.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(29)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v129.h, 8, v133.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v81.h, 8, v94.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(28)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v132.h, 8, v151.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v84.h, 8, v95.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(27)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v133.l, 8, v151.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v151.l, 8, v31.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v151.h, 8, v31.l
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v85.l, 8, v104.l
 ; GFX11-TRUE16-NEXT:    s_and_saveexec_b32 s0, vcc_lo
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-TRUE16-NEXT:    s_xor_b32 s0, exec_lo, s0
-; GFX11-TRUE16-NEXT:    s_cbranch_execnz .LBB92_3
-; GFX11-TRUE16-NEXT:  ; %bb.1: ; %Flow
-; GFX11-TRUE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX11-TRUE16-NEXT:    s_cbranch_execnz .LBB92_4
-; GFX11-TRUE16-NEXT:  .LBB92_2: ; %end
-; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
-; GFX11-TRUE16-NEXT:  .LBB92_3: ; %cmp.false
-; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v52.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v51.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v53.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v51.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v54.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v2.h, 0xff, v49.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v3.l, 0xff, v55.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v3.h, 0xff, v53.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v4.l, 0xff, v55.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v4.h, 0xff, v52.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v5.l, 0xff, v65.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v5.h, 0xff, v64.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v6.l, 0xff, v65.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v6.h, 0xff, v64.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v7.l, 0xff, v67.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v7.h, 0xff, v54.l
+; GFX11-TRUE16-NEXT:    s_cbranch_execz .LBB92_2
+; GFX11-TRUE16-NEXT:  ; %bb.1: ; %cmp.false
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v34.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v33.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v34.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v32.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v36.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.h, 0xff, v32.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v3.l, 0xff, v36.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v3.h, 0xff, v35.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v4.l, 0xff, v37.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v4.h, 0xff, v33.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v5.l, 0xff, v38.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v5.h, 0xff, v37.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v6.l, 0xff, v39.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v6.h, 0xff, v38.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v7.l, 0xff, v39.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v7.h, 0xff, v35.h
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    v_and_b16 v8.l, 0xff, v67.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v8.h, 0xff, v66.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v9.l, 0xff, v68.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v9.h, 0xff, v66.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v10.l, 0xff, v70.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v10.h, 0xff, v68.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v11.l, 0xff, v71.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v11.h, 0xff, v69.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v12.l, 0xff, v83.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v12.h, 0xff, v69.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v13.l, 0xff, v84.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v13.h, 0xff, v82.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v14.l, 0xff, v85.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v14.h, 0xff, v80.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v15.l, 0xff, v96.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v15.h, 0xff, v86.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v16.l, 0xff, v84.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v16.h, 0xff, v80.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v17.l, 0xff, v87.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v17.h, 0xff, v71.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v18.l, 0xff, v96.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v18.h, 0xff, v86.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v19.l, 0xff, v97.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v19.h, 0xff, v85.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v20.l, 0xff, v100.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v20.h, 0xff, v98.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v21.l, 0xff, v100.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v21.h, 0xff, v32.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v22.l, 0xff, v34.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v22.h, 0xff, v32.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v23.l, 0xff, v35.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v23.h, 0xff, v33.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v24.l, 0xff, v35.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v24.h, 0xff, v33.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v25.l, 0xff, v37.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v25.h, 0xff, v36.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v26.l, 0xff, v38.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v26.h, 0xff, v36.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v27.l, 0xff, v48.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v27.h, 0xff, v34.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v28.l, 0xff, v48.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v28.h, 0xff, v38.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v29.l, 0xff, v49.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v29.h, 0xff, v37.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v30.l, 0xff, v50.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v30.h, 0xff, v39.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v31.l, 0xff, v50.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v31.h, 0xff, v39.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v0.l, v81.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v0.h, v81.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v1.l, v82.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, v1.h, v83.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v2.l, v97.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v2.h, v2.h, v70.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v3.l, v3.l, v98.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v3.h, v3.h, v99.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v4.l, v4.l, v99.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v4.h, v4.h, v87.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v5.l, v5.l, v101.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v5.h, v5.h, v102.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v6.l, v6.l, v102.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v6.h, v6.h, v103.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v7.l, v7.l, v112.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v7.h, v7.h, v101.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v8.l, v8.l, v112.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v8.h, v8.h, v113.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v9.l, v9.l, v113.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v9.h, v9.h, v103.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v10.l, v10.l, v115.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v10.h, v10.h, v115.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v11.l, v11.l, v116.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v11.h, v11.h, v116.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v12.l, v12.l, v117.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v12.h, v12.h, v114.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v13.l, v13.l, v118.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v13.h, v13.h, v118.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v14.l, v14.l, v119.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v14.h, v14.h, v114.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v15.l, v15.l, v119.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v15.h, v15.h, v128.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v16.l, v16.l, v128.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v16.h, v16.h, v129.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v17.l, v17.l, v130.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v17.h, v17.h, v117.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v18.l, v18.l, v130.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v18.h, v18.h, v131.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v19.l, v19.l, v131.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v19.h, v19.h, v129.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v20.l, v20.l, v132.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v20.h, v20.h, v133.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v21.l, v21.l, v133.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v21.h, v21.h, v134.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v22.l, v22.l, v135.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v22.h, v22.h, v132.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v23.l, v23.l, v135.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v23.h, v23.h, v144.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v24.l, v24.l, v144.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v24.h, v24.h, v134.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v25.l, v25.l, v145.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v25.h, v25.h, v146.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v26.l, v26.l, v146.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v26.h, v26.h, v147.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v27.l, v27.l, v148.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v27.h, v27.h, v145.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v28.l, v28.l, v148.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v28.h, v28.h, v149.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v29.l, v29.l, v149.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v29.h, v29.h, v147.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v30.l, v30.l, v150.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v30.h, v30.h, v150.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v31.l, v31.l, v151.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v31.h, v31.h, v151.h
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr52_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr51_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr53_lo16
+; GFX11-TRUE16-NEXT:    v_and_b16 v8.l, 0xff, v40.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v8.h, 0xff, v166.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v9.l, 0xff, v182.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v9.h, 0xff, v160.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v10.l, 0xff, v44.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v10.h, 0xff, v178.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v11.l, 0xff, v42.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v11.h, 0xff, v176.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v12.l, 0xff, v46.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v12.h, 0xff, v162.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v13.l, 0xff, v45.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v13.h, 0xff, v181.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v14.l, 0xff, v43.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v14.h, 0xff, v165.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v15.l, 0xff, v47.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v15.h, 0xff, v41.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v16.l, 0xff, v164.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v16.h, 0xff, v151.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v17.l, 0xff, v180.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v17.h, 0xff, v149.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v18.l, 0xff, v177.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v18.h, 0xff, v161.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v19.l, 0xff, v167.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v19.h, 0xff, v150.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v20.l, 0xff, v183.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v20.h, 0xff, v163.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v21.l, 0xff, v179.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v21.h, 0xff, v131.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v22.l, 0xff, v148.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v22.h, 0xff, v116.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v23.l, 0xff, v147.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v23.h, 0xff, v130.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v24.l, 0xff, v144.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v24.h, 0xff, v115.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v25.l, 0xff, v146.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v25.h, 0xff, v129.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v26.l, 0xff, v135.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v26.h, 0xff, v119.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v27.l, 0xff, v145.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v27.h, 0xff, v113.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v28.l, 0xff, v134.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v28.h, 0xff, v118.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v29.l, 0xff, v132.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v29.h, 0xff, v112.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v30.l, 0xff, v133.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v30.h, 0xff, v117.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v31.l, 0xff, v128.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v31.h, 0xff, v114.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v0.l, v48.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v0.h, v49.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v1.l, v49.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, v1.h, v50.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v2.l, v51.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.h, v2.h, v48.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v3.l, v3.l, v51.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v3.h, v3.h, v52.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v4.l, v4.l, v52.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v4.h, v4.h, v50.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v5.l, v5.l, v53.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v5.h, v5.h, v54.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v6.l, v6.l, v54.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v6.h, v6.h, v55.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v7.l, v7.l, v64.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v7.h, v7.h, v53.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v8.l, v8.l, v64.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v8.h, v8.h, v65.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v9.l, v9.l, v65.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v9.h, v9.h, v55.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v10.l, v10.l, v67.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v10.h, v10.h, v67.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v11.l, v11.l, v68.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v11.h, v11.h, v68.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v12.l, v12.l, v69.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v12.h, v12.h, v66.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v13.l, v13.l, v70.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v13.h, v13.h, v70.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v14.l, v14.l, v71.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v14.h, v14.h, v66.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v15.l, v15.l, v71.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v15.h, v15.h, v80.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v16.l, v16.l, v80.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v16.h, v16.h, v81.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v17.l, v17.l, v82.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v17.h, v17.h, v69.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v18.l, v18.l, v82.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v18.h, v18.h, v83.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v19.l, v19.l, v83.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v19.h, v19.h, v81.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v20.l, v20.l, v84.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v20.h, v20.h, v85.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v21.l, v21.l, v85.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v21.h, v21.h, v86.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v22.l, v22.l, v87.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v22.h, v22.h, v84.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v23.l, v23.l, v87.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v23.h, v23.h, v96.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v24.l, v24.l, v96.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v24.h, v24.h, v86.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v25.l, v25.l, v97.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v25.h, v25.h, v98.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v26.l, v26.l, v98.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v26.h, v26.h, v99.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v27.l, v27.l, v100.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v27.h, v27.h, v97.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v28.l, v28.l, v100.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v28.h, v28.h, v101.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v29.l, v29.l, v101.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v29.h, v29.h, v99.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v30.l, v30.l, v102.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v30.h, v30.h, v102.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v31.l, v31.l, v103.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v31.h, v31.h, v103.h
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr34_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr33_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr34_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr32_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr36_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr32_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr36_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr35_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr37_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr33_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr38_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr37_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr39_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr38_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr39_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr35_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr114
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr128
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr117
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr133
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr112
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr132
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr118
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr134
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr113
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr145
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr119
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr135
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr129
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr146
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr115
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr144
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr130
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr147
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr116
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr148
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr131
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr179
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr163
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr183
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr150
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr167
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr161
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr177
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr149
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr180
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr151
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr164
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr41
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr47
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr165
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr43
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr181
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr45
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr162
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr46
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr176
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr42
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr178
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr44
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr160
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr182
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr166
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr40
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr48_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr49_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr49_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr50_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr51_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr48_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr51_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr52_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr52_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr50_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr53_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr54_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr54_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr49_hi16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr55_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr53_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr55_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr52_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr65_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr64_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr65_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr53_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr64_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr65_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr65_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr55_hi16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr67_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr54_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr67_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr66_hi16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr68_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr68_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr69_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr66_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr70_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr70_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr68_hi16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr71_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr66_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr71_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr80_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr80_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr81_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr82_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr69_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr82_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr83_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr83_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr69_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr81_hi16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr84_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr82_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr85_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr80_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr96_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr86_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr85_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr86_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr87_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr84_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr80_hi16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr87_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr71_hi16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr96_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr86_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr97_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr85_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr100_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr98_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr100_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr32_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr34_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr32_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr35_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr33_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr35_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr33_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr37_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr36_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr38_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr36_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr48_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr34_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr48_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr38_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr49_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr37_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr50_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr39_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr50_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr39_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr81_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr81_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr82_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr83_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr96_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr86_hi16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr97_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr70_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr98_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr98_hi16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr99_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr99_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr87_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr100_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr97_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr100_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr101_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr101_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr99_hi16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr102_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr102_hi16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr103_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr112_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr101_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr112_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr113_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr113_hi16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr103_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr115_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr115_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr116_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr116_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr117_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr114_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr118_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr118_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr119_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr114_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr119_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr128_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr128_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr129_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr130_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr117_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr130_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr131_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr131_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr129_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr132_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr133_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr133_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr134_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr135_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr132_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr135_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr144_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr144_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr134_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr145_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr146_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr146_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr147_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr148_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr145_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr148_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr149_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr149_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr147_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr150_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr150_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr151_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr151_hi16
+; GFX11-TRUE16-NEXT:  .LBB92_2: ; %Flow
 ; GFX11-TRUE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX11-TRUE16-NEXT:    s_cbranch_execz .LBB92_2
-; GFX11-TRUE16-NEXT:  .LBB92_4: ; %cmp.true
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v50.h, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, v39.h, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, v50.l, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.h, v39.l, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, v48.l, 3
+; GFX11-TRUE16-NEXT:    s_cbranch_execz .LBB92_4
+; GFX11-TRUE16-NEXT:  ; %bb.3: ; %cmp.true
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v128.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, v114.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, v133.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.h, v117.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, v145.l, 3
 ; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v0.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v1.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v1.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v151.l, v0.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v151.h, v0.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v150.l, v1.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, v150.h, v1.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v148.l, v2.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v103.l, v0.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v103.h, v0.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v102.l, v1.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, v102.h, v1.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v100.l, v2.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v31.l, 0x300, v0.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v31.h, 0x300, v0.h
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v30.l, 0x300, v1.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v30.h, 0x300, v1.h
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v49.l, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, v37.l, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, v48.h, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.h, v38.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v132.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, v112.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, v134.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.h, v118.l, 3
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v27.l, 0x300, v2.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v0.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v1.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v1.h
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, v36.l, 3
-; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v149.h, v0.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v147.h, v0.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v148.h, v1.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, v149.l, v1.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, v129.l, 3
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v101.h, v0.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v99.h, v0.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v100.h, v1.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, v101.l, v1.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v2.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v29.l, 0x300, v0.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v29.h, 0x300, v0.h
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v28.l, 0x300, v1.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v28.h, 0x300, v1.h
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v34.l, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, v38.l, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, v36.h, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.h, v37.h, 3
-; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v146.l, v2.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v113.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, v135.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, v119.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.h, v146.l, 3
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v98.l, v2.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v0.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v1.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v1.h
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v25.h, 0x300, v2.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v145.l, v0.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v146.h, v0.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v147.l, v1.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, v145.h, v1.h
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, v34.h, 3
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v97.l, v0.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v98.h, v0.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v99.l, v1.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, v97.h, v1.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, v148.l, 3
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v27.h, 0x300, v0.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v26.l, 0x300, v0.h
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v26.h, 0x300, v1.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v25.l, 0x300, v1.h
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v35.h, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, v33.l, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, v35.l, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.h, v33.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v144.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, v115.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, v147.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.h, v130.l, 3
 ; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v2.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v0.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v1.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v1.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v135.l, v2.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v144.h, v0.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v134.h, v0.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v135.h, v1.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, v144.l, v1.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v87.l, v2.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v96.h, v0.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v86.h, v0.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v87.h, v1.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, v96.l, v1.h
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v22.l, 0x300, v2.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v24.l, 0x300, v0.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v24.h, 0x300, v0.h
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v23.l, 0x300, v1.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v23.h, 0x300, v1.h
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v32.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v116.l, 3
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(26)
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, v100.h, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, v32.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, v179.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, v131.l, 3
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(24)
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.h, v100.l, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, v98.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.h, v183.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, v163.l, 3
 ; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v0.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v1.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v1.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v132.l, v0.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v133.h, v0.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v134.l, v1.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, v132.h, v1.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v133.l, v2.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v84.l, v0.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v85.h, v0.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v86.l, v1.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, v84.h, v1.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v85.l, v2.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v22.h, 0x300, v0.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v21.l, 0x300, v0.h
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v21.h, 0x300, v1.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v20.l, 0x300, v1.h
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(22)
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v97.l, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, v85.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v167.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, v150.l, 3
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(20)
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, v96.l, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.h, v86.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, v177.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.h, v161.l, 3
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v20.h, 0x300, v2.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v0.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v1.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v1.h
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(18)
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, v87.h, 3
-; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v131.h, v0.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v129.h, v0.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v130.h, v1.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, v131.l, v1.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, v180.l, 3
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v83.h, v0.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v81.h, v0.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v82.h, v1.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, v83.l, v1.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v2.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v19.l, 0x300, v0.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v19.h, 0x300, v0.h
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v18.l, 0x300, v1.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v18.h, 0x300, v1.h
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v71.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v149.l, 3
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(16)
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, v84.l, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, v80.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, v164.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, v151.l, 3
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(14)
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.h, v96.h, 3
-; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v130.l, v2.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.h, v47.l, 3
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v82.l, v2.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v0.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v1.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v1.h
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v17.l, 0x300, v2.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v117.h, v0.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v128.h, v0.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v129.l, v1.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, v119.h, v1.h
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, v86.h, 3
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v69.h, v0.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v80.h, v0.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v81.l, v1.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, v71.h, v1.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, v41.l, 3
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v17.h, 0x300, v0.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v16.l, 0x300, v0.h
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v16.h, 0x300, v1.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v15.l, 0x300, v1.h
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(12)
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v85.l, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, v80.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v43.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, v165.l, 3
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(10)
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, v84.h, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.h, v82.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, v45.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.h, v181.l, 3
 ; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v2.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v0.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v1.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v1.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v128.l, v2.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v119.l, v0.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v114.h, v0.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v118.l, v1.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, v118.h, v1.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v80.l, v2.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v71.l, v0.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v66.h, v0.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v70.l, v1.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, v70.h, v1.h
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v15.h, 0x300, v2.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(8)
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, v83.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, v46.l, 3
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v14.l, 0x300, v0.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v14.h, 0x300, v0.h
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v13.l, 0x300, v1.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v13.h, 0x300, v1.h
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v69.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v162.l, 3
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(6)
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, v71.l, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, v69.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, v42.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, v176.l, 3
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(4)
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.h, v70.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.h, v44.l, 3
 ; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v2.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v0.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v1.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v1.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v117.l, v2.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v114.l, v0.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v116.l, v0.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v116.h, v1.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, v115.l, v1.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v69.l, v2.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v66.l, v0.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v68.l, v0.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v68.h, v1.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, v67.l, v1.h
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v12.l, 0x300, v2.l
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, v68.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, v178.l, 3
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v12.h, 0x300, v0.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v11.l, 0x300, v0.h
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v11.h, 0x300, v1.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v10.l, 0x300, v1.h
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(2)
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v68.l, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, v66.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v182.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, v160.l, 3
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, v67.h, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.h, v66.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, v40.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.h, v166.l, 3
 ; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v2.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v0.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v1.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v1.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v115.h, v2.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v113.h, v0.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v103.h, v0.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v112.h, v1.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, v113.l, v1.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v67.h, v2.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v65.h, v0.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v55.h, v0.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v64.h, v1.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, v65.l, v1.h
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v10.h, 0x300, v2.l
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, v67.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, v39.h, 3
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v9.l, 0x300, v0.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v9.h, 0x300, v0.h
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v8.l, 0x300, v1.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v8.h, 0x300, v1.h
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v54.l, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, v65.h, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, v64.h, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.h, v65.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v35.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, v39.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, v38.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.h, v38.h, 3
 ; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v2.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v0.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v1.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v1.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v112.l, v2.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v101.l, v0.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v102.h, v0.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v103.l, v1.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, v101.h, v1.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v64.l, v2.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v53.l, v0.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v54.h, v0.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v55.l, v1.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, v53.h, v1.h
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v7.l, 0x300, v2.l
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, v64.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, v37.h, 3
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v7.h, 0x300, v0.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v6.l, 0x300, v0.h
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v6.h, 0x300, v1.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v5.l, 0x300, v1.h
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v55.h, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, v52.l, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, v55.l, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.h, v53.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v37.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, v33.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, v36.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.h, v35.l, 3
 ; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v2.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v0.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v1.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v1.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v102.l, v2.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v99.h, v0.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v87.l, v0.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v98.h, v1.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, v99.l, v1.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v54.l, v2.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v52.h, v0.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v50.h, v0.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v51.h, v1.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, v52.l, v1.h
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v5.h, 0x300, v2.l
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, v54.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, v36.l, 3
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v4.l, 0x300, v0.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v4.h, 0x300, v0.h
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v3.l, 0x300, v1.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v3.h, 0x300, v1.h
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v49.h, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, v53.l, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, v51.l, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.h, v52.h, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.h, v51.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v32.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, v34.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, v32.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.h, v34.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.h, v33.l, 3
 ; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v2.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v0.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v1.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v1.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v2.h, 0xff, v2.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v97.h, v2.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v70.l, v0.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v82.h, v0.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v32.l, v83.l, v1.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v32.h, v81.l, v1.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v33.l, v81.h, v2.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v51.l, v2.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v48.l, v0.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v49.h, v0.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v32.l, v50.l, v1.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v32.h, v48.h, v1.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v33.l, v49.l, v2.h
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, 0x300, v2.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.h, 0x300, v0.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, 0x300, v0.h
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.h, 0x300, v32.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, 0x300, v32.h
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, 0x300, v33.l
+; GFX11-TRUE16-NEXT:  .LBB92_4: ; %end
 ; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_clause 0x1f
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v104, off, s32 offset:392
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v95, off, s32 offset:396
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v94, off, s32 offset:400
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v93, off, s32 offset:404
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v92, off, s32 offset:408
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v91, off, s32 offset:412
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v90, off, s32 offset:416
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v89, off, s32 offset:420
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v88, off, s32 offset:424
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v79, off, s32 offset:428
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v78, off, s32 offset:432
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v77, off, s32 offset:436
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v76, off, s32 offset:440
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v75, off, s32 offset:444
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v74, off, s32 offset:448
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v73, off, s32 offset:452
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v72, off, s32 offset:456
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v63, off, s32 offset:460
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v62, off, s32 offset:464
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v61, off, s32 offset:468
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v60, off, s32 offset:472
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v59, off, s32 offset:476
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v58, off, s32 offset:480
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v57, off, s32 offset:484
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v56, off, s32 offset:488
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v47, off, s32 offset:492
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v46, off, s32 offset:496
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v45, off, s32 offset:500
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v44, off, s32 offset:504
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v43, off, s32 offset:508
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v42, off, s32 offset:512
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v41, off, s32 offset:516
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v40, off, s32 offset:520 ; 4-byte Folded Reload
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-FAKE16-LABEL: bitcast_v128i8_to_v64f16:
@@ -179049,1657 +174867,831 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-TRUE16-LABEL: bitcast_v128i8_to_v64f16_scalar:
-; GFX11-TRUE16:       ; %bb.0:
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT:    s_clause 0x1e
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v40, s32 offset:440
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v41, s32 offset:436
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v42, s32 offset:432
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v43, s32 offset:428
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v44, s32 offset:424
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v45, s32 offset:420
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v46, s32 offset:416
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v47, s32 offset:412
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v56, s32 offset:408
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v57, s32 offset:404
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v58, s32 offset:400
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v59, s32 offset:396
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v60, s32 offset:392
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v61, s32 offset:388
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v62, s32 offset:384
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v63, s32 offset:380
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v72, s32 offset:376
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v73, s32 offset:372
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v74, s32 offset:368
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v75, s32 offset:364
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v76, s32 offset:360
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v77, s32 offset:356
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v78, s32 offset:352
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v79, s32 offset:348
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v88, s32 offset:344
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v89, s32 offset:340
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v90, s32 offset:336
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v91, s32 offset:332
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v92, s32 offset:328
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v93, s32 offset:324
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v94, s32 offset:320
-; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v54, v30 :: v_dual_mov_b32 v51, v24
-; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v52, v28 :: v_dual_mov_b32 v55, v26
-; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v53, v22 :: v_dual_mov_b32 v48, v20
-; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v50, v18 :: v_dual_mov_b32 v39, v16
-; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v49, v14 :: v_dual_mov_b32 v34, v8
-; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v37, v12 :: v_dual_mov_b32 v38, v10
-; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v36, v6 :: v_dual_mov_b32 v33, v4
-; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v35, v2 :: v_dual_mov_b32 v32, v0
-; GFX11-TRUE16-NEXT:    s_clause 0x1f
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v0, off, s32 offset:316
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v2, off, s32
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v4, off, s32 offset:8
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v6, off, s32 offset:16
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v8, off, s32 offset:24
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v10, off, s32 offset:32
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v12, off, s32 offset:40
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v14, off, s32 offset:48
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v16, off, s32 offset:56
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v18, off, s32 offset:64
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v20, off, s32 offset:72
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v22, off, s32 offset:80
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v24, off, s32 offset:88
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v26, off, s32 offset:96
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v28, off, s32 offset:104
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v30, off, s32 offset:112
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v31, off, s32 offset:120
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v41, off, s32 offset:128
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v44, off, s32 offset:136
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v45, off, s32 offset:144
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v56, off, s32 offset:152
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v59, off, s32 offset:160
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v60, off, s32 offset:168
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v61, off, s32 offset:176
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v62, off, s32 offset:184
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v63, off, s32 offset:192
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v72, off, s32 offset:200
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v73, off, s32 offset:208
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v74, off, s32 offset:216
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v75, off, s32 offset:224
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v76, off, s32 offset:232
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v77, off, s32 offset:240
-; GFX11-TRUE16-NEXT:    s_clause 0x1f
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v78, off, s32 offset:248
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v79, off, s32 offset:256
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v88, off, s32 offset:264
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v89, off, s32 offset:272
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v90, off, s32 offset:280
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v91, off, s32 offset:288
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v92, off, s32 offset:296
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v93, off, s32 offset:304
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v94, off, s32 offset:312
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v57, off, s32 offset:308
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v58, off, s32 offset:300
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v46, off, s32 offset:292
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v47, off, s32 offset:284
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v40, off, s32 offset:276
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v43, off, s32 offset:268
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v182, off, s32 offset:260
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v183, off, s32 offset:252
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v179, off, s32 offset:244
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v181, off, s32 offset:236
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v164, off, s32 offset:228
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v176, off, s32 offset:220
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v160, off, s32 offset:212
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v163, off, s32 offset:204
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v146, off, s32 offset:196
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v150, off, s32 offset:188
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v135, off, s32 offset:180
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v145, off, s32 offset:172
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v119, off, s32 offset:164
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v131, off, s32 offset:156
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v115, off, s32 offset:148
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v178, off, s32 offset:140
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v162, off, s32 offset:132
-; GFX11-TRUE16-NEXT:    s_clause 0xf
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v165, off, s32 offset:124
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v149, off, s32 offset:116
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v151, off, s32 offset:108
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v144, off, s32 offset:100
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v148, off, s32 offset:92
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v129, off, s32 offset:84
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v133, off, s32 offset:76
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v117, off, s32 offset:68
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v118, off, s32 offset:60
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v114, off, s32 offset:52
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v116, off, s32 offset:44
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v100, off, s32 offset:36
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v103, off, s32 offset:28
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v98, off, s32 offset:20
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v99, off, s32 offset:12
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v96, off, s32 offset:4
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v64, 8, v1
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v66, 8, v3
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v65, 8, v5
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v68, 8, v7
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v67, 8, v9
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v70, 8, v11
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v69, 8, v13
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v71, 8, v15
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v80, 8, v17
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v82, 8, v19
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v81, 8, v21
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v84, 8, v23
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v83, 8, v25
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v86, 8, v27
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v85, 8, v29
-; GFX11-TRUE16-NEXT:    s_mov_b32 s4, 0
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(62)
-; GFX11-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v97, 8, v2
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v87, 8, v4
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v101, 8, v6
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v102, 8, v8
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v113, 8, v10
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v112, 8, v12
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v130, 8, v14
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v128, 8, v16
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v134, 8, v18
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v132, 8, v20
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v161, 8, v22
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v147, 8, v24
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v166, 8, v26
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v167, 8, v28
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v180, 8, v30
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v177, 8, v31
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v42, 8, v41
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(61)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v41, 8, v44
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(60)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v45, 8, v45
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(59)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v44, 8, v56
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(58)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v59, 8, v59
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(57)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v56, 8, v60
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(56)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v60, 8, v61
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(55)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v61, 8, v62
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(54)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v63, 8, v63
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(53)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v62, 8, v72
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(52)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v73, 8, v73
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(51)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v72, 8, v74
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(50)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v75, 8, v75
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(49)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v74, 8, v76
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(48)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v77, 8, v77
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(47)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v76, 8, v78
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(46)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v78, 8, v79
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(45)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v79, 8, v88
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(44)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v89, 8, v89
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(43)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v88, 8, v90
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(42)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v91, 8, v91
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(41)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v90, 8, v92
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(40)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v92, 8, v93
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(39)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v93, 8, v94
-; GFX11-TRUE16-NEXT:    s_and_b32 s5, vcc_lo, exec_lo
-; GFX11-TRUE16-NEXT:    s_cbranch_scc0 .LBB93_4
-; GFX11-TRUE16-NEXT:  ; %bb.1: ; %cmp.false
-; GFX11-TRUE16-NEXT:    s_and_b32 s5, s28, 0xff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s6, s29, 8
-; GFX11-TRUE16-NEXT:    s_and_b32 s7, s2, 0xff
-; GFX11-TRUE16-NEXT:    s_or_b32 s5, s5, s6
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s6, s1, 8
-; GFX11-TRUE16-NEXT:    v_and_b32_e64 v5, 0xffff, s5
-; GFX11-TRUE16-NEXT:    s_and_b32 s5, s0, 0xff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s8, s3, 8
-; GFX11-TRUE16-NEXT:    s_or_b32 s5, s5, s6
-; GFX11-TRUE16-NEXT:    s_or_b32 s6, s7, s8
-; GFX11-TRUE16-NEXT:    s_and_b32 s7, s16, 0xff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s8, s17, 8
-; GFX11-TRUE16-NEXT:    s_and_b32 s9, s18, 0xff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s10, s19, 8
-; GFX11-TRUE16-NEXT:    s_or_b32 s7, s7, s8
-; GFX11-TRUE16-NEXT:    s_or_b32 s8, s9, s10
-; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s5, s5, s6
-; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s6, s7, s8
-; GFX11-TRUE16-NEXT:    s_and_b32 s7, s20, 0xff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s8, s21, 8
-; GFX11-TRUE16-NEXT:    s_and_b32 s9, s22, 0xff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s10, s23, 8
-; GFX11-TRUE16-NEXT:    s_or_b32 s7, s7, s8
-; GFX11-TRUE16-NEXT:    s_or_b32 s8, s9, s10
-; GFX11-TRUE16-NEXT:    s_and_b32 s9, s24, 0xff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s10, s25, 8
-; GFX11-TRUE16-NEXT:    s_and_b32 s11, s26, 0xff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s12, s27, 8
-; GFX11-TRUE16-NEXT:    s_or_b32 s9, s9, s10
-; GFX11-TRUE16-NEXT:    s_or_b32 s10, s11, s12
-; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s7, s7, s8
-; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s8, s9, s10
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v2, 0xff, v36
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v32
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff, v35
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v4, 0xff, v34
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v3, 0xff, v33
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v2, v2, v68
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v0, v64
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v1, v66
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v6, v4, v67
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v3, v3, v65
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v4, v0, 16, v5
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v38
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v7, 0xff, v39
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v6, v6, 16, v2
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v2, 0xff, v49
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v5, v3, 16, v1
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff, v37
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v0, v70
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v3, 0xff, v50
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v2, v2, v71
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v8, 0xff, v48
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v1, v69
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v3, v3, v82
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v9, v7, v80
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v10, v8, v81
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v7, v1, 16, v0
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v3, 0xffff, v3
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v53
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v8, v9, 16, v2
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v2, 0xff, v55
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff, v51
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v9, v10, 16, v3
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v0, v84
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v3, 0xff, v52
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v10, 0xff, v54
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v2, v2, v86
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v1, v83
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v11, 0xff, v96
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v3, v3, v85
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v12, v10, v97
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v10, v1, 16, v0
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v11, v87
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff, v99
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v12, 0xffff, v12
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v11, v3, 16, v2
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v3, 0xff, v103
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v14, 0xff, v114
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v2, 0xff, v98
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v12, v0, 16, v12
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v100
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v3, v3, v113
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v1, v101
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v13, 0xff, v116
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v17, v14, v128
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v0, v112
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v3, 0xffff, v3
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v14, 0xff, v117
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v2, v2, v102
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v13, v13, v130
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v16, 0xff, v133
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v20, v14, v132
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v14, v0, 16, v3
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v148
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v15, 0xff, v118
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v18, 0xff, v129
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v16, v16, v161
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v19, 0xffff, v13
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v13, v2, 16, v1
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v0, v166
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff, v144
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v15, v15, v134
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v18, v18, v147
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v22, 0xffff, v16
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v1, v167
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v21, 0xffff, v15
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v15, v17, 16, v19
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v17, v18, 16, v22
-; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v2, s7
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v18, v1, 16, v0
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v151
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff, v149
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v16, v20, 16, v21
-; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v3, s8
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v0, v180
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v1, v177
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v19, v1, 16, v0
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v165
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff, v162
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v0, v42
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v1, v41
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v20, v1, 16, v0
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v178
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff, v115
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v0, v45
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v1, v44
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v21, v1, 16, v0
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v131
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff, v119
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v0, v59
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v1, v56
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v22, v1, 16, v0
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v145
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff, v135
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v0, v60
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v1, v61
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v23, v1, 16, v0
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v150
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff, v146
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v0, v63
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v1, v62
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v24, v1, 16, v0
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v163
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff, v160
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v0, v73
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v1, v72
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v25, v1, 16, v0
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v176
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff, v164
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v0, v75
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v1, v74
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v26, v1, 16, v0
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v181
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff, v179
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v0, v77
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v1, v76
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v27, v1, 16, v0
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v183
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff, v182
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v0, v78
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v1, v79
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v28, v1, 16, v0
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v43
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff, v40
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v0, v89
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v1, v88
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v29, v1, 16, v0
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v47
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff, v46
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v0, v91
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v1, v90
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v30, v1, 16, v0
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v58
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff, v57
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v0, v92
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v1, v93
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v31, v1, 16, v0
-; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, s6
-; GFX11-TRUE16-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s4
-; GFX11-TRUE16-NEXT:    s_cbranch_vccnz .LBB93_3
-; GFX11-TRUE16-NEXT:  .LBB93_2: ; %cmp.true
-; GFX11-TRUE16-NEXT:    s_add_i32 s28, s28, 3
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s5, s29, 8
-; GFX11-TRUE16-NEXT:    s_and_b32 s4, s28, 0xff
-; GFX11-TRUE16-NEXT:    s_add_i32 s24, s24, 3
-; GFX11-TRUE16-NEXT:    s_or_b32 s4, s5, s4
-; GFX11-TRUE16-NEXT:    s_and_b32 s5, s24, 0xff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s6, s25, 8
-; GFX11-TRUE16-NEXT:    s_add_i32 s26, s26, 3
-; GFX11-TRUE16-NEXT:    s_or_b32 s5, s6, s5
-; GFX11-TRUE16-NEXT:    s_and_b32 s6, s26, 0xff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s7, s27, 8
-; GFX11-TRUE16-NEXT:    s_add_i32 s20, s20, 3
-; GFX11-TRUE16-NEXT:    s_or_b32 s6, s7, s6
-; GFX11-TRUE16-NEXT:    s_and_b32 s7, s20, 0xff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s8, s21, 8
-; GFX11-TRUE16-NEXT:    s_add_i32 s22, s22, 3
-; GFX11-TRUE16-NEXT:    s_or_b32 s7, s8, s7
-; GFX11-TRUE16-NEXT:    s_and_b32 s8, s22, 0xff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s9, s23, 8
-; GFX11-TRUE16-NEXT:    s_add_i32 s16, s16, 3
-; GFX11-TRUE16-NEXT:    s_or_b32 s8, s9, s8
-; GFX11-TRUE16-NEXT:    s_and_b32 s9, s16, 0xff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s10, s17, 8
-; GFX11-TRUE16-NEXT:    s_add_i32 s18, s18, 3
-; GFX11-TRUE16-NEXT:    s_add_i32 s0, s0, 3
-; GFX11-TRUE16-NEXT:    s_add_i32 s2, s2, 3
-; GFX11-TRUE16-NEXT:    s_or_b32 s9, s10, s9
-; GFX11-TRUE16-NEXT:    s_and_b32 s10, s18, 0xff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s11, s19, 8
-; GFX11-TRUE16-NEXT:    s_and_b32 s0, s0, 0xff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s1, s1, 8
-; GFX11-TRUE16-NEXT:    s_and_b32 s2, s2, 0xff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s3, s3, 8
-; GFX11-TRUE16-NEXT:    s_or_b32 s10, s11, s10
-; GFX11-TRUE16-NEXT:    s_or_b32 s0, s1, s0
-; GFX11-TRUE16-NEXT:    s_or_b32 s1, s3, s2
-; GFX11-TRUE16-NEXT:    s_addk_i32 s9, 0x300
-; GFX11-TRUE16-NEXT:    s_addk_i32 s0, 0x300
-; GFX11-TRUE16-NEXT:    s_addk_i32 s1, 0x300
-; GFX11-TRUE16-NEXT:    s_addk_i32 s10, 0x300
-; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s0, s0, s1
-; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s1, s9, s10
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(37)
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v0, 3, v58
-; GFX11-TRUE16-NEXT:    s_addk_i32 s5, 0x300
-; GFX11-TRUE16-NEXT:    s_addk_i32 s6, 0x300
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v1, 3, v57
-; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s3, s5, s6
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(35)
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v2, 3, v47
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT:    s_addk_i32 s7, 0x300
-; GFX11-TRUE16-NEXT:    s_addk_i32 s8, 0x300
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v92, v0
-; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s2, s7, s8
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v3, 3, v46
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v93, v1
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v2, v91, v2
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(33)
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v4, 3, v43
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v27, 0x300, v0
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v0, 3, v40
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v31, 0x300, v1
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v28, 0x300, v2
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff, v4
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v3, v90, v3
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(31)
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v2, 3, v183
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v4, 3, v182
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v89, v1
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v30, 0x300, v3
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v3, 0xff, v4
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(29)
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v4, 3, v181
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v88, v0
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v181, 0x300, v1
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v78, v2
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v2, v79, v3
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v3, 0xff, v4
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v29, 0x300, v0
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v0, 3, v179
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v182, 0x300, v1
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v179, 0x300, v2
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v77, v3
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(27)
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v2, 3, v176
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v3, 3, v164
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(25)
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v4, 3, v163
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v163, 0x300, v1
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff, v2
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v76, v0
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v2, 0xff, v3
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v3, 0xff, v4
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v4, 3, v160
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v75, v1
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v160, 0x300, v0
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v74, v2
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v2, v73, v3
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v3, 0xff, v4
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v22, 0x300, v1
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(23)
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v1, 3, v150
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v26, 0x300, v0
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v23, 0x300, v2
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v72, v3
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v2, 3, v146
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(21)
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v3, 3, v145
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v4, 3, v135
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v25, 0x300, v0
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v2
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v63, v1
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v2, 0xff, v3
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v3, 0xff, v4
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(19)
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v4, 3, v131
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v62, v0
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v131, 0x300, v1
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v60, v2
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v2, v61, v3
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v3, 0xff, v4
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v24, 0x300, v0
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v0, 3, v119
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v135, 0x300, v1
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v119, 0x300, v2
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v59, v3
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(17)
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v2, 3, v178
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v3, 3, v115
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(15)
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v4, 3, v165
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v115, 0x300, v1
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff, v2
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v56, v0
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v2, 0xff, v3
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v3, 0xff, v4
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v4, 3, v162
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v45, v1
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v145, 0x300, v0
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v44, v2
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v2, v42, v3
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v3, 0xff, v4
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v17, 0x300, v1
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(13)
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v1, 3, v151
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v21, 0x300, v0
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v18, 0x300, v2
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v41, v3
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v2, 3, v149
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(11)
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v3, 3, v148
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v4, 3, v144
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v20, 0x300, v0
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v2
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v180, v1
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v2, 0xff, v3
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v3, 0xff, v4
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(9)
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v4, 3, v133
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v177, v0
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v19, 0x300, v1
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v166, v2
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v2, v167, v3
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v3, 0xff, v4
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v133, 0x300, v0
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v0, 3, v129
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v129, 0x300, v1
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v144, 0x300, v2
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v161, v3
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(7)
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v2, 3, v118
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v3, 3, v117
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(5)
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v4, 3, v116
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v116, 0x300, v1
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff, v2
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v147, v0
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v2, 0xff, v3
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v3, 0xff, v4
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v4, 3, v114
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(1)
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v5, 3, v99
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v114, 0x300, v0
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v132, v2
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v2, v130, v3
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v3, 0xff, v4
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v4, 3, v103
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v6, 3, v98
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v5, 0xff, v5
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v7, 3, v54
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v8, 3, v53
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v6, 0xff, v6
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v9, 3, v39
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v33, 3, v33
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v16, 0x300, v0
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v4, v113, v4
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v0, 0x300, v2
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v2, v128, v3
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v3, 3, v100
-; GFX11-TRUE16-NEXT:    s_addk_i32 s4, 0x300
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v11, 0x300, v4
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v4, v101, v5
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v5, v102, v6
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v6, 0xff, v7
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v7, 3, v96
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v134, v1
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v12, 0x300, v4
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v13, 0x300, v5
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v4, v97, v6
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v5, 3, v55
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v6, 0xff, v7
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v7, 3, v52
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v14, 0x300, v4
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v4, 0xff, v5
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v5, v87, v6
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v6, 0xff, v7
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v7, 0xff, v8
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v8, 3, v51
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v4, v86, v4
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v15, 0x300, v5
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v5, v85, v6
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v6, v84, v7
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v7, 0xff, v8
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v51, 0x300, v4
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v4, 3, v50
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v50, 0x300, v5
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v8, 3, v49
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v5, v83, v7
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v7, 3, v48
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v6, 0x300, v6
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v1, 0x300, v1
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v10, 0x300, v5
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v5, 0xff, v7
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v4, v82, v4
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v7, 0xff, v8
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v8, 0xff, v9
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v9, 3, v38
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v5, v81, v5
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v38, 0x300, v4
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v4, v71, v7
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v7, v80, v8
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v8, 0xff, v9
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v9, 0x300, v5
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v5, 3, v37
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v37, 0x300, v4
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v39, 0x300, v7
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v4, v70, v8
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v7, 3, v36
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v5, 0xff, v5
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v8, 3, v34
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v34, 3, v35
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v35, 0x300, v4
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v4, 0xff, v7
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v5, v69, v5
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v7, 0xff, v8
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v8, 0xff, v34
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v3, v112, v3
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v4, v68, v4
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v34, 0x300, v5
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v5, v67, v7
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v7, v66, v8
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v8, 0xff, v33
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v33, 0x300, v4
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v4, 3, v32
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v32, 0x300, v5
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v5, 0x300, v7
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v7, v65, v8
-; GFX11-TRUE16-NEXT:    v_and_b32_e64 v8, 0xffff, s4
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v36, 0xffff, v6
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v5, 0xffff, v5
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v7, 0x300, v7
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v3, 0x300, v3
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v4, v64, v4
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v14, 0xffff, v14
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v10, v10, 16, v36
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v5, v7, 16, v5
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v7, 0xffff, v33
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v4, 0x300, v4
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v33, 0xffff, v37
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v36, 0xffff, v22
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v2, 0x300, v2
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v6, v32, 16, v7
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v4, v4, 16, v8
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v8, 0xffff, v35
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v32, 0xffff, v51
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v35, 0xffff, v38
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v19, 0xffff, v19
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v7, v34, 16, v8
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v8, v39, 16, v33
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v33, 0xffff, v12
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v34, 0xffff, v11
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v11, v50, 16, v32
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v32, 0xffff, v1
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v9, v9, 16, v35
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v12, v15, 16, v14
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v13, v13, 16, v33
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v14, v3, 16, v34
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v16, v16, 16, v32
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v32, 0xffff, v116
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v33, 0xffff, v129
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v34, 0xffff, v18
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v35, 0xffff, v17
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v26, v26, 16, v36
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v17, v114, 16, v32
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v18, v144, 16, v33
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v20, v20, 16, v34
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v21, v21, 16, v35
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v32, 0xffff, v115
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v33, 0xffff, v135
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v34, 0xffff, v131
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v35, 0xffff, v23
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v36, 0xffff, v27
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v22, v145, 16, v32
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v23, v119, 16, v33
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v24, v24, 16, v34
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v25, v25, 16, v35
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v32, 0xffff, v163
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v33, 0xffff, v182
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v34, 0xffff, v181
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v35, 0xffff, v28
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v15, v2, 16, v0
-; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v19, v133, 16, v19
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v27, v160, 16, v32
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v28, v179, 16, v33
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v29, v29, 16, v34
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v30, v30, 16, v35
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v31, v31, 16, v36
-; GFX11-TRUE16-NEXT:  .LBB93_3: ; %end
-; GFX11-TRUE16-NEXT:    s_clause 0x1e
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v94, off, s32 offset:320
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v93, off, s32 offset:324
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v92, off, s32 offset:328
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v91, off, s32 offset:332
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v90, off, s32 offset:336
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v89, off, s32 offset:340
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v88, off, s32 offset:344
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v79, off, s32 offset:348
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v78, off, s32 offset:352
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v77, off, s32 offset:356
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v76, off, s32 offset:360
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v75, off, s32 offset:364
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v74, off, s32 offset:368
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v73, off, s32 offset:372
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v72, off, s32 offset:376
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v63, off, s32 offset:380
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v62, off, s32 offset:384
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v61, off, s32 offset:388
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v60, off, s32 offset:392
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v59, off, s32 offset:396
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v58, off, s32 offset:400
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v57, off, s32 offset:404
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v56, off, s32 offset:408
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v47, off, s32 offset:412
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v46, off, s32 offset:416
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v45, off, s32 offset:420
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v44, off, s32 offset:424
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v43, off, s32 offset:428
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v42, off, s32 offset:432
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v41, off, s32 offset:436
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v40, off, s32 offset:440
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
-; GFX11-TRUE16-NEXT:  .LBB93_4:
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
-; GFX11-TRUE16-NEXT:    s_branch .LBB93_2
-;
-; GFX11-FAKE16-LABEL: bitcast_v128i8_to_v64f16_scalar:
-; GFX11-FAKE16:       ; %bb.0:
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-FAKE16-NEXT:    s_clause 0x1e
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v40, s32 offset:440
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v41, s32 offset:436
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v42, s32 offset:432
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v43, s32 offset:428
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v44, s32 offset:424
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v45, s32 offset:420
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v46, s32 offset:416
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v47, s32 offset:412
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v56, s32 offset:408
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v57, s32 offset:404
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v58, s32 offset:400
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v59, s32 offset:396
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v60, s32 offset:392
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v61, s32 offset:388
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v62, s32 offset:384
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v63, s32 offset:380
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v72, s32 offset:376
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v73, s32 offset:372
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v74, s32 offset:368
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v75, s32 offset:364
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v76, s32 offset:360
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v77, s32 offset:356
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v78, s32 offset:352
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v79, s32 offset:348
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v88, s32 offset:344
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v89, s32 offset:340
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v90, s32 offset:336
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v91, s32 offset:332
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v92, s32 offset:328
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v93, s32 offset:324
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v94, s32 offset:320
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v54, v30 :: v_dual_mov_b32 v51, v24
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v52, v28 :: v_dual_mov_b32 v55, v26
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v53, v22 :: v_dual_mov_b32 v48, v20
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v50, v18 :: v_dual_mov_b32 v39, v16
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v49, v14 :: v_dual_mov_b32 v34, v8
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v37, v12 :: v_dual_mov_b32 v38, v10
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v36, v6 :: v_dual_mov_b32 v33, v4
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v35, v2 :: v_dual_mov_b32 v32, v0
-; GFX11-FAKE16-NEXT:    s_clause 0x1f
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v0, off, s32 offset:316
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v2, off, s32
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v4, off, s32 offset:8
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v6, off, s32 offset:16
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v8, off, s32 offset:24
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v10, off, s32 offset:32
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v12, off, s32 offset:40
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v14, off, s32 offset:48
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v16, off, s32 offset:56
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v18, off, s32 offset:64
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v20, off, s32 offset:72
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v22, off, s32 offset:80
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v24, off, s32 offset:88
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v26, off, s32 offset:96
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v28, off, s32 offset:104
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v30, off, s32 offset:112
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v31, off, s32 offset:120
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v41, off, s32 offset:128
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v44, off, s32 offset:136
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v45, off, s32 offset:144
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v56, off, s32 offset:152
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v59, off, s32 offset:160
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v60, off, s32 offset:168
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v61, off, s32 offset:176
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v62, off, s32 offset:184
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v63, off, s32 offset:192
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v72, off, s32 offset:200
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v73, off, s32 offset:208
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v74, off, s32 offset:216
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v75, off, s32 offset:224
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v76, off, s32 offset:232
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v77, off, s32 offset:240
-; GFX11-FAKE16-NEXT:    s_clause 0x1f
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v78, off, s32 offset:248
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v79, off, s32 offset:256
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v88, off, s32 offset:264
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v89, off, s32 offset:272
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v90, off, s32 offset:280
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v91, off, s32 offset:288
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v92, off, s32 offset:296
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v93, off, s32 offset:304
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v94, off, s32 offset:312
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v57, off, s32 offset:308
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v58, off, s32 offset:300
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v46, off, s32 offset:292
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v47, off, s32 offset:284
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v40, off, s32 offset:276
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v43, off, s32 offset:268
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v182, off, s32 offset:260
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v183, off, s32 offset:252
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v178, off, s32 offset:244
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v181, off, s32 offset:236
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v164, off, s32 offset:228
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v176, off, s32 offset:220
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v160, off, s32 offset:212
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v163, off, s32 offset:204
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v146, off, s32 offset:196
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v150, off, s32 offset:188
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v135, off, s32 offset:180
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v145, off, s32 offset:172
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v118, off, s32 offset:164
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v131, off, s32 offset:156
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v115, off, s32 offset:148
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v179, off, s32 offset:140
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v162, off, s32 offset:132
-; GFX11-FAKE16-NEXT:    s_clause 0xf
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v165, off, s32 offset:124
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v149, off, s32 offset:116
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v151, off, s32 offset:108
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v144, off, s32 offset:100
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v148, off, s32 offset:92
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v129, off, s32 offset:84
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v133, off, s32 offset:76
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v117, off, s32 offset:68
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v119, off, s32 offset:60
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v114, off, s32 offset:52
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v116, off, s32 offset:44
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v100, off, s32 offset:36
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v103, off, s32 offset:28
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v98, off, s32 offset:20
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v99, off, s32 offset:12
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v96, off, s32 offset:4
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v64, 8, v1
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v66, 8, v3
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v65, 8, v5
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v68, 8, v7
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v67, 8, v9
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v70, 8, v11
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v69, 8, v13
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v71, 8, v15
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v80, 8, v17
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v82, 8, v19
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v81, 8, v21
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v84, 8, v23
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v83, 8, v25
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v86, 8, v27
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v85, 8, v29
-; GFX11-FAKE16-NEXT:    s_mov_b32 s4, 0
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(62)
-; GFX11-FAKE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v97, 8, v2
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v87, 8, v4
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v101, 8, v6
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v102, 8, v8
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v113, 8, v10
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v112, 8, v12
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v130, 8, v14
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v128, 8, v16
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v134, 8, v18
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v132, 8, v20
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v161, 8, v22
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v147, 8, v24
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v166, 8, v26
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v167, 8, v28
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v180, 8, v30
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v177, 8, v31
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v42, 8, v41
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(61)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v41, 8, v44
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(60)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v45, 8, v45
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(59)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v44, 8, v56
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(58)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v59, 8, v59
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(57)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v56, 8, v60
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(56)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v60, 8, v61
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(55)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v61, 8, v62
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(54)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v63, 8, v63
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(53)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v62, 8, v72
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(52)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v73, 8, v73
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(51)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v72, 8, v74
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(50)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v75, 8, v75
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(49)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v74, 8, v76
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(48)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v77, 8, v77
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(47)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v76, 8, v78
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(46)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v78, 8, v79
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(45)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v79, 8, v88
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(44)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v89, 8, v89
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(43)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v88, 8, v90
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(42)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v91, 8, v91
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(41)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v90, 8, v92
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(40)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v92, 8, v93
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(39)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v93, 8, v94
-; GFX11-FAKE16-NEXT:    s_and_b32 s5, vcc_lo, exec_lo
-; GFX11-FAKE16-NEXT:    s_cbranch_scc0 .LBB93_4
-; GFX11-FAKE16-NEXT:  ; %bb.1: ; %cmp.false
-; GFX11-FAKE16-NEXT:    s_and_b32 s5, s28, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s6, s29, 8
-; GFX11-FAKE16-NEXT:    s_and_b32 s7, s2, 0xff
-; GFX11-FAKE16-NEXT:    s_or_b32 s5, s5, s6
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s6, s1, 8
-; GFX11-FAKE16-NEXT:    v_and_b32_e64 v5, 0xffff, s5
-; GFX11-FAKE16-NEXT:    s_and_b32 s5, s0, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s8, s3, 8
-; GFX11-FAKE16-NEXT:    s_or_b32 s5, s5, s6
-; GFX11-FAKE16-NEXT:    s_or_b32 s6, s7, s8
-; GFX11-FAKE16-NEXT:    s_and_b32 s7, s16, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s8, s17, 8
-; GFX11-FAKE16-NEXT:    s_and_b32 s9, s18, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s10, s19, 8
-; GFX11-FAKE16-NEXT:    s_or_b32 s7, s7, s8
-; GFX11-FAKE16-NEXT:    s_or_b32 s8, s9, s10
-; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s5, s5, s6
-; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s6, s7, s8
-; GFX11-FAKE16-NEXT:    s_and_b32 s7, s20, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s8, s21, 8
-; GFX11-FAKE16-NEXT:    s_and_b32 s9, s22, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s10, s23, 8
-; GFX11-FAKE16-NEXT:    s_or_b32 s7, s7, s8
-; GFX11-FAKE16-NEXT:    s_or_b32 s8, s9, s10
-; GFX11-FAKE16-NEXT:    s_and_b32 s9, s24, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s10, s25, 8
-; GFX11-FAKE16-NEXT:    s_and_b32 s11, s26, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s12, s27, 8
-; GFX11-FAKE16-NEXT:    s_or_b32 s9, s9, s10
-; GFX11-FAKE16-NEXT:    s_or_b32 s10, s11, s12
-; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s7, s7, s8
-; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s8, s9, s10
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v36
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v32
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v35
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xff, v34
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v33
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v2, v68
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v64
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v66
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v6, v4, v67
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v3, v65
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v4, v0, 16, v5
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v38
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xff, v39
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v6, v6, 16, v2
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v49
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v5, v3, 16, v1
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v37
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v70
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v50
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v2, v71
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 0xff, v48
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v69
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v3, v82
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v9, v7, v80
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v10, v8, v81
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v7, v1, 16, v0
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xffff, v3
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v53
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v8, v9, 16, v2
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v55
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v51
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v9, v10, 16, v3
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v84
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v52
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v10, 0xff, v54
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v2, v86
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v83
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v11, 0xff, v96
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v3, v85
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v12, v10, v97
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v10, v1, 16, v0
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v11, v87
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v99
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v12, 0xffff, v12
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v11, v3, 16, v2
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v103
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v14, 0xff, v114
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v98
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v12, v0, 16, v12
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v100
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v3, v113
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v101
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v13, 0xff, v116
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v17, v14, v128
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v112
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xffff, v3
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v14, 0xff, v117
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v2, v102
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v13, v13, v130
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v16, 0xff, v133
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v20, v14, v132
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v14, v0, 16, v3
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v148
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v15, 0xff, v119
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v18, 0xff, v129
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v16, v16, v161
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v19, 0xffff, v13
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v13, v2, 16, v1
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v166
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v144
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v15, v15, v134
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v18, v18, v147
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v22, 0xffff, v16
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v167
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v21, 0xffff, v15
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v15, v17, 16, v19
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v17, v18, 16, v22
-; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v2, s7
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v18, v1, 16, v0
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v151
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v149
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v16, v20, 16, v21
-; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v3, s8
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v180
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v177
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v19, v1, 16, v0
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v165
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v162
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v42
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v41
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v20, v1, 16, v0
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v179
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v115
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v45
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v44
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v21, v1, 16, v0
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v131
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v118
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v59
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v56
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v22, v1, 16, v0
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v145
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v135
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v60
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v61
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v23, v1, 16, v0
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v150
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v146
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v63
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v62
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v24, v1, 16, v0
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v163
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v160
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v73
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v72
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v25, v1, 16, v0
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v176
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v164
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v75
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v74
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v26, v1, 16, v0
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v181
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v178
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v77
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v76
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v27, v1, 16, v0
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v183
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v182
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v78
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v79
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v28, v1, 16, v0
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v43
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v40
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v89
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v88
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v29, v1, 16, v0
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v47
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v46
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v91
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v90
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v30, v1, 16, v0
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v58
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v57
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v92
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v93
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v31, v1, 16, v0
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, s6
-; GFX11-FAKE16-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s4
-; GFX11-FAKE16-NEXT:    s_cbranch_vccnz .LBB93_3
-; GFX11-FAKE16-NEXT:  .LBB93_2: ; %cmp.true
-; GFX11-FAKE16-NEXT:    s_add_i32 s28, s28, 3
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s5, s29, 8
-; GFX11-FAKE16-NEXT:    s_and_b32 s4, s28, 0xff
-; GFX11-FAKE16-NEXT:    s_add_i32 s24, s24, 3
-; GFX11-FAKE16-NEXT:    s_or_b32 s4, s5, s4
-; GFX11-FAKE16-NEXT:    s_and_b32 s5, s24, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s6, s25, 8
-; GFX11-FAKE16-NEXT:    s_add_i32 s26, s26, 3
-; GFX11-FAKE16-NEXT:    s_or_b32 s5, s6, s5
-; GFX11-FAKE16-NEXT:    s_and_b32 s6, s26, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s7, s27, 8
-; GFX11-FAKE16-NEXT:    s_add_i32 s20, s20, 3
-; GFX11-FAKE16-NEXT:    s_or_b32 s6, s7, s6
-; GFX11-FAKE16-NEXT:    s_and_b32 s7, s20, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s8, s21, 8
-; GFX11-FAKE16-NEXT:    s_add_i32 s22, s22, 3
-; GFX11-FAKE16-NEXT:    s_or_b32 s7, s8, s7
-; GFX11-FAKE16-NEXT:    s_and_b32 s8, s22, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s9, s23, 8
-; GFX11-FAKE16-NEXT:    s_add_i32 s16, s16, 3
-; GFX11-FAKE16-NEXT:    s_or_b32 s8, s9, s8
-; GFX11-FAKE16-NEXT:    s_and_b32 s9, s16, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s10, s17, 8
-; GFX11-FAKE16-NEXT:    s_add_i32 s18, s18, 3
-; GFX11-FAKE16-NEXT:    s_add_i32 s0, s0, 3
-; GFX11-FAKE16-NEXT:    s_add_i32 s2, s2, 3
-; GFX11-FAKE16-NEXT:    s_or_b32 s9, s10, s9
-; GFX11-FAKE16-NEXT:    s_and_b32 s10, s18, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s11, s19, 8
-; GFX11-FAKE16-NEXT:    s_and_b32 s0, s0, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s1, s1, 8
-; GFX11-FAKE16-NEXT:    s_and_b32 s2, s2, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s3, s3, 8
-; GFX11-FAKE16-NEXT:    s_or_b32 s10, s11, s10
-; GFX11-FAKE16-NEXT:    s_or_b32 s0, s1, s0
-; GFX11-FAKE16-NEXT:    s_or_b32 s1, s3, s2
-; GFX11-FAKE16-NEXT:    s_addk_i32 s9, 0x300
-; GFX11-FAKE16-NEXT:    s_addk_i32 s0, 0x300
-; GFX11-FAKE16-NEXT:    s_addk_i32 s1, 0x300
-; GFX11-FAKE16-NEXT:    s_addk_i32 s10, 0x300
-; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s0, s0, s1
-; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s1, s9, s10
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(37)
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v0, 3, v58
-; GFX11-FAKE16-NEXT:    s_addk_i32 s5, 0x300
-; GFX11-FAKE16-NEXT:    s_addk_i32 s6, 0x300
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v1, 3, v57
-; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s3, s5, s6
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(35)
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v2, 3, v47
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-FAKE16-NEXT:    s_addk_i32 s7, 0x300
-; GFX11-FAKE16-NEXT:    s_addk_i32 s8, 0x300
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v92, v0
-; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s2, s7, s8
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v3, 3, v46
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v93, v1
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v91, v2
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(33)
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v4, 3, v43
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v27, 0x300, v0
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v0, 3, v40
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v31, 0x300, v1
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v28, 0x300, v2
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v4
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v90, v3
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(31)
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v2, 3, v183
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v4, 3, v182
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v89, v1
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v30, 0x300, v3
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v4
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(29)
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v4, 3, v181
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v88, v0
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v181, 0x300, v1
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v78, v2
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v79, v3
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v4
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v29, 0x300, v0
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v0, 3, v178
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v182, 0x300, v1
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v178, 0x300, v2
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v77, v3
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(27)
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v2, 3, v176
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v3, 3, v164
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(25)
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v4, 3, v163
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v163, 0x300, v1
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v2
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v76, v0
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v3
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v4
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v4, 3, v160
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v75, v1
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v160, 0x300, v0
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v74, v2
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v73, v3
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v4
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v22, 0x300, v1
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(23)
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v1, 3, v150
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v26, 0x300, v0
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v23, 0x300, v2
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v72, v3
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v2, 3, v146
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(21)
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v3, 3, v145
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v4, 3, v135
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v25, 0x300, v0
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v2
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v63, v1
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v3
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v4
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(19)
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v4, 3, v131
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v62, v0
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v131, 0x300, v1
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v60, v2
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v61, v3
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v4
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v24, 0x300, v0
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v0, 3, v118
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v135, 0x300, v1
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v118, 0x300, v2
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v59, v3
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(17)
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v2, 3, v179
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v3, 3, v115
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(15)
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v4, 3, v165
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v115, 0x300, v1
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v2
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v56, v0
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v3
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v4
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v4, 3, v162
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v45, v1
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v145, 0x300, v0
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v44, v2
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v42, v3
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v4
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v17, 0x300, v1
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(13)
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v1, 3, v151
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v21, 0x300, v0
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v18, 0x300, v2
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v41, v3
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v2, 3, v149
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(11)
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v3, 3, v148
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v4, 3, v144
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v20, 0x300, v0
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v2
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v180, v1
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v3
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v4
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(9)
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v4, 3, v133
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v177, v0
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v19, 0x300, v1
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v166, v2
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v167, v3
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v4
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v133, 0x300, v0
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v0, 3, v129
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v129, 0x300, v1
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v144, 0x300, v2
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v161, v3
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(7)
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v2, 3, v119
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v3, 3, v117
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(5)
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v4, 3, v116
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v116, 0x300, v1
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v2
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v147, v0
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v3
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v4
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v4, 3, v114
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(1)
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v5, 3, v99
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v114, 0x300, v0
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v132, v2
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v130, v3
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v4
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v4, 3, v103
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v6, 3, v98
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v5, 0xff, v5
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v7, 3, v54
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v8, 3, v53
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xff, v6
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v9, 3, v39
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v33, 3, v33
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v16, 0x300, v0
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, v113, v4
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v0, 0x300, v2
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v128, v3
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v3, 3, v100
-; GFX11-FAKE16-NEXT:    s_addk_i32 s4, 0x300
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v11, 0x300, v4
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, v101, v5
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v102, v6
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xff, v7
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v7, 3, v96
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v134, v1
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v12, 0x300, v4
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v13, 0x300, v5
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, v97, v6
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v5, 3, v55
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xff, v7
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v7, 3, v52
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v14, 0x300, v4
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xff, v5
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v87, v6
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xff, v7
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xff, v8
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v8, 3, v51
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, v86, v4
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v15, 0x300, v5
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v85, v6
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v6, v84, v7
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xff, v8
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v51, 0x300, v4
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v4, 3, v50
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v50, 0x300, v5
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v8, 3, v49
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v83, v7
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v7, 3, v48
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v6, 0x300, v6
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v1, 0x300, v1
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v10, 0x300, v5
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v5, 0xff, v7
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, v82, v4
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xff, v8
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 0xff, v9
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v9, 3, v38
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v81, v5
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v38, 0x300, v4
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, v71, v7
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v7, v80, v8
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 0xff, v9
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v9, 0x300, v5
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v5, 3, v37
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v37, 0x300, v4
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v39, 0x300, v7
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, v70, v8
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v7, 3, v36
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v5, 0xff, v5
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v8, 3, v34
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v34, 3, v35
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v35, 0x300, v4
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xff, v7
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v69, v5
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xff, v8
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 0xff, v34
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v112, v3
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, v68, v4
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v34, 0x300, v5
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v67, v7
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v7, v66, v8
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 0xff, v33
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v33, 0x300, v4
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v4, 3, v32
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v32, 0x300, v5
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v5, 0x300, v7
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v7, v65, v8
-; GFX11-FAKE16-NEXT:    v_and_b32_e64 v8, 0xffff, s4
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v36, 0xffff, v6
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v5, 0xffff, v5
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v7, 0x300, v7
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v3, 0x300, v3
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, v64, v4
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v14, 0xffff, v14
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v10, v10, 16, v36
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v5, v7, 16, v5
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xffff, v33
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v4, 0x300, v4
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v33, 0xffff, v37
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v36, 0xffff, v22
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v2, 0x300, v2
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v6, v32, 16, v7
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v4, v4, 16, v8
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 0xffff, v35
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v32, 0xffff, v51
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v35, 0xffff, v38
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v19, 0xffff, v19
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v7, v34, 16, v8
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v8, v39, 16, v33
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v33, 0xffff, v12
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v34, 0xffff, v11
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v11, v50, 16, v32
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v32, 0xffff, v1
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v9, v9, 16, v35
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v12, v15, 16, v14
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v13, v13, 16, v33
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v14, v3, 16, v34
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v16, v16, 16, v32
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v32, 0xffff, v116
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v33, 0xffff, v129
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v34, 0xffff, v18
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v35, 0xffff, v17
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v26, v26, 16, v36
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v17, v114, 16, v32
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v18, v144, 16, v33
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v20, v20, 16, v34
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v21, v21, 16, v35
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v32, 0xffff, v115
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v33, 0xffff, v135
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v34, 0xffff, v131
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v35, 0xffff, v23
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v36, 0xffff, v27
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v22, v145, 16, v32
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v23, v118, 16, v33
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v24, v24, 16, v34
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v25, v25, 16, v35
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v32, 0xffff, v163
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v33, 0xffff, v182
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v34, 0xffff, v181
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v35, 0xffff, v28
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v15, v2, 16, v0
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v19, v133, 16, v19
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v27, v160, 16, v32
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v28, v178, 16, v33
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v29, v29, 16, v34
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v30, v30, 16, v35
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v31, v31, 16, v36
-; GFX11-FAKE16-NEXT:  .LBB93_3: ; %end
-; GFX11-FAKE16-NEXT:    s_clause 0x1e
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v94, off, s32 offset:320
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v93, off, s32 offset:324
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v92, off, s32 offset:328
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v91, off, s32 offset:332
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v90, off, s32 offset:336
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v89, off, s32 offset:340
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v88, off, s32 offset:344
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v79, off, s32 offset:348
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v78, off, s32 offset:352
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v77, off, s32 offset:356
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v76, off, s32 offset:360
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v75, off, s32 offset:364
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v74, off, s32 offset:368
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v73, off, s32 offset:372
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v72, off, s32 offset:376
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v63, off, s32 offset:380
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v62, off, s32 offset:384
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v61, off, s32 offset:388
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v60, off, s32 offset:392
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v59, off, s32 offset:396
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v58, off, s32 offset:400
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v57, off, s32 offset:404
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v56, off, s32 offset:408
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v47, off, s32 offset:412
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v46, off, s32 offset:416
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v45, off, s32 offset:420
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v44, off, s32 offset:424
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v43, off, s32 offset:428
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v42, off, s32 offset:432
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v41, off, s32 offset:436
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v40, off, s32 offset:440
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
-; GFX11-FAKE16-NEXT:  .LBB93_4:
-; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
-; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
-; GFX11-FAKE16-NEXT:    s_branch .LBB93_2
+; GFX11-LABEL: bitcast_v128i8_to_v64f16_scalar:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_clause 0x1e
+; GFX11-NEXT:    scratch_store_b32 off, v40, s32 offset:440
+; GFX11-NEXT:    scratch_store_b32 off, v41, s32 offset:436
+; GFX11-NEXT:    scratch_store_b32 off, v42, s32 offset:432
+; GFX11-NEXT:    scratch_store_b32 off, v43, s32 offset:428
+; GFX11-NEXT:    scratch_store_b32 off, v44, s32 offset:424
+; GFX11-NEXT:    scratch_store_b32 off, v45, s32 offset:420
+; GFX11-NEXT:    scratch_store_b32 off, v46, s32 offset:416
+; GFX11-NEXT:    scratch_store_b32 off, v47, s32 offset:412
+; GFX11-NEXT:    scratch_store_b32 off, v56, s32 offset:408
+; GFX11-NEXT:    scratch_store_b32 off, v57, s32 offset:404
+; GFX11-NEXT:    scratch_store_b32 off, v58, s32 offset:400
+; GFX11-NEXT:    scratch_store_b32 off, v59, s32 offset:396
+; GFX11-NEXT:    scratch_store_b32 off, v60, s32 offset:392
+; GFX11-NEXT:    scratch_store_b32 off, v61, s32 offset:388
+; GFX11-NEXT:    scratch_store_b32 off, v62, s32 offset:384
+; GFX11-NEXT:    scratch_store_b32 off, v63, s32 offset:380
+; GFX11-NEXT:    scratch_store_b32 off, v72, s32 offset:376
+; GFX11-NEXT:    scratch_store_b32 off, v73, s32 offset:372
+; GFX11-NEXT:    scratch_store_b32 off, v74, s32 offset:368
+; GFX11-NEXT:    scratch_store_b32 off, v75, s32 offset:364
+; GFX11-NEXT:    scratch_store_b32 off, v76, s32 offset:360
+; GFX11-NEXT:    scratch_store_b32 off, v77, s32 offset:356
+; GFX11-NEXT:    scratch_store_b32 off, v78, s32 offset:352
+; GFX11-NEXT:    scratch_store_b32 off, v79, s32 offset:348
+; GFX11-NEXT:    scratch_store_b32 off, v88, s32 offset:344
+; GFX11-NEXT:    scratch_store_b32 off, v89, s32 offset:340
+; GFX11-NEXT:    scratch_store_b32 off, v90, s32 offset:336
+; GFX11-NEXT:    scratch_store_b32 off, v91, s32 offset:332
+; GFX11-NEXT:    scratch_store_b32 off, v92, s32 offset:328
+; GFX11-NEXT:    scratch_store_b32 off, v93, s32 offset:324
+; GFX11-NEXT:    scratch_store_b32 off, v94, s32 offset:320
+; GFX11-NEXT:    v_dual_mov_b32 v54, v30 :: v_dual_mov_b32 v51, v24
+; GFX11-NEXT:    v_dual_mov_b32 v52, v28 :: v_dual_mov_b32 v55, v26
+; GFX11-NEXT:    v_dual_mov_b32 v53, v22 :: v_dual_mov_b32 v48, v20
+; GFX11-NEXT:    v_dual_mov_b32 v50, v18 :: v_dual_mov_b32 v39, v16
+; GFX11-NEXT:    v_dual_mov_b32 v49, v14 :: v_dual_mov_b32 v34, v8
+; GFX11-NEXT:    v_dual_mov_b32 v37, v12 :: v_dual_mov_b32 v38, v10
+; GFX11-NEXT:    v_dual_mov_b32 v36, v6 :: v_dual_mov_b32 v33, v4
+; GFX11-NEXT:    v_dual_mov_b32 v35, v2 :: v_dual_mov_b32 v32, v0
+; GFX11-NEXT:    s_clause 0x1f
+; GFX11-NEXT:    scratch_load_b32 v0, off, s32 offset:316
+; GFX11-NEXT:    scratch_load_u16 v2, off, s32
+; GFX11-NEXT:    scratch_load_u16 v4, off, s32 offset:8
+; GFX11-NEXT:    scratch_load_u16 v6, off, s32 offset:16
+; GFX11-NEXT:    scratch_load_u16 v8, off, s32 offset:24
+; GFX11-NEXT:    scratch_load_u16 v10, off, s32 offset:32
+; GFX11-NEXT:    scratch_load_u16 v12, off, s32 offset:40
+; GFX11-NEXT:    scratch_load_u16 v14, off, s32 offset:48
+; GFX11-NEXT:    scratch_load_u16 v16, off, s32 offset:56
+; GFX11-NEXT:    scratch_load_u16 v18, off, s32 offset:64
+; GFX11-NEXT:    scratch_load_u16 v20, off, s32 offset:72
+; GFX11-NEXT:    scratch_load_u16 v22, off, s32 offset:80
+; GFX11-NEXT:    scratch_load_u16 v24, off, s32 offset:88
+; GFX11-NEXT:    scratch_load_u16 v26, off, s32 offset:96
+; GFX11-NEXT:    scratch_load_u16 v28, off, s32 offset:104
+; GFX11-NEXT:    scratch_load_u16 v30, off, s32 offset:112
+; GFX11-NEXT:    scratch_load_u16 v31, off, s32 offset:120
+; GFX11-NEXT:    scratch_load_u16 v41, off, s32 offset:128
+; GFX11-NEXT:    scratch_load_u16 v44, off, s32 offset:136
+; GFX11-NEXT:    scratch_load_u16 v45, off, s32 offset:144
+; GFX11-NEXT:    scratch_load_u16 v56, off, s32 offset:152
+; GFX11-NEXT:    scratch_load_u16 v59, off, s32 offset:160
+; GFX11-NEXT:    scratch_load_u16 v60, off, s32 offset:168
+; GFX11-NEXT:    scratch_load_u16 v61, off, s32 offset:176
+; GFX11-NEXT:    scratch_load_u16 v62, off, s32 offset:184
+; GFX11-NEXT:    scratch_load_u16 v63, off, s32 offset:192
+; GFX11-NEXT:    scratch_load_u16 v72, off, s32 offset:200
+; GFX11-NEXT:    scratch_load_u16 v73, off, s32 offset:208
+; GFX11-NEXT:    scratch_load_u16 v74, off, s32 offset:216
+; GFX11-NEXT:    scratch_load_u16 v75, off, s32 offset:224
+; GFX11-NEXT:    scratch_load_u16 v76, off, s32 offset:232
+; GFX11-NEXT:    scratch_load_u16 v77, off, s32 offset:240
+; GFX11-NEXT:    s_clause 0x1f
+; GFX11-NEXT:    scratch_load_u16 v78, off, s32 offset:248
+; GFX11-NEXT:    scratch_load_u16 v79, off, s32 offset:256
+; GFX11-NEXT:    scratch_load_u16 v88, off, s32 offset:264
+; GFX11-NEXT:    scratch_load_u16 v89, off, s32 offset:272
+; GFX11-NEXT:    scratch_load_u16 v90, off, s32 offset:280
+; GFX11-NEXT:    scratch_load_u16 v91, off, s32 offset:288
+; GFX11-NEXT:    scratch_load_u16 v92, off, s32 offset:296
+; GFX11-NEXT:    scratch_load_u16 v93, off, s32 offset:304
+; GFX11-NEXT:    scratch_load_u16 v94, off, s32 offset:312
+; GFX11-NEXT:    scratch_load_u16 v57, off, s32 offset:308
+; GFX11-NEXT:    scratch_load_u16 v58, off, s32 offset:300
+; GFX11-NEXT:    scratch_load_u16 v46, off, s32 offset:292
+; GFX11-NEXT:    scratch_load_u16 v47, off, s32 offset:284
+; GFX11-NEXT:    scratch_load_u16 v40, off, s32 offset:276
+; GFX11-NEXT:    scratch_load_u16 v43, off, s32 offset:268
+; GFX11-NEXT:    scratch_load_u16 v182, off, s32 offset:260
+; GFX11-NEXT:    scratch_load_u16 v183, off, s32 offset:252
+; GFX11-NEXT:    scratch_load_u16 v178, off, s32 offset:244
+; GFX11-NEXT:    scratch_load_u16 v181, off, s32 offset:236
+; GFX11-NEXT:    scratch_load_u16 v164, off, s32 offset:228
+; GFX11-NEXT:    scratch_load_u16 v176, off, s32 offset:220
+; GFX11-NEXT:    scratch_load_u16 v160, off, s32 offset:212
+; GFX11-NEXT:    scratch_load_u16 v163, off, s32 offset:204
+; GFX11-NEXT:    scratch_load_u16 v146, off, s32 offset:196
+; GFX11-NEXT:    scratch_load_u16 v150, off, s32 offset:188
+; GFX11-NEXT:    scratch_load_u16 v135, off, s32 offset:180
+; GFX11-NEXT:    scratch_load_u16 v145, off, s32 offset:172
+; GFX11-NEXT:    scratch_load_u16 v118, off, s32 offset:164
+; GFX11-NEXT:    scratch_load_u16 v131, off, s32 offset:156
+; GFX11-NEXT:    scratch_load_u16 v115, off, s32 offset:148
+; GFX11-NEXT:    scratch_load_u16 v179, off, s32 offset:140
+; GFX11-NEXT:    scratch_load_u16 v162, off, s32 offset:132
+; GFX11-NEXT:    s_clause 0xf
+; GFX11-NEXT:    scratch_load_u16 v165, off, s32 offset:124
+; GFX11-NEXT:    scratch_load_u16 v149, off, s32 offset:116
+; GFX11-NEXT:    scratch_load_u16 v151, off, s32 offset:108
+; GFX11-NEXT:    scratch_load_u16 v144, off, s32 offset:100
+; GFX11-NEXT:    scratch_load_u16 v148, off, s32 offset:92
+; GFX11-NEXT:    scratch_load_u16 v129, off, s32 offset:84
+; GFX11-NEXT:    scratch_load_u16 v133, off, s32 offset:76
+; GFX11-NEXT:    scratch_load_u16 v117, off, s32 offset:68
+; GFX11-NEXT:    scratch_load_u16 v119, off, s32 offset:60
+; GFX11-NEXT:    scratch_load_u16 v114, off, s32 offset:52
+; GFX11-NEXT:    scratch_load_u16 v116, off, s32 offset:44
+; GFX11-NEXT:    scratch_load_u16 v100, off, s32 offset:36
+; GFX11-NEXT:    scratch_load_u16 v103, off, s32 offset:28
+; GFX11-NEXT:    scratch_load_u16 v98, off, s32 offset:20
+; GFX11-NEXT:    scratch_load_u16 v99, off, s32 offset:12
+; GFX11-NEXT:    scratch_load_u16 v96, off, s32 offset:4
+; GFX11-NEXT:    v_lshlrev_b32_e32 v64, 8, v1
+; GFX11-NEXT:    v_lshlrev_b32_e32 v66, 8, v3
+; GFX11-NEXT:    v_lshlrev_b32_e32 v65, 8, v5
+; GFX11-NEXT:    v_lshlrev_b32_e32 v68, 8, v7
+; GFX11-NEXT:    v_lshlrev_b32_e32 v67, 8, v9
+; GFX11-NEXT:    v_lshlrev_b32_e32 v70, 8, v11
+; GFX11-NEXT:    v_lshlrev_b32_e32 v69, 8, v13
+; GFX11-NEXT:    v_lshlrev_b32_e32 v71, 8, v15
+; GFX11-NEXT:    v_lshlrev_b32_e32 v80, 8, v17
+; GFX11-NEXT:    v_lshlrev_b32_e32 v82, 8, v19
+; GFX11-NEXT:    v_lshlrev_b32_e32 v81, 8, v21
+; GFX11-NEXT:    v_lshlrev_b32_e32 v84, 8, v23
+; GFX11-NEXT:    v_lshlrev_b32_e32 v83, 8, v25
+; GFX11-NEXT:    v_lshlrev_b32_e32 v86, 8, v27
+; GFX11-NEXT:    v_lshlrev_b32_e32 v85, 8, v29
+; GFX11-NEXT:    s_mov_b32 s4, 0
+; GFX11-NEXT:    s_waitcnt vmcnt(62)
+; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v97, 8, v2
+; GFX11-NEXT:    v_lshlrev_b32_e32 v87, 8, v4
+; GFX11-NEXT:    v_lshlrev_b32_e32 v101, 8, v6
+; GFX11-NEXT:    v_lshlrev_b32_e32 v102, 8, v8
+; GFX11-NEXT:    v_lshlrev_b32_e32 v113, 8, v10
+; GFX11-NEXT:    v_lshlrev_b32_e32 v112, 8, v12
+; GFX11-NEXT:    v_lshlrev_b32_e32 v130, 8, v14
+; GFX11-NEXT:    v_lshlrev_b32_e32 v128, 8, v16
+; GFX11-NEXT:    v_lshlrev_b32_e32 v134, 8, v18
+; GFX11-NEXT:    v_lshlrev_b32_e32 v132, 8, v20
+; GFX11-NEXT:    v_lshlrev_b32_e32 v161, 8, v22
+; GFX11-NEXT:    v_lshlrev_b32_e32 v147, 8, v24
+; GFX11-NEXT:    v_lshlrev_b32_e32 v166, 8, v26
+; GFX11-NEXT:    v_lshlrev_b32_e32 v167, 8, v28
+; GFX11-NEXT:    v_lshlrev_b32_e32 v180, 8, v30
+; GFX11-NEXT:    v_lshlrev_b32_e32 v177, 8, v31
+; GFX11-NEXT:    v_lshlrev_b32_e32 v42, 8, v41
+; GFX11-NEXT:    s_waitcnt vmcnt(61)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v41, 8, v44
+; GFX11-NEXT:    s_waitcnt vmcnt(60)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v45, 8, v45
+; GFX11-NEXT:    s_waitcnt vmcnt(59)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v44, 8, v56
+; GFX11-NEXT:    s_waitcnt vmcnt(58)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v59, 8, v59
+; GFX11-NEXT:    s_waitcnt vmcnt(57)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v56, 8, v60
+; GFX11-NEXT:    s_waitcnt vmcnt(56)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v60, 8, v61
+; GFX11-NEXT:    s_waitcnt vmcnt(55)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v61, 8, v62
+; GFX11-NEXT:    s_waitcnt vmcnt(54)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v63, 8, v63
+; GFX11-NEXT:    s_waitcnt vmcnt(53)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v62, 8, v72
+; GFX11-NEXT:    s_waitcnt vmcnt(52)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v73, 8, v73
+; GFX11-NEXT:    s_waitcnt vmcnt(51)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v72, 8, v74
+; GFX11-NEXT:    s_waitcnt vmcnt(50)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v75, 8, v75
+; GFX11-NEXT:    s_waitcnt vmcnt(49)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v74, 8, v76
+; GFX11-NEXT:    s_waitcnt vmcnt(48)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v77, 8, v77
+; GFX11-NEXT:    s_waitcnt vmcnt(47)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v76, 8, v78
+; GFX11-NEXT:    s_waitcnt vmcnt(46)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v78, 8, v79
+; GFX11-NEXT:    s_waitcnt vmcnt(45)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v79, 8, v88
+; GFX11-NEXT:    s_waitcnt vmcnt(44)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v89, 8, v89
+; GFX11-NEXT:    s_waitcnt vmcnt(43)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v88, 8, v90
+; GFX11-NEXT:    s_waitcnt vmcnt(42)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v91, 8, v91
+; GFX11-NEXT:    s_waitcnt vmcnt(41)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v90, 8, v92
+; GFX11-NEXT:    s_waitcnt vmcnt(40)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v92, 8, v93
+; GFX11-NEXT:    s_waitcnt vmcnt(39)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v93, 8, v94
+; GFX11-NEXT:    s_and_b32 s5, vcc_lo, exec_lo
+; GFX11-NEXT:    s_cbranch_scc0 .LBB93_4
+; GFX11-NEXT:  ; %bb.1: ; %cmp.false
+; GFX11-NEXT:    s_and_b32 s5, s28, 0xff
+; GFX11-NEXT:    s_lshl_b32 s6, s29, 8
+; GFX11-NEXT:    s_and_b32 s7, s2, 0xff
+; GFX11-NEXT:    s_or_b32 s5, s5, s6
+; GFX11-NEXT:    s_lshl_b32 s6, s1, 8
+; GFX11-NEXT:    v_and_b32_e64 v5, 0xffff, s5
+; GFX11-NEXT:    s_and_b32 s5, s0, 0xff
+; GFX11-NEXT:    s_lshl_b32 s8, s3, 8
+; GFX11-NEXT:    s_or_b32 s5, s5, s6
+; GFX11-NEXT:    s_or_b32 s6, s7, s8
+; GFX11-NEXT:    s_and_b32 s7, s16, 0xff
+; GFX11-NEXT:    s_lshl_b32 s8, s17, 8
+; GFX11-NEXT:    s_and_b32 s9, s18, 0xff
+; GFX11-NEXT:    s_lshl_b32 s10, s19, 8
+; GFX11-NEXT:    s_or_b32 s7, s7, s8
+; GFX11-NEXT:    s_or_b32 s8, s9, s10
+; GFX11-NEXT:    s_pack_ll_b32_b16 s5, s5, s6
+; GFX11-NEXT:    s_pack_ll_b32_b16 s6, s7, s8
+; GFX11-NEXT:    s_and_b32 s7, s20, 0xff
+; GFX11-NEXT:    s_lshl_b32 s8, s21, 8
+; GFX11-NEXT:    s_and_b32 s9, s22, 0xff
+; GFX11-NEXT:    s_lshl_b32 s10, s23, 8
+; GFX11-NEXT:    s_or_b32 s7, s7, s8
+; GFX11-NEXT:    s_or_b32 s8, s9, s10
+; GFX11-NEXT:    s_and_b32 s9, s24, 0xff
+; GFX11-NEXT:    s_lshl_b32 s10, s25, 8
+; GFX11-NEXT:    s_and_b32 s11, s26, 0xff
+; GFX11-NEXT:    s_lshl_b32 s12, s27, 8
+; GFX11-NEXT:    s_or_b32 s9, s9, s10
+; GFX11-NEXT:    s_or_b32 s10, s11, s12
+; GFX11-NEXT:    s_pack_ll_b32_b16 s7, s7, s8
+; GFX11-NEXT:    s_pack_ll_b32_b16 s8, s9, s10
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v36
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v32
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v35
+; GFX11-NEXT:    v_and_b32_e32 v4, 0xff, v34
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v33
+; GFX11-NEXT:    v_or_b32_e32 v2, v2, v68
+; GFX11-NEXT:    v_or_b32_e32 v0, v0, v64
+; GFX11-NEXT:    v_or_b32_e32 v1, v1, v66
+; GFX11-NEXT:    v_or_b32_e32 v6, v4, v67
+; GFX11-NEXT:    v_or_b32_e32 v3, v3, v65
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX11-NEXT:    v_lshl_or_b32 v4, v0, 16, v5
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v38
+; GFX11-NEXT:    v_and_b32_e32 v7, 0xff, v39
+; GFX11-NEXT:    v_lshl_or_b32 v6, v6, 16, v2
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v49
+; GFX11-NEXT:    v_lshl_or_b32 v5, v3, 16, v1
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v37
+; GFX11-NEXT:    v_or_b32_e32 v0, v0, v70
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v50
+; GFX11-NEXT:    v_or_b32_e32 v2, v2, v71
+; GFX11-NEXT:    v_and_b32_e32 v8, 0xff, v48
+; GFX11-NEXT:    v_or_b32_e32 v1, v1, v69
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    v_or_b32_e32 v3, v3, v82
+; GFX11-NEXT:    v_or_b32_e32 v9, v7, v80
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX11-NEXT:    v_or_b32_e32 v10, v8, v81
+; GFX11-NEXT:    v_lshl_or_b32 v7, v1, 16, v0
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff, v3
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v53
+; GFX11-NEXT:    v_lshl_or_b32 v8, v9, 16, v2
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v55
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v51
+; GFX11-NEXT:    v_lshl_or_b32 v9, v10, 16, v3
+; GFX11-NEXT:    v_or_b32_e32 v0, v0, v84
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v52
+; GFX11-NEXT:    v_and_b32_e32 v10, 0xff, v54
+; GFX11-NEXT:    v_or_b32_e32 v2, v2, v86
+; GFX11-NEXT:    v_or_b32_e32 v1, v1, v83
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_and_b32_e32 v11, 0xff, v96
+; GFX11-NEXT:    v_or_b32_e32 v3, v3, v85
+; GFX11-NEXT:    v_or_b32_e32 v12, v10, v97
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX11-NEXT:    v_lshl_or_b32 v10, v1, 16, v0
+; GFX11-NEXT:    v_or_b32_e32 v0, v11, v87
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v99
+; GFX11-NEXT:    v_and_b32_e32 v12, 0xffff, v12
+; GFX11-NEXT:    v_lshl_or_b32 v11, v3, 16, v2
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v103
+; GFX11-NEXT:    v_and_b32_e32 v14, 0xff, v114
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v98
+; GFX11-NEXT:    v_lshl_or_b32 v12, v0, 16, v12
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v100
+; GFX11-NEXT:    v_or_b32_e32 v3, v3, v113
+; GFX11-NEXT:    v_or_b32_e32 v1, v1, v101
+; GFX11-NEXT:    v_and_b32_e32 v13, 0xff, v116
+; GFX11-NEXT:    v_or_b32_e32 v17, v14, v128
+; GFX11-NEXT:    v_or_b32_e32 v0, v0, v112
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff, v3
+; GFX11-NEXT:    v_and_b32_e32 v14, 0xff, v117
+; GFX11-NEXT:    v_or_b32_e32 v2, v2, v102
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX11-NEXT:    v_or_b32_e32 v13, v13, v130
+; GFX11-NEXT:    v_and_b32_e32 v16, 0xff, v133
+; GFX11-NEXT:    v_or_b32_e32 v20, v14, v132
+; GFX11-NEXT:    v_lshl_or_b32 v14, v0, 16, v3
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v148
+; GFX11-NEXT:    v_and_b32_e32 v15, 0xff, v119
+; GFX11-NEXT:    v_and_b32_e32 v18, 0xff, v129
+; GFX11-NEXT:    v_or_b32_e32 v16, v16, v161
+; GFX11-NEXT:    v_and_b32_e32 v19, 0xffff, v13
+; GFX11-NEXT:    v_lshl_or_b32 v13, v2, 16, v1
+; GFX11-NEXT:    v_or_b32_e32 v0, v0, v166
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v144
+; GFX11-NEXT:    v_or_b32_e32 v15, v15, v134
+; GFX11-NEXT:    v_or_b32_e32 v18, v18, v147
+; GFX11-NEXT:    v_and_b32_e32 v22, 0xffff, v16
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    v_or_b32_e32 v1, v1, v167
+; GFX11-NEXT:    v_and_b32_e32 v21, 0xffff, v15
+; GFX11-NEXT:    v_lshl_or_b32 v15, v17, 16, v19
+; GFX11-NEXT:    v_lshl_or_b32 v17, v18, 16, v22
+; GFX11-NEXT:    v_mov_b32_e32 v2, s7
+; GFX11-NEXT:    v_lshl_or_b32 v18, v1, 16, v0
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v151
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v149
+; GFX11-NEXT:    v_lshl_or_b32 v16, v20, 16, v21
+; GFX11-NEXT:    v_mov_b32_e32 v3, s8
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_or_b32_e32 v0, v0, v180
+; GFX11-NEXT:    v_or_b32_e32 v1, v1, v177
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    v_lshl_or_b32 v19, v1, 16, v0
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v165
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v162
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_or_b32_e32 v0, v0, v42
+; GFX11-NEXT:    v_or_b32_e32 v1, v1, v41
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    v_lshl_or_b32 v20, v1, 16, v0
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v179
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v115
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_or_b32_e32 v0, v0, v45
+; GFX11-NEXT:    v_or_b32_e32 v1, v1, v44
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    v_lshl_or_b32 v21, v1, 16, v0
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v131
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v118
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_or_b32_e32 v0, v0, v59
+; GFX11-NEXT:    v_or_b32_e32 v1, v1, v56
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    v_lshl_or_b32 v22, v1, 16, v0
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v145
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v135
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_or_b32_e32 v0, v0, v60
+; GFX11-NEXT:    v_or_b32_e32 v1, v1, v61
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    v_lshl_or_b32 v23, v1, 16, v0
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v150
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v146
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_or_b32_e32 v0, v0, v63
+; GFX11-NEXT:    v_or_b32_e32 v1, v1, v62
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    v_lshl_or_b32 v24, v1, 16, v0
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v163
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v160
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_or_b32_e32 v0, v0, v73
+; GFX11-NEXT:    v_or_b32_e32 v1, v1, v72
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    v_lshl_or_b32 v25, v1, 16, v0
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v176
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v164
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_or_b32_e32 v0, v0, v75
+; GFX11-NEXT:    v_or_b32_e32 v1, v1, v74
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    v_lshl_or_b32 v26, v1, 16, v0
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v181
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v178
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_or_b32_e32 v0, v0, v77
+; GFX11-NEXT:    v_or_b32_e32 v1, v1, v76
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    v_lshl_or_b32 v27, v1, 16, v0
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v183
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v182
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_or_b32_e32 v0, v0, v78
+; GFX11-NEXT:    v_or_b32_e32 v1, v1, v79
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    v_lshl_or_b32 v28, v1, 16, v0
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v43
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v40
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_or_b32_e32 v0, v0, v89
+; GFX11-NEXT:    v_or_b32_e32 v1, v1, v88
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    v_lshl_or_b32 v29, v1, 16, v0
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v47
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v46
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_or_b32_e32 v0, v0, v91
+; GFX11-NEXT:    v_or_b32_e32 v1, v1, v90
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    v_lshl_or_b32 v30, v1, 16, v0
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v58
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v57
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_or_b32_e32 v0, v0, v92
+; GFX11-NEXT:    v_or_b32_e32 v1, v1, v93
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    v_lshl_or_b32 v31, v1, 16, v0
+; GFX11-NEXT:    v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, s6
+; GFX11-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s4
+; GFX11-NEXT:    s_cbranch_vccnz .LBB93_3
+; GFX11-NEXT:  .LBB93_2: ; %cmp.true
+; GFX11-NEXT:    s_add_i32 s28, s28, 3
+; GFX11-NEXT:    s_lshl_b32 s5, s29, 8
+; GFX11-NEXT:    s_and_b32 s4, s28, 0xff
+; GFX11-NEXT:    s_add_i32 s24, s24, 3
+; GFX11-NEXT:    s_or_b32 s4, s5, s4
+; GFX11-NEXT:    s_and_b32 s5, s24, 0xff
+; GFX11-NEXT:    s_lshl_b32 s6, s25, 8
+; GFX11-NEXT:    s_add_i32 s26, s26, 3
+; GFX11-NEXT:    s_or_b32 s5, s6, s5
+; GFX11-NEXT:    s_and_b32 s6, s26, 0xff
+; GFX11-NEXT:    s_lshl_b32 s7, s27, 8
+; GFX11-NEXT:    s_add_i32 s20, s20, 3
+; GFX11-NEXT:    s_or_b32 s6, s7, s6
+; GFX11-NEXT:    s_and_b32 s7, s20, 0xff
+; GFX11-NEXT:    s_lshl_b32 s8, s21, 8
+; GFX11-NEXT:    s_add_i32 s22, s22, 3
+; GFX11-NEXT:    s_or_b32 s7, s8, s7
+; GFX11-NEXT:    s_and_b32 s8, s22, 0xff
+; GFX11-NEXT:    s_lshl_b32 s9, s23, 8
+; GFX11-NEXT:    s_add_i32 s16, s16, 3
+; GFX11-NEXT:    s_or_b32 s8, s9, s8
+; GFX11-NEXT:    s_and_b32 s9, s16, 0xff
+; GFX11-NEXT:    s_lshl_b32 s10, s17, 8
+; GFX11-NEXT:    s_add_i32 s18, s18, 3
+; GFX11-NEXT:    s_add_i32 s0, s0, 3
+; GFX11-NEXT:    s_add_i32 s2, s2, 3
+; GFX11-NEXT:    s_or_b32 s9, s10, s9
+; GFX11-NEXT:    s_and_b32 s10, s18, 0xff
+; GFX11-NEXT:    s_lshl_b32 s11, s19, 8
+; GFX11-NEXT:    s_and_b32 s0, s0, 0xff
+; GFX11-NEXT:    s_lshl_b32 s1, s1, 8
+; GFX11-NEXT:    s_and_b32 s2, s2, 0xff
+; GFX11-NEXT:    s_lshl_b32 s3, s3, 8
+; GFX11-NEXT:    s_or_b32 s10, s11, s10
+; GFX11-NEXT:    s_or_b32 s0, s1, s0
+; GFX11-NEXT:    s_or_b32 s1, s3, s2
+; GFX11-NEXT:    s_addk_i32 s9, 0x300
+; GFX11-NEXT:    s_addk_i32 s0, 0x300
+; GFX11-NEXT:    s_addk_i32 s1, 0x300
+; GFX11-NEXT:    s_addk_i32 s10, 0x300
+; GFX11-NEXT:    s_pack_ll_b32_b16 s0, s0, s1
+; GFX11-NEXT:    s_pack_ll_b32_b16 s1, s9, s10
+; GFX11-NEXT:    s_waitcnt vmcnt(37)
+; GFX11-NEXT:    v_add_nc_u32_e32 v0, 3, v58
+; GFX11-NEXT:    s_addk_i32 s5, 0x300
+; GFX11-NEXT:    s_addk_i32 s6, 0x300
+; GFX11-NEXT:    v_add_nc_u32_e32 v1, 3, v57
+; GFX11-NEXT:    s_pack_ll_b32_b16 s3, s5, s6
+; GFX11-NEXT:    s_waitcnt vmcnt(35)
+; GFX11-NEXT:    v_add_nc_u32_e32 v2, 3, v47
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX11-NEXT:    s_addk_i32 s7, 0x300
+; GFX11-NEXT:    s_addk_i32 s8, 0x300
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GFX11-NEXT:    v_or_b32_e32 v0, v92, v0
+; GFX11-NEXT:    s_pack_ll_b32_b16 s2, s7, s8
+; GFX11-NEXT:    v_add_nc_u32_e32 v3, 3, v46
+; GFX11-NEXT:    v_or_b32_e32 v1, v93, v1
+; GFX11-NEXT:    v_or_b32_e32 v2, v91, v2
+; GFX11-NEXT:    s_waitcnt vmcnt(33)
+; GFX11-NEXT:    v_add_nc_u32_e32 v4, 3, v43
+; GFX11-NEXT:    v_add_nc_u32_e32 v27, 0x300, v0
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v3
+; GFX11-NEXT:    v_add_nc_u32_e32 v0, 3, v40
+; GFX11-NEXT:    v_add_nc_u32_e32 v31, 0x300, v1
+; GFX11-NEXT:    v_add_nc_u32_e32 v28, 0x300, v2
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v4
+; GFX11-NEXT:    v_or_b32_e32 v3, v90, v3
+; GFX11-NEXT:    s_waitcnt vmcnt(31)
+; GFX11-NEXT:    v_add_nc_u32_e32 v2, 3, v183
+; GFX11-NEXT:    v_add_nc_u32_e32 v4, 3, v182
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX11-NEXT:    v_or_b32_e32 v1, v89, v1
+; GFX11-NEXT:    v_add_nc_u32_e32 v30, 0x300, v3
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v4
+; GFX11-NEXT:    s_waitcnt vmcnt(29)
+; GFX11-NEXT:    v_add_nc_u32_e32 v4, 3, v181
+; GFX11-NEXT:    v_or_b32_e32 v0, v88, v0
+; GFX11-NEXT:    v_add_nc_u32_e32 v181, 0x300, v1
+; GFX11-NEXT:    v_or_b32_e32 v1, v78, v2
+; GFX11-NEXT:    v_or_b32_e32 v2, v79, v3
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v4
+; GFX11-NEXT:    v_add_nc_u32_e32 v29, 0x300, v0
+; GFX11-NEXT:    v_add_nc_u32_e32 v0, 3, v178
+; GFX11-NEXT:    v_add_nc_u32_e32 v182, 0x300, v1
+; GFX11-NEXT:    v_add_nc_u32_e32 v178, 0x300, v2
+; GFX11-NEXT:    v_or_b32_e32 v1, v77, v3
+; GFX11-NEXT:    s_waitcnt vmcnt(27)
+; GFX11-NEXT:    v_add_nc_u32_e32 v2, 3, v176
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX11-NEXT:    v_add_nc_u32_e32 v3, 3, v164
+; GFX11-NEXT:    s_waitcnt vmcnt(25)
+; GFX11-NEXT:    v_add_nc_u32_e32 v4, 3, v163
+; GFX11-NEXT:    v_add_nc_u32_e32 v163, 0x300, v1
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v2
+; GFX11-NEXT:    v_or_b32_e32 v0, v76, v0
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v3
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v4
+; GFX11-NEXT:    v_add_nc_u32_e32 v4, 3, v160
+; GFX11-NEXT:    v_or_b32_e32 v1, v75, v1
+; GFX11-NEXT:    v_add_nc_u32_e32 v160, 0x300, v0
+; GFX11-NEXT:    v_or_b32_e32 v0, v74, v2
+; GFX11-NEXT:    v_or_b32_e32 v2, v73, v3
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v4
+; GFX11-NEXT:    v_add_nc_u32_e32 v22, 0x300, v1
+; GFX11-NEXT:    s_waitcnt vmcnt(23)
+; GFX11-NEXT:    v_add_nc_u32_e32 v1, 3, v150
+; GFX11-NEXT:    v_add_nc_u32_e32 v26, 0x300, v0
+; GFX11-NEXT:    v_add_nc_u32_e32 v23, 0x300, v2
+; GFX11-NEXT:    v_or_b32_e32 v0, v72, v3
+; GFX11-NEXT:    v_add_nc_u32_e32 v2, 3, v146
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX11-NEXT:    s_waitcnt vmcnt(21)
+; GFX11-NEXT:    v_add_nc_u32_e32 v3, 3, v145
+; GFX11-NEXT:    v_add_nc_u32_e32 v4, 3, v135
+; GFX11-NEXT:    v_add_nc_u32_e32 v25, 0x300, v0
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v2
+; GFX11-NEXT:    v_or_b32_e32 v1, v63, v1
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v3
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v4
+; GFX11-NEXT:    s_waitcnt vmcnt(19)
+; GFX11-NEXT:    v_add_nc_u32_e32 v4, 3, v131
+; GFX11-NEXT:    v_or_b32_e32 v0, v62, v0
+; GFX11-NEXT:    v_add_nc_u32_e32 v131, 0x300, v1
+; GFX11-NEXT:    v_or_b32_e32 v1, v60, v2
+; GFX11-NEXT:    v_or_b32_e32 v2, v61, v3
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v4
+; GFX11-NEXT:    v_add_nc_u32_e32 v24, 0x300, v0
+; GFX11-NEXT:    v_add_nc_u32_e32 v0, 3, v118
+; GFX11-NEXT:    v_add_nc_u32_e32 v135, 0x300, v1
+; GFX11-NEXT:    v_add_nc_u32_e32 v118, 0x300, v2
+; GFX11-NEXT:    v_or_b32_e32 v1, v59, v3
+; GFX11-NEXT:    s_waitcnt vmcnt(17)
+; GFX11-NEXT:    v_add_nc_u32_e32 v2, 3, v179
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX11-NEXT:    v_add_nc_u32_e32 v3, 3, v115
+; GFX11-NEXT:    s_waitcnt vmcnt(15)
+; GFX11-NEXT:    v_add_nc_u32_e32 v4, 3, v165
+; GFX11-NEXT:    v_add_nc_u32_e32 v115, 0x300, v1
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v2
+; GFX11-NEXT:    v_or_b32_e32 v0, v56, v0
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v3
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v4
+; GFX11-NEXT:    v_add_nc_u32_e32 v4, 3, v162
+; GFX11-NEXT:    v_or_b32_e32 v1, v45, v1
+; GFX11-NEXT:    v_add_nc_u32_e32 v145, 0x300, v0
+; GFX11-NEXT:    v_or_b32_e32 v0, v44, v2
+; GFX11-NEXT:    v_or_b32_e32 v2, v42, v3
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v4
+; GFX11-NEXT:    v_add_nc_u32_e32 v17, 0x300, v1
+; GFX11-NEXT:    s_waitcnt vmcnt(13)
+; GFX11-NEXT:    v_add_nc_u32_e32 v1, 3, v151
+; GFX11-NEXT:    v_add_nc_u32_e32 v21, 0x300, v0
+; GFX11-NEXT:    v_add_nc_u32_e32 v18, 0x300, v2
+; GFX11-NEXT:    v_or_b32_e32 v0, v41, v3
+; GFX11-NEXT:    v_add_nc_u32_e32 v2, 3, v149
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX11-NEXT:    s_waitcnt vmcnt(11)
+; GFX11-NEXT:    v_add_nc_u32_e32 v3, 3, v148
+; GFX11-NEXT:    v_add_nc_u32_e32 v4, 3, v144
+; GFX11-NEXT:    v_add_nc_u32_e32 v20, 0x300, v0
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v2
+; GFX11-NEXT:    v_or_b32_e32 v1, v180, v1
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v3
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v4
+; GFX11-NEXT:    s_waitcnt vmcnt(9)
+; GFX11-NEXT:    v_add_nc_u32_e32 v4, 3, v133
+; GFX11-NEXT:    v_or_b32_e32 v0, v177, v0
+; GFX11-NEXT:    v_add_nc_u32_e32 v19, 0x300, v1
+; GFX11-NEXT:    v_or_b32_e32 v1, v166, v2
+; GFX11-NEXT:    v_or_b32_e32 v2, v167, v3
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v4
+; GFX11-NEXT:    v_add_nc_u32_e32 v133, 0x300, v0
+; GFX11-NEXT:    v_add_nc_u32_e32 v0, 3, v129
+; GFX11-NEXT:    v_add_nc_u32_e32 v129, 0x300, v1
+; GFX11-NEXT:    v_add_nc_u32_e32 v144, 0x300, v2
+; GFX11-NEXT:    v_or_b32_e32 v1, v161, v3
+; GFX11-NEXT:    s_waitcnt vmcnt(7)
+; GFX11-NEXT:    v_add_nc_u32_e32 v2, 3, v119
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX11-NEXT:    v_add_nc_u32_e32 v3, 3, v117
+; GFX11-NEXT:    s_waitcnt vmcnt(5)
+; GFX11-NEXT:    v_add_nc_u32_e32 v4, 3, v116
+; GFX11-NEXT:    v_add_nc_u32_e32 v116, 0x300, v1
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v2
+; GFX11-NEXT:    v_or_b32_e32 v0, v147, v0
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v3
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v4
+; GFX11-NEXT:    v_add_nc_u32_e32 v4, 3, v114
+; GFX11-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-NEXT:    v_add_nc_u32_e32 v5, 3, v99
+; GFX11-NEXT:    v_add_nc_u32_e32 v114, 0x300, v0
+; GFX11-NEXT:    v_or_b32_e32 v0, v132, v2
+; GFX11-NEXT:    v_or_b32_e32 v2, v130, v3
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v4
+; GFX11-NEXT:    v_add_nc_u32_e32 v4, 3, v103
+; GFX11-NEXT:    v_add_nc_u32_e32 v6, 3, v98
+; GFX11-NEXT:    v_and_b32_e32 v5, 0xff, v5
+; GFX11-NEXT:    v_add_nc_u32_e32 v7, 3, v54
+; GFX11-NEXT:    v_add_nc_u32_e32 v8, 3, v53
+; GFX11-NEXT:    v_and_b32_e32 v4, 0xff, v4
+; GFX11-NEXT:    v_and_b32_e32 v6, 0xff, v6
+; GFX11-NEXT:    v_add_nc_u32_e32 v9, 3, v39
+; GFX11-NEXT:    v_add_nc_u32_e32 v33, 3, v33
+; GFX11-NEXT:    v_add_nc_u32_e32 v16, 0x300, v0
+; GFX11-NEXT:    v_or_b32_e32 v4, v113, v4
+; GFX11-NEXT:    v_add_nc_u32_e32 v0, 0x300, v2
+; GFX11-NEXT:    v_or_b32_e32 v2, v128, v3
+; GFX11-NEXT:    v_add_nc_u32_e32 v3, 3, v100
+; GFX11-NEXT:    s_addk_i32 s4, 0x300
+; GFX11-NEXT:    v_add_nc_u32_e32 v11, 0x300, v4
+; GFX11-NEXT:    v_or_b32_e32 v4, v101, v5
+; GFX11-NEXT:    v_or_b32_e32 v5, v102, v6
+; GFX11-NEXT:    v_and_b32_e32 v6, 0xff, v7
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_add_nc_u32_e32 v7, 3, v96
+; GFX11-NEXT:    v_or_b32_e32 v1, v134, v1
+; GFX11-NEXT:    v_add_nc_u32_e32 v12, 0x300, v4
+; GFX11-NEXT:    v_add_nc_u32_e32 v13, 0x300, v5
+; GFX11-NEXT:    v_or_b32_e32 v4, v97, v6
+; GFX11-NEXT:    v_add_nc_u32_e32 v5, 3, v55
+; GFX11-NEXT:    v_and_b32_e32 v6, 0xff, v7
+; GFX11-NEXT:    v_add_nc_u32_e32 v7, 3, v52
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v3
+; GFX11-NEXT:    v_add_nc_u32_e32 v14, 0x300, v4
+; GFX11-NEXT:    v_and_b32_e32 v4, 0xff, v5
+; GFX11-NEXT:    v_or_b32_e32 v5, v87, v6
+; GFX11-NEXT:    v_and_b32_e32 v6, 0xff, v7
+; GFX11-NEXT:    v_and_b32_e32 v7, 0xff, v8
+; GFX11-NEXT:    v_add_nc_u32_e32 v8, 3, v51
+; GFX11-NEXT:    v_or_b32_e32 v4, v86, v4
+; GFX11-NEXT:    v_add_nc_u32_e32 v15, 0x300, v5
+; GFX11-NEXT:    v_or_b32_e32 v5, v85, v6
+; GFX11-NEXT:    v_or_b32_e32 v6, v84, v7
+; GFX11-NEXT:    v_and_b32_e32 v7, 0xff, v8
+; GFX11-NEXT:    v_add_nc_u32_e32 v51, 0x300, v4
+; GFX11-NEXT:    v_add_nc_u32_e32 v4, 3, v50
+; GFX11-NEXT:    v_add_nc_u32_e32 v50, 0x300, v5
+; GFX11-NEXT:    v_add_nc_u32_e32 v8, 3, v49
+; GFX11-NEXT:    v_or_b32_e32 v5, v83, v7
+; GFX11-NEXT:    v_add_nc_u32_e32 v7, 3, v48
+; GFX11-NEXT:    v_and_b32_e32 v4, 0xff, v4
+; GFX11-NEXT:    v_add_nc_u32_e32 v6, 0x300, v6
+; GFX11-NEXT:    v_add_nc_u32_e32 v1, 0x300, v1
+; GFX11-NEXT:    v_add_nc_u32_e32 v10, 0x300, v5
+; GFX11-NEXT:    v_and_b32_e32 v5, 0xff, v7
+; GFX11-NEXT:    v_or_b32_e32 v4, v82, v4
+; GFX11-NEXT:    v_and_b32_e32 v7, 0xff, v8
+; GFX11-NEXT:    v_and_b32_e32 v8, 0xff, v9
+; GFX11-NEXT:    v_add_nc_u32_e32 v9, 3, v38
+; GFX11-NEXT:    v_or_b32_e32 v5, v81, v5
+; GFX11-NEXT:    v_add_nc_u32_e32 v38, 0x300, v4
+; GFX11-NEXT:    v_or_b32_e32 v4, v71, v7
+; GFX11-NEXT:    v_or_b32_e32 v7, v80, v8
+; GFX11-NEXT:    v_and_b32_e32 v8, 0xff, v9
+; GFX11-NEXT:    v_add_nc_u32_e32 v9, 0x300, v5
+; GFX11-NEXT:    v_add_nc_u32_e32 v5, 3, v37
+; GFX11-NEXT:    v_add_nc_u32_e32 v37, 0x300, v4
+; GFX11-NEXT:    v_add_nc_u32_e32 v39, 0x300, v7
+; GFX11-NEXT:    v_or_b32_e32 v4, v70, v8
+; GFX11-NEXT:    v_add_nc_u32_e32 v7, 3, v36
+; GFX11-NEXT:    v_and_b32_e32 v5, 0xff, v5
+; GFX11-NEXT:    v_add_nc_u32_e32 v8, 3, v34
+; GFX11-NEXT:    v_add_nc_u32_e32 v34, 3, v35
+; GFX11-NEXT:    v_add_nc_u32_e32 v35, 0x300, v4
+; GFX11-NEXT:    v_and_b32_e32 v4, 0xff, v7
+; GFX11-NEXT:    v_or_b32_e32 v5, v69, v5
+; GFX11-NEXT:    v_and_b32_e32 v7, 0xff, v8
+; GFX11-NEXT:    v_and_b32_e32 v8, 0xff, v34
+; GFX11-NEXT:    v_or_b32_e32 v3, v112, v3
+; GFX11-NEXT:    v_or_b32_e32 v4, v68, v4
+; GFX11-NEXT:    v_add_nc_u32_e32 v34, 0x300, v5
+; GFX11-NEXT:    v_or_b32_e32 v5, v67, v7
+; GFX11-NEXT:    v_or_b32_e32 v7, v66, v8
+; GFX11-NEXT:    v_and_b32_e32 v8, 0xff, v33
+; GFX11-NEXT:    v_add_nc_u32_e32 v33, 0x300, v4
+; GFX11-NEXT:    v_add_nc_u32_e32 v4, 3, v32
+; GFX11-NEXT:    v_add_nc_u32_e32 v32, 0x300, v5
+; GFX11-NEXT:    v_add_nc_u32_e32 v5, 0x300, v7
+; GFX11-NEXT:    v_or_b32_e32 v7, v65, v8
+; GFX11-NEXT:    v_and_b32_e64 v8, 0xffff, s4
+; GFX11-NEXT:    v_and_b32_e32 v4, 0xff, v4
+; GFX11-NEXT:    v_and_b32_e32 v36, 0xffff, v6
+; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff, v5
+; GFX11-NEXT:    v_add_nc_u32_e32 v7, 0x300, v7
+; GFX11-NEXT:    v_add_nc_u32_e32 v3, 0x300, v3
+; GFX11-NEXT:    v_or_b32_e32 v4, v64, v4
+; GFX11-NEXT:    v_and_b32_e32 v14, 0xffff, v14
+; GFX11-NEXT:    v_lshl_or_b32 v10, v10, 16, v36
+; GFX11-NEXT:    v_lshl_or_b32 v5, v7, 16, v5
+; GFX11-NEXT:    v_and_b32_e32 v7, 0xffff, v33
+; GFX11-NEXT:    v_add_nc_u32_e32 v4, 0x300, v4
+; GFX11-NEXT:    v_and_b32_e32 v33, 0xffff, v37
+; GFX11-NEXT:    v_and_b32_e32 v36, 0xffff, v22
+; GFX11-NEXT:    v_add_nc_u32_e32 v2, 0x300, v2
+; GFX11-NEXT:    v_lshl_or_b32 v6, v32, 16, v7
+; GFX11-NEXT:    v_lshl_or_b32 v4, v4, 16, v8
+; GFX11-NEXT:    v_and_b32_e32 v8, 0xffff, v35
+; GFX11-NEXT:    v_and_b32_e32 v32, 0xffff, v51
+; GFX11-NEXT:    v_and_b32_e32 v35, 0xffff, v38
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    v_and_b32_e32 v19, 0xffff, v19
+; GFX11-NEXT:    v_lshl_or_b32 v7, v34, 16, v8
+; GFX11-NEXT:    v_lshl_or_b32 v8, v39, 16, v33
+; GFX11-NEXT:    v_and_b32_e32 v33, 0xffff, v12
+; GFX11-NEXT:    v_and_b32_e32 v34, 0xffff, v11
+; GFX11-NEXT:    v_lshl_or_b32 v11, v50, 16, v32
+; GFX11-NEXT:    v_and_b32_e32 v32, 0xffff, v1
+; GFX11-NEXT:    v_lshl_or_b32 v9, v9, 16, v35
+; GFX11-NEXT:    v_lshl_or_b32 v12, v15, 16, v14
+; GFX11-NEXT:    v_lshl_or_b32 v13, v13, 16, v33
+; GFX11-NEXT:    v_lshl_or_b32 v14, v3, 16, v34
+; GFX11-NEXT:    v_lshl_or_b32 v16, v16, 16, v32
+; GFX11-NEXT:    v_and_b32_e32 v32, 0xffff, v116
+; GFX11-NEXT:    v_and_b32_e32 v33, 0xffff, v129
+; GFX11-NEXT:    v_and_b32_e32 v34, 0xffff, v18
+; GFX11-NEXT:    v_and_b32_e32 v35, 0xffff, v17
+; GFX11-NEXT:    v_lshl_or_b32 v26, v26, 16, v36
+; GFX11-NEXT:    v_lshl_or_b32 v17, v114, 16, v32
+; GFX11-NEXT:    v_lshl_or_b32 v18, v144, 16, v33
+; GFX11-NEXT:    v_lshl_or_b32 v20, v20, 16, v34
+; GFX11-NEXT:    v_lshl_or_b32 v21, v21, 16, v35
+; GFX11-NEXT:    v_and_b32_e32 v32, 0xffff, v115
+; GFX11-NEXT:    v_and_b32_e32 v33, 0xffff, v135
+; GFX11-NEXT:    v_and_b32_e32 v34, 0xffff, v131
+; GFX11-NEXT:    v_and_b32_e32 v35, 0xffff, v23
+; GFX11-NEXT:    v_and_b32_e32 v36, 0xffff, v27
+; GFX11-NEXT:    v_lshl_or_b32 v22, v145, 16, v32
+; GFX11-NEXT:    v_lshl_or_b32 v23, v118, 16, v33
+; GFX11-NEXT:    v_lshl_or_b32 v24, v24, 16, v34
+; GFX11-NEXT:    v_lshl_or_b32 v25, v25, 16, v35
+; GFX11-NEXT:    v_and_b32_e32 v32, 0xffff, v163
+; GFX11-NEXT:    v_and_b32_e32 v33, 0xffff, v182
+; GFX11-NEXT:    v_and_b32_e32 v34, 0xffff, v181
+; GFX11-NEXT:    v_and_b32_e32 v35, 0xffff, v28
+; GFX11-NEXT:    v_lshl_or_b32 v15, v2, 16, v0
+; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-NEXT:    v_lshl_or_b32 v19, v133, 16, v19
+; GFX11-NEXT:    v_lshl_or_b32 v27, v160, 16, v32
+; GFX11-NEXT:    v_lshl_or_b32 v28, v178, 16, v33
+; GFX11-NEXT:    v_lshl_or_b32 v29, v29, 16, v34
+; GFX11-NEXT:    v_lshl_or_b32 v30, v30, 16, v35
+; GFX11-NEXT:    v_lshl_or_b32 v31, v31, 16, v36
+; GFX11-NEXT:  .LBB93_3: ; %end
+; GFX11-NEXT:    s_clause 0x1e
+; GFX11-NEXT:    scratch_load_b32 v94, off, s32 offset:320
+; GFX11-NEXT:    scratch_load_b32 v93, off, s32 offset:324
+; GFX11-NEXT:    scratch_load_b32 v92, off, s32 offset:328
+; GFX11-NEXT:    scratch_load_b32 v91, off, s32 offset:332
+; GFX11-NEXT:    scratch_load_b32 v90, off, s32 offset:336
+; GFX11-NEXT:    scratch_load_b32 v89, off, s32 offset:340
+; GFX11-NEXT:    scratch_load_b32 v88, off, s32 offset:344
+; GFX11-NEXT:    scratch_load_b32 v79, off, s32 offset:348
+; GFX11-NEXT:    scratch_load_b32 v78, off, s32 offset:352
+; GFX11-NEXT:    scratch_load_b32 v77, off, s32 offset:356
+; GFX11-NEXT:    scratch_load_b32 v76, off, s32 offset:360
+; GFX11-NEXT:    scratch_load_b32 v75, off, s32 offset:364
+; GFX11-NEXT:    scratch_load_b32 v74, off, s32 offset:368
+; GFX11-NEXT:    scratch_load_b32 v73, off, s32 offset:372
+; GFX11-NEXT:    scratch_load_b32 v72, off, s32 offset:376
+; GFX11-NEXT:    scratch_load_b32 v63, off, s32 offset:380
+; GFX11-NEXT:    scratch_load_b32 v62, off, s32 offset:384
+; GFX11-NEXT:    scratch_load_b32 v61, off, s32 offset:388
+; GFX11-NEXT:    scratch_load_b32 v60, off, s32 offset:392
+; GFX11-NEXT:    scratch_load_b32 v59, off, s32 offset:396
+; GFX11-NEXT:    scratch_load_b32 v58, off, s32 offset:400
+; GFX11-NEXT:    scratch_load_b32 v57, off, s32 offset:404
+; GFX11-NEXT:    scratch_load_b32 v56, off, s32 offset:408
+; GFX11-NEXT:    scratch_load_b32 v47, off, s32 offset:412
+; GFX11-NEXT:    scratch_load_b32 v46, off, s32 offset:416
+; GFX11-NEXT:    scratch_load_b32 v45, off, s32 offset:420
+; GFX11-NEXT:    scratch_load_b32 v44, off, s32 offset:424
+; GFX11-NEXT:    scratch_load_b32 v43, off, s32 offset:428
+; GFX11-NEXT:    scratch_load_b32 v42, off, s32 offset:432
+; GFX11-NEXT:    scratch_load_b32 v41, off, s32 offset:436
+; GFX11-NEXT:    scratch_load_b32 v40, off, s32 offset:440
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-NEXT:  .LBB93_4:
+; GFX11-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+; GFX11-NEXT:    ; implicit-def: $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+; GFX11-NEXT:    s_branch .LBB93_2
   %cmp = icmp eq i32 %b, 0
   br i1 %cmp, label %cmp.true, label %cmp.false
 
@@ -196187,766 +191179,814 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) {
 ; GFX11-TRUE16:       ; %bb.0:
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-TRUE16-NEXT:    s_clause 0x1f
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v31, off, s32 offset:384
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v39, off, s32 offset:380
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v31, off, s32 offset:376
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v50, off, s32 offset:372
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v150, off, s32 offset:368
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v39, off, s32 offset:364
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v150, off, s32 offset:360
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v50, off, s32 offset:356
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v147, off, s32 offset:352
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v37, off, s32 offset:348
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v149, off, s32 offset:344
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v49, off, s32 offset:340
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v149, off, s32 offset:336
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v38, off, s32 offset:332
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v148, off, s32 offset:328
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v48, off, s32 offset:324
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v145, off, s32 offset:320
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v34, off, s32 offset:316
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v148, off, s32 offset:312
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v48, off, s32 offset:308
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v147, off, s32 offset:304
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v36, off, s32 offset:300
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v146, off, s32 offset:296
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v38, off, s32 offset:292
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v146, off, s32 offset:288
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v36, off, s32 offset:284
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v145, off, s32 offset:280
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v37, off, s32 offset:276
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v134, off, s32 offset:272
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v33, off, s32 offset:268
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v144, off, s32 offset:264
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v35, off, s32 offset:260
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v40, s32 offset:520
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v41, s32 offset:516
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v42, s32 offset:512
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v43, s32 offset:508
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v44, s32 offset:504
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v45, s32 offset:500
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v46, s32 offset:496
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v47, s32 offset:492
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v56, s32 offset:488
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v57, s32 offset:484
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v58, s32 offset:480
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v59, s32 offset:476
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v60, s32 offset:472
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v61, s32 offset:468
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v62, s32 offset:464
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v63, s32 offset:460
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v72, s32 offset:456
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v73, s32 offset:452
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v74, s32 offset:448
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v75, s32 offset:444
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v76, s32 offset:440
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v77, s32 offset:436
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v78, s32 offset:432
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v79, s32 offset:428
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v88, s32 offset:424
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v89, s32 offset:420
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v90, s32 offset:416
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v91, s32 offset:412
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v92, s32 offset:408
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v93, s32 offset:404
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v94, s32 offset:400
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v95, s32 offset:396
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v104, s32 offset:392 ; 4-byte Folded Spill
 ; GFX11-TRUE16-NEXT:    s_clause 0x1f
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v144, off, s32 offset:256
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v33, off, s32 offset:252
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v135, off, s32 offset:248
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v35, off, s32 offset:244
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v132, off, s32 offset:240
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v32, off, s32 offset:236
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v135, off, s32 offset:232
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v34, off, s32 offset:228
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v134, off, s32 offset:224
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v32, off, s32 offset:220
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v133, off, s32 offset:216
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v160, off, s32 offset:388
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v101, off, s32
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v103, off, s32 offset:8
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v113, off, s32 offset:16
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v113, off, s32 offset:24
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v114, off, s32 offset:32
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v114, off, s32 offset:40
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v115, off, s32 offset:48
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v116, off, s32 offset:56
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v116, off, s32 offset:64
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v117, off, s32 offset:72
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v117, off, s32 offset:80
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v118, off, s32 offset:88
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v118, off, s32 offset:96
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v119, off, s32 offset:104
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v119, off, s32 offset:112
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v128, off, s32 offset:120
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v128, off, s32 offset:128
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v129, off, s32 offset:136
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v129, off, s32 offset:144
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v130, off, s32 offset:152
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v31, off, s32 offset:384
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v114, off, s32 offset:380
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v103, off, s32 offset:376
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v128, off, s32 offset:372
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v56, off, s32 offset:368
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v117, off, s32 offset:364
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v102, off, s32 offset:360
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v133, off, s32 offset:356
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v57, off, s32 offset:352
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v112, off, s32 offset:348
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v58, off, s32 offset:344
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v132, off, s32 offset:340
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v101, off, s32 offset:336
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v118, off, s32 offset:332
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v59, off, s32 offset:328
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v134, off, s32 offset:324
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v97, off, s32 offset:320
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v113, off, s32 offset:316
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v100, off, s32 offset:312
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v145, off, s32 offset:308
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v99, off, s32 offset:304
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v119, off, s32 offset:300
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v60, off, s32 offset:296
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v135, off, s32 offset:292
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v98, off, s32 offset:288
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v129, off, s32 offset:284
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v61, off, s32 offset:280
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v146, off, s32 offset:276
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v62, off, s32 offset:272
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v115, off, s32 offset:268
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v63, off, s32 offset:264
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v144, off, s32 offset:260
 ; GFX11-TRUE16-NEXT:    s_clause 0x1f
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v130, off, s32 offset:160
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v131, off, s32 offset:168
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v131, off, s32 offset:176
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v132, off, s32 offset:184
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v133, off, s32 offset:192
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v151, off, s32 offset:200
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v151, off, s32 offset:208
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v100, off, s32 offset:212
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v98, off, s32 offset:204
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v100, off, s32 offset:196
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v85, off, s32 offset:188
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v97, off, s32 offset:180
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v86, off, s32 offset:172
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v96, off, s32 offset:164
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v71, off, s32 offset:156
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v87, off, s32 offset:148
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v80, off, s32 offset:140
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v84, off, s32 offset:132
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v86, off, s32 offset:124
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v96, off, s32 offset:116
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v80, off, s32 offset:108
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v85, off, s32 offset:100
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v82, off, s32 offset:92
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v84, off, s32 offset:84
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v69, off, s32 offset:76
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v83, off, s32 offset:68
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v69, off, s32 offset:60
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v71, off, s32 offset:52
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v68, off, s32 offset:44
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v70, off, s32 offset:36
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v66, off, s32 offset:28
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v68, off, s32 offset:20
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v96, off, s32 offset:256
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v130, off, s32 offset:252
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v72, off, s32 offset:248
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v147, off, s32 offset:244
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v84, off, s32 offset:240
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v116, off, s32 offset:236
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v87, off, s32 offset:232
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v148, off, s32 offset:228
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v86, off, s32 offset:224
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v131, off, s32 offset:220
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v73, off, s32 offset:216
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v65, off, s32 offset:388
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v53, off, s32
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v66, off, s32 offset:8
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v67, off, s32 offset:16
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v68, off, s32 offset:24
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v69, off, s32 offset:32
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v70, off, s32 offset:40
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v71, off, s32 offset:48
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v80, off, s32 offset:56
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v81, off, s32 offset:64
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v82, off, s32 offset:72
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v83, off, s32 offset:80
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v85, off, s32 offset:88
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v74, off, s32 offset:96
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v75, off, s32 offset:104
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v76, off, s32 offset:112
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v77, off, s32 offset:120
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v78, off, s32 offset:128
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v79, off, s32 offset:136
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v88, off, s32 offset:144
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v89, off, s32 offset:152
+; GFX11-TRUE16-NEXT:    s_clause 0x1f
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v90, off, s32 offset:160
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v91, off, s32 offset:168
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v92, off, s32 offset:176
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v93, off, s32 offset:184
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v94, off, s32 offset:192
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v95, off, s32 offset:200
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v104, off, s32 offset:208
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v179, off, s32 offset:212
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v163, off, s32 offset:204
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v183, off, s32 offset:196
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v150, off, s32 offset:188
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v167, off, s32 offset:180
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v161, off, s32 offset:172
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v177, off, s32 offset:164
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v149, off, s32 offset:156
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v180, off, s32 offset:148
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v151, off, s32 offset:140
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v164, off, s32 offset:132
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v41, off, s32 offset:124
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v47, off, s32 offset:116
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v165, off, s32 offset:108
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v43, off, s32 offset:100
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v181, off, s32 offset:92
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v45, off, s32 offset:84
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v162, off, s32 offset:76
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v46, off, s32 offset:68
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v176, off, s32 offset:60
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v42, off, s32 offset:52
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v178, off, s32 offset:44
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v44, off, s32 offset:36
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v160, off, s32 offset:28
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v182, off, s32 offset:20
 ; GFX11-TRUE16-NEXT:    s_clause 0x1
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v66, off, s32 offset:12
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v67, off, s32 offset:4
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v54.l, v30.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v67.l, v28.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v64.h, v26.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v65.h, v24.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v64.l, v22.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v65.l, v20.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v52.l, v18.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v55.h, v16.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v53.h, v14.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v55.l, v12.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v49.h, v10.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v54.h, v8.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v51.l, v6.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v53.l, v4.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v51.h, v2.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v52.h, v0.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v81.l, 8, v1.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v81.h, 8, v3.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v82.h, 8, v5.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v83.l, 8, v7.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v97.h, 8, v9.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v70.l, 8, v11.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v98.h, 8, v13.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v99.l, 8, v15.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v99.h, 8, v17.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v87.l, 8, v19.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v101.h, 8, v21.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v102.l, 8, v23.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v102.h, 8, v25.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v103.l, 8, v27.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v112.l, 8, v29.l
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(62)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v150.h, 8, v150.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v150.l, 8, v150.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v147.h, 8, v147.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v149.h, 8, v149.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v149.l, 8, v149.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v148.h, 8, v148.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v145.l, 8, v145.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v148.l, 8, v148.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v147.l, 8, v147.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v146.h, 8, v146.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v146.l, 8, v146.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v145.h, 8, v145.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v134.h, 8, v134.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v144.h, 8, v144.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v144.l, 8, v144.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v135.h, 8, v135.h
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(61)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v132.l, 8, v132.l
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(59)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v135.l, 8, v135.l
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(57)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v134.l, 8, v134.l
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(55)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v133.h, 8, v133.h
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(54)
-; GFX11-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v160
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v166, off, s32 offset:12
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v40, off, s32 offset:4
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v35.h, v30.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v39.h, v28.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v38.l, v26.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v39.l, v24.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v37.h, v22.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v38.h, v20.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v33.h, v18.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v37.l, v16.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v35.l, v14.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v36.h, v12.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v32.l, v10.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v36.l, v8.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v32.h, v6.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v34.h, v4.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v33.l, v2.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v34.l, v0.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v48.h, 8, v1.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v49.l, 8, v3.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v49.h, 8, v5.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v50.l, 8, v7.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v51.l, 8, v9.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v48.l, 8, v11.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v51.h, 8, v13.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v52.l, 8, v15.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v52.h, 8, v17.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v50.h, 8, v19.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(53)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v53.h, 8, v21.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v54.l, 8, v23.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v54.h, 8, v25.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v55.l, 8, v27.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v64.l, 8, v29.l
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v103.h, 8, v31.l
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v103.l, 8, v103.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v102.h, 8, v56.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v102.l, 8, v102.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v99.h, 8, v57.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v101.h, 8, v58.l
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v101.l, 8, v101.l
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(52)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v112.h, 8, v103.h
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(51)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v113.l, 8, v113.l
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(50)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v113.h, 8, v113.h
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(49)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v103.h, 8, v114.l
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(48)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v115.l, 8, v114.h
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(47)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v115.h, 8, v115.h
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(46)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v116.l, 8, v116.l
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(45)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v116.h, 8, v116.h
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(44)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v117.l, 8, v117.l
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(43)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v114.l, 8, v117.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v100.h, 8, v59.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v97.l, 8, v97.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v100.l, 8, v100.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v99.l, 8, v99.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v98.h, 8, v60.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v98.l, 8, v98.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v97.h, 8, v61.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v86.h, 8, v62.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v96.h, 8, v63.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v96.l, 8, v96.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v87.h, 8, v72.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v84.l, 8, v84.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v87.l, 8, v87.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v86.l, 8, v86.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(42)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v118.l, 8, v118.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v85.h, 8, v73.l
+; GFX11-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v65
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v53.l, 8, v53.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v64.h, 8, v66.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v65.l, 8, v67.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v65.h, 8, v68.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v55.h, 8, v69.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v67.l, 8, v70.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v67.h, 8, v71.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v68.l, 8, v80.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v68.h, 8, v81.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v69.l, 8, v82.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v66.l, 8, v83.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v70.l, 8, v85.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(41)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v118.h, 8, v118.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v70.h, 8, v74.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(40)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v119.l, 8, v119.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v71.l, 8, v75.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(39)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v114.h, 8, v119.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v66.h, 8, v76.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(38)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v119.h, 8, v128.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v71.h, 8, v77.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(37)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v128.l, 8, v128.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v80.l, 8, v78.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(36)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v128.h, 8, v129.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v80.h, 8, v79.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(35)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v129.l, 8, v129.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v81.l, 8, v88.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(34)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v130.l, 8, v130.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v82.l, 8, v89.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(33)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v117.h, 8, v130.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v69.h, 8, v90.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(32)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v130.h, 8, v131.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v82.h, 8, v91.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(31)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v131.l, 8, v131.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v83.l, 8, v92.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(30)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v131.h, 8, v132.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v83.h, 8, v93.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(29)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v129.h, 8, v133.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v81.h, 8, v94.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(28)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v132.h, 8, v151.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v84.h, 8, v95.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(27)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v133.l, 8, v151.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v151.l, 8, v31.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v151.h, 8, v31.l
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v85.l, 8, v104.l
 ; GFX11-TRUE16-NEXT:    s_and_saveexec_b32 s0, vcc_lo
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-TRUE16-NEXT:    s_xor_b32 s0, exec_lo, s0
-; GFX11-TRUE16-NEXT:    s_cbranch_execnz .LBB96_3
-; GFX11-TRUE16-NEXT:  ; %bb.1: ; %Flow
-; GFX11-TRUE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX11-TRUE16-NEXT:    s_cbranch_execnz .LBB96_4
-; GFX11-TRUE16-NEXT:  .LBB96_2: ; %end
-; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
-; GFX11-TRUE16-NEXT:  .LBB96_3: ; %cmp.false
-; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v52.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v51.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v53.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v51.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v54.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v2.h, 0xff, v49.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v3.l, 0xff, v55.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v3.h, 0xff, v53.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v4.l, 0xff, v55.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v4.h, 0xff, v52.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v5.l, 0xff, v65.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v5.h, 0xff, v64.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v6.l, 0xff, v65.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v6.h, 0xff, v64.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v7.l, 0xff, v67.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v7.h, 0xff, v54.l
+; GFX11-TRUE16-NEXT:    s_cbranch_execz .LBB96_2
+; GFX11-TRUE16-NEXT:  ; %bb.1: ; %cmp.false
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v34.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v33.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v34.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v32.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v36.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.h, 0xff, v32.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v3.l, 0xff, v36.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v3.h, 0xff, v35.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v4.l, 0xff, v37.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v4.h, 0xff, v33.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v5.l, 0xff, v38.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v5.h, 0xff, v37.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v6.l, 0xff, v39.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v6.h, 0xff, v38.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v7.l, 0xff, v39.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v7.h, 0xff, v35.h
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    v_and_b16 v8.l, 0xff, v67.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v8.h, 0xff, v66.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v9.l, 0xff, v68.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v9.h, 0xff, v66.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v10.l, 0xff, v70.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v10.h, 0xff, v68.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v11.l, 0xff, v71.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v11.h, 0xff, v69.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v12.l, 0xff, v83.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v12.h, 0xff, v69.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v13.l, 0xff, v84.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v13.h, 0xff, v82.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v14.l, 0xff, v85.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v14.h, 0xff, v80.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v15.l, 0xff, v96.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v15.h, 0xff, v86.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v16.l, 0xff, v84.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v16.h, 0xff, v80.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v17.l, 0xff, v87.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v17.h, 0xff, v71.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v18.l, 0xff, v96.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v18.h, 0xff, v86.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v19.l, 0xff, v97.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v19.h, 0xff, v85.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v20.l, 0xff, v100.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v20.h, 0xff, v98.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v21.l, 0xff, v100.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v21.h, 0xff, v32.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v22.l, 0xff, v34.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v22.h, 0xff, v32.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v23.l, 0xff, v35.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v23.h, 0xff, v33.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v24.l, 0xff, v35.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v24.h, 0xff, v33.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v25.l, 0xff, v37.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v25.h, 0xff, v36.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v26.l, 0xff, v38.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v26.h, 0xff, v36.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v27.l, 0xff, v48.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v27.h, 0xff, v34.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v28.l, 0xff, v48.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v28.h, 0xff, v38.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v29.l, 0xff, v49.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v29.h, 0xff, v37.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v30.l, 0xff, v50.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v30.h, 0xff, v39.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v31.l, 0xff, v50.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v31.h, 0xff, v39.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v0.l, v81.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v0.h, v81.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v1.l, v82.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, v1.h, v83.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v2.l, v97.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v2.h, v2.h, v70.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v3.l, v3.l, v98.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v3.h, v3.h, v99.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v4.l, v4.l, v99.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v4.h, v4.h, v87.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v5.l, v5.l, v101.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v5.h, v5.h, v102.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v6.l, v6.l, v102.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v6.h, v6.h, v103.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v7.l, v7.l, v112.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v7.h, v7.h, v101.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v8.l, v8.l, v112.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v8.h, v8.h, v113.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v9.l, v9.l, v113.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v9.h, v9.h, v103.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v10.l, v10.l, v115.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v10.h, v10.h, v115.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v11.l, v11.l, v116.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v11.h, v11.h, v116.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v12.l, v12.l, v117.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v12.h, v12.h, v114.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v13.l, v13.l, v118.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v13.h, v13.h, v118.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v14.l, v14.l, v119.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v14.h, v14.h, v114.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v15.l, v15.l, v119.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v15.h, v15.h, v128.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v16.l, v16.l, v128.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v16.h, v16.h, v129.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v17.l, v17.l, v130.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v17.h, v17.h, v117.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v18.l, v18.l, v130.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v18.h, v18.h, v131.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v19.l, v19.l, v131.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v19.h, v19.h, v129.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v20.l, v20.l, v132.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v20.h, v20.h, v133.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v21.l, v21.l, v133.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v21.h, v21.h, v134.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v22.l, v22.l, v135.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v22.h, v22.h, v132.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v23.l, v23.l, v135.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v23.h, v23.h, v144.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v24.l, v24.l, v144.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v24.h, v24.h, v134.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v25.l, v25.l, v145.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v25.h, v25.h, v146.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v26.l, v26.l, v146.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v26.h, v26.h, v147.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v27.l, v27.l, v148.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v27.h, v27.h, v145.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v28.l, v28.l, v148.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v28.h, v28.h, v149.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v29.l, v29.l, v149.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v29.h, v29.h, v147.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v30.l, v30.l, v150.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v30.h, v30.h, v150.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v31.l, v31.l, v151.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v31.h, v31.h, v151.h
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr52_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr51_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr53_lo16
+; GFX11-TRUE16-NEXT:    v_and_b16 v8.l, 0xff, v40.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v8.h, 0xff, v166.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v9.l, 0xff, v182.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v9.h, 0xff, v160.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v10.l, 0xff, v44.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v10.h, 0xff, v178.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v11.l, 0xff, v42.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v11.h, 0xff, v176.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v12.l, 0xff, v46.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v12.h, 0xff, v162.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v13.l, 0xff, v45.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v13.h, 0xff, v181.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v14.l, 0xff, v43.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v14.h, 0xff, v165.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v15.l, 0xff, v47.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v15.h, 0xff, v41.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v16.l, 0xff, v164.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v16.h, 0xff, v151.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v17.l, 0xff, v180.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v17.h, 0xff, v149.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v18.l, 0xff, v177.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v18.h, 0xff, v161.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v19.l, 0xff, v167.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v19.h, 0xff, v150.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v20.l, 0xff, v183.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v20.h, 0xff, v163.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v21.l, 0xff, v179.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v21.h, 0xff, v131.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v22.l, 0xff, v148.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v22.h, 0xff, v116.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v23.l, 0xff, v147.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v23.h, 0xff, v130.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v24.l, 0xff, v144.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v24.h, 0xff, v115.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v25.l, 0xff, v146.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v25.h, 0xff, v129.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v26.l, 0xff, v135.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v26.h, 0xff, v119.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v27.l, 0xff, v145.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v27.h, 0xff, v113.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v28.l, 0xff, v134.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v28.h, 0xff, v118.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v29.l, 0xff, v132.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v29.h, 0xff, v112.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v30.l, 0xff, v133.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v30.h, 0xff, v117.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v31.l, 0xff, v128.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v31.h, 0xff, v114.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v0.l, v48.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v0.h, v49.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v1.l, v49.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, v1.h, v50.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v2.l, v51.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.h, v2.h, v48.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v3.l, v3.l, v51.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v3.h, v3.h, v52.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v4.l, v4.l, v52.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v4.h, v4.h, v50.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v5.l, v5.l, v53.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v5.h, v5.h, v54.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v6.l, v6.l, v54.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v6.h, v6.h, v55.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v7.l, v7.l, v64.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v7.h, v7.h, v53.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v8.l, v8.l, v64.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v8.h, v8.h, v65.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v9.l, v9.l, v65.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v9.h, v9.h, v55.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v10.l, v10.l, v67.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v10.h, v10.h, v67.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v11.l, v11.l, v68.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v11.h, v11.h, v68.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v12.l, v12.l, v69.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v12.h, v12.h, v66.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v13.l, v13.l, v70.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v13.h, v13.h, v70.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v14.l, v14.l, v71.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v14.h, v14.h, v66.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v15.l, v15.l, v71.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v15.h, v15.h, v80.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v16.l, v16.l, v80.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v16.h, v16.h, v81.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v17.l, v17.l, v82.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v17.h, v17.h, v69.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v18.l, v18.l, v82.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v18.h, v18.h, v83.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v19.l, v19.l, v83.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v19.h, v19.h, v81.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v20.l, v20.l, v84.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v20.h, v20.h, v85.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v21.l, v21.l, v85.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v21.h, v21.h, v86.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v22.l, v22.l, v87.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v22.h, v22.h, v84.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v23.l, v23.l, v87.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v23.h, v23.h, v96.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v24.l, v24.l, v96.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v24.h, v24.h, v86.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v25.l, v25.l, v97.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v25.h, v25.h, v98.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v26.l, v26.l, v98.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v26.h, v26.h, v99.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v27.l, v27.l, v100.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v27.h, v27.h, v97.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v28.l, v28.l, v100.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v28.h, v28.h, v101.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v29.l, v29.l, v101.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v29.h, v29.h, v99.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v30.l, v30.l, v102.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v30.h, v30.h, v102.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v31.l, v31.l, v103.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v31.h, v31.h, v103.h
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr34_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr33_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr34_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr32_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr36_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr32_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr36_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr35_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr37_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr33_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr38_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr37_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr39_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr38_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr39_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr35_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr114
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr128
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr117
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr133
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr112
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr132
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr118
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr134
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr113
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr145
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr119
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr135
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr129
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr146
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr115
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr144
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr130
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr147
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr116
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr148
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr131
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr179
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr163
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr183
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr150
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr167
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr161
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr177
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr149
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr180
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr151
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr164
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr41
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr47
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr165
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr43
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr181
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr45
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr162
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr46
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr176
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr42
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr178
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr44
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr160
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr182
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr166
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr40
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr48_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr49_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr49_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr50_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr51_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr48_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr51_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr52_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr52_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr50_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr53_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr54_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr54_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr49_hi16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr55_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr53_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr55_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr52_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr65_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr64_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr65_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr53_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr64_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr65_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr65_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr55_hi16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr67_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr54_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr67_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr66_hi16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr68_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr68_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr69_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr66_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr70_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr70_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr68_hi16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr71_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr66_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr71_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr80_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr80_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr81_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr82_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr69_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr82_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr83_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr83_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr69_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr81_hi16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr84_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr82_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr85_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr80_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr96_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr86_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr85_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr86_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr87_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr84_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr80_hi16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr87_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr71_hi16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr96_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr86_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr97_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr85_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr100_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr98_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr100_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr32_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr34_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr32_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr35_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr33_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr35_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr33_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr37_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr36_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr38_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr36_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr48_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr34_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr48_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr38_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr49_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr37_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr50_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr39_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr50_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr39_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr81_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr81_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr82_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr83_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr96_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr86_hi16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr97_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr70_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr98_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr98_hi16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr99_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr99_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr87_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr100_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr97_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr100_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr101_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr101_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr99_hi16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr102_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr102_hi16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr103_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr112_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr101_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr112_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr113_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr113_hi16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr103_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr115_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr115_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr116_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr116_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr117_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr114_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr118_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr118_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr119_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr114_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr119_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr128_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr128_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr129_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr130_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr117_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr130_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr131_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr131_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr129_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr132_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr133_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr133_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr134_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr135_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr132_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr135_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr144_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr144_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr134_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr145_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr146_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr146_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr147_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr148_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr145_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr148_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr149_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr149_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr147_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr150_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr150_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr151_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr151_hi16
+; GFX11-TRUE16-NEXT:  .LBB96_2: ; %Flow
 ; GFX11-TRUE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX11-TRUE16-NEXT:    s_cbranch_execz .LBB96_2
-; GFX11-TRUE16-NEXT:  .LBB96_4: ; %cmp.true
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v50.h, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, v39.h, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, v50.l, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.h, v39.l, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, v48.l, 3
+; GFX11-TRUE16-NEXT:    s_cbranch_execz .LBB96_4
+; GFX11-TRUE16-NEXT:  ; %bb.3: ; %cmp.true
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v128.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, v114.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, v133.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.h, v117.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, v145.l, 3
 ; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v0.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v1.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v1.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v151.l, v0.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v151.h, v0.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v150.l, v1.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, v150.h, v1.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v148.l, v2.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v103.l, v0.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v103.h, v0.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v102.l, v1.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, v102.h, v1.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v100.l, v2.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v31.l, 0x300, v0.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v31.h, 0x300, v0.h
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v30.l, 0x300, v1.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v30.h, 0x300, v1.h
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v49.l, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, v37.l, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, v48.h, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.h, v38.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v132.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, v112.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, v134.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.h, v118.l, 3
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v27.l, 0x300, v2.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v0.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v1.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v1.h
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, v36.l, 3
-; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v149.h, v0.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v147.h, v0.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v148.h, v1.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, v149.l, v1.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, v129.l, 3
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v101.h, v0.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v99.h, v0.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v100.h, v1.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, v101.l, v1.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v2.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v29.l, 0x300, v0.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v29.h, 0x300, v0.h
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v28.l, 0x300, v1.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v28.h, 0x300, v1.h
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v34.l, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, v38.l, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, v36.h, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.h, v37.h, 3
-; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v146.l, v2.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v113.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, v135.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, v119.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.h, v146.l, 3
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v98.l, v2.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v0.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v1.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v1.h
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v25.h, 0x300, v2.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v145.l, v0.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v146.h, v0.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v147.l, v1.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, v145.h, v1.h
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, v34.h, 3
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v97.l, v0.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v98.h, v0.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v99.l, v1.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, v97.h, v1.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, v148.l, 3
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v27.h, 0x300, v0.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v26.l, 0x300, v0.h
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v26.h, 0x300, v1.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v25.l, 0x300, v1.h
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v35.h, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, v33.l, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, v35.l, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.h, v33.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v144.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, v115.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, v147.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.h, v130.l, 3
 ; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v2.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v0.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v1.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v1.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v135.l, v2.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v144.h, v0.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v134.h, v0.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v135.h, v1.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, v144.l, v1.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v87.l, v2.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v96.h, v0.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v86.h, v0.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v87.h, v1.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, v96.l, v1.h
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v22.l, 0x300, v2.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v24.l, 0x300, v0.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v24.h, 0x300, v0.h
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v23.l, 0x300, v1.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v23.h, 0x300, v1.h
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v32.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v116.l, 3
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(26)
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, v100.h, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, v32.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, v179.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, v131.l, 3
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(24)
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.h, v100.l, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, v98.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.h, v183.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, v163.l, 3
 ; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v0.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v1.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v1.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v132.l, v0.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v133.h, v0.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v134.l, v1.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, v132.h, v1.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v133.l, v2.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v84.l, v0.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v85.h, v0.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v86.l, v1.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, v84.h, v1.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v85.l, v2.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v22.h, 0x300, v0.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v21.l, 0x300, v0.h
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v21.h, 0x300, v1.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v20.l, 0x300, v1.h
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(22)
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v97.l, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, v85.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v167.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, v150.l, 3
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(20)
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, v96.l, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.h, v86.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, v177.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.h, v161.l, 3
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v20.h, 0x300, v2.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v0.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v1.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v1.h
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(18)
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, v87.h, 3
-; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v131.h, v0.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v129.h, v0.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v130.h, v1.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, v131.l, v1.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, v180.l, 3
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v83.h, v0.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v81.h, v0.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v82.h, v1.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, v83.l, v1.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v2.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v19.l, 0x300, v0.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v19.h, 0x300, v0.h
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v18.l, 0x300, v1.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v18.h, 0x300, v1.h
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v71.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v149.l, 3
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(16)
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, v84.l, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, v80.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, v164.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, v151.l, 3
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(14)
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.h, v96.h, 3
-; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v130.l, v2.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.h, v47.l, 3
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v82.l, v2.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v0.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v1.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v1.h
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v17.l, 0x300, v2.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v117.h, v0.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v128.h, v0.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v129.l, v1.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, v119.h, v1.h
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, v86.h, 3
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v69.h, v0.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v80.h, v0.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v81.l, v1.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, v71.h, v1.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, v41.l, 3
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v17.h, 0x300, v0.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v16.l, 0x300, v0.h
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v16.h, 0x300, v1.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v15.l, 0x300, v1.h
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(12)
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v85.l, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, v80.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v43.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, v165.l, 3
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(10)
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, v84.h, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.h, v82.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, v45.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.h, v181.l, 3
 ; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v2.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v0.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v1.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v1.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v128.l, v2.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v119.l, v0.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v114.h, v0.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v118.l, v1.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, v118.h, v1.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v80.l, v2.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v71.l, v0.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v66.h, v0.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v70.l, v1.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, v70.h, v1.h
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v15.h, 0x300, v2.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(8)
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, v83.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, v46.l, 3
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v14.l, 0x300, v0.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v14.h, 0x300, v0.h
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v13.l, 0x300, v1.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v13.h, 0x300, v1.h
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v69.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v162.l, 3
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(6)
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, v71.l, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, v69.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, v42.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, v176.l, 3
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(4)
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.h, v70.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.h, v44.l, 3
 ; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v2.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v0.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v1.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v1.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v117.l, v2.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v114.l, v0.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v116.l, v0.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v116.h, v1.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, v115.l, v1.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v69.l, v2.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v66.l, v0.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v68.l, v0.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v68.h, v1.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, v67.l, v1.h
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v12.l, 0x300, v2.l
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, v68.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, v178.l, 3
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v12.h, 0x300, v0.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v11.l, 0x300, v0.h
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v11.h, 0x300, v1.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v10.l, 0x300, v1.h
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(2)
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v68.l, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, v66.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v182.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, v160.l, 3
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, v67.h, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.h, v66.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, v40.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.h, v166.l, 3
 ; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v2.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v0.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v1.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v1.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v115.h, v2.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v113.h, v0.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v103.h, v0.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v112.h, v1.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, v113.l, v1.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v67.h, v2.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v65.h, v0.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v55.h, v0.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v64.h, v1.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, v65.l, v1.h
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v10.h, 0x300, v2.l
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, v67.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, v39.h, 3
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v9.l, 0x300, v0.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v9.h, 0x300, v0.h
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v8.l, 0x300, v1.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v8.h, 0x300, v1.h
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v54.l, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, v65.h, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, v64.h, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.h, v65.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v35.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, v39.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, v38.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.h, v38.h, 3
 ; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v2.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v0.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v1.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v1.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v112.l, v2.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v101.l, v0.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v102.h, v0.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v103.l, v1.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, v101.h, v1.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v64.l, v2.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v53.l, v0.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v54.h, v0.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v55.l, v1.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, v53.h, v1.h
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v7.l, 0x300, v2.l
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, v64.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, v37.h, 3
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v7.h, 0x300, v0.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v6.l, 0x300, v0.h
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v6.h, 0x300, v1.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v5.l, 0x300, v1.h
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v55.h, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, v52.l, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, v55.l, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.h, v53.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v37.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, v33.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, v36.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.h, v35.l, 3
 ; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v2.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v0.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v1.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v1.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v102.l, v2.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v99.h, v0.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v87.l, v0.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v98.h, v1.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, v99.l, v1.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v54.l, v2.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v52.h, v0.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v50.h, v0.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v51.h, v1.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, v52.l, v1.h
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v5.h, 0x300, v2.l
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, v54.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, v36.l, 3
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v4.l, 0x300, v0.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v4.h, 0x300, v0.h
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v3.l, 0x300, v1.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v3.h, 0x300, v1.h
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v49.h, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, v53.l, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, v51.l, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.h, v52.h, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.h, v51.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v32.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, v34.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, v32.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.h, v34.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.h, v33.l, 3
 ; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v2.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v0.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v1.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v1.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v2.h, 0xff, v2.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v97.h, v2.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v70.l, v0.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v82.h, v0.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v32.l, v83.l, v1.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v32.h, v81.l, v1.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v33.l, v81.h, v2.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v51.l, v2.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v48.l, v0.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v49.h, v0.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v32.l, v50.l, v1.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v32.h, v48.h, v1.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v33.l, v49.l, v2.h
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, 0x300, v2.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.h, 0x300, v0.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, 0x300, v0.h
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.h, 0x300, v32.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, 0x300, v32.h
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, 0x300, v33.l
+; GFX11-TRUE16-NEXT:  .LBB96_4: ; %end
 ; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_clause 0x1f
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v104, off, s32 offset:392
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v95, off, s32 offset:396
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v94, off, s32 offset:400
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v93, off, s32 offset:404
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v92, off, s32 offset:408
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v91, off, s32 offset:412
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v90, off, s32 offset:416
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v89, off, s32 offset:420
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v88, off, s32 offset:424
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v79, off, s32 offset:428
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v78, off, s32 offset:432
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v77, off, s32 offset:436
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v76, off, s32 offset:440
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v75, off, s32 offset:444
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v74, off, s32 offset:448
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v73, off, s32 offset:452
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v72, off, s32 offset:456
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v63, off, s32 offset:460
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v62, off, s32 offset:464
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v61, off, s32 offset:468
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v60, off, s32 offset:472
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v59, off, s32 offset:476
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v58, off, s32 offset:480
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v57, off, s32 offset:484
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v56, off, s32 offset:488
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v47, off, s32 offset:492
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v46, off, s32 offset:496
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v45, off, s32 offset:500
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v44, off, s32 offset:504
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v43, off, s32 offset:508
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v42, off, s32 offset:512
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v41, off, s32 offset:516
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v40, off, s32 offset:520 ; 4-byte Folded Reload
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-FAKE16-LABEL: bitcast_v128i8_to_v64i16:
@@ -201554,1657 +196594,831 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-TRUE16-LABEL: bitcast_v128i8_to_v64i16_scalar:
-; GFX11-TRUE16:       ; %bb.0:
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT:    s_clause 0x1e
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v40, s32 offset:440
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v41, s32 offset:436
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v42, s32 offset:432
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v43, s32 offset:428
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v44, s32 offset:424
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v45, s32 offset:420
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v46, s32 offset:416
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v47, s32 offset:412
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v56, s32 offset:408
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v57, s32 offset:404
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v58, s32 offset:400
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v59, s32 offset:396
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v60, s32 offset:392
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v61, s32 offset:388
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v62, s32 offset:384
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v63, s32 offset:380
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v72, s32 offset:376
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v73, s32 offset:372
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v74, s32 offset:368
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v75, s32 offset:364
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v76, s32 offset:360
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v77, s32 offset:356
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v78, s32 offset:352
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v79, s32 offset:348
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v88, s32 offset:344
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v89, s32 offset:340
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v90, s32 offset:336
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v91, s32 offset:332
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v92, s32 offset:328
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v93, s32 offset:324
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v94, s32 offset:320
-; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v54, v30 :: v_dual_mov_b32 v51, v24
-; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v52, v28 :: v_dual_mov_b32 v55, v26
-; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v53, v22 :: v_dual_mov_b32 v48, v20
-; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v50, v18 :: v_dual_mov_b32 v39, v16
-; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v49, v14 :: v_dual_mov_b32 v34, v8
-; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v37, v12 :: v_dual_mov_b32 v38, v10
-; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v36, v6 :: v_dual_mov_b32 v33, v4
-; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v35, v2 :: v_dual_mov_b32 v32, v0
-; GFX11-TRUE16-NEXT:    s_clause 0x1f
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v0, off, s32 offset:316
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v2, off, s32
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v4, off, s32 offset:8
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v6, off, s32 offset:16
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v8, off, s32 offset:24
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v10, off, s32 offset:32
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v12, off, s32 offset:40
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v14, off, s32 offset:48
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v16, off, s32 offset:56
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v18, off, s32 offset:64
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v20, off, s32 offset:72
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v22, off, s32 offset:80
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v24, off, s32 offset:88
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v26, off, s32 offset:96
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v28, off, s32 offset:104
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v30, off, s32 offset:112
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v31, off, s32 offset:120
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v41, off, s32 offset:128
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v44, off, s32 offset:136
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v45, off, s32 offset:144
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v56, off, s32 offset:152
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v59, off, s32 offset:160
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v60, off, s32 offset:168
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v61, off, s32 offset:176
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v62, off, s32 offset:184
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v63, off, s32 offset:192
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v72, off, s32 offset:200
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v73, off, s32 offset:208
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v74, off, s32 offset:216
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v75, off, s32 offset:224
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v76, off, s32 offset:232
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v77, off, s32 offset:240
-; GFX11-TRUE16-NEXT:    s_clause 0x1f
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v78, off, s32 offset:248
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v79, off, s32 offset:256
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v88, off, s32 offset:264
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v89, off, s32 offset:272
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v90, off, s32 offset:280
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v91, off, s32 offset:288
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v92, off, s32 offset:296
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v93, off, s32 offset:304
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v94, off, s32 offset:312
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v57, off, s32 offset:308
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v58, off, s32 offset:300
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v46, off, s32 offset:292
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v47, off, s32 offset:284
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v40, off, s32 offset:276
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v43, off, s32 offset:268
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v182, off, s32 offset:260
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v183, off, s32 offset:252
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v179, off, s32 offset:244
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v181, off, s32 offset:236
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v164, off, s32 offset:228
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v176, off, s32 offset:220
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v160, off, s32 offset:212
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v163, off, s32 offset:204
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v146, off, s32 offset:196
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v150, off, s32 offset:188
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v135, off, s32 offset:180
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v145, off, s32 offset:172
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v119, off, s32 offset:164
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v131, off, s32 offset:156
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v115, off, s32 offset:148
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v178, off, s32 offset:140
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v162, off, s32 offset:132
-; GFX11-TRUE16-NEXT:    s_clause 0xf
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v165, off, s32 offset:124
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v149, off, s32 offset:116
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v151, off, s32 offset:108
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v144, off, s32 offset:100
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v148, off, s32 offset:92
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v129, off, s32 offset:84
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v133, off, s32 offset:76
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v117, off, s32 offset:68
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v118, off, s32 offset:60
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v114, off, s32 offset:52
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v116, off, s32 offset:44
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v100, off, s32 offset:36
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v103, off, s32 offset:28
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v98, off, s32 offset:20
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v99, off, s32 offset:12
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v96, off, s32 offset:4
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v64, 8, v1
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v66, 8, v3
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v65, 8, v5
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v68, 8, v7
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v67, 8, v9
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v70, 8, v11
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v69, 8, v13
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v71, 8, v15
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v80, 8, v17
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v82, 8, v19
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v81, 8, v21
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v84, 8, v23
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v83, 8, v25
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v86, 8, v27
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v85, 8, v29
-; GFX11-TRUE16-NEXT:    s_mov_b32 s4, 0
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(62)
-; GFX11-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v97, 8, v2
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v87, 8, v4
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v101, 8, v6
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v102, 8, v8
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v113, 8, v10
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v112, 8, v12
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v130, 8, v14
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v128, 8, v16
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v134, 8, v18
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v132, 8, v20
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v161, 8, v22
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v147, 8, v24
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v166, 8, v26
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v167, 8, v28
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v180, 8, v30
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v177, 8, v31
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v42, 8, v41
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(61)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v41, 8, v44
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(60)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v45, 8, v45
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(59)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v44, 8, v56
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(58)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v59, 8, v59
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(57)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v56, 8, v60
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(56)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v60, 8, v61
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(55)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v61, 8, v62
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(54)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v63, 8, v63
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(53)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v62, 8, v72
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(52)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v73, 8, v73
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(51)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v72, 8, v74
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(50)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v75, 8, v75
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(49)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v74, 8, v76
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(48)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v77, 8, v77
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(47)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v76, 8, v78
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(46)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v78, 8, v79
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(45)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v79, 8, v88
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(44)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v89, 8, v89
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(43)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v88, 8, v90
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(42)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v91, 8, v91
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(41)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v90, 8, v92
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(40)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v92, 8, v93
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(39)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v93, 8, v94
-; GFX11-TRUE16-NEXT:    s_and_b32 s5, vcc_lo, exec_lo
-; GFX11-TRUE16-NEXT:    s_cbranch_scc0 .LBB97_4
-; GFX11-TRUE16-NEXT:  ; %bb.1: ; %cmp.false
-; GFX11-TRUE16-NEXT:    s_and_b32 s5, s28, 0xff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s6, s29, 8
-; GFX11-TRUE16-NEXT:    s_and_b32 s7, s2, 0xff
-; GFX11-TRUE16-NEXT:    s_or_b32 s5, s5, s6
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s6, s1, 8
-; GFX11-TRUE16-NEXT:    v_and_b32_e64 v5, 0xffff, s5
-; GFX11-TRUE16-NEXT:    s_and_b32 s5, s0, 0xff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s8, s3, 8
-; GFX11-TRUE16-NEXT:    s_or_b32 s5, s5, s6
-; GFX11-TRUE16-NEXT:    s_or_b32 s6, s7, s8
-; GFX11-TRUE16-NEXT:    s_and_b32 s7, s16, 0xff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s8, s17, 8
-; GFX11-TRUE16-NEXT:    s_and_b32 s9, s18, 0xff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s10, s19, 8
-; GFX11-TRUE16-NEXT:    s_or_b32 s7, s7, s8
-; GFX11-TRUE16-NEXT:    s_or_b32 s8, s9, s10
-; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s5, s5, s6
-; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s6, s7, s8
-; GFX11-TRUE16-NEXT:    s_and_b32 s7, s20, 0xff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s8, s21, 8
-; GFX11-TRUE16-NEXT:    s_and_b32 s9, s22, 0xff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s10, s23, 8
-; GFX11-TRUE16-NEXT:    s_or_b32 s7, s7, s8
-; GFX11-TRUE16-NEXT:    s_or_b32 s8, s9, s10
-; GFX11-TRUE16-NEXT:    s_and_b32 s9, s24, 0xff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s10, s25, 8
-; GFX11-TRUE16-NEXT:    s_and_b32 s11, s26, 0xff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s12, s27, 8
-; GFX11-TRUE16-NEXT:    s_or_b32 s9, s9, s10
-; GFX11-TRUE16-NEXT:    s_or_b32 s10, s11, s12
-; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s7, s7, s8
-; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s8, s9, s10
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v2, 0xff, v36
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v32
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff, v35
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v4, 0xff, v34
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v3, 0xff, v33
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v2, v2, v68
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v0, v64
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v1, v66
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v6, v4, v67
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v3, v3, v65
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v4, v0, 16, v5
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v38
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v7, 0xff, v39
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v6, v6, 16, v2
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v2, 0xff, v49
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v5, v3, 16, v1
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff, v37
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v0, v70
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v3, 0xff, v50
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v2, v2, v71
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v8, 0xff, v48
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v1, v69
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v3, v3, v82
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v9, v7, v80
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v10, v8, v81
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v7, v1, 16, v0
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v3, 0xffff, v3
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v53
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v8, v9, 16, v2
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v2, 0xff, v55
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff, v51
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v9, v10, 16, v3
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v0, v84
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v3, 0xff, v52
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v10, 0xff, v54
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v2, v2, v86
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v1, v83
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v11, 0xff, v96
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v3, v3, v85
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v12, v10, v97
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v10, v1, 16, v0
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v11, v87
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff, v99
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v12, 0xffff, v12
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v11, v3, 16, v2
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v3, 0xff, v103
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v14, 0xff, v114
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v2, 0xff, v98
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v12, v0, 16, v12
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v100
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v3, v3, v113
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v1, v101
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v13, 0xff, v116
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v17, v14, v128
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v0, v112
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v3, 0xffff, v3
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v14, 0xff, v117
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v2, v2, v102
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v13, v13, v130
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v16, 0xff, v133
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v20, v14, v132
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v14, v0, 16, v3
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v148
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v15, 0xff, v118
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v18, 0xff, v129
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v16, v16, v161
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v19, 0xffff, v13
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v13, v2, 16, v1
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v0, v166
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff, v144
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v15, v15, v134
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v18, v18, v147
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v22, 0xffff, v16
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v1, v167
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v21, 0xffff, v15
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v15, v17, 16, v19
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v17, v18, 16, v22
-; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v2, s7
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v18, v1, 16, v0
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v151
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff, v149
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v16, v20, 16, v21
-; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v3, s8
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v0, v180
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v1, v177
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v19, v1, 16, v0
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v165
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff, v162
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v0, v42
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v1, v41
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v20, v1, 16, v0
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v178
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff, v115
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v0, v45
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v1, v44
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v21, v1, 16, v0
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v131
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff, v119
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v0, v59
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v1, v56
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v22, v1, 16, v0
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v145
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff, v135
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v0, v60
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v1, v61
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v23, v1, 16, v0
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v150
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff, v146
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v0, v63
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v1, v62
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v24, v1, 16, v0
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v163
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff, v160
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v0, v73
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v1, v72
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v25, v1, 16, v0
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v176
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff, v164
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v0, v75
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v1, v74
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v26, v1, 16, v0
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v181
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff, v179
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v0, v77
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v1, v76
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v27, v1, 16, v0
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v183
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff, v182
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v0, v78
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v1, v79
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v28, v1, 16, v0
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v43
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff, v40
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v0, v89
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v1, v88
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v29, v1, 16, v0
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v47
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff, v46
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v0, v91
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v1, v90
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v30, v1, 16, v0
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v58
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff, v57
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v0, v92
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v1, v93
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v31, v1, 16, v0
-; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, s6
-; GFX11-TRUE16-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s4
-; GFX11-TRUE16-NEXT:    s_cbranch_vccnz .LBB97_3
-; GFX11-TRUE16-NEXT:  .LBB97_2: ; %cmp.true
-; GFX11-TRUE16-NEXT:    s_add_i32 s28, s28, 3
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s5, s29, 8
-; GFX11-TRUE16-NEXT:    s_and_b32 s4, s28, 0xff
-; GFX11-TRUE16-NEXT:    s_add_i32 s24, s24, 3
-; GFX11-TRUE16-NEXT:    s_or_b32 s4, s5, s4
-; GFX11-TRUE16-NEXT:    s_and_b32 s5, s24, 0xff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s6, s25, 8
-; GFX11-TRUE16-NEXT:    s_add_i32 s26, s26, 3
-; GFX11-TRUE16-NEXT:    s_or_b32 s5, s6, s5
-; GFX11-TRUE16-NEXT:    s_and_b32 s6, s26, 0xff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s7, s27, 8
-; GFX11-TRUE16-NEXT:    s_add_i32 s20, s20, 3
-; GFX11-TRUE16-NEXT:    s_or_b32 s6, s7, s6
-; GFX11-TRUE16-NEXT:    s_and_b32 s7, s20, 0xff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s8, s21, 8
-; GFX11-TRUE16-NEXT:    s_add_i32 s22, s22, 3
-; GFX11-TRUE16-NEXT:    s_or_b32 s7, s8, s7
-; GFX11-TRUE16-NEXT:    s_and_b32 s8, s22, 0xff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s9, s23, 8
-; GFX11-TRUE16-NEXT:    s_add_i32 s16, s16, 3
-; GFX11-TRUE16-NEXT:    s_or_b32 s8, s9, s8
-; GFX11-TRUE16-NEXT:    s_and_b32 s9, s16, 0xff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s10, s17, 8
-; GFX11-TRUE16-NEXT:    s_add_i32 s18, s18, 3
-; GFX11-TRUE16-NEXT:    s_add_i32 s0, s0, 3
-; GFX11-TRUE16-NEXT:    s_add_i32 s2, s2, 3
-; GFX11-TRUE16-NEXT:    s_or_b32 s9, s10, s9
-; GFX11-TRUE16-NEXT:    s_and_b32 s10, s18, 0xff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s11, s19, 8
-; GFX11-TRUE16-NEXT:    s_and_b32 s0, s0, 0xff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s1, s1, 8
-; GFX11-TRUE16-NEXT:    s_and_b32 s2, s2, 0xff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s3, s3, 8
-; GFX11-TRUE16-NEXT:    s_or_b32 s10, s11, s10
-; GFX11-TRUE16-NEXT:    s_or_b32 s0, s1, s0
-; GFX11-TRUE16-NEXT:    s_or_b32 s1, s3, s2
-; GFX11-TRUE16-NEXT:    s_addk_i32 s9, 0x300
-; GFX11-TRUE16-NEXT:    s_addk_i32 s0, 0x300
-; GFX11-TRUE16-NEXT:    s_addk_i32 s1, 0x300
-; GFX11-TRUE16-NEXT:    s_addk_i32 s10, 0x300
-; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s0, s0, s1
-; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s1, s9, s10
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(37)
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v0, 3, v58
-; GFX11-TRUE16-NEXT:    s_addk_i32 s5, 0x300
-; GFX11-TRUE16-NEXT:    s_addk_i32 s6, 0x300
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v1, 3, v57
-; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s3, s5, s6
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(35)
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v2, 3, v47
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT:    s_addk_i32 s7, 0x300
-; GFX11-TRUE16-NEXT:    s_addk_i32 s8, 0x300
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v92, v0
-; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s2, s7, s8
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v3, 3, v46
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v93, v1
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v2, v91, v2
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(33)
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v4, 3, v43
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v27, 0x300, v0
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v0, 3, v40
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v31, 0x300, v1
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v28, 0x300, v2
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff, v4
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v3, v90, v3
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(31)
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v2, 3, v183
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v4, 3, v182
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v89, v1
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v30, 0x300, v3
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v3, 0xff, v4
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(29)
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v4, 3, v181
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v88, v0
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v181, 0x300, v1
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v78, v2
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v2, v79, v3
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v3, 0xff, v4
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v29, 0x300, v0
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v0, 3, v179
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v182, 0x300, v1
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v179, 0x300, v2
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v77, v3
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(27)
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v2, 3, v176
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v3, 3, v164
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(25)
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v4, 3, v163
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v163, 0x300, v1
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff, v2
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v76, v0
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v2, 0xff, v3
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v3, 0xff, v4
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v4, 3, v160
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v75, v1
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v160, 0x300, v0
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v74, v2
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v2, v73, v3
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v3, 0xff, v4
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v22, 0x300, v1
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(23)
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v1, 3, v150
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v26, 0x300, v0
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v23, 0x300, v2
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v72, v3
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v2, 3, v146
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(21)
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v3, 3, v145
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v4, 3, v135
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v25, 0x300, v0
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v2
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v63, v1
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v2, 0xff, v3
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v3, 0xff, v4
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(19)
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v4, 3, v131
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v62, v0
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v131, 0x300, v1
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v60, v2
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v2, v61, v3
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v3, 0xff, v4
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v24, 0x300, v0
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v0, 3, v119
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v135, 0x300, v1
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v119, 0x300, v2
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v59, v3
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(17)
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v2, 3, v178
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v3, 3, v115
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(15)
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v4, 3, v165
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v115, 0x300, v1
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff, v2
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v56, v0
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v2, 0xff, v3
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v3, 0xff, v4
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v4, 3, v162
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v45, v1
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v145, 0x300, v0
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v44, v2
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v2, v42, v3
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v3, 0xff, v4
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v17, 0x300, v1
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(13)
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v1, 3, v151
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v21, 0x300, v0
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v18, 0x300, v2
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v41, v3
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v2, 3, v149
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(11)
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v3, 3, v148
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v4, 3, v144
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v20, 0x300, v0
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v2
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v180, v1
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v2, 0xff, v3
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v3, 0xff, v4
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(9)
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v4, 3, v133
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v177, v0
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v19, 0x300, v1
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v166, v2
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v2, v167, v3
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v3, 0xff, v4
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v133, 0x300, v0
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v0, 3, v129
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v129, 0x300, v1
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v144, 0x300, v2
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v161, v3
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(7)
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v2, 3, v118
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v3, 3, v117
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(5)
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v4, 3, v116
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v116, 0x300, v1
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff, v2
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v147, v0
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v2, 0xff, v3
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v3, 0xff, v4
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v4, 3, v114
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(1)
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v5, 3, v99
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v114, 0x300, v0
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v132, v2
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v2, v130, v3
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v3, 0xff, v4
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v4, 3, v103
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v6, 3, v98
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v5, 0xff, v5
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v7, 3, v54
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v8, 3, v53
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v6, 0xff, v6
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v9, 3, v39
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v33, 3, v33
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v16, 0x300, v0
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v4, v113, v4
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v0, 0x300, v2
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v2, v128, v3
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v3, 3, v100
-; GFX11-TRUE16-NEXT:    s_addk_i32 s4, 0x300
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v11, 0x300, v4
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v4, v101, v5
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v5, v102, v6
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v6, 0xff, v7
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v7, 3, v96
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v134, v1
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v12, 0x300, v4
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v13, 0x300, v5
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v4, v97, v6
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v5, 3, v55
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v6, 0xff, v7
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v7, 3, v52
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v14, 0x300, v4
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v4, 0xff, v5
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v5, v87, v6
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v6, 0xff, v7
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v7, 0xff, v8
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v8, 3, v51
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v4, v86, v4
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v15, 0x300, v5
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v5, v85, v6
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v6, v84, v7
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v7, 0xff, v8
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v51, 0x300, v4
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v4, 3, v50
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v50, 0x300, v5
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v8, 3, v49
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v5, v83, v7
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v7, 3, v48
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v6, 0x300, v6
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v1, 0x300, v1
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v10, 0x300, v5
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v5, 0xff, v7
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v4, v82, v4
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v7, 0xff, v8
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v8, 0xff, v9
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v9, 3, v38
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v5, v81, v5
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v38, 0x300, v4
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v4, v71, v7
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v7, v80, v8
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v8, 0xff, v9
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v9, 0x300, v5
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v5, 3, v37
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v37, 0x300, v4
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v39, 0x300, v7
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v4, v70, v8
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v7, 3, v36
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v5, 0xff, v5
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v8, 3, v34
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v34, 3, v35
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v35, 0x300, v4
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v4, 0xff, v7
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v5, v69, v5
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v7, 0xff, v8
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v8, 0xff, v34
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v3, v112, v3
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v4, v68, v4
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v34, 0x300, v5
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v5, v67, v7
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v7, v66, v8
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v8, 0xff, v33
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v33, 0x300, v4
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v4, 3, v32
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v32, 0x300, v5
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v5, 0x300, v7
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v7, v65, v8
-; GFX11-TRUE16-NEXT:    v_and_b32_e64 v8, 0xffff, s4
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v36, 0xffff, v6
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v5, 0xffff, v5
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v7, 0x300, v7
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v3, 0x300, v3
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v4, v64, v4
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v14, 0xffff, v14
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v10, v10, 16, v36
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v5, v7, 16, v5
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v7, 0xffff, v33
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v4, 0x300, v4
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v33, 0xffff, v37
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v36, 0xffff, v22
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v2, 0x300, v2
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v6, v32, 16, v7
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v4, v4, 16, v8
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v8, 0xffff, v35
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v32, 0xffff, v51
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v35, 0xffff, v38
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v19, 0xffff, v19
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v7, v34, 16, v8
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v8, v39, 16, v33
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v33, 0xffff, v12
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v34, 0xffff, v11
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v11, v50, 16, v32
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v32, 0xffff, v1
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v9, v9, 16, v35
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v12, v15, 16, v14
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v13, v13, 16, v33
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v14, v3, 16, v34
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v16, v16, 16, v32
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v32, 0xffff, v116
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v33, 0xffff, v129
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v34, 0xffff, v18
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v35, 0xffff, v17
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v26, v26, 16, v36
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v17, v114, 16, v32
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v18, v144, 16, v33
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v20, v20, 16, v34
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v21, v21, 16, v35
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v32, 0xffff, v115
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v33, 0xffff, v135
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v34, 0xffff, v131
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v35, 0xffff, v23
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v36, 0xffff, v27
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v22, v145, 16, v32
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v23, v119, 16, v33
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v24, v24, 16, v34
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v25, v25, 16, v35
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v32, 0xffff, v163
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v33, 0xffff, v182
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v34, 0xffff, v181
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v35, 0xffff, v28
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v15, v2, 16, v0
-; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v19, v133, 16, v19
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v27, v160, 16, v32
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v28, v179, 16, v33
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v29, v29, 16, v34
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v30, v30, 16, v35
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v31, v31, 16, v36
-; GFX11-TRUE16-NEXT:  .LBB97_3: ; %end
-; GFX11-TRUE16-NEXT:    s_clause 0x1e
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v94, off, s32 offset:320
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v93, off, s32 offset:324
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v92, off, s32 offset:328
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v91, off, s32 offset:332
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v90, off, s32 offset:336
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v89, off, s32 offset:340
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v88, off, s32 offset:344
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v79, off, s32 offset:348
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v78, off, s32 offset:352
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v77, off, s32 offset:356
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v76, off, s32 offset:360
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v75, off, s32 offset:364
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v74, off, s32 offset:368
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v73, off, s32 offset:372
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v72, off, s32 offset:376
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v63, off, s32 offset:380
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v62, off, s32 offset:384
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v61, off, s32 offset:388
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v60, off, s32 offset:392
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v59, off, s32 offset:396
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v58, off, s32 offset:400
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v57, off, s32 offset:404
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v56, off, s32 offset:408
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v47, off, s32 offset:412
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v46, off, s32 offset:416
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v45, off, s32 offset:420
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v44, off, s32 offset:424
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v43, off, s32 offset:428
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v42, off, s32 offset:432
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v41, off, s32 offset:436
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v40, off, s32 offset:440
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
-; GFX11-TRUE16-NEXT:  .LBB97_4:
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
-; GFX11-TRUE16-NEXT:    s_branch .LBB97_2
-;
-; GFX11-FAKE16-LABEL: bitcast_v128i8_to_v64i16_scalar:
-; GFX11-FAKE16:       ; %bb.0:
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-FAKE16-NEXT:    s_clause 0x1e
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v40, s32 offset:440
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v41, s32 offset:436
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v42, s32 offset:432
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v43, s32 offset:428
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v44, s32 offset:424
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v45, s32 offset:420
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v46, s32 offset:416
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v47, s32 offset:412
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v56, s32 offset:408
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v57, s32 offset:404
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v58, s32 offset:400
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v59, s32 offset:396
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v60, s32 offset:392
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v61, s32 offset:388
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v62, s32 offset:384
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v63, s32 offset:380
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v72, s32 offset:376
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v73, s32 offset:372
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v74, s32 offset:368
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v75, s32 offset:364
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v76, s32 offset:360
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v77, s32 offset:356
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v78, s32 offset:352
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v79, s32 offset:348
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v88, s32 offset:344
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v89, s32 offset:340
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v90, s32 offset:336
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v91, s32 offset:332
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v92, s32 offset:328
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v93, s32 offset:324
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v94, s32 offset:320
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v54, v30 :: v_dual_mov_b32 v51, v24
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v52, v28 :: v_dual_mov_b32 v55, v26
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v53, v22 :: v_dual_mov_b32 v48, v20
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v50, v18 :: v_dual_mov_b32 v39, v16
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v49, v14 :: v_dual_mov_b32 v34, v8
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v37, v12 :: v_dual_mov_b32 v38, v10
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v36, v6 :: v_dual_mov_b32 v33, v4
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v35, v2 :: v_dual_mov_b32 v32, v0
-; GFX11-FAKE16-NEXT:    s_clause 0x1f
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v0, off, s32 offset:316
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v2, off, s32
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v4, off, s32 offset:8
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v6, off, s32 offset:16
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v8, off, s32 offset:24
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v10, off, s32 offset:32
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v12, off, s32 offset:40
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v14, off, s32 offset:48
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v16, off, s32 offset:56
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v18, off, s32 offset:64
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v20, off, s32 offset:72
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v22, off, s32 offset:80
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v24, off, s32 offset:88
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v26, off, s32 offset:96
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v28, off, s32 offset:104
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v30, off, s32 offset:112
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v31, off, s32 offset:120
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v41, off, s32 offset:128
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v44, off, s32 offset:136
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v45, off, s32 offset:144
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v56, off, s32 offset:152
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v59, off, s32 offset:160
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v60, off, s32 offset:168
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v61, off, s32 offset:176
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v62, off, s32 offset:184
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v63, off, s32 offset:192
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v72, off, s32 offset:200
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v73, off, s32 offset:208
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v74, off, s32 offset:216
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v75, off, s32 offset:224
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v76, off, s32 offset:232
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v77, off, s32 offset:240
-; GFX11-FAKE16-NEXT:    s_clause 0x1f
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v78, off, s32 offset:248
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v79, off, s32 offset:256
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v88, off, s32 offset:264
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v89, off, s32 offset:272
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v90, off, s32 offset:280
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v91, off, s32 offset:288
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v92, off, s32 offset:296
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v93, off, s32 offset:304
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v94, off, s32 offset:312
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v57, off, s32 offset:308
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v58, off, s32 offset:300
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v46, off, s32 offset:292
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v47, off, s32 offset:284
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v40, off, s32 offset:276
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v43, off, s32 offset:268
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v182, off, s32 offset:260
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v183, off, s32 offset:252
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v178, off, s32 offset:244
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v181, off, s32 offset:236
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v164, off, s32 offset:228
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v176, off, s32 offset:220
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v160, off, s32 offset:212
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v163, off, s32 offset:204
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v146, off, s32 offset:196
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v150, off, s32 offset:188
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v135, off, s32 offset:180
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v145, off, s32 offset:172
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v118, off, s32 offset:164
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v131, off, s32 offset:156
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v115, off, s32 offset:148
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v179, off, s32 offset:140
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v162, off, s32 offset:132
-; GFX11-FAKE16-NEXT:    s_clause 0xf
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v165, off, s32 offset:124
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v149, off, s32 offset:116
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v151, off, s32 offset:108
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v144, off, s32 offset:100
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v148, off, s32 offset:92
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v129, off, s32 offset:84
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v133, off, s32 offset:76
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v117, off, s32 offset:68
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v119, off, s32 offset:60
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v114, off, s32 offset:52
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v116, off, s32 offset:44
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v100, off, s32 offset:36
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v103, off, s32 offset:28
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v98, off, s32 offset:20
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v99, off, s32 offset:12
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v96, off, s32 offset:4
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v64, 8, v1
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v66, 8, v3
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v65, 8, v5
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v68, 8, v7
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v67, 8, v9
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v70, 8, v11
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v69, 8, v13
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v71, 8, v15
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v80, 8, v17
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v82, 8, v19
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v81, 8, v21
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v84, 8, v23
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v83, 8, v25
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v86, 8, v27
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v85, 8, v29
-; GFX11-FAKE16-NEXT:    s_mov_b32 s4, 0
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(62)
-; GFX11-FAKE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v97, 8, v2
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v87, 8, v4
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v101, 8, v6
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v102, 8, v8
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v113, 8, v10
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v112, 8, v12
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v130, 8, v14
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v128, 8, v16
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v134, 8, v18
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v132, 8, v20
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v161, 8, v22
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v147, 8, v24
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v166, 8, v26
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v167, 8, v28
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v180, 8, v30
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v177, 8, v31
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v42, 8, v41
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(61)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v41, 8, v44
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(60)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v45, 8, v45
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(59)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v44, 8, v56
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(58)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v59, 8, v59
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(57)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v56, 8, v60
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(56)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v60, 8, v61
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(55)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v61, 8, v62
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(54)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v63, 8, v63
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(53)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v62, 8, v72
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(52)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v73, 8, v73
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(51)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v72, 8, v74
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(50)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v75, 8, v75
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(49)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v74, 8, v76
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(48)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v77, 8, v77
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(47)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v76, 8, v78
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(46)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v78, 8, v79
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(45)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v79, 8, v88
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(44)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v89, 8, v89
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(43)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v88, 8, v90
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(42)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v91, 8, v91
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(41)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v90, 8, v92
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(40)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v92, 8, v93
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(39)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v93, 8, v94
-; GFX11-FAKE16-NEXT:    s_and_b32 s5, vcc_lo, exec_lo
-; GFX11-FAKE16-NEXT:    s_cbranch_scc0 .LBB97_4
-; GFX11-FAKE16-NEXT:  ; %bb.1: ; %cmp.false
-; GFX11-FAKE16-NEXT:    s_and_b32 s5, s28, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s6, s29, 8
-; GFX11-FAKE16-NEXT:    s_and_b32 s7, s2, 0xff
-; GFX11-FAKE16-NEXT:    s_or_b32 s5, s5, s6
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s6, s1, 8
-; GFX11-FAKE16-NEXT:    v_and_b32_e64 v5, 0xffff, s5
-; GFX11-FAKE16-NEXT:    s_and_b32 s5, s0, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s8, s3, 8
-; GFX11-FAKE16-NEXT:    s_or_b32 s5, s5, s6
-; GFX11-FAKE16-NEXT:    s_or_b32 s6, s7, s8
-; GFX11-FAKE16-NEXT:    s_and_b32 s7, s16, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s8, s17, 8
-; GFX11-FAKE16-NEXT:    s_and_b32 s9, s18, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s10, s19, 8
-; GFX11-FAKE16-NEXT:    s_or_b32 s7, s7, s8
-; GFX11-FAKE16-NEXT:    s_or_b32 s8, s9, s10
-; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s5, s5, s6
-; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s6, s7, s8
-; GFX11-FAKE16-NEXT:    s_and_b32 s7, s20, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s8, s21, 8
-; GFX11-FAKE16-NEXT:    s_and_b32 s9, s22, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s10, s23, 8
-; GFX11-FAKE16-NEXT:    s_or_b32 s7, s7, s8
-; GFX11-FAKE16-NEXT:    s_or_b32 s8, s9, s10
-; GFX11-FAKE16-NEXT:    s_and_b32 s9, s24, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s10, s25, 8
-; GFX11-FAKE16-NEXT:    s_and_b32 s11, s26, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s12, s27, 8
-; GFX11-FAKE16-NEXT:    s_or_b32 s9, s9, s10
-; GFX11-FAKE16-NEXT:    s_or_b32 s10, s11, s12
-; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s7, s7, s8
-; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s8, s9, s10
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v36
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v32
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v35
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xff, v34
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v33
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v2, v68
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v64
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v66
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v6, v4, v67
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v3, v65
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v4, v0, 16, v5
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v38
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xff, v39
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v6, v6, 16, v2
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v49
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v5, v3, 16, v1
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v37
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v70
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v50
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v2, v71
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 0xff, v48
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v69
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v3, v82
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v9, v7, v80
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v10, v8, v81
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v7, v1, 16, v0
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xffff, v3
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v53
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v8, v9, 16, v2
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v55
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v51
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v9, v10, 16, v3
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v84
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v52
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v10, 0xff, v54
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v2, v86
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v83
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v11, 0xff, v96
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v3, v85
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v12, v10, v97
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v10, v1, 16, v0
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v11, v87
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v99
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v12, 0xffff, v12
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v11, v3, 16, v2
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v103
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v14, 0xff, v114
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v98
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v12, v0, 16, v12
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v100
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v3, v113
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v101
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v13, 0xff, v116
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v17, v14, v128
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v112
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xffff, v3
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v14, 0xff, v117
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v2, v102
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v13, v13, v130
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v16, 0xff, v133
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v20, v14, v132
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v14, v0, 16, v3
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v148
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v15, 0xff, v119
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v18, 0xff, v129
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v16, v16, v161
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v19, 0xffff, v13
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v13, v2, 16, v1
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v166
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v144
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v15, v15, v134
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v18, v18, v147
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v22, 0xffff, v16
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v167
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v21, 0xffff, v15
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v15, v17, 16, v19
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v17, v18, 16, v22
-; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v2, s7
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v18, v1, 16, v0
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v151
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v149
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v16, v20, 16, v21
-; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v3, s8
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v180
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v177
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v19, v1, 16, v0
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v165
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v162
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v42
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v41
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v20, v1, 16, v0
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v179
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v115
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v45
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v44
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v21, v1, 16, v0
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v131
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v118
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v59
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v56
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v22, v1, 16, v0
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v145
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v135
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v60
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v61
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v23, v1, 16, v0
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v150
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v146
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v63
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v62
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v24, v1, 16, v0
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v163
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v160
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v73
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v72
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v25, v1, 16, v0
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v176
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v164
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v75
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v74
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v26, v1, 16, v0
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v181
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v178
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v77
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v76
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v27, v1, 16, v0
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v183
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v182
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v78
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v79
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v28, v1, 16, v0
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v43
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v40
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v89
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v88
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v29, v1, 16, v0
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v47
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v46
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v91
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v90
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v30, v1, 16, v0
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v58
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v57
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v92
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v93
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v31, v1, 16, v0
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, s6
-; GFX11-FAKE16-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s4
-; GFX11-FAKE16-NEXT:    s_cbranch_vccnz .LBB97_3
-; GFX11-FAKE16-NEXT:  .LBB97_2: ; %cmp.true
-; GFX11-FAKE16-NEXT:    s_add_i32 s28, s28, 3
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s5, s29, 8
-; GFX11-FAKE16-NEXT:    s_and_b32 s4, s28, 0xff
-; GFX11-FAKE16-NEXT:    s_add_i32 s24, s24, 3
-; GFX11-FAKE16-NEXT:    s_or_b32 s4, s5, s4
-; GFX11-FAKE16-NEXT:    s_and_b32 s5, s24, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s6, s25, 8
-; GFX11-FAKE16-NEXT:    s_add_i32 s26, s26, 3
-; GFX11-FAKE16-NEXT:    s_or_b32 s5, s6, s5
-; GFX11-FAKE16-NEXT:    s_and_b32 s6, s26, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s7, s27, 8
-; GFX11-FAKE16-NEXT:    s_add_i32 s20, s20, 3
-; GFX11-FAKE16-NEXT:    s_or_b32 s6, s7, s6
-; GFX11-FAKE16-NEXT:    s_and_b32 s7, s20, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s8, s21, 8
-; GFX11-FAKE16-NEXT:    s_add_i32 s22, s22, 3
-; GFX11-FAKE16-NEXT:    s_or_b32 s7, s8, s7
-; GFX11-FAKE16-NEXT:    s_and_b32 s8, s22, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s9, s23, 8
-; GFX11-FAKE16-NEXT:    s_add_i32 s16, s16, 3
-; GFX11-FAKE16-NEXT:    s_or_b32 s8, s9, s8
-; GFX11-FAKE16-NEXT:    s_and_b32 s9, s16, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s10, s17, 8
-; GFX11-FAKE16-NEXT:    s_add_i32 s18, s18, 3
-; GFX11-FAKE16-NEXT:    s_add_i32 s0, s0, 3
-; GFX11-FAKE16-NEXT:    s_add_i32 s2, s2, 3
-; GFX11-FAKE16-NEXT:    s_or_b32 s9, s10, s9
-; GFX11-FAKE16-NEXT:    s_and_b32 s10, s18, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s11, s19, 8
-; GFX11-FAKE16-NEXT:    s_and_b32 s0, s0, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s1, s1, 8
-; GFX11-FAKE16-NEXT:    s_and_b32 s2, s2, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s3, s3, 8
-; GFX11-FAKE16-NEXT:    s_or_b32 s10, s11, s10
-; GFX11-FAKE16-NEXT:    s_or_b32 s0, s1, s0
-; GFX11-FAKE16-NEXT:    s_or_b32 s1, s3, s2
-; GFX11-FAKE16-NEXT:    s_addk_i32 s9, 0x300
-; GFX11-FAKE16-NEXT:    s_addk_i32 s0, 0x300
-; GFX11-FAKE16-NEXT:    s_addk_i32 s1, 0x300
-; GFX11-FAKE16-NEXT:    s_addk_i32 s10, 0x300
-; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s0, s0, s1
-; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s1, s9, s10
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(37)
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v0, 3, v58
-; GFX11-FAKE16-NEXT:    s_addk_i32 s5, 0x300
-; GFX11-FAKE16-NEXT:    s_addk_i32 s6, 0x300
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v1, 3, v57
-; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s3, s5, s6
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(35)
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v2, 3, v47
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-FAKE16-NEXT:    s_addk_i32 s7, 0x300
-; GFX11-FAKE16-NEXT:    s_addk_i32 s8, 0x300
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v92, v0
-; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s2, s7, s8
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v3, 3, v46
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v93, v1
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v91, v2
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(33)
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v4, 3, v43
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v27, 0x300, v0
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v0, 3, v40
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v31, 0x300, v1
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v28, 0x300, v2
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v4
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v90, v3
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(31)
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v2, 3, v183
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v4, 3, v182
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v89, v1
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v30, 0x300, v3
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v4
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(29)
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v4, 3, v181
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v88, v0
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v181, 0x300, v1
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v78, v2
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v79, v3
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v4
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v29, 0x300, v0
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v0, 3, v178
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v182, 0x300, v1
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v178, 0x300, v2
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v77, v3
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(27)
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v2, 3, v176
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v3, 3, v164
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(25)
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v4, 3, v163
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v163, 0x300, v1
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v2
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v76, v0
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v3
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v4
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v4, 3, v160
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v75, v1
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v160, 0x300, v0
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v74, v2
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v73, v3
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v4
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v22, 0x300, v1
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(23)
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v1, 3, v150
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v26, 0x300, v0
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v23, 0x300, v2
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v72, v3
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v2, 3, v146
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(21)
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v3, 3, v145
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v4, 3, v135
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v25, 0x300, v0
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v2
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v63, v1
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v3
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v4
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(19)
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v4, 3, v131
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v62, v0
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v131, 0x300, v1
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v60, v2
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v61, v3
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v4
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v24, 0x300, v0
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v0, 3, v118
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v135, 0x300, v1
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v118, 0x300, v2
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v59, v3
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(17)
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v2, 3, v179
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v3, 3, v115
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(15)
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v4, 3, v165
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v115, 0x300, v1
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v2
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v56, v0
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v3
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v4
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v4, 3, v162
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v45, v1
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v145, 0x300, v0
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v44, v2
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v42, v3
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v4
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v17, 0x300, v1
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(13)
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v1, 3, v151
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v21, 0x300, v0
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v18, 0x300, v2
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v41, v3
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v2, 3, v149
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(11)
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v3, 3, v148
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v4, 3, v144
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v20, 0x300, v0
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v2
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v180, v1
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v3
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v4
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(9)
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v4, 3, v133
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v177, v0
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v19, 0x300, v1
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v166, v2
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v167, v3
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v4
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v133, 0x300, v0
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v0, 3, v129
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v129, 0x300, v1
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v144, 0x300, v2
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v161, v3
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(7)
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v2, 3, v119
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v3, 3, v117
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(5)
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v4, 3, v116
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v116, 0x300, v1
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v2
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v147, v0
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v3
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v4
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v4, 3, v114
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(1)
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v5, 3, v99
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v114, 0x300, v0
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v132, v2
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v130, v3
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v4
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v4, 3, v103
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v6, 3, v98
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v5, 0xff, v5
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v7, 3, v54
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v8, 3, v53
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xff, v6
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v9, 3, v39
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v33, 3, v33
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v16, 0x300, v0
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, v113, v4
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v0, 0x300, v2
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v128, v3
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v3, 3, v100
-; GFX11-FAKE16-NEXT:    s_addk_i32 s4, 0x300
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v11, 0x300, v4
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, v101, v5
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v102, v6
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xff, v7
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v7, 3, v96
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v134, v1
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v12, 0x300, v4
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v13, 0x300, v5
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, v97, v6
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v5, 3, v55
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xff, v7
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v7, 3, v52
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v14, 0x300, v4
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xff, v5
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v87, v6
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xff, v7
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xff, v8
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v8, 3, v51
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, v86, v4
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v15, 0x300, v5
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v85, v6
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v6, v84, v7
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xff, v8
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v51, 0x300, v4
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v4, 3, v50
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v50, 0x300, v5
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v8, 3, v49
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v83, v7
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v7, 3, v48
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v6, 0x300, v6
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v1, 0x300, v1
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v10, 0x300, v5
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v5, 0xff, v7
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, v82, v4
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xff, v8
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 0xff, v9
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v9, 3, v38
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v81, v5
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v38, 0x300, v4
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, v71, v7
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v7, v80, v8
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 0xff, v9
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v9, 0x300, v5
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v5, 3, v37
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v37, 0x300, v4
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v39, 0x300, v7
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, v70, v8
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v7, 3, v36
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v5, 0xff, v5
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v8, 3, v34
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v34, 3, v35
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v35, 0x300, v4
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xff, v7
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v69, v5
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xff, v8
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 0xff, v34
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v112, v3
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, v68, v4
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v34, 0x300, v5
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v67, v7
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v7, v66, v8
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 0xff, v33
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v33, 0x300, v4
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v4, 3, v32
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v32, 0x300, v5
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v5, 0x300, v7
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v7, v65, v8
-; GFX11-FAKE16-NEXT:    v_and_b32_e64 v8, 0xffff, s4
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v36, 0xffff, v6
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v5, 0xffff, v5
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v7, 0x300, v7
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v3, 0x300, v3
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, v64, v4
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v14, 0xffff, v14
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v10, v10, 16, v36
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v5, v7, 16, v5
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xffff, v33
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v4, 0x300, v4
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v33, 0xffff, v37
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v36, 0xffff, v22
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v2, 0x300, v2
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v6, v32, 16, v7
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v4, v4, 16, v8
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 0xffff, v35
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v32, 0xffff, v51
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v35, 0xffff, v38
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v19, 0xffff, v19
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v7, v34, 16, v8
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v8, v39, 16, v33
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v33, 0xffff, v12
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v34, 0xffff, v11
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v11, v50, 16, v32
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v32, 0xffff, v1
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v9, v9, 16, v35
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v12, v15, 16, v14
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v13, v13, 16, v33
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v14, v3, 16, v34
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v16, v16, 16, v32
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v32, 0xffff, v116
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v33, 0xffff, v129
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v34, 0xffff, v18
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v35, 0xffff, v17
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v26, v26, 16, v36
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v17, v114, 16, v32
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v18, v144, 16, v33
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v20, v20, 16, v34
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v21, v21, 16, v35
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v32, 0xffff, v115
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v33, 0xffff, v135
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v34, 0xffff, v131
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v35, 0xffff, v23
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v36, 0xffff, v27
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v22, v145, 16, v32
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v23, v118, 16, v33
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v24, v24, 16, v34
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v25, v25, 16, v35
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v32, 0xffff, v163
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v33, 0xffff, v182
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v34, 0xffff, v181
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v35, 0xffff, v28
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v15, v2, 16, v0
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v19, v133, 16, v19
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v27, v160, 16, v32
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v28, v178, 16, v33
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v29, v29, 16, v34
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v30, v30, 16, v35
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v31, v31, 16, v36
-; GFX11-FAKE16-NEXT:  .LBB97_3: ; %end
-; GFX11-FAKE16-NEXT:    s_clause 0x1e
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v94, off, s32 offset:320
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v93, off, s32 offset:324
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v92, off, s32 offset:328
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v91, off, s32 offset:332
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v90, off, s32 offset:336
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v89, off, s32 offset:340
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v88, off, s32 offset:344
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v79, off, s32 offset:348
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v78, off, s32 offset:352
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v77, off, s32 offset:356
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v76, off, s32 offset:360
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v75, off, s32 offset:364
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v74, off, s32 offset:368
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v73, off, s32 offset:372
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v72, off, s32 offset:376
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v63, off, s32 offset:380
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v62, off, s32 offset:384
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v61, off, s32 offset:388
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v60, off, s32 offset:392
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v59, off, s32 offset:396
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v58, off, s32 offset:400
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v57, off, s32 offset:404
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v56, off, s32 offset:408
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v47, off, s32 offset:412
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v46, off, s32 offset:416
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v45, off, s32 offset:420
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v44, off, s32 offset:424
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v43, off, s32 offset:428
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v42, off, s32 offset:432
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v41, off, s32 offset:436
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v40, off, s32 offset:440
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
-; GFX11-FAKE16-NEXT:  .LBB97_4:
-; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
-; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
-; GFX11-FAKE16-NEXT:    s_branch .LBB97_2
+; GFX11-LABEL: bitcast_v128i8_to_v64i16_scalar:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_clause 0x1e
+; GFX11-NEXT:    scratch_store_b32 off, v40, s32 offset:440
+; GFX11-NEXT:    scratch_store_b32 off, v41, s32 offset:436
+; GFX11-NEXT:    scratch_store_b32 off, v42, s32 offset:432
+; GFX11-NEXT:    scratch_store_b32 off, v43, s32 offset:428
+; GFX11-NEXT:    scratch_store_b32 off, v44, s32 offset:424
+; GFX11-NEXT:    scratch_store_b32 off, v45, s32 offset:420
+; GFX11-NEXT:    scratch_store_b32 off, v46, s32 offset:416
+; GFX11-NEXT:    scratch_store_b32 off, v47, s32 offset:412
+; GFX11-NEXT:    scratch_store_b32 off, v56, s32 offset:408
+; GFX11-NEXT:    scratch_store_b32 off, v57, s32 offset:404
+; GFX11-NEXT:    scratch_store_b32 off, v58, s32 offset:400
+; GFX11-NEXT:    scratch_store_b32 off, v59, s32 offset:396
+; GFX11-NEXT:    scratch_store_b32 off, v60, s32 offset:392
+; GFX11-NEXT:    scratch_store_b32 off, v61, s32 offset:388
+; GFX11-NEXT:    scratch_store_b32 off, v62, s32 offset:384
+; GFX11-NEXT:    scratch_store_b32 off, v63, s32 offset:380
+; GFX11-NEXT:    scratch_store_b32 off, v72, s32 offset:376
+; GFX11-NEXT:    scratch_store_b32 off, v73, s32 offset:372
+; GFX11-NEXT:    scratch_store_b32 off, v74, s32 offset:368
+; GFX11-NEXT:    scratch_store_b32 off, v75, s32 offset:364
+; GFX11-NEXT:    scratch_store_b32 off, v76, s32 offset:360
+; GFX11-NEXT:    scratch_store_b32 off, v77, s32 offset:356
+; GFX11-NEXT:    scratch_store_b32 off, v78, s32 offset:352
+; GFX11-NEXT:    scratch_store_b32 off, v79, s32 offset:348
+; GFX11-NEXT:    scratch_store_b32 off, v88, s32 offset:344
+; GFX11-NEXT:    scratch_store_b32 off, v89, s32 offset:340
+; GFX11-NEXT:    scratch_store_b32 off, v90, s32 offset:336
+; GFX11-NEXT:    scratch_store_b32 off, v91, s32 offset:332
+; GFX11-NEXT:    scratch_store_b32 off, v92, s32 offset:328
+; GFX11-NEXT:    scratch_store_b32 off, v93, s32 offset:324
+; GFX11-NEXT:    scratch_store_b32 off, v94, s32 offset:320
+; GFX11-NEXT:    v_dual_mov_b32 v54, v30 :: v_dual_mov_b32 v51, v24
+; GFX11-NEXT:    v_dual_mov_b32 v52, v28 :: v_dual_mov_b32 v55, v26
+; GFX11-NEXT:    v_dual_mov_b32 v53, v22 :: v_dual_mov_b32 v48, v20
+; GFX11-NEXT:    v_dual_mov_b32 v50, v18 :: v_dual_mov_b32 v39, v16
+; GFX11-NEXT:    v_dual_mov_b32 v49, v14 :: v_dual_mov_b32 v34, v8
+; GFX11-NEXT:    v_dual_mov_b32 v37, v12 :: v_dual_mov_b32 v38, v10
+; GFX11-NEXT:    v_dual_mov_b32 v36, v6 :: v_dual_mov_b32 v33, v4
+; GFX11-NEXT:    v_dual_mov_b32 v35, v2 :: v_dual_mov_b32 v32, v0
+; GFX11-NEXT:    s_clause 0x1f
+; GFX11-NEXT:    scratch_load_b32 v0, off, s32 offset:316
+; GFX11-NEXT:    scratch_load_u16 v2, off, s32
+; GFX11-NEXT:    scratch_load_u16 v4, off, s32 offset:8
+; GFX11-NEXT:    scratch_load_u16 v6, off, s32 offset:16
+; GFX11-NEXT:    scratch_load_u16 v8, off, s32 offset:24
+; GFX11-NEXT:    scratch_load_u16 v10, off, s32 offset:32
+; GFX11-NEXT:    scratch_load_u16 v12, off, s32 offset:40
+; GFX11-NEXT:    scratch_load_u16 v14, off, s32 offset:48
+; GFX11-NEXT:    scratch_load_u16 v16, off, s32 offset:56
+; GFX11-NEXT:    scratch_load_u16 v18, off, s32 offset:64
+; GFX11-NEXT:    scratch_load_u16 v20, off, s32 offset:72
+; GFX11-NEXT:    scratch_load_u16 v22, off, s32 offset:80
+; GFX11-NEXT:    scratch_load_u16 v24, off, s32 offset:88
+; GFX11-NEXT:    scratch_load_u16 v26, off, s32 offset:96
+; GFX11-NEXT:    scratch_load_u16 v28, off, s32 offset:104
+; GFX11-NEXT:    scratch_load_u16 v30, off, s32 offset:112
+; GFX11-NEXT:    scratch_load_u16 v31, off, s32 offset:120
+; GFX11-NEXT:    scratch_load_u16 v41, off, s32 offset:128
+; GFX11-NEXT:    scratch_load_u16 v44, off, s32 offset:136
+; GFX11-NEXT:    scratch_load_u16 v45, off, s32 offset:144
+; GFX11-NEXT:    scratch_load_u16 v56, off, s32 offset:152
+; GFX11-NEXT:    scratch_load_u16 v59, off, s32 offset:160
+; GFX11-NEXT:    scratch_load_u16 v60, off, s32 offset:168
+; GFX11-NEXT:    scratch_load_u16 v61, off, s32 offset:176
+; GFX11-NEXT:    scratch_load_u16 v62, off, s32 offset:184
+; GFX11-NEXT:    scratch_load_u16 v63, off, s32 offset:192
+; GFX11-NEXT:    scratch_load_u16 v72, off, s32 offset:200
+; GFX11-NEXT:    scratch_load_u16 v73, off, s32 offset:208
+; GFX11-NEXT:    scratch_load_u16 v74, off, s32 offset:216
+; GFX11-NEXT:    scratch_load_u16 v75, off, s32 offset:224
+; GFX11-NEXT:    scratch_load_u16 v76, off, s32 offset:232
+; GFX11-NEXT:    scratch_load_u16 v77, off, s32 offset:240
+; GFX11-NEXT:    s_clause 0x1f
+; GFX11-NEXT:    scratch_load_u16 v78, off, s32 offset:248
+; GFX11-NEXT:    scratch_load_u16 v79, off, s32 offset:256
+; GFX11-NEXT:    scratch_load_u16 v88, off, s32 offset:264
+; GFX11-NEXT:    scratch_load_u16 v89, off, s32 offset:272
+; GFX11-NEXT:    scratch_load_u16 v90, off, s32 offset:280
+; GFX11-NEXT:    scratch_load_u16 v91, off, s32 offset:288
+; GFX11-NEXT:    scratch_load_u16 v92, off, s32 offset:296
+; GFX11-NEXT:    scratch_load_u16 v93, off, s32 offset:304
+; GFX11-NEXT:    scratch_load_u16 v94, off, s32 offset:312
+; GFX11-NEXT:    scratch_load_u16 v57, off, s32 offset:308
+; GFX11-NEXT:    scratch_load_u16 v58, off, s32 offset:300
+; GFX11-NEXT:    scratch_load_u16 v46, off, s32 offset:292
+; GFX11-NEXT:    scratch_load_u16 v47, off, s32 offset:284
+; GFX11-NEXT:    scratch_load_u16 v40, off, s32 offset:276
+; GFX11-NEXT:    scratch_load_u16 v43, off, s32 offset:268
+; GFX11-NEXT:    scratch_load_u16 v182, off, s32 offset:260
+; GFX11-NEXT:    scratch_load_u16 v183, off, s32 offset:252
+; GFX11-NEXT:    scratch_load_u16 v178, off, s32 offset:244
+; GFX11-NEXT:    scratch_load_u16 v181, off, s32 offset:236
+; GFX11-NEXT:    scratch_load_u16 v164, off, s32 offset:228
+; GFX11-NEXT:    scratch_load_u16 v176, off, s32 offset:220
+; GFX11-NEXT:    scratch_load_u16 v160, off, s32 offset:212
+; GFX11-NEXT:    scratch_load_u16 v163, off, s32 offset:204
+; GFX11-NEXT:    scratch_load_u16 v146, off, s32 offset:196
+; GFX11-NEXT:    scratch_load_u16 v150, off, s32 offset:188
+; GFX11-NEXT:    scratch_load_u16 v135, off, s32 offset:180
+; GFX11-NEXT:    scratch_load_u16 v145, off, s32 offset:172
+; GFX11-NEXT:    scratch_load_u16 v118, off, s32 offset:164
+; GFX11-NEXT:    scratch_load_u16 v131, off, s32 offset:156
+; GFX11-NEXT:    scratch_load_u16 v115, off, s32 offset:148
+; GFX11-NEXT:    scratch_load_u16 v179, off, s32 offset:140
+; GFX11-NEXT:    scratch_load_u16 v162, off, s32 offset:132
+; GFX11-NEXT:    s_clause 0xf
+; GFX11-NEXT:    scratch_load_u16 v165, off, s32 offset:124
+; GFX11-NEXT:    scratch_load_u16 v149, off, s32 offset:116
+; GFX11-NEXT:    scratch_load_u16 v151, off, s32 offset:108
+; GFX11-NEXT:    scratch_load_u16 v144, off, s32 offset:100
+; GFX11-NEXT:    scratch_load_u16 v148, off, s32 offset:92
+; GFX11-NEXT:    scratch_load_u16 v129, off, s32 offset:84
+; GFX11-NEXT:    scratch_load_u16 v133, off, s32 offset:76
+; GFX11-NEXT:    scratch_load_u16 v117, off, s32 offset:68
+; GFX11-NEXT:    scratch_load_u16 v119, off, s32 offset:60
+; GFX11-NEXT:    scratch_load_u16 v114, off, s32 offset:52
+; GFX11-NEXT:    scratch_load_u16 v116, off, s32 offset:44
+; GFX11-NEXT:    scratch_load_u16 v100, off, s32 offset:36
+; GFX11-NEXT:    scratch_load_u16 v103, off, s32 offset:28
+; GFX11-NEXT:    scratch_load_u16 v98, off, s32 offset:20
+; GFX11-NEXT:    scratch_load_u16 v99, off, s32 offset:12
+; GFX11-NEXT:    scratch_load_u16 v96, off, s32 offset:4
+; GFX11-NEXT:    v_lshlrev_b32_e32 v64, 8, v1
+; GFX11-NEXT:    v_lshlrev_b32_e32 v66, 8, v3
+; GFX11-NEXT:    v_lshlrev_b32_e32 v65, 8, v5
+; GFX11-NEXT:    v_lshlrev_b32_e32 v68, 8, v7
+; GFX11-NEXT:    v_lshlrev_b32_e32 v67, 8, v9
+; GFX11-NEXT:    v_lshlrev_b32_e32 v70, 8, v11
+; GFX11-NEXT:    v_lshlrev_b32_e32 v69, 8, v13
+; GFX11-NEXT:    v_lshlrev_b32_e32 v71, 8, v15
+; GFX11-NEXT:    v_lshlrev_b32_e32 v80, 8, v17
+; GFX11-NEXT:    v_lshlrev_b32_e32 v82, 8, v19
+; GFX11-NEXT:    v_lshlrev_b32_e32 v81, 8, v21
+; GFX11-NEXT:    v_lshlrev_b32_e32 v84, 8, v23
+; GFX11-NEXT:    v_lshlrev_b32_e32 v83, 8, v25
+; GFX11-NEXT:    v_lshlrev_b32_e32 v86, 8, v27
+; GFX11-NEXT:    v_lshlrev_b32_e32 v85, 8, v29
+; GFX11-NEXT:    s_mov_b32 s4, 0
+; GFX11-NEXT:    s_waitcnt vmcnt(62)
+; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v97, 8, v2
+; GFX11-NEXT:    v_lshlrev_b32_e32 v87, 8, v4
+; GFX11-NEXT:    v_lshlrev_b32_e32 v101, 8, v6
+; GFX11-NEXT:    v_lshlrev_b32_e32 v102, 8, v8
+; GFX11-NEXT:    v_lshlrev_b32_e32 v113, 8, v10
+; GFX11-NEXT:    v_lshlrev_b32_e32 v112, 8, v12
+; GFX11-NEXT:    v_lshlrev_b32_e32 v130, 8, v14
+; GFX11-NEXT:    v_lshlrev_b32_e32 v128, 8, v16
+; GFX11-NEXT:    v_lshlrev_b32_e32 v134, 8, v18
+; GFX11-NEXT:    v_lshlrev_b32_e32 v132, 8, v20
+; GFX11-NEXT:    v_lshlrev_b32_e32 v161, 8, v22
+; GFX11-NEXT:    v_lshlrev_b32_e32 v147, 8, v24
+; GFX11-NEXT:    v_lshlrev_b32_e32 v166, 8, v26
+; GFX11-NEXT:    v_lshlrev_b32_e32 v167, 8, v28
+; GFX11-NEXT:    v_lshlrev_b32_e32 v180, 8, v30
+; GFX11-NEXT:    v_lshlrev_b32_e32 v177, 8, v31
+; GFX11-NEXT:    v_lshlrev_b32_e32 v42, 8, v41
+; GFX11-NEXT:    s_waitcnt vmcnt(61)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v41, 8, v44
+; GFX11-NEXT:    s_waitcnt vmcnt(60)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v45, 8, v45
+; GFX11-NEXT:    s_waitcnt vmcnt(59)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v44, 8, v56
+; GFX11-NEXT:    s_waitcnt vmcnt(58)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v59, 8, v59
+; GFX11-NEXT:    s_waitcnt vmcnt(57)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v56, 8, v60
+; GFX11-NEXT:    s_waitcnt vmcnt(56)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v60, 8, v61
+; GFX11-NEXT:    s_waitcnt vmcnt(55)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v61, 8, v62
+; GFX11-NEXT:    s_waitcnt vmcnt(54)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v63, 8, v63
+; GFX11-NEXT:    s_waitcnt vmcnt(53)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v62, 8, v72
+; GFX11-NEXT:    s_waitcnt vmcnt(52)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v73, 8, v73
+; GFX11-NEXT:    s_waitcnt vmcnt(51)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v72, 8, v74
+; GFX11-NEXT:    s_waitcnt vmcnt(50)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v75, 8, v75
+; GFX11-NEXT:    s_waitcnt vmcnt(49)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v74, 8, v76
+; GFX11-NEXT:    s_waitcnt vmcnt(48)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v77, 8, v77
+; GFX11-NEXT:    s_waitcnt vmcnt(47)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v76, 8, v78
+; GFX11-NEXT:    s_waitcnt vmcnt(46)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v78, 8, v79
+; GFX11-NEXT:    s_waitcnt vmcnt(45)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v79, 8, v88
+; GFX11-NEXT:    s_waitcnt vmcnt(44)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v89, 8, v89
+; GFX11-NEXT:    s_waitcnt vmcnt(43)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v88, 8, v90
+; GFX11-NEXT:    s_waitcnt vmcnt(42)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v91, 8, v91
+; GFX11-NEXT:    s_waitcnt vmcnt(41)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v90, 8, v92
+; GFX11-NEXT:    s_waitcnt vmcnt(40)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v92, 8, v93
+; GFX11-NEXT:    s_waitcnt vmcnt(39)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v93, 8, v94
+; GFX11-NEXT:    s_and_b32 s5, vcc_lo, exec_lo
+; GFX11-NEXT:    s_cbranch_scc0 .LBB97_4
+; GFX11-NEXT:  ; %bb.1: ; %cmp.false
+; GFX11-NEXT:    s_and_b32 s5, s28, 0xff
+; GFX11-NEXT:    s_lshl_b32 s6, s29, 8
+; GFX11-NEXT:    s_and_b32 s7, s2, 0xff
+; GFX11-NEXT:    s_or_b32 s5, s5, s6
+; GFX11-NEXT:    s_lshl_b32 s6, s1, 8
+; GFX11-NEXT:    v_and_b32_e64 v5, 0xffff, s5
+; GFX11-NEXT:    s_and_b32 s5, s0, 0xff
+; GFX11-NEXT:    s_lshl_b32 s8, s3, 8
+; GFX11-NEXT:    s_or_b32 s5, s5, s6
+; GFX11-NEXT:    s_or_b32 s6, s7, s8
+; GFX11-NEXT:    s_and_b32 s7, s16, 0xff
+; GFX11-NEXT:    s_lshl_b32 s8, s17, 8
+; GFX11-NEXT:    s_and_b32 s9, s18, 0xff
+; GFX11-NEXT:    s_lshl_b32 s10, s19, 8
+; GFX11-NEXT:    s_or_b32 s7, s7, s8
+; GFX11-NEXT:    s_or_b32 s8, s9, s10
+; GFX11-NEXT:    s_pack_ll_b32_b16 s5, s5, s6
+; GFX11-NEXT:    s_pack_ll_b32_b16 s6, s7, s8
+; GFX11-NEXT:    s_and_b32 s7, s20, 0xff
+; GFX11-NEXT:    s_lshl_b32 s8, s21, 8
+; GFX11-NEXT:    s_and_b32 s9, s22, 0xff
+; GFX11-NEXT:    s_lshl_b32 s10, s23, 8
+; GFX11-NEXT:    s_or_b32 s7, s7, s8
+; GFX11-NEXT:    s_or_b32 s8, s9, s10
+; GFX11-NEXT:    s_and_b32 s9, s24, 0xff
+; GFX11-NEXT:    s_lshl_b32 s10, s25, 8
+; GFX11-NEXT:    s_and_b32 s11, s26, 0xff
+; GFX11-NEXT:    s_lshl_b32 s12, s27, 8
+; GFX11-NEXT:    s_or_b32 s9, s9, s10
+; GFX11-NEXT:    s_or_b32 s10, s11, s12
+; GFX11-NEXT:    s_pack_ll_b32_b16 s7, s7, s8
+; GFX11-NEXT:    s_pack_ll_b32_b16 s8, s9, s10
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v36
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v32
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v35
+; GFX11-NEXT:    v_and_b32_e32 v4, 0xff, v34
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v33
+; GFX11-NEXT:    v_or_b32_e32 v2, v2, v68
+; GFX11-NEXT:    v_or_b32_e32 v0, v0, v64
+; GFX11-NEXT:    v_or_b32_e32 v1, v1, v66
+; GFX11-NEXT:    v_or_b32_e32 v6, v4, v67
+; GFX11-NEXT:    v_or_b32_e32 v3, v3, v65
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX11-NEXT:    v_lshl_or_b32 v4, v0, 16, v5
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v38
+; GFX11-NEXT:    v_and_b32_e32 v7, 0xff, v39
+; GFX11-NEXT:    v_lshl_or_b32 v6, v6, 16, v2
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v49
+; GFX11-NEXT:    v_lshl_or_b32 v5, v3, 16, v1
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v37
+; GFX11-NEXT:    v_or_b32_e32 v0, v0, v70
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v50
+; GFX11-NEXT:    v_or_b32_e32 v2, v2, v71
+; GFX11-NEXT:    v_and_b32_e32 v8, 0xff, v48
+; GFX11-NEXT:    v_or_b32_e32 v1, v1, v69
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    v_or_b32_e32 v3, v3, v82
+; GFX11-NEXT:    v_or_b32_e32 v9, v7, v80
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX11-NEXT:    v_or_b32_e32 v10, v8, v81
+; GFX11-NEXT:    v_lshl_or_b32 v7, v1, 16, v0
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff, v3
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v53
+; GFX11-NEXT:    v_lshl_or_b32 v8, v9, 16, v2
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v55
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v51
+; GFX11-NEXT:    v_lshl_or_b32 v9, v10, 16, v3
+; GFX11-NEXT:    v_or_b32_e32 v0, v0, v84
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v52
+; GFX11-NEXT:    v_and_b32_e32 v10, 0xff, v54
+; GFX11-NEXT:    v_or_b32_e32 v2, v2, v86
+; GFX11-NEXT:    v_or_b32_e32 v1, v1, v83
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_and_b32_e32 v11, 0xff, v96
+; GFX11-NEXT:    v_or_b32_e32 v3, v3, v85
+; GFX11-NEXT:    v_or_b32_e32 v12, v10, v97
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX11-NEXT:    v_lshl_or_b32 v10, v1, 16, v0
+; GFX11-NEXT:    v_or_b32_e32 v0, v11, v87
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v99
+; GFX11-NEXT:    v_and_b32_e32 v12, 0xffff, v12
+; GFX11-NEXT:    v_lshl_or_b32 v11, v3, 16, v2
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v103
+; GFX11-NEXT:    v_and_b32_e32 v14, 0xff, v114
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v98
+; GFX11-NEXT:    v_lshl_or_b32 v12, v0, 16, v12
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v100
+; GFX11-NEXT:    v_or_b32_e32 v3, v3, v113
+; GFX11-NEXT:    v_or_b32_e32 v1, v1, v101
+; GFX11-NEXT:    v_and_b32_e32 v13, 0xff, v116
+; GFX11-NEXT:    v_or_b32_e32 v17, v14, v128
+; GFX11-NEXT:    v_or_b32_e32 v0, v0, v112
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff, v3
+; GFX11-NEXT:    v_and_b32_e32 v14, 0xff, v117
+; GFX11-NEXT:    v_or_b32_e32 v2, v2, v102
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX11-NEXT:    v_or_b32_e32 v13, v13, v130
+; GFX11-NEXT:    v_and_b32_e32 v16, 0xff, v133
+; GFX11-NEXT:    v_or_b32_e32 v20, v14, v132
+; GFX11-NEXT:    v_lshl_or_b32 v14, v0, 16, v3
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v148
+; GFX11-NEXT:    v_and_b32_e32 v15, 0xff, v119
+; GFX11-NEXT:    v_and_b32_e32 v18, 0xff, v129
+; GFX11-NEXT:    v_or_b32_e32 v16, v16, v161
+; GFX11-NEXT:    v_and_b32_e32 v19, 0xffff, v13
+; GFX11-NEXT:    v_lshl_or_b32 v13, v2, 16, v1
+; GFX11-NEXT:    v_or_b32_e32 v0, v0, v166
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v144
+; GFX11-NEXT:    v_or_b32_e32 v15, v15, v134
+; GFX11-NEXT:    v_or_b32_e32 v18, v18, v147
+; GFX11-NEXT:    v_and_b32_e32 v22, 0xffff, v16
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    v_or_b32_e32 v1, v1, v167
+; GFX11-NEXT:    v_and_b32_e32 v21, 0xffff, v15
+; GFX11-NEXT:    v_lshl_or_b32 v15, v17, 16, v19
+; GFX11-NEXT:    v_lshl_or_b32 v17, v18, 16, v22
+; GFX11-NEXT:    v_mov_b32_e32 v2, s7
+; GFX11-NEXT:    v_lshl_or_b32 v18, v1, 16, v0
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v151
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v149
+; GFX11-NEXT:    v_lshl_or_b32 v16, v20, 16, v21
+; GFX11-NEXT:    v_mov_b32_e32 v3, s8
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_or_b32_e32 v0, v0, v180
+; GFX11-NEXT:    v_or_b32_e32 v1, v1, v177
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    v_lshl_or_b32 v19, v1, 16, v0
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v165
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v162
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_or_b32_e32 v0, v0, v42
+; GFX11-NEXT:    v_or_b32_e32 v1, v1, v41
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    v_lshl_or_b32 v20, v1, 16, v0
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v179
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v115
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_or_b32_e32 v0, v0, v45
+; GFX11-NEXT:    v_or_b32_e32 v1, v1, v44
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    v_lshl_or_b32 v21, v1, 16, v0
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v131
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v118
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_or_b32_e32 v0, v0, v59
+; GFX11-NEXT:    v_or_b32_e32 v1, v1, v56
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    v_lshl_or_b32 v22, v1, 16, v0
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v145
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v135
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_or_b32_e32 v0, v0, v60
+; GFX11-NEXT:    v_or_b32_e32 v1, v1, v61
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    v_lshl_or_b32 v23, v1, 16, v0
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v150
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v146
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_or_b32_e32 v0, v0, v63
+; GFX11-NEXT:    v_or_b32_e32 v1, v1, v62
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    v_lshl_or_b32 v24, v1, 16, v0
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v163
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v160
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_or_b32_e32 v0, v0, v73
+; GFX11-NEXT:    v_or_b32_e32 v1, v1, v72
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    v_lshl_or_b32 v25, v1, 16, v0
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v176
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v164
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_or_b32_e32 v0, v0, v75
+; GFX11-NEXT:    v_or_b32_e32 v1, v1, v74
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    v_lshl_or_b32 v26, v1, 16, v0
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v181
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v178
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_or_b32_e32 v0, v0, v77
+; GFX11-NEXT:    v_or_b32_e32 v1, v1, v76
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    v_lshl_or_b32 v27, v1, 16, v0
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v183
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v182
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_or_b32_e32 v0, v0, v78
+; GFX11-NEXT:    v_or_b32_e32 v1, v1, v79
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    v_lshl_or_b32 v28, v1, 16, v0
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v43
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v40
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_or_b32_e32 v0, v0, v89
+; GFX11-NEXT:    v_or_b32_e32 v1, v1, v88
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    v_lshl_or_b32 v29, v1, 16, v0
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v47
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v46
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_or_b32_e32 v0, v0, v91
+; GFX11-NEXT:    v_or_b32_e32 v1, v1, v90
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    v_lshl_or_b32 v30, v1, 16, v0
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v58
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v57
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_or_b32_e32 v0, v0, v92
+; GFX11-NEXT:    v_or_b32_e32 v1, v1, v93
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    v_lshl_or_b32 v31, v1, 16, v0
+; GFX11-NEXT:    v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, s6
+; GFX11-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s4
+; GFX11-NEXT:    s_cbranch_vccnz .LBB97_3
+; GFX11-NEXT:  .LBB97_2: ; %cmp.true
+; GFX11-NEXT:    s_add_i32 s28, s28, 3
+; GFX11-NEXT:    s_lshl_b32 s5, s29, 8
+; GFX11-NEXT:    s_and_b32 s4, s28, 0xff
+; GFX11-NEXT:    s_add_i32 s24, s24, 3
+; GFX11-NEXT:    s_or_b32 s4, s5, s4
+; GFX11-NEXT:    s_and_b32 s5, s24, 0xff
+; GFX11-NEXT:    s_lshl_b32 s6, s25, 8
+; GFX11-NEXT:    s_add_i32 s26, s26, 3
+; GFX11-NEXT:    s_or_b32 s5, s6, s5
+; GFX11-NEXT:    s_and_b32 s6, s26, 0xff
+; GFX11-NEXT:    s_lshl_b32 s7, s27, 8
+; GFX11-NEXT:    s_add_i32 s20, s20, 3
+; GFX11-NEXT:    s_or_b32 s6, s7, s6
+; GFX11-NEXT:    s_and_b32 s7, s20, 0xff
+; GFX11-NEXT:    s_lshl_b32 s8, s21, 8
+; GFX11-NEXT:    s_add_i32 s22, s22, 3
+; GFX11-NEXT:    s_or_b32 s7, s8, s7
+; GFX11-NEXT:    s_and_b32 s8, s22, 0xff
+; GFX11-NEXT:    s_lshl_b32 s9, s23, 8
+; GFX11-NEXT:    s_add_i32 s16, s16, 3
+; GFX11-NEXT:    s_or_b32 s8, s9, s8
+; GFX11-NEXT:    s_and_b32 s9, s16, 0xff
+; GFX11-NEXT:    s_lshl_b32 s10, s17, 8
+; GFX11-NEXT:    s_add_i32 s18, s18, 3
+; GFX11-NEXT:    s_add_i32 s0, s0, 3
+; GFX11-NEXT:    s_add_i32 s2, s2, 3
+; GFX11-NEXT:    s_or_b32 s9, s10, s9
+; GFX11-NEXT:    s_and_b32 s10, s18, 0xff
+; GFX11-NEXT:    s_lshl_b32 s11, s19, 8
+; GFX11-NEXT:    s_and_b32 s0, s0, 0xff
+; GFX11-NEXT:    s_lshl_b32 s1, s1, 8
+; GFX11-NEXT:    s_and_b32 s2, s2, 0xff
+; GFX11-NEXT:    s_lshl_b32 s3, s3, 8
+; GFX11-NEXT:    s_or_b32 s10, s11, s10
+; GFX11-NEXT:    s_or_b32 s0, s1, s0
+; GFX11-NEXT:    s_or_b32 s1, s3, s2
+; GFX11-NEXT:    s_addk_i32 s9, 0x300
+; GFX11-NEXT:    s_addk_i32 s0, 0x300
+; GFX11-NEXT:    s_addk_i32 s1, 0x300
+; GFX11-NEXT:    s_addk_i32 s10, 0x300
+; GFX11-NEXT:    s_pack_ll_b32_b16 s0, s0, s1
+; GFX11-NEXT:    s_pack_ll_b32_b16 s1, s9, s10
+; GFX11-NEXT:    s_waitcnt vmcnt(37)
+; GFX11-NEXT:    v_add_nc_u32_e32 v0, 3, v58
+; GFX11-NEXT:    s_addk_i32 s5, 0x300
+; GFX11-NEXT:    s_addk_i32 s6, 0x300
+; GFX11-NEXT:    v_add_nc_u32_e32 v1, 3, v57
+; GFX11-NEXT:    s_pack_ll_b32_b16 s3, s5, s6
+; GFX11-NEXT:    s_waitcnt vmcnt(35)
+; GFX11-NEXT:    v_add_nc_u32_e32 v2, 3, v47
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX11-NEXT:    s_addk_i32 s7, 0x300
+; GFX11-NEXT:    s_addk_i32 s8, 0x300
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GFX11-NEXT:    v_or_b32_e32 v0, v92, v0
+; GFX11-NEXT:    s_pack_ll_b32_b16 s2, s7, s8
+; GFX11-NEXT:    v_add_nc_u32_e32 v3, 3, v46
+; GFX11-NEXT:    v_or_b32_e32 v1, v93, v1
+; GFX11-NEXT:    v_or_b32_e32 v2, v91, v2
+; GFX11-NEXT:    s_waitcnt vmcnt(33)
+; GFX11-NEXT:    v_add_nc_u32_e32 v4, 3, v43
+; GFX11-NEXT:    v_add_nc_u32_e32 v27, 0x300, v0
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v3
+; GFX11-NEXT:    v_add_nc_u32_e32 v0, 3, v40
+; GFX11-NEXT:    v_add_nc_u32_e32 v31, 0x300, v1
+; GFX11-NEXT:    v_add_nc_u32_e32 v28, 0x300, v2
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v4
+; GFX11-NEXT:    v_or_b32_e32 v3, v90, v3
+; GFX11-NEXT:    s_waitcnt vmcnt(31)
+; GFX11-NEXT:    v_add_nc_u32_e32 v2, 3, v183
+; GFX11-NEXT:    v_add_nc_u32_e32 v4, 3, v182
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX11-NEXT:    v_or_b32_e32 v1, v89, v1
+; GFX11-NEXT:    v_add_nc_u32_e32 v30, 0x300, v3
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v4
+; GFX11-NEXT:    s_waitcnt vmcnt(29)
+; GFX11-NEXT:    v_add_nc_u32_e32 v4, 3, v181
+; GFX11-NEXT:    v_or_b32_e32 v0, v88, v0
+; GFX11-NEXT:    v_add_nc_u32_e32 v181, 0x300, v1
+; GFX11-NEXT:    v_or_b32_e32 v1, v78, v2
+; GFX11-NEXT:    v_or_b32_e32 v2, v79, v3
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v4
+; GFX11-NEXT:    v_add_nc_u32_e32 v29, 0x300, v0
+; GFX11-NEXT:    v_add_nc_u32_e32 v0, 3, v178
+; GFX11-NEXT:    v_add_nc_u32_e32 v182, 0x300, v1
+; GFX11-NEXT:    v_add_nc_u32_e32 v178, 0x300, v2
+; GFX11-NEXT:    v_or_b32_e32 v1, v77, v3
+; GFX11-NEXT:    s_waitcnt vmcnt(27)
+; GFX11-NEXT:    v_add_nc_u32_e32 v2, 3, v176
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX11-NEXT:    v_add_nc_u32_e32 v3, 3, v164
+; GFX11-NEXT:    s_waitcnt vmcnt(25)
+; GFX11-NEXT:    v_add_nc_u32_e32 v4, 3, v163
+; GFX11-NEXT:    v_add_nc_u32_e32 v163, 0x300, v1
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v2
+; GFX11-NEXT:    v_or_b32_e32 v0, v76, v0
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v3
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v4
+; GFX11-NEXT:    v_add_nc_u32_e32 v4, 3, v160
+; GFX11-NEXT:    v_or_b32_e32 v1, v75, v1
+; GFX11-NEXT:    v_add_nc_u32_e32 v160, 0x300, v0
+; GFX11-NEXT:    v_or_b32_e32 v0, v74, v2
+; GFX11-NEXT:    v_or_b32_e32 v2, v73, v3
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v4
+; GFX11-NEXT:    v_add_nc_u32_e32 v22, 0x300, v1
+; GFX11-NEXT:    s_waitcnt vmcnt(23)
+; GFX11-NEXT:    v_add_nc_u32_e32 v1, 3, v150
+; GFX11-NEXT:    v_add_nc_u32_e32 v26, 0x300, v0
+; GFX11-NEXT:    v_add_nc_u32_e32 v23, 0x300, v2
+; GFX11-NEXT:    v_or_b32_e32 v0, v72, v3
+; GFX11-NEXT:    v_add_nc_u32_e32 v2, 3, v146
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX11-NEXT:    s_waitcnt vmcnt(21)
+; GFX11-NEXT:    v_add_nc_u32_e32 v3, 3, v145
+; GFX11-NEXT:    v_add_nc_u32_e32 v4, 3, v135
+; GFX11-NEXT:    v_add_nc_u32_e32 v25, 0x300, v0
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v2
+; GFX11-NEXT:    v_or_b32_e32 v1, v63, v1
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v3
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v4
+; GFX11-NEXT:    s_waitcnt vmcnt(19)
+; GFX11-NEXT:    v_add_nc_u32_e32 v4, 3, v131
+; GFX11-NEXT:    v_or_b32_e32 v0, v62, v0
+; GFX11-NEXT:    v_add_nc_u32_e32 v131, 0x300, v1
+; GFX11-NEXT:    v_or_b32_e32 v1, v60, v2
+; GFX11-NEXT:    v_or_b32_e32 v2, v61, v3
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v4
+; GFX11-NEXT:    v_add_nc_u32_e32 v24, 0x300, v0
+; GFX11-NEXT:    v_add_nc_u32_e32 v0, 3, v118
+; GFX11-NEXT:    v_add_nc_u32_e32 v135, 0x300, v1
+; GFX11-NEXT:    v_add_nc_u32_e32 v118, 0x300, v2
+; GFX11-NEXT:    v_or_b32_e32 v1, v59, v3
+; GFX11-NEXT:    s_waitcnt vmcnt(17)
+; GFX11-NEXT:    v_add_nc_u32_e32 v2, 3, v179
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX11-NEXT:    v_add_nc_u32_e32 v3, 3, v115
+; GFX11-NEXT:    s_waitcnt vmcnt(15)
+; GFX11-NEXT:    v_add_nc_u32_e32 v4, 3, v165
+; GFX11-NEXT:    v_add_nc_u32_e32 v115, 0x300, v1
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v2
+; GFX11-NEXT:    v_or_b32_e32 v0, v56, v0
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v3
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v4
+; GFX11-NEXT:    v_add_nc_u32_e32 v4, 3, v162
+; GFX11-NEXT:    v_or_b32_e32 v1, v45, v1
+; GFX11-NEXT:    v_add_nc_u32_e32 v145, 0x300, v0
+; GFX11-NEXT:    v_or_b32_e32 v0, v44, v2
+; GFX11-NEXT:    v_or_b32_e32 v2, v42, v3
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v4
+; GFX11-NEXT:    v_add_nc_u32_e32 v17, 0x300, v1
+; GFX11-NEXT:    s_waitcnt vmcnt(13)
+; GFX11-NEXT:    v_add_nc_u32_e32 v1, 3, v151
+; GFX11-NEXT:    v_add_nc_u32_e32 v21, 0x300, v0
+; GFX11-NEXT:    v_add_nc_u32_e32 v18, 0x300, v2
+; GFX11-NEXT:    v_or_b32_e32 v0, v41, v3
+; GFX11-NEXT:    v_add_nc_u32_e32 v2, 3, v149
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX11-NEXT:    s_waitcnt vmcnt(11)
+; GFX11-NEXT:    v_add_nc_u32_e32 v3, 3, v148
+; GFX11-NEXT:    v_add_nc_u32_e32 v4, 3, v144
+; GFX11-NEXT:    v_add_nc_u32_e32 v20, 0x300, v0
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v2
+; GFX11-NEXT:    v_or_b32_e32 v1, v180, v1
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v3
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v4
+; GFX11-NEXT:    s_waitcnt vmcnt(9)
+; GFX11-NEXT:    v_add_nc_u32_e32 v4, 3, v133
+; GFX11-NEXT:    v_or_b32_e32 v0, v177, v0
+; GFX11-NEXT:    v_add_nc_u32_e32 v19, 0x300, v1
+; GFX11-NEXT:    v_or_b32_e32 v1, v166, v2
+; GFX11-NEXT:    v_or_b32_e32 v2, v167, v3
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v4
+; GFX11-NEXT:    v_add_nc_u32_e32 v133, 0x300, v0
+; GFX11-NEXT:    v_add_nc_u32_e32 v0, 3, v129
+; GFX11-NEXT:    v_add_nc_u32_e32 v129, 0x300, v1
+; GFX11-NEXT:    v_add_nc_u32_e32 v144, 0x300, v2
+; GFX11-NEXT:    v_or_b32_e32 v1, v161, v3
+; GFX11-NEXT:    s_waitcnt vmcnt(7)
+; GFX11-NEXT:    v_add_nc_u32_e32 v2, 3, v119
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX11-NEXT:    v_add_nc_u32_e32 v3, 3, v117
+; GFX11-NEXT:    s_waitcnt vmcnt(5)
+; GFX11-NEXT:    v_add_nc_u32_e32 v4, 3, v116
+; GFX11-NEXT:    v_add_nc_u32_e32 v116, 0x300, v1
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v2
+; GFX11-NEXT:    v_or_b32_e32 v0, v147, v0
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v3
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v4
+; GFX11-NEXT:    v_add_nc_u32_e32 v4, 3, v114
+; GFX11-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-NEXT:    v_add_nc_u32_e32 v5, 3, v99
+; GFX11-NEXT:    v_add_nc_u32_e32 v114, 0x300, v0
+; GFX11-NEXT:    v_or_b32_e32 v0, v132, v2
+; GFX11-NEXT:    v_or_b32_e32 v2, v130, v3
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v4
+; GFX11-NEXT:    v_add_nc_u32_e32 v4, 3, v103
+; GFX11-NEXT:    v_add_nc_u32_e32 v6, 3, v98
+; GFX11-NEXT:    v_and_b32_e32 v5, 0xff, v5
+; GFX11-NEXT:    v_add_nc_u32_e32 v7, 3, v54
+; GFX11-NEXT:    v_add_nc_u32_e32 v8, 3, v53
+; GFX11-NEXT:    v_and_b32_e32 v4, 0xff, v4
+; GFX11-NEXT:    v_and_b32_e32 v6, 0xff, v6
+; GFX11-NEXT:    v_add_nc_u32_e32 v9, 3, v39
+; GFX11-NEXT:    v_add_nc_u32_e32 v33, 3, v33
+; GFX11-NEXT:    v_add_nc_u32_e32 v16, 0x300, v0
+; GFX11-NEXT:    v_or_b32_e32 v4, v113, v4
+; GFX11-NEXT:    v_add_nc_u32_e32 v0, 0x300, v2
+; GFX11-NEXT:    v_or_b32_e32 v2, v128, v3
+; GFX11-NEXT:    v_add_nc_u32_e32 v3, 3, v100
+; GFX11-NEXT:    s_addk_i32 s4, 0x300
+; GFX11-NEXT:    v_add_nc_u32_e32 v11, 0x300, v4
+; GFX11-NEXT:    v_or_b32_e32 v4, v101, v5
+; GFX11-NEXT:    v_or_b32_e32 v5, v102, v6
+; GFX11-NEXT:    v_and_b32_e32 v6, 0xff, v7
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_add_nc_u32_e32 v7, 3, v96
+; GFX11-NEXT:    v_or_b32_e32 v1, v134, v1
+; GFX11-NEXT:    v_add_nc_u32_e32 v12, 0x300, v4
+; GFX11-NEXT:    v_add_nc_u32_e32 v13, 0x300, v5
+; GFX11-NEXT:    v_or_b32_e32 v4, v97, v6
+; GFX11-NEXT:    v_add_nc_u32_e32 v5, 3, v55
+; GFX11-NEXT:    v_and_b32_e32 v6, 0xff, v7
+; GFX11-NEXT:    v_add_nc_u32_e32 v7, 3, v52
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v3
+; GFX11-NEXT:    v_add_nc_u32_e32 v14, 0x300, v4
+; GFX11-NEXT:    v_and_b32_e32 v4, 0xff, v5
+; GFX11-NEXT:    v_or_b32_e32 v5, v87, v6
+; GFX11-NEXT:    v_and_b32_e32 v6, 0xff, v7
+; GFX11-NEXT:    v_and_b32_e32 v7, 0xff, v8
+; GFX11-NEXT:    v_add_nc_u32_e32 v8, 3, v51
+; GFX11-NEXT:    v_or_b32_e32 v4, v86, v4
+; GFX11-NEXT:    v_add_nc_u32_e32 v15, 0x300, v5
+; GFX11-NEXT:    v_or_b32_e32 v5, v85, v6
+; GFX11-NEXT:    v_or_b32_e32 v6, v84, v7
+; GFX11-NEXT:    v_and_b32_e32 v7, 0xff, v8
+; GFX11-NEXT:    v_add_nc_u32_e32 v51, 0x300, v4
+; GFX11-NEXT:    v_add_nc_u32_e32 v4, 3, v50
+; GFX11-NEXT:    v_add_nc_u32_e32 v50, 0x300, v5
+; GFX11-NEXT:    v_add_nc_u32_e32 v8, 3, v49
+; GFX11-NEXT:    v_or_b32_e32 v5, v83, v7
+; GFX11-NEXT:    v_add_nc_u32_e32 v7, 3, v48
+; GFX11-NEXT:    v_and_b32_e32 v4, 0xff, v4
+; GFX11-NEXT:    v_add_nc_u32_e32 v6, 0x300, v6
+; GFX11-NEXT:    v_add_nc_u32_e32 v1, 0x300, v1
+; GFX11-NEXT:    v_add_nc_u32_e32 v10, 0x300, v5
+; GFX11-NEXT:    v_and_b32_e32 v5, 0xff, v7
+; GFX11-NEXT:    v_or_b32_e32 v4, v82, v4
+; GFX11-NEXT:    v_and_b32_e32 v7, 0xff, v8
+; GFX11-NEXT:    v_and_b32_e32 v8, 0xff, v9
+; GFX11-NEXT:    v_add_nc_u32_e32 v9, 3, v38
+; GFX11-NEXT:    v_or_b32_e32 v5, v81, v5
+; GFX11-NEXT:    v_add_nc_u32_e32 v38, 0x300, v4
+; GFX11-NEXT:    v_or_b32_e32 v4, v71, v7
+; GFX11-NEXT:    v_or_b32_e32 v7, v80, v8
+; GFX11-NEXT:    v_and_b32_e32 v8, 0xff, v9
+; GFX11-NEXT:    v_add_nc_u32_e32 v9, 0x300, v5
+; GFX11-NEXT:    v_add_nc_u32_e32 v5, 3, v37
+; GFX11-NEXT:    v_add_nc_u32_e32 v37, 0x300, v4
+; GFX11-NEXT:    v_add_nc_u32_e32 v39, 0x300, v7
+; GFX11-NEXT:    v_or_b32_e32 v4, v70, v8
+; GFX11-NEXT:    v_add_nc_u32_e32 v7, 3, v36
+; GFX11-NEXT:    v_and_b32_e32 v5, 0xff, v5
+; GFX11-NEXT:    v_add_nc_u32_e32 v8, 3, v34
+; GFX11-NEXT:    v_add_nc_u32_e32 v34, 3, v35
+; GFX11-NEXT:    v_add_nc_u32_e32 v35, 0x300, v4
+; GFX11-NEXT:    v_and_b32_e32 v4, 0xff, v7
+; GFX11-NEXT:    v_or_b32_e32 v5, v69, v5
+; GFX11-NEXT:    v_and_b32_e32 v7, 0xff, v8
+; GFX11-NEXT:    v_and_b32_e32 v8, 0xff, v34
+; GFX11-NEXT:    v_or_b32_e32 v3, v112, v3
+; GFX11-NEXT:    v_or_b32_e32 v4, v68, v4
+; GFX11-NEXT:    v_add_nc_u32_e32 v34, 0x300, v5
+; GFX11-NEXT:    v_or_b32_e32 v5, v67, v7
+; GFX11-NEXT:    v_or_b32_e32 v7, v66, v8
+; GFX11-NEXT:    v_and_b32_e32 v8, 0xff, v33
+; GFX11-NEXT:    v_add_nc_u32_e32 v33, 0x300, v4
+; GFX11-NEXT:    v_add_nc_u32_e32 v4, 3, v32
+; GFX11-NEXT:    v_add_nc_u32_e32 v32, 0x300, v5
+; GFX11-NEXT:    v_add_nc_u32_e32 v5, 0x300, v7
+; GFX11-NEXT:    v_or_b32_e32 v7, v65, v8
+; GFX11-NEXT:    v_and_b32_e64 v8, 0xffff, s4
+; GFX11-NEXT:    v_and_b32_e32 v4, 0xff, v4
+; GFX11-NEXT:    v_and_b32_e32 v36, 0xffff, v6
+; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff, v5
+; GFX11-NEXT:    v_add_nc_u32_e32 v7, 0x300, v7
+; GFX11-NEXT:    v_add_nc_u32_e32 v3, 0x300, v3
+; GFX11-NEXT:    v_or_b32_e32 v4, v64, v4
+; GFX11-NEXT:    v_and_b32_e32 v14, 0xffff, v14
+; GFX11-NEXT:    v_lshl_or_b32 v10, v10, 16, v36
+; GFX11-NEXT:    v_lshl_or_b32 v5, v7, 16, v5
+; GFX11-NEXT:    v_and_b32_e32 v7, 0xffff, v33
+; GFX11-NEXT:    v_add_nc_u32_e32 v4, 0x300, v4
+; GFX11-NEXT:    v_and_b32_e32 v33, 0xffff, v37
+; GFX11-NEXT:    v_and_b32_e32 v36, 0xffff, v22
+; GFX11-NEXT:    v_add_nc_u32_e32 v2, 0x300, v2
+; GFX11-NEXT:    v_lshl_or_b32 v6, v32, 16, v7
+; GFX11-NEXT:    v_lshl_or_b32 v4, v4, 16, v8
+; GFX11-NEXT:    v_and_b32_e32 v8, 0xffff, v35
+; GFX11-NEXT:    v_and_b32_e32 v32, 0xffff, v51
+; GFX11-NEXT:    v_and_b32_e32 v35, 0xffff, v38
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    v_and_b32_e32 v19, 0xffff, v19
+; GFX11-NEXT:    v_lshl_or_b32 v7, v34, 16, v8
+; GFX11-NEXT:    v_lshl_or_b32 v8, v39, 16, v33
+; GFX11-NEXT:    v_and_b32_e32 v33, 0xffff, v12
+; GFX11-NEXT:    v_and_b32_e32 v34, 0xffff, v11
+; GFX11-NEXT:    v_lshl_or_b32 v11, v50, 16, v32
+; GFX11-NEXT:    v_and_b32_e32 v32, 0xffff, v1
+; GFX11-NEXT:    v_lshl_or_b32 v9, v9, 16, v35
+; GFX11-NEXT:    v_lshl_or_b32 v12, v15, 16, v14
+; GFX11-NEXT:    v_lshl_or_b32 v13, v13, 16, v33
+; GFX11-NEXT:    v_lshl_or_b32 v14, v3, 16, v34
+; GFX11-NEXT:    v_lshl_or_b32 v16, v16, 16, v32
+; GFX11-NEXT:    v_and_b32_e32 v32, 0xffff, v116
+; GFX11-NEXT:    v_and_b32_e32 v33, 0xffff, v129
+; GFX11-NEXT:    v_and_b32_e32 v34, 0xffff, v18
+; GFX11-NEXT:    v_and_b32_e32 v35, 0xffff, v17
+; GFX11-NEXT:    v_lshl_or_b32 v26, v26, 16, v36
+; GFX11-NEXT:    v_lshl_or_b32 v17, v114, 16, v32
+; GFX11-NEXT:    v_lshl_or_b32 v18, v144, 16, v33
+; GFX11-NEXT:    v_lshl_or_b32 v20, v20, 16, v34
+; GFX11-NEXT:    v_lshl_or_b32 v21, v21, 16, v35
+; GFX11-NEXT:    v_and_b32_e32 v32, 0xffff, v115
+; GFX11-NEXT:    v_and_b32_e32 v33, 0xffff, v135
+; GFX11-NEXT:    v_and_b32_e32 v34, 0xffff, v131
+; GFX11-NEXT:    v_and_b32_e32 v35, 0xffff, v23
+; GFX11-NEXT:    v_and_b32_e32 v36, 0xffff, v27
+; GFX11-NEXT:    v_lshl_or_b32 v22, v145, 16, v32
+; GFX11-NEXT:    v_lshl_or_b32 v23, v118, 16, v33
+; GFX11-NEXT:    v_lshl_or_b32 v24, v24, 16, v34
+; GFX11-NEXT:    v_lshl_or_b32 v25, v25, 16, v35
+; GFX11-NEXT:    v_and_b32_e32 v32, 0xffff, v163
+; GFX11-NEXT:    v_and_b32_e32 v33, 0xffff, v182
+; GFX11-NEXT:    v_and_b32_e32 v34, 0xffff, v181
+; GFX11-NEXT:    v_and_b32_e32 v35, 0xffff, v28
+; GFX11-NEXT:    v_lshl_or_b32 v15, v2, 16, v0
+; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-NEXT:    v_lshl_or_b32 v19, v133, 16, v19
+; GFX11-NEXT:    v_lshl_or_b32 v27, v160, 16, v32
+; GFX11-NEXT:    v_lshl_or_b32 v28, v178, 16, v33
+; GFX11-NEXT:    v_lshl_or_b32 v29, v29, 16, v34
+; GFX11-NEXT:    v_lshl_or_b32 v30, v30, 16, v35
+; GFX11-NEXT:    v_lshl_or_b32 v31, v31, 16, v36
+; GFX11-NEXT:  .LBB97_3: ; %end
+; GFX11-NEXT:    s_clause 0x1e
+; GFX11-NEXT:    scratch_load_b32 v94, off, s32 offset:320
+; GFX11-NEXT:    scratch_load_b32 v93, off, s32 offset:324
+; GFX11-NEXT:    scratch_load_b32 v92, off, s32 offset:328
+; GFX11-NEXT:    scratch_load_b32 v91, off, s32 offset:332
+; GFX11-NEXT:    scratch_load_b32 v90, off, s32 offset:336
+; GFX11-NEXT:    scratch_load_b32 v89, off, s32 offset:340
+; GFX11-NEXT:    scratch_load_b32 v88, off, s32 offset:344
+; GFX11-NEXT:    scratch_load_b32 v79, off, s32 offset:348
+; GFX11-NEXT:    scratch_load_b32 v78, off, s32 offset:352
+; GFX11-NEXT:    scratch_load_b32 v77, off, s32 offset:356
+; GFX11-NEXT:    scratch_load_b32 v76, off, s32 offset:360
+; GFX11-NEXT:    scratch_load_b32 v75, off, s32 offset:364
+; GFX11-NEXT:    scratch_load_b32 v74, off, s32 offset:368
+; GFX11-NEXT:    scratch_load_b32 v73, off, s32 offset:372
+; GFX11-NEXT:    scratch_load_b32 v72, off, s32 offset:376
+; GFX11-NEXT:    scratch_load_b32 v63, off, s32 offset:380
+; GFX11-NEXT:    scratch_load_b32 v62, off, s32 offset:384
+; GFX11-NEXT:    scratch_load_b32 v61, off, s32 offset:388
+; GFX11-NEXT:    scratch_load_b32 v60, off, s32 offset:392
+; GFX11-NEXT:    scratch_load_b32 v59, off, s32 offset:396
+; GFX11-NEXT:    scratch_load_b32 v58, off, s32 offset:400
+; GFX11-NEXT:    scratch_load_b32 v57, off, s32 offset:404
+; GFX11-NEXT:    scratch_load_b32 v56, off, s32 offset:408
+; GFX11-NEXT:    scratch_load_b32 v47, off, s32 offset:412
+; GFX11-NEXT:    scratch_load_b32 v46, off, s32 offset:416
+; GFX11-NEXT:    scratch_load_b32 v45, off, s32 offset:420
+; GFX11-NEXT:    scratch_load_b32 v44, off, s32 offset:424
+; GFX11-NEXT:    scratch_load_b32 v43, off, s32 offset:428
+; GFX11-NEXT:    scratch_load_b32 v42, off, s32 offset:432
+; GFX11-NEXT:    scratch_load_b32 v41, off, s32 offset:436
+; GFX11-NEXT:    scratch_load_b32 v40, off, s32 offset:440
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-NEXT:  .LBB97_4:
+; GFX11-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+; GFX11-NEXT:    ; implicit-def: $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+; GFX11-NEXT:    s_branch .LBB97_2
   %cmp = icmp eq i32 %b, 0
   br i1 %cmp, label %cmp.true, label %cmp.false
 
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.256bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.256bit.ll
index 38302a75fe26d..32ccaa73b3a8a 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.256bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.256bit.ll
@@ -6290,8 +6290,8 @@ define <8 x i32> @bitcast_v32i8_to_v8i32(<32 x i8> %a, i32 %b) {
 ; GFX11-TRUE16:       ; %bb.0:
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-TRUE16-NEXT:    s_clause 0x1
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v32, off, s32 offset:4
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v31, off, s32
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v31, off, s32 offset:4
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v32, off, s32
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v21.h, v21.l
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v22.h, v19.l
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v23.h, v17.l
@@ -6320,8 +6320,8 @@ define <8 x i32> @bitcast_v32i8_to_v8i32(<32 x i8> %a, i32 %b) {
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v8.h, 8, v31.l
-; GFX11-TRUE16-NEXT:    v_cmpx_ne_u32_e32 0, v32
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v8.h, 8, v32.l
+; GFX11-TRUE16-NEXT:    v_cmpx_ne_u32_e32 0, v31
 ; GFX11-TRUE16-NEXT:    s_xor_b32 s0, exec_lo, s0
 ; GFX11-TRUE16-NEXT:    s_cbranch_execnz .LBB26_3
 ; GFX11-TRUE16-NEXT:  ; %bb.1: ; %Flow
@@ -13309,8 +13309,8 @@ define <8 x float> @bitcast_v32i8_to_v8f32(<32 x i8> %a, i32 %b) {
 ; GFX11-TRUE16:       ; %bb.0:
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-TRUE16-NEXT:    s_clause 0x1
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v32, off, s32 offset:4
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v31, off, s32
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v31, off, s32 offset:4
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v32, off, s32
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v21.h, v21.l
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v22.h, v19.l
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v23.h, v17.l
@@ -13339,8 +13339,8 @@ define <8 x float> @bitcast_v32i8_to_v8f32(<32 x i8> %a, i32 %b) {
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v8.h, 8, v31.l
-; GFX11-TRUE16-NEXT:    v_cmpx_ne_u32_e32 0, v32
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v8.h, 8, v32.l
+; GFX11-TRUE16-NEXT:    v_cmpx_ne_u32_e32 0, v31
 ; GFX11-TRUE16-NEXT:    s_xor_b32 s0, exec_lo, s0
 ; GFX11-TRUE16-NEXT:    s_cbranch_execnz .LBB50_3
 ; GFX11-TRUE16-NEXT:  ; %bb.1: ; %Flow
@@ -19846,8 +19846,8 @@ define <4 x i64> @bitcast_v32i8_to_v4i64(<32 x i8> %a, i32 %b) {
 ; GFX11-TRUE16:       ; %bb.0:
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-TRUE16-NEXT:    s_clause 0x1
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v32, off, s32 offset:4
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v31, off, s32
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v31, off, s32 offset:4
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v32, off, s32
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v21.h, v21.l
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v22.h, v19.l
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v23.h, v17.l
@@ -19876,8 +19876,8 @@ define <4 x i64> @bitcast_v32i8_to_v4i64(<32 x i8> %a, i32 %b) {
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v8.h, 8, v31.l
-; GFX11-TRUE16-NEXT:    v_cmpx_ne_u32_e32 0, v32
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v8.h, 8, v32.l
+; GFX11-TRUE16-NEXT:    v_cmpx_ne_u32_e32 0, v31
 ; GFX11-TRUE16-NEXT:    s_xor_b32 s0, exec_lo, s0
 ; GFX11-TRUE16-NEXT:    s_cbranch_execnz .LBB70_3
 ; GFX11-TRUE16-NEXT:  ; %bb.1: ; %Flow
@@ -25873,8 +25873,8 @@ define <4 x double> @bitcast_v32i8_to_v4f64(<32 x i8> %a, i32 %b) {
 ; GFX11-TRUE16:       ; %bb.0:
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-TRUE16-NEXT:    s_clause 0x1
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v32, off, s32 offset:4
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v31, off, s32
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v31, off, s32 offset:4
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v32, off, s32
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v21.h, v21.l
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v22.h, v19.l
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v23.h, v17.l
@@ -25903,8 +25903,8 @@ define <4 x double> @bitcast_v32i8_to_v4f64(<32 x i8> %a, i32 %b) {
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v8.h, 8, v31.l
-; GFX11-TRUE16-NEXT:    v_cmpx_ne_u32_e32 0, v32
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v8.h, 8, v32.l
+; GFX11-TRUE16-NEXT:    v_cmpx_ne_u32_e32 0, v31
 ; GFX11-TRUE16-NEXT:    s_xor_b32 s0, exec_lo, s0
 ; GFX11-TRUE16-NEXT:    s_cbranch_execnz .LBB86_3
 ; GFX11-TRUE16-NEXT:  ; %bb.1: ; %Flow
@@ -31625,8 +31625,8 @@ define <16 x i16> @bitcast_v32i8_to_v16i16(<32 x i8> %a, i32 %b) {
 ; GFX11-TRUE16:       ; %bb.0:
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-TRUE16-NEXT:    s_clause 0x1
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v32, off, s32 offset:4
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v31, off, s32
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v31, off, s32 offset:4
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v32, off, s32
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v17.h, v17.l
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v14.h, v13.l
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v11.h, v11.l
@@ -31655,8 +31655,8 @@ define <16 x i16> @bitcast_v32i8_to_v16i16(<32 x i8> %a, i32 %b) {
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v19.h, 8, v31.l
-; GFX11-TRUE16-NEXT:    v_cmpx_ne_u32_e32 0, v32
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v19.h, 8, v32.l
+; GFX11-TRUE16-NEXT:    v_cmpx_ne_u32_e32 0, v31
 ; GFX11-TRUE16-NEXT:    s_xor_b32 s0, exec_lo, s0
 ; GFX11-TRUE16-NEXT:    s_cbranch_execnz .LBB98_3
 ; GFX11-TRUE16-NEXT:  ; %bb.1: ; %Flow
@@ -36525,8 +36525,8 @@ define <16 x half> @bitcast_v32i8_to_v16f16(<32 x i8> %a, i32 %b) {
 ; GFX11-TRUE16:       ; %bb.0:
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-TRUE16-NEXT:    s_clause 0x1
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v32, off, s32 offset:4
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v31, off, s32
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v31, off, s32 offset:4
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v32, off, s32
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v17.h, v17.l
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v14.h, v13.l
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v11.h, v11.l
@@ -36555,8 +36555,8 @@ define <16 x half> @bitcast_v32i8_to_v16f16(<32 x i8> %a, i32 %b) {
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v19.h, 8, v31.l
-; GFX11-TRUE16-NEXT:    v_cmpx_ne_u32_e32 0, v32
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v19.h, 8, v32.l
+; GFX11-TRUE16-NEXT:    v_cmpx_ne_u32_e32 0, v31
 ; GFX11-TRUE16-NEXT:    s_xor_b32 s0, exec_lo, s0
 ; GFX11-TRUE16-NEXT:    s_cbranch_execnz .LBB106_3
 ; GFX11-TRUE16-NEXT:  ; %bb.1: ; %Flow
@@ -40370,8 +40370,8 @@ define <16 x bfloat> @bitcast_v32i8_to_v16bf16(<32 x i8> %a, i32 %b) {
 ; GFX11-TRUE16:       ; %bb.0:
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-TRUE16-NEXT:    s_clause 0x1
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v32, off, s32 offset:4
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v31, off, s32
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v31, off, s32 offset:4
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v32, off, s32
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v17.h, v17.l
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v14.h, v13.l
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v11.h, v11.l
@@ -40400,8 +40400,8 @@ define <16 x bfloat> @bitcast_v32i8_to_v16bf16(<32 x i8> %a, i32 %b) {
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v19.h, 8, v31.l
-; GFX11-TRUE16-NEXT:    v_cmpx_ne_u32_e32 0, v32
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v19.h, 8, v32.l
+; GFX11-TRUE16-NEXT:    v_cmpx_ne_u32_e32 0, v31
 ; GFX11-TRUE16-NEXT:    s_xor_b32 s0, exec_lo, s0
 ; GFX11-TRUE16-NEXT:    s_cbranch_execnz .LBB110_3
 ; GFX11-TRUE16-NEXT:  ; %bb.1: ; %Flow
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll
index 436b1a038b274..bb4fd7b6f1e88 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll
@@ -5019,21 +5019,22 @@ define <10 x i32> @bitcast_v40i8_to_v10i32(<40 x i8> %a, i32 %b) {
 ; GFX11-TRUE16:       ; %bb.0:
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-TRUE16-NEXT:    s_clause 0x9
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v36, off, s32 offset:36
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v33, off, s32
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v33, off, s32 offset:8
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v34, off, s32 offset:16
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v34, off, s32 offset:24
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v35, off, s32 offset:32
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v31, off, s32 offset:28
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v31, off, s32 offset:20
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v32, off, s32 offset:12
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v32, off, s32 offset:4
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v35, off, s32 offset:36
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v36, off, s32
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v37, off, s32 offset:8
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v38, off, s32 offset:16
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v39, off, s32 offset:24
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v48, off, s32 offset:32
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v31, off, s32 offset:28
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v32, off, s32 offset:20
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v33, off, s32 offset:12
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v34, off, s32 offset:4
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v27.h, v25.l
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v28.h, v23.l
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v29.h, v21.l
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v30.h, v19.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v35.h, v17.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(3)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v31.h, v17.l
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v16.h, v14.l
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v17.l, v12.l
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v19.l, v10.l
@@ -5050,7 +5051,7 @@ define <10 x i32> @bitcast_v40i8_to_v10i32(<40 x i8> %a, i32 %b) {
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v21.l, 8, v11.l
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v17.h, 8, v13.l
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v18.h, 8, v15.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v15.l, 8, v35.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v15.l, 8, v31.h
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v15.h, 8, v30.h
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v14.l, 8, v29.h
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v14.h, 8, v28.h
@@ -5059,17 +5060,12 @@ define <10 x i32> @bitcast_v40i8_to_v10i32(<40 x i8> %a, i32 %b) {
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v12.l, 8, v29.l
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(8)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v12.h, 8, v33.l
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(7)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v11.l, 8, v33.h
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(6)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v11.h, 8, v34.l
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(5)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v10.l, 8, v34.h
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(4)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v10.h, 8, v35.l
-; GFX11-TRUE16-NEXT:    v_cmpx_ne_u32_e32 0, v36
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v12.h, 8, v36.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v11.l, 8, v37.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v11.h, 8, v38.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v10.l, 8, v39.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v10.h, 8, v48.l
+; GFX11-TRUE16-NEXT:    v_cmpx_ne_u32_e32 0, v35
 ; GFX11-TRUE16-NEXT:    s_xor_b32 s0, exec_lo, s0
 ; GFX11-TRUE16-NEXT:    s_cbranch_execnz .LBB14_3
 ; GFX11-TRUE16-NEXT:  ; %bb.1: ; %Flow
@@ -5119,14 +5115,14 @@ define <10 x i32> @bitcast_v40i8_to_v10i32(<40 x i8> %a, i32 %b) {
 ; GFX11-TRUE16-NEXT:    v_or_b16 v6.h, v6.h, v13.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v7.h, 0xff, v30.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    v_and_b16 v8.l, 0xff, v32.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v8.h, 0xff, v32.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v8.l, 0xff, v34.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v8.h, 0xff, v33.l
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v4, v27, v4
 ; GFX11-TRUE16-NEXT:    v_or_b16 v27.l, v5.l, v14.l
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v27.h
 ; GFX11-TRUE16-NEXT:    v_or_b16 v7.h, v7.h, v12.h
 ; GFX11-TRUE16-NEXT:    v_or_b16 v8.h, v8.h, v11.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v9.l, 0xff, v31.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v9.l, 0xff, v32.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v9.h, 0xff, v31.l
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v5, v27, v5
 ; GFX11-TRUE16-NEXT:    v_or_b16 v27.l, v6.l, v13.l
@@ -5147,10 +5143,10 @@ define <10 x i32> @bitcast_v40i8_to_v10i32(<40 x i8> %a, i32 %b) {
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr26_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr28_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr30_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr32_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr32_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr31_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr31_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr31
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr32
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr33
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr34
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr24_hi16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr25_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr23_lo16
@@ -5265,14 +5261,14 @@ define <10 x i32> @bitcast_v40i8_to_v10i32(<40 x i8> %a, i32 %b) {
 ; GFX11-TRUE16-NEXT:    v_or_b16 v7.l, v12.l, v6.l
 ; GFX11-TRUE16-NEXT:    v_or_b16 v7.h, v12.h, v6.h
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v8.l, v32.h, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v8.h, v32.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v8.l, v34.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v8.h, v33.l, 3
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v6, v25, v9
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v25.l, 0x300, v7.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v12.h, 0x300, v7.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v7.l, 0xff, v8.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v7.h, 0xff, v8.h
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v8.l, v31.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v8.l, v32.l, 3
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v8.h, v31.l, 3
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v12.l, v25.h
 ; GFX11-TRUE16-NEXT:    v_or_b16 v9.l, v11.l, v7.l
@@ -11979,21 +11975,22 @@ define <10 x float> @bitcast_v40i8_to_v10f32(<40 x i8> %a, i32 %b) {
 ; GFX11-TRUE16:       ; %bb.0:
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-TRUE16-NEXT:    s_clause 0x9
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v36, off, s32 offset:36
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v33, off, s32
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v33, off, s32 offset:8
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v34, off, s32 offset:16
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v34, off, s32 offset:24
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v35, off, s32 offset:32
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v31, off, s32 offset:28
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v31, off, s32 offset:20
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v32, off, s32 offset:12
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v32, off, s32 offset:4
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v35, off, s32 offset:36
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v36, off, s32
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v37, off, s32 offset:8
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v38, off, s32 offset:16
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v39, off, s32 offset:24
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v48, off, s32 offset:32
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v31, off, s32 offset:28
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v32, off, s32 offset:20
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v33, off, s32 offset:12
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v34, off, s32 offset:4
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v27.h, v25.l
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v28.h, v23.l
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v29.h, v21.l
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v30.h, v19.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v35.h, v17.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(3)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v31.h, v17.l
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v16.h, v14.l
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v17.l, v12.l
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v19.l, v10.l
@@ -12010,7 +12007,7 @@ define <10 x float> @bitcast_v40i8_to_v10f32(<40 x i8> %a, i32 %b) {
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v21.l, 8, v11.l
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v17.h, 8, v13.l
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v18.h, 8, v15.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v15.l, 8, v35.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v15.l, 8, v31.h
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v15.h, 8, v30.h
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v14.l, 8, v29.h
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v14.h, 8, v28.h
@@ -12019,17 +12016,12 @@ define <10 x float> @bitcast_v40i8_to_v10f32(<40 x i8> %a, i32 %b) {
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v12.l, 8, v29.l
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(8)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v12.h, 8, v33.l
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(7)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v11.l, 8, v33.h
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(6)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v11.h, 8, v34.l
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(5)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v10.l, 8, v34.h
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(4)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v10.h, 8, v35.l
-; GFX11-TRUE16-NEXT:    v_cmpx_ne_u32_e32 0, v36
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v12.h, 8, v36.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v11.l, 8, v37.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v11.h, 8, v38.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v10.l, 8, v39.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v10.h, 8, v48.l
+; GFX11-TRUE16-NEXT:    v_cmpx_ne_u32_e32 0, v35
 ; GFX11-TRUE16-NEXT:    s_xor_b32 s0, exec_lo, s0
 ; GFX11-TRUE16-NEXT:    s_cbranch_execnz .LBB34_3
 ; GFX11-TRUE16-NEXT:  ; %bb.1: ; %Flow
@@ -12079,14 +12071,14 @@ define <10 x float> @bitcast_v40i8_to_v10f32(<40 x i8> %a, i32 %b) {
 ; GFX11-TRUE16-NEXT:    v_or_b16 v6.h, v6.h, v13.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v7.h, 0xff, v30.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    v_and_b16 v8.l, 0xff, v32.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v8.h, 0xff, v32.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v8.l, 0xff, v34.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v8.h, 0xff, v33.l
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v4, v27, v4
 ; GFX11-TRUE16-NEXT:    v_or_b16 v27.l, v5.l, v14.l
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v27.h
 ; GFX11-TRUE16-NEXT:    v_or_b16 v7.h, v7.h, v12.h
 ; GFX11-TRUE16-NEXT:    v_or_b16 v8.h, v8.h, v11.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v9.l, 0xff, v31.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v9.l, 0xff, v32.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v9.h, 0xff, v31.l
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v5, v27, v5
 ; GFX11-TRUE16-NEXT:    v_or_b16 v27.l, v6.l, v13.l
@@ -12107,10 +12099,10 @@ define <10 x float> @bitcast_v40i8_to_v10f32(<40 x i8> %a, i32 %b) {
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr26_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr28_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr30_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr32_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr32_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr31_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr31_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr31
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr32
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr33
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr34
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr24_hi16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr25_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr23_lo16
@@ -12225,14 +12217,14 @@ define <10 x float> @bitcast_v40i8_to_v10f32(<40 x i8> %a, i32 %b) {
 ; GFX11-TRUE16-NEXT:    v_or_b16 v7.l, v12.l, v6.l
 ; GFX11-TRUE16-NEXT:    v_or_b16 v7.h, v12.h, v6.h
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v8.l, v32.h, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v8.h, v32.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v8.l, v34.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v8.h, v33.l, 3
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v6, v25, v9
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v25.l, 0x300, v7.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v12.h, 0x300, v7.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v7.l, 0xff, v8.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v7.h, 0xff, v8.h
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v8.l, v31.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v8.l, v32.l, 3
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v8.h, v31.l, 3
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v12.l, v25.h
 ; GFX11-TRUE16-NEXT:    v_or_b16 v9.l, v11.l, v7.l
@@ -18549,17 +18541,17 @@ define <20 x i16> @bitcast_v40i8_to_v20i16(<40 x i8> %a, i32 %b) {
 ; GFX11-TRUE16:       ; %bb.0:
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-TRUE16-NEXT:    s_clause 0x9
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v37, off, s32 offset:36
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v33, off, s32
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v34, off, s32 offset:8
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v34, off, s32 offset:16
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v35, off, s32 offset:24
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v36, off, s32 offset:32
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v31, off, s32 offset:28
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v32, off, s32 offset:20
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v31, off, s32 offset:12
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v32, off, s32 offset:4
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v35.h, v29.l
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v32, off, s32 offset:36
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v33, off, s32
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v38, off, s32 offset:8
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v39, off, s32 offset:16
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v48, off, s32 offset:24
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v49, off, s32 offset:32
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v34, off, s32 offset:28
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v36, off, s32 offset:20
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v35, off, s32 offset:12
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v37, off, s32 offset:4
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v31.h, v29.l
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v28.h, v27.l
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v30.h, v25.l
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v29.h, v23.l
@@ -18592,21 +18584,22 @@ define <20 x i16> @bitcast_v40i8_to_v20i16(<40 x i8> %a, i32 %b) {
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v29.l, 8, v29.l
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v29.h, 8, v29.h
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v30.h, 8, v30.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v33.l, 8, v28.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v35.h, 8, v35.h
-; GFX11-TRUE16-NEXT:    s_mov_b32 s0, exec_lo
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v31.l, 8, v28.h
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(8)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v28.h, 8, v33.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v33.h, 8, v31.h
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+; GFX11-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v32
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v28.h, 8, v33.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(7)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v33.h, 8, v34.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v31.h, 8, v38.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(6)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v34.l, 8, v34.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v32.l, 8, v39.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(5)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v34.h, 8, v35.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v32.h, 8, v48.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(4)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v35.l, 8, v36.l
-; GFX11-TRUE16-NEXT:    v_cmpx_ne_u32_e32 0, v37
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v33.l, 8, v49.l
+; GFX11-TRUE16-NEXT:    s_and_saveexec_b32 s0, vcc_lo
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-TRUE16-NEXT:    s_xor_b32 s0, exec_lo, s0
 ; GFX11-TRUE16-NEXT:    s_cbranch_execnz .LBB50_3
 ; GFX11-TRUE16-NEXT:  ; %bb.1: ; %Flow
@@ -18634,10 +18627,10 @@ define <20 x i16> @bitcast_v40i8_to_v20i16(<40 x i8> %a, i32 %b) {
 ; GFX11-TRUE16-NEXT:    v_and_b16 v7.l, 0xff, v28.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v7.h, 0xff, v30.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    v_and_b16 v8.l, 0xff, v32.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v8.h, 0xff, v31.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v9.l, 0xff, v32.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v9.h, 0xff, v31.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v8.l, 0xff, v37.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v8.h, 0xff, v35.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v9.l, 0xff, v36.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v9.h, 0xff, v34.l
 ; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v0.l, v18.l
 ; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v0.h, v18.h
 ; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v1.l, v19.h
@@ -18651,13 +18644,13 @@ define <20 x i16> @bitcast_v40i8_to_v20i16(<40 x i8> %a, i32 %b) {
 ; GFX11-TRUE16-NEXT:    v_or_b16 v5.l, v5.l, v29.l
 ; GFX11-TRUE16-NEXT:    v_or_b16 v5.h, v5.h, v29.h
 ; GFX11-TRUE16-NEXT:    v_or_b16 v6.l, v6.l, v30.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v6.h, v6.h, v33.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v7.l, v7.l, v35.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v6.h, v6.h, v31.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v7.l, v7.l, v33.h
 ; GFX11-TRUE16-NEXT:    v_or_b16 v7.h, v7.h, v28.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v8.l, v8.l, v33.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v8.h, v8.h, v34.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v9.l, v9.l, v34.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v9.h, v9.h, v35.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v8.l, v8.l, v31.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v8.h, v8.h, v32.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v9.l, v9.l, v32.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v9.h, v9.h, v33.l
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr19_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr17_hi16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr21_hi16
@@ -18674,10 +18667,10 @@ define <20 x i16> @bitcast_v40i8_to_v20i16(<40 x i8> %a, i32 %b) {
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr26_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr28_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr30_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr32_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr31_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr32_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr31_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr34
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr36
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr35
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr37
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr18_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr18_hi16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr19_hi16
@@ -18691,33 +18684,33 @@ define <20 x i16> @bitcast_v40i8_to_v20i16(<40 x i8> %a, i32 %b) {
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr29_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr29_hi16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr30_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr33_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr35_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr28_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr31_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr33_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr34_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr34_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr35_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr28_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr31_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr32_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr32_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr33_lo16
 ; GFX11-TRUE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
 ; GFX11-TRUE16-NEXT:    s_cbranch_execz .LBB50_2
 ; GFX11-TRUE16-NEXT:  .LBB50_4: ; %cmp.true
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(2)
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v32.h, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, v31.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v36.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, v34.l, 3
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, v32.l, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.h, v31.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, v37.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.h, v35.l, 3
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, v28.l, 3
 ; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v0.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v1.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v1.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v34.h, v0.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v35.l, v0.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v33.h, v1.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, v34.l, v1.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v35.h, v2.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v32.h, v0.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v33.l, v0.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v31.h, v1.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, v32.l, v1.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v33.h, v2.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v9.l, 0x300, v0.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v9.h, 0x300, v0.h
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v8.l, 0x300, v1.l
@@ -18734,7 +18727,7 @@ define <20 x i16> @bitcast_v40i8_to_v20i16(<40 x i8> %a, i32 %b) {
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, v23.l, 3
 ; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v28.h, v0.l
 ; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v30.h, v0.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v33.l, v1.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v31.l, v1.l
 ; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, v29.l, v1.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v2.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v7.h, 0x300, v0.l
@@ -24630,17 +24623,17 @@ define <20 x half> @bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) {
 ; GFX11-TRUE16:       ; %bb.0:
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-TRUE16-NEXT:    s_clause 0x9
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v37, off, s32 offset:36
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v33, off, s32
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v34, off, s32 offset:8
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v34, off, s32 offset:16
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v35, off, s32 offset:24
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v36, off, s32 offset:32
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v31, off, s32 offset:28
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v32, off, s32 offset:20
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v31, off, s32 offset:12
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v32, off, s32 offset:4
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v35.h, v29.l
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v32, off, s32 offset:36
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v33, off, s32
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v38, off, s32 offset:8
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v39, off, s32 offset:16
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v48, off, s32 offset:24
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v49, off, s32 offset:32
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v34, off, s32 offset:28
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v36, off, s32 offset:20
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v35, off, s32 offset:12
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v37, off, s32 offset:4
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v31.h, v29.l
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v28.h, v27.l
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v30.h, v25.l
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v29.h, v23.l
@@ -24673,21 +24666,22 @@ define <20 x half> @bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) {
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v29.l, 8, v29.l
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v29.h, 8, v29.h
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v30.h, 8, v30.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v33.l, 8, v28.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v35.h, 8, v35.h
-; GFX11-TRUE16-NEXT:    s_mov_b32 s0, exec_lo
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v31.l, 8, v28.h
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(8)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v28.h, 8, v33.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v33.h, 8, v31.h
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+; GFX11-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v32
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v28.h, 8, v33.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(7)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v33.h, 8, v34.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v31.h, 8, v38.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(6)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v34.l, 8, v34.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v32.l, 8, v39.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(5)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v34.h, 8, v35.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v32.h, 8, v48.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(4)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v35.l, 8, v36.l
-; GFX11-TRUE16-NEXT:    v_cmpx_ne_u32_e32 0, v37
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v33.l, 8, v49.l
+; GFX11-TRUE16-NEXT:    s_and_saveexec_b32 s0, vcc_lo
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-TRUE16-NEXT:    s_xor_b32 s0, exec_lo, s0
 ; GFX11-TRUE16-NEXT:    s_cbranch_execnz .LBB62_3
 ; GFX11-TRUE16-NEXT:  ; %bb.1: ; %Flow
@@ -24715,10 +24709,10 @@ define <20 x half> @bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) {
 ; GFX11-TRUE16-NEXT:    v_and_b16 v7.l, 0xff, v28.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v7.h, 0xff, v30.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    v_and_b16 v8.l, 0xff, v32.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v8.h, 0xff, v31.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v9.l, 0xff, v32.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v9.h, 0xff, v31.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v8.l, 0xff, v37.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v8.h, 0xff, v35.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v9.l, 0xff, v36.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v9.h, 0xff, v34.l
 ; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v0.l, v18.l
 ; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v0.h, v18.h
 ; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v1.l, v19.h
@@ -24732,13 +24726,13 @@ define <20 x half> @bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) {
 ; GFX11-TRUE16-NEXT:    v_or_b16 v5.l, v5.l, v29.l
 ; GFX11-TRUE16-NEXT:    v_or_b16 v5.h, v5.h, v29.h
 ; GFX11-TRUE16-NEXT:    v_or_b16 v6.l, v6.l, v30.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v6.h, v6.h, v33.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v7.l, v7.l, v35.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v6.h, v6.h, v31.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v7.l, v7.l, v33.h
 ; GFX11-TRUE16-NEXT:    v_or_b16 v7.h, v7.h, v28.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v8.l, v8.l, v33.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v8.h, v8.h, v34.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v9.l, v9.l, v34.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v9.h, v9.h, v35.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v8.l, v8.l, v31.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v8.h, v8.h, v32.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v9.l, v9.l, v32.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v9.h, v9.h, v33.l
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr19_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr17_hi16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr21_hi16
@@ -24755,10 +24749,10 @@ define <20 x half> @bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) {
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr26_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr28_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr30_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr32_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr31_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr32_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr31_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr34
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr36
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr35
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr37
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr18_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr18_hi16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr19_hi16
@@ -24772,33 +24766,33 @@ define <20 x half> @bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) {
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr29_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr29_hi16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr30_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr33_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr35_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr28_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr31_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr33_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr34_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr34_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr35_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr28_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr31_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr32_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr32_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr33_lo16
 ; GFX11-TRUE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
 ; GFX11-TRUE16-NEXT:    s_cbranch_execz .LBB62_2
 ; GFX11-TRUE16-NEXT:  .LBB62_4: ; %cmp.true
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(2)
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v32.h, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, v31.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v36.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, v34.l, 3
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, v32.l, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.h, v31.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, v37.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.h, v35.l, 3
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, v28.l, 3
 ; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v0.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v1.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v1.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v34.h, v0.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v35.l, v0.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v33.h, v1.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, v34.l, v1.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v35.h, v2.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v32.h, v0.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v33.l, v0.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v31.h, v1.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, v32.l, v1.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v33.h, v2.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v9.l, 0x300, v0.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v9.h, 0x300, v0.h
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v8.l, 0x300, v1.l
@@ -24815,7 +24809,7 @@ define <20 x half> @bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) {
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, v23.l, 3
 ; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v28.h, v0.l
 ; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v30.h, v0.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v33.l, v1.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v31.l, v1.l
 ; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, v29.l, v1.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v2.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v7.h, 0x300, v0.l
@@ -28750,20 +28744,24 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) {
 ; GFX11-TRUE16:       ; %bb.0:
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-TRUE16-NEXT:    s_clause 0x9
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v49, off, s32 offset:36
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v36, off, s32
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v36, off, s32 offset:8
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v37, off, s32 offset:16
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v37, off, s32 offset:24
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v38, off, s32 offset:32
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v31, off, s32 offset:28
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v31, off, s32 offset:20
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v32, off, s32 offset:12
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v32, off, s32 offset:4
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v38.h, v29.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v39.l, v27.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v39.h, v25.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v48.l, v23.l
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v38, off, s32 offset:36
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v39, off, s32
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v48, off, s32 offset:8
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v49, off, s32 offset:16
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v50, off, s32 offset:24
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v51, off, s32 offset:32
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v34, off, s32 offset:28
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v35, off, s32 offset:20
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v36, off, s32 offset:12
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v37, off, s32 offset:4
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(3)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v34.h, v29.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(2)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v35.h, v27.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v36.h, v25.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v37.h, v23.l
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v21.h, v21.l
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v20.h, v18.l
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v21.l, v16.l
@@ -28773,12 +28771,12 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) {
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v27.h, v8.l
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v30.h, v6.l
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v29.h, v4.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v35.l, v2.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v35.h, v0.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v34.l, 8, v1.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v34.h, 8, v3.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v33.l, 8, v5.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v33.h, 8, v7.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v33.l, v2.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v33.h, v0.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v32.l, 8, v1.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v32.h, 8, v3.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v31.l, 8, v5.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v31.h, 8, v7.l
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v28.h, 8, v9.l
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v29.l, 8, v11.l
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v25.h, 8, v13.l
@@ -28786,23 +28784,18 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) {
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v23.l, 8, v17.l
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v23.h, 8, v19.l
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v21.h, 8, v21.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v22.h, 8, v48.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v19.l, 8, v39.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v19.h, 8, v39.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v18.l, 8, v38.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v22.h, 8, v37.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v19.l, 8, v36.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v19.h, 8, v35.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v18.l, 8, v34.h
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(8)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v18.h, 8, v36.l
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(7)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v17.l, 8, v36.h
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(6)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v17.h, 8, v37.l
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(5)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v16.l, 8, v37.h
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(4)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v16.h, 8, v38.l
-; GFX11-TRUE16-NEXT:    v_cmpx_ne_u32_e32 0, v49
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v18.h, 8, v39.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v17.l, 8, v48.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v17.h, 8, v49.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v16.l, 8, v50.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v16.h, 8, v51.l
+; GFX11-TRUE16-NEXT:    v_cmpx_ne_u32_e32 0, v38
 ; GFX11-TRUE16-NEXT:    s_xor_b32 s0, exec_lo, s0
 ; GFX11-TRUE16-NEXT:    s_cbranch_execnz .LBB72_3
 ; GFX11-TRUE16-NEXT:  ; %bb.1: ; %Flow
@@ -28810,23 +28803,22 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) {
 ; GFX11-TRUE16-NEXT:    s_cbranch_execnz .LBB72_4
 ; GFX11-TRUE16-NEXT:  .LBB72_2: ; %end
 ; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ; GFX11-TRUE16-NEXT:  .LBB72_3: ; %cmp.false
-; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v35.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v35.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v33.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v33.l
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v10.h, 0
 ; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v30.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v29.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v10.l, v0.l, v34.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v0.h, v34.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v10.l, v0.l, v32.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v0.h, v32.h
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v10.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v3.h, v1.l, v33.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v3.h, v1.l, v31.h
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v10.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v27.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v2.h, 0xff, v27.l
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v10, v0
-; GFX11-TRUE16-NEXT:    v_or_b16 v10.l, v1.h, v33.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v10.l, v1.h, v31.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v4.l, 0xff, v21.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v4.h, 0xff, v20.h
 ; GFX11-TRUE16-NEXT:    v_or_b16 v2.h, v2.h, v29.l
@@ -28851,21 +28843,20 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) {
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v4.l, v10.h
 ; GFX11-TRUE16-NEXT:    v_or_b16 v6.h, v6.h, v19.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v7.h, 0xff, v30.l
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    v_and_b16 v8.l, 0xff, v32.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v8.h, 0xff, v32.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v8.l, 0xff, v37.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v8.h, 0xff, v36.l
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v4, v10, v4
 ; GFX11-TRUE16-NEXT:    v_or_b16 v10.l, v5.l, v21.h
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v10.h
 ; GFX11-TRUE16-NEXT:    v_or_b16 v7.h, v7.h, v18.h
 ; GFX11-TRUE16-NEXT:    v_or_b16 v8.h, v8.h, v17.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v9.l, 0xff, v31.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v9.h, 0xff, v31.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v9.l, 0xff, v35.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v9.h, 0xff, v34.l
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v5, v10, v5
 ; GFX11-TRUE16-NEXT:    v_or_b16 v10.l, v6.l, v19.l
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v6.l, v10.h
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr35_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr35_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr33_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr33_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr29_hi16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr30_hi16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr27_hi16
@@ -28880,14 +28871,14 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) {
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr26_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr28_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr30_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr32_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr34
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr35
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr36
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr37
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr32_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr31_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr32_hi16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr31_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr34_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr34_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr33_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr33_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr31_hi16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr28_hi16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr29_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr25_hi16
@@ -28921,8 +28912,8 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) {
 ; GFX11-TRUE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
 ; GFX11-TRUE16-NEXT:    s_cbranch_execz .LBB72_2
 ; GFX11-TRUE16-NEXT:  .LBB72_4: ; %cmp.true
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v35.h, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, v35.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v33.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, v33.l, 3
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, v30.h, 3
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.h, v29.h, 3
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v10.h, 0
@@ -28931,10 +28922,10 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) {
 ; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v1.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v1.h
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v10.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v34.l, v0.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v34.h, v0.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v33.h, v1.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, v33.l, v1.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v32.l, v0.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v32.h, v0.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v31.h, v1.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, v31.l, v1.h
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, v27.h, 3
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v10.l, 0x300, v0.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v3.h, 0x300, v0.h
@@ -28997,16 +28988,15 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) {
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v9.l, v10.h
 ; GFX11-TRUE16-NEXT:    v_or_b16 v7.l, v18.l, v6.l
 ; GFX11-TRUE16-NEXT:    v_or_b16 v7.h, v18.h, v6.h
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v8.l, v32.h, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v8.h, v32.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v8.l, v37.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v8.h, v36.l, 3
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v6, v10, v9
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v10.l, 0x300, v7.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v11.h, 0x300, v7.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v7.l, 0xff, v8.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v7.h, 0xff, v8.h
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v8.l, v31.h, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v8.h, v31.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v8.l, v35.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v8.h, v34.l, 3
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v11.l, v10.h
 ; GFX11-TRUE16-NEXT:    v_or_b16 v9.l, v17.l, v7.l
 ; GFX11-TRUE16-NEXT:    v_or_b16 v9.h, v17.h, v7.h
@@ -32861,20 +32851,24 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) {
 ; GFX11-TRUE16:       ; %bb.0:
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-TRUE16-NEXT:    s_clause 0x9
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v49, off, s32 offset:36
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v36, off, s32
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v36, off, s32 offset:8
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v37, off, s32 offset:16
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v37, off, s32 offset:24
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v38, off, s32 offset:32
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v31, off, s32 offset:28
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v31, off, s32 offset:20
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v32, off, s32 offset:12
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v32, off, s32 offset:4
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v38.h, v29.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v39.l, v27.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v39.h, v25.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v48.l, v23.l
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v38, off, s32 offset:36
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v39, off, s32
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v48, off, s32 offset:8
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v49, off, s32 offset:16
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v50, off, s32 offset:24
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v51, off, s32 offset:32
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v34, off, s32 offset:28
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v35, off, s32 offset:20
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v36, off, s32 offset:12
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v37, off, s32 offset:4
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(3)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v34.h, v29.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(2)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v35.h, v27.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v36.h, v25.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v37.h, v23.l
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v21.h, v21.l
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v20.h, v18.l
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v21.l, v16.l
@@ -32884,12 +32878,12 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) {
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v27.h, v8.l
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v30.h, v6.l
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v29.h, v4.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v35.l, v2.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v35.h, v0.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v34.l, 8, v1.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v34.h, 8, v3.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v33.l, 8, v5.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v33.h, 8, v7.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v33.l, v2.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v33.h, v0.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v32.l, 8, v1.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v32.h, 8, v3.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v31.l, 8, v5.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v31.h, 8, v7.l
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v28.h, 8, v9.l
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v29.l, 8, v11.l
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v25.h, 8, v13.l
@@ -32897,23 +32891,18 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) {
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v23.l, 8, v17.l
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v23.h, 8, v19.l
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v21.h, 8, v21.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v22.h, 8, v48.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v19.l, 8, v39.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v19.h, 8, v39.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v18.l, 8, v38.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v22.h, 8, v37.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v19.l, 8, v36.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v19.h, 8, v35.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v18.l, 8, v34.h
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(8)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v18.h, 8, v36.l
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(7)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v17.l, 8, v36.h
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(6)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v17.h, 8, v37.l
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(5)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v16.l, 8, v37.h
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(4)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v16.h, 8, v38.l
-; GFX11-TRUE16-NEXT:    v_cmpx_ne_u32_e32 0, v49
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v18.h, 8, v39.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v17.l, 8, v48.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v17.h, 8, v49.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v16.l, 8, v50.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v16.h, 8, v51.l
+; GFX11-TRUE16-NEXT:    v_cmpx_ne_u32_e32 0, v38
 ; GFX11-TRUE16-NEXT:    s_xor_b32 s0, exec_lo, s0
 ; GFX11-TRUE16-NEXT:    s_cbranch_execnz .LBB76_3
 ; GFX11-TRUE16-NEXT:  ; %bb.1: ; %Flow
@@ -32921,23 +32910,22 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) {
 ; GFX11-TRUE16-NEXT:    s_cbranch_execnz .LBB76_4
 ; GFX11-TRUE16-NEXT:  .LBB76_2: ; %end
 ; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ; GFX11-TRUE16-NEXT:  .LBB76_3: ; %cmp.false
-; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v35.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v35.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v33.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v33.l
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v10.h, 0
 ; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v30.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v29.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v10.l, v0.l, v34.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v0.h, v34.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v10.l, v0.l, v32.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v0.h, v32.h
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v10.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v3.h, v1.l, v33.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v3.h, v1.l, v31.h
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v10.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v27.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v2.h, 0xff, v27.l
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v10, v0
-; GFX11-TRUE16-NEXT:    v_or_b16 v10.l, v1.h, v33.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v10.l, v1.h, v31.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v4.l, 0xff, v21.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v4.h, 0xff, v20.h
 ; GFX11-TRUE16-NEXT:    v_or_b16 v2.h, v2.h, v29.l
@@ -32962,21 +32950,20 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) {
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v4.l, v10.h
 ; GFX11-TRUE16-NEXT:    v_or_b16 v6.h, v6.h, v19.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v7.h, 0xff, v30.l
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    v_and_b16 v8.l, 0xff, v32.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v8.h, 0xff, v32.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v8.l, 0xff, v37.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v8.h, 0xff, v36.l
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v4, v10, v4
 ; GFX11-TRUE16-NEXT:    v_or_b16 v10.l, v5.l, v21.h
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v10.h
 ; GFX11-TRUE16-NEXT:    v_or_b16 v7.h, v7.h, v18.h
 ; GFX11-TRUE16-NEXT:    v_or_b16 v8.h, v8.h, v17.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v9.l, 0xff, v31.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v9.h, 0xff, v31.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v9.l, 0xff, v35.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v9.h, 0xff, v34.l
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v5, v10, v5
 ; GFX11-TRUE16-NEXT:    v_or_b16 v10.l, v6.l, v19.l
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v6.l, v10.h
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr35_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr35_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr33_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr33_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr29_hi16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr30_hi16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr27_hi16
@@ -32991,14 +32978,14 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) {
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr26_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr28_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr30_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr32_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr34
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr35
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr36
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr37
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr32_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr31_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr32_hi16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr31_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr34_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr34_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr33_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr33_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr31_hi16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr28_hi16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr29_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr25_hi16
@@ -33032,8 +33019,8 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) {
 ; GFX11-TRUE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
 ; GFX11-TRUE16-NEXT:    s_cbranch_execz .LBB76_2
 ; GFX11-TRUE16-NEXT:  .LBB76_4: ; %cmp.true
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v35.h, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, v35.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v33.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, v33.l, 3
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, v30.h, 3
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.h, v29.h, 3
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v10.h, 0
@@ -33042,10 +33029,10 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) {
 ; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v1.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v1.h
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v10.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v34.l, v0.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v34.h, v0.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v33.h, v1.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, v33.l, v1.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v32.l, v0.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v32.h, v0.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v31.h, v1.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, v31.l, v1.h
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, v27.h, 3
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v10.l, 0x300, v0.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v3.h, 0x300, v0.h
@@ -33108,16 +33095,15 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) {
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v9.l, v10.h
 ; GFX11-TRUE16-NEXT:    v_or_b16 v7.l, v18.l, v6.l
 ; GFX11-TRUE16-NEXT:    v_or_b16 v7.h, v18.h, v6.h
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v8.l, v32.h, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v8.h, v32.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v8.l, v37.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v8.h, v36.l, 3
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v6, v10, v9
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v10.l, 0x300, v7.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v11.h, 0x300, v7.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v7.l, 0xff, v8.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v7.h, 0xff, v8.h
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v8.l, v31.h, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v8.h, v31.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v8.l, v35.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v8.h, v34.l, 3
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v11.l, v10.h
 ; GFX11-TRUE16-NEXT:    v_or_b16 v9.l, v17.l, v7.l
 ; GFX11-TRUE16-NEXT:    v_or_b16 v9.h, v17.h, v7.h
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll
index 35d135b123969..fd62d4087c460 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll
@@ -12446,107 +12446,107 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) {
 ; GFX11-TRUE16:       ; %bb.0:
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-TRUE16-NEXT:    s_clause 0x1f
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v64, off, s32 offset:128
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v31, off, s32 offset:124
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v64, off, s32 offset:120
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v31, off, s32 offset:116
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v65, off, s32 offset:112
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v32, off, s32 offset:108
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v65, off, s32 offset:104
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v32, off, s32 offset:100
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v66, off, s32 offset:96
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v33, off, s32 offset:92
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v66, off, s32 offset:88
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v81, off, s32 offset:132
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v67, off, s32
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v67, off, s32 offset:8
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v68, off, s32 offset:16
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v68, off, s32 offset:24
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v69, off, s32 offset:32
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v69, off, s32 offset:40
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v70, off, s32 offset:48
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v70, off, s32 offset:56
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v71, off, s32 offset:64
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v71, off, s32 offset:72
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v80, off, s32 offset:80
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v33, off, s32 offset:84
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v34, off, s32 offset:76
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v34, off, s32 offset:68
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v35, off, s32 offset:60
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v35, off, s32 offset:52
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v36, off, s32 offset:44
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v36, off, s32 offset:36
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v37, off, s32 offset:28
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v37, off, s32 offset:20
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v80, off, s32 offset:128
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v48, off, s32 offset:124
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v81, off, s32 offset:120
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v49, off, s32 offset:116
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v82, off, s32 offset:112
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v50, off, s32 offset:108
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v83, off, s32 offset:104
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v51, off, s32 offset:100
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v84, off, s32 offset:96
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v52, off, s32 offset:92
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v85, off, s32 offset:88
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v86, off, s32 offset:132
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v87, off, s32
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v96, off, s32 offset:8
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v97, off, s32 offset:16
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v98, off, s32 offset:24
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v99, off, s32 offset:32
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v100, off, s32 offset:40
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v101, off, s32 offset:48
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v102, off, s32 offset:56
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v103, off, s32 offset:64
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v112, off, s32 offset:72
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v113, off, s32 offset:80
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v53, off, s32 offset:84
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v54, off, s32 offset:76
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v55, off, s32 offset:68
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v64, off, s32 offset:60
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v65, off, s32 offset:52
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v66, off, s32 offset:44
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v67, off, s32 offset:36
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v68, off, s32 offset:28
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v69, off, s32 offset:20
 ; GFX11-TRUE16-NEXT:    s_clause 0x1
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v38, off, s32 offset:12
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v38, off, s32 offset:4
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v80.h, v29.l
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v70, off, s32 offset:12
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v71, off, s32 offset:4
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(32)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v48.h, v29.l
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v27.h, v27.l
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v24.h, v22.l
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v26.h, v20.l
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v28.h, v18.l
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v29.l, v16.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v39.l, v14.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v48.h, v12.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v49.l, v10.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v49.h, v8.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v52.h, v6.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v52.l, v4.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v53.h, v2.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v55.h, v0.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v54.h, 8, v1.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v55.l, 8, v3.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v53.l, 8, v5.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v54.l, 8, v7.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v51.l, 8, v9.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v51.h, 8, v11.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v50.l, 8, v13.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v50.h, 8, v15.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v39.h, 8, v17.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v48.l, 8, v19.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v31.l, v14.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v32.h, v12.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v33.l, v10.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v33.h, v8.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v36.h, v6.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v36.l, v4.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v37.h, v2.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v39.h, v0.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v38.h, 8, v1.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v39.l, 8, v3.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v37.l, 8, v5.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v38.l, 8, v7.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v35.l, 8, v9.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v35.h, 8, v11.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v34.l, 8, v13.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v34.h, 8, v15.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v31.h, 8, v17.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v32.l, 8, v19.l
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v29.h, 8, v21.l
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v30.h, 8, v23.l
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v27.l, 8, v25.l
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v27.h, 8, v27.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v25.l, 8, v80.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v25.l, 8, v48.h
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(33)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v16.h, 8, v64.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v16.h, 8, v80.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(31)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v16.l, 8, v64.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v16.l, 8, v81.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(29)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v17.h, 8, v65.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v17.h, 8, v82.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(27)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v17.l, 8, v65.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v17.l, 8, v83.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(25)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v18.h, 8, v66.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v18.h, 8, v84.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(23)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v18.l, 8, v66.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v18.l, 8, v85.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(21)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v25.h, 8, v67.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v25.h, 8, v87.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(20)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v23.l, 8, v67.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v23.l, 8, v96.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(19)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v23.h, 8, v68.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v23.h, 8, v97.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(18)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v22.l, 8, v68.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v22.l, 8, v98.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(17)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v22.h, 8, v69.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v22.h, 8, v99.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(16)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v21.l, 8, v69.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v21.l, 8, v100.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(15)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v21.h, 8, v70.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v21.h, 8, v101.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(14)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v20.l, 8, v70.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v20.l, 8, v102.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(13)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v20.h, 8, v71.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v20.h, 8, v103.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(12)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v19.l, 8, v71.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v19.l, 8, v112.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(11)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v19.h, 8, v80.l
-; GFX11-TRUE16-NEXT:    v_cmpx_ne_u32_e32 0, v81
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v19.h, 8, v113.l
+; GFX11-TRUE16-NEXT:    v_cmpx_ne_u32_e32 0, v86
 ; GFX11-TRUE16-NEXT:    s_xor_b32 s0, exec_lo, s0
 ; GFX11-TRUE16-NEXT:    s_cbranch_execnz .LBB26_3
 ; GFX11-TRUE16-NEXT:  ; %bb.1: ; %Flow
@@ -12557,95 +12557,95 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) {
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ; GFX11-TRUE16-NEXT:  .LBB26_3: ; %cmp.false
-; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v55.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v53.h
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v64.h, 0
-; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v52.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v52.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v64.l, v0.l, v54.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v0.h, v55.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v64.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v3.h, v1.l, v54.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v64.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v49.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v2.h, 0xff, v49.l
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v64, v0
-; GFX11-TRUE16-NEXT:    v_or_b16 v64.l, v1.h, v53.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v39.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v37.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v80.h, 0
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v36.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v36.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v80.l, v0.l, v38.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v0.h, v39.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v80.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v3.h, v1.l, v38.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v80.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v33.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.h, 0xff, v33.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v80, v0
+; GFX11-TRUE16-NEXT:    v_or_b16 v80.l, v1.h, v37.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v4.l, 0xff, v29.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v4.h, 0xff, v28.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v2.h, v2.h, v51.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.h, v2.h, v35.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v5.l, 0xff, v26.h
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v64, v3
-; GFX11-TRUE16-NEXT:    v_or_b16 v64.l, v2.l, v51.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v64.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v3.l, 0xff, v48.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v3.h, 0xff, v39.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v4.h, v4.h, v48.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v80, v3
+; GFX11-TRUE16-NEXT:    v_or_b16 v80.l, v2.l, v35.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v80.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v3.l, 0xff, v32.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v3.h, 0xff, v31.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v4.h, v4.h, v32.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v5.h, 0xff, v24.h
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v2, v64, v2
-; GFX11-TRUE16-NEXT:    v_or_b16 v64.l, v3.l, v50.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v3.h, v3.h, v50.h
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v64.h
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v2, v80, v2
+; GFX11-TRUE16-NEXT:    v_or_b16 v80.l, v3.l, v34.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v3.h, v3.h, v34.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v80.h
 ; GFX11-TRUE16-NEXT:    v_or_b16 v5.h, v5.h, v30.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v6.l, 0xff, v24.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v6.h, 0xff, v26.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v7.l, 0xff, v28.l
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v3, v64, v3
-; GFX11-TRUE16-NEXT:    v_or_b16 v64.l, v4.l, v39.h
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v4.l, v64.h
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v3, v80, v3
+; GFX11-TRUE16-NEXT:    v_or_b16 v80.l, v4.l, v31.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v4.l, v80.h
 ; GFX11-TRUE16-NEXT:    v_or_b16 v6.h, v6.h, v27.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v7.h, 0xff, v30.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    v_and_b16 v8.l, 0xff, v38.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v8.h, 0xff, v38.l
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v4, v64, v4
-; GFX11-TRUE16-NEXT:    v_or_b16 v64.l, v5.l, v29.h
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v64.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v8.l, 0xff, v71.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v8.h, 0xff, v70.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v4, v80, v4
+; GFX11-TRUE16-NEXT:    v_or_b16 v80.l, v5.l, v29.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v80.h
 ; GFX11-TRUE16-NEXT:    v_or_b16 v7.h, v7.h, v25.h
 ; GFX11-TRUE16-NEXT:    v_or_b16 v8.h, v8.h, v23.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v9.l, 0xff, v37.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v9.h, 0xff, v37.l
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v5, v64, v5
-; GFX11-TRUE16-NEXT:    v_or_b16 v64.l, v6.l, v27.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v6.l, v64.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v10.l, 0xff, v36.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v9.l, 0xff, v69.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v9.h, 0xff, v68.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v5, v80, v5
+; GFX11-TRUE16-NEXT:    v_or_b16 v80.l, v6.l, v27.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v6.l, v80.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v10.l, 0xff, v67.l
 ; GFX11-TRUE16-NEXT:    v_or_b16 v9.h, v9.h, v22.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v10.h, 0xff, v36.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v11.l, 0xff, v35.h
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v6, v64, v6
-; GFX11-TRUE16-NEXT:    v_or_b16 v64.l, v7.l, v25.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v7.l, v64.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v10.h, 0xff, v66.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v11.l, 0xff, v65.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v6, v80, v6
+; GFX11-TRUE16-NEXT:    v_or_b16 v80.l, v7.l, v25.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v7.l, v80.h
 ; GFX11-TRUE16-NEXT:    v_or_b16 v10.h, v10.h, v21.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v11.h, 0xff, v35.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v12.l, 0xff, v34.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v12.h, 0xff, v34.l
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v7, v64, v7
-; GFX11-TRUE16-NEXT:    v_or_b16 v64.l, v8.l, v23.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v8.l, v64.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v11.h, 0xff, v64.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v12.l, 0xff, v55.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v12.h, 0xff, v54.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v7, v80, v7
+; GFX11-TRUE16-NEXT:    v_or_b16 v80.l, v8.l, v23.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v8.l, v80.h
 ; GFX11-TRUE16-NEXT:    v_or_b16 v11.h, v11.h, v20.h
 ; GFX11-TRUE16-NEXT:    v_or_b16 v12.h, v12.h, v19.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v13.l, 0xff, v33.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v13.h, 0xff, v33.l
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v8, v64, v8
-; GFX11-TRUE16-NEXT:    v_or_b16 v64.l, v9.l, v22.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v9.l, v64.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v14.l, 0xff, v32.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v13.l, 0xff, v53.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v13.h, 0xff, v52.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v8, v80, v8
+; GFX11-TRUE16-NEXT:    v_or_b16 v80.l, v9.l, v22.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v9.l, v80.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v14.l, 0xff, v51.l
 ; GFX11-TRUE16-NEXT:    v_or_b16 v13.h, v13.h, v18.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v14.h, 0xff, v32.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v15.l, 0xff, v31.h
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v9, v64, v9
-; GFX11-TRUE16-NEXT:    v_or_b16 v64.l, v10.l, v21.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v10.l, v64.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v14.h, 0xff, v50.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v15.l, 0xff, v49.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v9, v80, v9
+; GFX11-TRUE16-NEXT:    v_or_b16 v80.l, v10.l, v21.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v10.l, v80.h
 ; GFX11-TRUE16-NEXT:    v_or_b16 v14.h, v14.h, v17.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v15.h, 0xff, v31.l
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr55_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr53_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr52_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr52_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr49_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr49_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr48_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr39_lo16
+; GFX11-TRUE16-NEXT:    v_and_b16 v15.h, 0xff, v48.l
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr39_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr37_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr36_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr36_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr33_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr33_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr32_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr31_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr29_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr28_hi16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr26_hi16
@@ -12654,32 +12654,32 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) {
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr26_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr28_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr30_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr48
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr49
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr50
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr51
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr52
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr53
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr54
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr55
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr64
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr65
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr66
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr67
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr68
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr69
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr70
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr71
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr38_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr38_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr37_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr39_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr37_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr36_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr36_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr35_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr38_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr35_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr34_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr35_hi16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr34_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr33_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr33_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr32_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr32_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr34_hi16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr31_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr31_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr54_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr55_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr53_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr54_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr51_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr51_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr50_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr50_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr39_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr48_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr32_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr29_hi16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr30_hi16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr27_lo16
@@ -12697,199 +12697,199 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) {
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr18_hi16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr17_hi16
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v10, v64, v10
-; GFX11-TRUE16-NEXT:    v_or_b16 v64.l, v11.l, v20.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v11.l, v64.h
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v10, v80, v10
+; GFX11-TRUE16-NEXT:    v_or_b16 v80.l, v11.l, v20.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v11.l, v80.h
 ; GFX11-TRUE16-NEXT:    v_or_b16 v15.h, v15.h, v16.h
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr20_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr16_hi16
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v11, v64, v11
-; GFX11-TRUE16-NEXT:    v_or_b16 v64.l, v12.l, v19.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v12.l, v64.h
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v11, v80, v11
+; GFX11-TRUE16-NEXT:    v_or_b16 v80.l, v12.l, v19.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v12.l, v80.h
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr19_lo16
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v12, v64, v12
-; GFX11-TRUE16-NEXT:    v_or_b16 v64.l, v13.l, v18.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v13.l, v64.h
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v12, v80, v12
+; GFX11-TRUE16-NEXT:    v_or_b16 v80.l, v13.l, v18.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v13.l, v80.h
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr18_lo16
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v13, v64, v13
-; GFX11-TRUE16-NEXT:    v_or_b16 v64.l, v14.l, v17.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v14.l, v64.h
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v13, v80, v13
+; GFX11-TRUE16-NEXT:    v_or_b16 v80.l, v14.l, v17.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v14.l, v80.h
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr17_lo16
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v14, v64, v14
-; GFX11-TRUE16-NEXT:    v_or_b16 v64.l, v15.l, v16.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v15.l, v64.h
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v14, v80, v14
+; GFX11-TRUE16-NEXT:    v_or_b16 v80.l, v15.l, v16.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v15.l, v80.h
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr16_lo16
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v15, v64, v15
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v15, v80, v15
 ; GFX11-TRUE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
 ; GFX11-TRUE16-NEXT:    s_cbranch_execz .LBB26_2
 ; GFX11-TRUE16-NEXT:  .LBB26_4: ; %cmp.true
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v55.h, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, v53.h, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, v52.h, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.h, v52.l, 3
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v52.h, 0
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v39.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, v37.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, v36.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.h, v36.l, 3
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v36.h, 0
 ; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v0.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v1.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v1.h
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v52.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v54.h, v0.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v55.l, v0.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v54.l, v1.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, v53.l, v1.h
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, v49.h, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v52.l, 0x300, v0.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v36.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v38.h, v0.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v39.l, v0.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v38.l, v1.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, v37.l, v1.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, v33.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v36.l, 0x300, v0.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v3.h, 0x300, v0.h
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.h, v49.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.h, v33.l, 3
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v4.h, 0x300, v1.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, v48.h, 3
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v52, v3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v52.l, 0x300, v1.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, v32.h, 3
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v36, v3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v36.l, 0x300, v1.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v2.h
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.h, v39.l, 3
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v4.l, v52.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v3.l, v51.l, v1.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.h, v31.l, 3
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v4.l, v36.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v3.l, v35.l, v1.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v3.h, v51.h, v1.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v3.h, v35.h, v1.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v2.h, 0xff, v2.h
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v52, v4
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v52.l, 0x300, v3.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v52.h
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v36, v4
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v36.l, 0x300, v3.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v36.h
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v5.h, 0x300, v3.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v3.l, v50.l, v2.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v3.h, v50.h, v2.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v3.l, v34.l, v2.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v3.h, v34.h, v2.h
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v4.l, v29.l, 3
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v4.h, v28.h, 3
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v2, v52, v5
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v52.l, 0x300, v3.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v2, v36, v5
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v36.l, 0x300, v3.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v6.h, 0x300, v3.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v3.l, 0xff, v4.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v3.h, 0xff, v4.h
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v4.l, v26.h, 3
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v4.h, v24.h, 3
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v6.l, v52.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v5.l, v39.h, v3.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v5.h, v48.l, v3.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v6.l, v36.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v5.l, v31.h, v3.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v5.h, v32.l, v3.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v4.l, 0xff, v4.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v4.h, 0xff, v4.h
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v3, v52, v6
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v52.l, 0x300, v5.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v3, v36, v6
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v36.l, 0x300, v5.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v7.h, 0x300, v5.h
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v7.l, v52.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v7.l, v36.h
 ; GFX11-TRUE16-NEXT:    v_or_b16 v5.l, v29.h, v4.l
 ; GFX11-TRUE16-NEXT:    v_or_b16 v5.h, v30.h, v4.h
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v6.l, v24.l, 3
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v6.h, v26.l, 3
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v4, v52, v7
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v52.l, 0x300, v5.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v4, v36, v7
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v36.l, 0x300, v5.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v8.h, 0x300, v5.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v5.l, 0xff, v6.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v5.h, 0xff, v6.h
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v6.l, v28.l, 3
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v6.h, v30.l, 3
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v8.l, v52.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v8.l, v36.h
 ; GFX11-TRUE16-NEXT:    v_or_b16 v7.l, v27.l, v5.l
 ; GFX11-TRUE16-NEXT:    v_or_b16 v7.h, v27.h, v5.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v6.l, 0xff, v6.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v6.h, 0xff, v6.h
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v5, v52, v8
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v52.l, 0x300, v7.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v5, v36, v8
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v36.l, 0x300, v7.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v9.h, 0x300, v7.h
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v9.l, v52.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v9.l, v36.h
 ; GFX11-TRUE16-NEXT:    v_or_b16 v7.l, v25.l, v6.l
 ; GFX11-TRUE16-NEXT:    v_or_b16 v7.h, v25.h, v6.h
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v8.l, v38.h, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v8.h, v38.l, 3
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v6, v52, v9
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v52.l, 0x300, v7.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v8.l, v71.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v8.h, v70.l, 3
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v6, v36, v9
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v36.l, 0x300, v7.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v10.h, 0x300, v7.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v7.l, 0xff, v8.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v7.h, 0xff, v8.h
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v8.l, v37.h, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v8.h, v37.l, 3
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v10.l, v52.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v8.l, v69.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v8.h, v68.l, 3
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v10.l, v36.h
 ; GFX11-TRUE16-NEXT:    v_or_b16 v9.l, v23.l, v7.l
 ; GFX11-TRUE16-NEXT:    v_or_b16 v9.h, v23.h, v7.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v8.l, 0xff, v8.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v8.h, 0xff, v8.h
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v7, v52, v10
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v52.l, 0x300, v9.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v7, v36, v10
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v36.l, 0x300, v9.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v11.h, 0x300, v9.h
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v11.l, v52.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v11.l, v36.h
 ; GFX11-TRUE16-NEXT:    v_or_b16 v9.l, v22.l, v8.l
 ; GFX11-TRUE16-NEXT:    v_or_b16 v9.h, v22.h, v8.h
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v10.l, v36.h, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v10.h, v36.l, 3
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v8, v52, v11
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v52.l, 0x300, v9.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v10.l, v67.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v10.h, v66.l, 3
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v8, v36, v11
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v36.l, 0x300, v9.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v12.h, 0x300, v9.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v9.l, 0xff, v10.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v9.h, 0xff, v10.h
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v10.l, v35.h, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v10.h, v35.l, 3
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v12.l, v52.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v10.l, v65.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v10.h, v64.l, 3
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v12.l, v36.h
 ; GFX11-TRUE16-NEXT:    v_or_b16 v11.l, v21.l, v9.l
 ; GFX11-TRUE16-NEXT:    v_or_b16 v11.h, v21.h, v9.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v10.l, 0xff, v10.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v10.h, 0xff, v10.h
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v9, v52, v12
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v52.l, 0x300, v11.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v9, v36, v12
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v36.l, 0x300, v11.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v13.h, 0x300, v11.h
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v13.l, v52.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v13.l, v36.h
 ; GFX11-TRUE16-NEXT:    v_or_b16 v11.l, v20.l, v10.l
 ; GFX11-TRUE16-NEXT:    v_or_b16 v11.h, v20.h, v10.h
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v12.l, v34.h, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v12.h, v34.l, 3
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v10, v52, v13
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v52.l, 0x300, v11.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v12.l, v55.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v12.h, v54.l, 3
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v10, v36, v13
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v36.l, 0x300, v11.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v14.h, 0x300, v11.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v11.l, 0xff, v12.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v11.h, 0xff, v12.h
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v12.l, v33.h, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v12.h, v33.l, 3
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v14.l, v52.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v12.l, v53.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v12.h, v52.l, 3
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v14.l, v36.h
 ; GFX11-TRUE16-NEXT:    v_or_b16 v13.l, v19.l, v11.l
 ; GFX11-TRUE16-NEXT:    v_or_b16 v13.h, v19.h, v11.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v12.l, 0xff, v12.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v12.h, 0xff, v12.h
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v11, v52, v14
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v52.l, 0x300, v13.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v11, v36, v14
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v36.l, 0x300, v13.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v15.h, 0x300, v13.h
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v15.l, v52.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v15.l, v36.h
 ; GFX11-TRUE16-NEXT:    v_or_b16 v13.l, v18.l, v12.l
 ; GFX11-TRUE16-NEXT:    v_or_b16 v13.h, v18.h, v12.h
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v14.l, v32.h, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v14.h, v32.l, 3
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v12, v52, v15
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v52.l, 0x300, v13.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v14.l, v51.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v14.h, v50.l, 3
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v12, v36, v15
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v36.l, 0x300, v13.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v18.h, 0x300, v13.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v13.l, 0xff, v14.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v13.h, 0xff, v14.h
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v14.l, v31.h, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v14.h, v31.l, 3
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v18.l, v52.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v14.l, v49.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v14.h, v48.l, 3
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v18.l, v36.h
 ; GFX11-TRUE16-NEXT:    v_or_b16 v15.l, v17.l, v13.l
 ; GFX11-TRUE16-NEXT:    v_or_b16 v15.h, v17.h, v13.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v14.l, 0xff, v14.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v14.h, 0xff, v14.h
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v13, v52, v18
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v52.l, 0x300, v15.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v13, v36, v18
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v36.l, 0x300, v15.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v17.h, 0x300, v15.h
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v17.l, v52.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v17.l, v36.h
 ; GFX11-TRUE16-NEXT:    v_or_b16 v15.l, v16.l, v14.l
 ; GFX11-TRUE16-NEXT:    v_or_b16 v15.h, v16.h, v14.h
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v14, v52, v17
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v52.l, 0x300, v15.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v14, v36, v17
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v36.l, 0x300, v15.l
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v15.h, 0x300, v15.h
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v15.l, v52.h
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v15, v52, v15
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v15.l, v36.h
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v15, v36, v15
 ; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -14669,775 +14669,390 @@ define inreg <16 x i32> @bitcast_v64i8_to_v16i32_scalar(<64 x i8> inreg %a, i32
 ; GFX9-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
 ; GFX9-NEXT:    s_branch .LBB27_2
 ;
-; GFX11-TRUE16-LABEL: bitcast_v64i8_to_v16i32_scalar:
-; GFX11-TRUE16:       ; %bb.0:
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v38, v14 :: v_dual_mov_b32 v37, v12
-; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v36, v10 :: v_dual_mov_b32 v35, v8
-; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v34, v6 :: v_dual_mov_b32 v33, v4
-; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v32, v2 :: v_dual_mov_b32 v31, v0
-; GFX11-TRUE16-NEXT:    s_clause 0xf
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v0, off, s32 offset:56
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v39, off, s32 offset:52
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v2, off, s32 offset:60
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v4, off, s32
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v6, off, s32 offset:8
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v8, off, s32 offset:16
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v10, off, s32 offset:24
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v12, off, s32 offset:32
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v14, off, s32 offset:40
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v86, off, s32 offset:48
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v48, off, s32 offset:44
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v49, off, s32 offset:36
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v50, off, s32 offset:28
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v51, off, s32 offset:20
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v52, off, s32 offset:12
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v53, off, s32 offset:4
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v83, 8, v1
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v84, 8, v3
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v85, 8, v5
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v70, 8, v7
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v71, 8, v9
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v80, 8, v11
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v81, 8, v13
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v82, 8, v15
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v65, 8, v17
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v66, 8, v19
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v67, 8, v21
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v68, 8, v23
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v69, 8, v25
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v27, 8, v27
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v29, 8, v29
-; GFX11-TRUE16-NEXT:    s_mov_b32 s4, 0
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(15)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v25, 8, v0
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(13)
-; GFX11-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v2
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(12)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v54, 8, v4
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(11)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v55, 8, v6
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(10)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v64, 8, v8
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(9)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v17, 8, v10
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(8)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v19, 8, v12
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(7)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v21, 8, v14
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(6)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v23, 8, v86
-; GFX11-TRUE16-NEXT:    s_and_b32 s5, vcc_lo, exec_lo
-; GFX11-TRUE16-NEXT:    s_cbranch_scc0 .LBB27_4
-; GFX11-TRUE16-NEXT:  ; %bb.1: ; %cmp.false
-; GFX11-TRUE16-NEXT:    s_and_b32 s5, s0, 0xff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s6, s1, 8
-; GFX11-TRUE16-NEXT:    s_and_b32 s7, s2, 0xff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s8, s3, 8
-; GFX11-TRUE16-NEXT:    s_or_b32 s5, s5, s6
-; GFX11-TRUE16-NEXT:    s_or_b32 s6, s7, s8
-; GFX11-TRUE16-NEXT:    s_and_b32 s5, s5, 0xffff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s6, s6, 16
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s7, s17, 8
-; GFX11-TRUE16-NEXT:    s_or_b32 s5, s5, s6
-; GFX11-TRUE16-NEXT:    s_and_b32 s6, s16, 0xff
-; GFX11-TRUE16-NEXT:    s_and_b32 s8, s18, 0xff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s9, s19, 8
-; GFX11-TRUE16-NEXT:    s_or_b32 s6, s6, s7
-; GFX11-TRUE16-NEXT:    s_or_b32 s7, s8, s9
-; GFX11-TRUE16-NEXT:    s_and_b32 s6, s6, 0xffff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s7, s7, 16
-; GFX11-TRUE16-NEXT:    s_and_b32 s8, s20, 0xff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s9, s21, 8
-; GFX11-TRUE16-NEXT:    s_or_b32 s6, s6, s7
-; GFX11-TRUE16-NEXT:    s_or_b32 s7, s8, s9
-; GFX11-TRUE16-NEXT:    s_and_b32 s8, s22, 0xff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s9, s23, 8
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s10, s25, 8
-; GFX11-TRUE16-NEXT:    s_or_b32 s8, s8, s9
-; GFX11-TRUE16-NEXT:    s_and_b32 s9, s24, 0xff
-; GFX11-TRUE16-NEXT:    s_and_b32 s7, s7, 0xffff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s8, s8, 16
-; GFX11-TRUE16-NEXT:    s_or_b32 s9, s9, s10
-; GFX11-TRUE16-NEXT:    s_or_b32 s7, s7, s8
-; GFX11-TRUE16-NEXT:    s_and_b32 s8, s9, 0xffff
-; GFX11-TRUE16-NEXT:    s_and_b32 s9, s26, 0xff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s10, s27, 8
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v31
-; GFX11-TRUE16-NEXT:    s_or_b32 s9, s9, s10
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff, v32
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s9, s9, 16
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v6, 0xff, v38
-; GFX11-TRUE16-NEXT:    s_or_b32 s8, s8, s9
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v2, 0xff, v33
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v0, v83
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v1, v84
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v10, 0xff, v22
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v11, 0xff, v24
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v2, v2, v85
-; GFX11-TRUE16-NEXT:    s_and_b32 s11, s28, 0xff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s12, s29, 8
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v6, v6, v82
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v10, v10, v68
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v11, v11, v69
-; GFX11-TRUE16-NEXT:    s_or_b32 s10, s11, s12
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v5, v1, v2
-; GFX11-TRUE16-NEXT:    s_and_b32 s10, s10, 0xffff
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff, v35
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v4, s10, v0
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v34
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v2, 0xff, v36
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v3, 0xff, v37
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v8, 0xffff, v6
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v6, 0xff, v16
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v7, 0xff, v18
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v9, 0xff, v20
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v10, 0xffff, v10
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v0, v70
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v1, v71
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v2, v2, v80
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v3, v3, v81
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v6, v6, v65
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v7, v7, v66
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v9, v9, v67
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v10, v10, v11
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(1)
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v11, 0xff, v52
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v6
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v13, 0xffff, v7
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v11, v11, v64
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v6, v0, v1
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v7, v2, v3
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v8, v8, v12
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v9, v13, v9
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v26
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff, v28
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v2, 0xff, v30
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v3, 0xff, v53
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v13, 0xffff, v11
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v11, 0xff, v51
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v12, 0xff, v50
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v14, 0xff, v49
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v15, 0xff, v48
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v86, 0xff, v39
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v0, v27
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v1, v29
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v2, v2, v54
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v3, v3, v55
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v11, v11, v17
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v12, v12, v19
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v14, v14, v21
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v15, v15, v23
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v86, v86, v25
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v87, 16, v11
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v96, 0xffff, v12
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v15, 0xffff, v15
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v86, 16, v86
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v11, v0, v1
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v12, v2, v3
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v13, v13, v87
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v14, v96, v14
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v15, v15, v86
-; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, s6
-; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v2, s7 :: v_dual_mov_b32 v3, s8
-; GFX11-TRUE16-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s4
-; GFX11-TRUE16-NEXT:    s_cbranch_vccnz .LBB27_3
-; GFX11-TRUE16-NEXT:  .LBB27_2: ; %cmp.true
-; GFX11-TRUE16-NEXT:    s_add_i32 s0, s0, 3
-; GFX11-TRUE16-NEXT:    s_add_i32 s2, s2, 3
-; GFX11-TRUE16-NEXT:    s_and_b32 s0, s0, 0xff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s1, s1, 8
-; GFX11-TRUE16-NEXT:    s_and_b32 s2, s2, 0xff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s3, s3, 8
-; GFX11-TRUE16-NEXT:    s_or_b32 s0, s1, s0
-; GFX11-TRUE16-NEXT:    s_or_b32 s1, s3, s2
-; GFX11-TRUE16-NEXT:    s_addk_i32 s0, 0x300
-; GFX11-TRUE16-NEXT:    s_addk_i32 s1, 0x300
-; GFX11-TRUE16-NEXT:    s_and_b32 s0, s0, 0xffff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s1, s1, 16
-; GFX11-TRUE16-NEXT:    s_add_i32 s16, s16, 3
-; GFX11-TRUE16-NEXT:    s_or_b32 s0, s0, s1
-; GFX11-TRUE16-NEXT:    s_and_b32 s1, s16, 0xff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s2, s17, 8
-; GFX11-TRUE16-NEXT:    s_add_i32 s18, s18, 3
-; GFX11-TRUE16-NEXT:    s_or_b32 s1, s2, s1
-; GFX11-TRUE16-NEXT:    s_and_b32 s2, s18, 0xff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s3, s19, 8
-; GFX11-TRUE16-NEXT:    s_addk_i32 s1, 0x300
-; GFX11-TRUE16-NEXT:    s_or_b32 s2, s3, s2
-; GFX11-TRUE16-NEXT:    s_add_i32 s20, s20, 3
-; GFX11-TRUE16-NEXT:    s_addk_i32 s2, 0x300
-; GFX11-TRUE16-NEXT:    s_and_b32 s1, s1, 0xffff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s2, s2, 16
-; GFX11-TRUE16-NEXT:    s_and_b32 s3, s20, 0xff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s4, s21, 8
-; GFX11-TRUE16-NEXT:    s_add_i32 s22, s22, 3
-; GFX11-TRUE16-NEXT:    s_or_b32 s1, s1, s2
-; GFX11-TRUE16-NEXT:    s_or_b32 s2, s4, s3
-; GFX11-TRUE16-NEXT:    s_and_b32 s3, s22, 0xff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s4, s23, 8
-; GFX11-TRUE16-NEXT:    s_add_i32 s24, s24, 3
-; GFX11-TRUE16-NEXT:    s_or_b32 s3, s4, s3
-; GFX11-TRUE16-NEXT:    s_and_b32 s4, s24, 0xff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s5, s25, 8
-; GFX11-TRUE16-NEXT:    s_addk_i32 s2, 0x300
-; GFX11-TRUE16-NEXT:    s_addk_i32 s3, 0x300
-; GFX11-TRUE16-NEXT:    s_or_b32 s4, s5, s4
-; GFX11-TRUE16-NEXT:    s_and_b32 s2, s2, 0xffff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s3, s3, 16
-; GFX11-TRUE16-NEXT:    s_addk_i32 s4, 0x300
-; GFX11-TRUE16-NEXT:    s_add_i32 s26, s26, 3
-; GFX11-TRUE16-NEXT:    s_or_b32 s2, s2, s3
-; GFX11-TRUE16-NEXT:    s_and_b32 s3, s4, 0xffff
-; GFX11-TRUE16-NEXT:    s_and_b32 s4, s26, 0xff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s5, s27, 8
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v0, 3, v31
-; GFX11-TRUE16-NEXT:    s_or_b32 s4, s5, s4
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v1, 3, v32
-; GFX11-TRUE16-NEXT:    s_addk_i32 s4, 0x300
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v6, 3, v38
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s4, s4, 16
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT:    s_or_b32 s3, s3, s4
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v2, 3, v33
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v10, 3, v22
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v11, 3, v24
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v83, v0
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v84, v1
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v6, 0xff, v6
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v10, 0xff, v10
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v11, 0xff, v11
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v2, v85, v2
-; GFX11-TRUE16-NEXT:    s_add_i32 s28, s28, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v0, 0x300, v0
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v1, 0x300, v1
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v6, v82, v6
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v2, 0x300, v2
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v10, v68, v10
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v11, v69, v11
-; GFX11-TRUE16-NEXT:    s_and_b32 s6, s28, 0xff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s7, s29, 8
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX11-TRUE16-NEXT:    s_or_b32 s5, s7, s6
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v6, 0x300, v6
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v10, 0x300, v10
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v11, 0x300, v11
-; GFX11-TRUE16-NEXT:    s_addk_i32 s5, 0x300
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v5, v1, v2
-; GFX11-TRUE16-NEXT:    s_and_b32 s5, s5, 0xffff
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v1, 3, v35
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v4, s5, v0
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v0, 3, v34
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v2, 3, v36
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v3, 3, v37
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v8, 0xffff, v6
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v6, 3, v16
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v7, 3, v18
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v9, 3, v20
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v10, 0xffff, v10
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v6, 0xff, v6
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v7, 0xff, v7
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v9, 0xff, v9
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v10, v10, v11
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(1)
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v11, 3, v52
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v70, v0
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v71, v1
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v2, v80, v2
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v3, v81, v3
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v6, v65, v6
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v7, v66, v7
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v9, v67, v9
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v11, 0xff, v11
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v0, 0x300, v0
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v1, 0x300, v1
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v2, 0x300, v2
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v3, 0x300, v3
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v6, 0x300, v6
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v7, 0x300, v7
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v9, 0x300, v9
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v11, v64, v11
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v6
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v13, 0xffff, v7
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v11, 0x300, v11
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v6, v0, v1
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v7, v2, v3
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v8, v8, v12
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v9, v13, v9
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v0, 3, v26
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v1, 3, v28
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v2, 3, v30
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v3, 3, v53
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v13, 0xffff, v11
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v11, 3, v51
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v12, 3, v50
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v14, 3, v49
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v15, 3, v48
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v16, 3, v39
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v11, 0xff, v11
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v12, 0xff, v12
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v14, 0xff, v14
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v15, 0xff, v15
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v16, 0xff, v16
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v27, v0
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v29, v1
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v2, v54, v2
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v3, v55, v3
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v11, v17, v11
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v12, v19, v12
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v14, v21, v14
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v15, v23, v15
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v16, v25, v16
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v0, 0x300, v0
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v1, 0x300, v1
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v2, 0x300, v2
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v3, 0x300, v3
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v11, 0x300, v11
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v12, 0x300, v12
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v14, 0x300, v14
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v15, 0x300, v15
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v16, 0x300, v16
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v17, 16, v11
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v18, 0xffff, v12
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v15, 0xffff, v15
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v16, 16, v16
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v11, v0, v1
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v12, v2, v3
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v13, v13, v17
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v14, v18, v14
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v15, v15, v16
-; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX11-TRUE16-NEXT:  .LBB27_3: ; %end
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
-; GFX11-TRUE16-NEXT:  .LBB27_4:
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
-; GFX11-TRUE16-NEXT:    s_branch .LBB27_2
-;
-; GFX11-FAKE16-LABEL: bitcast_v64i8_to_v16i32_scalar:
-; GFX11-FAKE16:       ; %bb.0:
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v38, v14 :: v_dual_mov_b32 v37, v12
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v36, v10 :: v_dual_mov_b32 v35, v8
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v34, v6 :: v_dual_mov_b32 v33, v4
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v32, v2 :: v_dual_mov_b32 v31, v0
-; GFX11-FAKE16-NEXT:    s_clause 0xf
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v0, off, s32 offset:56
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v39, off, s32 offset:52
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v2, off, s32 offset:60
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v4, off, s32
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v6, off, s32 offset:8
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v8, off, s32 offset:16
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v10, off, s32 offset:24
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v12, off, s32 offset:32
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v14, off, s32 offset:40
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v86, off, s32 offset:48
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v48, off, s32 offset:44
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v49, off, s32 offset:36
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v50, off, s32 offset:28
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v51, off, s32 offset:20
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v52, off, s32 offset:12
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v53, off, s32 offset:4
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v83, 8, v1
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v84, 8, v3
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v85, 8, v5
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v70, 8, v7
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v71, 8, v9
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v80, 8, v11
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v81, 8, v13
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v82, 8, v15
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v65, 8, v17
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v66, 8, v19
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v67, 8, v21
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v68, 8, v23
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v69, 8, v25
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v27, 8, v27
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v29, 8, v29
-; GFX11-FAKE16-NEXT:    s_mov_b32 s4, 0
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(15)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 8, v0
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(13)
-; GFX11-FAKE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v2
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(12)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v54, 8, v4
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(11)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v55, 8, v6
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(10)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v64, 8, v8
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(9)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v17, 8, v10
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(8)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v19, 8, v12
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(7)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v21, 8, v14
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(6)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v23, 8, v86
-; GFX11-FAKE16-NEXT:    s_and_b32 s5, vcc_lo, exec_lo
-; GFX11-FAKE16-NEXT:    s_cbranch_scc0 .LBB27_4
-; GFX11-FAKE16-NEXT:  ; %bb.1: ; %cmp.false
-; GFX11-FAKE16-NEXT:    s_and_b32 s5, s0, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s6, s1, 8
-; GFX11-FAKE16-NEXT:    s_and_b32 s7, s2, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s8, s3, 8
-; GFX11-FAKE16-NEXT:    s_or_b32 s5, s5, s6
-; GFX11-FAKE16-NEXT:    s_or_b32 s6, s7, s8
-; GFX11-FAKE16-NEXT:    s_and_b32 s5, s5, 0xffff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s6, s6, 16
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s7, s17, 8
-; GFX11-FAKE16-NEXT:    s_or_b32 s5, s5, s6
-; GFX11-FAKE16-NEXT:    s_and_b32 s6, s16, 0xff
-; GFX11-FAKE16-NEXT:    s_and_b32 s8, s18, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s9, s19, 8
-; GFX11-FAKE16-NEXT:    s_or_b32 s6, s6, s7
-; GFX11-FAKE16-NEXT:    s_or_b32 s7, s8, s9
-; GFX11-FAKE16-NEXT:    s_and_b32 s6, s6, 0xffff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s7, s7, 16
-; GFX11-FAKE16-NEXT:    s_and_b32 s8, s20, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s9, s21, 8
-; GFX11-FAKE16-NEXT:    s_or_b32 s6, s6, s7
-; GFX11-FAKE16-NEXT:    s_or_b32 s7, s8, s9
-; GFX11-FAKE16-NEXT:    s_and_b32 s8, s22, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s9, s23, 8
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s10, s25, 8
-; GFX11-FAKE16-NEXT:    s_or_b32 s8, s8, s9
-; GFX11-FAKE16-NEXT:    s_and_b32 s9, s24, 0xff
-; GFX11-FAKE16-NEXT:    s_and_b32 s7, s7, 0xffff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s8, s8, 16
-; GFX11-FAKE16-NEXT:    s_or_b32 s9, s9, s10
-; GFX11-FAKE16-NEXT:    s_or_b32 s7, s7, s8
-; GFX11-FAKE16-NEXT:    s_and_b32 s8, s9, 0xffff
-; GFX11-FAKE16-NEXT:    s_and_b32 s9, s26, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s10, s27, 8
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v31
-; GFX11-FAKE16-NEXT:    s_or_b32 s9, s9, s10
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v32
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s9, s9, 16
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xff, v38
-; GFX11-FAKE16-NEXT:    s_or_b32 s8, s8, s9
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v33
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v83
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v84
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v10, 0xff, v22
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v11, 0xff, v24
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v2, v85
-; GFX11-FAKE16-NEXT:    s_and_b32 s11, s28, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s12, s29, 8
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v6, v6, v82
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v10, v10, v68
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v11, v11, v69
-; GFX11-FAKE16-NEXT:    s_or_b32 s10, s11, s12
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v1, v2
-; GFX11-FAKE16-NEXT:    s_and_b32 s10, s10, 0xffff
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v35
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, s10, v0
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v34
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v36
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v37
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 0xffff, v6
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xff, v16
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xff, v18
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v9, 0xff, v20
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v10, 0xffff, v10
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v70
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v71
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v2, v80
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v3, v81
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v6, v6, v65
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v7, v7, v66
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v9, v9, v67
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v10, v10, v11
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(1)
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v11, 0xff, v52
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v6
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v13, 0xffff, v7
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v11, v11, v64
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v6, v0, v1
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v7, v2, v3
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v8, v8, v12
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v9, v13, v9
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v26
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v28
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v30
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v53
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v13, 0xffff, v11
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v11, 0xff, v51
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v12, 0xff, v50
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v14, 0xff, v49
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v15, 0xff, v48
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v86, 0xff, v39
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v27
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v29
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v2, v54
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v3, v55
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v11, v11, v17
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v12, v12, v19
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v14, v14, v21
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v15, v15, v23
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v86, v86, v25
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v87, 16, v11
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v96, 0xffff, v12
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v15, 0xffff, v15
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v86, 16, v86
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v11, v0, v1
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v12, v2, v3
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v13, v13, v87
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v14, v96, v14
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v15, v15, v86
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, s6
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v2, s7 :: v_dual_mov_b32 v3, s8
-; GFX11-FAKE16-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s4
-; GFX11-FAKE16-NEXT:    s_cbranch_vccnz .LBB27_3
-; GFX11-FAKE16-NEXT:  .LBB27_2: ; %cmp.true
-; GFX11-FAKE16-NEXT:    s_add_i32 s0, s0, 3
-; GFX11-FAKE16-NEXT:    s_add_i32 s2, s2, 3
-; GFX11-FAKE16-NEXT:    s_and_b32 s0, s0, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s1, s1, 8
-; GFX11-FAKE16-NEXT:    s_and_b32 s2, s2, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s3, s3, 8
-; GFX11-FAKE16-NEXT:    s_or_b32 s0, s1, s0
-; GFX11-FAKE16-NEXT:    s_or_b32 s1, s3, s2
-; GFX11-FAKE16-NEXT:    s_addk_i32 s0, 0x300
-; GFX11-FAKE16-NEXT:    s_addk_i32 s1, 0x300
-; GFX11-FAKE16-NEXT:    s_and_b32 s0, s0, 0xffff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s1, s1, 16
-; GFX11-FAKE16-NEXT:    s_add_i32 s16, s16, 3
-; GFX11-FAKE16-NEXT:    s_or_b32 s0, s0, s1
-; GFX11-FAKE16-NEXT:    s_and_b32 s1, s16, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s2, s17, 8
-; GFX11-FAKE16-NEXT:    s_add_i32 s18, s18, 3
-; GFX11-FAKE16-NEXT:    s_or_b32 s1, s2, s1
-; GFX11-FAKE16-NEXT:    s_and_b32 s2, s18, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s3, s19, 8
-; GFX11-FAKE16-NEXT:    s_addk_i32 s1, 0x300
-; GFX11-FAKE16-NEXT:    s_or_b32 s2, s3, s2
-; GFX11-FAKE16-NEXT:    s_add_i32 s20, s20, 3
-; GFX11-FAKE16-NEXT:    s_addk_i32 s2, 0x300
-; GFX11-FAKE16-NEXT:    s_and_b32 s1, s1, 0xffff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s2, s2, 16
-; GFX11-FAKE16-NEXT:    s_and_b32 s3, s20, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s4, s21, 8
-; GFX11-FAKE16-NEXT:    s_add_i32 s22, s22, 3
-; GFX11-FAKE16-NEXT:    s_or_b32 s1, s1, s2
-; GFX11-FAKE16-NEXT:    s_or_b32 s2, s4, s3
-; GFX11-FAKE16-NEXT:    s_and_b32 s3, s22, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s4, s23, 8
-; GFX11-FAKE16-NEXT:    s_add_i32 s24, s24, 3
-; GFX11-FAKE16-NEXT:    s_or_b32 s3, s4, s3
-; GFX11-FAKE16-NEXT:    s_and_b32 s4, s24, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s5, s25, 8
-; GFX11-FAKE16-NEXT:    s_addk_i32 s2, 0x300
-; GFX11-FAKE16-NEXT:    s_addk_i32 s3, 0x300
-; GFX11-FAKE16-NEXT:    s_or_b32 s4, s5, s4
-; GFX11-FAKE16-NEXT:    s_and_b32 s2, s2, 0xffff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s3, s3, 16
-; GFX11-FAKE16-NEXT:    s_addk_i32 s4, 0x300
-; GFX11-FAKE16-NEXT:    s_add_i32 s26, s26, 3
-; GFX11-FAKE16-NEXT:    s_or_b32 s2, s2, s3
-; GFX11-FAKE16-NEXT:    s_and_b32 s3, s4, 0xffff
-; GFX11-FAKE16-NEXT:    s_and_b32 s4, s26, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s5, s27, 8
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v0, 3, v31
-; GFX11-FAKE16-NEXT:    s_or_b32 s4, s5, s4
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v1, 3, v32
-; GFX11-FAKE16-NEXT:    s_addk_i32 s4, 0x300
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v6, 3, v38
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s4, s4, 16
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-FAKE16-NEXT:    s_or_b32 s3, s3, s4
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v2, 3, v33
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v10, 3, v22
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v11, 3, v24
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v83, v0
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v84, v1
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xff, v6
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v10, 0xff, v10
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v11, 0xff, v11
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v85, v2
-; GFX11-FAKE16-NEXT:    s_add_i32 s28, s28, 3
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v0, 0x300, v0
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v1, 0x300, v1
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v6, v82, v6
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v2, 0x300, v2
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v10, v68, v10
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v11, v69, v11
-; GFX11-FAKE16-NEXT:    s_and_b32 s6, s28, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s7, s29, 8
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX11-FAKE16-NEXT:    s_or_b32 s5, s7, s6
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v6, 0x300, v6
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v10, 0x300, v10
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v11, 0x300, v11
-; GFX11-FAKE16-NEXT:    s_addk_i32 s5, 0x300
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v1, v2
-; GFX11-FAKE16-NEXT:    s_and_b32 s5, s5, 0xffff
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v1, 3, v35
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, s5, v0
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v0, 3, v34
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v2, 3, v36
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v3, 3, v37
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 0xffff, v6
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v6, 3, v16
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v7, 3, v18
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v9, 3, v20
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v10, 0xffff, v10
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xff, v6
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xff, v7
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v9, 0xff, v9
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v10, v10, v11
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(1)
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v11, 3, v52
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v70, v0
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v71, v1
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v80, v2
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v81, v3
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v6, v65, v6
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v7, v66, v7
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v9, v67, v9
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v11, 0xff, v11
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v0, 0x300, v0
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v1, 0x300, v1
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v2, 0x300, v2
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v3, 0x300, v3
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v6, 0x300, v6
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v7, 0x300, v7
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v9, 0x300, v9
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v11, v64, v11
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v6
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v13, 0xffff, v7
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v11, 0x300, v11
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v6, v0, v1
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v7, v2, v3
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v8, v8, v12
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v9, v13, v9
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v0, 3, v26
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v1, 3, v28
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v2, 3, v30
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v3, 3, v53
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v13, 0xffff, v11
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v11, 3, v51
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v12, 3, v50
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v14, 3, v49
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v15, 3, v48
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v16, 3, v39
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v11, 0xff, v11
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v12, 0xff, v12
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v14, 0xff, v14
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v15, 0xff, v15
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v16, 0xff, v16
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v27, v0
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v29, v1
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v54, v2
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v55, v3
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v11, v17, v11
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v12, v19, v12
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v14, v21, v14
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v15, v23, v15
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v16, v25, v16
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v0, 0x300, v0
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v1, 0x300, v1
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v2, 0x300, v2
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v3, 0x300, v3
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v11, 0x300, v11
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v12, 0x300, v12
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v14, 0x300, v14
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v15, 0x300, v15
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v16, 0x300, v16
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v17, 16, v11
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v18, 0xffff, v12
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v15, 0xffff, v15
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v16, 16, v16
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v11, v0, v1
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v12, v2, v3
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v13, v13, v17
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v14, v18, v14
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v15, v15, v16
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX11-FAKE16-NEXT:  .LBB27_3: ; %end
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
-; GFX11-FAKE16-NEXT:  .LBB27_4:
-; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
-; GFX11-FAKE16-NEXT:    s_branch .LBB27_2
+; GFX11-LABEL: bitcast_v64i8_to_v16i32_scalar:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_dual_mov_b32 v38, v14 :: v_dual_mov_b32 v37, v12
+; GFX11-NEXT:    v_dual_mov_b32 v36, v10 :: v_dual_mov_b32 v35, v8
+; GFX11-NEXT:    v_dual_mov_b32 v34, v6 :: v_dual_mov_b32 v33, v4
+; GFX11-NEXT:    v_dual_mov_b32 v32, v2 :: v_dual_mov_b32 v31, v0
+; GFX11-NEXT:    s_clause 0xf
+; GFX11-NEXT:    scratch_load_u16 v0, off, s32 offset:56
+; GFX11-NEXT:    scratch_load_u16 v39, off, s32 offset:52
+; GFX11-NEXT:    scratch_load_b32 v2, off, s32 offset:60
+; GFX11-NEXT:    scratch_load_u16 v4, off, s32
+; GFX11-NEXT:    scratch_load_u16 v6, off, s32 offset:8
+; GFX11-NEXT:    scratch_load_u16 v8, off, s32 offset:16
+; GFX11-NEXT:    scratch_load_u16 v10, off, s32 offset:24
+; GFX11-NEXT:    scratch_load_u16 v12, off, s32 offset:32
+; GFX11-NEXT:    scratch_load_u16 v14, off, s32 offset:40
+; GFX11-NEXT:    scratch_load_u16 v86, off, s32 offset:48
+; GFX11-NEXT:    scratch_load_u16 v48, off, s32 offset:44
+; GFX11-NEXT:    scratch_load_u16 v49, off, s32 offset:36
+; GFX11-NEXT:    scratch_load_u16 v50, off, s32 offset:28
+; GFX11-NEXT:    scratch_load_u16 v51, off, s32 offset:20
+; GFX11-NEXT:    scratch_load_u16 v52, off, s32 offset:12
+; GFX11-NEXT:    scratch_load_u16 v53, off, s32 offset:4
+; GFX11-NEXT:    v_lshlrev_b32_e32 v83, 8, v1
+; GFX11-NEXT:    v_lshlrev_b32_e32 v84, 8, v3
+; GFX11-NEXT:    v_lshlrev_b32_e32 v85, 8, v5
+; GFX11-NEXT:    v_lshlrev_b32_e32 v70, 8, v7
+; GFX11-NEXT:    v_lshlrev_b32_e32 v71, 8, v9
+; GFX11-NEXT:    v_lshlrev_b32_e32 v80, 8, v11
+; GFX11-NEXT:    v_lshlrev_b32_e32 v81, 8, v13
+; GFX11-NEXT:    v_lshlrev_b32_e32 v82, 8, v15
+; GFX11-NEXT:    v_lshlrev_b32_e32 v65, 8, v17
+; GFX11-NEXT:    v_lshlrev_b32_e32 v66, 8, v19
+; GFX11-NEXT:    v_lshlrev_b32_e32 v67, 8, v21
+; GFX11-NEXT:    v_lshlrev_b32_e32 v68, 8, v23
+; GFX11-NEXT:    v_lshlrev_b32_e32 v69, 8, v25
+; GFX11-NEXT:    v_lshlrev_b32_e32 v27, 8, v27
+; GFX11-NEXT:    v_lshlrev_b32_e32 v29, 8, v29
+; GFX11-NEXT:    s_mov_b32 s4, 0
+; GFX11-NEXT:    s_waitcnt vmcnt(15)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v25, 8, v0
+; GFX11-NEXT:    s_waitcnt vmcnt(13)
+; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v2
+; GFX11-NEXT:    s_waitcnt vmcnt(12)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v54, 8, v4
+; GFX11-NEXT:    s_waitcnt vmcnt(11)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v55, 8, v6
+; GFX11-NEXT:    s_waitcnt vmcnt(10)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v64, 8, v8
+; GFX11-NEXT:    s_waitcnt vmcnt(9)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v17, 8, v10
+; GFX11-NEXT:    s_waitcnt vmcnt(8)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v19, 8, v12
+; GFX11-NEXT:    s_waitcnt vmcnt(7)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v21, 8, v14
+; GFX11-NEXT:    s_waitcnt vmcnt(6)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v23, 8, v86
+; GFX11-NEXT:    s_and_b32 s5, vcc_lo, exec_lo
+; GFX11-NEXT:    s_cbranch_scc0 .LBB27_4
+; GFX11-NEXT:  ; %bb.1: ; %cmp.false
+; GFX11-NEXT:    s_and_b32 s5, s0, 0xff
+; GFX11-NEXT:    s_lshl_b32 s6, s1, 8
+; GFX11-NEXT:    s_and_b32 s7, s2, 0xff
+; GFX11-NEXT:    s_lshl_b32 s8, s3, 8
+; GFX11-NEXT:    s_or_b32 s5, s5, s6
+; GFX11-NEXT:    s_or_b32 s6, s7, s8
+; GFX11-NEXT:    s_and_b32 s5, s5, 0xffff
+; GFX11-NEXT:    s_lshl_b32 s6, s6, 16
+; GFX11-NEXT:    s_lshl_b32 s7, s17, 8
+; GFX11-NEXT:    s_or_b32 s5, s5, s6
+; GFX11-NEXT:    s_and_b32 s6, s16, 0xff
+; GFX11-NEXT:    s_and_b32 s8, s18, 0xff
+; GFX11-NEXT:    s_lshl_b32 s9, s19, 8
+; GFX11-NEXT:    s_or_b32 s6, s6, s7
+; GFX11-NEXT:    s_or_b32 s7, s8, s9
+; GFX11-NEXT:    s_and_b32 s6, s6, 0xffff
+; GFX11-NEXT:    s_lshl_b32 s7, s7, 16
+; GFX11-NEXT:    s_and_b32 s8, s20, 0xff
+; GFX11-NEXT:    s_lshl_b32 s9, s21, 8
+; GFX11-NEXT:    s_or_b32 s6, s6, s7
+; GFX11-NEXT:    s_or_b32 s7, s8, s9
+; GFX11-NEXT:    s_and_b32 s8, s22, 0xff
+; GFX11-NEXT:    s_lshl_b32 s9, s23, 8
+; GFX11-NEXT:    s_lshl_b32 s10, s25, 8
+; GFX11-NEXT:    s_or_b32 s8, s8, s9
+; GFX11-NEXT:    s_and_b32 s9, s24, 0xff
+; GFX11-NEXT:    s_and_b32 s7, s7, 0xffff
+; GFX11-NEXT:    s_lshl_b32 s8, s8, 16
+; GFX11-NEXT:    s_or_b32 s9, s9, s10
+; GFX11-NEXT:    s_or_b32 s7, s7, s8
+; GFX11-NEXT:    s_and_b32 s8, s9, 0xffff
+; GFX11-NEXT:    s_and_b32 s9, s26, 0xff
+; GFX11-NEXT:    s_lshl_b32 s10, s27, 8
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v31
+; GFX11-NEXT:    s_or_b32 s9, s9, s10
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v32
+; GFX11-NEXT:    s_lshl_b32 s9, s9, 16
+; GFX11-NEXT:    v_and_b32_e32 v6, 0xff, v38
+; GFX11-NEXT:    s_or_b32 s8, s8, s9
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v33
+; GFX11-NEXT:    v_or_b32_e32 v0, v0, v83
+; GFX11-NEXT:    v_or_b32_e32 v1, v1, v84
+; GFX11-NEXT:    v_and_b32_e32 v10, 0xff, v22
+; GFX11-NEXT:    v_and_b32_e32 v11, 0xff, v24
+; GFX11-NEXT:    v_or_b32_e32 v2, v2, v85
+; GFX11-NEXT:    s_and_b32 s11, s28, 0xff
+; GFX11-NEXT:    s_lshl_b32 s12, s29, 8
+; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-NEXT:    v_or_b32_e32 v6, v6, v82
+; GFX11-NEXT:    v_or_b32_e32 v10, v10, v68
+; GFX11-NEXT:    v_or_b32_e32 v11, v11, v69
+; GFX11-NEXT:    s_or_b32 s10, s11, s12
+; GFX11-NEXT:    v_or_b32_e32 v5, v1, v2
+; GFX11-NEXT:    s_and_b32 s10, s10, 0xffff
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v35
+; GFX11-NEXT:    v_or_b32_e32 v4, s10, v0
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v34
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v36
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v37
+; GFX11-NEXT:    v_and_b32_e32 v8, 0xffff, v6
+; GFX11-NEXT:    v_and_b32_e32 v6, 0xff, v16
+; GFX11-NEXT:    v_and_b32_e32 v7, 0xff, v18
+; GFX11-NEXT:    v_and_b32_e32 v9, 0xff, v20
+; GFX11-NEXT:    v_and_b32_e32 v10, 0xffff, v10
+; GFX11-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
+; GFX11-NEXT:    v_or_b32_e32 v0, v0, v70
+; GFX11-NEXT:    v_or_b32_e32 v1, v1, v71
+; GFX11-NEXT:    v_or_b32_e32 v2, v2, v80
+; GFX11-NEXT:    v_or_b32_e32 v3, v3, v81
+; GFX11-NEXT:    v_or_b32_e32 v6, v6, v65
+; GFX11-NEXT:    v_or_b32_e32 v7, v7, v66
+; GFX11-NEXT:    v_or_b32_e32 v9, v9, v67
+; GFX11-NEXT:    v_or_b32_e32 v10, v10, v11
+; GFX11-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-NEXT:    v_and_b32_e32 v11, 0xff, v52
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-NEXT:    v_lshlrev_b32_e32 v12, 16, v6
+; GFX11-NEXT:    v_and_b32_e32 v13, 0xffff, v7
+; GFX11-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
+; GFX11-NEXT:    v_or_b32_e32 v11, v11, v64
+; GFX11-NEXT:    v_or_b32_e32 v6, v0, v1
+; GFX11-NEXT:    v_or_b32_e32 v7, v2, v3
+; GFX11-NEXT:    v_or_b32_e32 v8, v8, v12
+; GFX11-NEXT:    v_or_b32_e32 v9, v13, v9
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v26
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v28
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v30
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v53
+; GFX11-NEXT:    v_and_b32_e32 v13, 0xffff, v11
+; GFX11-NEXT:    v_and_b32_e32 v11, 0xff, v51
+; GFX11-NEXT:    v_and_b32_e32 v12, 0xff, v50
+; GFX11-NEXT:    v_and_b32_e32 v14, 0xff, v49
+; GFX11-NEXT:    v_and_b32_e32 v15, 0xff, v48
+; GFX11-NEXT:    v_and_b32_e32 v86, 0xff, v39
+; GFX11-NEXT:    v_or_b32_e32 v0, v0, v27
+; GFX11-NEXT:    v_or_b32_e32 v1, v1, v29
+; GFX11-NEXT:    v_or_b32_e32 v2, v2, v54
+; GFX11-NEXT:    v_or_b32_e32 v3, v3, v55
+; GFX11-NEXT:    v_or_b32_e32 v11, v11, v17
+; GFX11-NEXT:    v_or_b32_e32 v12, v12, v19
+; GFX11-NEXT:    v_or_b32_e32 v14, v14, v21
+; GFX11-NEXT:    v_or_b32_e32 v15, v15, v23
+; GFX11-NEXT:    v_or_b32_e32 v86, v86, v25
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-NEXT:    v_lshlrev_b32_e32 v87, 16, v11
+; GFX11-NEXT:    v_and_b32_e32 v96, 0xffff, v12
+; GFX11-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
+; GFX11-NEXT:    v_and_b32_e32 v15, 0xffff, v15
+; GFX11-NEXT:    v_lshlrev_b32_e32 v86, 16, v86
+; GFX11-NEXT:    v_or_b32_e32 v11, v0, v1
+; GFX11-NEXT:    v_or_b32_e32 v12, v2, v3
+; GFX11-NEXT:    v_or_b32_e32 v13, v13, v87
+; GFX11-NEXT:    v_or_b32_e32 v14, v96, v14
+; GFX11-NEXT:    v_or_b32_e32 v15, v15, v86
+; GFX11-NEXT:    v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, s6
+; GFX11-NEXT:    v_dual_mov_b32 v2, s7 :: v_dual_mov_b32 v3, s8
+; GFX11-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s4
+; GFX11-NEXT:    s_cbranch_vccnz .LBB27_3
+; GFX11-NEXT:  .LBB27_2: ; %cmp.true
+; GFX11-NEXT:    s_add_i32 s0, s0, 3
+; GFX11-NEXT:    s_add_i32 s2, s2, 3
+; GFX11-NEXT:    s_and_b32 s0, s0, 0xff
+; GFX11-NEXT:    s_lshl_b32 s1, s1, 8
+; GFX11-NEXT:    s_and_b32 s2, s2, 0xff
+; GFX11-NEXT:    s_lshl_b32 s3, s3, 8
+; GFX11-NEXT:    s_or_b32 s0, s1, s0
+; GFX11-NEXT:    s_or_b32 s1, s3, s2
+; GFX11-NEXT:    s_addk_i32 s0, 0x300
+; GFX11-NEXT:    s_addk_i32 s1, 0x300
+; GFX11-NEXT:    s_and_b32 s0, s0, 0xffff
+; GFX11-NEXT:    s_lshl_b32 s1, s1, 16
+; GFX11-NEXT:    s_add_i32 s16, s16, 3
+; GFX11-NEXT:    s_or_b32 s0, s0, s1
+; GFX11-NEXT:    s_and_b32 s1, s16, 0xff
+; GFX11-NEXT:    s_lshl_b32 s2, s17, 8
+; GFX11-NEXT:    s_add_i32 s18, s18, 3
+; GFX11-NEXT:    s_or_b32 s1, s2, s1
+; GFX11-NEXT:    s_and_b32 s2, s18, 0xff
+; GFX11-NEXT:    s_lshl_b32 s3, s19, 8
+; GFX11-NEXT:    s_addk_i32 s1, 0x300
+; GFX11-NEXT:    s_or_b32 s2, s3, s2
+; GFX11-NEXT:    s_add_i32 s20, s20, 3
+; GFX11-NEXT:    s_addk_i32 s2, 0x300
+; GFX11-NEXT:    s_and_b32 s1, s1, 0xffff
+; GFX11-NEXT:    s_lshl_b32 s2, s2, 16
+; GFX11-NEXT:    s_and_b32 s3, s20, 0xff
+; GFX11-NEXT:    s_lshl_b32 s4, s21, 8
+; GFX11-NEXT:    s_add_i32 s22, s22, 3
+; GFX11-NEXT:    s_or_b32 s1, s1, s2
+; GFX11-NEXT:    s_or_b32 s2, s4, s3
+; GFX11-NEXT:    s_and_b32 s3, s22, 0xff
+; GFX11-NEXT:    s_lshl_b32 s4, s23, 8
+; GFX11-NEXT:    s_add_i32 s24, s24, 3
+; GFX11-NEXT:    s_or_b32 s3, s4, s3
+; GFX11-NEXT:    s_and_b32 s4, s24, 0xff
+; GFX11-NEXT:    s_lshl_b32 s5, s25, 8
+; GFX11-NEXT:    s_addk_i32 s2, 0x300
+; GFX11-NEXT:    s_addk_i32 s3, 0x300
+; GFX11-NEXT:    s_or_b32 s4, s5, s4
+; GFX11-NEXT:    s_and_b32 s2, s2, 0xffff
+; GFX11-NEXT:    s_lshl_b32 s3, s3, 16
+; GFX11-NEXT:    s_addk_i32 s4, 0x300
+; GFX11-NEXT:    s_add_i32 s26, s26, 3
+; GFX11-NEXT:    s_or_b32 s2, s2, s3
+; GFX11-NEXT:    s_and_b32 s3, s4, 0xffff
+; GFX11-NEXT:    s_and_b32 s4, s26, 0xff
+; GFX11-NEXT:    s_lshl_b32 s5, s27, 8
+; GFX11-NEXT:    v_add_nc_u32_e32 v0, 3, v31
+; GFX11-NEXT:    s_or_b32 s4, s5, s4
+; GFX11-NEXT:    v_add_nc_u32_e32 v1, 3, v32
+; GFX11-NEXT:    s_addk_i32 s4, 0x300
+; GFX11-NEXT:    v_add_nc_u32_e32 v6, 3, v38
+; GFX11-NEXT:    s_lshl_b32 s4, s4, 16
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX11-NEXT:    s_or_b32 s3, s3, s4
+; GFX11-NEXT:    v_add_nc_u32_e32 v2, 3, v33
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX11-NEXT:    v_add_nc_u32_e32 v10, 3, v22
+; GFX11-NEXT:    v_add_nc_u32_e32 v11, 3, v24
+; GFX11-NEXT:    v_or_b32_e32 v0, v83, v0
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GFX11-NEXT:    v_or_b32_e32 v1, v84, v1
+; GFX11-NEXT:    v_and_b32_e32 v6, 0xff, v6
+; GFX11-NEXT:    v_and_b32_e32 v10, 0xff, v10
+; GFX11-NEXT:    v_and_b32_e32 v11, 0xff, v11
+; GFX11-NEXT:    v_or_b32_e32 v2, v85, v2
+; GFX11-NEXT:    s_add_i32 s28, s28, 3
+; GFX11-NEXT:    v_add_nc_u32_e32 v0, 0x300, v0
+; GFX11-NEXT:    v_add_nc_u32_e32 v1, 0x300, v1
+; GFX11-NEXT:    v_or_b32_e32 v6, v82, v6
+; GFX11-NEXT:    v_add_nc_u32_e32 v2, 0x300, v2
+; GFX11-NEXT:    v_or_b32_e32 v10, v68, v10
+; GFX11-NEXT:    v_or_b32_e32 v11, v69, v11
+; GFX11-NEXT:    s_and_b32 s6, s28, 0xff
+; GFX11-NEXT:    s_lshl_b32 s7, s29, 8
+; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-NEXT:    s_or_b32 s5, s7, s6
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-NEXT:    v_add_nc_u32_e32 v6, 0x300, v6
+; GFX11-NEXT:    v_add_nc_u32_e32 v10, 0x300, v10
+; GFX11-NEXT:    v_add_nc_u32_e32 v11, 0x300, v11
+; GFX11-NEXT:    s_addk_i32 s5, 0x300
+; GFX11-NEXT:    v_or_b32_e32 v5, v1, v2
+; GFX11-NEXT:    s_and_b32 s5, s5, 0xffff
+; GFX11-NEXT:    v_add_nc_u32_e32 v1, 3, v35
+; GFX11-NEXT:    v_or_b32_e32 v4, s5, v0
+; GFX11-NEXT:    v_add_nc_u32_e32 v0, 3, v34
+; GFX11-NEXT:    v_add_nc_u32_e32 v2, 3, v36
+; GFX11-NEXT:    v_add_nc_u32_e32 v3, 3, v37
+; GFX11-NEXT:    v_and_b32_e32 v8, 0xffff, v6
+; GFX11-NEXT:    v_add_nc_u32_e32 v6, 3, v16
+; GFX11-NEXT:    v_add_nc_u32_e32 v7, 3, v18
+; GFX11-NEXT:    v_add_nc_u32_e32 v9, 3, v20
+; GFX11-NEXT:    v_and_b32_e32 v10, 0xffff, v10
+; GFX11-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v3
+; GFX11-NEXT:    v_and_b32_e32 v6, 0xff, v6
+; GFX11-NEXT:    v_and_b32_e32 v7, 0xff, v7
+; GFX11-NEXT:    v_and_b32_e32 v9, 0xff, v9
+; GFX11-NEXT:    v_or_b32_e32 v10, v10, v11
+; GFX11-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-NEXT:    v_add_nc_u32_e32 v11, 3, v52
+; GFX11-NEXT:    v_or_b32_e32 v0, v70, v0
+; GFX11-NEXT:    v_or_b32_e32 v1, v71, v1
+; GFX11-NEXT:    v_or_b32_e32 v2, v80, v2
+; GFX11-NEXT:    v_or_b32_e32 v3, v81, v3
+; GFX11-NEXT:    v_or_b32_e32 v6, v65, v6
+; GFX11-NEXT:    v_or_b32_e32 v7, v66, v7
+; GFX11-NEXT:    v_or_b32_e32 v9, v67, v9
+; GFX11-NEXT:    v_and_b32_e32 v11, 0xff, v11
+; GFX11-NEXT:    v_add_nc_u32_e32 v0, 0x300, v0
+; GFX11-NEXT:    v_add_nc_u32_e32 v1, 0x300, v1
+; GFX11-NEXT:    v_add_nc_u32_e32 v2, 0x300, v2
+; GFX11-NEXT:    v_add_nc_u32_e32 v3, 0x300, v3
+; GFX11-NEXT:    v_add_nc_u32_e32 v6, 0x300, v6
+; GFX11-NEXT:    v_add_nc_u32_e32 v7, 0x300, v7
+; GFX11-NEXT:    v_add_nc_u32_e32 v9, 0x300, v9
+; GFX11-NEXT:    v_or_b32_e32 v11, v64, v11
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-NEXT:    v_lshlrev_b32_e32 v12, 16, v6
+; GFX11-NEXT:    v_and_b32_e32 v13, 0xffff, v7
+; GFX11-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
+; GFX11-NEXT:    v_add_nc_u32_e32 v11, 0x300, v11
+; GFX11-NEXT:    v_or_b32_e32 v6, v0, v1
+; GFX11-NEXT:    v_or_b32_e32 v7, v2, v3
+; GFX11-NEXT:    v_or_b32_e32 v8, v8, v12
+; GFX11-NEXT:    v_or_b32_e32 v9, v13, v9
+; GFX11-NEXT:    v_add_nc_u32_e32 v0, 3, v26
+; GFX11-NEXT:    v_add_nc_u32_e32 v1, 3, v28
+; GFX11-NEXT:    v_add_nc_u32_e32 v2, 3, v30
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_add_nc_u32_e32 v3, 3, v53
+; GFX11-NEXT:    v_and_b32_e32 v13, 0xffff, v11
+; GFX11-NEXT:    v_add_nc_u32_e32 v11, 3, v51
+; GFX11-NEXT:    v_add_nc_u32_e32 v12, 3, v50
+; GFX11-NEXT:    v_add_nc_u32_e32 v14, 3, v49
+; GFX11-NEXT:    v_add_nc_u32_e32 v15, 3, v48
+; GFX11-NEXT:    v_add_nc_u32_e32 v16, 3, v39
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v3
+; GFX11-NEXT:    v_and_b32_e32 v11, 0xff, v11
+; GFX11-NEXT:    v_and_b32_e32 v12, 0xff, v12
+; GFX11-NEXT:    v_and_b32_e32 v14, 0xff, v14
+; GFX11-NEXT:    v_and_b32_e32 v15, 0xff, v15
+; GFX11-NEXT:    v_and_b32_e32 v16, 0xff, v16
+; GFX11-NEXT:    v_or_b32_e32 v0, v27, v0
+; GFX11-NEXT:    v_or_b32_e32 v1, v29, v1
+; GFX11-NEXT:    v_or_b32_e32 v2, v54, v2
+; GFX11-NEXT:    v_or_b32_e32 v3, v55, v3
+; GFX11-NEXT:    v_or_b32_e32 v11, v17, v11
+; GFX11-NEXT:    v_or_b32_e32 v12, v19, v12
+; GFX11-NEXT:    v_or_b32_e32 v14, v21, v14
+; GFX11-NEXT:    v_or_b32_e32 v15, v23, v15
+; GFX11-NEXT:    v_or_b32_e32 v16, v25, v16
+; GFX11-NEXT:    v_add_nc_u32_e32 v0, 0x300, v0
+; GFX11-NEXT:    v_add_nc_u32_e32 v1, 0x300, v1
+; GFX11-NEXT:    v_add_nc_u32_e32 v2, 0x300, v2
+; GFX11-NEXT:    v_add_nc_u32_e32 v3, 0x300, v3
+; GFX11-NEXT:    v_add_nc_u32_e32 v11, 0x300, v11
+; GFX11-NEXT:    v_add_nc_u32_e32 v12, 0x300, v12
+; GFX11-NEXT:    v_add_nc_u32_e32 v14, 0x300, v14
+; GFX11-NEXT:    v_add_nc_u32_e32 v15, 0x300, v15
+; GFX11-NEXT:    v_add_nc_u32_e32 v16, 0x300, v16
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-NEXT:    v_lshlrev_b32_e32 v17, 16, v11
+; GFX11-NEXT:    v_and_b32_e32 v18, 0xffff, v12
+; GFX11-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
+; GFX11-NEXT:    v_and_b32_e32 v15, 0xffff, v15
+; GFX11-NEXT:    v_lshlrev_b32_e32 v16, 16, v16
+; GFX11-NEXT:    v_or_b32_e32 v11, v0, v1
+; GFX11-NEXT:    v_or_b32_e32 v12, v2, v3
+; GFX11-NEXT:    v_or_b32_e32 v13, v13, v17
+; GFX11-NEXT:    v_or_b32_e32 v14, v18, v14
+; GFX11-NEXT:    v_or_b32_e32 v15, v15, v16
+; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-NEXT:  .LBB27_3: ; %end
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-NEXT:  .LBB27_4:
+; GFX11-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+; GFX11-NEXT:    s_branch .LBB27_2
   %cmp = icmp eq i32 %b, 0
   br i1 %cmp, label %cmp.true, label %cmp.false
 
@@ -27320,107 +26935,107 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) {
 ; GFX11-TRUE16:       ; %bb.0:
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-TRUE16-NEXT:    s_clause 0x1f
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v64, off, s32 offset:128
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v31, off, s32 offset:124
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v64, off, s32 offset:120
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v31, off, s32 offset:116
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v65, off, s32 offset:112
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v32, off, s32 offset:108
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v65, off, s32 offset:104
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v32, off, s32 offset:100
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v66, off, s32 offset:96
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v33, off, s32 offset:92
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v66, off, s32 offset:88
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v81, off, s32 offset:132
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v67, off, s32
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v67, off, s32 offset:8
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v68, off, s32 offset:16
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v68, off, s32 offset:24
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v69, off, s32 offset:32
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v69, off, s32 offset:40
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v70, off, s32 offset:48
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v70, off, s32 offset:56
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v71, off, s32 offset:64
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v71, off, s32 offset:72
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v80, off, s32 offset:80
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v33, off, s32 offset:84
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v34, off, s32 offset:76
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v34, off, s32 offset:68
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v35, off, s32 offset:60
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v35, off, s32 offset:52
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v36, off, s32 offset:44
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v36, off, s32 offset:36
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v37, off, s32 offset:28
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v37, off, s32 offset:20
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v80, off, s32 offset:128
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v48, off, s32 offset:124
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v81, off, s32 offset:120
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v49, off, s32 offset:116
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v82, off, s32 offset:112
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v50, off, s32 offset:108
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v83, off, s32 offset:104
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v51, off, s32 offset:100
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v84, off, s32 offset:96
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v52, off, s32 offset:92
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v85, off, s32 offset:88
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v86, off, s32 offset:132
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v87, off, s32
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v96, off, s32 offset:8
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v97, off, s32 offset:16
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v98, off, s32 offset:24
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v99, off, s32 offset:32
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v100, off, s32 offset:40
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v101, off, s32 offset:48
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v102, off, s32 offset:56
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v103, off, s32 offset:64
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v112, off, s32 offset:72
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v113, off, s32 offset:80
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v53, off, s32 offset:84
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v54, off, s32 offset:76
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v55, off, s32 offset:68
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v64, off, s32 offset:60
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v65, off, s32 offset:52
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v66, off, s32 offset:44
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v67, off, s32 offset:36
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v68, off, s32 offset:28
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v69, off, s32 offset:20
 ; GFX11-TRUE16-NEXT:    s_clause 0x1
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v38, off, s32 offset:12
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v38, off, s32 offset:4
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v80.h, v29.l
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v70, off, s32 offset:12
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v71, off, s32 offset:4
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(32)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v48.h, v29.l
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v27.h, v27.l
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v24.h, v22.l
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v26.h, v20.l
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v28.h, v18.l
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v29.l, v16.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v39.l, v14.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v48.h, v12.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v49.l, v10.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v49.h, v8.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v52.h, v6.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v52.l, v4.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v53.h, v2.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v55.h, v0.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v54.h, 8, v1.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v55.l, 8, v3.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v53.l, 8, v5.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v54.l, 8, v7.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v51.l, 8, v9.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v51.h, 8, v11.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v50.l, 8, v13.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v50.h, 8, v15.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v39.h, 8, v17.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v48.l, 8, v19.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v31.l, v14.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v32.h, v12.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v33.l, v10.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v33.h, v8.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v36.h, v6.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v36.l, v4.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v37.h, v2.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v39.h, v0.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v38.h, 8, v1.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v39.l, 8, v3.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v37.l, 8, v5.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v38.l, 8, v7.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v35.l, 8, v9.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v35.h, 8, v11.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v34.l, 8, v13.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v34.h, 8, v15.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v31.h, 8, v17.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v32.l, 8, v19.l
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v29.h, 8, v21.l
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v30.h, 8, v23.l
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v27.l, 8, v25.l
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v27.h, 8, v27.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v25.l, 8, v80.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v25.l, 8, v48.h
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(33)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v16.h, 8, v64.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v16.h, 8, v80.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(31)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v16.l, 8, v64.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v16.l, 8, v81.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(29)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v17.h, 8, v65.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v17.h, 8, v82.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(27)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v17.l, 8, v65.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v17.l, 8, v83.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(25)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v18.h, 8, v66.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v18.h, 8, v84.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(23)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v18.l, 8, v66.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v18.l, 8, v85.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(21)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v25.h, 8, v67.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v25.h, 8, v87.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(20)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v23.l, 8, v67.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v23.l, 8, v96.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(19)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v23.h, 8, v68.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v23.h, 8, v97.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(18)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v22.l, 8, v68.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v22.l, 8, v98.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(17)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v22.h, 8, v69.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v22.h, 8, v99.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(16)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v21.l, 8, v69.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v21.l, 8, v100.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(15)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v21.h, 8, v70.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v21.h, 8, v101.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(14)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v20.l, 8, v70.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v20.l, 8, v102.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(13)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v20.h, 8, v71.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v20.h, 8, v103.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(12)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v19.l, 8, v71.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v19.l, 8, v112.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(11)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v19.h, 8, v80.l
-; GFX11-TRUE16-NEXT:    v_cmpx_ne_u32_e32 0, v81
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v19.h, 8, v113.l
+; GFX11-TRUE16-NEXT:    v_cmpx_ne_u32_e32 0, v86
 ; GFX11-TRUE16-NEXT:    s_xor_b32 s0, exec_lo, s0
 ; GFX11-TRUE16-NEXT:    s_cbranch_execnz .LBB50_3
 ; GFX11-TRUE16-NEXT:  ; %bb.1: ; %Flow
@@ -27431,95 +27046,95 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) {
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ; GFX11-TRUE16-NEXT:  .LBB50_3: ; %cmp.false
-; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v55.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v53.h
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v64.h, 0
-; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v52.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v52.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v64.l, v0.l, v54.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v0.h, v55.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v64.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v3.h, v1.l, v54.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v64.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v49.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v2.h, 0xff, v49.l
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v64, v0
-; GFX11-TRUE16-NEXT:    v_or_b16 v64.l, v1.h, v53.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v39.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v37.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v80.h, 0
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v36.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v36.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v80.l, v0.l, v38.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v0.h, v39.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v80.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v3.h, v1.l, v38.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v80.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v33.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.h, 0xff, v33.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v80, v0
+; GFX11-TRUE16-NEXT:    v_or_b16 v80.l, v1.h, v37.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v4.l, 0xff, v29.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v4.h, 0xff, v28.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v2.h, v2.h, v51.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.h, v2.h, v35.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v5.l, 0xff, v26.h
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v64, v3
-; GFX11-TRUE16-NEXT:    v_or_b16 v64.l, v2.l, v51.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v64.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v3.l, 0xff, v48.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v3.h, 0xff, v39.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v4.h, v4.h, v48.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v80, v3
+; GFX11-TRUE16-NEXT:    v_or_b16 v80.l, v2.l, v35.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v80.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v3.l, 0xff, v32.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v3.h, 0xff, v31.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v4.h, v4.h, v32.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v5.h, 0xff, v24.h
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v2, v64, v2
-; GFX11-TRUE16-NEXT:    v_or_b16 v64.l, v3.l, v50.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v3.h, v3.h, v50.h
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v64.h
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v2, v80, v2
+; GFX11-TRUE16-NEXT:    v_or_b16 v80.l, v3.l, v34.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v3.h, v3.h, v34.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v80.h
 ; GFX11-TRUE16-NEXT:    v_or_b16 v5.h, v5.h, v30.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v6.l, 0xff, v24.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v6.h, 0xff, v26.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v7.l, 0xff, v28.l
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v3, v64, v3
-; GFX11-TRUE16-NEXT:    v_or_b16 v64.l, v4.l, v39.h
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v4.l, v64.h
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v3, v80, v3
+; GFX11-TRUE16-NEXT:    v_or_b16 v80.l, v4.l, v31.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v4.l, v80.h
 ; GFX11-TRUE16-NEXT:    v_or_b16 v6.h, v6.h, v27.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v7.h, 0xff, v30.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    v_and_b16 v8.l, 0xff, v38.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v8.h, 0xff, v38.l
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v4, v64, v4
-; GFX11-TRUE16-NEXT:    v_or_b16 v64.l, v5.l, v29.h
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v64.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v8.l, 0xff, v71.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v8.h, 0xff, v70.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v4, v80, v4
+; GFX11-TRUE16-NEXT:    v_or_b16 v80.l, v5.l, v29.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v80.h
 ; GFX11-TRUE16-NEXT:    v_or_b16 v7.h, v7.h, v25.h
 ; GFX11-TRUE16-NEXT:    v_or_b16 v8.h, v8.h, v23.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v9.l, 0xff, v37.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v9.h, 0xff, v37.l
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v5, v64, v5
-; GFX11-TRUE16-NEXT:    v_or_b16 v64.l, v6.l, v27.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v6.l, v64.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v10.l, 0xff, v36.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v9.l, 0xff, v69.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v9.h, 0xff, v68.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v5, v80, v5
+; GFX11-TRUE16-NEXT:    v_or_b16 v80.l, v6.l, v27.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v6.l, v80.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v10.l, 0xff, v67.l
 ; GFX11-TRUE16-NEXT:    v_or_b16 v9.h, v9.h, v22.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v10.h, 0xff, v36.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v11.l, 0xff, v35.h
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v6, v64, v6
-; GFX11-TRUE16-NEXT:    v_or_b16 v64.l, v7.l, v25.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v7.l, v64.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v10.h, 0xff, v66.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v11.l, 0xff, v65.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v6, v80, v6
+; GFX11-TRUE16-NEXT:    v_or_b16 v80.l, v7.l, v25.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v7.l, v80.h
 ; GFX11-TRUE16-NEXT:    v_or_b16 v10.h, v10.h, v21.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v11.h, 0xff, v35.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v12.l, 0xff, v34.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v12.h, 0xff, v34.l
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v7, v64, v7
-; GFX11-TRUE16-NEXT:    v_or_b16 v64.l, v8.l, v23.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v8.l, v64.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v11.h, 0xff, v64.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v12.l, 0xff, v55.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v12.h, 0xff, v54.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v7, v80, v7
+; GFX11-TRUE16-NEXT:    v_or_b16 v80.l, v8.l, v23.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v8.l, v80.h
 ; GFX11-TRUE16-NEXT:    v_or_b16 v11.h, v11.h, v20.h
 ; GFX11-TRUE16-NEXT:    v_or_b16 v12.h, v12.h, v19.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v13.l, 0xff, v33.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v13.h, 0xff, v33.l
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v8, v64, v8
-; GFX11-TRUE16-NEXT:    v_or_b16 v64.l, v9.l, v22.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v9.l, v64.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v14.l, 0xff, v32.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v13.l, 0xff, v53.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v13.h, 0xff, v52.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v8, v80, v8
+; GFX11-TRUE16-NEXT:    v_or_b16 v80.l, v9.l, v22.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v9.l, v80.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v14.l, 0xff, v51.l
 ; GFX11-TRUE16-NEXT:    v_or_b16 v13.h, v13.h, v18.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v14.h, 0xff, v32.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v15.l, 0xff, v31.h
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v9, v64, v9
-; GFX11-TRUE16-NEXT:    v_or_b16 v64.l, v10.l, v21.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v10.l, v64.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v14.h, 0xff, v50.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v15.l, 0xff, v49.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v9, v80, v9
+; GFX11-TRUE16-NEXT:    v_or_b16 v80.l, v10.l, v21.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v10.l, v80.h
 ; GFX11-TRUE16-NEXT:    v_or_b16 v14.h, v14.h, v17.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v15.h, 0xff, v31.l
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr55_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr53_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr52_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr52_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr49_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr49_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr48_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr39_lo16
+; GFX11-TRUE16-NEXT:    v_and_b16 v15.h, 0xff, v48.l
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr39_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr37_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr36_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr36_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr33_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr33_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr32_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr31_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr29_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr28_hi16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr26_hi16
@@ -27528,32 +27143,32 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) {
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr26_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr28_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr30_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr48
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr49
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr50
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr51
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr52
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr53
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr54
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr55
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr64
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr65
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr66
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr67
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr68
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr69
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr70
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr71
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr38_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr38_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr37_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr39_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr37_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr36_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr36_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr35_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr38_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr35_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr34_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr35_hi16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr34_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr33_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr33_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr32_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr32_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr34_hi16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr31_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr31_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr54_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr55_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr53_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr54_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr51_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr51_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr50_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr50_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr39_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr48_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr32_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr29_hi16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr30_hi16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr27_lo16
@@ -27571,199 +27186,199 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) {
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr18_hi16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr17_hi16
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v10, v64, v10
-; GFX11-TRUE16-NEXT:    v_or_b16 v64.l, v11.l, v20.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v11.l, v64.h
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v10, v80, v10
+; GFX11-TRUE16-NEXT:    v_or_b16 v80.l, v11.l, v20.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v11.l, v80.h
 ; GFX11-TRUE16-NEXT:    v_or_b16 v15.h, v15.h, v16.h
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr20_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr16_hi16
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v11, v64, v11
-; GFX11-TRUE16-NEXT:    v_or_b16 v64.l, v12.l, v19.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v12.l, v64.h
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v11, v80, v11
+; GFX11-TRUE16-NEXT:    v_or_b16 v80.l, v12.l, v19.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v12.l, v80.h
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr19_lo16
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v12, v64, v12
-; GFX11-TRUE16-NEXT:    v_or_b16 v64.l, v13.l, v18.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v13.l, v64.h
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v12, v80, v12
+; GFX11-TRUE16-NEXT:    v_or_b16 v80.l, v13.l, v18.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v13.l, v80.h
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr18_lo16
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v13, v64, v13
-; GFX11-TRUE16-NEXT:    v_or_b16 v64.l, v14.l, v17.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v14.l, v64.h
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v13, v80, v13
+; GFX11-TRUE16-NEXT:    v_or_b16 v80.l, v14.l, v17.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v14.l, v80.h
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr17_lo16
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v14, v64, v14
-; GFX11-TRUE16-NEXT:    v_or_b16 v64.l, v15.l, v16.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v15.l, v64.h
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v14, v80, v14
+; GFX11-TRUE16-NEXT:    v_or_b16 v80.l, v15.l, v16.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v15.l, v80.h
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr16_lo16
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v15, v64, v15
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v15, v80, v15
 ; GFX11-TRUE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
 ; GFX11-TRUE16-NEXT:    s_cbranch_execz .LBB50_2
 ; GFX11-TRUE16-NEXT:  .LBB50_4: ; %cmp.true
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v55.h, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, v53.h, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, v52.h, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.h, v52.l, 3
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v52.h, 0
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v39.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, v37.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, v36.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.h, v36.l, 3
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v36.h, 0
 ; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v0.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v1.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v1.h
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v52.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v54.h, v0.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v55.l, v0.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v54.l, v1.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, v53.l, v1.h
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, v49.h, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v52.l, 0x300, v0.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v36.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v38.h, v0.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v39.l, v0.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v38.l, v1.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, v37.l, v1.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, v33.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v36.l, 0x300, v0.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v3.h, 0x300, v0.h
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.h, v49.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.h, v33.l, 3
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v4.h, 0x300, v1.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, v48.h, 3
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v52, v3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v52.l, 0x300, v1.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, v32.h, 3
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v36, v3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v36.l, 0x300, v1.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v2.h
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.h, v39.l, 3
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v4.l, v52.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v3.l, v51.l, v1.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.h, v31.l, 3
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v4.l, v36.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v3.l, v35.l, v1.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v3.h, v51.h, v1.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v3.h, v35.h, v1.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v2.h, 0xff, v2.h
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v52, v4
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v52.l, 0x300, v3.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v52.h
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v36, v4
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v36.l, 0x300, v3.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v36.h
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v5.h, 0x300, v3.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v3.l, v50.l, v2.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v3.h, v50.h, v2.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v3.l, v34.l, v2.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v3.h, v34.h, v2.h
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v4.l, v29.l, 3
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v4.h, v28.h, 3
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v2, v52, v5
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v52.l, 0x300, v3.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v2, v36, v5
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v36.l, 0x300, v3.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v6.h, 0x300, v3.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v3.l, 0xff, v4.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v3.h, 0xff, v4.h
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v4.l, v26.h, 3
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v4.h, v24.h, 3
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v6.l, v52.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v5.l, v39.h, v3.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v5.h, v48.l, v3.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v6.l, v36.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v5.l, v31.h, v3.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v5.h, v32.l, v3.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v4.l, 0xff, v4.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v4.h, 0xff, v4.h
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v3, v52, v6
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v52.l, 0x300, v5.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v3, v36, v6
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v36.l, 0x300, v5.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v7.h, 0x300, v5.h
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v7.l, v52.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v7.l, v36.h
 ; GFX11-TRUE16-NEXT:    v_or_b16 v5.l, v29.h, v4.l
 ; GFX11-TRUE16-NEXT:    v_or_b16 v5.h, v30.h, v4.h
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v6.l, v24.l, 3
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v6.h, v26.l, 3
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v4, v52, v7
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v52.l, 0x300, v5.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v4, v36, v7
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v36.l, 0x300, v5.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v8.h, 0x300, v5.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v5.l, 0xff, v6.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v5.h, 0xff, v6.h
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v6.l, v28.l, 3
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v6.h, v30.l, 3
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v8.l, v52.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v8.l, v36.h
 ; GFX11-TRUE16-NEXT:    v_or_b16 v7.l, v27.l, v5.l
 ; GFX11-TRUE16-NEXT:    v_or_b16 v7.h, v27.h, v5.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v6.l, 0xff, v6.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v6.h, 0xff, v6.h
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v5, v52, v8
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v52.l, 0x300, v7.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v5, v36, v8
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v36.l, 0x300, v7.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v9.h, 0x300, v7.h
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v9.l, v52.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v9.l, v36.h
 ; GFX11-TRUE16-NEXT:    v_or_b16 v7.l, v25.l, v6.l
 ; GFX11-TRUE16-NEXT:    v_or_b16 v7.h, v25.h, v6.h
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v8.l, v38.h, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v8.h, v38.l, 3
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v6, v52, v9
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v52.l, 0x300, v7.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v8.l, v71.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v8.h, v70.l, 3
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v6, v36, v9
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v36.l, 0x300, v7.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v10.h, 0x300, v7.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v7.l, 0xff, v8.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v7.h, 0xff, v8.h
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v8.l, v37.h, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v8.h, v37.l, 3
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v10.l, v52.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v8.l, v69.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v8.h, v68.l, 3
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v10.l, v36.h
 ; GFX11-TRUE16-NEXT:    v_or_b16 v9.l, v23.l, v7.l
 ; GFX11-TRUE16-NEXT:    v_or_b16 v9.h, v23.h, v7.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v8.l, 0xff, v8.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v8.h, 0xff, v8.h
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v7, v52, v10
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v52.l, 0x300, v9.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v7, v36, v10
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v36.l, 0x300, v9.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v11.h, 0x300, v9.h
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v11.l, v52.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v11.l, v36.h
 ; GFX11-TRUE16-NEXT:    v_or_b16 v9.l, v22.l, v8.l
 ; GFX11-TRUE16-NEXT:    v_or_b16 v9.h, v22.h, v8.h
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v10.l, v36.h, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v10.h, v36.l, 3
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v8, v52, v11
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v52.l, 0x300, v9.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v10.l, v67.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v10.h, v66.l, 3
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v8, v36, v11
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v36.l, 0x300, v9.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v12.h, 0x300, v9.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v9.l, 0xff, v10.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v9.h, 0xff, v10.h
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v10.l, v35.h, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v10.h, v35.l, 3
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v12.l, v52.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v10.l, v65.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v10.h, v64.l, 3
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v12.l, v36.h
 ; GFX11-TRUE16-NEXT:    v_or_b16 v11.l, v21.l, v9.l
 ; GFX11-TRUE16-NEXT:    v_or_b16 v11.h, v21.h, v9.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v10.l, 0xff, v10.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v10.h, 0xff, v10.h
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v9, v52, v12
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v52.l, 0x300, v11.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v9, v36, v12
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v36.l, 0x300, v11.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v13.h, 0x300, v11.h
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v13.l, v52.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v13.l, v36.h
 ; GFX11-TRUE16-NEXT:    v_or_b16 v11.l, v20.l, v10.l
 ; GFX11-TRUE16-NEXT:    v_or_b16 v11.h, v20.h, v10.h
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v12.l, v34.h, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v12.h, v34.l, 3
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v10, v52, v13
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v52.l, 0x300, v11.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v12.l, v55.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v12.h, v54.l, 3
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v10, v36, v13
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v36.l, 0x300, v11.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v14.h, 0x300, v11.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v11.l, 0xff, v12.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v11.h, 0xff, v12.h
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v12.l, v33.h, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v12.h, v33.l, 3
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v14.l, v52.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v12.l, v53.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v12.h, v52.l, 3
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v14.l, v36.h
 ; GFX11-TRUE16-NEXT:    v_or_b16 v13.l, v19.l, v11.l
 ; GFX11-TRUE16-NEXT:    v_or_b16 v13.h, v19.h, v11.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v12.l, 0xff, v12.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v12.h, 0xff, v12.h
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v11, v52, v14
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v52.l, 0x300, v13.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v11, v36, v14
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v36.l, 0x300, v13.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v15.h, 0x300, v13.h
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v15.l, v52.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v15.l, v36.h
 ; GFX11-TRUE16-NEXT:    v_or_b16 v13.l, v18.l, v12.l
 ; GFX11-TRUE16-NEXT:    v_or_b16 v13.h, v18.h, v12.h
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v14.l, v32.h, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v14.h, v32.l, 3
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v12, v52, v15
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v52.l, 0x300, v13.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v14.l, v51.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v14.h, v50.l, 3
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v12, v36, v15
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v36.l, 0x300, v13.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v18.h, 0x300, v13.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v13.l, 0xff, v14.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v13.h, 0xff, v14.h
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v14.l, v31.h, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v14.h, v31.l, 3
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v18.l, v52.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v14.l, v49.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v14.h, v48.l, 3
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v18.l, v36.h
 ; GFX11-TRUE16-NEXT:    v_or_b16 v15.l, v17.l, v13.l
 ; GFX11-TRUE16-NEXT:    v_or_b16 v15.h, v17.h, v13.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v14.l, 0xff, v14.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v14.h, 0xff, v14.h
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v13, v52, v18
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v52.l, 0x300, v15.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v13, v36, v18
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v36.l, 0x300, v15.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v17.h, 0x300, v15.h
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v17.l, v52.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v17.l, v36.h
 ; GFX11-TRUE16-NEXT:    v_or_b16 v15.l, v16.l, v14.l
 ; GFX11-TRUE16-NEXT:    v_or_b16 v15.h, v16.h, v14.h
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v14, v52, v17
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v52.l, 0x300, v15.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v14, v36, v17
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v36.l, 0x300, v15.l
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v15.h, 0x300, v15.h
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v15.l, v52.h
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v15, v52, v15
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v15.l, v36.h
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v15, v36, v15
 ; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -29543,775 +29158,390 @@ define inreg <16 x float> @bitcast_v64i8_to_v16f32_scalar(<64 x i8> inreg %a, i3
 ; GFX9-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
 ; GFX9-NEXT:    s_branch .LBB51_2
 ;
-; GFX11-TRUE16-LABEL: bitcast_v64i8_to_v16f32_scalar:
-; GFX11-TRUE16:       ; %bb.0:
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v38, v14 :: v_dual_mov_b32 v37, v12
-; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v36, v10 :: v_dual_mov_b32 v35, v8
-; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v34, v6 :: v_dual_mov_b32 v33, v4
-; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v32, v2 :: v_dual_mov_b32 v31, v0
-; GFX11-TRUE16-NEXT:    s_clause 0xf
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v0, off, s32 offset:56
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v39, off, s32 offset:52
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v2, off, s32 offset:60
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v4, off, s32
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v6, off, s32 offset:8
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v8, off, s32 offset:16
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v10, off, s32 offset:24
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v12, off, s32 offset:32
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v14, off, s32 offset:40
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v86, off, s32 offset:48
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v48, off, s32 offset:44
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v49, off, s32 offset:36
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v50, off, s32 offset:28
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v51, off, s32 offset:20
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v52, off, s32 offset:12
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v53, off, s32 offset:4
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v83, 8, v1
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v84, 8, v3
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v85, 8, v5
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v70, 8, v7
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v71, 8, v9
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v80, 8, v11
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v81, 8, v13
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v82, 8, v15
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v65, 8, v17
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v66, 8, v19
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v67, 8, v21
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v68, 8, v23
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v69, 8, v25
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v27, 8, v27
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v29, 8, v29
-; GFX11-TRUE16-NEXT:    s_mov_b32 s4, 0
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(15)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v25, 8, v0
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(13)
-; GFX11-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v2
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(12)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v54, 8, v4
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(11)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v55, 8, v6
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(10)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v64, 8, v8
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(9)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v17, 8, v10
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(8)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v19, 8, v12
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(7)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v21, 8, v14
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(6)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v23, 8, v86
-; GFX11-TRUE16-NEXT:    s_and_b32 s5, vcc_lo, exec_lo
-; GFX11-TRUE16-NEXT:    s_cbranch_scc0 .LBB51_4
-; GFX11-TRUE16-NEXT:  ; %bb.1: ; %cmp.false
-; GFX11-TRUE16-NEXT:    s_and_b32 s5, s0, 0xff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s6, s1, 8
-; GFX11-TRUE16-NEXT:    s_and_b32 s7, s2, 0xff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s8, s3, 8
-; GFX11-TRUE16-NEXT:    s_or_b32 s5, s5, s6
-; GFX11-TRUE16-NEXT:    s_or_b32 s6, s7, s8
-; GFX11-TRUE16-NEXT:    s_and_b32 s5, s5, 0xffff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s6, s6, 16
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s7, s17, 8
-; GFX11-TRUE16-NEXT:    s_or_b32 s5, s5, s6
-; GFX11-TRUE16-NEXT:    s_and_b32 s6, s16, 0xff
-; GFX11-TRUE16-NEXT:    s_and_b32 s8, s18, 0xff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s9, s19, 8
-; GFX11-TRUE16-NEXT:    s_or_b32 s6, s6, s7
-; GFX11-TRUE16-NEXT:    s_or_b32 s7, s8, s9
-; GFX11-TRUE16-NEXT:    s_and_b32 s6, s6, 0xffff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s7, s7, 16
-; GFX11-TRUE16-NEXT:    s_and_b32 s8, s20, 0xff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s9, s21, 8
-; GFX11-TRUE16-NEXT:    s_or_b32 s6, s6, s7
-; GFX11-TRUE16-NEXT:    s_or_b32 s7, s8, s9
-; GFX11-TRUE16-NEXT:    s_and_b32 s8, s22, 0xff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s9, s23, 8
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s10, s25, 8
-; GFX11-TRUE16-NEXT:    s_or_b32 s8, s8, s9
-; GFX11-TRUE16-NEXT:    s_and_b32 s9, s24, 0xff
-; GFX11-TRUE16-NEXT:    s_and_b32 s7, s7, 0xffff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s8, s8, 16
-; GFX11-TRUE16-NEXT:    s_or_b32 s9, s9, s10
-; GFX11-TRUE16-NEXT:    s_or_b32 s7, s7, s8
-; GFX11-TRUE16-NEXT:    s_and_b32 s8, s9, 0xffff
-; GFX11-TRUE16-NEXT:    s_and_b32 s9, s26, 0xff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s10, s27, 8
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v31
-; GFX11-TRUE16-NEXT:    s_or_b32 s9, s9, s10
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff, v32
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s9, s9, 16
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v6, 0xff, v38
-; GFX11-TRUE16-NEXT:    s_or_b32 s8, s8, s9
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v2, 0xff, v33
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v0, v83
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v1, v84
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v10, 0xff, v22
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v11, 0xff, v24
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v2, v2, v85
-; GFX11-TRUE16-NEXT:    s_and_b32 s11, s28, 0xff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s12, s29, 8
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v6, v6, v82
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v10, v10, v68
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v11, v11, v69
-; GFX11-TRUE16-NEXT:    s_or_b32 s10, s11, s12
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v5, v1, v2
-; GFX11-TRUE16-NEXT:    s_and_b32 s10, s10, 0xffff
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff, v35
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v4, s10, v0
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v34
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v2, 0xff, v36
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v3, 0xff, v37
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v8, 0xffff, v6
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v6, 0xff, v16
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v7, 0xff, v18
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v9, 0xff, v20
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v10, 0xffff, v10
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v0, v70
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v1, v71
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v2, v2, v80
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v3, v3, v81
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v6, v6, v65
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v7, v7, v66
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v9, v9, v67
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v10, v10, v11
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(1)
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v11, 0xff, v52
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v6
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v13, 0xffff, v7
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v11, v11, v64
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v6, v0, v1
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v7, v2, v3
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v8, v8, v12
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v9, v13, v9
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v26
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff, v28
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v2, 0xff, v30
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v3, 0xff, v53
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v13, 0xffff, v11
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v11, 0xff, v51
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v12, 0xff, v50
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v14, 0xff, v49
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v15, 0xff, v48
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v86, 0xff, v39
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v0, v27
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v1, v29
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v2, v2, v54
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v3, v3, v55
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v11, v11, v17
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v12, v12, v19
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v14, v14, v21
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v15, v15, v23
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v86, v86, v25
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v87, 16, v11
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v96, 0xffff, v12
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v15, 0xffff, v15
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v86, 16, v86
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v11, v0, v1
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v12, v2, v3
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v13, v13, v87
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v14, v96, v14
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v15, v15, v86
-; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, s6
-; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v2, s7 :: v_dual_mov_b32 v3, s8
-; GFX11-TRUE16-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s4
-; GFX11-TRUE16-NEXT:    s_cbranch_vccnz .LBB51_3
-; GFX11-TRUE16-NEXT:  .LBB51_2: ; %cmp.true
-; GFX11-TRUE16-NEXT:    s_add_i32 s0, s0, 3
-; GFX11-TRUE16-NEXT:    s_add_i32 s2, s2, 3
-; GFX11-TRUE16-NEXT:    s_and_b32 s0, s0, 0xff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s1, s1, 8
-; GFX11-TRUE16-NEXT:    s_and_b32 s2, s2, 0xff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s3, s3, 8
-; GFX11-TRUE16-NEXT:    s_or_b32 s0, s1, s0
-; GFX11-TRUE16-NEXT:    s_or_b32 s1, s3, s2
-; GFX11-TRUE16-NEXT:    s_addk_i32 s0, 0x300
-; GFX11-TRUE16-NEXT:    s_addk_i32 s1, 0x300
-; GFX11-TRUE16-NEXT:    s_and_b32 s0, s0, 0xffff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s1, s1, 16
-; GFX11-TRUE16-NEXT:    s_add_i32 s16, s16, 3
-; GFX11-TRUE16-NEXT:    s_or_b32 s0, s0, s1
-; GFX11-TRUE16-NEXT:    s_and_b32 s1, s16, 0xff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s2, s17, 8
-; GFX11-TRUE16-NEXT:    s_add_i32 s18, s18, 3
-; GFX11-TRUE16-NEXT:    s_or_b32 s1, s2, s1
-; GFX11-TRUE16-NEXT:    s_and_b32 s2, s18, 0xff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s3, s19, 8
-; GFX11-TRUE16-NEXT:    s_addk_i32 s1, 0x300
-; GFX11-TRUE16-NEXT:    s_or_b32 s2, s3, s2
-; GFX11-TRUE16-NEXT:    s_add_i32 s20, s20, 3
-; GFX11-TRUE16-NEXT:    s_addk_i32 s2, 0x300
-; GFX11-TRUE16-NEXT:    s_and_b32 s1, s1, 0xffff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s2, s2, 16
-; GFX11-TRUE16-NEXT:    s_and_b32 s3, s20, 0xff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s4, s21, 8
-; GFX11-TRUE16-NEXT:    s_add_i32 s22, s22, 3
-; GFX11-TRUE16-NEXT:    s_or_b32 s1, s1, s2
-; GFX11-TRUE16-NEXT:    s_or_b32 s2, s4, s3
-; GFX11-TRUE16-NEXT:    s_and_b32 s3, s22, 0xff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s4, s23, 8
-; GFX11-TRUE16-NEXT:    s_add_i32 s24, s24, 3
-; GFX11-TRUE16-NEXT:    s_or_b32 s3, s4, s3
-; GFX11-TRUE16-NEXT:    s_and_b32 s4, s24, 0xff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s5, s25, 8
-; GFX11-TRUE16-NEXT:    s_addk_i32 s2, 0x300
-; GFX11-TRUE16-NEXT:    s_addk_i32 s3, 0x300
-; GFX11-TRUE16-NEXT:    s_or_b32 s4, s5, s4
-; GFX11-TRUE16-NEXT:    s_and_b32 s2, s2, 0xffff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s3, s3, 16
-; GFX11-TRUE16-NEXT:    s_addk_i32 s4, 0x300
-; GFX11-TRUE16-NEXT:    s_add_i32 s26, s26, 3
-; GFX11-TRUE16-NEXT:    s_or_b32 s2, s2, s3
-; GFX11-TRUE16-NEXT:    s_and_b32 s3, s4, 0xffff
-; GFX11-TRUE16-NEXT:    s_and_b32 s4, s26, 0xff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s5, s27, 8
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v0, 3, v31
-; GFX11-TRUE16-NEXT:    s_or_b32 s4, s5, s4
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v1, 3, v32
-; GFX11-TRUE16-NEXT:    s_addk_i32 s4, 0x300
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v6, 3, v38
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s4, s4, 16
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT:    s_or_b32 s3, s3, s4
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v2, 3, v33
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v10, 3, v22
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v11, 3, v24
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v83, v0
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v84, v1
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v6, 0xff, v6
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v10, 0xff, v10
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v11, 0xff, v11
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v2, v85, v2
-; GFX11-TRUE16-NEXT:    s_add_i32 s28, s28, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v0, 0x300, v0
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v1, 0x300, v1
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v6, v82, v6
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v2, 0x300, v2
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v10, v68, v10
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v11, v69, v11
-; GFX11-TRUE16-NEXT:    s_and_b32 s6, s28, 0xff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s7, s29, 8
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX11-TRUE16-NEXT:    s_or_b32 s5, s7, s6
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v6, 0x300, v6
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v10, 0x300, v10
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v11, 0x300, v11
-; GFX11-TRUE16-NEXT:    s_addk_i32 s5, 0x300
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v5, v1, v2
-; GFX11-TRUE16-NEXT:    s_and_b32 s5, s5, 0xffff
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v1, 3, v35
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v4, s5, v0
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v0, 3, v34
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v2, 3, v36
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v3, 3, v37
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v8, 0xffff, v6
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v6, 3, v16
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v7, 3, v18
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v9, 3, v20
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v10, 0xffff, v10
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v6, 0xff, v6
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v7, 0xff, v7
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v9, 0xff, v9
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v10, v10, v11
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(1)
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v11, 3, v52
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v70, v0
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v71, v1
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v2, v80, v2
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v3, v81, v3
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v6, v65, v6
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v7, v66, v7
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v9, v67, v9
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v11, 0xff, v11
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v0, 0x300, v0
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v1, 0x300, v1
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v2, 0x300, v2
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v3, 0x300, v3
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v6, 0x300, v6
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v7, 0x300, v7
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v9, 0x300, v9
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v11, v64, v11
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v6
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v13, 0xffff, v7
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v11, 0x300, v11
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v6, v0, v1
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v7, v2, v3
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v8, v8, v12
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v9, v13, v9
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v0, 3, v26
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v1, 3, v28
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v2, 3, v30
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v3, 3, v53
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v13, 0xffff, v11
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v11, 3, v51
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v12, 3, v50
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v14, 3, v49
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v15, 3, v48
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v16, 3, v39
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v11, 0xff, v11
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v12, 0xff, v12
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v14, 0xff, v14
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v15, 0xff, v15
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v16, 0xff, v16
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v27, v0
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v29, v1
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v2, v54, v2
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v3, v55, v3
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v11, v17, v11
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v12, v19, v12
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v14, v21, v14
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v15, v23, v15
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v16, v25, v16
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v0, 0x300, v0
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v1, 0x300, v1
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v2, 0x300, v2
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v3, 0x300, v3
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v11, 0x300, v11
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v12, 0x300, v12
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v14, 0x300, v14
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v15, 0x300, v15
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v16, 0x300, v16
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v17, 16, v11
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v18, 0xffff, v12
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v15, 0xffff, v15
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v16, 16, v16
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v11, v0, v1
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v12, v2, v3
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v13, v13, v17
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v14, v18, v14
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v15, v15, v16
-; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX11-TRUE16-NEXT:  .LBB51_3: ; %end
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
-; GFX11-TRUE16-NEXT:  .LBB51_4:
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
-; GFX11-TRUE16-NEXT:    s_branch .LBB51_2
-;
-; GFX11-FAKE16-LABEL: bitcast_v64i8_to_v16f32_scalar:
-; GFX11-FAKE16:       ; %bb.0:
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v38, v14 :: v_dual_mov_b32 v37, v12
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v36, v10 :: v_dual_mov_b32 v35, v8
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v34, v6 :: v_dual_mov_b32 v33, v4
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v32, v2 :: v_dual_mov_b32 v31, v0
-; GFX11-FAKE16-NEXT:    s_clause 0xf
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v0, off, s32 offset:56
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v39, off, s32 offset:52
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v2, off, s32 offset:60
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v4, off, s32
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v6, off, s32 offset:8
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v8, off, s32 offset:16
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v10, off, s32 offset:24
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v12, off, s32 offset:32
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v14, off, s32 offset:40
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v86, off, s32 offset:48
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v48, off, s32 offset:44
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v49, off, s32 offset:36
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v50, off, s32 offset:28
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v51, off, s32 offset:20
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v52, off, s32 offset:12
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v53, off, s32 offset:4
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v83, 8, v1
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v84, 8, v3
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v85, 8, v5
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v70, 8, v7
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v71, 8, v9
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v80, 8, v11
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v81, 8, v13
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v82, 8, v15
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v65, 8, v17
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v66, 8, v19
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v67, 8, v21
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v68, 8, v23
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v69, 8, v25
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v27, 8, v27
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v29, 8, v29
-; GFX11-FAKE16-NEXT:    s_mov_b32 s4, 0
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(15)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 8, v0
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(13)
-; GFX11-FAKE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v2
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(12)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v54, 8, v4
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(11)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v55, 8, v6
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(10)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v64, 8, v8
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(9)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v17, 8, v10
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(8)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v19, 8, v12
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(7)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v21, 8, v14
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(6)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v23, 8, v86
-; GFX11-FAKE16-NEXT:    s_and_b32 s5, vcc_lo, exec_lo
-; GFX11-FAKE16-NEXT:    s_cbranch_scc0 .LBB51_4
-; GFX11-FAKE16-NEXT:  ; %bb.1: ; %cmp.false
-; GFX11-FAKE16-NEXT:    s_and_b32 s5, s0, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s6, s1, 8
-; GFX11-FAKE16-NEXT:    s_and_b32 s7, s2, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s8, s3, 8
-; GFX11-FAKE16-NEXT:    s_or_b32 s5, s5, s6
-; GFX11-FAKE16-NEXT:    s_or_b32 s6, s7, s8
-; GFX11-FAKE16-NEXT:    s_and_b32 s5, s5, 0xffff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s6, s6, 16
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s7, s17, 8
-; GFX11-FAKE16-NEXT:    s_or_b32 s5, s5, s6
-; GFX11-FAKE16-NEXT:    s_and_b32 s6, s16, 0xff
-; GFX11-FAKE16-NEXT:    s_and_b32 s8, s18, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s9, s19, 8
-; GFX11-FAKE16-NEXT:    s_or_b32 s6, s6, s7
-; GFX11-FAKE16-NEXT:    s_or_b32 s7, s8, s9
-; GFX11-FAKE16-NEXT:    s_and_b32 s6, s6, 0xffff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s7, s7, 16
-; GFX11-FAKE16-NEXT:    s_and_b32 s8, s20, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s9, s21, 8
-; GFX11-FAKE16-NEXT:    s_or_b32 s6, s6, s7
-; GFX11-FAKE16-NEXT:    s_or_b32 s7, s8, s9
-; GFX11-FAKE16-NEXT:    s_and_b32 s8, s22, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s9, s23, 8
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s10, s25, 8
-; GFX11-FAKE16-NEXT:    s_or_b32 s8, s8, s9
-; GFX11-FAKE16-NEXT:    s_and_b32 s9, s24, 0xff
-; GFX11-FAKE16-NEXT:    s_and_b32 s7, s7, 0xffff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s8, s8, 16
-; GFX11-FAKE16-NEXT:    s_or_b32 s9, s9, s10
-; GFX11-FAKE16-NEXT:    s_or_b32 s7, s7, s8
-; GFX11-FAKE16-NEXT:    s_and_b32 s8, s9, 0xffff
-; GFX11-FAKE16-NEXT:    s_and_b32 s9, s26, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s10, s27, 8
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v31
-; GFX11-FAKE16-NEXT:    s_or_b32 s9, s9, s10
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v32
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s9, s9, 16
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xff, v38
-; GFX11-FAKE16-NEXT:    s_or_b32 s8, s8, s9
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v33
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v83
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v84
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v10, 0xff, v22
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v11, 0xff, v24
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v2, v85
-; GFX11-FAKE16-NEXT:    s_and_b32 s11, s28, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s12, s29, 8
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v6, v6, v82
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v10, v10, v68
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v11, v11, v69
-; GFX11-FAKE16-NEXT:    s_or_b32 s10, s11, s12
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v1, v2
-; GFX11-FAKE16-NEXT:    s_and_b32 s10, s10, 0xffff
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v35
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, s10, v0
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v34
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v36
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v37
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 0xffff, v6
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xff, v16
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xff, v18
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v9, 0xff, v20
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v10, 0xffff, v10
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v70
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v71
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v2, v80
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v3, v81
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v6, v6, v65
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v7, v7, v66
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v9, v9, v67
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v10, v10, v11
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(1)
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v11, 0xff, v52
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v6
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v13, 0xffff, v7
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v11, v11, v64
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v6, v0, v1
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v7, v2, v3
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v8, v8, v12
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v9, v13, v9
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v26
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v28
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v30
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v53
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v13, 0xffff, v11
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v11, 0xff, v51
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v12, 0xff, v50
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v14, 0xff, v49
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v15, 0xff, v48
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v86, 0xff, v39
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v27
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v29
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v2, v54
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v3, v55
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v11, v11, v17
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v12, v12, v19
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v14, v14, v21
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v15, v15, v23
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v86, v86, v25
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v87, 16, v11
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v96, 0xffff, v12
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v15, 0xffff, v15
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v86, 16, v86
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v11, v0, v1
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v12, v2, v3
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v13, v13, v87
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v14, v96, v14
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v15, v15, v86
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, s6
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v2, s7 :: v_dual_mov_b32 v3, s8
-; GFX11-FAKE16-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s4
-; GFX11-FAKE16-NEXT:    s_cbranch_vccnz .LBB51_3
-; GFX11-FAKE16-NEXT:  .LBB51_2: ; %cmp.true
-; GFX11-FAKE16-NEXT:    s_add_i32 s0, s0, 3
-; GFX11-FAKE16-NEXT:    s_add_i32 s2, s2, 3
-; GFX11-FAKE16-NEXT:    s_and_b32 s0, s0, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s1, s1, 8
-; GFX11-FAKE16-NEXT:    s_and_b32 s2, s2, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s3, s3, 8
-; GFX11-FAKE16-NEXT:    s_or_b32 s0, s1, s0
-; GFX11-FAKE16-NEXT:    s_or_b32 s1, s3, s2
-; GFX11-FAKE16-NEXT:    s_addk_i32 s0, 0x300
-; GFX11-FAKE16-NEXT:    s_addk_i32 s1, 0x300
-; GFX11-FAKE16-NEXT:    s_and_b32 s0, s0, 0xffff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s1, s1, 16
-; GFX11-FAKE16-NEXT:    s_add_i32 s16, s16, 3
-; GFX11-FAKE16-NEXT:    s_or_b32 s0, s0, s1
-; GFX11-FAKE16-NEXT:    s_and_b32 s1, s16, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s2, s17, 8
-; GFX11-FAKE16-NEXT:    s_add_i32 s18, s18, 3
-; GFX11-FAKE16-NEXT:    s_or_b32 s1, s2, s1
-; GFX11-FAKE16-NEXT:    s_and_b32 s2, s18, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s3, s19, 8
-; GFX11-FAKE16-NEXT:    s_addk_i32 s1, 0x300
-; GFX11-FAKE16-NEXT:    s_or_b32 s2, s3, s2
-; GFX11-FAKE16-NEXT:    s_add_i32 s20, s20, 3
-; GFX11-FAKE16-NEXT:    s_addk_i32 s2, 0x300
-; GFX11-FAKE16-NEXT:    s_and_b32 s1, s1, 0xffff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s2, s2, 16
-; GFX11-FAKE16-NEXT:    s_and_b32 s3, s20, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s4, s21, 8
-; GFX11-FAKE16-NEXT:    s_add_i32 s22, s22, 3
-; GFX11-FAKE16-NEXT:    s_or_b32 s1, s1, s2
-; GFX11-FAKE16-NEXT:    s_or_b32 s2, s4, s3
-; GFX11-FAKE16-NEXT:    s_and_b32 s3, s22, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s4, s23, 8
-; GFX11-FAKE16-NEXT:    s_add_i32 s24, s24, 3
-; GFX11-FAKE16-NEXT:    s_or_b32 s3, s4, s3
-; GFX11-FAKE16-NEXT:    s_and_b32 s4, s24, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s5, s25, 8
-; GFX11-FAKE16-NEXT:    s_addk_i32 s2, 0x300
-; GFX11-FAKE16-NEXT:    s_addk_i32 s3, 0x300
-; GFX11-FAKE16-NEXT:    s_or_b32 s4, s5, s4
-; GFX11-FAKE16-NEXT:    s_and_b32 s2, s2, 0xffff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s3, s3, 16
-; GFX11-FAKE16-NEXT:    s_addk_i32 s4, 0x300
-; GFX11-FAKE16-NEXT:    s_add_i32 s26, s26, 3
-; GFX11-FAKE16-NEXT:    s_or_b32 s2, s2, s3
-; GFX11-FAKE16-NEXT:    s_and_b32 s3, s4, 0xffff
-; GFX11-FAKE16-NEXT:    s_and_b32 s4, s26, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s5, s27, 8
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v0, 3, v31
-; GFX11-FAKE16-NEXT:    s_or_b32 s4, s5, s4
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v1, 3, v32
-; GFX11-FAKE16-NEXT:    s_addk_i32 s4, 0x300
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v6, 3, v38
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s4, s4, 16
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-FAKE16-NEXT:    s_or_b32 s3, s3, s4
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v2, 3, v33
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v10, 3, v22
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v11, 3, v24
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v83, v0
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v84, v1
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xff, v6
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v10, 0xff, v10
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v11, 0xff, v11
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v85, v2
-; GFX11-FAKE16-NEXT:    s_add_i32 s28, s28, 3
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v0, 0x300, v0
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v1, 0x300, v1
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v6, v82, v6
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v2, 0x300, v2
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v10, v68, v10
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v11, v69, v11
-; GFX11-FAKE16-NEXT:    s_and_b32 s6, s28, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s7, s29, 8
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX11-FAKE16-NEXT:    s_or_b32 s5, s7, s6
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v6, 0x300, v6
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v10, 0x300, v10
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v11, 0x300, v11
-; GFX11-FAKE16-NEXT:    s_addk_i32 s5, 0x300
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v1, v2
-; GFX11-FAKE16-NEXT:    s_and_b32 s5, s5, 0xffff
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v1, 3, v35
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, s5, v0
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v0, 3, v34
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v2, 3, v36
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v3, 3, v37
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 0xffff, v6
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v6, 3, v16
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v7, 3, v18
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v9, 3, v20
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v10, 0xffff, v10
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xff, v6
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xff, v7
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v9, 0xff, v9
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v10, v10, v11
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(1)
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v11, 3, v52
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v70, v0
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v71, v1
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v80, v2
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v81, v3
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v6, v65, v6
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v7, v66, v7
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v9, v67, v9
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v11, 0xff, v11
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v0, 0x300, v0
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v1, 0x300, v1
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v2, 0x300, v2
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v3, 0x300, v3
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v6, 0x300, v6
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v7, 0x300, v7
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v9, 0x300, v9
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v11, v64, v11
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v6
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v13, 0xffff, v7
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v11, 0x300, v11
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v6, v0, v1
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v7, v2, v3
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v8, v8, v12
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v9, v13, v9
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v0, 3, v26
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v1, 3, v28
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v2, 3, v30
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v3, 3, v53
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v13, 0xffff, v11
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v11, 3, v51
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v12, 3, v50
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v14, 3, v49
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v15, 3, v48
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v16, 3, v39
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v11, 0xff, v11
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v12, 0xff, v12
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v14, 0xff, v14
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v15, 0xff, v15
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v16, 0xff, v16
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v27, v0
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v29, v1
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v54, v2
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v55, v3
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v11, v17, v11
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v12, v19, v12
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v14, v21, v14
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v15, v23, v15
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v16, v25, v16
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v0, 0x300, v0
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v1, 0x300, v1
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v2, 0x300, v2
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v3, 0x300, v3
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v11, 0x300, v11
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v12, 0x300, v12
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v14, 0x300, v14
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v15, 0x300, v15
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v16, 0x300, v16
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v17, 16, v11
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v18, 0xffff, v12
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v15, 0xffff, v15
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v16, 16, v16
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v11, v0, v1
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v12, v2, v3
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v13, v13, v17
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v14, v18, v14
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v15, v15, v16
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX11-FAKE16-NEXT:  .LBB51_3: ; %end
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
-; GFX11-FAKE16-NEXT:  .LBB51_4:
-; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
-; GFX11-FAKE16-NEXT:    s_branch .LBB51_2
+; GFX11-LABEL: bitcast_v64i8_to_v16f32_scalar:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_dual_mov_b32 v38, v14 :: v_dual_mov_b32 v37, v12
+; GFX11-NEXT:    v_dual_mov_b32 v36, v10 :: v_dual_mov_b32 v35, v8
+; GFX11-NEXT:    v_dual_mov_b32 v34, v6 :: v_dual_mov_b32 v33, v4
+; GFX11-NEXT:    v_dual_mov_b32 v32, v2 :: v_dual_mov_b32 v31, v0
+; GFX11-NEXT:    s_clause 0xf
+; GFX11-NEXT:    scratch_load_u16 v0, off, s32 offset:56
+; GFX11-NEXT:    scratch_load_u16 v39, off, s32 offset:52
+; GFX11-NEXT:    scratch_load_b32 v2, off, s32 offset:60
+; GFX11-NEXT:    scratch_load_u16 v4, off, s32
+; GFX11-NEXT:    scratch_load_u16 v6, off, s32 offset:8
+; GFX11-NEXT:    scratch_load_u16 v8, off, s32 offset:16
+; GFX11-NEXT:    scratch_load_u16 v10, off, s32 offset:24
+; GFX11-NEXT:    scratch_load_u16 v12, off, s32 offset:32
+; GFX11-NEXT:    scratch_load_u16 v14, off, s32 offset:40
+; GFX11-NEXT:    scratch_load_u16 v86, off, s32 offset:48
+; GFX11-NEXT:    scratch_load_u16 v48, off, s32 offset:44
+; GFX11-NEXT:    scratch_load_u16 v49, off, s32 offset:36
+; GFX11-NEXT:    scratch_load_u16 v50, off, s32 offset:28
+; GFX11-NEXT:    scratch_load_u16 v51, off, s32 offset:20
+; GFX11-NEXT:    scratch_load_u16 v52, off, s32 offset:12
+; GFX11-NEXT:    scratch_load_u16 v53, off, s32 offset:4
+; GFX11-NEXT:    v_lshlrev_b32_e32 v83, 8, v1
+; GFX11-NEXT:    v_lshlrev_b32_e32 v84, 8, v3
+; GFX11-NEXT:    v_lshlrev_b32_e32 v85, 8, v5
+; GFX11-NEXT:    v_lshlrev_b32_e32 v70, 8, v7
+; GFX11-NEXT:    v_lshlrev_b32_e32 v71, 8, v9
+; GFX11-NEXT:    v_lshlrev_b32_e32 v80, 8, v11
+; GFX11-NEXT:    v_lshlrev_b32_e32 v81, 8, v13
+; GFX11-NEXT:    v_lshlrev_b32_e32 v82, 8, v15
+; GFX11-NEXT:    v_lshlrev_b32_e32 v65, 8, v17
+; GFX11-NEXT:    v_lshlrev_b32_e32 v66, 8, v19
+; GFX11-NEXT:    v_lshlrev_b32_e32 v67, 8, v21
+; GFX11-NEXT:    v_lshlrev_b32_e32 v68, 8, v23
+; GFX11-NEXT:    v_lshlrev_b32_e32 v69, 8, v25
+; GFX11-NEXT:    v_lshlrev_b32_e32 v27, 8, v27
+; GFX11-NEXT:    v_lshlrev_b32_e32 v29, 8, v29
+; GFX11-NEXT:    s_mov_b32 s4, 0
+; GFX11-NEXT:    s_waitcnt vmcnt(15)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v25, 8, v0
+; GFX11-NEXT:    s_waitcnt vmcnt(13)
+; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v2
+; GFX11-NEXT:    s_waitcnt vmcnt(12)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v54, 8, v4
+; GFX11-NEXT:    s_waitcnt vmcnt(11)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v55, 8, v6
+; GFX11-NEXT:    s_waitcnt vmcnt(10)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v64, 8, v8
+; GFX11-NEXT:    s_waitcnt vmcnt(9)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v17, 8, v10
+; GFX11-NEXT:    s_waitcnt vmcnt(8)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v19, 8, v12
+; GFX11-NEXT:    s_waitcnt vmcnt(7)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v21, 8, v14
+; GFX11-NEXT:    s_waitcnt vmcnt(6)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v23, 8, v86
+; GFX11-NEXT:    s_and_b32 s5, vcc_lo, exec_lo
+; GFX11-NEXT:    s_cbranch_scc0 .LBB51_4
+; GFX11-NEXT:  ; %bb.1: ; %cmp.false
+; GFX11-NEXT:    s_and_b32 s5, s0, 0xff
+; GFX11-NEXT:    s_lshl_b32 s6, s1, 8
+; GFX11-NEXT:    s_and_b32 s7, s2, 0xff
+; GFX11-NEXT:    s_lshl_b32 s8, s3, 8
+; GFX11-NEXT:    s_or_b32 s5, s5, s6
+; GFX11-NEXT:    s_or_b32 s6, s7, s8
+; GFX11-NEXT:    s_and_b32 s5, s5, 0xffff
+; GFX11-NEXT:    s_lshl_b32 s6, s6, 16
+; GFX11-NEXT:    s_lshl_b32 s7, s17, 8
+; GFX11-NEXT:    s_or_b32 s5, s5, s6
+; GFX11-NEXT:    s_and_b32 s6, s16, 0xff
+; GFX11-NEXT:    s_and_b32 s8, s18, 0xff
+; GFX11-NEXT:    s_lshl_b32 s9, s19, 8
+; GFX11-NEXT:    s_or_b32 s6, s6, s7
+; GFX11-NEXT:    s_or_b32 s7, s8, s9
+; GFX11-NEXT:    s_and_b32 s6, s6, 0xffff
+; GFX11-NEXT:    s_lshl_b32 s7, s7, 16
+; GFX11-NEXT:    s_and_b32 s8, s20, 0xff
+; GFX11-NEXT:    s_lshl_b32 s9, s21, 8
+; GFX11-NEXT:    s_or_b32 s6, s6, s7
+; GFX11-NEXT:    s_or_b32 s7, s8, s9
+; GFX11-NEXT:    s_and_b32 s8, s22, 0xff
+; GFX11-NEXT:    s_lshl_b32 s9, s23, 8
+; GFX11-NEXT:    s_lshl_b32 s10, s25, 8
+; GFX11-NEXT:    s_or_b32 s8, s8, s9
+; GFX11-NEXT:    s_and_b32 s9, s24, 0xff
+; GFX11-NEXT:    s_and_b32 s7, s7, 0xffff
+; GFX11-NEXT:    s_lshl_b32 s8, s8, 16
+; GFX11-NEXT:    s_or_b32 s9, s9, s10
+; GFX11-NEXT:    s_or_b32 s7, s7, s8
+; GFX11-NEXT:    s_and_b32 s8, s9, 0xffff
+; GFX11-NEXT:    s_and_b32 s9, s26, 0xff
+; GFX11-NEXT:    s_lshl_b32 s10, s27, 8
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v31
+; GFX11-NEXT:    s_or_b32 s9, s9, s10
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v32
+; GFX11-NEXT:    s_lshl_b32 s9, s9, 16
+; GFX11-NEXT:    v_and_b32_e32 v6, 0xff, v38
+; GFX11-NEXT:    s_or_b32 s8, s8, s9
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v33
+; GFX11-NEXT:    v_or_b32_e32 v0, v0, v83
+; GFX11-NEXT:    v_or_b32_e32 v1, v1, v84
+; GFX11-NEXT:    v_and_b32_e32 v10, 0xff, v22
+; GFX11-NEXT:    v_and_b32_e32 v11, 0xff, v24
+; GFX11-NEXT:    v_or_b32_e32 v2, v2, v85
+; GFX11-NEXT:    s_and_b32 s11, s28, 0xff
+; GFX11-NEXT:    s_lshl_b32 s12, s29, 8
+; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-NEXT:    v_or_b32_e32 v6, v6, v82
+; GFX11-NEXT:    v_or_b32_e32 v10, v10, v68
+; GFX11-NEXT:    v_or_b32_e32 v11, v11, v69
+; GFX11-NEXT:    s_or_b32 s10, s11, s12
+; GFX11-NEXT:    v_or_b32_e32 v5, v1, v2
+; GFX11-NEXT:    s_and_b32 s10, s10, 0xffff
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v35
+; GFX11-NEXT:    v_or_b32_e32 v4, s10, v0
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v34
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v36
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v37
+; GFX11-NEXT:    v_and_b32_e32 v8, 0xffff, v6
+; GFX11-NEXT:    v_and_b32_e32 v6, 0xff, v16
+; GFX11-NEXT:    v_and_b32_e32 v7, 0xff, v18
+; GFX11-NEXT:    v_and_b32_e32 v9, 0xff, v20
+; GFX11-NEXT:    v_and_b32_e32 v10, 0xffff, v10
+; GFX11-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
+; GFX11-NEXT:    v_or_b32_e32 v0, v0, v70
+; GFX11-NEXT:    v_or_b32_e32 v1, v1, v71
+; GFX11-NEXT:    v_or_b32_e32 v2, v2, v80
+; GFX11-NEXT:    v_or_b32_e32 v3, v3, v81
+; GFX11-NEXT:    v_or_b32_e32 v6, v6, v65
+; GFX11-NEXT:    v_or_b32_e32 v7, v7, v66
+; GFX11-NEXT:    v_or_b32_e32 v9, v9, v67
+; GFX11-NEXT:    v_or_b32_e32 v10, v10, v11
+; GFX11-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-NEXT:    v_and_b32_e32 v11, 0xff, v52
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-NEXT:    v_lshlrev_b32_e32 v12, 16, v6
+; GFX11-NEXT:    v_and_b32_e32 v13, 0xffff, v7
+; GFX11-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
+; GFX11-NEXT:    v_or_b32_e32 v11, v11, v64
+; GFX11-NEXT:    v_or_b32_e32 v6, v0, v1
+; GFX11-NEXT:    v_or_b32_e32 v7, v2, v3
+; GFX11-NEXT:    v_or_b32_e32 v8, v8, v12
+; GFX11-NEXT:    v_or_b32_e32 v9, v13, v9
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v26
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v28
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v30
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v53
+; GFX11-NEXT:    v_and_b32_e32 v13, 0xffff, v11
+; GFX11-NEXT:    v_and_b32_e32 v11, 0xff, v51
+; GFX11-NEXT:    v_and_b32_e32 v12, 0xff, v50
+; GFX11-NEXT:    v_and_b32_e32 v14, 0xff, v49
+; GFX11-NEXT:    v_and_b32_e32 v15, 0xff, v48
+; GFX11-NEXT:    v_and_b32_e32 v86, 0xff, v39
+; GFX11-NEXT:    v_or_b32_e32 v0, v0, v27
+; GFX11-NEXT:    v_or_b32_e32 v1, v1, v29
+; GFX11-NEXT:    v_or_b32_e32 v2, v2, v54
+; GFX11-NEXT:    v_or_b32_e32 v3, v3, v55
+; GFX11-NEXT:    v_or_b32_e32 v11, v11, v17
+; GFX11-NEXT:    v_or_b32_e32 v12, v12, v19
+; GFX11-NEXT:    v_or_b32_e32 v14, v14, v21
+; GFX11-NEXT:    v_or_b32_e32 v15, v15, v23
+; GFX11-NEXT:    v_or_b32_e32 v86, v86, v25
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-NEXT:    v_lshlrev_b32_e32 v87, 16, v11
+; GFX11-NEXT:    v_and_b32_e32 v96, 0xffff, v12
+; GFX11-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
+; GFX11-NEXT:    v_and_b32_e32 v15, 0xffff, v15
+; GFX11-NEXT:    v_lshlrev_b32_e32 v86, 16, v86
+; GFX11-NEXT:    v_or_b32_e32 v11, v0, v1
+; GFX11-NEXT:    v_or_b32_e32 v12, v2, v3
+; GFX11-NEXT:    v_or_b32_e32 v13, v13, v87
+; GFX11-NEXT:    v_or_b32_e32 v14, v96, v14
+; GFX11-NEXT:    v_or_b32_e32 v15, v15, v86
+; GFX11-NEXT:    v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, s6
+; GFX11-NEXT:    v_dual_mov_b32 v2, s7 :: v_dual_mov_b32 v3, s8
+; GFX11-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s4
+; GFX11-NEXT:    s_cbranch_vccnz .LBB51_3
+; GFX11-NEXT:  .LBB51_2: ; %cmp.true
+; GFX11-NEXT:    s_add_i32 s0, s0, 3
+; GFX11-NEXT:    s_add_i32 s2, s2, 3
+; GFX11-NEXT:    s_and_b32 s0, s0, 0xff
+; GFX11-NEXT:    s_lshl_b32 s1, s1, 8
+; GFX11-NEXT:    s_and_b32 s2, s2, 0xff
+; GFX11-NEXT:    s_lshl_b32 s3, s3, 8
+; GFX11-NEXT:    s_or_b32 s0, s1, s0
+; GFX11-NEXT:    s_or_b32 s1, s3, s2
+; GFX11-NEXT:    s_addk_i32 s0, 0x300
+; GFX11-NEXT:    s_addk_i32 s1, 0x300
+; GFX11-NEXT:    s_and_b32 s0, s0, 0xffff
+; GFX11-NEXT:    s_lshl_b32 s1, s1, 16
+; GFX11-NEXT:    s_add_i32 s16, s16, 3
+; GFX11-NEXT:    s_or_b32 s0, s0, s1
+; GFX11-NEXT:    s_and_b32 s1, s16, 0xff
+; GFX11-NEXT:    s_lshl_b32 s2, s17, 8
+; GFX11-NEXT:    s_add_i32 s18, s18, 3
+; GFX11-NEXT:    s_or_b32 s1, s2, s1
+; GFX11-NEXT:    s_and_b32 s2, s18, 0xff
+; GFX11-NEXT:    s_lshl_b32 s3, s19, 8
+; GFX11-NEXT:    s_addk_i32 s1, 0x300
+; GFX11-NEXT:    s_or_b32 s2, s3, s2
+; GFX11-NEXT:    s_add_i32 s20, s20, 3
+; GFX11-NEXT:    s_addk_i32 s2, 0x300
+; GFX11-NEXT:    s_and_b32 s1, s1, 0xffff
+; GFX11-NEXT:    s_lshl_b32 s2, s2, 16
+; GFX11-NEXT:    s_and_b32 s3, s20, 0xff
+; GFX11-NEXT:    s_lshl_b32 s4, s21, 8
+; GFX11-NEXT:    s_add_i32 s22, s22, 3
+; GFX11-NEXT:    s_or_b32 s1, s1, s2
+; GFX11-NEXT:    s_or_b32 s2, s4, s3
+; GFX11-NEXT:    s_and_b32 s3, s22, 0xff
+; GFX11-NEXT:    s_lshl_b32 s4, s23, 8
+; GFX11-NEXT:    s_add_i32 s24, s24, 3
+; GFX11-NEXT:    s_or_b32 s3, s4, s3
+; GFX11-NEXT:    s_and_b32 s4, s24, 0xff
+; GFX11-NEXT:    s_lshl_b32 s5, s25, 8
+; GFX11-NEXT:    s_addk_i32 s2, 0x300
+; GFX11-NEXT:    s_addk_i32 s3, 0x300
+; GFX11-NEXT:    s_or_b32 s4, s5, s4
+; GFX11-NEXT:    s_and_b32 s2, s2, 0xffff
+; GFX11-NEXT:    s_lshl_b32 s3, s3, 16
+; GFX11-NEXT:    s_addk_i32 s4, 0x300
+; GFX11-NEXT:    s_add_i32 s26, s26, 3
+; GFX11-NEXT:    s_or_b32 s2, s2, s3
+; GFX11-NEXT:    s_and_b32 s3, s4, 0xffff
+; GFX11-NEXT:    s_and_b32 s4, s26, 0xff
+; GFX11-NEXT:    s_lshl_b32 s5, s27, 8
+; GFX11-NEXT:    v_add_nc_u32_e32 v0, 3, v31
+; GFX11-NEXT:    s_or_b32 s4, s5, s4
+; GFX11-NEXT:    v_add_nc_u32_e32 v1, 3, v32
+; GFX11-NEXT:    s_addk_i32 s4, 0x300
+; GFX11-NEXT:    v_add_nc_u32_e32 v6, 3, v38
+; GFX11-NEXT:    s_lshl_b32 s4, s4, 16
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX11-NEXT:    s_or_b32 s3, s3, s4
+; GFX11-NEXT:    v_add_nc_u32_e32 v2, 3, v33
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX11-NEXT:    v_add_nc_u32_e32 v10, 3, v22
+; GFX11-NEXT:    v_add_nc_u32_e32 v11, 3, v24
+; GFX11-NEXT:    v_or_b32_e32 v0, v83, v0
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GFX11-NEXT:    v_or_b32_e32 v1, v84, v1
+; GFX11-NEXT:    v_and_b32_e32 v6, 0xff, v6
+; GFX11-NEXT:    v_and_b32_e32 v10, 0xff, v10
+; GFX11-NEXT:    v_and_b32_e32 v11, 0xff, v11
+; GFX11-NEXT:    v_or_b32_e32 v2, v85, v2
+; GFX11-NEXT:    s_add_i32 s28, s28, 3
+; GFX11-NEXT:    v_add_nc_u32_e32 v0, 0x300, v0
+; GFX11-NEXT:    v_add_nc_u32_e32 v1, 0x300, v1
+; GFX11-NEXT:    v_or_b32_e32 v6, v82, v6
+; GFX11-NEXT:    v_add_nc_u32_e32 v2, 0x300, v2
+; GFX11-NEXT:    v_or_b32_e32 v10, v68, v10
+; GFX11-NEXT:    v_or_b32_e32 v11, v69, v11
+; GFX11-NEXT:    s_and_b32 s6, s28, 0xff
+; GFX11-NEXT:    s_lshl_b32 s7, s29, 8
+; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-NEXT:    s_or_b32 s5, s7, s6
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-NEXT:    v_add_nc_u32_e32 v6, 0x300, v6
+; GFX11-NEXT:    v_add_nc_u32_e32 v10, 0x300, v10
+; GFX11-NEXT:    v_add_nc_u32_e32 v11, 0x300, v11
+; GFX11-NEXT:    s_addk_i32 s5, 0x300
+; GFX11-NEXT:    v_or_b32_e32 v5, v1, v2
+; GFX11-NEXT:    s_and_b32 s5, s5, 0xffff
+; GFX11-NEXT:    v_add_nc_u32_e32 v1, 3, v35
+; GFX11-NEXT:    v_or_b32_e32 v4, s5, v0
+; GFX11-NEXT:    v_add_nc_u32_e32 v0, 3, v34
+; GFX11-NEXT:    v_add_nc_u32_e32 v2, 3, v36
+; GFX11-NEXT:    v_add_nc_u32_e32 v3, 3, v37
+; GFX11-NEXT:    v_and_b32_e32 v8, 0xffff, v6
+; GFX11-NEXT:    v_add_nc_u32_e32 v6, 3, v16
+; GFX11-NEXT:    v_add_nc_u32_e32 v7, 3, v18
+; GFX11-NEXT:    v_add_nc_u32_e32 v9, 3, v20
+; GFX11-NEXT:    v_and_b32_e32 v10, 0xffff, v10
+; GFX11-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v3
+; GFX11-NEXT:    v_and_b32_e32 v6, 0xff, v6
+; GFX11-NEXT:    v_and_b32_e32 v7, 0xff, v7
+; GFX11-NEXT:    v_and_b32_e32 v9, 0xff, v9
+; GFX11-NEXT:    v_or_b32_e32 v10, v10, v11
+; GFX11-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-NEXT:    v_add_nc_u32_e32 v11, 3, v52
+; GFX11-NEXT:    v_or_b32_e32 v0, v70, v0
+; GFX11-NEXT:    v_or_b32_e32 v1, v71, v1
+; GFX11-NEXT:    v_or_b32_e32 v2, v80, v2
+; GFX11-NEXT:    v_or_b32_e32 v3, v81, v3
+; GFX11-NEXT:    v_or_b32_e32 v6, v65, v6
+; GFX11-NEXT:    v_or_b32_e32 v7, v66, v7
+; GFX11-NEXT:    v_or_b32_e32 v9, v67, v9
+; GFX11-NEXT:    v_and_b32_e32 v11, 0xff, v11
+; GFX11-NEXT:    v_add_nc_u32_e32 v0, 0x300, v0
+; GFX11-NEXT:    v_add_nc_u32_e32 v1, 0x300, v1
+; GFX11-NEXT:    v_add_nc_u32_e32 v2, 0x300, v2
+; GFX11-NEXT:    v_add_nc_u32_e32 v3, 0x300, v3
+; GFX11-NEXT:    v_add_nc_u32_e32 v6, 0x300, v6
+; GFX11-NEXT:    v_add_nc_u32_e32 v7, 0x300, v7
+; GFX11-NEXT:    v_add_nc_u32_e32 v9, 0x300, v9
+; GFX11-NEXT:    v_or_b32_e32 v11, v64, v11
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-NEXT:    v_lshlrev_b32_e32 v12, 16, v6
+; GFX11-NEXT:    v_and_b32_e32 v13, 0xffff, v7
+; GFX11-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
+; GFX11-NEXT:    v_add_nc_u32_e32 v11, 0x300, v11
+; GFX11-NEXT:    v_or_b32_e32 v6, v0, v1
+; GFX11-NEXT:    v_or_b32_e32 v7, v2, v3
+; GFX11-NEXT:    v_or_b32_e32 v8, v8, v12
+; GFX11-NEXT:    v_or_b32_e32 v9, v13, v9
+; GFX11-NEXT:    v_add_nc_u32_e32 v0, 3, v26
+; GFX11-NEXT:    v_add_nc_u32_e32 v1, 3, v28
+; GFX11-NEXT:    v_add_nc_u32_e32 v2, 3, v30
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_add_nc_u32_e32 v3, 3, v53
+; GFX11-NEXT:    v_and_b32_e32 v13, 0xffff, v11
+; GFX11-NEXT:    v_add_nc_u32_e32 v11, 3, v51
+; GFX11-NEXT:    v_add_nc_u32_e32 v12, 3, v50
+; GFX11-NEXT:    v_add_nc_u32_e32 v14, 3, v49
+; GFX11-NEXT:    v_add_nc_u32_e32 v15, 3, v48
+; GFX11-NEXT:    v_add_nc_u32_e32 v16, 3, v39
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v3
+; GFX11-NEXT:    v_and_b32_e32 v11, 0xff, v11
+; GFX11-NEXT:    v_and_b32_e32 v12, 0xff, v12
+; GFX11-NEXT:    v_and_b32_e32 v14, 0xff, v14
+; GFX11-NEXT:    v_and_b32_e32 v15, 0xff, v15
+; GFX11-NEXT:    v_and_b32_e32 v16, 0xff, v16
+; GFX11-NEXT:    v_or_b32_e32 v0, v27, v0
+; GFX11-NEXT:    v_or_b32_e32 v1, v29, v1
+; GFX11-NEXT:    v_or_b32_e32 v2, v54, v2
+; GFX11-NEXT:    v_or_b32_e32 v3, v55, v3
+; GFX11-NEXT:    v_or_b32_e32 v11, v17, v11
+; GFX11-NEXT:    v_or_b32_e32 v12, v19, v12
+; GFX11-NEXT:    v_or_b32_e32 v14, v21, v14
+; GFX11-NEXT:    v_or_b32_e32 v15, v23, v15
+; GFX11-NEXT:    v_or_b32_e32 v16, v25, v16
+; GFX11-NEXT:    v_add_nc_u32_e32 v0, 0x300, v0
+; GFX11-NEXT:    v_add_nc_u32_e32 v1, 0x300, v1
+; GFX11-NEXT:    v_add_nc_u32_e32 v2, 0x300, v2
+; GFX11-NEXT:    v_add_nc_u32_e32 v3, 0x300, v3
+; GFX11-NEXT:    v_add_nc_u32_e32 v11, 0x300, v11
+; GFX11-NEXT:    v_add_nc_u32_e32 v12, 0x300, v12
+; GFX11-NEXT:    v_add_nc_u32_e32 v14, 0x300, v14
+; GFX11-NEXT:    v_add_nc_u32_e32 v15, 0x300, v15
+; GFX11-NEXT:    v_add_nc_u32_e32 v16, 0x300, v16
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-NEXT:    v_lshlrev_b32_e32 v17, 16, v11
+; GFX11-NEXT:    v_and_b32_e32 v18, 0xffff, v12
+; GFX11-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
+; GFX11-NEXT:    v_and_b32_e32 v15, 0xffff, v15
+; GFX11-NEXT:    v_lshlrev_b32_e32 v16, 16, v16
+; GFX11-NEXT:    v_or_b32_e32 v11, v0, v1
+; GFX11-NEXT:    v_or_b32_e32 v12, v2, v3
+; GFX11-NEXT:    v_or_b32_e32 v13, v13, v17
+; GFX11-NEXT:    v_or_b32_e32 v14, v18, v14
+; GFX11-NEXT:    v_or_b32_e32 v15, v15, v16
+; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-NEXT:  .LBB51_3: ; %end
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-NEXT:  .LBB51_4:
+; GFX11-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+; GFX11-NEXT:    s_branch .LBB51_2
   %cmp = icmp eq i32 %b, 0
   br i1 %cmp, label %cmp.true, label %cmp.false
 
@@ -41466,107 +40696,107 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) {
 ; GFX11-TRUE16:       ; %bb.0:
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-TRUE16-NEXT:    s_clause 0x1f
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v64, off, s32 offset:128
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v31, off, s32 offset:124
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v64, off, s32 offset:120
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v31, off, s32 offset:116
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v65, off, s32 offset:112
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v32, off, s32 offset:108
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v65, off, s32 offset:104
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v32, off, s32 offset:100
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v66, off, s32 offset:96
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v33, off, s32 offset:92
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v66, off, s32 offset:88
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v81, off, s32 offset:132
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v67, off, s32
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v67, off, s32 offset:8
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v68, off, s32 offset:16
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v68, off, s32 offset:24
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v69, off, s32 offset:32
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v69, off, s32 offset:40
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v70, off, s32 offset:48
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v70, off, s32 offset:56
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v71, off, s32 offset:64
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v71, off, s32 offset:72
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v80, off, s32 offset:80
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v33, off, s32 offset:84
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v34, off, s32 offset:76
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v34, off, s32 offset:68
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v35, off, s32 offset:60
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v35, off, s32 offset:52
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v36, off, s32 offset:44
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v36, off, s32 offset:36
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v37, off, s32 offset:28
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v37, off, s32 offset:20
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v80, off, s32 offset:128
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v48, off, s32 offset:124
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v81, off, s32 offset:120
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v49, off, s32 offset:116
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v82, off, s32 offset:112
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v50, off, s32 offset:108
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v83, off, s32 offset:104
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v51, off, s32 offset:100
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v84, off, s32 offset:96
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v52, off, s32 offset:92
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v85, off, s32 offset:88
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v86, off, s32 offset:132
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v87, off, s32
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v96, off, s32 offset:8
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v97, off, s32 offset:16
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v98, off, s32 offset:24
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v99, off, s32 offset:32
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v100, off, s32 offset:40
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v101, off, s32 offset:48
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v102, off, s32 offset:56
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v103, off, s32 offset:64
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v112, off, s32 offset:72
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v113, off, s32 offset:80
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v53, off, s32 offset:84
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v54, off, s32 offset:76
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v55, off, s32 offset:68
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v64, off, s32 offset:60
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v65, off, s32 offset:52
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v66, off, s32 offset:44
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v67, off, s32 offset:36
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v68, off, s32 offset:28
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v69, off, s32 offset:20
 ; GFX11-TRUE16-NEXT:    s_clause 0x1
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v38, off, s32 offset:12
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v38, off, s32 offset:4
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v80.h, v29.l
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v70, off, s32 offset:12
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v71, off, s32 offset:4
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(32)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v48.h, v29.l
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v27.h, v27.l
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v24.h, v22.l
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v26.h, v20.l
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v28.h, v18.l
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v29.l, v16.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v39.l, v14.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v48.h, v12.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v49.l, v10.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v49.h, v8.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v52.h, v6.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v52.l, v4.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v53.h, v2.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v55.h, v0.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v54.h, 8, v1.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v55.l, 8, v3.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v53.l, 8, v5.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v54.l, 8, v7.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v51.l, 8, v9.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v51.h, 8, v11.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v50.l, 8, v13.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v50.h, 8, v15.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v39.h, 8, v17.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v48.l, 8, v19.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v31.l, v14.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v32.h, v12.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v33.l, v10.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v33.h, v8.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v36.h, v6.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v36.l, v4.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v37.h, v2.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v39.h, v0.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v38.h, 8, v1.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v39.l, 8, v3.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v37.l, 8, v5.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v38.l, 8, v7.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v35.l, 8, v9.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v35.h, 8, v11.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v34.l, 8, v13.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v34.h, 8, v15.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v31.h, 8, v17.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v32.l, 8, v19.l
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v29.h, 8, v21.l
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v30.h, 8, v23.l
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v27.l, 8, v25.l
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v27.h, 8, v27.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v25.l, 8, v80.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v25.l, 8, v48.h
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(33)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v16.h, 8, v64.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v16.h, 8, v80.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(31)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v16.l, 8, v64.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v16.l, 8, v81.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(29)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v17.h, 8, v65.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v17.h, 8, v82.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(27)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v17.l, 8, v65.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v17.l, 8, v83.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(25)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v18.h, 8, v66.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v18.h, 8, v84.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(23)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v18.l, 8, v66.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v18.l, 8, v85.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(21)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v25.h, 8, v67.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v25.h, 8, v87.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(20)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v23.l, 8, v67.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v23.l, 8, v96.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(19)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v23.h, 8, v68.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v23.h, 8, v97.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(18)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v22.l, 8, v68.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v22.l, 8, v98.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(17)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v22.h, 8, v69.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v22.h, 8, v99.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(16)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v21.l, 8, v69.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v21.l, 8, v100.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(15)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v21.h, 8, v70.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v21.h, 8, v101.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(14)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v20.l, 8, v70.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v20.l, 8, v102.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(13)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v20.h, 8, v71.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v20.h, 8, v103.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(12)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v19.l, 8, v71.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v19.l, 8, v112.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(11)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v19.h, 8, v80.l
-; GFX11-TRUE16-NEXT:    v_cmpx_ne_u32_e32 0, v81
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v19.h, 8, v113.l
+; GFX11-TRUE16-NEXT:    v_cmpx_ne_u32_e32 0, v86
 ; GFX11-TRUE16-NEXT:    s_xor_b32 s0, exec_lo, s0
 ; GFX11-TRUE16-NEXT:    s_cbranch_execnz .LBB70_3
 ; GFX11-TRUE16-NEXT:  ; %bb.1: ; %Flow
@@ -41577,95 +40807,95 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) {
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ; GFX11-TRUE16-NEXT:  .LBB70_3: ; %cmp.false
-; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v55.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v53.h
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v64.h, 0
-; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v52.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v52.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v64.l, v0.l, v54.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v0.h, v55.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v64.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v3.h, v1.l, v54.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v64.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v49.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v2.h, 0xff, v49.l
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v64, v0
-; GFX11-TRUE16-NEXT:    v_or_b16 v64.l, v1.h, v53.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v39.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v37.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v80.h, 0
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v36.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v36.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v80.l, v0.l, v38.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v0.h, v39.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v80.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v3.h, v1.l, v38.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v80.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v33.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.h, 0xff, v33.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v80, v0
+; GFX11-TRUE16-NEXT:    v_or_b16 v80.l, v1.h, v37.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v4.l, 0xff, v29.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v4.h, 0xff, v28.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v2.h, v2.h, v51.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.h, v2.h, v35.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v5.l, 0xff, v26.h
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v64, v3
-; GFX11-TRUE16-NEXT:    v_or_b16 v64.l, v2.l, v51.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v64.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v3.l, 0xff, v48.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v3.h, 0xff, v39.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v4.h, v4.h, v48.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v80, v3
+; GFX11-TRUE16-NEXT:    v_or_b16 v80.l, v2.l, v35.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v80.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v3.l, 0xff, v32.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v3.h, 0xff, v31.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v4.h, v4.h, v32.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v5.h, 0xff, v24.h
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v2, v64, v2
-; GFX11-TRUE16-NEXT:    v_or_b16 v64.l, v3.l, v50.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v3.h, v3.h, v50.h
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v64.h
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v2, v80, v2
+; GFX11-TRUE16-NEXT:    v_or_b16 v80.l, v3.l, v34.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v3.h, v3.h, v34.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v80.h
 ; GFX11-TRUE16-NEXT:    v_or_b16 v5.h, v5.h, v30.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v6.l, 0xff, v24.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v6.h, 0xff, v26.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v7.l, 0xff, v28.l
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v3, v64, v3
-; GFX11-TRUE16-NEXT:    v_or_b16 v64.l, v4.l, v39.h
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v4.l, v64.h
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v3, v80, v3
+; GFX11-TRUE16-NEXT:    v_or_b16 v80.l, v4.l, v31.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v4.l, v80.h
 ; GFX11-TRUE16-NEXT:    v_or_b16 v6.h, v6.h, v27.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v7.h, 0xff, v30.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    v_and_b16 v8.l, 0xff, v38.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v8.h, 0xff, v38.l
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v4, v64, v4
-; GFX11-TRUE16-NEXT:    v_or_b16 v64.l, v5.l, v29.h
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v64.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v8.l, 0xff, v71.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v8.h, 0xff, v70.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v4, v80, v4
+; GFX11-TRUE16-NEXT:    v_or_b16 v80.l, v5.l, v29.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v80.h
 ; GFX11-TRUE16-NEXT:    v_or_b16 v7.h, v7.h, v25.h
 ; GFX11-TRUE16-NEXT:    v_or_b16 v8.h, v8.h, v23.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v9.l, 0xff, v37.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v9.h, 0xff, v37.l
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v5, v64, v5
-; GFX11-TRUE16-NEXT:    v_or_b16 v64.l, v6.l, v27.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v6.l, v64.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v10.l, 0xff, v36.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v9.l, 0xff, v69.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v9.h, 0xff, v68.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v5, v80, v5
+; GFX11-TRUE16-NEXT:    v_or_b16 v80.l, v6.l, v27.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v6.l, v80.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v10.l, 0xff, v67.l
 ; GFX11-TRUE16-NEXT:    v_or_b16 v9.h, v9.h, v22.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v10.h, 0xff, v36.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v11.l, 0xff, v35.h
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v6, v64, v6
-; GFX11-TRUE16-NEXT:    v_or_b16 v64.l, v7.l, v25.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v7.l, v64.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v10.h, 0xff, v66.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v11.l, 0xff, v65.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v6, v80, v6
+; GFX11-TRUE16-NEXT:    v_or_b16 v80.l, v7.l, v25.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v7.l, v80.h
 ; GFX11-TRUE16-NEXT:    v_or_b16 v10.h, v10.h, v21.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v11.h, 0xff, v35.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v12.l, 0xff, v34.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v12.h, 0xff, v34.l
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v7, v64, v7
-; GFX11-TRUE16-NEXT:    v_or_b16 v64.l, v8.l, v23.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v8.l, v64.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v11.h, 0xff, v64.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v12.l, 0xff, v55.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v12.h, 0xff, v54.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v7, v80, v7
+; GFX11-TRUE16-NEXT:    v_or_b16 v80.l, v8.l, v23.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v8.l, v80.h
 ; GFX11-TRUE16-NEXT:    v_or_b16 v11.h, v11.h, v20.h
 ; GFX11-TRUE16-NEXT:    v_or_b16 v12.h, v12.h, v19.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v13.l, 0xff, v33.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v13.h, 0xff, v33.l
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v8, v64, v8
-; GFX11-TRUE16-NEXT:    v_or_b16 v64.l, v9.l, v22.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v9.l, v64.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v14.l, 0xff, v32.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v13.l, 0xff, v53.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v13.h, 0xff, v52.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v8, v80, v8
+; GFX11-TRUE16-NEXT:    v_or_b16 v80.l, v9.l, v22.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v9.l, v80.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v14.l, 0xff, v51.l
 ; GFX11-TRUE16-NEXT:    v_or_b16 v13.h, v13.h, v18.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v14.h, 0xff, v32.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v15.l, 0xff, v31.h
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v9, v64, v9
-; GFX11-TRUE16-NEXT:    v_or_b16 v64.l, v10.l, v21.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v10.l, v64.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v14.h, 0xff, v50.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v15.l, 0xff, v49.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v9, v80, v9
+; GFX11-TRUE16-NEXT:    v_or_b16 v80.l, v10.l, v21.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v10.l, v80.h
 ; GFX11-TRUE16-NEXT:    v_or_b16 v14.h, v14.h, v17.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v15.h, 0xff, v31.l
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr55_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr53_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr52_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr52_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr49_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr49_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr48_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr39_lo16
+; GFX11-TRUE16-NEXT:    v_and_b16 v15.h, 0xff, v48.l
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr39_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr37_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr36_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr36_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr33_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr33_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr32_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr31_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr29_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr28_hi16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr26_hi16
@@ -41674,32 +40904,32 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) {
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr26_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr28_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr30_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr48
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr49
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr50
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr51
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr52
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr53
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr54
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr55
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr64
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr65
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr66
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr67
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr68
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr69
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr70
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr71
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr38_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr38_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr37_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr39_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr37_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr36_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr36_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr35_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr38_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr35_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr34_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr35_hi16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr34_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr33_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr33_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr32_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr32_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr34_hi16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr31_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr31_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr54_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr55_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr53_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr54_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr51_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr51_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr50_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr50_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr39_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr48_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr32_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr29_hi16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr30_hi16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr27_lo16
@@ -41717,199 +40947,199 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) {
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr18_hi16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr17_hi16
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v10, v64, v10
-; GFX11-TRUE16-NEXT:    v_or_b16 v64.l, v11.l, v20.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v11.l, v64.h
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v10, v80, v10
+; GFX11-TRUE16-NEXT:    v_or_b16 v80.l, v11.l, v20.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v11.l, v80.h
 ; GFX11-TRUE16-NEXT:    v_or_b16 v15.h, v15.h, v16.h
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr20_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr16_hi16
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v11, v64, v11
-; GFX11-TRUE16-NEXT:    v_or_b16 v64.l, v12.l, v19.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v12.l, v64.h
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v11, v80, v11
+; GFX11-TRUE16-NEXT:    v_or_b16 v80.l, v12.l, v19.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v12.l, v80.h
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr19_lo16
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v12, v64, v12
-; GFX11-TRUE16-NEXT:    v_or_b16 v64.l, v13.l, v18.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v13.l, v64.h
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v12, v80, v12
+; GFX11-TRUE16-NEXT:    v_or_b16 v80.l, v13.l, v18.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v13.l, v80.h
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr18_lo16
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v13, v64, v13
-; GFX11-TRUE16-NEXT:    v_or_b16 v64.l, v14.l, v17.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v14.l, v64.h
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v13, v80, v13
+; GFX11-TRUE16-NEXT:    v_or_b16 v80.l, v14.l, v17.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v14.l, v80.h
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr17_lo16
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v14, v64, v14
-; GFX11-TRUE16-NEXT:    v_or_b16 v64.l, v15.l, v16.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v15.l, v64.h
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v14, v80, v14
+; GFX11-TRUE16-NEXT:    v_or_b16 v80.l, v15.l, v16.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v15.l, v80.h
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr16_lo16
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v15, v64, v15
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v15, v80, v15
 ; GFX11-TRUE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
 ; GFX11-TRUE16-NEXT:    s_cbranch_execz .LBB70_2
 ; GFX11-TRUE16-NEXT:  .LBB70_4: ; %cmp.true
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v55.h, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, v53.h, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, v52.h, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.h, v52.l, 3
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v52.h, 0
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v39.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, v37.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, v36.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.h, v36.l, 3
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v36.h, 0
 ; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v0.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v1.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v1.h
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v52.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v54.h, v0.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v55.l, v0.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v54.l, v1.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, v53.l, v1.h
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, v49.h, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v52.l, 0x300, v0.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v36.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v38.h, v0.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v39.l, v0.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v38.l, v1.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, v37.l, v1.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, v33.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v36.l, 0x300, v0.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v3.h, 0x300, v0.h
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.h, v49.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.h, v33.l, 3
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v4.h, 0x300, v1.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, v48.h, 3
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v52, v3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v52.l, 0x300, v1.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, v32.h, 3
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v36, v3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v36.l, 0x300, v1.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v2.h
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.h, v39.l, 3
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v4.l, v52.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v3.l, v51.l, v1.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.h, v31.l, 3
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v4.l, v36.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v3.l, v35.l, v1.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v3.h, v51.h, v1.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v3.h, v35.h, v1.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v2.h, 0xff, v2.h
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v52, v4
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v52.l, 0x300, v3.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v52.h
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v36, v4
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v36.l, 0x300, v3.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v36.h
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v5.h, 0x300, v3.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v3.l, v50.l, v2.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v3.h, v50.h, v2.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v3.l, v34.l, v2.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v3.h, v34.h, v2.h
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v4.l, v29.l, 3
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v4.h, v28.h, 3
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v2, v52, v5
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v52.l, 0x300, v3.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v2, v36, v5
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v36.l, 0x300, v3.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v6.h, 0x300, v3.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v3.l, 0xff, v4.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v3.h, 0xff, v4.h
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v4.l, v26.h, 3
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v4.h, v24.h, 3
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v6.l, v52.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v5.l, v39.h, v3.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v5.h, v48.l, v3.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v6.l, v36.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v5.l, v31.h, v3.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v5.h, v32.l, v3.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v4.l, 0xff, v4.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v4.h, 0xff, v4.h
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v3, v52, v6
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v52.l, 0x300, v5.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v3, v36, v6
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v36.l, 0x300, v5.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v7.h, 0x300, v5.h
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v7.l, v52.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v7.l, v36.h
 ; GFX11-TRUE16-NEXT:    v_or_b16 v5.l, v29.h, v4.l
 ; GFX11-TRUE16-NEXT:    v_or_b16 v5.h, v30.h, v4.h
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v6.l, v24.l, 3
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v6.h, v26.l, 3
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v4, v52, v7
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v52.l, 0x300, v5.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v4, v36, v7
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v36.l, 0x300, v5.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v8.h, 0x300, v5.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v5.l, 0xff, v6.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v5.h, 0xff, v6.h
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v6.l, v28.l, 3
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v6.h, v30.l, 3
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v8.l, v52.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v8.l, v36.h
 ; GFX11-TRUE16-NEXT:    v_or_b16 v7.l, v27.l, v5.l
 ; GFX11-TRUE16-NEXT:    v_or_b16 v7.h, v27.h, v5.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v6.l, 0xff, v6.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v6.h, 0xff, v6.h
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v5, v52, v8
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v52.l, 0x300, v7.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v5, v36, v8
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v36.l, 0x300, v7.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v9.h, 0x300, v7.h
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v9.l, v52.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v9.l, v36.h
 ; GFX11-TRUE16-NEXT:    v_or_b16 v7.l, v25.l, v6.l
 ; GFX11-TRUE16-NEXT:    v_or_b16 v7.h, v25.h, v6.h
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v8.l, v38.h, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v8.h, v38.l, 3
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v6, v52, v9
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v52.l, 0x300, v7.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v8.l, v71.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v8.h, v70.l, 3
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v6, v36, v9
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v36.l, 0x300, v7.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v10.h, 0x300, v7.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v7.l, 0xff, v8.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v7.h, 0xff, v8.h
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v8.l, v37.h, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v8.h, v37.l, 3
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v10.l, v52.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v8.l, v69.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v8.h, v68.l, 3
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v10.l, v36.h
 ; GFX11-TRUE16-NEXT:    v_or_b16 v9.l, v23.l, v7.l
 ; GFX11-TRUE16-NEXT:    v_or_b16 v9.h, v23.h, v7.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v8.l, 0xff, v8.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v8.h, 0xff, v8.h
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v7, v52, v10
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v52.l, 0x300, v9.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v7, v36, v10
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v36.l, 0x300, v9.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v11.h, 0x300, v9.h
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v11.l, v52.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v11.l, v36.h
 ; GFX11-TRUE16-NEXT:    v_or_b16 v9.l, v22.l, v8.l
 ; GFX11-TRUE16-NEXT:    v_or_b16 v9.h, v22.h, v8.h
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v10.l, v36.h, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v10.h, v36.l, 3
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v8, v52, v11
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v52.l, 0x300, v9.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v10.l, v67.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v10.h, v66.l, 3
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v8, v36, v11
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v36.l, 0x300, v9.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v12.h, 0x300, v9.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v9.l, 0xff, v10.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v9.h, 0xff, v10.h
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v10.l, v35.h, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v10.h, v35.l, 3
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v12.l, v52.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v10.l, v65.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v10.h, v64.l, 3
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v12.l, v36.h
 ; GFX11-TRUE16-NEXT:    v_or_b16 v11.l, v21.l, v9.l
 ; GFX11-TRUE16-NEXT:    v_or_b16 v11.h, v21.h, v9.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v10.l, 0xff, v10.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v10.h, 0xff, v10.h
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v9, v52, v12
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v52.l, 0x300, v11.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v9, v36, v12
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v36.l, 0x300, v11.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v13.h, 0x300, v11.h
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v13.l, v52.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v13.l, v36.h
 ; GFX11-TRUE16-NEXT:    v_or_b16 v11.l, v20.l, v10.l
 ; GFX11-TRUE16-NEXT:    v_or_b16 v11.h, v20.h, v10.h
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v12.l, v34.h, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v12.h, v34.l, 3
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v10, v52, v13
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v52.l, 0x300, v11.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v12.l, v55.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v12.h, v54.l, 3
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v10, v36, v13
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v36.l, 0x300, v11.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v14.h, 0x300, v11.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v11.l, 0xff, v12.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v11.h, 0xff, v12.h
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v12.l, v33.h, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v12.h, v33.l, 3
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v14.l, v52.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v12.l, v53.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v12.h, v52.l, 3
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v14.l, v36.h
 ; GFX11-TRUE16-NEXT:    v_or_b16 v13.l, v19.l, v11.l
 ; GFX11-TRUE16-NEXT:    v_or_b16 v13.h, v19.h, v11.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v12.l, 0xff, v12.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v12.h, 0xff, v12.h
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v11, v52, v14
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v52.l, 0x300, v13.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v11, v36, v14
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v36.l, 0x300, v13.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v15.h, 0x300, v13.h
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v15.l, v52.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v15.l, v36.h
 ; GFX11-TRUE16-NEXT:    v_or_b16 v13.l, v18.l, v12.l
 ; GFX11-TRUE16-NEXT:    v_or_b16 v13.h, v18.h, v12.h
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v14.l, v32.h, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v14.h, v32.l, 3
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v12, v52, v15
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v52.l, 0x300, v13.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v14.l, v51.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v14.h, v50.l, 3
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v12, v36, v15
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v36.l, 0x300, v13.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v18.h, 0x300, v13.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v13.l, 0xff, v14.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v13.h, 0xff, v14.h
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v14.l, v31.h, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v14.h, v31.l, 3
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v18.l, v52.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v14.l, v49.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v14.h, v48.l, 3
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v18.l, v36.h
 ; GFX11-TRUE16-NEXT:    v_or_b16 v15.l, v17.l, v13.l
 ; GFX11-TRUE16-NEXT:    v_or_b16 v15.h, v17.h, v13.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v14.l, 0xff, v14.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v14.h, 0xff, v14.h
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v13, v52, v18
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v52.l, 0x300, v15.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v13, v36, v18
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v36.l, 0x300, v15.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v17.h, 0x300, v15.h
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v17.l, v52.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v17.l, v36.h
 ; GFX11-TRUE16-NEXT:    v_or_b16 v15.l, v16.l, v14.l
 ; GFX11-TRUE16-NEXT:    v_or_b16 v15.h, v16.h, v14.h
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v14, v52, v17
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v52.l, 0x300, v15.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v14, v36, v17
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v36.l, 0x300, v15.l
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v15.h, 0x300, v15.h
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v15.l, v52.h
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v15, v52, v15
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v15.l, v36.h
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v15, v36, v15
 ; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -43689,775 +42919,390 @@ define inreg <8 x i64> @bitcast_v64i8_to_v8i64_scalar(<64 x i8> inreg %a, i32 in
 ; GFX9-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
 ; GFX9-NEXT:    s_branch .LBB71_2
 ;
-; GFX11-TRUE16-LABEL: bitcast_v64i8_to_v8i64_scalar:
-; GFX11-TRUE16:       ; %bb.0:
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v38, v14 :: v_dual_mov_b32 v37, v12
-; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v36, v10 :: v_dual_mov_b32 v35, v8
-; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v34, v6 :: v_dual_mov_b32 v33, v4
-; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v32, v2 :: v_dual_mov_b32 v31, v0
-; GFX11-TRUE16-NEXT:    s_clause 0xf
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v0, off, s32 offset:56
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v39, off, s32 offset:52
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v2, off, s32 offset:60
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v4, off, s32
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v6, off, s32 offset:8
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v8, off, s32 offset:16
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v10, off, s32 offset:24
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v12, off, s32 offset:32
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v14, off, s32 offset:40
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v86, off, s32 offset:48
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v48, off, s32 offset:44
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v49, off, s32 offset:36
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v50, off, s32 offset:28
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v51, off, s32 offset:20
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v52, off, s32 offset:12
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v53, off, s32 offset:4
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v83, 8, v1
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v84, 8, v3
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v85, 8, v5
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v70, 8, v7
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v71, 8, v9
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v80, 8, v11
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v81, 8, v13
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v82, 8, v15
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v65, 8, v17
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v66, 8, v19
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v67, 8, v21
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v68, 8, v23
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v69, 8, v25
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v27, 8, v27
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v29, 8, v29
-; GFX11-TRUE16-NEXT:    s_mov_b32 s4, 0
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(15)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v25, 8, v0
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(13)
-; GFX11-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v2
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(12)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v54, 8, v4
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(11)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v55, 8, v6
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(10)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v64, 8, v8
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(9)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v17, 8, v10
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(8)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v19, 8, v12
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(7)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v21, 8, v14
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(6)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v23, 8, v86
-; GFX11-TRUE16-NEXT:    s_and_b32 s5, vcc_lo, exec_lo
-; GFX11-TRUE16-NEXT:    s_cbranch_scc0 .LBB71_4
-; GFX11-TRUE16-NEXT:  ; %bb.1: ; %cmp.false
-; GFX11-TRUE16-NEXT:    s_and_b32 s5, s0, 0xff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s6, s1, 8
-; GFX11-TRUE16-NEXT:    s_and_b32 s7, s2, 0xff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s8, s3, 8
-; GFX11-TRUE16-NEXT:    s_or_b32 s5, s5, s6
-; GFX11-TRUE16-NEXT:    s_or_b32 s6, s7, s8
-; GFX11-TRUE16-NEXT:    s_and_b32 s5, s5, 0xffff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s6, s6, 16
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s7, s17, 8
-; GFX11-TRUE16-NEXT:    s_or_b32 s5, s5, s6
-; GFX11-TRUE16-NEXT:    s_and_b32 s6, s16, 0xff
-; GFX11-TRUE16-NEXT:    s_and_b32 s8, s18, 0xff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s9, s19, 8
-; GFX11-TRUE16-NEXT:    s_or_b32 s6, s6, s7
-; GFX11-TRUE16-NEXT:    s_or_b32 s7, s8, s9
-; GFX11-TRUE16-NEXT:    s_and_b32 s6, s6, 0xffff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s7, s7, 16
-; GFX11-TRUE16-NEXT:    s_and_b32 s8, s20, 0xff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s9, s21, 8
-; GFX11-TRUE16-NEXT:    s_or_b32 s6, s6, s7
-; GFX11-TRUE16-NEXT:    s_or_b32 s7, s8, s9
-; GFX11-TRUE16-NEXT:    s_and_b32 s8, s22, 0xff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s9, s23, 8
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s10, s25, 8
-; GFX11-TRUE16-NEXT:    s_or_b32 s8, s8, s9
-; GFX11-TRUE16-NEXT:    s_and_b32 s9, s24, 0xff
-; GFX11-TRUE16-NEXT:    s_and_b32 s7, s7, 0xffff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s8, s8, 16
-; GFX11-TRUE16-NEXT:    s_or_b32 s9, s9, s10
-; GFX11-TRUE16-NEXT:    s_or_b32 s7, s7, s8
-; GFX11-TRUE16-NEXT:    s_and_b32 s8, s9, 0xffff
-; GFX11-TRUE16-NEXT:    s_and_b32 s9, s26, 0xff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s10, s27, 8
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v31
-; GFX11-TRUE16-NEXT:    s_or_b32 s9, s9, s10
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff, v32
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s9, s9, 16
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v6, 0xff, v38
-; GFX11-TRUE16-NEXT:    s_or_b32 s8, s8, s9
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v2, 0xff, v33
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v0, v83
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v1, v84
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v10, 0xff, v22
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v11, 0xff, v24
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v2, v2, v85
-; GFX11-TRUE16-NEXT:    s_and_b32 s11, s28, 0xff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s12, s29, 8
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v6, v6, v82
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v10, v10, v68
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v11, v11, v69
-; GFX11-TRUE16-NEXT:    s_or_b32 s10, s11, s12
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v5, v1, v2
-; GFX11-TRUE16-NEXT:    s_and_b32 s10, s10, 0xffff
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff, v35
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v4, s10, v0
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v34
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v2, 0xff, v36
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v3, 0xff, v37
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v8, 0xffff, v6
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v6, 0xff, v16
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v7, 0xff, v18
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v9, 0xff, v20
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v10, 0xffff, v10
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v0, v70
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v1, v71
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v2, v2, v80
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v3, v3, v81
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v6, v6, v65
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v7, v7, v66
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v9, v9, v67
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v10, v10, v11
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(1)
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v11, 0xff, v52
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v6
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v13, 0xffff, v7
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v11, v11, v64
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v6, v0, v1
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v7, v2, v3
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v8, v8, v12
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v9, v13, v9
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v26
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff, v28
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v2, 0xff, v30
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v3, 0xff, v53
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v13, 0xffff, v11
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v11, 0xff, v51
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v12, 0xff, v50
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v14, 0xff, v49
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v15, 0xff, v48
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v86, 0xff, v39
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v0, v27
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v1, v29
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v2, v2, v54
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v3, v3, v55
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v11, v11, v17
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v12, v12, v19
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v14, v14, v21
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v15, v15, v23
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v86, v86, v25
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v87, 16, v11
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v96, 0xffff, v12
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v15, 0xffff, v15
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v86, 16, v86
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v11, v0, v1
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v12, v2, v3
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v13, v13, v87
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v14, v96, v14
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v15, v15, v86
-; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, s6
-; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v2, s7 :: v_dual_mov_b32 v3, s8
-; GFX11-TRUE16-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s4
-; GFX11-TRUE16-NEXT:    s_cbranch_vccnz .LBB71_3
-; GFX11-TRUE16-NEXT:  .LBB71_2: ; %cmp.true
-; GFX11-TRUE16-NEXT:    s_add_i32 s0, s0, 3
-; GFX11-TRUE16-NEXT:    s_add_i32 s2, s2, 3
-; GFX11-TRUE16-NEXT:    s_and_b32 s0, s0, 0xff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s1, s1, 8
-; GFX11-TRUE16-NEXT:    s_and_b32 s2, s2, 0xff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s3, s3, 8
-; GFX11-TRUE16-NEXT:    s_or_b32 s0, s1, s0
-; GFX11-TRUE16-NEXT:    s_or_b32 s1, s3, s2
-; GFX11-TRUE16-NEXT:    s_addk_i32 s0, 0x300
-; GFX11-TRUE16-NEXT:    s_addk_i32 s1, 0x300
-; GFX11-TRUE16-NEXT:    s_and_b32 s0, s0, 0xffff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s1, s1, 16
-; GFX11-TRUE16-NEXT:    s_add_i32 s16, s16, 3
-; GFX11-TRUE16-NEXT:    s_or_b32 s0, s0, s1
-; GFX11-TRUE16-NEXT:    s_and_b32 s1, s16, 0xff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s2, s17, 8
-; GFX11-TRUE16-NEXT:    s_add_i32 s18, s18, 3
-; GFX11-TRUE16-NEXT:    s_or_b32 s1, s2, s1
-; GFX11-TRUE16-NEXT:    s_and_b32 s2, s18, 0xff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s3, s19, 8
-; GFX11-TRUE16-NEXT:    s_addk_i32 s1, 0x300
-; GFX11-TRUE16-NEXT:    s_or_b32 s2, s3, s2
-; GFX11-TRUE16-NEXT:    s_add_i32 s20, s20, 3
-; GFX11-TRUE16-NEXT:    s_addk_i32 s2, 0x300
-; GFX11-TRUE16-NEXT:    s_and_b32 s1, s1, 0xffff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s2, s2, 16
-; GFX11-TRUE16-NEXT:    s_and_b32 s3, s20, 0xff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s4, s21, 8
-; GFX11-TRUE16-NEXT:    s_add_i32 s22, s22, 3
-; GFX11-TRUE16-NEXT:    s_or_b32 s1, s1, s2
-; GFX11-TRUE16-NEXT:    s_or_b32 s2, s4, s3
-; GFX11-TRUE16-NEXT:    s_and_b32 s3, s22, 0xff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s4, s23, 8
-; GFX11-TRUE16-NEXT:    s_add_i32 s24, s24, 3
-; GFX11-TRUE16-NEXT:    s_or_b32 s3, s4, s3
-; GFX11-TRUE16-NEXT:    s_and_b32 s4, s24, 0xff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s5, s25, 8
-; GFX11-TRUE16-NEXT:    s_addk_i32 s2, 0x300
-; GFX11-TRUE16-NEXT:    s_addk_i32 s3, 0x300
-; GFX11-TRUE16-NEXT:    s_or_b32 s4, s5, s4
-; GFX11-TRUE16-NEXT:    s_and_b32 s2, s2, 0xffff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s3, s3, 16
-; GFX11-TRUE16-NEXT:    s_addk_i32 s4, 0x300
-; GFX11-TRUE16-NEXT:    s_add_i32 s26, s26, 3
-; GFX11-TRUE16-NEXT:    s_or_b32 s2, s2, s3
-; GFX11-TRUE16-NEXT:    s_and_b32 s3, s4, 0xffff
-; GFX11-TRUE16-NEXT:    s_and_b32 s4, s26, 0xff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s5, s27, 8
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v0, 3, v31
-; GFX11-TRUE16-NEXT:    s_or_b32 s4, s5, s4
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v1, 3, v32
-; GFX11-TRUE16-NEXT:    s_addk_i32 s4, 0x300
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v6, 3, v38
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s4, s4, 16
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT:    s_or_b32 s3, s3, s4
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v2, 3, v33
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v10, 3, v22
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v11, 3, v24
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v83, v0
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v84, v1
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v6, 0xff, v6
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v10, 0xff, v10
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v11, 0xff, v11
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v2, v85, v2
-; GFX11-TRUE16-NEXT:    s_add_i32 s28, s28, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v0, 0x300, v0
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v1, 0x300, v1
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v6, v82, v6
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v2, 0x300, v2
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v10, v68, v10
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v11, v69, v11
-; GFX11-TRUE16-NEXT:    s_and_b32 s6, s28, 0xff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s7, s29, 8
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX11-TRUE16-NEXT:    s_or_b32 s5, s7, s6
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v6, 0x300, v6
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v10, 0x300, v10
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v11, 0x300, v11
-; GFX11-TRUE16-NEXT:    s_addk_i32 s5, 0x300
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v5, v1, v2
-; GFX11-TRUE16-NEXT:    s_and_b32 s5, s5, 0xffff
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v1, 3, v35
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v4, s5, v0
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v0, 3, v34
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v2, 3, v36
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v3, 3, v37
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v8, 0xffff, v6
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v6, 3, v16
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v7, 3, v18
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v9, 3, v20
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v10, 0xffff, v10
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v6, 0xff, v6
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v7, 0xff, v7
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v9, 0xff, v9
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v10, v10, v11
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(1)
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v11, 3, v52
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v70, v0
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v71, v1
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v2, v80, v2
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v3, v81, v3
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v6, v65, v6
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v7, v66, v7
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v9, v67, v9
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v11, 0xff, v11
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v0, 0x300, v0
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v1, 0x300, v1
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v2, 0x300, v2
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v3, 0x300, v3
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v6, 0x300, v6
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v7, 0x300, v7
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v9, 0x300, v9
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v11, v64, v11
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v6
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v13, 0xffff, v7
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v11, 0x300, v11
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v6, v0, v1
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v7, v2, v3
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v8, v8, v12
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v9, v13, v9
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v0, 3, v26
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v1, 3, v28
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v2, 3, v30
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v3, 3, v53
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v13, 0xffff, v11
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v11, 3, v51
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v12, 3, v50
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v14, 3, v49
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v15, 3, v48
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v16, 3, v39
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v11, 0xff, v11
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v12, 0xff, v12
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v14, 0xff, v14
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v15, 0xff, v15
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v16, 0xff, v16
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v27, v0
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v29, v1
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v2, v54, v2
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v3, v55, v3
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v11, v17, v11
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v12, v19, v12
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v14, v21, v14
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v15, v23, v15
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v16, v25, v16
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v0, 0x300, v0
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v1, 0x300, v1
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v2, 0x300, v2
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v3, 0x300, v3
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v11, 0x300, v11
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v12, 0x300, v12
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v14, 0x300, v14
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v15, 0x300, v15
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v16, 0x300, v16
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v17, 16, v11
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v18, 0xffff, v12
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v15, 0xffff, v15
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v16, 16, v16
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v11, v0, v1
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v12, v2, v3
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v13, v13, v17
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v14, v18, v14
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v15, v15, v16
-; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX11-TRUE16-NEXT:  .LBB71_3: ; %end
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
-; GFX11-TRUE16-NEXT:  .LBB71_4:
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
-; GFX11-TRUE16-NEXT:    s_branch .LBB71_2
-;
-; GFX11-FAKE16-LABEL: bitcast_v64i8_to_v8i64_scalar:
-; GFX11-FAKE16:       ; %bb.0:
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v38, v14 :: v_dual_mov_b32 v37, v12
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v36, v10 :: v_dual_mov_b32 v35, v8
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v34, v6 :: v_dual_mov_b32 v33, v4
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v32, v2 :: v_dual_mov_b32 v31, v0
-; GFX11-FAKE16-NEXT:    s_clause 0xf
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v0, off, s32 offset:56
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v39, off, s32 offset:52
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v2, off, s32 offset:60
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v4, off, s32
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v6, off, s32 offset:8
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v8, off, s32 offset:16
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v10, off, s32 offset:24
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v12, off, s32 offset:32
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v14, off, s32 offset:40
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v86, off, s32 offset:48
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v48, off, s32 offset:44
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v49, off, s32 offset:36
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v50, off, s32 offset:28
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v51, off, s32 offset:20
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v52, off, s32 offset:12
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v53, off, s32 offset:4
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v83, 8, v1
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v84, 8, v3
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v85, 8, v5
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v70, 8, v7
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v71, 8, v9
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v80, 8, v11
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v81, 8, v13
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v82, 8, v15
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v65, 8, v17
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v66, 8, v19
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v67, 8, v21
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v68, 8, v23
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v69, 8, v25
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v27, 8, v27
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v29, 8, v29
-; GFX11-FAKE16-NEXT:    s_mov_b32 s4, 0
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(15)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 8, v0
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(13)
-; GFX11-FAKE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v2
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(12)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v54, 8, v4
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(11)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v55, 8, v6
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(10)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v64, 8, v8
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(9)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v17, 8, v10
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(8)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v19, 8, v12
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(7)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v21, 8, v14
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(6)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v23, 8, v86
-; GFX11-FAKE16-NEXT:    s_and_b32 s5, vcc_lo, exec_lo
-; GFX11-FAKE16-NEXT:    s_cbranch_scc0 .LBB71_4
-; GFX11-FAKE16-NEXT:  ; %bb.1: ; %cmp.false
-; GFX11-FAKE16-NEXT:    s_and_b32 s5, s0, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s6, s1, 8
-; GFX11-FAKE16-NEXT:    s_and_b32 s7, s2, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s8, s3, 8
-; GFX11-FAKE16-NEXT:    s_or_b32 s5, s5, s6
-; GFX11-FAKE16-NEXT:    s_or_b32 s6, s7, s8
-; GFX11-FAKE16-NEXT:    s_and_b32 s5, s5, 0xffff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s6, s6, 16
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s7, s17, 8
-; GFX11-FAKE16-NEXT:    s_or_b32 s5, s5, s6
-; GFX11-FAKE16-NEXT:    s_and_b32 s6, s16, 0xff
-; GFX11-FAKE16-NEXT:    s_and_b32 s8, s18, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s9, s19, 8
-; GFX11-FAKE16-NEXT:    s_or_b32 s6, s6, s7
-; GFX11-FAKE16-NEXT:    s_or_b32 s7, s8, s9
-; GFX11-FAKE16-NEXT:    s_and_b32 s6, s6, 0xffff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s7, s7, 16
-; GFX11-FAKE16-NEXT:    s_and_b32 s8, s20, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s9, s21, 8
-; GFX11-FAKE16-NEXT:    s_or_b32 s6, s6, s7
-; GFX11-FAKE16-NEXT:    s_or_b32 s7, s8, s9
-; GFX11-FAKE16-NEXT:    s_and_b32 s8, s22, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s9, s23, 8
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s10, s25, 8
-; GFX11-FAKE16-NEXT:    s_or_b32 s8, s8, s9
-; GFX11-FAKE16-NEXT:    s_and_b32 s9, s24, 0xff
-; GFX11-FAKE16-NEXT:    s_and_b32 s7, s7, 0xffff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s8, s8, 16
-; GFX11-FAKE16-NEXT:    s_or_b32 s9, s9, s10
-; GFX11-FAKE16-NEXT:    s_or_b32 s7, s7, s8
-; GFX11-FAKE16-NEXT:    s_and_b32 s8, s9, 0xffff
-; GFX11-FAKE16-NEXT:    s_and_b32 s9, s26, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s10, s27, 8
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v31
-; GFX11-FAKE16-NEXT:    s_or_b32 s9, s9, s10
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v32
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s9, s9, 16
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xff, v38
-; GFX11-FAKE16-NEXT:    s_or_b32 s8, s8, s9
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v33
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v83
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v84
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v10, 0xff, v22
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v11, 0xff, v24
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v2, v85
-; GFX11-FAKE16-NEXT:    s_and_b32 s11, s28, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s12, s29, 8
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v6, v6, v82
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v10, v10, v68
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v11, v11, v69
-; GFX11-FAKE16-NEXT:    s_or_b32 s10, s11, s12
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v1, v2
-; GFX11-FAKE16-NEXT:    s_and_b32 s10, s10, 0xffff
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v35
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, s10, v0
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v34
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v36
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v37
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 0xffff, v6
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xff, v16
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xff, v18
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v9, 0xff, v20
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v10, 0xffff, v10
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v70
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v71
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v2, v80
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v3, v81
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v6, v6, v65
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v7, v7, v66
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v9, v9, v67
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v10, v10, v11
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(1)
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v11, 0xff, v52
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v6
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v13, 0xffff, v7
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v11, v11, v64
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v6, v0, v1
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v7, v2, v3
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v8, v8, v12
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v9, v13, v9
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v26
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v28
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v30
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v53
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v13, 0xffff, v11
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v11, 0xff, v51
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v12, 0xff, v50
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v14, 0xff, v49
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v15, 0xff, v48
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v86, 0xff, v39
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v27
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v29
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v2, v54
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v3, v55
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v11, v11, v17
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v12, v12, v19
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v14, v14, v21
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v15, v15, v23
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v86, v86, v25
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v87, 16, v11
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v96, 0xffff, v12
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v15, 0xffff, v15
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v86, 16, v86
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v11, v0, v1
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v12, v2, v3
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v13, v13, v87
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v14, v96, v14
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v15, v15, v86
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, s6
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v2, s7 :: v_dual_mov_b32 v3, s8
-; GFX11-FAKE16-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s4
-; GFX11-FAKE16-NEXT:    s_cbranch_vccnz .LBB71_3
-; GFX11-FAKE16-NEXT:  .LBB71_2: ; %cmp.true
-; GFX11-FAKE16-NEXT:    s_add_i32 s0, s0, 3
-; GFX11-FAKE16-NEXT:    s_add_i32 s2, s2, 3
-; GFX11-FAKE16-NEXT:    s_and_b32 s0, s0, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s1, s1, 8
-; GFX11-FAKE16-NEXT:    s_and_b32 s2, s2, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s3, s3, 8
-; GFX11-FAKE16-NEXT:    s_or_b32 s0, s1, s0
-; GFX11-FAKE16-NEXT:    s_or_b32 s1, s3, s2
-; GFX11-FAKE16-NEXT:    s_addk_i32 s0, 0x300
-; GFX11-FAKE16-NEXT:    s_addk_i32 s1, 0x300
-; GFX11-FAKE16-NEXT:    s_and_b32 s0, s0, 0xffff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s1, s1, 16
-; GFX11-FAKE16-NEXT:    s_add_i32 s16, s16, 3
-; GFX11-FAKE16-NEXT:    s_or_b32 s0, s0, s1
-; GFX11-FAKE16-NEXT:    s_and_b32 s1, s16, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s2, s17, 8
-; GFX11-FAKE16-NEXT:    s_add_i32 s18, s18, 3
-; GFX11-FAKE16-NEXT:    s_or_b32 s1, s2, s1
-; GFX11-FAKE16-NEXT:    s_and_b32 s2, s18, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s3, s19, 8
-; GFX11-FAKE16-NEXT:    s_addk_i32 s1, 0x300
-; GFX11-FAKE16-NEXT:    s_or_b32 s2, s3, s2
-; GFX11-FAKE16-NEXT:    s_add_i32 s20, s20, 3
-; GFX11-FAKE16-NEXT:    s_addk_i32 s2, 0x300
-; GFX11-FAKE16-NEXT:    s_and_b32 s1, s1, 0xffff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s2, s2, 16
-; GFX11-FAKE16-NEXT:    s_and_b32 s3, s20, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s4, s21, 8
-; GFX11-FAKE16-NEXT:    s_add_i32 s22, s22, 3
-; GFX11-FAKE16-NEXT:    s_or_b32 s1, s1, s2
-; GFX11-FAKE16-NEXT:    s_or_b32 s2, s4, s3
-; GFX11-FAKE16-NEXT:    s_and_b32 s3, s22, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s4, s23, 8
-; GFX11-FAKE16-NEXT:    s_add_i32 s24, s24, 3
-; GFX11-FAKE16-NEXT:    s_or_b32 s3, s4, s3
-; GFX11-FAKE16-NEXT:    s_and_b32 s4, s24, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s5, s25, 8
-; GFX11-FAKE16-NEXT:    s_addk_i32 s2, 0x300
-; GFX11-FAKE16-NEXT:    s_addk_i32 s3, 0x300
-; GFX11-FAKE16-NEXT:    s_or_b32 s4, s5, s4
-; GFX11-FAKE16-NEXT:    s_and_b32 s2, s2, 0xffff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s3, s3, 16
-; GFX11-FAKE16-NEXT:    s_addk_i32 s4, 0x300
-; GFX11-FAKE16-NEXT:    s_add_i32 s26, s26, 3
-; GFX11-FAKE16-NEXT:    s_or_b32 s2, s2, s3
-; GFX11-FAKE16-NEXT:    s_and_b32 s3, s4, 0xffff
-; GFX11-FAKE16-NEXT:    s_and_b32 s4, s26, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s5, s27, 8
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v0, 3, v31
-; GFX11-FAKE16-NEXT:    s_or_b32 s4, s5, s4
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v1, 3, v32
-; GFX11-FAKE16-NEXT:    s_addk_i32 s4, 0x300
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v6, 3, v38
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s4, s4, 16
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-FAKE16-NEXT:    s_or_b32 s3, s3, s4
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v2, 3, v33
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v10, 3, v22
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v11, 3, v24
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v83, v0
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v84, v1
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xff, v6
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v10, 0xff, v10
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v11, 0xff, v11
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v85, v2
-; GFX11-FAKE16-NEXT:    s_add_i32 s28, s28, 3
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v0, 0x300, v0
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v1, 0x300, v1
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v6, v82, v6
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v2, 0x300, v2
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v10, v68, v10
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v11, v69, v11
-; GFX11-FAKE16-NEXT:    s_and_b32 s6, s28, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s7, s29, 8
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX11-FAKE16-NEXT:    s_or_b32 s5, s7, s6
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v6, 0x300, v6
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v10, 0x300, v10
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v11, 0x300, v11
-; GFX11-FAKE16-NEXT:    s_addk_i32 s5, 0x300
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v1, v2
-; GFX11-FAKE16-NEXT:    s_and_b32 s5, s5, 0xffff
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v1, 3, v35
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, s5, v0
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v0, 3, v34
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v2, 3, v36
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v3, 3, v37
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 0xffff, v6
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v6, 3, v16
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v7, 3, v18
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v9, 3, v20
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v10, 0xffff, v10
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xff, v6
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xff, v7
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v9, 0xff, v9
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v10, v10, v11
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(1)
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v11, 3, v52
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v70, v0
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v71, v1
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v80, v2
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v81, v3
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v6, v65, v6
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v7, v66, v7
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v9, v67, v9
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v11, 0xff, v11
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v0, 0x300, v0
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v1, 0x300, v1
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v2, 0x300, v2
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v3, 0x300, v3
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v6, 0x300, v6
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v7, 0x300, v7
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v9, 0x300, v9
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v11, v64, v11
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v6
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v13, 0xffff, v7
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v11, 0x300, v11
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v6, v0, v1
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v7, v2, v3
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v8, v8, v12
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v9, v13, v9
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v0, 3, v26
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v1, 3, v28
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v2, 3, v30
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v3, 3, v53
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v13, 0xffff, v11
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v11, 3, v51
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v12, 3, v50
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v14, 3, v49
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v15, 3, v48
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v16, 3, v39
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v11, 0xff, v11
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v12, 0xff, v12
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v14, 0xff, v14
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v15, 0xff, v15
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v16, 0xff, v16
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v27, v0
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v29, v1
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v54, v2
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v55, v3
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v11, v17, v11
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v12, v19, v12
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v14, v21, v14
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v15, v23, v15
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v16, v25, v16
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v0, 0x300, v0
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v1, 0x300, v1
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v2, 0x300, v2
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v3, 0x300, v3
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v11, 0x300, v11
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v12, 0x300, v12
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v14, 0x300, v14
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v15, 0x300, v15
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v16, 0x300, v16
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v17, 16, v11
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v18, 0xffff, v12
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v15, 0xffff, v15
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v16, 16, v16
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v11, v0, v1
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v12, v2, v3
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v13, v13, v17
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v14, v18, v14
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v15, v15, v16
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX11-FAKE16-NEXT:  .LBB71_3: ; %end
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
-; GFX11-FAKE16-NEXT:  .LBB71_4:
-; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
-; GFX11-FAKE16-NEXT:    s_branch .LBB71_2
+; GFX11-LABEL: bitcast_v64i8_to_v8i64_scalar:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_dual_mov_b32 v38, v14 :: v_dual_mov_b32 v37, v12
+; GFX11-NEXT:    v_dual_mov_b32 v36, v10 :: v_dual_mov_b32 v35, v8
+; GFX11-NEXT:    v_dual_mov_b32 v34, v6 :: v_dual_mov_b32 v33, v4
+; GFX11-NEXT:    v_dual_mov_b32 v32, v2 :: v_dual_mov_b32 v31, v0
+; GFX11-NEXT:    s_clause 0xf
+; GFX11-NEXT:    scratch_load_u16 v0, off, s32 offset:56
+; GFX11-NEXT:    scratch_load_u16 v39, off, s32 offset:52
+; GFX11-NEXT:    scratch_load_b32 v2, off, s32 offset:60
+; GFX11-NEXT:    scratch_load_u16 v4, off, s32
+; GFX11-NEXT:    scratch_load_u16 v6, off, s32 offset:8
+; GFX11-NEXT:    scratch_load_u16 v8, off, s32 offset:16
+; GFX11-NEXT:    scratch_load_u16 v10, off, s32 offset:24
+; GFX11-NEXT:    scratch_load_u16 v12, off, s32 offset:32
+; GFX11-NEXT:    scratch_load_u16 v14, off, s32 offset:40
+; GFX11-NEXT:    scratch_load_u16 v86, off, s32 offset:48
+; GFX11-NEXT:    scratch_load_u16 v48, off, s32 offset:44
+; GFX11-NEXT:    scratch_load_u16 v49, off, s32 offset:36
+; GFX11-NEXT:    scratch_load_u16 v50, off, s32 offset:28
+; GFX11-NEXT:    scratch_load_u16 v51, off, s32 offset:20
+; GFX11-NEXT:    scratch_load_u16 v52, off, s32 offset:12
+; GFX11-NEXT:    scratch_load_u16 v53, off, s32 offset:4
+; GFX11-NEXT:    v_lshlrev_b32_e32 v83, 8, v1
+; GFX11-NEXT:    v_lshlrev_b32_e32 v84, 8, v3
+; GFX11-NEXT:    v_lshlrev_b32_e32 v85, 8, v5
+; GFX11-NEXT:    v_lshlrev_b32_e32 v70, 8, v7
+; GFX11-NEXT:    v_lshlrev_b32_e32 v71, 8, v9
+; GFX11-NEXT:    v_lshlrev_b32_e32 v80, 8, v11
+; GFX11-NEXT:    v_lshlrev_b32_e32 v81, 8, v13
+; GFX11-NEXT:    v_lshlrev_b32_e32 v82, 8, v15
+; GFX11-NEXT:    v_lshlrev_b32_e32 v65, 8, v17
+; GFX11-NEXT:    v_lshlrev_b32_e32 v66, 8, v19
+; GFX11-NEXT:    v_lshlrev_b32_e32 v67, 8, v21
+; GFX11-NEXT:    v_lshlrev_b32_e32 v68, 8, v23
+; GFX11-NEXT:    v_lshlrev_b32_e32 v69, 8, v25
+; GFX11-NEXT:    v_lshlrev_b32_e32 v27, 8, v27
+; GFX11-NEXT:    v_lshlrev_b32_e32 v29, 8, v29
+; GFX11-NEXT:    s_mov_b32 s4, 0
+; GFX11-NEXT:    s_waitcnt vmcnt(15)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v25, 8, v0
+; GFX11-NEXT:    s_waitcnt vmcnt(13)
+; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v2
+; GFX11-NEXT:    s_waitcnt vmcnt(12)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v54, 8, v4
+; GFX11-NEXT:    s_waitcnt vmcnt(11)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v55, 8, v6
+; GFX11-NEXT:    s_waitcnt vmcnt(10)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v64, 8, v8
+; GFX11-NEXT:    s_waitcnt vmcnt(9)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v17, 8, v10
+; GFX11-NEXT:    s_waitcnt vmcnt(8)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v19, 8, v12
+; GFX11-NEXT:    s_waitcnt vmcnt(7)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v21, 8, v14
+; GFX11-NEXT:    s_waitcnt vmcnt(6)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v23, 8, v86
+; GFX11-NEXT:    s_and_b32 s5, vcc_lo, exec_lo
+; GFX11-NEXT:    s_cbranch_scc0 .LBB71_4
+; GFX11-NEXT:  ; %bb.1: ; %cmp.false
+; GFX11-NEXT:    s_and_b32 s5, s0, 0xff
+; GFX11-NEXT:    s_lshl_b32 s6, s1, 8
+; GFX11-NEXT:    s_and_b32 s7, s2, 0xff
+; GFX11-NEXT:    s_lshl_b32 s8, s3, 8
+; GFX11-NEXT:    s_or_b32 s5, s5, s6
+; GFX11-NEXT:    s_or_b32 s6, s7, s8
+; GFX11-NEXT:    s_and_b32 s5, s5, 0xffff
+; GFX11-NEXT:    s_lshl_b32 s6, s6, 16
+; GFX11-NEXT:    s_lshl_b32 s7, s17, 8
+; GFX11-NEXT:    s_or_b32 s5, s5, s6
+; GFX11-NEXT:    s_and_b32 s6, s16, 0xff
+; GFX11-NEXT:    s_and_b32 s8, s18, 0xff
+; GFX11-NEXT:    s_lshl_b32 s9, s19, 8
+; GFX11-NEXT:    s_or_b32 s6, s6, s7
+; GFX11-NEXT:    s_or_b32 s7, s8, s9
+; GFX11-NEXT:    s_and_b32 s6, s6, 0xffff
+; GFX11-NEXT:    s_lshl_b32 s7, s7, 16
+; GFX11-NEXT:    s_and_b32 s8, s20, 0xff
+; GFX11-NEXT:    s_lshl_b32 s9, s21, 8
+; GFX11-NEXT:    s_or_b32 s6, s6, s7
+; GFX11-NEXT:    s_or_b32 s7, s8, s9
+; GFX11-NEXT:    s_and_b32 s8, s22, 0xff
+; GFX11-NEXT:    s_lshl_b32 s9, s23, 8
+; GFX11-NEXT:    s_lshl_b32 s10, s25, 8
+; GFX11-NEXT:    s_or_b32 s8, s8, s9
+; GFX11-NEXT:    s_and_b32 s9, s24, 0xff
+; GFX11-NEXT:    s_and_b32 s7, s7, 0xffff
+; GFX11-NEXT:    s_lshl_b32 s8, s8, 16
+; GFX11-NEXT:    s_or_b32 s9, s9, s10
+; GFX11-NEXT:    s_or_b32 s7, s7, s8
+; GFX11-NEXT:    s_and_b32 s8, s9, 0xffff
+; GFX11-NEXT:    s_and_b32 s9, s26, 0xff
+; GFX11-NEXT:    s_lshl_b32 s10, s27, 8
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v31
+; GFX11-NEXT:    s_or_b32 s9, s9, s10
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v32
+; GFX11-NEXT:    s_lshl_b32 s9, s9, 16
+; GFX11-NEXT:    v_and_b32_e32 v6, 0xff, v38
+; GFX11-NEXT:    s_or_b32 s8, s8, s9
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v33
+; GFX11-NEXT:    v_or_b32_e32 v0, v0, v83
+; GFX11-NEXT:    v_or_b32_e32 v1, v1, v84
+; GFX11-NEXT:    v_and_b32_e32 v10, 0xff, v22
+; GFX11-NEXT:    v_and_b32_e32 v11, 0xff, v24
+; GFX11-NEXT:    v_or_b32_e32 v2, v2, v85
+; GFX11-NEXT:    s_and_b32 s11, s28, 0xff
+; GFX11-NEXT:    s_lshl_b32 s12, s29, 8
+; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-NEXT:    v_or_b32_e32 v6, v6, v82
+; GFX11-NEXT:    v_or_b32_e32 v10, v10, v68
+; GFX11-NEXT:    v_or_b32_e32 v11, v11, v69
+; GFX11-NEXT:    s_or_b32 s10, s11, s12
+; GFX11-NEXT:    v_or_b32_e32 v5, v1, v2
+; GFX11-NEXT:    s_and_b32 s10, s10, 0xffff
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v35
+; GFX11-NEXT:    v_or_b32_e32 v4, s10, v0
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v34
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v36
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v37
+; GFX11-NEXT:    v_and_b32_e32 v8, 0xffff, v6
+; GFX11-NEXT:    v_and_b32_e32 v6, 0xff, v16
+; GFX11-NEXT:    v_and_b32_e32 v7, 0xff, v18
+; GFX11-NEXT:    v_and_b32_e32 v9, 0xff, v20
+; GFX11-NEXT:    v_and_b32_e32 v10, 0xffff, v10
+; GFX11-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
+; GFX11-NEXT:    v_or_b32_e32 v0, v0, v70
+; GFX11-NEXT:    v_or_b32_e32 v1, v1, v71
+; GFX11-NEXT:    v_or_b32_e32 v2, v2, v80
+; GFX11-NEXT:    v_or_b32_e32 v3, v3, v81
+; GFX11-NEXT:    v_or_b32_e32 v6, v6, v65
+; GFX11-NEXT:    v_or_b32_e32 v7, v7, v66
+; GFX11-NEXT:    v_or_b32_e32 v9, v9, v67
+; GFX11-NEXT:    v_or_b32_e32 v10, v10, v11
+; GFX11-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-NEXT:    v_and_b32_e32 v11, 0xff, v52
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-NEXT:    v_lshlrev_b32_e32 v12, 16, v6
+; GFX11-NEXT:    v_and_b32_e32 v13, 0xffff, v7
+; GFX11-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
+; GFX11-NEXT:    v_or_b32_e32 v11, v11, v64
+; GFX11-NEXT:    v_or_b32_e32 v6, v0, v1
+; GFX11-NEXT:    v_or_b32_e32 v7, v2, v3
+; GFX11-NEXT:    v_or_b32_e32 v8, v8, v12
+; GFX11-NEXT:    v_or_b32_e32 v9, v13, v9
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v26
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v28
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v30
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v53
+; GFX11-NEXT:    v_and_b32_e32 v13, 0xffff, v11
+; GFX11-NEXT:    v_and_b32_e32 v11, 0xff, v51
+; GFX11-NEXT:    v_and_b32_e32 v12, 0xff, v50
+; GFX11-NEXT:    v_and_b32_e32 v14, 0xff, v49
+; GFX11-NEXT:    v_and_b32_e32 v15, 0xff, v48
+; GFX11-NEXT:    v_and_b32_e32 v86, 0xff, v39
+; GFX11-NEXT:    v_or_b32_e32 v0, v0, v27
+; GFX11-NEXT:    v_or_b32_e32 v1, v1, v29
+; GFX11-NEXT:    v_or_b32_e32 v2, v2, v54
+; GFX11-NEXT:    v_or_b32_e32 v3, v3, v55
+; GFX11-NEXT:    v_or_b32_e32 v11, v11, v17
+; GFX11-NEXT:    v_or_b32_e32 v12, v12, v19
+; GFX11-NEXT:    v_or_b32_e32 v14, v14, v21
+; GFX11-NEXT:    v_or_b32_e32 v15, v15, v23
+; GFX11-NEXT:    v_or_b32_e32 v86, v86, v25
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-NEXT:    v_lshlrev_b32_e32 v87, 16, v11
+; GFX11-NEXT:    v_and_b32_e32 v96, 0xffff, v12
+; GFX11-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
+; GFX11-NEXT:    v_and_b32_e32 v15, 0xffff, v15
+; GFX11-NEXT:    v_lshlrev_b32_e32 v86, 16, v86
+; GFX11-NEXT:    v_or_b32_e32 v11, v0, v1
+; GFX11-NEXT:    v_or_b32_e32 v12, v2, v3
+; GFX11-NEXT:    v_or_b32_e32 v13, v13, v87
+; GFX11-NEXT:    v_or_b32_e32 v14, v96, v14
+; GFX11-NEXT:    v_or_b32_e32 v15, v15, v86
+; GFX11-NEXT:    v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, s6
+; GFX11-NEXT:    v_dual_mov_b32 v2, s7 :: v_dual_mov_b32 v3, s8
+; GFX11-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s4
+; GFX11-NEXT:    s_cbranch_vccnz .LBB71_3
+; GFX11-NEXT:  .LBB71_2: ; %cmp.true
+; GFX11-NEXT:    s_add_i32 s0, s0, 3
+; GFX11-NEXT:    s_add_i32 s2, s2, 3
+; GFX11-NEXT:    s_and_b32 s0, s0, 0xff
+; GFX11-NEXT:    s_lshl_b32 s1, s1, 8
+; GFX11-NEXT:    s_and_b32 s2, s2, 0xff
+; GFX11-NEXT:    s_lshl_b32 s3, s3, 8
+; GFX11-NEXT:    s_or_b32 s0, s1, s0
+; GFX11-NEXT:    s_or_b32 s1, s3, s2
+; GFX11-NEXT:    s_addk_i32 s0, 0x300
+; GFX11-NEXT:    s_addk_i32 s1, 0x300
+; GFX11-NEXT:    s_and_b32 s0, s0, 0xffff
+; GFX11-NEXT:    s_lshl_b32 s1, s1, 16
+; GFX11-NEXT:    s_add_i32 s16, s16, 3
+; GFX11-NEXT:    s_or_b32 s0, s0, s1
+; GFX11-NEXT:    s_and_b32 s1, s16, 0xff
+; GFX11-NEXT:    s_lshl_b32 s2, s17, 8
+; GFX11-NEXT:    s_add_i32 s18, s18, 3
+; GFX11-NEXT:    s_or_b32 s1, s2, s1
+; GFX11-NEXT:    s_and_b32 s2, s18, 0xff
+; GFX11-NEXT:    s_lshl_b32 s3, s19, 8
+; GFX11-NEXT:    s_addk_i32 s1, 0x300
+; GFX11-NEXT:    s_or_b32 s2, s3, s2
+; GFX11-NEXT:    s_add_i32 s20, s20, 3
+; GFX11-NEXT:    s_addk_i32 s2, 0x300
+; GFX11-NEXT:    s_and_b32 s1, s1, 0xffff
+; GFX11-NEXT:    s_lshl_b32 s2, s2, 16
+; GFX11-NEXT:    s_and_b32 s3, s20, 0xff
+; GFX11-NEXT:    s_lshl_b32 s4, s21, 8
+; GFX11-NEXT:    s_add_i32 s22, s22, 3
+; GFX11-NEXT:    s_or_b32 s1, s1, s2
+; GFX11-NEXT:    s_or_b32 s2, s4, s3
+; GFX11-NEXT:    s_and_b32 s3, s22, 0xff
+; GFX11-NEXT:    s_lshl_b32 s4, s23, 8
+; GFX11-NEXT:    s_add_i32 s24, s24, 3
+; GFX11-NEXT:    s_or_b32 s3, s4, s3
+; GFX11-NEXT:    s_and_b32 s4, s24, 0xff
+; GFX11-NEXT:    s_lshl_b32 s5, s25, 8
+; GFX11-NEXT:    s_addk_i32 s2, 0x300
+; GFX11-NEXT:    s_addk_i32 s3, 0x300
+; GFX11-NEXT:    s_or_b32 s4, s5, s4
+; GFX11-NEXT:    s_and_b32 s2, s2, 0xffff
+; GFX11-NEXT:    s_lshl_b32 s3, s3, 16
+; GFX11-NEXT:    s_addk_i32 s4, 0x300
+; GFX11-NEXT:    s_add_i32 s26, s26, 3
+; GFX11-NEXT:    s_or_b32 s2, s2, s3
+; GFX11-NEXT:    s_and_b32 s3, s4, 0xffff
+; GFX11-NEXT:    s_and_b32 s4, s26, 0xff
+; GFX11-NEXT:    s_lshl_b32 s5, s27, 8
+; GFX11-NEXT:    v_add_nc_u32_e32 v0, 3, v31
+; GFX11-NEXT:    s_or_b32 s4, s5, s4
+; GFX11-NEXT:    v_add_nc_u32_e32 v1, 3, v32
+; GFX11-NEXT:    s_addk_i32 s4, 0x300
+; GFX11-NEXT:    v_add_nc_u32_e32 v6, 3, v38
+; GFX11-NEXT:    s_lshl_b32 s4, s4, 16
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX11-NEXT:    s_or_b32 s3, s3, s4
+; GFX11-NEXT:    v_add_nc_u32_e32 v2, 3, v33
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX11-NEXT:    v_add_nc_u32_e32 v10, 3, v22
+; GFX11-NEXT:    v_add_nc_u32_e32 v11, 3, v24
+; GFX11-NEXT:    v_or_b32_e32 v0, v83, v0
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GFX11-NEXT:    v_or_b32_e32 v1, v84, v1
+; GFX11-NEXT:    v_and_b32_e32 v6, 0xff, v6
+; GFX11-NEXT:    v_and_b32_e32 v10, 0xff, v10
+; GFX11-NEXT:    v_and_b32_e32 v11, 0xff, v11
+; GFX11-NEXT:    v_or_b32_e32 v2, v85, v2
+; GFX11-NEXT:    s_add_i32 s28, s28, 3
+; GFX11-NEXT:    v_add_nc_u32_e32 v0, 0x300, v0
+; GFX11-NEXT:    v_add_nc_u32_e32 v1, 0x300, v1
+; GFX11-NEXT:    v_or_b32_e32 v6, v82, v6
+; GFX11-NEXT:    v_add_nc_u32_e32 v2, 0x300, v2
+; GFX11-NEXT:    v_or_b32_e32 v10, v68, v10
+; GFX11-NEXT:    v_or_b32_e32 v11, v69, v11
+; GFX11-NEXT:    s_and_b32 s6, s28, 0xff
+; GFX11-NEXT:    s_lshl_b32 s7, s29, 8
+; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-NEXT:    s_or_b32 s5, s7, s6
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-NEXT:    v_add_nc_u32_e32 v6, 0x300, v6
+; GFX11-NEXT:    v_add_nc_u32_e32 v10, 0x300, v10
+; GFX11-NEXT:    v_add_nc_u32_e32 v11, 0x300, v11
+; GFX11-NEXT:    s_addk_i32 s5, 0x300
+; GFX11-NEXT:    v_or_b32_e32 v5, v1, v2
+; GFX11-NEXT:    s_and_b32 s5, s5, 0xffff
+; GFX11-NEXT:    v_add_nc_u32_e32 v1, 3, v35
+; GFX11-NEXT:    v_or_b32_e32 v4, s5, v0
+; GFX11-NEXT:    v_add_nc_u32_e32 v0, 3, v34
+; GFX11-NEXT:    v_add_nc_u32_e32 v2, 3, v36
+; GFX11-NEXT:    v_add_nc_u32_e32 v3, 3, v37
+; GFX11-NEXT:    v_and_b32_e32 v8, 0xffff, v6
+; GFX11-NEXT:    v_add_nc_u32_e32 v6, 3, v16
+; GFX11-NEXT:    v_add_nc_u32_e32 v7, 3, v18
+; GFX11-NEXT:    v_add_nc_u32_e32 v9, 3, v20
+; GFX11-NEXT:    v_and_b32_e32 v10, 0xffff, v10
+; GFX11-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v3
+; GFX11-NEXT:    v_and_b32_e32 v6, 0xff, v6
+; GFX11-NEXT:    v_and_b32_e32 v7, 0xff, v7
+; GFX11-NEXT:    v_and_b32_e32 v9, 0xff, v9
+; GFX11-NEXT:    v_or_b32_e32 v10, v10, v11
+; GFX11-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-NEXT:    v_add_nc_u32_e32 v11, 3, v52
+; GFX11-NEXT:    v_or_b32_e32 v0, v70, v0
+; GFX11-NEXT:    v_or_b32_e32 v1, v71, v1
+; GFX11-NEXT:    v_or_b32_e32 v2, v80, v2
+; GFX11-NEXT:    v_or_b32_e32 v3, v81, v3
+; GFX11-NEXT:    v_or_b32_e32 v6, v65, v6
+; GFX11-NEXT:    v_or_b32_e32 v7, v66, v7
+; GFX11-NEXT:    v_or_b32_e32 v9, v67, v9
+; GFX11-NEXT:    v_and_b32_e32 v11, 0xff, v11
+; GFX11-NEXT:    v_add_nc_u32_e32 v0, 0x300, v0
+; GFX11-NEXT:    v_add_nc_u32_e32 v1, 0x300, v1
+; GFX11-NEXT:    v_add_nc_u32_e32 v2, 0x300, v2
+; GFX11-NEXT:    v_add_nc_u32_e32 v3, 0x300, v3
+; GFX11-NEXT:    v_add_nc_u32_e32 v6, 0x300, v6
+; GFX11-NEXT:    v_add_nc_u32_e32 v7, 0x300, v7
+; GFX11-NEXT:    v_add_nc_u32_e32 v9, 0x300, v9
+; GFX11-NEXT:    v_or_b32_e32 v11, v64, v11
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-NEXT:    v_lshlrev_b32_e32 v12, 16, v6
+; GFX11-NEXT:    v_and_b32_e32 v13, 0xffff, v7
+; GFX11-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
+; GFX11-NEXT:    v_add_nc_u32_e32 v11, 0x300, v11
+; GFX11-NEXT:    v_or_b32_e32 v6, v0, v1
+; GFX11-NEXT:    v_or_b32_e32 v7, v2, v3
+; GFX11-NEXT:    v_or_b32_e32 v8, v8, v12
+; GFX11-NEXT:    v_or_b32_e32 v9, v13, v9
+; GFX11-NEXT:    v_add_nc_u32_e32 v0, 3, v26
+; GFX11-NEXT:    v_add_nc_u32_e32 v1, 3, v28
+; GFX11-NEXT:    v_add_nc_u32_e32 v2, 3, v30
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_add_nc_u32_e32 v3, 3, v53
+; GFX11-NEXT:    v_and_b32_e32 v13, 0xffff, v11
+; GFX11-NEXT:    v_add_nc_u32_e32 v11, 3, v51
+; GFX11-NEXT:    v_add_nc_u32_e32 v12, 3, v50
+; GFX11-NEXT:    v_add_nc_u32_e32 v14, 3, v49
+; GFX11-NEXT:    v_add_nc_u32_e32 v15, 3, v48
+; GFX11-NEXT:    v_add_nc_u32_e32 v16, 3, v39
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v3
+; GFX11-NEXT:    v_and_b32_e32 v11, 0xff, v11
+; GFX11-NEXT:    v_and_b32_e32 v12, 0xff, v12
+; GFX11-NEXT:    v_and_b32_e32 v14, 0xff, v14
+; GFX11-NEXT:    v_and_b32_e32 v15, 0xff, v15
+; GFX11-NEXT:    v_and_b32_e32 v16, 0xff, v16
+; GFX11-NEXT:    v_or_b32_e32 v0, v27, v0
+; GFX11-NEXT:    v_or_b32_e32 v1, v29, v1
+; GFX11-NEXT:    v_or_b32_e32 v2, v54, v2
+; GFX11-NEXT:    v_or_b32_e32 v3, v55, v3
+; GFX11-NEXT:    v_or_b32_e32 v11, v17, v11
+; GFX11-NEXT:    v_or_b32_e32 v12, v19, v12
+; GFX11-NEXT:    v_or_b32_e32 v14, v21, v14
+; GFX11-NEXT:    v_or_b32_e32 v15, v23, v15
+; GFX11-NEXT:    v_or_b32_e32 v16, v25, v16
+; GFX11-NEXT:    v_add_nc_u32_e32 v0, 0x300, v0
+; GFX11-NEXT:    v_add_nc_u32_e32 v1, 0x300, v1
+; GFX11-NEXT:    v_add_nc_u32_e32 v2, 0x300, v2
+; GFX11-NEXT:    v_add_nc_u32_e32 v3, 0x300, v3
+; GFX11-NEXT:    v_add_nc_u32_e32 v11, 0x300, v11
+; GFX11-NEXT:    v_add_nc_u32_e32 v12, 0x300, v12
+; GFX11-NEXT:    v_add_nc_u32_e32 v14, 0x300, v14
+; GFX11-NEXT:    v_add_nc_u32_e32 v15, 0x300, v15
+; GFX11-NEXT:    v_add_nc_u32_e32 v16, 0x300, v16
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-NEXT:    v_lshlrev_b32_e32 v17, 16, v11
+; GFX11-NEXT:    v_and_b32_e32 v18, 0xffff, v12
+; GFX11-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
+; GFX11-NEXT:    v_and_b32_e32 v15, 0xffff, v15
+; GFX11-NEXT:    v_lshlrev_b32_e32 v16, 16, v16
+; GFX11-NEXT:    v_or_b32_e32 v11, v0, v1
+; GFX11-NEXT:    v_or_b32_e32 v12, v2, v3
+; GFX11-NEXT:    v_or_b32_e32 v13, v13, v17
+; GFX11-NEXT:    v_or_b32_e32 v14, v18, v14
+; GFX11-NEXT:    v_or_b32_e32 v15, v15, v16
+; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-NEXT:  .LBB71_3: ; %end
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-NEXT:  .LBB71_4:
+; GFX11-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+; GFX11-NEXT:    s_branch .LBB71_2
   %cmp = icmp eq i32 %b, 0
   br i1 %cmp, label %cmp.true, label %cmp.false
 
@@ -54758,107 +53603,107 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) {
 ; GFX11-TRUE16:       ; %bb.0:
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-TRUE16-NEXT:    s_clause 0x1f
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v64, off, s32 offset:128
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v31, off, s32 offset:124
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v64, off, s32 offset:120
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v31, off, s32 offset:116
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v65, off, s32 offset:112
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v32, off, s32 offset:108
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v65, off, s32 offset:104
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v32, off, s32 offset:100
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v66, off, s32 offset:96
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v33, off, s32 offset:92
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v66, off, s32 offset:88
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v81, off, s32 offset:132
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v67, off, s32
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v67, off, s32 offset:8
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v68, off, s32 offset:16
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v68, off, s32 offset:24
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v69, off, s32 offset:32
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v69, off, s32 offset:40
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v70, off, s32 offset:48
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v70, off, s32 offset:56
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v71, off, s32 offset:64
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v71, off, s32 offset:72
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v80, off, s32 offset:80
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v33, off, s32 offset:84
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v34, off, s32 offset:76
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v34, off, s32 offset:68
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v35, off, s32 offset:60
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v35, off, s32 offset:52
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v36, off, s32 offset:44
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v36, off, s32 offset:36
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v37, off, s32 offset:28
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v37, off, s32 offset:20
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v80, off, s32 offset:128
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v48, off, s32 offset:124
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v81, off, s32 offset:120
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v49, off, s32 offset:116
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v82, off, s32 offset:112
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v50, off, s32 offset:108
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v83, off, s32 offset:104
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v51, off, s32 offset:100
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v84, off, s32 offset:96
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v52, off, s32 offset:92
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v85, off, s32 offset:88
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v86, off, s32 offset:132
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v87, off, s32
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v96, off, s32 offset:8
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v97, off, s32 offset:16
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v98, off, s32 offset:24
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v99, off, s32 offset:32
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v100, off, s32 offset:40
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v101, off, s32 offset:48
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v102, off, s32 offset:56
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v103, off, s32 offset:64
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v112, off, s32 offset:72
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v113, off, s32 offset:80
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v53, off, s32 offset:84
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v54, off, s32 offset:76
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v55, off, s32 offset:68
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v64, off, s32 offset:60
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v65, off, s32 offset:52
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v66, off, s32 offset:44
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v67, off, s32 offset:36
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v68, off, s32 offset:28
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v69, off, s32 offset:20
 ; GFX11-TRUE16-NEXT:    s_clause 0x1
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v38, off, s32 offset:12
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v38, off, s32 offset:4
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v80.h, v29.l
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v70, off, s32 offset:12
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v71, off, s32 offset:4
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(32)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v48.h, v29.l
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v27.h, v27.l
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v24.h, v22.l
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v26.h, v20.l
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v28.h, v18.l
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v29.l, v16.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v39.l, v14.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v48.h, v12.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v49.l, v10.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v49.h, v8.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v52.h, v6.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v52.l, v4.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v53.h, v2.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v55.h, v0.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v54.h, 8, v1.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v55.l, 8, v3.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v53.l, 8, v5.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v54.l, 8, v7.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v51.l, 8, v9.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v51.h, 8, v11.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v50.l, 8, v13.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v50.h, 8, v15.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v39.h, 8, v17.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v48.l, 8, v19.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v31.l, v14.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v32.h, v12.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v33.l, v10.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v33.h, v8.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v36.h, v6.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v36.l, v4.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v37.h, v2.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v39.h, v0.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v38.h, 8, v1.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v39.l, 8, v3.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v37.l, 8, v5.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v38.l, 8, v7.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v35.l, 8, v9.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v35.h, 8, v11.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v34.l, 8, v13.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v34.h, 8, v15.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v31.h, 8, v17.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v32.l, 8, v19.l
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v29.h, 8, v21.l
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v30.h, 8, v23.l
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v27.l, 8, v25.l
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v27.h, 8, v27.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v25.l, 8, v80.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v25.l, 8, v48.h
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(33)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v16.h, 8, v64.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v16.h, 8, v80.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(31)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v16.l, 8, v64.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v16.l, 8, v81.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(29)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v17.h, 8, v65.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v17.h, 8, v82.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(27)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v17.l, 8, v65.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v17.l, 8, v83.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(25)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v18.h, 8, v66.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v18.h, 8, v84.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(23)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v18.l, 8, v66.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v18.l, 8, v85.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(21)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v25.h, 8, v67.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v25.h, 8, v87.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(20)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v23.l, 8, v67.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v23.l, 8, v96.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(19)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v23.h, 8, v68.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v23.h, 8, v97.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(18)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v22.l, 8, v68.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v22.l, 8, v98.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(17)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v22.h, 8, v69.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v22.h, 8, v99.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(16)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v21.l, 8, v69.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v21.l, 8, v100.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(15)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v21.h, 8, v70.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v21.h, 8, v101.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(14)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v20.l, 8, v70.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v20.l, 8, v102.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(13)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v20.h, 8, v71.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v20.h, 8, v103.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(12)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v19.l, 8, v71.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v19.l, 8, v112.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(11)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v19.h, 8, v80.l
-; GFX11-TRUE16-NEXT:    v_cmpx_ne_u32_e32 0, v81
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v19.h, 8, v113.l
+; GFX11-TRUE16-NEXT:    v_cmpx_ne_u32_e32 0, v86
 ; GFX11-TRUE16-NEXT:    s_xor_b32 s0, exec_lo, s0
 ; GFX11-TRUE16-NEXT:    s_cbranch_execnz .LBB86_3
 ; GFX11-TRUE16-NEXT:  ; %bb.1: ; %Flow
@@ -54869,95 +53714,95 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) {
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ; GFX11-TRUE16-NEXT:  .LBB86_3: ; %cmp.false
-; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v55.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v53.h
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v64.h, 0
-; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v52.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v52.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v64.l, v0.l, v54.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v0.h, v55.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v64.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v3.h, v1.l, v54.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v64.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v49.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v2.h, 0xff, v49.l
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v64, v0
-; GFX11-TRUE16-NEXT:    v_or_b16 v64.l, v1.h, v53.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v39.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v37.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v80.h, 0
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v36.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v36.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v80.l, v0.l, v38.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v0.h, v39.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v80.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v3.h, v1.l, v38.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v80.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v33.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.h, 0xff, v33.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v80, v0
+; GFX11-TRUE16-NEXT:    v_or_b16 v80.l, v1.h, v37.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v4.l, 0xff, v29.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v4.h, 0xff, v28.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v2.h, v2.h, v51.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.h, v2.h, v35.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v5.l, 0xff, v26.h
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v64, v3
-; GFX11-TRUE16-NEXT:    v_or_b16 v64.l, v2.l, v51.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v64.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v3.l, 0xff, v48.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v3.h, 0xff, v39.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v4.h, v4.h, v48.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v80, v3
+; GFX11-TRUE16-NEXT:    v_or_b16 v80.l, v2.l, v35.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v80.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v3.l, 0xff, v32.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v3.h, 0xff, v31.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v4.h, v4.h, v32.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v5.h, 0xff, v24.h
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v2, v64, v2
-; GFX11-TRUE16-NEXT:    v_or_b16 v64.l, v3.l, v50.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v3.h, v3.h, v50.h
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v64.h
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v2, v80, v2
+; GFX11-TRUE16-NEXT:    v_or_b16 v80.l, v3.l, v34.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v3.h, v3.h, v34.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v80.h
 ; GFX11-TRUE16-NEXT:    v_or_b16 v5.h, v5.h, v30.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v6.l, 0xff, v24.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v6.h, 0xff, v26.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v7.l, 0xff, v28.l
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v3, v64, v3
-; GFX11-TRUE16-NEXT:    v_or_b16 v64.l, v4.l, v39.h
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v4.l, v64.h
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v3, v80, v3
+; GFX11-TRUE16-NEXT:    v_or_b16 v80.l, v4.l, v31.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v4.l, v80.h
 ; GFX11-TRUE16-NEXT:    v_or_b16 v6.h, v6.h, v27.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v7.h, 0xff, v30.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    v_and_b16 v8.l, 0xff, v38.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v8.h, 0xff, v38.l
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v4, v64, v4
-; GFX11-TRUE16-NEXT:    v_or_b16 v64.l, v5.l, v29.h
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v64.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v8.l, 0xff, v71.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v8.h, 0xff, v70.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v4, v80, v4
+; GFX11-TRUE16-NEXT:    v_or_b16 v80.l, v5.l, v29.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v80.h
 ; GFX11-TRUE16-NEXT:    v_or_b16 v7.h, v7.h, v25.h
 ; GFX11-TRUE16-NEXT:    v_or_b16 v8.h, v8.h, v23.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v9.l, 0xff, v37.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v9.h, 0xff, v37.l
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v5, v64, v5
-; GFX11-TRUE16-NEXT:    v_or_b16 v64.l, v6.l, v27.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v6.l, v64.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v10.l, 0xff, v36.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v9.l, 0xff, v69.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v9.h, 0xff, v68.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v5, v80, v5
+; GFX11-TRUE16-NEXT:    v_or_b16 v80.l, v6.l, v27.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v6.l, v80.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v10.l, 0xff, v67.l
 ; GFX11-TRUE16-NEXT:    v_or_b16 v9.h, v9.h, v22.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v10.h, 0xff, v36.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v11.l, 0xff, v35.h
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v6, v64, v6
-; GFX11-TRUE16-NEXT:    v_or_b16 v64.l, v7.l, v25.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v7.l, v64.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v10.h, 0xff, v66.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v11.l, 0xff, v65.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v6, v80, v6
+; GFX11-TRUE16-NEXT:    v_or_b16 v80.l, v7.l, v25.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v7.l, v80.h
 ; GFX11-TRUE16-NEXT:    v_or_b16 v10.h, v10.h, v21.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v11.h, 0xff, v35.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v12.l, 0xff, v34.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v12.h, 0xff, v34.l
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v7, v64, v7
-; GFX11-TRUE16-NEXT:    v_or_b16 v64.l, v8.l, v23.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v8.l, v64.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v11.h, 0xff, v64.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v12.l, 0xff, v55.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v12.h, 0xff, v54.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v7, v80, v7
+; GFX11-TRUE16-NEXT:    v_or_b16 v80.l, v8.l, v23.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v8.l, v80.h
 ; GFX11-TRUE16-NEXT:    v_or_b16 v11.h, v11.h, v20.h
 ; GFX11-TRUE16-NEXT:    v_or_b16 v12.h, v12.h, v19.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v13.l, 0xff, v33.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v13.h, 0xff, v33.l
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v8, v64, v8
-; GFX11-TRUE16-NEXT:    v_or_b16 v64.l, v9.l, v22.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v9.l, v64.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v14.l, 0xff, v32.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v13.l, 0xff, v53.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v13.h, 0xff, v52.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v8, v80, v8
+; GFX11-TRUE16-NEXT:    v_or_b16 v80.l, v9.l, v22.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v9.l, v80.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v14.l, 0xff, v51.l
 ; GFX11-TRUE16-NEXT:    v_or_b16 v13.h, v13.h, v18.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v14.h, 0xff, v32.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v15.l, 0xff, v31.h
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v9, v64, v9
-; GFX11-TRUE16-NEXT:    v_or_b16 v64.l, v10.l, v21.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v10.l, v64.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v14.h, 0xff, v50.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v15.l, 0xff, v49.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v9, v80, v9
+; GFX11-TRUE16-NEXT:    v_or_b16 v80.l, v10.l, v21.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v10.l, v80.h
 ; GFX11-TRUE16-NEXT:    v_or_b16 v14.h, v14.h, v17.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v15.h, 0xff, v31.l
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr55_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr53_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr52_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr52_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr49_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr49_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr48_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr39_lo16
+; GFX11-TRUE16-NEXT:    v_and_b16 v15.h, 0xff, v48.l
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr39_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr37_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr36_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr36_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr33_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr33_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr32_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr31_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr29_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr28_hi16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr26_hi16
@@ -54966,32 +53811,32 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) {
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr26_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr28_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr30_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr48
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr49
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr50
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr51
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr52
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr53
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr54
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr55
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr64
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr65
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr66
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr67
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr68
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr69
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr70
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr71
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr38_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr38_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr37_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr39_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr37_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr36_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr36_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr35_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr38_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr35_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr34_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr35_hi16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr34_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr33_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr33_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr32_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr32_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr34_hi16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr31_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr31_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr54_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr55_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr53_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr54_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr51_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr51_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr50_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr50_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr39_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr48_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr32_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr29_hi16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr30_hi16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr27_lo16
@@ -55009,199 +53854,199 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) {
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr18_hi16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr17_hi16
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v10, v64, v10
-; GFX11-TRUE16-NEXT:    v_or_b16 v64.l, v11.l, v20.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v11.l, v64.h
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v10, v80, v10
+; GFX11-TRUE16-NEXT:    v_or_b16 v80.l, v11.l, v20.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v11.l, v80.h
 ; GFX11-TRUE16-NEXT:    v_or_b16 v15.h, v15.h, v16.h
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr20_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr16_hi16
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v11, v64, v11
-; GFX11-TRUE16-NEXT:    v_or_b16 v64.l, v12.l, v19.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v12.l, v64.h
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v11, v80, v11
+; GFX11-TRUE16-NEXT:    v_or_b16 v80.l, v12.l, v19.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v12.l, v80.h
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr19_lo16
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v12, v64, v12
-; GFX11-TRUE16-NEXT:    v_or_b16 v64.l, v13.l, v18.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v13.l, v64.h
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v12, v80, v12
+; GFX11-TRUE16-NEXT:    v_or_b16 v80.l, v13.l, v18.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v13.l, v80.h
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr18_lo16
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v13, v64, v13
-; GFX11-TRUE16-NEXT:    v_or_b16 v64.l, v14.l, v17.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v14.l, v64.h
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v13, v80, v13
+; GFX11-TRUE16-NEXT:    v_or_b16 v80.l, v14.l, v17.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v14.l, v80.h
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr17_lo16
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v14, v64, v14
-; GFX11-TRUE16-NEXT:    v_or_b16 v64.l, v15.l, v16.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v15.l, v64.h
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v14, v80, v14
+; GFX11-TRUE16-NEXT:    v_or_b16 v80.l, v15.l, v16.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v15.l, v80.h
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr16_lo16
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v15, v64, v15
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v15, v80, v15
 ; GFX11-TRUE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
 ; GFX11-TRUE16-NEXT:    s_cbranch_execz .LBB86_2
 ; GFX11-TRUE16-NEXT:  .LBB86_4: ; %cmp.true
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v55.h, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, v53.h, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, v52.h, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.h, v52.l, 3
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v52.h, 0
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v39.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, v37.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, v36.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.h, v36.l, 3
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v36.h, 0
 ; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v0.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v1.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v1.h
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v52.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v54.h, v0.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v55.l, v0.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v54.l, v1.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, v53.l, v1.h
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, v49.h, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v52.l, 0x300, v0.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v36.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v38.h, v0.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v39.l, v0.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v38.l, v1.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, v37.l, v1.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, v33.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v36.l, 0x300, v0.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v3.h, 0x300, v0.h
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.h, v49.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.h, v33.l, 3
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v4.h, 0x300, v1.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, v48.h, 3
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v52, v3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v52.l, 0x300, v1.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, v32.h, 3
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v36, v3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v36.l, 0x300, v1.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v2.h
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.h, v39.l, 3
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v4.l, v52.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v3.l, v51.l, v1.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.h, v31.l, 3
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v4.l, v36.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v3.l, v35.l, v1.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v3.h, v51.h, v1.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v3.h, v35.h, v1.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v2.h, 0xff, v2.h
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v52, v4
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v52.l, 0x300, v3.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v52.h
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v36, v4
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v36.l, 0x300, v3.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v36.h
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v5.h, 0x300, v3.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v3.l, v50.l, v2.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v3.h, v50.h, v2.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v3.l, v34.l, v2.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v3.h, v34.h, v2.h
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v4.l, v29.l, 3
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v4.h, v28.h, 3
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v2, v52, v5
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v52.l, 0x300, v3.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v2, v36, v5
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v36.l, 0x300, v3.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v6.h, 0x300, v3.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v3.l, 0xff, v4.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v3.h, 0xff, v4.h
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v4.l, v26.h, 3
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v4.h, v24.h, 3
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v6.l, v52.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v5.l, v39.h, v3.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v5.h, v48.l, v3.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v6.l, v36.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v5.l, v31.h, v3.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v5.h, v32.l, v3.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v4.l, 0xff, v4.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v4.h, 0xff, v4.h
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v3, v52, v6
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v52.l, 0x300, v5.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v3, v36, v6
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v36.l, 0x300, v5.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v7.h, 0x300, v5.h
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v7.l, v52.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v7.l, v36.h
 ; GFX11-TRUE16-NEXT:    v_or_b16 v5.l, v29.h, v4.l
 ; GFX11-TRUE16-NEXT:    v_or_b16 v5.h, v30.h, v4.h
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v6.l, v24.l, 3
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v6.h, v26.l, 3
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v4, v52, v7
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v52.l, 0x300, v5.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v4, v36, v7
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v36.l, 0x300, v5.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v8.h, 0x300, v5.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v5.l, 0xff, v6.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v5.h, 0xff, v6.h
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v6.l, v28.l, 3
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v6.h, v30.l, 3
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v8.l, v52.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v8.l, v36.h
 ; GFX11-TRUE16-NEXT:    v_or_b16 v7.l, v27.l, v5.l
 ; GFX11-TRUE16-NEXT:    v_or_b16 v7.h, v27.h, v5.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v6.l, 0xff, v6.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v6.h, 0xff, v6.h
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v5, v52, v8
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v52.l, 0x300, v7.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v5, v36, v8
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v36.l, 0x300, v7.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v9.h, 0x300, v7.h
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v9.l, v52.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v9.l, v36.h
 ; GFX11-TRUE16-NEXT:    v_or_b16 v7.l, v25.l, v6.l
 ; GFX11-TRUE16-NEXT:    v_or_b16 v7.h, v25.h, v6.h
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v8.l, v38.h, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v8.h, v38.l, 3
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v6, v52, v9
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v52.l, 0x300, v7.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v8.l, v71.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v8.h, v70.l, 3
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v6, v36, v9
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v36.l, 0x300, v7.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v10.h, 0x300, v7.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v7.l, 0xff, v8.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v7.h, 0xff, v8.h
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v8.l, v37.h, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v8.h, v37.l, 3
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v10.l, v52.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v8.l, v69.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v8.h, v68.l, 3
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v10.l, v36.h
 ; GFX11-TRUE16-NEXT:    v_or_b16 v9.l, v23.l, v7.l
 ; GFX11-TRUE16-NEXT:    v_or_b16 v9.h, v23.h, v7.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v8.l, 0xff, v8.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v8.h, 0xff, v8.h
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v7, v52, v10
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v52.l, 0x300, v9.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v7, v36, v10
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v36.l, 0x300, v9.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v11.h, 0x300, v9.h
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v11.l, v52.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v11.l, v36.h
 ; GFX11-TRUE16-NEXT:    v_or_b16 v9.l, v22.l, v8.l
 ; GFX11-TRUE16-NEXT:    v_or_b16 v9.h, v22.h, v8.h
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v10.l, v36.h, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v10.h, v36.l, 3
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v8, v52, v11
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v52.l, 0x300, v9.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v10.l, v67.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v10.h, v66.l, 3
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v8, v36, v11
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v36.l, 0x300, v9.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v12.h, 0x300, v9.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v9.l, 0xff, v10.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v9.h, 0xff, v10.h
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v10.l, v35.h, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v10.h, v35.l, 3
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v12.l, v52.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v10.l, v65.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v10.h, v64.l, 3
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v12.l, v36.h
 ; GFX11-TRUE16-NEXT:    v_or_b16 v11.l, v21.l, v9.l
 ; GFX11-TRUE16-NEXT:    v_or_b16 v11.h, v21.h, v9.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v10.l, 0xff, v10.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v10.h, 0xff, v10.h
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v9, v52, v12
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v52.l, 0x300, v11.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v9, v36, v12
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v36.l, 0x300, v11.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v13.h, 0x300, v11.h
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v13.l, v52.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v13.l, v36.h
 ; GFX11-TRUE16-NEXT:    v_or_b16 v11.l, v20.l, v10.l
 ; GFX11-TRUE16-NEXT:    v_or_b16 v11.h, v20.h, v10.h
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v12.l, v34.h, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v12.h, v34.l, 3
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v10, v52, v13
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v52.l, 0x300, v11.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v12.l, v55.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v12.h, v54.l, 3
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v10, v36, v13
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v36.l, 0x300, v11.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v14.h, 0x300, v11.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v11.l, 0xff, v12.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v11.h, 0xff, v12.h
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v12.l, v33.h, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v12.h, v33.l, 3
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v14.l, v52.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v12.l, v53.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v12.h, v52.l, 3
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v14.l, v36.h
 ; GFX11-TRUE16-NEXT:    v_or_b16 v13.l, v19.l, v11.l
 ; GFX11-TRUE16-NEXT:    v_or_b16 v13.h, v19.h, v11.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v12.l, 0xff, v12.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v12.h, 0xff, v12.h
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v11, v52, v14
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v52.l, 0x300, v13.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v11, v36, v14
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v36.l, 0x300, v13.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v15.h, 0x300, v13.h
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v15.l, v52.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v15.l, v36.h
 ; GFX11-TRUE16-NEXT:    v_or_b16 v13.l, v18.l, v12.l
 ; GFX11-TRUE16-NEXT:    v_or_b16 v13.h, v18.h, v12.h
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v14.l, v32.h, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v14.h, v32.l, 3
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v12, v52, v15
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v52.l, 0x300, v13.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v14.l, v51.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v14.h, v50.l, 3
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v12, v36, v15
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v36.l, 0x300, v13.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v18.h, 0x300, v13.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v13.l, 0xff, v14.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v13.h, 0xff, v14.h
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v14.l, v31.h, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v14.h, v31.l, 3
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v18.l, v52.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v14.l, v49.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v14.h, v48.l, 3
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v18.l, v36.h
 ; GFX11-TRUE16-NEXT:    v_or_b16 v15.l, v17.l, v13.l
 ; GFX11-TRUE16-NEXT:    v_or_b16 v15.h, v17.h, v13.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v14.l, 0xff, v14.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v14.h, 0xff, v14.h
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v13, v52, v18
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v52.l, 0x300, v15.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v13, v36, v18
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v36.l, 0x300, v15.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v17.h, 0x300, v15.h
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v17.l, v52.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v17.l, v36.h
 ; GFX11-TRUE16-NEXT:    v_or_b16 v15.l, v16.l, v14.l
 ; GFX11-TRUE16-NEXT:    v_or_b16 v15.h, v16.h, v14.h
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v14, v52, v17
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v52.l, 0x300, v15.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v14, v36, v17
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v36.l, 0x300, v15.l
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v15.h, 0x300, v15.h
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v15.l, v52.h
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v15, v52, v15
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v15.l, v36.h
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v15, v36, v15
 ; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -56981,775 +55826,390 @@ define inreg <8 x double> @bitcast_v64i8_to_v8f64_scalar(<64 x i8> inreg %a, i32
 ; GFX9-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
 ; GFX9-NEXT:    s_branch .LBB87_2
 ;
-; GFX11-TRUE16-LABEL: bitcast_v64i8_to_v8f64_scalar:
-; GFX11-TRUE16:       ; %bb.0:
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v38, v14 :: v_dual_mov_b32 v37, v12
-; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v36, v10 :: v_dual_mov_b32 v35, v8
-; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v34, v6 :: v_dual_mov_b32 v33, v4
-; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v32, v2 :: v_dual_mov_b32 v31, v0
-; GFX11-TRUE16-NEXT:    s_clause 0xf
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v0, off, s32 offset:56
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v39, off, s32 offset:52
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v2, off, s32 offset:60
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v4, off, s32
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v6, off, s32 offset:8
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v8, off, s32 offset:16
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v10, off, s32 offset:24
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v12, off, s32 offset:32
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v14, off, s32 offset:40
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v86, off, s32 offset:48
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v48, off, s32 offset:44
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v49, off, s32 offset:36
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v50, off, s32 offset:28
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v51, off, s32 offset:20
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v52, off, s32 offset:12
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v53, off, s32 offset:4
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v83, 8, v1
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v84, 8, v3
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v85, 8, v5
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v70, 8, v7
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v71, 8, v9
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v80, 8, v11
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v81, 8, v13
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v82, 8, v15
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v65, 8, v17
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v66, 8, v19
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v67, 8, v21
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v68, 8, v23
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v69, 8, v25
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v27, 8, v27
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v29, 8, v29
-; GFX11-TRUE16-NEXT:    s_mov_b32 s4, 0
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(15)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v25, 8, v0
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(13)
-; GFX11-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v2
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(12)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v54, 8, v4
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(11)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v55, 8, v6
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(10)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v64, 8, v8
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(9)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v17, 8, v10
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(8)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v19, 8, v12
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(7)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v21, 8, v14
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(6)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v23, 8, v86
-; GFX11-TRUE16-NEXT:    s_and_b32 s5, vcc_lo, exec_lo
-; GFX11-TRUE16-NEXT:    s_cbranch_scc0 .LBB87_4
-; GFX11-TRUE16-NEXT:  ; %bb.1: ; %cmp.false
-; GFX11-TRUE16-NEXT:    s_and_b32 s5, s0, 0xff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s6, s1, 8
-; GFX11-TRUE16-NEXT:    s_and_b32 s7, s2, 0xff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s8, s3, 8
-; GFX11-TRUE16-NEXT:    s_or_b32 s5, s5, s6
-; GFX11-TRUE16-NEXT:    s_or_b32 s6, s7, s8
-; GFX11-TRUE16-NEXT:    s_and_b32 s5, s5, 0xffff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s6, s6, 16
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s7, s17, 8
-; GFX11-TRUE16-NEXT:    s_or_b32 s5, s5, s6
-; GFX11-TRUE16-NEXT:    s_and_b32 s6, s16, 0xff
-; GFX11-TRUE16-NEXT:    s_and_b32 s8, s18, 0xff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s9, s19, 8
-; GFX11-TRUE16-NEXT:    s_or_b32 s6, s6, s7
-; GFX11-TRUE16-NEXT:    s_or_b32 s7, s8, s9
-; GFX11-TRUE16-NEXT:    s_and_b32 s6, s6, 0xffff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s7, s7, 16
-; GFX11-TRUE16-NEXT:    s_and_b32 s8, s20, 0xff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s9, s21, 8
-; GFX11-TRUE16-NEXT:    s_or_b32 s6, s6, s7
-; GFX11-TRUE16-NEXT:    s_or_b32 s7, s8, s9
-; GFX11-TRUE16-NEXT:    s_and_b32 s8, s22, 0xff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s9, s23, 8
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s10, s25, 8
-; GFX11-TRUE16-NEXT:    s_or_b32 s8, s8, s9
-; GFX11-TRUE16-NEXT:    s_and_b32 s9, s24, 0xff
-; GFX11-TRUE16-NEXT:    s_and_b32 s7, s7, 0xffff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s8, s8, 16
-; GFX11-TRUE16-NEXT:    s_or_b32 s9, s9, s10
-; GFX11-TRUE16-NEXT:    s_or_b32 s7, s7, s8
-; GFX11-TRUE16-NEXT:    s_and_b32 s8, s9, 0xffff
-; GFX11-TRUE16-NEXT:    s_and_b32 s9, s26, 0xff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s10, s27, 8
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v31
-; GFX11-TRUE16-NEXT:    s_or_b32 s9, s9, s10
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff, v32
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s9, s9, 16
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v6, 0xff, v38
-; GFX11-TRUE16-NEXT:    s_or_b32 s8, s8, s9
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v2, 0xff, v33
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v0, v83
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v1, v84
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v10, 0xff, v22
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v11, 0xff, v24
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v2, v2, v85
-; GFX11-TRUE16-NEXT:    s_and_b32 s11, s28, 0xff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s12, s29, 8
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v6, v6, v82
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v10, v10, v68
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v11, v11, v69
-; GFX11-TRUE16-NEXT:    s_or_b32 s10, s11, s12
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v5, v1, v2
-; GFX11-TRUE16-NEXT:    s_and_b32 s10, s10, 0xffff
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff, v35
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v4, s10, v0
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v34
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v2, 0xff, v36
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v3, 0xff, v37
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v8, 0xffff, v6
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v6, 0xff, v16
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v7, 0xff, v18
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v9, 0xff, v20
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v10, 0xffff, v10
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v0, v70
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v1, v71
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v2, v2, v80
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v3, v3, v81
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v6, v6, v65
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v7, v7, v66
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v9, v9, v67
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v10, v10, v11
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(1)
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v11, 0xff, v52
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v6
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v13, 0xffff, v7
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v11, v11, v64
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v6, v0, v1
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v7, v2, v3
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v8, v8, v12
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v9, v13, v9
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v26
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff, v28
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v2, 0xff, v30
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v3, 0xff, v53
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v13, 0xffff, v11
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v11, 0xff, v51
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v12, 0xff, v50
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v14, 0xff, v49
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v15, 0xff, v48
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v86, 0xff, v39
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v0, v27
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v1, v29
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v2, v2, v54
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v3, v3, v55
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v11, v11, v17
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v12, v12, v19
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v14, v14, v21
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v15, v15, v23
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v86, v86, v25
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v87, 16, v11
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v96, 0xffff, v12
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v15, 0xffff, v15
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v86, 16, v86
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v11, v0, v1
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v12, v2, v3
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v13, v13, v87
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v14, v96, v14
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v15, v15, v86
-; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, s6
-; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v2, s7 :: v_dual_mov_b32 v3, s8
-; GFX11-TRUE16-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s4
-; GFX11-TRUE16-NEXT:    s_cbranch_vccnz .LBB87_3
-; GFX11-TRUE16-NEXT:  .LBB87_2: ; %cmp.true
-; GFX11-TRUE16-NEXT:    s_add_i32 s0, s0, 3
-; GFX11-TRUE16-NEXT:    s_add_i32 s2, s2, 3
-; GFX11-TRUE16-NEXT:    s_and_b32 s0, s0, 0xff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s1, s1, 8
-; GFX11-TRUE16-NEXT:    s_and_b32 s2, s2, 0xff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s3, s3, 8
-; GFX11-TRUE16-NEXT:    s_or_b32 s0, s1, s0
-; GFX11-TRUE16-NEXT:    s_or_b32 s1, s3, s2
-; GFX11-TRUE16-NEXT:    s_addk_i32 s0, 0x300
-; GFX11-TRUE16-NEXT:    s_addk_i32 s1, 0x300
-; GFX11-TRUE16-NEXT:    s_and_b32 s0, s0, 0xffff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s1, s1, 16
-; GFX11-TRUE16-NEXT:    s_add_i32 s16, s16, 3
-; GFX11-TRUE16-NEXT:    s_or_b32 s0, s0, s1
-; GFX11-TRUE16-NEXT:    s_and_b32 s1, s16, 0xff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s2, s17, 8
-; GFX11-TRUE16-NEXT:    s_add_i32 s18, s18, 3
-; GFX11-TRUE16-NEXT:    s_or_b32 s1, s2, s1
-; GFX11-TRUE16-NEXT:    s_and_b32 s2, s18, 0xff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s3, s19, 8
-; GFX11-TRUE16-NEXT:    s_addk_i32 s1, 0x300
-; GFX11-TRUE16-NEXT:    s_or_b32 s2, s3, s2
-; GFX11-TRUE16-NEXT:    s_add_i32 s20, s20, 3
-; GFX11-TRUE16-NEXT:    s_addk_i32 s2, 0x300
-; GFX11-TRUE16-NEXT:    s_and_b32 s1, s1, 0xffff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s2, s2, 16
-; GFX11-TRUE16-NEXT:    s_and_b32 s3, s20, 0xff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s4, s21, 8
-; GFX11-TRUE16-NEXT:    s_add_i32 s22, s22, 3
-; GFX11-TRUE16-NEXT:    s_or_b32 s1, s1, s2
-; GFX11-TRUE16-NEXT:    s_or_b32 s2, s4, s3
-; GFX11-TRUE16-NEXT:    s_and_b32 s3, s22, 0xff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s4, s23, 8
-; GFX11-TRUE16-NEXT:    s_add_i32 s24, s24, 3
-; GFX11-TRUE16-NEXT:    s_or_b32 s3, s4, s3
-; GFX11-TRUE16-NEXT:    s_and_b32 s4, s24, 0xff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s5, s25, 8
-; GFX11-TRUE16-NEXT:    s_addk_i32 s2, 0x300
-; GFX11-TRUE16-NEXT:    s_addk_i32 s3, 0x300
-; GFX11-TRUE16-NEXT:    s_or_b32 s4, s5, s4
-; GFX11-TRUE16-NEXT:    s_and_b32 s2, s2, 0xffff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s3, s3, 16
-; GFX11-TRUE16-NEXT:    s_addk_i32 s4, 0x300
-; GFX11-TRUE16-NEXT:    s_add_i32 s26, s26, 3
-; GFX11-TRUE16-NEXT:    s_or_b32 s2, s2, s3
-; GFX11-TRUE16-NEXT:    s_and_b32 s3, s4, 0xffff
-; GFX11-TRUE16-NEXT:    s_and_b32 s4, s26, 0xff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s5, s27, 8
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v0, 3, v31
-; GFX11-TRUE16-NEXT:    s_or_b32 s4, s5, s4
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v1, 3, v32
-; GFX11-TRUE16-NEXT:    s_addk_i32 s4, 0x300
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v6, 3, v38
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s4, s4, 16
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT:    s_or_b32 s3, s3, s4
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v2, 3, v33
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v10, 3, v22
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v11, 3, v24
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v83, v0
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v84, v1
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v6, 0xff, v6
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v10, 0xff, v10
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v11, 0xff, v11
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v2, v85, v2
-; GFX11-TRUE16-NEXT:    s_add_i32 s28, s28, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v0, 0x300, v0
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v1, 0x300, v1
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v6, v82, v6
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v2, 0x300, v2
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v10, v68, v10
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v11, v69, v11
-; GFX11-TRUE16-NEXT:    s_and_b32 s6, s28, 0xff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s7, s29, 8
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX11-TRUE16-NEXT:    s_or_b32 s5, s7, s6
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v6, 0x300, v6
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v10, 0x300, v10
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v11, 0x300, v11
-; GFX11-TRUE16-NEXT:    s_addk_i32 s5, 0x300
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v5, v1, v2
-; GFX11-TRUE16-NEXT:    s_and_b32 s5, s5, 0xffff
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v1, 3, v35
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v4, s5, v0
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v0, 3, v34
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v2, 3, v36
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v3, 3, v37
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v8, 0xffff, v6
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v6, 3, v16
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v7, 3, v18
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v9, 3, v20
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v10, 0xffff, v10
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v6, 0xff, v6
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v7, 0xff, v7
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v9, 0xff, v9
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v10, v10, v11
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(1)
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v11, 3, v52
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v70, v0
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v71, v1
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v2, v80, v2
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v3, v81, v3
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v6, v65, v6
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v7, v66, v7
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v9, v67, v9
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v11, 0xff, v11
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v0, 0x300, v0
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v1, 0x300, v1
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v2, 0x300, v2
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v3, 0x300, v3
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v6, 0x300, v6
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v7, 0x300, v7
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v9, 0x300, v9
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v11, v64, v11
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v6
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v13, 0xffff, v7
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v11, 0x300, v11
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v6, v0, v1
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v7, v2, v3
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v8, v8, v12
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v9, v13, v9
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v0, 3, v26
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v1, 3, v28
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v2, 3, v30
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v3, 3, v53
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v13, 0xffff, v11
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v11, 3, v51
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v12, 3, v50
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v14, 3, v49
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v15, 3, v48
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v16, 3, v39
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v11, 0xff, v11
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v12, 0xff, v12
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v14, 0xff, v14
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v15, 0xff, v15
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v16, 0xff, v16
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v27, v0
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v29, v1
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v2, v54, v2
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v3, v55, v3
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v11, v17, v11
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v12, v19, v12
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v14, v21, v14
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v15, v23, v15
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v16, v25, v16
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v0, 0x300, v0
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v1, 0x300, v1
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v2, 0x300, v2
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v3, 0x300, v3
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v11, 0x300, v11
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v12, 0x300, v12
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v14, 0x300, v14
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v15, 0x300, v15
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v16, 0x300, v16
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v17, 16, v11
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v18, 0xffff, v12
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v15, 0xffff, v15
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v16, 16, v16
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v11, v0, v1
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v12, v2, v3
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v13, v13, v17
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v14, v18, v14
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v15, v15, v16
-; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX11-TRUE16-NEXT:  .LBB87_3: ; %end
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
-; GFX11-TRUE16-NEXT:  .LBB87_4:
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
-; GFX11-TRUE16-NEXT:    s_branch .LBB87_2
-;
-; GFX11-FAKE16-LABEL: bitcast_v64i8_to_v8f64_scalar:
-; GFX11-FAKE16:       ; %bb.0:
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v38, v14 :: v_dual_mov_b32 v37, v12
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v36, v10 :: v_dual_mov_b32 v35, v8
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v34, v6 :: v_dual_mov_b32 v33, v4
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v32, v2 :: v_dual_mov_b32 v31, v0
-; GFX11-FAKE16-NEXT:    s_clause 0xf
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v0, off, s32 offset:56
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v39, off, s32 offset:52
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v2, off, s32 offset:60
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v4, off, s32
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v6, off, s32 offset:8
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v8, off, s32 offset:16
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v10, off, s32 offset:24
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v12, off, s32 offset:32
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v14, off, s32 offset:40
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v86, off, s32 offset:48
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v48, off, s32 offset:44
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v49, off, s32 offset:36
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v50, off, s32 offset:28
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v51, off, s32 offset:20
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v52, off, s32 offset:12
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v53, off, s32 offset:4
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v83, 8, v1
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v84, 8, v3
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v85, 8, v5
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v70, 8, v7
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v71, 8, v9
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v80, 8, v11
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v81, 8, v13
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v82, 8, v15
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v65, 8, v17
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v66, 8, v19
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v67, 8, v21
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v68, 8, v23
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v69, 8, v25
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v27, 8, v27
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v29, 8, v29
-; GFX11-FAKE16-NEXT:    s_mov_b32 s4, 0
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(15)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 8, v0
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(13)
-; GFX11-FAKE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v2
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(12)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v54, 8, v4
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(11)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v55, 8, v6
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(10)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v64, 8, v8
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(9)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v17, 8, v10
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(8)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v19, 8, v12
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(7)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v21, 8, v14
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(6)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v23, 8, v86
-; GFX11-FAKE16-NEXT:    s_and_b32 s5, vcc_lo, exec_lo
-; GFX11-FAKE16-NEXT:    s_cbranch_scc0 .LBB87_4
-; GFX11-FAKE16-NEXT:  ; %bb.1: ; %cmp.false
-; GFX11-FAKE16-NEXT:    s_and_b32 s5, s0, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s6, s1, 8
-; GFX11-FAKE16-NEXT:    s_and_b32 s7, s2, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s8, s3, 8
-; GFX11-FAKE16-NEXT:    s_or_b32 s5, s5, s6
-; GFX11-FAKE16-NEXT:    s_or_b32 s6, s7, s8
-; GFX11-FAKE16-NEXT:    s_and_b32 s5, s5, 0xffff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s6, s6, 16
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s7, s17, 8
-; GFX11-FAKE16-NEXT:    s_or_b32 s5, s5, s6
-; GFX11-FAKE16-NEXT:    s_and_b32 s6, s16, 0xff
-; GFX11-FAKE16-NEXT:    s_and_b32 s8, s18, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s9, s19, 8
-; GFX11-FAKE16-NEXT:    s_or_b32 s6, s6, s7
-; GFX11-FAKE16-NEXT:    s_or_b32 s7, s8, s9
-; GFX11-FAKE16-NEXT:    s_and_b32 s6, s6, 0xffff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s7, s7, 16
-; GFX11-FAKE16-NEXT:    s_and_b32 s8, s20, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s9, s21, 8
-; GFX11-FAKE16-NEXT:    s_or_b32 s6, s6, s7
-; GFX11-FAKE16-NEXT:    s_or_b32 s7, s8, s9
-; GFX11-FAKE16-NEXT:    s_and_b32 s8, s22, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s9, s23, 8
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s10, s25, 8
-; GFX11-FAKE16-NEXT:    s_or_b32 s8, s8, s9
-; GFX11-FAKE16-NEXT:    s_and_b32 s9, s24, 0xff
-; GFX11-FAKE16-NEXT:    s_and_b32 s7, s7, 0xffff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s8, s8, 16
-; GFX11-FAKE16-NEXT:    s_or_b32 s9, s9, s10
-; GFX11-FAKE16-NEXT:    s_or_b32 s7, s7, s8
-; GFX11-FAKE16-NEXT:    s_and_b32 s8, s9, 0xffff
-; GFX11-FAKE16-NEXT:    s_and_b32 s9, s26, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s10, s27, 8
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v31
-; GFX11-FAKE16-NEXT:    s_or_b32 s9, s9, s10
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v32
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s9, s9, 16
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xff, v38
-; GFX11-FAKE16-NEXT:    s_or_b32 s8, s8, s9
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v33
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v83
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v84
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v10, 0xff, v22
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v11, 0xff, v24
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v2, v85
-; GFX11-FAKE16-NEXT:    s_and_b32 s11, s28, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s12, s29, 8
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v6, v6, v82
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v10, v10, v68
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v11, v11, v69
-; GFX11-FAKE16-NEXT:    s_or_b32 s10, s11, s12
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v1, v2
-; GFX11-FAKE16-NEXT:    s_and_b32 s10, s10, 0xffff
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v35
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, s10, v0
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v34
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v36
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v37
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 0xffff, v6
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xff, v16
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xff, v18
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v9, 0xff, v20
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v10, 0xffff, v10
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v70
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v71
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v2, v80
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v3, v81
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v6, v6, v65
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v7, v7, v66
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v9, v9, v67
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v10, v10, v11
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(1)
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v11, 0xff, v52
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v6
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v13, 0xffff, v7
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v11, v11, v64
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v6, v0, v1
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v7, v2, v3
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v8, v8, v12
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v9, v13, v9
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v26
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v28
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v30
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v53
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v13, 0xffff, v11
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v11, 0xff, v51
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v12, 0xff, v50
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v14, 0xff, v49
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v15, 0xff, v48
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v86, 0xff, v39
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v27
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v29
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v2, v54
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v3, v55
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v11, v11, v17
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v12, v12, v19
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v14, v14, v21
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v15, v15, v23
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v86, v86, v25
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v87, 16, v11
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v96, 0xffff, v12
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v15, 0xffff, v15
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v86, 16, v86
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v11, v0, v1
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v12, v2, v3
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v13, v13, v87
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v14, v96, v14
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v15, v15, v86
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, s6
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v2, s7 :: v_dual_mov_b32 v3, s8
-; GFX11-FAKE16-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s4
-; GFX11-FAKE16-NEXT:    s_cbranch_vccnz .LBB87_3
-; GFX11-FAKE16-NEXT:  .LBB87_2: ; %cmp.true
-; GFX11-FAKE16-NEXT:    s_add_i32 s0, s0, 3
-; GFX11-FAKE16-NEXT:    s_add_i32 s2, s2, 3
-; GFX11-FAKE16-NEXT:    s_and_b32 s0, s0, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s1, s1, 8
-; GFX11-FAKE16-NEXT:    s_and_b32 s2, s2, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s3, s3, 8
-; GFX11-FAKE16-NEXT:    s_or_b32 s0, s1, s0
-; GFX11-FAKE16-NEXT:    s_or_b32 s1, s3, s2
-; GFX11-FAKE16-NEXT:    s_addk_i32 s0, 0x300
-; GFX11-FAKE16-NEXT:    s_addk_i32 s1, 0x300
-; GFX11-FAKE16-NEXT:    s_and_b32 s0, s0, 0xffff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s1, s1, 16
-; GFX11-FAKE16-NEXT:    s_add_i32 s16, s16, 3
-; GFX11-FAKE16-NEXT:    s_or_b32 s0, s0, s1
-; GFX11-FAKE16-NEXT:    s_and_b32 s1, s16, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s2, s17, 8
-; GFX11-FAKE16-NEXT:    s_add_i32 s18, s18, 3
-; GFX11-FAKE16-NEXT:    s_or_b32 s1, s2, s1
-; GFX11-FAKE16-NEXT:    s_and_b32 s2, s18, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s3, s19, 8
-; GFX11-FAKE16-NEXT:    s_addk_i32 s1, 0x300
-; GFX11-FAKE16-NEXT:    s_or_b32 s2, s3, s2
-; GFX11-FAKE16-NEXT:    s_add_i32 s20, s20, 3
-; GFX11-FAKE16-NEXT:    s_addk_i32 s2, 0x300
-; GFX11-FAKE16-NEXT:    s_and_b32 s1, s1, 0xffff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s2, s2, 16
-; GFX11-FAKE16-NEXT:    s_and_b32 s3, s20, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s4, s21, 8
-; GFX11-FAKE16-NEXT:    s_add_i32 s22, s22, 3
-; GFX11-FAKE16-NEXT:    s_or_b32 s1, s1, s2
-; GFX11-FAKE16-NEXT:    s_or_b32 s2, s4, s3
-; GFX11-FAKE16-NEXT:    s_and_b32 s3, s22, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s4, s23, 8
-; GFX11-FAKE16-NEXT:    s_add_i32 s24, s24, 3
-; GFX11-FAKE16-NEXT:    s_or_b32 s3, s4, s3
-; GFX11-FAKE16-NEXT:    s_and_b32 s4, s24, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s5, s25, 8
-; GFX11-FAKE16-NEXT:    s_addk_i32 s2, 0x300
-; GFX11-FAKE16-NEXT:    s_addk_i32 s3, 0x300
-; GFX11-FAKE16-NEXT:    s_or_b32 s4, s5, s4
-; GFX11-FAKE16-NEXT:    s_and_b32 s2, s2, 0xffff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s3, s3, 16
-; GFX11-FAKE16-NEXT:    s_addk_i32 s4, 0x300
-; GFX11-FAKE16-NEXT:    s_add_i32 s26, s26, 3
-; GFX11-FAKE16-NEXT:    s_or_b32 s2, s2, s3
-; GFX11-FAKE16-NEXT:    s_and_b32 s3, s4, 0xffff
-; GFX11-FAKE16-NEXT:    s_and_b32 s4, s26, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s5, s27, 8
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v0, 3, v31
-; GFX11-FAKE16-NEXT:    s_or_b32 s4, s5, s4
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v1, 3, v32
-; GFX11-FAKE16-NEXT:    s_addk_i32 s4, 0x300
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v6, 3, v38
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s4, s4, 16
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-FAKE16-NEXT:    s_or_b32 s3, s3, s4
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v2, 3, v33
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v10, 3, v22
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v11, 3, v24
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v83, v0
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v84, v1
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xff, v6
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v10, 0xff, v10
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v11, 0xff, v11
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v85, v2
-; GFX11-FAKE16-NEXT:    s_add_i32 s28, s28, 3
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v0, 0x300, v0
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v1, 0x300, v1
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v6, v82, v6
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v2, 0x300, v2
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v10, v68, v10
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v11, v69, v11
-; GFX11-FAKE16-NEXT:    s_and_b32 s6, s28, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s7, s29, 8
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX11-FAKE16-NEXT:    s_or_b32 s5, s7, s6
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v6, 0x300, v6
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v10, 0x300, v10
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v11, 0x300, v11
-; GFX11-FAKE16-NEXT:    s_addk_i32 s5, 0x300
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v1, v2
-; GFX11-FAKE16-NEXT:    s_and_b32 s5, s5, 0xffff
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v1, 3, v35
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, s5, v0
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v0, 3, v34
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v2, 3, v36
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v3, 3, v37
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 0xffff, v6
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v6, 3, v16
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v7, 3, v18
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v9, 3, v20
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v10, 0xffff, v10
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xff, v6
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xff, v7
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v9, 0xff, v9
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v10, v10, v11
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(1)
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v11, 3, v52
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v70, v0
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v71, v1
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v80, v2
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v81, v3
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v6, v65, v6
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v7, v66, v7
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v9, v67, v9
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v11, 0xff, v11
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v0, 0x300, v0
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v1, 0x300, v1
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v2, 0x300, v2
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v3, 0x300, v3
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v6, 0x300, v6
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v7, 0x300, v7
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v9, 0x300, v9
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v11, v64, v11
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v6
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v13, 0xffff, v7
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v11, 0x300, v11
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v6, v0, v1
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v7, v2, v3
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v8, v8, v12
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v9, v13, v9
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v0, 3, v26
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v1, 3, v28
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v2, 3, v30
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v3, 3, v53
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v13, 0xffff, v11
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v11, 3, v51
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v12, 3, v50
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v14, 3, v49
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v15, 3, v48
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v16, 3, v39
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v11, 0xff, v11
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v12, 0xff, v12
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v14, 0xff, v14
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v15, 0xff, v15
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v16, 0xff, v16
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v27, v0
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v29, v1
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v54, v2
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v55, v3
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v11, v17, v11
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v12, v19, v12
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v14, v21, v14
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v15, v23, v15
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v16, v25, v16
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v0, 0x300, v0
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v1, 0x300, v1
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v2, 0x300, v2
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v3, 0x300, v3
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v11, 0x300, v11
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v12, 0x300, v12
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v14, 0x300, v14
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v15, 0x300, v15
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v16, 0x300, v16
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v17, 16, v11
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v18, 0xffff, v12
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v15, 0xffff, v15
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v16, 16, v16
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v11, v0, v1
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v12, v2, v3
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v13, v13, v17
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v14, v18, v14
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v15, v15, v16
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX11-FAKE16-NEXT:  .LBB87_3: ; %end
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
-; GFX11-FAKE16-NEXT:  .LBB87_4:
-; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
-; GFX11-FAKE16-NEXT:    s_branch .LBB87_2
+; GFX11-LABEL: bitcast_v64i8_to_v8f64_scalar:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_dual_mov_b32 v38, v14 :: v_dual_mov_b32 v37, v12
+; GFX11-NEXT:    v_dual_mov_b32 v36, v10 :: v_dual_mov_b32 v35, v8
+; GFX11-NEXT:    v_dual_mov_b32 v34, v6 :: v_dual_mov_b32 v33, v4
+; GFX11-NEXT:    v_dual_mov_b32 v32, v2 :: v_dual_mov_b32 v31, v0
+; GFX11-NEXT:    s_clause 0xf
+; GFX11-NEXT:    scratch_load_u16 v0, off, s32 offset:56
+; GFX11-NEXT:    scratch_load_u16 v39, off, s32 offset:52
+; GFX11-NEXT:    scratch_load_b32 v2, off, s32 offset:60
+; GFX11-NEXT:    scratch_load_u16 v4, off, s32
+; GFX11-NEXT:    scratch_load_u16 v6, off, s32 offset:8
+; GFX11-NEXT:    scratch_load_u16 v8, off, s32 offset:16
+; GFX11-NEXT:    scratch_load_u16 v10, off, s32 offset:24
+; GFX11-NEXT:    scratch_load_u16 v12, off, s32 offset:32
+; GFX11-NEXT:    scratch_load_u16 v14, off, s32 offset:40
+; GFX11-NEXT:    scratch_load_u16 v86, off, s32 offset:48
+; GFX11-NEXT:    scratch_load_u16 v48, off, s32 offset:44
+; GFX11-NEXT:    scratch_load_u16 v49, off, s32 offset:36
+; GFX11-NEXT:    scratch_load_u16 v50, off, s32 offset:28
+; GFX11-NEXT:    scratch_load_u16 v51, off, s32 offset:20
+; GFX11-NEXT:    scratch_load_u16 v52, off, s32 offset:12
+; GFX11-NEXT:    scratch_load_u16 v53, off, s32 offset:4
+; GFX11-NEXT:    v_lshlrev_b32_e32 v83, 8, v1
+; GFX11-NEXT:    v_lshlrev_b32_e32 v84, 8, v3
+; GFX11-NEXT:    v_lshlrev_b32_e32 v85, 8, v5
+; GFX11-NEXT:    v_lshlrev_b32_e32 v70, 8, v7
+; GFX11-NEXT:    v_lshlrev_b32_e32 v71, 8, v9
+; GFX11-NEXT:    v_lshlrev_b32_e32 v80, 8, v11
+; GFX11-NEXT:    v_lshlrev_b32_e32 v81, 8, v13
+; GFX11-NEXT:    v_lshlrev_b32_e32 v82, 8, v15
+; GFX11-NEXT:    v_lshlrev_b32_e32 v65, 8, v17
+; GFX11-NEXT:    v_lshlrev_b32_e32 v66, 8, v19
+; GFX11-NEXT:    v_lshlrev_b32_e32 v67, 8, v21
+; GFX11-NEXT:    v_lshlrev_b32_e32 v68, 8, v23
+; GFX11-NEXT:    v_lshlrev_b32_e32 v69, 8, v25
+; GFX11-NEXT:    v_lshlrev_b32_e32 v27, 8, v27
+; GFX11-NEXT:    v_lshlrev_b32_e32 v29, 8, v29
+; GFX11-NEXT:    s_mov_b32 s4, 0
+; GFX11-NEXT:    s_waitcnt vmcnt(15)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v25, 8, v0
+; GFX11-NEXT:    s_waitcnt vmcnt(13)
+; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v2
+; GFX11-NEXT:    s_waitcnt vmcnt(12)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v54, 8, v4
+; GFX11-NEXT:    s_waitcnt vmcnt(11)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v55, 8, v6
+; GFX11-NEXT:    s_waitcnt vmcnt(10)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v64, 8, v8
+; GFX11-NEXT:    s_waitcnt vmcnt(9)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v17, 8, v10
+; GFX11-NEXT:    s_waitcnt vmcnt(8)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v19, 8, v12
+; GFX11-NEXT:    s_waitcnt vmcnt(7)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v21, 8, v14
+; GFX11-NEXT:    s_waitcnt vmcnt(6)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v23, 8, v86
+; GFX11-NEXT:    s_and_b32 s5, vcc_lo, exec_lo
+; GFX11-NEXT:    s_cbranch_scc0 .LBB87_4
+; GFX11-NEXT:  ; %bb.1: ; %cmp.false
+; GFX11-NEXT:    s_and_b32 s5, s0, 0xff
+; GFX11-NEXT:    s_lshl_b32 s6, s1, 8
+; GFX11-NEXT:    s_and_b32 s7, s2, 0xff
+; GFX11-NEXT:    s_lshl_b32 s8, s3, 8
+; GFX11-NEXT:    s_or_b32 s5, s5, s6
+; GFX11-NEXT:    s_or_b32 s6, s7, s8
+; GFX11-NEXT:    s_and_b32 s5, s5, 0xffff
+; GFX11-NEXT:    s_lshl_b32 s6, s6, 16
+; GFX11-NEXT:    s_lshl_b32 s7, s17, 8
+; GFX11-NEXT:    s_or_b32 s5, s5, s6
+; GFX11-NEXT:    s_and_b32 s6, s16, 0xff
+; GFX11-NEXT:    s_and_b32 s8, s18, 0xff
+; GFX11-NEXT:    s_lshl_b32 s9, s19, 8
+; GFX11-NEXT:    s_or_b32 s6, s6, s7
+; GFX11-NEXT:    s_or_b32 s7, s8, s9
+; GFX11-NEXT:    s_and_b32 s6, s6, 0xffff
+; GFX11-NEXT:    s_lshl_b32 s7, s7, 16
+; GFX11-NEXT:    s_and_b32 s8, s20, 0xff
+; GFX11-NEXT:    s_lshl_b32 s9, s21, 8
+; GFX11-NEXT:    s_or_b32 s6, s6, s7
+; GFX11-NEXT:    s_or_b32 s7, s8, s9
+; GFX11-NEXT:    s_and_b32 s8, s22, 0xff
+; GFX11-NEXT:    s_lshl_b32 s9, s23, 8
+; GFX11-NEXT:    s_lshl_b32 s10, s25, 8
+; GFX11-NEXT:    s_or_b32 s8, s8, s9
+; GFX11-NEXT:    s_and_b32 s9, s24, 0xff
+; GFX11-NEXT:    s_and_b32 s7, s7, 0xffff
+; GFX11-NEXT:    s_lshl_b32 s8, s8, 16
+; GFX11-NEXT:    s_or_b32 s9, s9, s10
+; GFX11-NEXT:    s_or_b32 s7, s7, s8
+; GFX11-NEXT:    s_and_b32 s8, s9, 0xffff
+; GFX11-NEXT:    s_and_b32 s9, s26, 0xff
+; GFX11-NEXT:    s_lshl_b32 s10, s27, 8
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v31
+; GFX11-NEXT:    s_or_b32 s9, s9, s10
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v32
+; GFX11-NEXT:    s_lshl_b32 s9, s9, 16
+; GFX11-NEXT:    v_and_b32_e32 v6, 0xff, v38
+; GFX11-NEXT:    s_or_b32 s8, s8, s9
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v33
+; GFX11-NEXT:    v_or_b32_e32 v0, v0, v83
+; GFX11-NEXT:    v_or_b32_e32 v1, v1, v84
+; GFX11-NEXT:    v_and_b32_e32 v10, 0xff, v22
+; GFX11-NEXT:    v_and_b32_e32 v11, 0xff, v24
+; GFX11-NEXT:    v_or_b32_e32 v2, v2, v85
+; GFX11-NEXT:    s_and_b32 s11, s28, 0xff
+; GFX11-NEXT:    s_lshl_b32 s12, s29, 8
+; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-NEXT:    v_or_b32_e32 v6, v6, v82
+; GFX11-NEXT:    v_or_b32_e32 v10, v10, v68
+; GFX11-NEXT:    v_or_b32_e32 v11, v11, v69
+; GFX11-NEXT:    s_or_b32 s10, s11, s12
+; GFX11-NEXT:    v_or_b32_e32 v5, v1, v2
+; GFX11-NEXT:    s_and_b32 s10, s10, 0xffff
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v35
+; GFX11-NEXT:    v_or_b32_e32 v4, s10, v0
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v34
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v36
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v37
+; GFX11-NEXT:    v_and_b32_e32 v8, 0xffff, v6
+; GFX11-NEXT:    v_and_b32_e32 v6, 0xff, v16
+; GFX11-NEXT:    v_and_b32_e32 v7, 0xff, v18
+; GFX11-NEXT:    v_and_b32_e32 v9, 0xff, v20
+; GFX11-NEXT:    v_and_b32_e32 v10, 0xffff, v10
+; GFX11-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
+; GFX11-NEXT:    v_or_b32_e32 v0, v0, v70
+; GFX11-NEXT:    v_or_b32_e32 v1, v1, v71
+; GFX11-NEXT:    v_or_b32_e32 v2, v2, v80
+; GFX11-NEXT:    v_or_b32_e32 v3, v3, v81
+; GFX11-NEXT:    v_or_b32_e32 v6, v6, v65
+; GFX11-NEXT:    v_or_b32_e32 v7, v7, v66
+; GFX11-NEXT:    v_or_b32_e32 v9, v9, v67
+; GFX11-NEXT:    v_or_b32_e32 v10, v10, v11
+; GFX11-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-NEXT:    v_and_b32_e32 v11, 0xff, v52
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-NEXT:    v_lshlrev_b32_e32 v12, 16, v6
+; GFX11-NEXT:    v_and_b32_e32 v13, 0xffff, v7
+; GFX11-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
+; GFX11-NEXT:    v_or_b32_e32 v11, v11, v64
+; GFX11-NEXT:    v_or_b32_e32 v6, v0, v1
+; GFX11-NEXT:    v_or_b32_e32 v7, v2, v3
+; GFX11-NEXT:    v_or_b32_e32 v8, v8, v12
+; GFX11-NEXT:    v_or_b32_e32 v9, v13, v9
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v26
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v28
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v30
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v53
+; GFX11-NEXT:    v_and_b32_e32 v13, 0xffff, v11
+; GFX11-NEXT:    v_and_b32_e32 v11, 0xff, v51
+; GFX11-NEXT:    v_and_b32_e32 v12, 0xff, v50
+; GFX11-NEXT:    v_and_b32_e32 v14, 0xff, v49
+; GFX11-NEXT:    v_and_b32_e32 v15, 0xff, v48
+; GFX11-NEXT:    v_and_b32_e32 v86, 0xff, v39
+; GFX11-NEXT:    v_or_b32_e32 v0, v0, v27
+; GFX11-NEXT:    v_or_b32_e32 v1, v1, v29
+; GFX11-NEXT:    v_or_b32_e32 v2, v2, v54
+; GFX11-NEXT:    v_or_b32_e32 v3, v3, v55
+; GFX11-NEXT:    v_or_b32_e32 v11, v11, v17
+; GFX11-NEXT:    v_or_b32_e32 v12, v12, v19
+; GFX11-NEXT:    v_or_b32_e32 v14, v14, v21
+; GFX11-NEXT:    v_or_b32_e32 v15, v15, v23
+; GFX11-NEXT:    v_or_b32_e32 v86, v86, v25
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-NEXT:    v_lshlrev_b32_e32 v87, 16, v11
+; GFX11-NEXT:    v_and_b32_e32 v96, 0xffff, v12
+; GFX11-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
+; GFX11-NEXT:    v_and_b32_e32 v15, 0xffff, v15
+; GFX11-NEXT:    v_lshlrev_b32_e32 v86, 16, v86
+; GFX11-NEXT:    v_or_b32_e32 v11, v0, v1
+; GFX11-NEXT:    v_or_b32_e32 v12, v2, v3
+; GFX11-NEXT:    v_or_b32_e32 v13, v13, v87
+; GFX11-NEXT:    v_or_b32_e32 v14, v96, v14
+; GFX11-NEXT:    v_or_b32_e32 v15, v15, v86
+; GFX11-NEXT:    v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, s6
+; GFX11-NEXT:    v_dual_mov_b32 v2, s7 :: v_dual_mov_b32 v3, s8
+; GFX11-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s4
+; GFX11-NEXT:    s_cbranch_vccnz .LBB87_3
+; GFX11-NEXT:  .LBB87_2: ; %cmp.true
+; GFX11-NEXT:    s_add_i32 s0, s0, 3
+; GFX11-NEXT:    s_add_i32 s2, s2, 3
+; GFX11-NEXT:    s_and_b32 s0, s0, 0xff
+; GFX11-NEXT:    s_lshl_b32 s1, s1, 8
+; GFX11-NEXT:    s_and_b32 s2, s2, 0xff
+; GFX11-NEXT:    s_lshl_b32 s3, s3, 8
+; GFX11-NEXT:    s_or_b32 s0, s1, s0
+; GFX11-NEXT:    s_or_b32 s1, s3, s2
+; GFX11-NEXT:    s_addk_i32 s0, 0x300
+; GFX11-NEXT:    s_addk_i32 s1, 0x300
+; GFX11-NEXT:    s_and_b32 s0, s0, 0xffff
+; GFX11-NEXT:    s_lshl_b32 s1, s1, 16
+; GFX11-NEXT:    s_add_i32 s16, s16, 3
+; GFX11-NEXT:    s_or_b32 s0, s0, s1
+; GFX11-NEXT:    s_and_b32 s1, s16, 0xff
+; GFX11-NEXT:    s_lshl_b32 s2, s17, 8
+; GFX11-NEXT:    s_add_i32 s18, s18, 3
+; GFX11-NEXT:    s_or_b32 s1, s2, s1
+; GFX11-NEXT:    s_and_b32 s2, s18, 0xff
+; GFX11-NEXT:    s_lshl_b32 s3, s19, 8
+; GFX11-NEXT:    s_addk_i32 s1, 0x300
+; GFX11-NEXT:    s_or_b32 s2, s3, s2
+; GFX11-NEXT:    s_add_i32 s20, s20, 3
+; GFX11-NEXT:    s_addk_i32 s2, 0x300
+; GFX11-NEXT:    s_and_b32 s1, s1, 0xffff
+; GFX11-NEXT:    s_lshl_b32 s2, s2, 16
+; GFX11-NEXT:    s_and_b32 s3, s20, 0xff
+; GFX11-NEXT:    s_lshl_b32 s4, s21, 8
+; GFX11-NEXT:    s_add_i32 s22, s22, 3
+; GFX11-NEXT:    s_or_b32 s1, s1, s2
+; GFX11-NEXT:    s_or_b32 s2, s4, s3
+; GFX11-NEXT:    s_and_b32 s3, s22, 0xff
+; GFX11-NEXT:    s_lshl_b32 s4, s23, 8
+; GFX11-NEXT:    s_add_i32 s24, s24, 3
+; GFX11-NEXT:    s_or_b32 s3, s4, s3
+; GFX11-NEXT:    s_and_b32 s4, s24, 0xff
+; GFX11-NEXT:    s_lshl_b32 s5, s25, 8
+; GFX11-NEXT:    s_addk_i32 s2, 0x300
+; GFX11-NEXT:    s_addk_i32 s3, 0x300
+; GFX11-NEXT:    s_or_b32 s4, s5, s4
+; GFX11-NEXT:    s_and_b32 s2, s2, 0xffff
+; GFX11-NEXT:    s_lshl_b32 s3, s3, 16
+; GFX11-NEXT:    s_addk_i32 s4, 0x300
+; GFX11-NEXT:    s_add_i32 s26, s26, 3
+; GFX11-NEXT:    s_or_b32 s2, s2, s3
+; GFX11-NEXT:    s_and_b32 s3, s4, 0xffff
+; GFX11-NEXT:    s_and_b32 s4, s26, 0xff
+; GFX11-NEXT:    s_lshl_b32 s5, s27, 8
+; GFX11-NEXT:    v_add_nc_u32_e32 v0, 3, v31
+; GFX11-NEXT:    s_or_b32 s4, s5, s4
+; GFX11-NEXT:    v_add_nc_u32_e32 v1, 3, v32
+; GFX11-NEXT:    s_addk_i32 s4, 0x300
+; GFX11-NEXT:    v_add_nc_u32_e32 v6, 3, v38
+; GFX11-NEXT:    s_lshl_b32 s4, s4, 16
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX11-NEXT:    s_or_b32 s3, s3, s4
+; GFX11-NEXT:    v_add_nc_u32_e32 v2, 3, v33
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX11-NEXT:    v_add_nc_u32_e32 v10, 3, v22
+; GFX11-NEXT:    v_add_nc_u32_e32 v11, 3, v24
+; GFX11-NEXT:    v_or_b32_e32 v0, v83, v0
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GFX11-NEXT:    v_or_b32_e32 v1, v84, v1
+; GFX11-NEXT:    v_and_b32_e32 v6, 0xff, v6
+; GFX11-NEXT:    v_and_b32_e32 v10, 0xff, v10
+; GFX11-NEXT:    v_and_b32_e32 v11, 0xff, v11
+; GFX11-NEXT:    v_or_b32_e32 v2, v85, v2
+; GFX11-NEXT:    s_add_i32 s28, s28, 3
+; GFX11-NEXT:    v_add_nc_u32_e32 v0, 0x300, v0
+; GFX11-NEXT:    v_add_nc_u32_e32 v1, 0x300, v1
+; GFX11-NEXT:    v_or_b32_e32 v6, v82, v6
+; GFX11-NEXT:    v_add_nc_u32_e32 v2, 0x300, v2
+; GFX11-NEXT:    v_or_b32_e32 v10, v68, v10
+; GFX11-NEXT:    v_or_b32_e32 v11, v69, v11
+; GFX11-NEXT:    s_and_b32 s6, s28, 0xff
+; GFX11-NEXT:    s_lshl_b32 s7, s29, 8
+; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-NEXT:    s_or_b32 s5, s7, s6
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-NEXT:    v_add_nc_u32_e32 v6, 0x300, v6
+; GFX11-NEXT:    v_add_nc_u32_e32 v10, 0x300, v10
+; GFX11-NEXT:    v_add_nc_u32_e32 v11, 0x300, v11
+; GFX11-NEXT:    s_addk_i32 s5, 0x300
+; GFX11-NEXT:    v_or_b32_e32 v5, v1, v2
+; GFX11-NEXT:    s_and_b32 s5, s5, 0xffff
+; GFX11-NEXT:    v_add_nc_u32_e32 v1, 3, v35
+; GFX11-NEXT:    v_or_b32_e32 v4, s5, v0
+; GFX11-NEXT:    v_add_nc_u32_e32 v0, 3, v34
+; GFX11-NEXT:    v_add_nc_u32_e32 v2, 3, v36
+; GFX11-NEXT:    v_add_nc_u32_e32 v3, 3, v37
+; GFX11-NEXT:    v_and_b32_e32 v8, 0xffff, v6
+; GFX11-NEXT:    v_add_nc_u32_e32 v6, 3, v16
+; GFX11-NEXT:    v_add_nc_u32_e32 v7, 3, v18
+; GFX11-NEXT:    v_add_nc_u32_e32 v9, 3, v20
+; GFX11-NEXT:    v_and_b32_e32 v10, 0xffff, v10
+; GFX11-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v3
+; GFX11-NEXT:    v_and_b32_e32 v6, 0xff, v6
+; GFX11-NEXT:    v_and_b32_e32 v7, 0xff, v7
+; GFX11-NEXT:    v_and_b32_e32 v9, 0xff, v9
+; GFX11-NEXT:    v_or_b32_e32 v10, v10, v11
+; GFX11-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-NEXT:    v_add_nc_u32_e32 v11, 3, v52
+; GFX11-NEXT:    v_or_b32_e32 v0, v70, v0
+; GFX11-NEXT:    v_or_b32_e32 v1, v71, v1
+; GFX11-NEXT:    v_or_b32_e32 v2, v80, v2
+; GFX11-NEXT:    v_or_b32_e32 v3, v81, v3
+; GFX11-NEXT:    v_or_b32_e32 v6, v65, v6
+; GFX11-NEXT:    v_or_b32_e32 v7, v66, v7
+; GFX11-NEXT:    v_or_b32_e32 v9, v67, v9
+; GFX11-NEXT:    v_and_b32_e32 v11, 0xff, v11
+; GFX11-NEXT:    v_add_nc_u32_e32 v0, 0x300, v0
+; GFX11-NEXT:    v_add_nc_u32_e32 v1, 0x300, v1
+; GFX11-NEXT:    v_add_nc_u32_e32 v2, 0x300, v2
+; GFX11-NEXT:    v_add_nc_u32_e32 v3, 0x300, v3
+; GFX11-NEXT:    v_add_nc_u32_e32 v6, 0x300, v6
+; GFX11-NEXT:    v_add_nc_u32_e32 v7, 0x300, v7
+; GFX11-NEXT:    v_add_nc_u32_e32 v9, 0x300, v9
+; GFX11-NEXT:    v_or_b32_e32 v11, v64, v11
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-NEXT:    v_lshlrev_b32_e32 v12, 16, v6
+; GFX11-NEXT:    v_and_b32_e32 v13, 0xffff, v7
+; GFX11-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
+; GFX11-NEXT:    v_add_nc_u32_e32 v11, 0x300, v11
+; GFX11-NEXT:    v_or_b32_e32 v6, v0, v1
+; GFX11-NEXT:    v_or_b32_e32 v7, v2, v3
+; GFX11-NEXT:    v_or_b32_e32 v8, v8, v12
+; GFX11-NEXT:    v_or_b32_e32 v9, v13, v9
+; GFX11-NEXT:    v_add_nc_u32_e32 v0, 3, v26
+; GFX11-NEXT:    v_add_nc_u32_e32 v1, 3, v28
+; GFX11-NEXT:    v_add_nc_u32_e32 v2, 3, v30
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_add_nc_u32_e32 v3, 3, v53
+; GFX11-NEXT:    v_and_b32_e32 v13, 0xffff, v11
+; GFX11-NEXT:    v_add_nc_u32_e32 v11, 3, v51
+; GFX11-NEXT:    v_add_nc_u32_e32 v12, 3, v50
+; GFX11-NEXT:    v_add_nc_u32_e32 v14, 3, v49
+; GFX11-NEXT:    v_add_nc_u32_e32 v15, 3, v48
+; GFX11-NEXT:    v_add_nc_u32_e32 v16, 3, v39
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v3
+; GFX11-NEXT:    v_and_b32_e32 v11, 0xff, v11
+; GFX11-NEXT:    v_and_b32_e32 v12, 0xff, v12
+; GFX11-NEXT:    v_and_b32_e32 v14, 0xff, v14
+; GFX11-NEXT:    v_and_b32_e32 v15, 0xff, v15
+; GFX11-NEXT:    v_and_b32_e32 v16, 0xff, v16
+; GFX11-NEXT:    v_or_b32_e32 v0, v27, v0
+; GFX11-NEXT:    v_or_b32_e32 v1, v29, v1
+; GFX11-NEXT:    v_or_b32_e32 v2, v54, v2
+; GFX11-NEXT:    v_or_b32_e32 v3, v55, v3
+; GFX11-NEXT:    v_or_b32_e32 v11, v17, v11
+; GFX11-NEXT:    v_or_b32_e32 v12, v19, v12
+; GFX11-NEXT:    v_or_b32_e32 v14, v21, v14
+; GFX11-NEXT:    v_or_b32_e32 v15, v23, v15
+; GFX11-NEXT:    v_or_b32_e32 v16, v25, v16
+; GFX11-NEXT:    v_add_nc_u32_e32 v0, 0x300, v0
+; GFX11-NEXT:    v_add_nc_u32_e32 v1, 0x300, v1
+; GFX11-NEXT:    v_add_nc_u32_e32 v2, 0x300, v2
+; GFX11-NEXT:    v_add_nc_u32_e32 v3, 0x300, v3
+; GFX11-NEXT:    v_add_nc_u32_e32 v11, 0x300, v11
+; GFX11-NEXT:    v_add_nc_u32_e32 v12, 0x300, v12
+; GFX11-NEXT:    v_add_nc_u32_e32 v14, 0x300, v14
+; GFX11-NEXT:    v_add_nc_u32_e32 v15, 0x300, v15
+; GFX11-NEXT:    v_add_nc_u32_e32 v16, 0x300, v16
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-NEXT:    v_lshlrev_b32_e32 v17, 16, v11
+; GFX11-NEXT:    v_and_b32_e32 v18, 0xffff, v12
+; GFX11-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
+; GFX11-NEXT:    v_and_b32_e32 v15, 0xffff, v15
+; GFX11-NEXT:    v_lshlrev_b32_e32 v16, 16, v16
+; GFX11-NEXT:    v_or_b32_e32 v11, v0, v1
+; GFX11-NEXT:    v_or_b32_e32 v12, v2, v3
+; GFX11-NEXT:    v_or_b32_e32 v13, v13, v17
+; GFX11-NEXT:    v_or_b32_e32 v14, v18, v14
+; GFX11-NEXT:    v_or_b32_e32 v15, v15, v16
+; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-NEXT:  .LBB87_3: ; %end
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-NEXT:  .LBB87_4:
+; GFX11-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+; GFX11-NEXT:    s_branch .LBB87_2
   %cmp = icmp eq i32 %b, 0
   br i1 %cmp, label %cmp.true, label %cmp.false
 
@@ -68419,45 +66879,45 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) {
 ; GFX11-TRUE16:       ; %bb.0:
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-TRUE16-NEXT:    s_clause 0x1f
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v55, off, s32 offset:128
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v32, off, s32 offset:124
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v55, off, s32 offset:120
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v33, off, s32 offset:116
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v53, off, s32 offset:112
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v31, off, s32 offset:108
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v53, off, s32 offset:104
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v32, off, s32 offset:100
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v52, off, s32 offset:96
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v31, off, s32 offset:92
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v54, off, s32 offset:88
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v66, off, s32 offset:132
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v39, off, s32
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v48, off, s32 offset:8
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v48, off, s32 offset:16
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v50, off, s32 offset:24
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v50, off, s32 offset:32
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v51, off, s32 offset:40
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v52, off, s32 offset:48
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v54, off, s32 offset:56
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v64, off, s32 offset:64
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v64, off, s32 offset:72
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v65, off, s32 offset:80
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v38, off, s32 offset:84
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v36, off, s32 offset:76
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v38, off, s32 offset:68
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v35, off, s32 offset:60
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v37, off, s32 offset:52
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v36, off, s32 offset:44
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v37, off, s32 offset:36
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v34, off, s32 offset:28
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v34, off, s32 offset:20
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v80, off, s32 offset:128
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v48, off, s32 offset:124
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v39, off, s32 offset:120
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v51, off, s32 offset:116
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v81, off, s32 offset:112
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v49, off, s32 offset:108
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v37, off, s32 offset:104
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v52, off, s32 offset:100
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v82, off, s32 offset:96
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v50, off, s32 offset:92
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v83, off, s32 offset:88
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v32, off, s32 offset:132
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v34, off, s32
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v35, off, s32 offset:8
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v36, off, s32 offset:16
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v38, off, s32 offset:24
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v84, off, s32 offset:32
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v85, off, s32 offset:40
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v86, off, s32 offset:48
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v87, off, s32 offset:56
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v96, off, s32 offset:64
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v97, off, s32 offset:72
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v98, off, s32 offset:80
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v70, off, s32 offset:84
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v54, off, s32 offset:76
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v71, off, s32 offset:68
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v64, off, s32 offset:60
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v67, off, s32 offset:52
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v66, off, s32 offset:44
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v69, off, s32 offset:36
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v53, off, s32 offset:28
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v65, off, s32 offset:20
 ; GFX11-TRUE16-NEXT:    s_clause 0x1
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v33, off, s32 offset:12
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v35, off, s32 offset:4
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v55, off, s32 offset:12
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v68, off, s32 offset:4
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v29.h, v29.l
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v29.l, v27.l
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v28.h, v25.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v39.l, v23.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v31.l, v23.l
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v30.h, v21.l
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v26.h, v19.l
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v25.h, v17.l
@@ -68483,47 +66943,42 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) {
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v25.h, 8, v25.h
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v26.h, 8, v26.h
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v30.h, 8, v30.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v39.l, 8, v39.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v31.l, 8, v31.l
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v28.h, 8, v28.h
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v29.l, 8, v29.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v49.l, 8, v29.h
-; GFX11-TRUE16-NEXT:    s_mov_b32 s0, exec_lo
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v33.l, 8, v29.h
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(33)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v55.h, 8, v55.h
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(31)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v55.l, 8, v55.l
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(29)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v53.h, 8, v53.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v39.h, 8, v80.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v39.l, 8, v39.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(27)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v53.l, 8, v53.l
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(25)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v52.h, 8, v52.h
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(23)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v54.h, 8, v54.h
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(21)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v29.h, 8, v39.h
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(20)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v49.h, 8, v48.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v37.h, 8, v81.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v37.l, 8, v37.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(19)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v39.h, 8, v48.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v36.h, 8, v82.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(18)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v48.l, 8, v50.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v38.h, 8, v83.l
+; GFX11-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v32
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v29.h, 8, v34.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v33.h, 8, v35.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v31.h, 8, v36.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v32.l, 8, v38.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(17)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v48.h, 8, v50.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v32.h, 8, v84.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(16)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v51.h, 8, v51.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v35.h, 8, v85.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(15)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v52.l, 8, v52.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v36.l, 8, v86.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(14)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v50.l, 8, v54.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v34.l, 8, v87.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(13)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v50.h, 8, v64.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v34.h, 8, v96.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(12)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v54.l, 8, v64.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v38.l, 8, v97.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(11)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v51.l, 8, v65.l
-; GFX11-TRUE16-NEXT:    v_cmpx_ne_u32_e32 0, v66
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v35.l, 8, v98.l
+; GFX11-TRUE16-NEXT:    s_and_saveexec_b32 s0, vcc_lo
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-TRUE16-NEXT:    s_xor_b32 s0, exec_lo, s0
 ; GFX11-TRUE16-NEXT:    s_cbranch_execnz .LBB98_3
 ; GFX11-TRUE16-NEXT:  ; %bb.1: ; %Flow
@@ -68551,22 +67006,22 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) {
 ; GFX11-TRUE16-NEXT:    v_and_b16 v7.l, 0xff, v28.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v7.h, 0xff, v30.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    v_and_b16 v8.l, 0xff, v35.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v8.h, 0xff, v33.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v9.l, 0xff, v34.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v9.h, 0xff, v34.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v10.l, 0xff, v37.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v10.h, 0xff, v36.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v11.l, 0xff, v37.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v11.h, 0xff, v35.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v12.l, 0xff, v38.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v12.h, 0xff, v36.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v13.l, 0xff, v38.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v13.h, 0xff, v31.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v14.l, 0xff, v32.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v14.h, 0xff, v31.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v15.l, 0xff, v33.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v15.h, 0xff, v32.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v8.l, 0xff, v68.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v8.h, 0xff, v55.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v9.l, 0xff, v65.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v9.h, 0xff, v53.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v10.l, 0xff, v69.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v10.h, 0xff, v66.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v11.l, 0xff, v67.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v11.h, 0xff, v64.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v12.l, 0xff, v71.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v12.h, 0xff, v54.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v13.l, 0xff, v70.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v13.h, 0xff, v50.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v14.l, 0xff, v52.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v14.h, 0xff, v49.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v15.l, 0xff, v51.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v15.h, 0xff, v48.l
 ; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v0.l, v23.l
 ; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v0.h, v23.h
 ; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v1.l, v20.l
@@ -68578,27 +67033,27 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) {
 ; GFX11-TRUE16-NEXT:    v_or_b16 v4.l, v4.l, v25.h
 ; GFX11-TRUE16-NEXT:    v_or_b16 v4.h, v4.h, v26.h
 ; GFX11-TRUE16-NEXT:    v_or_b16 v5.l, v5.l, v30.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v5.h, v5.h, v39.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v5.h, v5.h, v31.l
 ; GFX11-TRUE16-NEXT:    v_or_b16 v6.l, v6.l, v28.h
 ; GFX11-TRUE16-NEXT:    v_or_b16 v6.h, v6.h, v29.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v7.l, v7.l, v49.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v7.l, v7.l, v33.l
 ; GFX11-TRUE16-NEXT:    v_or_b16 v7.h, v7.h, v29.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v8.l, v8.l, v49.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v8.h, v8.h, v39.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v9.l, v9.l, v48.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v9.h, v9.h, v48.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v10.l, v10.l, v51.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v10.h, v10.h, v52.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v11.l, v11.l, v50.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v11.h, v11.h, v50.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v12.l, v12.l, v54.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v12.h, v12.h, v51.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v13.l, v13.l, v54.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v13.h, v13.h, v52.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v14.l, v14.l, v53.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v14.h, v14.h, v53.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v15.l, v15.l, v55.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v15.h, v15.h, v55.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v8.l, v8.l, v33.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v8.h, v8.h, v31.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v9.l, v9.l, v32.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v9.h, v9.h, v32.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v10.l, v10.l, v35.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v10.h, v10.h, v36.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v11.l, v11.l, v34.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v11.h, v11.h, v34.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v12.l, v12.l, v38.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v12.h, v12.h, v35.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v13.l, v13.l, v38.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v13.h, v13.h, v36.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v14.l, v14.l, v37.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v14.h, v14.h, v37.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v15.l, v15.l, v39.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v15.h, v15.h, v39.h
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr18_hi16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr17_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr19_lo16
@@ -68615,22 +67070,22 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) {
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr26_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr28_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr30_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr35_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr33_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr34_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr34_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr37_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr36_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr37_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr35_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr38_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr36_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr38_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr31_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr32_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr31_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr33_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr32_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr48
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr51
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr49
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr52
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr50
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr70
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr54
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr71
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr64
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr67
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr66
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr69
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr53
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr65
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr55
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr68
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr23_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr23_hi16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr20_lo16
@@ -68642,46 +67097,46 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) {
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr25_hi16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr26_hi16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr30_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr39_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr31_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr28_hi16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr29_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr49_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr33_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr29_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr49_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr33_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr31_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr32_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr32_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr35_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr36_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr34_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr34_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr38_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr35_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr38_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr36_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr37_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr37_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr39_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr39_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr48_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr48_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr51_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr52_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr50_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr50_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr54_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr51_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr54_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr52_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr53_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr53_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr55_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr55_hi16
 ; GFX11-TRUE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
 ; GFX11-TRUE16-NEXT:    s_cbranch_execz .LBB98_2
 ; GFX11-TRUE16-NEXT:  .LBB98_4: ; %cmp.true
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v33.l, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, v32.l, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, v32.h, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.h, v31.h, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.h, v31.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v51.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, v48.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, v52.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.h, v49.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.h, v50.l, 3
 ; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v0.h
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(8)
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v3.l, v38.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v3.l, v71.l, 3
 ; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v1.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v1.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v55.l, v0.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v55.h, v0.h
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, v38.h, 3
-; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v53.l, v1.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, v53.h, v1.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v39.l, v0.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v39.h, v0.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, v70.l, 3
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v37.l, v1.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, v37.h, v1.h
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v15.l, 0x300, v0.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v15.h, 0x300, v0.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v2.h
@@ -68689,46 +67144,46 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) {
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v14.l, 0x300, v1.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v14.h, 0x300, v1.h
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(7)
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, v35.h, 3
-; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v52.h, v0.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v54.l, v0.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, v64.l, 3
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v36.h, v0.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v38.l, v0.h
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(4)
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.h, v37.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.h, v69.l, 3
 ; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v2.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v1.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v13.h, 0x300, v0.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v12.l, 0x300, v0.h
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v36.h, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, v37.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v54.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, v67.l, 3
 ; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v1.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v54.h, v2.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v50.h, v1.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v38.h, v2.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v34.h, v1.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v0.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, v51.h, v1.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, v35.h, v1.h
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v13.l, 0x300, v2.l
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, v36.l, 3
-; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v51.l, v0.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v50.l, v0.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, v66.l, 3
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v35.l, v0.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v34.l, v0.h
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v11.h, 0x300, v1.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v10.l, 0x300, v1.h
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, v35.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, v68.l, 3
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v12.h, 0x300, v0.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v11.l, 0x300, v0.h
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v34.h, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, v34.l, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.h, v33.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v65.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, v53.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.h, v55.l, 3
 ; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v2.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v1.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v0.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v1.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v52.l, v2.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v49.h, v1.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v48.l, v0.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v48.h, v0.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, v39.h, v1.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v36.l, v2.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v33.h, v1.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v32.l, v0.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v32.h, v0.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, v31.h, v1.h
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v10.h, 0x300, v2.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, v28.l, 3
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v9.l, 0x300, v0.l
@@ -68744,7 +67199,7 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) {
 ; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v0.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v1.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v1.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v49.l, v2.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v33.l, v2.l
 ; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v29.h, v0.l
 ; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v28.h, v0.h
 ; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v29.l, v1.l
@@ -68764,7 +67219,7 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) {
 ; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v0.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v1.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v1.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v39.l, v2.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v31.l, v2.l
 ; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v25.h, v0.l
 ; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v26.h, v0.h
 ; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v27.h, v1.l
@@ -70501,695 +68956,350 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32
 ; GFX9-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
 ; GFX9-NEXT:    s_branch .LBB99_2
 ;
-; GFX11-TRUE16-LABEL: bitcast_v64i8_to_v32i16_scalar:
-; GFX11-TRUE16:       ; %bb.0:
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v34, v14 :: v_dual_mov_b32 v31, v8
-; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v36, v12 :: v_dual_mov_b32 v33, v6
-; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v32, v10 :: v_dual_mov_b32 v35, v0
-; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v38, v4 :: v_dual_mov_b32 v37, v2
-; GFX11-TRUE16-NEXT:    s_clause 0xf
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v0, off, s32 offset:56
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v64, off, s32 offset:52
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v2, off, s32 offset:60
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v4, off, s32
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v6, off, s32 offset:8
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v8, off, s32 offset:16
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v10, off, s32 offset:24
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v12, off, s32 offset:32
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v14, off, s32 offset:40
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v84, off, s32 offset:48
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v82, off, s32 offset:44
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v69, off, s32 offset:36
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v80, off, s32 offset:28
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v67, off, s32 offset:20
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v68, off, s32 offset:12
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v65, off, s32 offset:4
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v39, 8, v1
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v49, 8, v3
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v48, 8, v5
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v51, 8, v7
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v50, 8, v9
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v53, 8, v11
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v52, 8, v13
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v54, 8, v15
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v17, 8, v17
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v55, 8, v19
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v19, 8, v21
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v23, 8, v23
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v21, 8, v25
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v27, 8, v27
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v25, 8, v29
-; GFX11-TRUE16-NEXT:    s_mov_b32 s4, 0
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(15)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v85, 8, v0
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(13)
-; GFX11-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v2
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(12)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v66, 8, v4
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(11)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v29, 8, v6
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(10)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v70, 8, v8
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(9)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v71, 8, v10
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(8)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v83, 8, v12
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(7)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v81, 8, v14
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(6)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v84, 8, v84
-; GFX11-TRUE16-NEXT:    s_and_b32 s5, vcc_lo, exec_lo
-; GFX11-TRUE16-NEXT:    s_cbranch_scc0 .LBB99_4
-; GFX11-TRUE16-NEXT:  ; %bb.1: ; %cmp.false
-; GFX11-TRUE16-NEXT:    s_and_b32 s5, s0, 0xff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s6, s1, 8
-; GFX11-TRUE16-NEXT:    s_and_b32 s7, s2, 0xff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s8, s3, 8
-; GFX11-TRUE16-NEXT:    s_or_b32 s5, s5, s6
-; GFX11-TRUE16-NEXT:    s_or_b32 s6, s7, s8
-; GFX11-TRUE16-NEXT:    s_and_b32 s7, s16, 0xff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s8, s17, 8
-; GFX11-TRUE16-NEXT:    s_and_b32 s9, s18, 0xff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s10, s19, 8
-; GFX11-TRUE16-NEXT:    s_or_b32 s7, s7, s8
-; GFX11-TRUE16-NEXT:    s_or_b32 s8, s9, s10
-; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s5, s5, s6
-; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s6, s7, s8
-; GFX11-TRUE16-NEXT:    s_and_b32 s7, s20, 0xff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s8, s21, 8
-; GFX11-TRUE16-NEXT:    s_and_b32 s9, s22, 0xff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s10, s23, 8
-; GFX11-TRUE16-NEXT:    s_or_b32 s7, s7, s8
-; GFX11-TRUE16-NEXT:    s_or_b32 s8, s9, s10
-; GFX11-TRUE16-NEXT:    s_and_b32 s9, s24, 0xff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s10, s25, 8
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v35
-; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s7, s7, s8
-; GFX11-TRUE16-NEXT:    s_or_b32 s8, s9, s10
-; GFX11-TRUE16-NEXT:    s_and_b32 s10, s28, 0xff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s11, s29, 8
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v0, v39
-; GFX11-TRUE16-NEXT:    s_or_b32 s10, s10, s11
-; GFX11-TRUE16-NEXT:    s_and_b32 s9, s26, 0xff
-; GFX11-TRUE16-NEXT:    v_and_b32_e64 v1, 0xffff, s10
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s12, s27, 8
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v3, 0xff, v38
-; GFX11-TRUE16-NEXT:    s_or_b32 s9, s9, s12
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v5, 0xff, v31
-; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s8, s8, s9
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v2, 0xff, v37
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v4, v0, 16, v1
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v33
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v3, v3, v48
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v6, 0xff, v36
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v2, v49
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v2, 0xff, v32
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v0, v51
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v7, v5, v50
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v8, v6, v52
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v2, v2, v53
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v9, 0xff, v24
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(1)
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v11, 0xff, v68
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v5, v3, 16, v1
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff, v34
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v3, 0xff, v16
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v6, v7, 16, v0
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v18
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v7, v8, 16, v2
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v1, v54
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v2, v3, v17
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v3, 0xff, v20
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v8, 0xff, v22
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v0, v55
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v12, 0xff, v67
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v3, v3, v19
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v10, v8, v23
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v8, v2, 16, v1
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v9, v21
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v2, 0xff, v26
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v10, 0xffff, v10
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v9, v3, 16, v0
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v3, 0xff, v30
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v13, 0xff, v80
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v14, 0xff, v82
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v28
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v2, v2, v27
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v10, v1, 16, v10
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff, v65
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v3, v3, v66
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v11, v11, v70
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v15, v12, v71
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v12, 0xff, v69
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v13, v13, v83
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v86, 0xff, v64
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v14, v14, v84
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v0, v25
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v1, v29
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v3, 0xffff, v3
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v87, 0xffff, v11
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v96, v12, v81
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v97, 0xffff, v13
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v86, v86, v85
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v98, 0xffff, v14
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v11, v0, 16, v2
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v12, v1, 16, v3
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v13, v15, 16, v87
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v14, v96, 16, v97
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v15, v86, 16, v98
-; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, s6
-; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v2, s7 :: v_dual_mov_b32 v3, s8
-; GFX11-TRUE16-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s4
-; GFX11-TRUE16-NEXT:    s_cbranch_vccnz .LBB99_3
-; GFX11-TRUE16-NEXT:  .LBB99_2: ; %cmp.true
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(1)
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v4, 3, v68
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v5, 3, v67
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v6, 3, v30
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v7, 3, v65
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v8, 3, v22
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v5, 0xff, v5
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v6, 0xff, v6
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v9, 3, v16
-; GFX11-TRUE16-NEXT:    s_add_i32 s28, s28, 3
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v4, v70, v4
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v5, v71, v5
-; GFX11-TRUE16-NEXT:    s_and_b32 s4, s28, 0xff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s5, s29, 8
-; GFX11-TRUE16-NEXT:    s_add_i32 s24, s24, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v11, 0x300, v4
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v13, 0x300, v5
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v4, v66, v6
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v5, 3, v26
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v6, 0xff, v7
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v7, 3, v28
-; GFX11-TRUE16-NEXT:    s_or_b32 s4, s5, s4
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v12, 0x300, v4
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v4, 0xff, v5
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v5, v29, v6
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v6, 0xff, v7
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v7, 0xff, v8
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v8, 3, v24
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v4, v27, v4
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v14, 0x300, v5
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v5, v25, v6
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v6, v23, v7
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v7, 0xff, v8
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v15, 0x300, v4
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v4, 3, v18
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v18, 0x300, v5
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v8, 3, v34
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v5, v21, v7
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v7, 3, v20
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; GFX11-TRUE16-NEXT:    s_and_b32 s5, s24, 0xff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s6, s25, 8
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v10, 0x300, v5
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v5, 0xff, v7
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v4, v55, v4
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v7, 0xff, v8
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v8, 0xff, v9
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v9, 3, v32
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v5, v19, v5
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v16, 0x300, v4
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v4, v54, v7
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v7, v17, v8
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v8, 0xff, v9
-; GFX11-TRUE16-NEXT:    s_add_i32 s26, s26, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v9, 0x300, v5
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v5, 3, v36
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v17, 0x300, v4
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v19, 0x300, v7
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v4, v53, v8
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v7, 3, v33
-; GFX11-TRUE16-NEXT:    s_or_b32 s5, s6, s5
-; GFX11-TRUE16-NEXT:    s_and_b32 s6, s26, 0xff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s7, s27, 8
-; GFX11-TRUE16-NEXT:    s_add_i32 s20, s20, 3
-; GFX11-TRUE16-NEXT:    s_or_b32 s6, s7, s6
-; GFX11-TRUE16-NEXT:    s_and_b32 s7, s20, 0xff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s8, s21, 8
-; GFX11-TRUE16-NEXT:    s_add_i32 s22, s22, 3
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v5, 0xff, v5
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v8, 3, v31
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v20, 3, v37
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v21, 0x300, v4
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v4, 0xff, v7
-; GFX11-TRUE16-NEXT:    s_or_b32 s7, s8, s7
-; GFX11-TRUE16-NEXT:    s_and_b32 s8, s22, 0xff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s9, s23, 8
-; GFX11-TRUE16-NEXT:    s_add_i32 s16, s16, 3
-; GFX11-TRUE16-NEXT:    s_or_b32 s8, s9, s8
-; GFX11-TRUE16-NEXT:    s_and_b32 s9, s16, 0xff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s10, s17, 8
-; GFX11-TRUE16-NEXT:    s_add_i32 s18, s18, 3
-; GFX11-TRUE16-NEXT:    s_add_i32 s0, s0, 3
-; GFX11-TRUE16-NEXT:    s_add_i32 s2, s2, 3
-; GFX11-TRUE16-NEXT:    s_or_b32 s9, s10, s9
-; GFX11-TRUE16-NEXT:    s_and_b32 s10, s18, 0xff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s11, s19, 8
-; GFX11-TRUE16-NEXT:    s_and_b32 s0, s0, 0xff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s1, s1, 8
-; GFX11-TRUE16-NEXT:    s_and_b32 s2, s2, 0xff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s3, s3, 8
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v5, v52, v5
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v7, 0xff, v8
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v8, 0xff, v20
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v20, 3, v38
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v4, v51, v4
-; GFX11-TRUE16-NEXT:    s_or_b32 s10, s11, s10
-; GFX11-TRUE16-NEXT:    s_or_b32 s0, s1, s0
-; GFX11-TRUE16-NEXT:    s_or_b32 s1, s3, s2
-; GFX11-TRUE16-NEXT:    s_addk_i32 s5, 0x300
-; GFX11-TRUE16-NEXT:    s_addk_i32 s6, 0x300
-; GFX11-TRUE16-NEXT:    s_addk_i32 s9, 0x300
-; GFX11-TRUE16-NEXT:    s_addk_i32 s0, 0x300
-; GFX11-TRUE16-NEXT:    s_addk_i32 s1, 0x300
-; GFX11-TRUE16-NEXT:    s_addk_i32 s10, 0x300
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v22, 0x300, v5
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v5, v50, v7
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v7, v49, v8
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v8, 0xff, v20
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v20, 0x300, v4
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v4, 3, v35
-; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s0, s0, s1
-; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s1, s9, s10
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v0, 3, v82
-; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s3, s5, s6
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v2, 3, v80
-; GFX11-TRUE16-NEXT:    s_addk_i32 s7, 0x300
-; GFX11-TRUE16-NEXT:    s_addk_i32 s8, 0x300
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v1, 3, v64
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s2, s7, s8
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v3, 3, v69
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v84, v0
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v2, v83, v2
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v23, 0x300, v5
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v5, 0x300, v7
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v7, v48, v8
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v4, v39, v4
-; GFX11-TRUE16-NEXT:    s_addk_i32 s4, 0x300
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v85, v1
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v0, 0x300, v0
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v3, v81, v3
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v2, 0x300, v2
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v6, 0x300, v6
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v7, 0x300, v7
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v4, 0x300, v4
-; GFX11-TRUE16-NEXT:    v_and_b32_e64 v8, 0xffff, s4
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v5, 0xffff, v5
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v16, 0xffff, v16
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v1, 0x300, v1
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v3, 0x300, v3
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v4, v4, 16, v8
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v5, v7, 16, v5
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v7, 0xffff, v20
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v8, 0xffff, v21
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v17, 0xffff, v17
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v20, 0xffff, v6
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v9, v9, 16, v16
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v15, 0xffff, v15
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v12, 0xffff, v12
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v16, 0xffff, v11
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v6, v23, 16, v7
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v7, v22, 16, v8
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v8, v19, 16, v17
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v10, v10, 16, v20
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v11, v18, 16, v15
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v12, v14, 16, v12
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v13, v13, 16, v16
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v14, v3, 16, v2
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v15, v1, 16, v0
-; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX11-TRUE16-NEXT:  .LBB99_3: ; %end
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
-; GFX11-TRUE16-NEXT:  .LBB99_4:
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
-; GFX11-TRUE16-NEXT:    s_branch .LBB99_2
-;
-; GFX11-FAKE16-LABEL: bitcast_v64i8_to_v32i16_scalar:
-; GFX11-FAKE16:       ; %bb.0:
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v34, v14 :: v_dual_mov_b32 v31, v8
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v36, v12 :: v_dual_mov_b32 v33, v6
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v32, v10 :: v_dual_mov_b32 v35, v0
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v38, v4 :: v_dual_mov_b32 v37, v2
-; GFX11-FAKE16-NEXT:    s_clause 0xf
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v0, off, s32 offset:56
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v65, off, s32 offset:52
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v2, off, s32 offset:60
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v4, off, s32
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v6, off, s32 offset:8
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v8, off, s32 offset:16
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v10, off, s32 offset:24
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v12, off, s32 offset:32
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v14, off, s32 offset:40
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v84, off, s32 offset:48
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v82, off, s32 offset:44
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v69, off, s32 offset:36
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v80, off, s32 offset:28
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v67, off, s32 offset:20
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v68, off, s32 offset:12
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v64, off, s32 offset:4
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v39, 8, v1
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v49, 8, v3
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v48, 8, v5
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v51, 8, v7
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v50, 8, v9
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v53, 8, v11
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v52, 8, v13
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v54, 8, v15
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v17, 8, v17
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v55, 8, v19
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v19, 8, v21
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v23, 8, v23
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v21, 8, v25
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v27, 8, v27
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 8, v29
-; GFX11-FAKE16-NEXT:    s_mov_b32 s4, 0
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(15)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v85, 8, v0
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(13)
-; GFX11-FAKE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v2
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(12)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v66, 8, v4
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(11)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v29, 8, v6
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(10)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v70, 8, v8
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(9)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v71, 8, v10
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(8)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v83, 8, v12
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(7)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v81, 8, v14
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(6)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v84, 8, v84
-; GFX11-FAKE16-NEXT:    s_and_b32 s5, vcc_lo, exec_lo
-; GFX11-FAKE16-NEXT:    s_cbranch_scc0 .LBB99_4
-; GFX11-FAKE16-NEXT:  ; %bb.1: ; %cmp.false
-; GFX11-FAKE16-NEXT:    s_and_b32 s5, s0, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s6, s1, 8
-; GFX11-FAKE16-NEXT:    s_and_b32 s7, s2, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s8, s3, 8
-; GFX11-FAKE16-NEXT:    s_or_b32 s5, s5, s6
-; GFX11-FAKE16-NEXT:    s_or_b32 s6, s7, s8
-; GFX11-FAKE16-NEXT:    s_and_b32 s7, s16, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s8, s17, 8
-; GFX11-FAKE16-NEXT:    s_and_b32 s9, s18, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s10, s19, 8
-; GFX11-FAKE16-NEXT:    s_or_b32 s7, s7, s8
-; GFX11-FAKE16-NEXT:    s_or_b32 s8, s9, s10
-; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s5, s5, s6
-; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s6, s7, s8
-; GFX11-FAKE16-NEXT:    s_and_b32 s7, s20, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s8, s21, 8
-; GFX11-FAKE16-NEXT:    s_and_b32 s9, s22, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s10, s23, 8
-; GFX11-FAKE16-NEXT:    s_or_b32 s7, s7, s8
-; GFX11-FAKE16-NEXT:    s_or_b32 s8, s9, s10
-; GFX11-FAKE16-NEXT:    s_and_b32 s9, s24, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s10, s25, 8
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v35
-; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s7, s7, s8
-; GFX11-FAKE16-NEXT:    s_or_b32 s8, s9, s10
-; GFX11-FAKE16-NEXT:    s_and_b32 s10, s28, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s11, s29, 8
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v39
-; GFX11-FAKE16-NEXT:    s_or_b32 s10, s10, s11
-; GFX11-FAKE16-NEXT:    s_and_b32 s9, s26, 0xff
-; GFX11-FAKE16-NEXT:    v_and_b32_e64 v1, 0xffff, s10
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s12, s27, 8
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v38
-; GFX11-FAKE16-NEXT:    s_or_b32 s9, s9, s12
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v5, 0xff, v31
-; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s8, s8, s9
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v37
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v4, v0, 16, v1
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v33
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v3, v48
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xff, v36
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v2, v49
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v32
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v51
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v7, v5, v50
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v8, v6, v52
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v2, v53
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v9, 0xff, v24
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(1)
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v11, 0xff, v68
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v5, v3, 16, v1
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v34
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v16
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v6, v7, 16, v0
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v18
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v7, v8, 16, v2
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v54
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v3, v17
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v20
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 0xff, v22
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v55
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v12, 0xff, v67
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v3, v19
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v10, v8, v23
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v8, v2, 16, v1
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v9, v21
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v26
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v10, 0xffff, v10
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v9, v3, 16, v0
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v30
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v13, 0xff, v80
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v14, 0xff, v82
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v28
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v2, v27
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v10, v1, 16, v10
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v64
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v3, v66
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v11, v11, v70
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v15, v12, v71
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v12, 0xff, v69
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v13, v13, v83
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v86, 0xff, v65
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v14, v14, v84
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v25
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v29
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xffff, v3
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v87, 0xffff, v11
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v96, v12, v81
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v97, 0xffff, v13
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v86, v86, v85
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v98, 0xffff, v14
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v11, v0, 16, v2
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v12, v1, 16, v3
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v13, v15, 16, v87
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v14, v96, 16, v97
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v15, v86, 16, v98
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, s6
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v2, s7 :: v_dual_mov_b32 v3, s8
-; GFX11-FAKE16-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s4
-; GFX11-FAKE16-NEXT:    s_cbranch_vccnz .LBB99_3
-; GFX11-FAKE16-NEXT:  .LBB99_2: ; %cmp.true
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(1)
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v4, 3, v68
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v5, 3, v67
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v6, 3, v30
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v7, 3, v64
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v8, 3, v22
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v5, 0xff, v5
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xff, v6
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v9, 3, v16
-; GFX11-FAKE16-NEXT:    s_add_i32 s28, s28, 3
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, v70, v4
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v71, v5
-; GFX11-FAKE16-NEXT:    s_and_b32 s4, s28, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s5, s29, 8
-; GFX11-FAKE16-NEXT:    s_add_i32 s24, s24, 3
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v11, 0x300, v4
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v13, 0x300, v5
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, v66, v6
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v5, 3, v26
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xff, v7
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v7, 3, v28
-; GFX11-FAKE16-NEXT:    s_or_b32 s4, s5, s4
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v12, 0x300, v4
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xff, v5
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v29, v6
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xff, v7
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xff, v8
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v8, 3, v24
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, v27, v4
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v14, 0x300, v5
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v25, v6
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v6, v23, v7
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xff, v8
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v15, 0x300, v4
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v4, 3, v18
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v18, 0x300, v5
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v8, 3, v34
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v21, v7
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v7, 3, v20
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; GFX11-FAKE16-NEXT:    s_and_b32 s5, s24, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s6, s25, 8
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v10, 0x300, v5
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v5, 0xff, v7
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, v55, v4
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xff, v8
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 0xff, v9
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v9, 3, v32
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v19, v5
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v16, 0x300, v4
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, v54, v7
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v7, v17, v8
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 0xff, v9
-; GFX11-FAKE16-NEXT:    s_add_i32 s26, s26, 3
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v9, 0x300, v5
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v5, 3, v36
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v17, 0x300, v4
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v19, 0x300, v7
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, v53, v8
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v7, 3, v33
-; GFX11-FAKE16-NEXT:    s_or_b32 s5, s6, s5
-; GFX11-FAKE16-NEXT:    s_and_b32 s6, s26, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s7, s27, 8
-; GFX11-FAKE16-NEXT:    s_add_i32 s20, s20, 3
-; GFX11-FAKE16-NEXT:    s_or_b32 s6, s7, s6
-; GFX11-FAKE16-NEXT:    s_and_b32 s7, s20, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s8, s21, 8
-; GFX11-FAKE16-NEXT:    s_add_i32 s22, s22, 3
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v5, 0xff, v5
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v8, 3, v31
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v20, 3, v37
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v21, 0x300, v4
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xff, v7
-; GFX11-FAKE16-NEXT:    s_or_b32 s7, s8, s7
-; GFX11-FAKE16-NEXT:    s_and_b32 s8, s22, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s9, s23, 8
-; GFX11-FAKE16-NEXT:    s_add_i32 s16, s16, 3
-; GFX11-FAKE16-NEXT:    s_or_b32 s8, s9, s8
-; GFX11-FAKE16-NEXT:    s_and_b32 s9, s16, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s10, s17, 8
-; GFX11-FAKE16-NEXT:    s_add_i32 s18, s18, 3
-; GFX11-FAKE16-NEXT:    s_add_i32 s0, s0, 3
-; GFX11-FAKE16-NEXT:    s_add_i32 s2, s2, 3
-; GFX11-FAKE16-NEXT:    s_or_b32 s9, s10, s9
-; GFX11-FAKE16-NEXT:    s_and_b32 s10, s18, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s11, s19, 8
-; GFX11-FAKE16-NEXT:    s_and_b32 s0, s0, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s1, s1, 8
-; GFX11-FAKE16-NEXT:    s_and_b32 s2, s2, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s3, s3, 8
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v52, v5
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xff, v8
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 0xff, v20
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v20, 3, v38
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, v51, v4
-; GFX11-FAKE16-NEXT:    s_or_b32 s10, s11, s10
-; GFX11-FAKE16-NEXT:    s_or_b32 s0, s1, s0
-; GFX11-FAKE16-NEXT:    s_or_b32 s1, s3, s2
-; GFX11-FAKE16-NEXT:    s_addk_i32 s5, 0x300
-; GFX11-FAKE16-NEXT:    s_addk_i32 s6, 0x300
-; GFX11-FAKE16-NEXT:    s_addk_i32 s9, 0x300
-; GFX11-FAKE16-NEXT:    s_addk_i32 s0, 0x300
-; GFX11-FAKE16-NEXT:    s_addk_i32 s1, 0x300
-; GFX11-FAKE16-NEXT:    s_addk_i32 s10, 0x300
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v22, 0x300, v5
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v50, v7
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v7, v49, v8
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 0xff, v20
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v20, 0x300, v4
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v4, 3, v35
-; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s0, s0, s1
-; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s1, s9, s10
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v0, 3, v82
-; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s3, s5, s6
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v2, 3, v80
-; GFX11-FAKE16-NEXT:    s_addk_i32 s7, 0x300
-; GFX11-FAKE16-NEXT:    s_addk_i32 s8, 0x300
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v1, 3, v65
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s2, s7, s8
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v3, 3, v69
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v84, v0
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v83, v2
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v23, 0x300, v5
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v5, 0x300, v7
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v7, v48, v8
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, v39, v4
-; GFX11-FAKE16-NEXT:    s_addk_i32 s4, 0x300
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v85, v1
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v0, 0x300, v0
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v81, v3
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v2, 0x300, v2
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v6, 0x300, v6
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v7, 0x300, v7
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v4, 0x300, v4
-; GFX11-FAKE16-NEXT:    v_and_b32_e64 v8, 0xffff, s4
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v5, 0xffff, v5
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v16, 0xffff, v16
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v1, 0x300, v1
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v3, 0x300, v3
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v4, v4, 16, v8
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v5, v7, 16, v5
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xffff, v20
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 0xffff, v21
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v17, 0xffff, v17
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v20, 0xffff, v6
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v9, v9, 16, v16
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v15, 0xffff, v15
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v12, 0xffff, v12
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v16, 0xffff, v11
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v6, v23, 16, v7
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v7, v22, 16, v8
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v8, v19, 16, v17
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v10, v10, 16, v20
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v11, v18, 16, v15
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v12, v14, 16, v12
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v13, v13, 16, v16
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v14, v3, 16, v2
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v15, v1, 16, v0
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX11-FAKE16-NEXT:  .LBB99_3: ; %end
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
-; GFX11-FAKE16-NEXT:  .LBB99_4:
-; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
-; GFX11-FAKE16-NEXT:    s_branch .LBB99_2
+; GFX11-LABEL: bitcast_v64i8_to_v32i16_scalar:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_dual_mov_b32 v34, v14 :: v_dual_mov_b32 v31, v8
+; GFX11-NEXT:    v_dual_mov_b32 v36, v12 :: v_dual_mov_b32 v33, v6
+; GFX11-NEXT:    v_dual_mov_b32 v32, v10 :: v_dual_mov_b32 v35, v0
+; GFX11-NEXT:    v_dual_mov_b32 v38, v4 :: v_dual_mov_b32 v37, v2
+; GFX11-NEXT:    s_clause 0xf
+; GFX11-NEXT:    scratch_load_u16 v0, off, s32 offset:56
+; GFX11-NEXT:    scratch_load_u16 v65, off, s32 offset:52
+; GFX11-NEXT:    scratch_load_b32 v2, off, s32 offset:60
+; GFX11-NEXT:    scratch_load_u16 v4, off, s32
+; GFX11-NEXT:    scratch_load_u16 v6, off, s32 offset:8
+; GFX11-NEXT:    scratch_load_u16 v8, off, s32 offset:16
+; GFX11-NEXT:    scratch_load_u16 v10, off, s32 offset:24
+; GFX11-NEXT:    scratch_load_u16 v12, off, s32 offset:32
+; GFX11-NEXT:    scratch_load_u16 v14, off, s32 offset:40
+; GFX11-NEXT:    scratch_load_u16 v84, off, s32 offset:48
+; GFX11-NEXT:    scratch_load_u16 v82, off, s32 offset:44
+; GFX11-NEXT:    scratch_load_u16 v69, off, s32 offset:36
+; GFX11-NEXT:    scratch_load_u16 v80, off, s32 offset:28
+; GFX11-NEXT:    scratch_load_u16 v67, off, s32 offset:20
+; GFX11-NEXT:    scratch_load_u16 v68, off, s32 offset:12
+; GFX11-NEXT:    scratch_load_u16 v64, off, s32 offset:4
+; GFX11-NEXT:    v_lshlrev_b32_e32 v39, 8, v1
+; GFX11-NEXT:    v_lshlrev_b32_e32 v49, 8, v3
+; GFX11-NEXT:    v_lshlrev_b32_e32 v48, 8, v5
+; GFX11-NEXT:    v_lshlrev_b32_e32 v51, 8, v7
+; GFX11-NEXT:    v_lshlrev_b32_e32 v50, 8, v9
+; GFX11-NEXT:    v_lshlrev_b32_e32 v53, 8, v11
+; GFX11-NEXT:    v_lshlrev_b32_e32 v52, 8, v13
+; GFX11-NEXT:    v_lshlrev_b32_e32 v54, 8, v15
+; GFX11-NEXT:    v_lshlrev_b32_e32 v17, 8, v17
+; GFX11-NEXT:    v_lshlrev_b32_e32 v55, 8, v19
+; GFX11-NEXT:    v_lshlrev_b32_e32 v19, 8, v21
+; GFX11-NEXT:    v_lshlrev_b32_e32 v23, 8, v23
+; GFX11-NEXT:    v_lshlrev_b32_e32 v21, 8, v25
+; GFX11-NEXT:    v_lshlrev_b32_e32 v27, 8, v27
+; GFX11-NEXT:    v_lshlrev_b32_e32 v25, 8, v29
+; GFX11-NEXT:    s_mov_b32 s4, 0
+; GFX11-NEXT:    s_waitcnt vmcnt(15)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v85, 8, v0
+; GFX11-NEXT:    s_waitcnt vmcnt(13)
+; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v2
+; GFX11-NEXT:    s_waitcnt vmcnt(12)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v66, 8, v4
+; GFX11-NEXT:    s_waitcnt vmcnt(11)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v29, 8, v6
+; GFX11-NEXT:    s_waitcnt vmcnt(10)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v70, 8, v8
+; GFX11-NEXT:    s_waitcnt vmcnt(9)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v71, 8, v10
+; GFX11-NEXT:    s_waitcnt vmcnt(8)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v83, 8, v12
+; GFX11-NEXT:    s_waitcnt vmcnt(7)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v81, 8, v14
+; GFX11-NEXT:    s_waitcnt vmcnt(6)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v84, 8, v84
+; GFX11-NEXT:    s_and_b32 s5, vcc_lo, exec_lo
+; GFX11-NEXT:    s_cbranch_scc0 .LBB99_4
+; GFX11-NEXT:  ; %bb.1: ; %cmp.false
+; GFX11-NEXT:    s_and_b32 s5, s0, 0xff
+; GFX11-NEXT:    s_lshl_b32 s6, s1, 8
+; GFX11-NEXT:    s_and_b32 s7, s2, 0xff
+; GFX11-NEXT:    s_lshl_b32 s8, s3, 8
+; GFX11-NEXT:    s_or_b32 s5, s5, s6
+; GFX11-NEXT:    s_or_b32 s6, s7, s8
+; GFX11-NEXT:    s_and_b32 s7, s16, 0xff
+; GFX11-NEXT:    s_lshl_b32 s8, s17, 8
+; GFX11-NEXT:    s_and_b32 s9, s18, 0xff
+; GFX11-NEXT:    s_lshl_b32 s10, s19, 8
+; GFX11-NEXT:    s_or_b32 s7, s7, s8
+; GFX11-NEXT:    s_or_b32 s8, s9, s10
+; GFX11-NEXT:    s_pack_ll_b32_b16 s5, s5, s6
+; GFX11-NEXT:    s_pack_ll_b32_b16 s6, s7, s8
+; GFX11-NEXT:    s_and_b32 s7, s20, 0xff
+; GFX11-NEXT:    s_lshl_b32 s8, s21, 8
+; GFX11-NEXT:    s_and_b32 s9, s22, 0xff
+; GFX11-NEXT:    s_lshl_b32 s10, s23, 8
+; GFX11-NEXT:    s_or_b32 s7, s7, s8
+; GFX11-NEXT:    s_or_b32 s8, s9, s10
+; GFX11-NEXT:    s_and_b32 s9, s24, 0xff
+; GFX11-NEXT:    s_lshl_b32 s10, s25, 8
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v35
+; GFX11-NEXT:    s_pack_ll_b32_b16 s7, s7, s8
+; GFX11-NEXT:    s_or_b32 s8, s9, s10
+; GFX11-NEXT:    s_and_b32 s10, s28, 0xff
+; GFX11-NEXT:    s_lshl_b32 s11, s29, 8
+; GFX11-NEXT:    v_or_b32_e32 v0, v0, v39
+; GFX11-NEXT:    s_or_b32 s10, s10, s11
+; GFX11-NEXT:    s_and_b32 s9, s26, 0xff
+; GFX11-NEXT:    v_and_b32_e64 v1, 0xffff, s10
+; GFX11-NEXT:    s_lshl_b32 s12, s27, 8
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v38
+; GFX11-NEXT:    s_or_b32 s9, s9, s12
+; GFX11-NEXT:    v_and_b32_e32 v5, 0xff, v31
+; GFX11-NEXT:    s_pack_ll_b32_b16 s8, s8, s9
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v37
+; GFX11-NEXT:    v_lshl_or_b32 v4, v0, 16, v1
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v33
+; GFX11-NEXT:    v_or_b32_e32 v3, v3, v48
+; GFX11-NEXT:    v_and_b32_e32 v6, 0xff, v36
+; GFX11-NEXT:    v_or_b32_e32 v1, v2, v49
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v32
+; GFX11-NEXT:    v_or_b32_e32 v0, v0, v51
+; GFX11-NEXT:    v_or_b32_e32 v7, v5, v50
+; GFX11-NEXT:    v_or_b32_e32 v8, v6, v52
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX11-NEXT:    v_or_b32_e32 v2, v2, v53
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    v_and_b32_e32 v9, 0xff, v24
+; GFX11-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-NEXT:    v_and_b32_e32 v11, 0xff, v68
+; GFX11-NEXT:    v_lshl_or_b32 v5, v3, 16, v1
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v34
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v16
+; GFX11-NEXT:    v_lshl_or_b32 v6, v7, 16, v0
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v18
+; GFX11-NEXT:    v_lshl_or_b32 v7, v8, 16, v2
+; GFX11-NEXT:    v_or_b32_e32 v1, v1, v54
+; GFX11-NEXT:    v_or_b32_e32 v2, v3, v17
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v20
+; GFX11-NEXT:    v_and_b32_e32 v8, 0xff, v22
+; GFX11-NEXT:    v_or_b32_e32 v0, v0, v55
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX11-NEXT:    v_and_b32_e32 v12, 0xff, v67
+; GFX11-NEXT:    v_or_b32_e32 v3, v3, v19
+; GFX11-NEXT:    v_or_b32_e32 v10, v8, v23
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    v_lshl_or_b32 v8, v2, 16, v1
+; GFX11-NEXT:    v_or_b32_e32 v1, v9, v21
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v26
+; GFX11-NEXT:    v_and_b32_e32 v10, 0xffff, v10
+; GFX11-NEXT:    v_lshl_or_b32 v9, v3, 16, v0
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v30
+; GFX11-NEXT:    v_and_b32_e32 v13, 0xff, v80
+; GFX11-NEXT:    v_and_b32_e32 v14, 0xff, v82
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v28
+; GFX11-NEXT:    v_or_b32_e32 v2, v2, v27
+; GFX11-NEXT:    v_lshl_or_b32 v10, v1, 16, v10
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v64
+; GFX11-NEXT:    v_or_b32_e32 v3, v3, v66
+; GFX11-NEXT:    v_or_b32_e32 v11, v11, v70
+; GFX11-NEXT:    v_or_b32_e32 v15, v12, v71
+; GFX11-NEXT:    v_and_b32_e32 v12, 0xff, v69
+; GFX11-NEXT:    v_or_b32_e32 v13, v13, v83
+; GFX11-NEXT:    v_and_b32_e32 v86, 0xff, v65
+; GFX11-NEXT:    v_or_b32_e32 v14, v14, v84
+; GFX11-NEXT:    v_or_b32_e32 v0, v0, v25
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX11-NEXT:    v_or_b32_e32 v1, v1, v29
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff, v3
+; GFX11-NEXT:    v_and_b32_e32 v87, 0xffff, v11
+; GFX11-NEXT:    v_or_b32_e32 v96, v12, v81
+; GFX11-NEXT:    v_and_b32_e32 v97, 0xffff, v13
+; GFX11-NEXT:    v_or_b32_e32 v86, v86, v85
+; GFX11-NEXT:    v_and_b32_e32 v98, 0xffff, v14
+; GFX11-NEXT:    v_lshl_or_b32 v11, v0, 16, v2
+; GFX11-NEXT:    v_lshl_or_b32 v12, v1, 16, v3
+; GFX11-NEXT:    v_lshl_or_b32 v13, v15, 16, v87
+; GFX11-NEXT:    v_lshl_or_b32 v14, v96, 16, v97
+; GFX11-NEXT:    v_lshl_or_b32 v15, v86, 16, v98
+; GFX11-NEXT:    v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, s6
+; GFX11-NEXT:    v_dual_mov_b32 v2, s7 :: v_dual_mov_b32 v3, s8
+; GFX11-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s4
+; GFX11-NEXT:    s_cbranch_vccnz .LBB99_3
+; GFX11-NEXT:  .LBB99_2: ; %cmp.true
+; GFX11-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-NEXT:    v_add_nc_u32_e32 v4, 3, v68
+; GFX11-NEXT:    v_add_nc_u32_e32 v5, 3, v67
+; GFX11-NEXT:    v_add_nc_u32_e32 v6, 3, v30
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_add_nc_u32_e32 v7, 3, v64
+; GFX11-NEXT:    v_add_nc_u32_e32 v8, 3, v22
+; GFX11-NEXT:    v_and_b32_e32 v4, 0xff, v4
+; GFX11-NEXT:    v_and_b32_e32 v5, 0xff, v5
+; GFX11-NEXT:    v_and_b32_e32 v6, 0xff, v6
+; GFX11-NEXT:    v_add_nc_u32_e32 v9, 3, v16
+; GFX11-NEXT:    s_add_i32 s28, s28, 3
+; GFX11-NEXT:    v_or_b32_e32 v4, v70, v4
+; GFX11-NEXT:    v_or_b32_e32 v5, v71, v5
+; GFX11-NEXT:    s_and_b32 s4, s28, 0xff
+; GFX11-NEXT:    s_lshl_b32 s5, s29, 8
+; GFX11-NEXT:    s_add_i32 s24, s24, 3
+; GFX11-NEXT:    v_add_nc_u32_e32 v11, 0x300, v4
+; GFX11-NEXT:    v_add_nc_u32_e32 v13, 0x300, v5
+; GFX11-NEXT:    v_or_b32_e32 v4, v66, v6
+; GFX11-NEXT:    v_add_nc_u32_e32 v5, 3, v26
+; GFX11-NEXT:    v_and_b32_e32 v6, 0xff, v7
+; GFX11-NEXT:    v_add_nc_u32_e32 v7, 3, v28
+; GFX11-NEXT:    s_or_b32 s4, s5, s4
+; GFX11-NEXT:    v_add_nc_u32_e32 v12, 0x300, v4
+; GFX11-NEXT:    v_and_b32_e32 v4, 0xff, v5
+; GFX11-NEXT:    v_or_b32_e32 v5, v29, v6
+; GFX11-NEXT:    v_and_b32_e32 v6, 0xff, v7
+; GFX11-NEXT:    v_and_b32_e32 v7, 0xff, v8
+; GFX11-NEXT:    v_add_nc_u32_e32 v8, 3, v24
+; GFX11-NEXT:    v_or_b32_e32 v4, v27, v4
+; GFX11-NEXT:    v_add_nc_u32_e32 v14, 0x300, v5
+; GFX11-NEXT:    v_or_b32_e32 v5, v25, v6
+; GFX11-NEXT:    v_or_b32_e32 v6, v23, v7
+; GFX11-NEXT:    v_and_b32_e32 v7, 0xff, v8
+; GFX11-NEXT:    v_add_nc_u32_e32 v15, 0x300, v4
+; GFX11-NEXT:    v_add_nc_u32_e32 v4, 3, v18
+; GFX11-NEXT:    v_add_nc_u32_e32 v18, 0x300, v5
+; GFX11-NEXT:    v_add_nc_u32_e32 v8, 3, v34
+; GFX11-NEXT:    v_or_b32_e32 v5, v21, v7
+; GFX11-NEXT:    v_add_nc_u32_e32 v7, 3, v20
+; GFX11-NEXT:    v_and_b32_e32 v4, 0xff, v4
+; GFX11-NEXT:    s_and_b32 s5, s24, 0xff
+; GFX11-NEXT:    s_lshl_b32 s6, s25, 8
+; GFX11-NEXT:    v_add_nc_u32_e32 v10, 0x300, v5
+; GFX11-NEXT:    v_and_b32_e32 v5, 0xff, v7
+; GFX11-NEXT:    v_or_b32_e32 v4, v55, v4
+; GFX11-NEXT:    v_and_b32_e32 v7, 0xff, v8
+; GFX11-NEXT:    v_and_b32_e32 v8, 0xff, v9
+; GFX11-NEXT:    v_add_nc_u32_e32 v9, 3, v32
+; GFX11-NEXT:    v_or_b32_e32 v5, v19, v5
+; GFX11-NEXT:    v_add_nc_u32_e32 v16, 0x300, v4
+; GFX11-NEXT:    v_or_b32_e32 v4, v54, v7
+; GFX11-NEXT:    v_or_b32_e32 v7, v17, v8
+; GFX11-NEXT:    v_and_b32_e32 v8, 0xff, v9
+; GFX11-NEXT:    s_add_i32 s26, s26, 3
+; GFX11-NEXT:    v_add_nc_u32_e32 v9, 0x300, v5
+; GFX11-NEXT:    v_add_nc_u32_e32 v5, 3, v36
+; GFX11-NEXT:    v_add_nc_u32_e32 v17, 0x300, v4
+; GFX11-NEXT:    v_add_nc_u32_e32 v19, 0x300, v7
+; GFX11-NEXT:    v_or_b32_e32 v4, v53, v8
+; GFX11-NEXT:    v_add_nc_u32_e32 v7, 3, v33
+; GFX11-NEXT:    s_or_b32 s5, s6, s5
+; GFX11-NEXT:    s_and_b32 s6, s26, 0xff
+; GFX11-NEXT:    s_lshl_b32 s7, s27, 8
+; GFX11-NEXT:    s_add_i32 s20, s20, 3
+; GFX11-NEXT:    s_or_b32 s6, s7, s6
+; GFX11-NEXT:    s_and_b32 s7, s20, 0xff
+; GFX11-NEXT:    s_lshl_b32 s8, s21, 8
+; GFX11-NEXT:    s_add_i32 s22, s22, 3
+; GFX11-NEXT:    v_and_b32_e32 v5, 0xff, v5
+; GFX11-NEXT:    v_add_nc_u32_e32 v8, 3, v31
+; GFX11-NEXT:    v_add_nc_u32_e32 v20, 3, v37
+; GFX11-NEXT:    v_add_nc_u32_e32 v21, 0x300, v4
+; GFX11-NEXT:    v_and_b32_e32 v4, 0xff, v7
+; GFX11-NEXT:    s_or_b32 s7, s8, s7
+; GFX11-NEXT:    s_and_b32 s8, s22, 0xff
+; GFX11-NEXT:    s_lshl_b32 s9, s23, 8
+; GFX11-NEXT:    s_add_i32 s16, s16, 3
+; GFX11-NEXT:    s_or_b32 s8, s9, s8
+; GFX11-NEXT:    s_and_b32 s9, s16, 0xff
+; GFX11-NEXT:    s_lshl_b32 s10, s17, 8
+; GFX11-NEXT:    s_add_i32 s18, s18, 3
+; GFX11-NEXT:    s_add_i32 s0, s0, 3
+; GFX11-NEXT:    s_add_i32 s2, s2, 3
+; GFX11-NEXT:    s_or_b32 s9, s10, s9
+; GFX11-NEXT:    s_and_b32 s10, s18, 0xff
+; GFX11-NEXT:    s_lshl_b32 s11, s19, 8
+; GFX11-NEXT:    s_and_b32 s0, s0, 0xff
+; GFX11-NEXT:    s_lshl_b32 s1, s1, 8
+; GFX11-NEXT:    s_and_b32 s2, s2, 0xff
+; GFX11-NEXT:    s_lshl_b32 s3, s3, 8
+; GFX11-NEXT:    v_or_b32_e32 v5, v52, v5
+; GFX11-NEXT:    v_and_b32_e32 v7, 0xff, v8
+; GFX11-NEXT:    v_and_b32_e32 v8, 0xff, v20
+; GFX11-NEXT:    v_add_nc_u32_e32 v20, 3, v38
+; GFX11-NEXT:    v_or_b32_e32 v4, v51, v4
+; GFX11-NEXT:    s_or_b32 s10, s11, s10
+; GFX11-NEXT:    s_or_b32 s0, s1, s0
+; GFX11-NEXT:    s_or_b32 s1, s3, s2
+; GFX11-NEXT:    s_addk_i32 s5, 0x300
+; GFX11-NEXT:    s_addk_i32 s6, 0x300
+; GFX11-NEXT:    s_addk_i32 s9, 0x300
+; GFX11-NEXT:    s_addk_i32 s0, 0x300
+; GFX11-NEXT:    s_addk_i32 s1, 0x300
+; GFX11-NEXT:    s_addk_i32 s10, 0x300
+; GFX11-NEXT:    v_add_nc_u32_e32 v22, 0x300, v5
+; GFX11-NEXT:    v_or_b32_e32 v5, v50, v7
+; GFX11-NEXT:    v_or_b32_e32 v7, v49, v8
+; GFX11-NEXT:    v_and_b32_e32 v8, 0xff, v20
+; GFX11-NEXT:    v_add_nc_u32_e32 v20, 0x300, v4
+; GFX11-NEXT:    v_add_nc_u32_e32 v4, 3, v35
+; GFX11-NEXT:    s_pack_ll_b32_b16 s0, s0, s1
+; GFX11-NEXT:    s_pack_ll_b32_b16 s1, s9, s10
+; GFX11-NEXT:    v_add_nc_u32_e32 v0, 3, v82
+; GFX11-NEXT:    s_pack_ll_b32_b16 s3, s5, s6
+; GFX11-NEXT:    v_add_nc_u32_e32 v2, 3, v80
+; GFX11-NEXT:    s_addk_i32 s7, 0x300
+; GFX11-NEXT:    s_addk_i32 s8, 0x300
+; GFX11-NEXT:    v_add_nc_u32_e32 v1, 3, v65
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GFX11-NEXT:    v_and_b32_e32 v4, 0xff, v4
+; GFX11-NEXT:    s_pack_ll_b32_b16 s2, s7, s8
+; GFX11-NEXT:    v_add_nc_u32_e32 v3, 3, v69
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX11-NEXT:    v_or_b32_e32 v0, v84, v0
+; GFX11-NEXT:    v_or_b32_e32 v2, v83, v2
+; GFX11-NEXT:    v_add_nc_u32_e32 v23, 0x300, v5
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v3
+; GFX11-NEXT:    v_add_nc_u32_e32 v5, 0x300, v7
+; GFX11-NEXT:    v_or_b32_e32 v7, v48, v8
+; GFX11-NEXT:    v_or_b32_e32 v4, v39, v4
+; GFX11-NEXT:    s_addk_i32 s4, 0x300
+; GFX11-NEXT:    v_or_b32_e32 v1, v85, v1
+; GFX11-NEXT:    v_add_nc_u32_e32 v0, 0x300, v0
+; GFX11-NEXT:    v_or_b32_e32 v3, v81, v3
+; GFX11-NEXT:    v_add_nc_u32_e32 v2, 0x300, v2
+; GFX11-NEXT:    v_add_nc_u32_e32 v6, 0x300, v6
+; GFX11-NEXT:    v_add_nc_u32_e32 v7, 0x300, v7
+; GFX11-NEXT:    v_add_nc_u32_e32 v4, 0x300, v4
+; GFX11-NEXT:    v_and_b32_e64 v8, 0xffff, s4
+; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff, v5
+; GFX11-NEXT:    v_and_b32_e32 v16, 0xffff, v16
+; GFX11-NEXT:    v_add_nc_u32_e32 v1, 0x300, v1
+; GFX11-NEXT:    v_add_nc_u32_e32 v3, 0x300, v3
+; GFX11-NEXT:    v_lshl_or_b32 v4, v4, 16, v8
+; GFX11-NEXT:    v_lshl_or_b32 v5, v7, 16, v5
+; GFX11-NEXT:    v_and_b32_e32 v7, 0xffff, v20
+; GFX11-NEXT:    v_and_b32_e32 v8, 0xffff, v21
+; GFX11-NEXT:    v_and_b32_e32 v17, 0xffff, v17
+; GFX11-NEXT:    v_and_b32_e32 v20, 0xffff, v6
+; GFX11-NEXT:    v_lshl_or_b32 v9, v9, 16, v16
+; GFX11-NEXT:    v_and_b32_e32 v15, 0xffff, v15
+; GFX11-NEXT:    v_and_b32_e32 v12, 0xffff, v12
+; GFX11-NEXT:    v_and_b32_e32 v16, 0xffff, v11
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    v_lshl_or_b32 v6, v23, 16, v7
+; GFX11-NEXT:    v_lshl_or_b32 v7, v22, 16, v8
+; GFX11-NEXT:    v_lshl_or_b32 v8, v19, 16, v17
+; GFX11-NEXT:    v_lshl_or_b32 v10, v10, 16, v20
+; GFX11-NEXT:    v_lshl_or_b32 v11, v18, 16, v15
+; GFX11-NEXT:    v_lshl_or_b32 v12, v14, 16, v12
+; GFX11-NEXT:    v_lshl_or_b32 v13, v13, 16, v16
+; GFX11-NEXT:    v_lshl_or_b32 v14, v3, 16, v2
+; GFX11-NEXT:    v_lshl_or_b32 v15, v1, 16, v0
+; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-NEXT:  .LBB99_3: ; %end
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-NEXT:  .LBB99_4:
+; GFX11-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+; GFX11-NEXT:    s_branch .LBB99_2
   %cmp = icmp eq i32 %b, 0
   br i1 %cmp, label %cmp.true, label %cmp.false
 
@@ -80636,45 +78746,45 @@ define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) {
 ; GFX11-TRUE16:       ; %bb.0:
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-TRUE16-NEXT:    s_clause 0x1f
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v55, off, s32 offset:128
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v32, off, s32 offset:124
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v55, off, s32 offset:120
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v33, off, s32 offset:116
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v53, off, s32 offset:112
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v31, off, s32 offset:108
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v53, off, s32 offset:104
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v32, off, s32 offset:100
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v52, off, s32 offset:96
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v31, off, s32 offset:92
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v54, off, s32 offset:88
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v66, off, s32 offset:132
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v39, off, s32
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v48, off, s32 offset:8
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v48, off, s32 offset:16
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v50, off, s32 offset:24
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v50, off, s32 offset:32
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v51, off, s32 offset:40
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v52, off, s32 offset:48
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v54, off, s32 offset:56
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v64, off, s32 offset:64
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v64, off, s32 offset:72
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v65, off, s32 offset:80
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v38, off, s32 offset:84
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v36, off, s32 offset:76
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v38, off, s32 offset:68
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v35, off, s32 offset:60
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v37, off, s32 offset:52
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v36, off, s32 offset:44
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v37, off, s32 offset:36
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v34, off, s32 offset:28
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v34, off, s32 offset:20
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v80, off, s32 offset:128
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v48, off, s32 offset:124
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v39, off, s32 offset:120
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v51, off, s32 offset:116
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v81, off, s32 offset:112
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v49, off, s32 offset:108
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v37, off, s32 offset:104
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v52, off, s32 offset:100
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v82, off, s32 offset:96
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v50, off, s32 offset:92
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v83, off, s32 offset:88
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v32, off, s32 offset:132
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v34, off, s32
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v35, off, s32 offset:8
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v36, off, s32 offset:16
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v38, off, s32 offset:24
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v84, off, s32 offset:32
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v85, off, s32 offset:40
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v86, off, s32 offset:48
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v87, off, s32 offset:56
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v96, off, s32 offset:64
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v97, off, s32 offset:72
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v98, off, s32 offset:80
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v70, off, s32 offset:84
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v54, off, s32 offset:76
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v71, off, s32 offset:68
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v64, off, s32 offset:60
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v67, off, s32 offset:52
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v66, off, s32 offset:44
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v69, off, s32 offset:36
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v53, off, s32 offset:28
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v65, off, s32 offset:20
 ; GFX11-TRUE16-NEXT:    s_clause 0x1
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v33, off, s32 offset:12
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v35, off, s32 offset:4
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v55, off, s32 offset:12
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v68, off, s32 offset:4
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v29.h, v29.l
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v29.l, v27.l
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v28.h, v25.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v39.l, v23.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v31.l, v23.l
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v30.h, v21.l
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v26.h, v19.l
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v25.h, v17.l
@@ -80700,47 +78810,42 @@ define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) {
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v25.h, 8, v25.h
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v26.h, 8, v26.h
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v30.h, 8, v30.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v39.l, 8, v39.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v31.l, 8, v31.l
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v28.h, 8, v28.h
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v29.l, 8, v29.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v49.l, 8, v29.h
-; GFX11-TRUE16-NEXT:    s_mov_b32 s0, exec_lo
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v33.l, 8, v29.h
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(33)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v55.h, 8, v55.h
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(31)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v55.l, 8, v55.l
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(29)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v53.h, 8, v53.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v39.h, 8, v80.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v39.l, 8, v39.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(27)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v53.l, 8, v53.l
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(25)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v52.h, 8, v52.h
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(23)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v54.h, 8, v54.h
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(21)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v29.h, 8, v39.h
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(20)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v49.h, 8, v48.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v37.h, 8, v81.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v37.l, 8, v37.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(19)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v39.h, 8, v48.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v36.h, 8, v82.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(18)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v48.l, 8, v50.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v38.h, 8, v83.l
+; GFX11-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v32
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v29.h, 8, v34.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v33.h, 8, v35.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v31.h, 8, v36.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v32.l, 8, v38.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(17)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v48.h, 8, v50.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v32.h, 8, v84.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(16)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v51.h, 8, v51.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v35.h, 8, v85.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(15)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v52.l, 8, v52.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v36.l, 8, v86.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(14)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v50.l, 8, v54.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v34.l, 8, v87.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(13)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v50.h, 8, v64.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v34.h, 8, v96.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(12)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v54.l, 8, v64.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v38.l, 8, v97.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(11)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v51.l, 8, v65.l
-; GFX11-TRUE16-NEXT:    v_cmpx_ne_u32_e32 0, v66
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v35.l, 8, v98.l
+; GFX11-TRUE16-NEXT:    s_and_saveexec_b32 s0, vcc_lo
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-TRUE16-NEXT:    s_xor_b32 s0, exec_lo, s0
 ; GFX11-TRUE16-NEXT:    s_cbranch_execnz .LBB106_3
 ; GFX11-TRUE16-NEXT:  ; %bb.1: ; %Flow
@@ -80768,22 +78873,22 @@ define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) {
 ; GFX11-TRUE16-NEXT:    v_and_b16 v7.l, 0xff, v28.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v7.h, 0xff, v30.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    v_and_b16 v8.l, 0xff, v35.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v8.h, 0xff, v33.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v9.l, 0xff, v34.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v9.h, 0xff, v34.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v10.l, 0xff, v37.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v10.h, 0xff, v36.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v11.l, 0xff, v37.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v11.h, 0xff, v35.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v12.l, 0xff, v38.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v12.h, 0xff, v36.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v13.l, 0xff, v38.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v13.h, 0xff, v31.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v14.l, 0xff, v32.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v14.h, 0xff, v31.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v15.l, 0xff, v33.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v15.h, 0xff, v32.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v8.l, 0xff, v68.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v8.h, 0xff, v55.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v9.l, 0xff, v65.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v9.h, 0xff, v53.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v10.l, 0xff, v69.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v10.h, 0xff, v66.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v11.l, 0xff, v67.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v11.h, 0xff, v64.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v12.l, 0xff, v71.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v12.h, 0xff, v54.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v13.l, 0xff, v70.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v13.h, 0xff, v50.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v14.l, 0xff, v52.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v14.h, 0xff, v49.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v15.l, 0xff, v51.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v15.h, 0xff, v48.l
 ; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v0.l, v23.l
 ; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v0.h, v23.h
 ; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v1.l, v20.l
@@ -80795,27 +78900,27 @@ define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) {
 ; GFX11-TRUE16-NEXT:    v_or_b16 v4.l, v4.l, v25.h
 ; GFX11-TRUE16-NEXT:    v_or_b16 v4.h, v4.h, v26.h
 ; GFX11-TRUE16-NEXT:    v_or_b16 v5.l, v5.l, v30.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v5.h, v5.h, v39.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v5.h, v5.h, v31.l
 ; GFX11-TRUE16-NEXT:    v_or_b16 v6.l, v6.l, v28.h
 ; GFX11-TRUE16-NEXT:    v_or_b16 v6.h, v6.h, v29.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v7.l, v7.l, v49.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v7.l, v7.l, v33.l
 ; GFX11-TRUE16-NEXT:    v_or_b16 v7.h, v7.h, v29.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v8.l, v8.l, v49.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v8.h, v8.h, v39.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v9.l, v9.l, v48.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v9.h, v9.h, v48.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v10.l, v10.l, v51.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v10.h, v10.h, v52.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v11.l, v11.l, v50.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v11.h, v11.h, v50.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v12.l, v12.l, v54.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v12.h, v12.h, v51.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v13.l, v13.l, v54.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v13.h, v13.h, v52.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v14.l, v14.l, v53.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v14.h, v14.h, v53.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v15.l, v15.l, v55.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v15.h, v15.h, v55.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v8.l, v8.l, v33.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v8.h, v8.h, v31.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v9.l, v9.l, v32.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v9.h, v9.h, v32.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v10.l, v10.l, v35.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v10.h, v10.h, v36.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v11.l, v11.l, v34.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v11.h, v11.h, v34.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v12.l, v12.l, v38.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v12.h, v12.h, v35.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v13.l, v13.l, v38.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v13.h, v13.h, v36.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v14.l, v14.l, v37.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v14.h, v14.h, v37.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v15.l, v15.l, v39.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v15.h, v15.h, v39.h
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr18_hi16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr17_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr19_lo16
@@ -80832,22 +78937,22 @@ define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) {
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr26_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr28_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr30_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr35_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr33_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr34_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr34_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr37_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr36_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr37_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr35_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr38_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr36_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr38_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr31_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr32_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr31_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr33_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr32_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr48
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr51
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr49
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr52
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr50
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr70
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr54
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr71
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr64
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr67
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr66
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr69
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr53
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr65
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr55
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr68
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr23_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr23_hi16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr20_lo16
@@ -80859,46 +78964,46 @@ define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) {
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr25_hi16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr26_hi16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr30_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr39_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr31_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr28_hi16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr29_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr49_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr33_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr29_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr49_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr33_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr31_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr32_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr32_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr35_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr36_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr34_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr34_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr38_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr35_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr38_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr36_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr37_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr37_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr39_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr39_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr48_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr48_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr51_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr52_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr50_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr50_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr54_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr51_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr54_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr52_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr53_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr53_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr55_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr55_hi16
 ; GFX11-TRUE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
 ; GFX11-TRUE16-NEXT:    s_cbranch_execz .LBB106_2
 ; GFX11-TRUE16-NEXT:  .LBB106_4: ; %cmp.true
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v33.l, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, v32.l, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, v32.h, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.h, v31.h, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.h, v31.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v51.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, v48.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, v52.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.h, v49.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.h, v50.l, 3
 ; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v0.h
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(8)
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v3.l, v38.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v3.l, v71.l, 3
 ; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v1.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v1.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v55.l, v0.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v55.h, v0.h
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, v38.h, 3
-; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v53.l, v1.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, v53.h, v1.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v39.l, v0.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v39.h, v0.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, v70.l, 3
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v37.l, v1.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, v37.h, v1.h
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v15.l, 0x300, v0.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v15.h, 0x300, v0.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v2.h
@@ -80906,46 +79011,46 @@ define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) {
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v14.l, 0x300, v1.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v14.h, 0x300, v1.h
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(7)
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, v35.h, 3
-; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v52.h, v0.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v54.l, v0.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, v64.l, 3
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v36.h, v0.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v38.l, v0.h
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(4)
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.h, v37.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.h, v69.l, 3
 ; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v2.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v1.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v13.h, 0x300, v0.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v12.l, 0x300, v0.h
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v36.h, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, v37.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v54.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, v67.l, 3
 ; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v1.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v54.h, v2.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v50.h, v1.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v38.h, v2.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v34.h, v1.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v0.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, v51.h, v1.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, v35.h, v1.h
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v13.l, 0x300, v2.l
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, v36.l, 3
-; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v51.l, v0.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v50.l, v0.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, v66.l, 3
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v35.l, v0.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v34.l, v0.h
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v11.h, 0x300, v1.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v10.l, 0x300, v1.h
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, v35.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, v68.l, 3
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v12.h, 0x300, v0.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v11.l, 0x300, v0.h
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v34.h, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, v34.l, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.h, v33.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v65.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, v53.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.h, v55.l, 3
 ; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v2.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v1.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v0.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v1.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v52.l, v2.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v49.h, v1.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v48.l, v0.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v48.h, v0.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, v39.h, v1.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v36.l, v2.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v33.h, v1.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v32.l, v0.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v32.h, v0.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, v31.h, v1.h
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v10.h, 0x300, v2.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, v28.l, 3
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v9.l, 0x300, v0.l
@@ -80961,7 +79066,7 @@ define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) {
 ; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v0.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v1.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v1.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v49.l, v2.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v33.l, v2.l
 ; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v29.h, v0.l
 ; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v28.h, v0.h
 ; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v29.l, v1.l
@@ -80981,7 +79086,7 @@ define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) {
 ; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v0.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v1.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v1.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v39.l, v2.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v31.l, v2.l
 ; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v25.h, v0.l
 ; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v26.h, v0.h
 ; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v27.h, v1.l
@@ -82684,695 +80789,350 @@ define inreg <32 x half> @bitcast_v64i8_to_v32f16_scalar(<64 x i8> inreg %a, i32
 ; GFX9-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
 ; GFX9-NEXT:    s_branch .LBB107_2
 ;
-; GFX11-TRUE16-LABEL: bitcast_v64i8_to_v32f16_scalar:
-; GFX11-TRUE16:       ; %bb.0:
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v34, v14 :: v_dual_mov_b32 v31, v8
-; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v36, v12 :: v_dual_mov_b32 v33, v6
-; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v32, v10 :: v_dual_mov_b32 v35, v0
-; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v38, v4 :: v_dual_mov_b32 v37, v2
-; GFX11-TRUE16-NEXT:    s_clause 0xf
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v0, off, s32 offset:56
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v64, off, s32 offset:52
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v2, off, s32 offset:60
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v4, off, s32
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v6, off, s32 offset:8
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v8, off, s32 offset:16
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v10, off, s32 offset:24
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v12, off, s32 offset:32
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v14, off, s32 offset:40
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v84, off, s32 offset:48
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v82, off, s32 offset:44
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v69, off, s32 offset:36
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v80, off, s32 offset:28
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v67, off, s32 offset:20
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v68, off, s32 offset:12
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v65, off, s32 offset:4
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v39, 8, v1
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v49, 8, v3
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v48, 8, v5
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v51, 8, v7
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v50, 8, v9
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v53, 8, v11
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v52, 8, v13
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v54, 8, v15
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v17, 8, v17
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v55, 8, v19
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v19, 8, v21
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v23, 8, v23
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v21, 8, v25
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v27, 8, v27
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v25, 8, v29
-; GFX11-TRUE16-NEXT:    s_mov_b32 s4, 0
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(15)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v85, 8, v0
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(13)
-; GFX11-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v2
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(12)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v66, 8, v4
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(11)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v29, 8, v6
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(10)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v70, 8, v8
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(9)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v71, 8, v10
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(8)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v83, 8, v12
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(7)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v81, 8, v14
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(6)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v84, 8, v84
-; GFX11-TRUE16-NEXT:    s_and_b32 s5, vcc_lo, exec_lo
-; GFX11-TRUE16-NEXT:    s_cbranch_scc0 .LBB107_4
-; GFX11-TRUE16-NEXT:  ; %bb.1: ; %cmp.false
-; GFX11-TRUE16-NEXT:    s_and_b32 s5, s0, 0xff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s6, s1, 8
-; GFX11-TRUE16-NEXT:    s_and_b32 s7, s2, 0xff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s8, s3, 8
-; GFX11-TRUE16-NEXT:    s_or_b32 s5, s5, s6
-; GFX11-TRUE16-NEXT:    s_or_b32 s6, s7, s8
-; GFX11-TRUE16-NEXT:    s_and_b32 s7, s16, 0xff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s8, s17, 8
-; GFX11-TRUE16-NEXT:    s_and_b32 s9, s18, 0xff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s10, s19, 8
-; GFX11-TRUE16-NEXT:    s_or_b32 s7, s7, s8
-; GFX11-TRUE16-NEXT:    s_or_b32 s8, s9, s10
-; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s5, s5, s6
-; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s6, s7, s8
-; GFX11-TRUE16-NEXT:    s_and_b32 s7, s20, 0xff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s8, s21, 8
-; GFX11-TRUE16-NEXT:    s_and_b32 s9, s22, 0xff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s10, s23, 8
-; GFX11-TRUE16-NEXT:    s_or_b32 s7, s7, s8
-; GFX11-TRUE16-NEXT:    s_or_b32 s8, s9, s10
-; GFX11-TRUE16-NEXT:    s_and_b32 s9, s24, 0xff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s10, s25, 8
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v35
-; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s7, s7, s8
-; GFX11-TRUE16-NEXT:    s_or_b32 s8, s9, s10
-; GFX11-TRUE16-NEXT:    s_and_b32 s10, s28, 0xff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s11, s29, 8
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v0, v39
-; GFX11-TRUE16-NEXT:    s_or_b32 s10, s10, s11
-; GFX11-TRUE16-NEXT:    s_and_b32 s9, s26, 0xff
-; GFX11-TRUE16-NEXT:    v_and_b32_e64 v1, 0xffff, s10
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s12, s27, 8
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v3, 0xff, v38
-; GFX11-TRUE16-NEXT:    s_or_b32 s9, s9, s12
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v5, 0xff, v31
-; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s8, s8, s9
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v2, 0xff, v37
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v4, v0, 16, v1
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v33
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v3, v3, v48
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v6, 0xff, v36
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v2, v49
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v2, 0xff, v32
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v0, v51
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v7, v5, v50
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v8, v6, v52
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v2, v2, v53
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v9, 0xff, v24
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(1)
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v11, 0xff, v68
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v5, v3, 16, v1
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff, v34
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v3, 0xff, v16
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v6, v7, 16, v0
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v18
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v7, v8, 16, v2
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v1, v54
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v2, v3, v17
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v3, 0xff, v20
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v8, 0xff, v22
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v0, v55
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v12, 0xff, v67
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v3, v3, v19
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v10, v8, v23
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v8, v2, 16, v1
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v9, v21
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v2, 0xff, v26
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v10, 0xffff, v10
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v9, v3, 16, v0
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v3, 0xff, v30
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v13, 0xff, v80
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v14, 0xff, v82
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v28
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v2, v2, v27
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v10, v1, 16, v10
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff, v65
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v3, v3, v66
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v11, v11, v70
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v15, v12, v71
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v12, 0xff, v69
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v13, v13, v83
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v86, 0xff, v64
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v14, v14, v84
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v0, v25
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v1, v29
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v3, 0xffff, v3
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v87, 0xffff, v11
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v96, v12, v81
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v97, 0xffff, v13
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v86, v86, v85
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v98, 0xffff, v14
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v11, v0, 16, v2
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v12, v1, 16, v3
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v13, v15, 16, v87
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v14, v96, 16, v97
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v15, v86, 16, v98
-; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, s6
-; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v2, s7 :: v_dual_mov_b32 v3, s8
-; GFX11-TRUE16-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s4
-; GFX11-TRUE16-NEXT:    s_cbranch_vccnz .LBB107_3
-; GFX11-TRUE16-NEXT:  .LBB107_2: ; %cmp.true
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(1)
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v4, 3, v68
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v5, 3, v67
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v6, 3, v30
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v7, 3, v65
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v8, 3, v22
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v5, 0xff, v5
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v6, 0xff, v6
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v9, 3, v16
-; GFX11-TRUE16-NEXT:    s_add_i32 s28, s28, 3
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v4, v70, v4
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v5, v71, v5
-; GFX11-TRUE16-NEXT:    s_and_b32 s4, s28, 0xff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s5, s29, 8
-; GFX11-TRUE16-NEXT:    s_add_i32 s24, s24, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v11, 0x300, v4
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v13, 0x300, v5
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v4, v66, v6
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v5, 3, v26
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v6, 0xff, v7
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v7, 3, v28
-; GFX11-TRUE16-NEXT:    s_or_b32 s4, s5, s4
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v12, 0x300, v4
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v4, 0xff, v5
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v5, v29, v6
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v6, 0xff, v7
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v7, 0xff, v8
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v8, 3, v24
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v4, v27, v4
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v14, 0x300, v5
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v5, v25, v6
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v6, v23, v7
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v7, 0xff, v8
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v15, 0x300, v4
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v4, 3, v18
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v18, 0x300, v5
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v8, 3, v34
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v5, v21, v7
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v7, 3, v20
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; GFX11-TRUE16-NEXT:    s_and_b32 s5, s24, 0xff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s6, s25, 8
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v10, 0x300, v5
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v5, 0xff, v7
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v4, v55, v4
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v7, 0xff, v8
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v8, 0xff, v9
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v9, 3, v32
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v5, v19, v5
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v16, 0x300, v4
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v4, v54, v7
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v7, v17, v8
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v8, 0xff, v9
-; GFX11-TRUE16-NEXT:    s_add_i32 s26, s26, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v9, 0x300, v5
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v5, 3, v36
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v17, 0x300, v4
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v19, 0x300, v7
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v4, v53, v8
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v7, 3, v33
-; GFX11-TRUE16-NEXT:    s_or_b32 s5, s6, s5
-; GFX11-TRUE16-NEXT:    s_and_b32 s6, s26, 0xff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s7, s27, 8
-; GFX11-TRUE16-NEXT:    s_add_i32 s20, s20, 3
-; GFX11-TRUE16-NEXT:    s_or_b32 s6, s7, s6
-; GFX11-TRUE16-NEXT:    s_and_b32 s7, s20, 0xff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s8, s21, 8
-; GFX11-TRUE16-NEXT:    s_add_i32 s22, s22, 3
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v5, 0xff, v5
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v8, 3, v31
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v20, 3, v37
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v21, 0x300, v4
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v4, 0xff, v7
-; GFX11-TRUE16-NEXT:    s_or_b32 s7, s8, s7
-; GFX11-TRUE16-NEXT:    s_and_b32 s8, s22, 0xff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s9, s23, 8
-; GFX11-TRUE16-NEXT:    s_add_i32 s16, s16, 3
-; GFX11-TRUE16-NEXT:    s_or_b32 s8, s9, s8
-; GFX11-TRUE16-NEXT:    s_and_b32 s9, s16, 0xff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s10, s17, 8
-; GFX11-TRUE16-NEXT:    s_add_i32 s18, s18, 3
-; GFX11-TRUE16-NEXT:    s_add_i32 s0, s0, 3
-; GFX11-TRUE16-NEXT:    s_add_i32 s2, s2, 3
-; GFX11-TRUE16-NEXT:    s_or_b32 s9, s10, s9
-; GFX11-TRUE16-NEXT:    s_and_b32 s10, s18, 0xff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s11, s19, 8
-; GFX11-TRUE16-NEXT:    s_and_b32 s0, s0, 0xff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s1, s1, 8
-; GFX11-TRUE16-NEXT:    s_and_b32 s2, s2, 0xff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s3, s3, 8
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v5, v52, v5
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v7, 0xff, v8
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v8, 0xff, v20
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v20, 3, v38
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v4, v51, v4
-; GFX11-TRUE16-NEXT:    s_or_b32 s10, s11, s10
-; GFX11-TRUE16-NEXT:    s_or_b32 s0, s1, s0
-; GFX11-TRUE16-NEXT:    s_or_b32 s1, s3, s2
-; GFX11-TRUE16-NEXT:    s_addk_i32 s5, 0x300
-; GFX11-TRUE16-NEXT:    s_addk_i32 s6, 0x300
-; GFX11-TRUE16-NEXT:    s_addk_i32 s9, 0x300
-; GFX11-TRUE16-NEXT:    s_addk_i32 s0, 0x300
-; GFX11-TRUE16-NEXT:    s_addk_i32 s1, 0x300
-; GFX11-TRUE16-NEXT:    s_addk_i32 s10, 0x300
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v22, 0x300, v5
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v5, v50, v7
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v7, v49, v8
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v8, 0xff, v20
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v20, 0x300, v4
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v4, 3, v35
-; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s0, s0, s1
-; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s1, s9, s10
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v0, 3, v82
-; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s3, s5, s6
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v2, 3, v80
-; GFX11-TRUE16-NEXT:    s_addk_i32 s7, 0x300
-; GFX11-TRUE16-NEXT:    s_addk_i32 s8, 0x300
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v1, 3, v64
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s2, s7, s8
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v3, 3, v69
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v84, v0
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v2, v83, v2
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v23, 0x300, v5
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v5, 0x300, v7
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v7, v48, v8
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v4, v39, v4
-; GFX11-TRUE16-NEXT:    s_addk_i32 s4, 0x300
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v85, v1
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v0, 0x300, v0
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v3, v81, v3
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v2, 0x300, v2
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v6, 0x300, v6
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v7, 0x300, v7
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v4, 0x300, v4
-; GFX11-TRUE16-NEXT:    v_and_b32_e64 v8, 0xffff, s4
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v5, 0xffff, v5
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v16, 0xffff, v16
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v1, 0x300, v1
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v3, 0x300, v3
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v4, v4, 16, v8
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v5, v7, 16, v5
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v7, 0xffff, v20
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v8, 0xffff, v21
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v17, 0xffff, v17
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v20, 0xffff, v6
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v9, v9, 16, v16
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v15, 0xffff, v15
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v12, 0xffff, v12
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v16, 0xffff, v11
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v6, v23, 16, v7
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v7, v22, 16, v8
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v8, v19, 16, v17
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v10, v10, 16, v20
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v11, v18, 16, v15
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v12, v14, 16, v12
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v13, v13, 16, v16
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v14, v3, 16, v2
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v15, v1, 16, v0
-; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX11-TRUE16-NEXT:  .LBB107_3: ; %end
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
-; GFX11-TRUE16-NEXT:  .LBB107_4:
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
-; GFX11-TRUE16-NEXT:    s_branch .LBB107_2
-;
-; GFX11-FAKE16-LABEL: bitcast_v64i8_to_v32f16_scalar:
-; GFX11-FAKE16:       ; %bb.0:
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v34, v14 :: v_dual_mov_b32 v31, v8
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v36, v12 :: v_dual_mov_b32 v33, v6
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v32, v10 :: v_dual_mov_b32 v35, v0
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v38, v4 :: v_dual_mov_b32 v37, v2
-; GFX11-FAKE16-NEXT:    s_clause 0xf
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v0, off, s32 offset:56
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v65, off, s32 offset:52
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v2, off, s32 offset:60
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v4, off, s32
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v6, off, s32 offset:8
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v8, off, s32 offset:16
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v10, off, s32 offset:24
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v12, off, s32 offset:32
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v14, off, s32 offset:40
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v84, off, s32 offset:48
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v82, off, s32 offset:44
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v69, off, s32 offset:36
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v80, off, s32 offset:28
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v67, off, s32 offset:20
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v68, off, s32 offset:12
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v64, off, s32 offset:4
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v39, 8, v1
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v49, 8, v3
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v48, 8, v5
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v51, 8, v7
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v50, 8, v9
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v53, 8, v11
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v52, 8, v13
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v54, 8, v15
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v17, 8, v17
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v55, 8, v19
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v19, 8, v21
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v23, 8, v23
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v21, 8, v25
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v27, 8, v27
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 8, v29
-; GFX11-FAKE16-NEXT:    s_mov_b32 s4, 0
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(15)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v85, 8, v0
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(13)
-; GFX11-FAKE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v2
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(12)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v66, 8, v4
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(11)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v29, 8, v6
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(10)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v70, 8, v8
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(9)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v71, 8, v10
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(8)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v83, 8, v12
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(7)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v81, 8, v14
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(6)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v84, 8, v84
-; GFX11-FAKE16-NEXT:    s_and_b32 s5, vcc_lo, exec_lo
-; GFX11-FAKE16-NEXT:    s_cbranch_scc0 .LBB107_4
-; GFX11-FAKE16-NEXT:  ; %bb.1: ; %cmp.false
-; GFX11-FAKE16-NEXT:    s_and_b32 s5, s0, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s6, s1, 8
-; GFX11-FAKE16-NEXT:    s_and_b32 s7, s2, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s8, s3, 8
-; GFX11-FAKE16-NEXT:    s_or_b32 s5, s5, s6
-; GFX11-FAKE16-NEXT:    s_or_b32 s6, s7, s8
-; GFX11-FAKE16-NEXT:    s_and_b32 s7, s16, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s8, s17, 8
-; GFX11-FAKE16-NEXT:    s_and_b32 s9, s18, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s10, s19, 8
-; GFX11-FAKE16-NEXT:    s_or_b32 s7, s7, s8
-; GFX11-FAKE16-NEXT:    s_or_b32 s8, s9, s10
-; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s5, s5, s6
-; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s6, s7, s8
-; GFX11-FAKE16-NEXT:    s_and_b32 s7, s20, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s8, s21, 8
-; GFX11-FAKE16-NEXT:    s_and_b32 s9, s22, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s10, s23, 8
-; GFX11-FAKE16-NEXT:    s_or_b32 s7, s7, s8
-; GFX11-FAKE16-NEXT:    s_or_b32 s8, s9, s10
-; GFX11-FAKE16-NEXT:    s_and_b32 s9, s24, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s10, s25, 8
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v35
-; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s7, s7, s8
-; GFX11-FAKE16-NEXT:    s_or_b32 s8, s9, s10
-; GFX11-FAKE16-NEXT:    s_and_b32 s10, s28, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s11, s29, 8
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v39
-; GFX11-FAKE16-NEXT:    s_or_b32 s10, s10, s11
-; GFX11-FAKE16-NEXT:    s_and_b32 s9, s26, 0xff
-; GFX11-FAKE16-NEXT:    v_and_b32_e64 v1, 0xffff, s10
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s12, s27, 8
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v38
-; GFX11-FAKE16-NEXT:    s_or_b32 s9, s9, s12
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v5, 0xff, v31
-; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s8, s8, s9
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v37
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v4, v0, 16, v1
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v33
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v3, v48
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xff, v36
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v2, v49
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v32
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v51
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v7, v5, v50
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v8, v6, v52
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v2, v53
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v9, 0xff, v24
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(1)
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v11, 0xff, v68
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v5, v3, 16, v1
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v34
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v16
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v6, v7, 16, v0
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v18
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v7, v8, 16, v2
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v54
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v3, v17
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v20
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 0xff, v22
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v55
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v12, 0xff, v67
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v3, v19
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v10, v8, v23
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v8, v2, 16, v1
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v9, v21
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v26
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v10, 0xffff, v10
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v9, v3, 16, v0
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v30
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v13, 0xff, v80
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v14, 0xff, v82
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v28
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v2, v27
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v10, v1, 16, v10
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v64
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v3, v66
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v11, v11, v70
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v15, v12, v71
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v12, 0xff, v69
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v13, v13, v83
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v86, 0xff, v65
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v14, v14, v84
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v25
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v29
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xffff, v3
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v87, 0xffff, v11
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v96, v12, v81
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v97, 0xffff, v13
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v86, v86, v85
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v98, 0xffff, v14
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v11, v0, 16, v2
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v12, v1, 16, v3
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v13, v15, 16, v87
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v14, v96, 16, v97
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v15, v86, 16, v98
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, s6
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v2, s7 :: v_dual_mov_b32 v3, s8
-; GFX11-FAKE16-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s4
-; GFX11-FAKE16-NEXT:    s_cbranch_vccnz .LBB107_3
-; GFX11-FAKE16-NEXT:  .LBB107_2: ; %cmp.true
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(1)
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v4, 3, v68
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v5, 3, v67
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v6, 3, v30
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v7, 3, v64
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v8, 3, v22
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v5, 0xff, v5
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xff, v6
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v9, 3, v16
-; GFX11-FAKE16-NEXT:    s_add_i32 s28, s28, 3
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, v70, v4
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v71, v5
-; GFX11-FAKE16-NEXT:    s_and_b32 s4, s28, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s5, s29, 8
-; GFX11-FAKE16-NEXT:    s_add_i32 s24, s24, 3
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v11, 0x300, v4
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v13, 0x300, v5
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, v66, v6
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v5, 3, v26
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xff, v7
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v7, 3, v28
-; GFX11-FAKE16-NEXT:    s_or_b32 s4, s5, s4
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v12, 0x300, v4
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xff, v5
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v29, v6
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xff, v7
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xff, v8
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v8, 3, v24
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, v27, v4
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v14, 0x300, v5
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v25, v6
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v6, v23, v7
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xff, v8
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v15, 0x300, v4
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v4, 3, v18
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v18, 0x300, v5
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v8, 3, v34
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v21, v7
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v7, 3, v20
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; GFX11-FAKE16-NEXT:    s_and_b32 s5, s24, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s6, s25, 8
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v10, 0x300, v5
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v5, 0xff, v7
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, v55, v4
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xff, v8
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 0xff, v9
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v9, 3, v32
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v19, v5
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v16, 0x300, v4
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, v54, v7
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v7, v17, v8
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 0xff, v9
-; GFX11-FAKE16-NEXT:    s_add_i32 s26, s26, 3
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v9, 0x300, v5
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v5, 3, v36
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v17, 0x300, v4
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v19, 0x300, v7
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, v53, v8
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v7, 3, v33
-; GFX11-FAKE16-NEXT:    s_or_b32 s5, s6, s5
-; GFX11-FAKE16-NEXT:    s_and_b32 s6, s26, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s7, s27, 8
-; GFX11-FAKE16-NEXT:    s_add_i32 s20, s20, 3
-; GFX11-FAKE16-NEXT:    s_or_b32 s6, s7, s6
-; GFX11-FAKE16-NEXT:    s_and_b32 s7, s20, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s8, s21, 8
-; GFX11-FAKE16-NEXT:    s_add_i32 s22, s22, 3
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v5, 0xff, v5
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v8, 3, v31
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v20, 3, v37
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v21, 0x300, v4
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xff, v7
-; GFX11-FAKE16-NEXT:    s_or_b32 s7, s8, s7
-; GFX11-FAKE16-NEXT:    s_and_b32 s8, s22, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s9, s23, 8
-; GFX11-FAKE16-NEXT:    s_add_i32 s16, s16, 3
-; GFX11-FAKE16-NEXT:    s_or_b32 s8, s9, s8
-; GFX11-FAKE16-NEXT:    s_and_b32 s9, s16, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s10, s17, 8
-; GFX11-FAKE16-NEXT:    s_add_i32 s18, s18, 3
-; GFX11-FAKE16-NEXT:    s_add_i32 s0, s0, 3
-; GFX11-FAKE16-NEXT:    s_add_i32 s2, s2, 3
-; GFX11-FAKE16-NEXT:    s_or_b32 s9, s10, s9
-; GFX11-FAKE16-NEXT:    s_and_b32 s10, s18, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s11, s19, 8
-; GFX11-FAKE16-NEXT:    s_and_b32 s0, s0, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s1, s1, 8
-; GFX11-FAKE16-NEXT:    s_and_b32 s2, s2, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s3, s3, 8
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v52, v5
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xff, v8
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 0xff, v20
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v20, 3, v38
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, v51, v4
-; GFX11-FAKE16-NEXT:    s_or_b32 s10, s11, s10
-; GFX11-FAKE16-NEXT:    s_or_b32 s0, s1, s0
-; GFX11-FAKE16-NEXT:    s_or_b32 s1, s3, s2
-; GFX11-FAKE16-NEXT:    s_addk_i32 s5, 0x300
-; GFX11-FAKE16-NEXT:    s_addk_i32 s6, 0x300
-; GFX11-FAKE16-NEXT:    s_addk_i32 s9, 0x300
-; GFX11-FAKE16-NEXT:    s_addk_i32 s0, 0x300
-; GFX11-FAKE16-NEXT:    s_addk_i32 s1, 0x300
-; GFX11-FAKE16-NEXT:    s_addk_i32 s10, 0x300
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v22, 0x300, v5
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v50, v7
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v7, v49, v8
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 0xff, v20
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v20, 0x300, v4
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v4, 3, v35
-; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s0, s0, s1
-; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s1, s9, s10
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v0, 3, v82
-; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s3, s5, s6
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v2, 3, v80
-; GFX11-FAKE16-NEXT:    s_addk_i32 s7, 0x300
-; GFX11-FAKE16-NEXT:    s_addk_i32 s8, 0x300
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v1, 3, v65
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s2, s7, s8
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v3, 3, v69
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v84, v0
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v83, v2
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v23, 0x300, v5
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v5, 0x300, v7
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v7, v48, v8
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, v39, v4
-; GFX11-FAKE16-NEXT:    s_addk_i32 s4, 0x300
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v85, v1
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v0, 0x300, v0
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v81, v3
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v2, 0x300, v2
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v6, 0x300, v6
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v7, 0x300, v7
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v4, 0x300, v4
-; GFX11-FAKE16-NEXT:    v_and_b32_e64 v8, 0xffff, s4
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v5, 0xffff, v5
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v16, 0xffff, v16
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v1, 0x300, v1
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v3, 0x300, v3
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v4, v4, 16, v8
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v5, v7, 16, v5
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xffff, v20
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 0xffff, v21
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v17, 0xffff, v17
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v20, 0xffff, v6
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v9, v9, 16, v16
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v15, 0xffff, v15
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v12, 0xffff, v12
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v16, 0xffff, v11
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v6, v23, 16, v7
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v7, v22, 16, v8
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v8, v19, 16, v17
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v10, v10, 16, v20
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v11, v18, 16, v15
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v12, v14, 16, v12
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v13, v13, 16, v16
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v14, v3, 16, v2
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v15, v1, 16, v0
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX11-FAKE16-NEXT:  .LBB107_3: ; %end
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
-; GFX11-FAKE16-NEXT:  .LBB107_4:
-; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
-; GFX11-FAKE16-NEXT:    s_branch .LBB107_2
+; GFX11-LABEL: bitcast_v64i8_to_v32f16_scalar:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_dual_mov_b32 v34, v14 :: v_dual_mov_b32 v31, v8
+; GFX11-NEXT:    v_dual_mov_b32 v36, v12 :: v_dual_mov_b32 v33, v6
+; GFX11-NEXT:    v_dual_mov_b32 v32, v10 :: v_dual_mov_b32 v35, v0
+; GFX11-NEXT:    v_dual_mov_b32 v38, v4 :: v_dual_mov_b32 v37, v2
+; GFX11-NEXT:    s_clause 0xf
+; GFX11-NEXT:    scratch_load_u16 v0, off, s32 offset:56
+; GFX11-NEXT:    scratch_load_u16 v65, off, s32 offset:52
+; GFX11-NEXT:    scratch_load_b32 v2, off, s32 offset:60
+; GFX11-NEXT:    scratch_load_u16 v4, off, s32
+; GFX11-NEXT:    scratch_load_u16 v6, off, s32 offset:8
+; GFX11-NEXT:    scratch_load_u16 v8, off, s32 offset:16
+; GFX11-NEXT:    scratch_load_u16 v10, off, s32 offset:24
+; GFX11-NEXT:    scratch_load_u16 v12, off, s32 offset:32
+; GFX11-NEXT:    scratch_load_u16 v14, off, s32 offset:40
+; GFX11-NEXT:    scratch_load_u16 v84, off, s32 offset:48
+; GFX11-NEXT:    scratch_load_u16 v82, off, s32 offset:44
+; GFX11-NEXT:    scratch_load_u16 v69, off, s32 offset:36
+; GFX11-NEXT:    scratch_load_u16 v80, off, s32 offset:28
+; GFX11-NEXT:    scratch_load_u16 v67, off, s32 offset:20
+; GFX11-NEXT:    scratch_load_u16 v68, off, s32 offset:12
+; GFX11-NEXT:    scratch_load_u16 v64, off, s32 offset:4
+; GFX11-NEXT:    v_lshlrev_b32_e32 v39, 8, v1
+; GFX11-NEXT:    v_lshlrev_b32_e32 v49, 8, v3
+; GFX11-NEXT:    v_lshlrev_b32_e32 v48, 8, v5
+; GFX11-NEXT:    v_lshlrev_b32_e32 v51, 8, v7
+; GFX11-NEXT:    v_lshlrev_b32_e32 v50, 8, v9
+; GFX11-NEXT:    v_lshlrev_b32_e32 v53, 8, v11
+; GFX11-NEXT:    v_lshlrev_b32_e32 v52, 8, v13
+; GFX11-NEXT:    v_lshlrev_b32_e32 v54, 8, v15
+; GFX11-NEXT:    v_lshlrev_b32_e32 v17, 8, v17
+; GFX11-NEXT:    v_lshlrev_b32_e32 v55, 8, v19
+; GFX11-NEXT:    v_lshlrev_b32_e32 v19, 8, v21
+; GFX11-NEXT:    v_lshlrev_b32_e32 v23, 8, v23
+; GFX11-NEXT:    v_lshlrev_b32_e32 v21, 8, v25
+; GFX11-NEXT:    v_lshlrev_b32_e32 v27, 8, v27
+; GFX11-NEXT:    v_lshlrev_b32_e32 v25, 8, v29
+; GFX11-NEXT:    s_mov_b32 s4, 0
+; GFX11-NEXT:    s_waitcnt vmcnt(15)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v85, 8, v0
+; GFX11-NEXT:    s_waitcnt vmcnt(13)
+; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v2
+; GFX11-NEXT:    s_waitcnt vmcnt(12)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v66, 8, v4
+; GFX11-NEXT:    s_waitcnt vmcnt(11)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v29, 8, v6
+; GFX11-NEXT:    s_waitcnt vmcnt(10)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v70, 8, v8
+; GFX11-NEXT:    s_waitcnt vmcnt(9)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v71, 8, v10
+; GFX11-NEXT:    s_waitcnt vmcnt(8)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v83, 8, v12
+; GFX11-NEXT:    s_waitcnt vmcnt(7)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v81, 8, v14
+; GFX11-NEXT:    s_waitcnt vmcnt(6)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v84, 8, v84
+; GFX11-NEXT:    s_and_b32 s5, vcc_lo, exec_lo
+; GFX11-NEXT:    s_cbranch_scc0 .LBB107_4
+; GFX11-NEXT:  ; %bb.1: ; %cmp.false
+; GFX11-NEXT:    s_and_b32 s5, s0, 0xff
+; GFX11-NEXT:    s_lshl_b32 s6, s1, 8
+; GFX11-NEXT:    s_and_b32 s7, s2, 0xff
+; GFX11-NEXT:    s_lshl_b32 s8, s3, 8
+; GFX11-NEXT:    s_or_b32 s5, s5, s6
+; GFX11-NEXT:    s_or_b32 s6, s7, s8
+; GFX11-NEXT:    s_and_b32 s7, s16, 0xff
+; GFX11-NEXT:    s_lshl_b32 s8, s17, 8
+; GFX11-NEXT:    s_and_b32 s9, s18, 0xff
+; GFX11-NEXT:    s_lshl_b32 s10, s19, 8
+; GFX11-NEXT:    s_or_b32 s7, s7, s8
+; GFX11-NEXT:    s_or_b32 s8, s9, s10
+; GFX11-NEXT:    s_pack_ll_b32_b16 s5, s5, s6
+; GFX11-NEXT:    s_pack_ll_b32_b16 s6, s7, s8
+; GFX11-NEXT:    s_and_b32 s7, s20, 0xff
+; GFX11-NEXT:    s_lshl_b32 s8, s21, 8
+; GFX11-NEXT:    s_and_b32 s9, s22, 0xff
+; GFX11-NEXT:    s_lshl_b32 s10, s23, 8
+; GFX11-NEXT:    s_or_b32 s7, s7, s8
+; GFX11-NEXT:    s_or_b32 s8, s9, s10
+; GFX11-NEXT:    s_and_b32 s9, s24, 0xff
+; GFX11-NEXT:    s_lshl_b32 s10, s25, 8
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v35
+; GFX11-NEXT:    s_pack_ll_b32_b16 s7, s7, s8
+; GFX11-NEXT:    s_or_b32 s8, s9, s10
+; GFX11-NEXT:    s_and_b32 s10, s28, 0xff
+; GFX11-NEXT:    s_lshl_b32 s11, s29, 8
+; GFX11-NEXT:    v_or_b32_e32 v0, v0, v39
+; GFX11-NEXT:    s_or_b32 s10, s10, s11
+; GFX11-NEXT:    s_and_b32 s9, s26, 0xff
+; GFX11-NEXT:    v_and_b32_e64 v1, 0xffff, s10
+; GFX11-NEXT:    s_lshl_b32 s12, s27, 8
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v38
+; GFX11-NEXT:    s_or_b32 s9, s9, s12
+; GFX11-NEXT:    v_and_b32_e32 v5, 0xff, v31
+; GFX11-NEXT:    s_pack_ll_b32_b16 s8, s8, s9
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v37
+; GFX11-NEXT:    v_lshl_or_b32 v4, v0, 16, v1
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v33
+; GFX11-NEXT:    v_or_b32_e32 v3, v3, v48
+; GFX11-NEXT:    v_and_b32_e32 v6, 0xff, v36
+; GFX11-NEXT:    v_or_b32_e32 v1, v2, v49
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v32
+; GFX11-NEXT:    v_or_b32_e32 v0, v0, v51
+; GFX11-NEXT:    v_or_b32_e32 v7, v5, v50
+; GFX11-NEXT:    v_or_b32_e32 v8, v6, v52
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX11-NEXT:    v_or_b32_e32 v2, v2, v53
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    v_and_b32_e32 v9, 0xff, v24
+; GFX11-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-NEXT:    v_and_b32_e32 v11, 0xff, v68
+; GFX11-NEXT:    v_lshl_or_b32 v5, v3, 16, v1
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v34
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v16
+; GFX11-NEXT:    v_lshl_or_b32 v6, v7, 16, v0
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v18
+; GFX11-NEXT:    v_lshl_or_b32 v7, v8, 16, v2
+; GFX11-NEXT:    v_or_b32_e32 v1, v1, v54
+; GFX11-NEXT:    v_or_b32_e32 v2, v3, v17
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v20
+; GFX11-NEXT:    v_and_b32_e32 v8, 0xff, v22
+; GFX11-NEXT:    v_or_b32_e32 v0, v0, v55
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX11-NEXT:    v_and_b32_e32 v12, 0xff, v67
+; GFX11-NEXT:    v_or_b32_e32 v3, v3, v19
+; GFX11-NEXT:    v_or_b32_e32 v10, v8, v23
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    v_lshl_or_b32 v8, v2, 16, v1
+; GFX11-NEXT:    v_or_b32_e32 v1, v9, v21
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v26
+; GFX11-NEXT:    v_and_b32_e32 v10, 0xffff, v10
+; GFX11-NEXT:    v_lshl_or_b32 v9, v3, 16, v0
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v30
+; GFX11-NEXT:    v_and_b32_e32 v13, 0xff, v80
+; GFX11-NEXT:    v_and_b32_e32 v14, 0xff, v82
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v28
+; GFX11-NEXT:    v_or_b32_e32 v2, v2, v27
+; GFX11-NEXT:    v_lshl_or_b32 v10, v1, 16, v10
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v64
+; GFX11-NEXT:    v_or_b32_e32 v3, v3, v66
+; GFX11-NEXT:    v_or_b32_e32 v11, v11, v70
+; GFX11-NEXT:    v_or_b32_e32 v15, v12, v71
+; GFX11-NEXT:    v_and_b32_e32 v12, 0xff, v69
+; GFX11-NEXT:    v_or_b32_e32 v13, v13, v83
+; GFX11-NEXT:    v_and_b32_e32 v86, 0xff, v65
+; GFX11-NEXT:    v_or_b32_e32 v14, v14, v84
+; GFX11-NEXT:    v_or_b32_e32 v0, v0, v25
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX11-NEXT:    v_or_b32_e32 v1, v1, v29
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff, v3
+; GFX11-NEXT:    v_and_b32_e32 v87, 0xffff, v11
+; GFX11-NEXT:    v_or_b32_e32 v96, v12, v81
+; GFX11-NEXT:    v_and_b32_e32 v97, 0xffff, v13
+; GFX11-NEXT:    v_or_b32_e32 v86, v86, v85
+; GFX11-NEXT:    v_and_b32_e32 v98, 0xffff, v14
+; GFX11-NEXT:    v_lshl_or_b32 v11, v0, 16, v2
+; GFX11-NEXT:    v_lshl_or_b32 v12, v1, 16, v3
+; GFX11-NEXT:    v_lshl_or_b32 v13, v15, 16, v87
+; GFX11-NEXT:    v_lshl_or_b32 v14, v96, 16, v97
+; GFX11-NEXT:    v_lshl_or_b32 v15, v86, 16, v98
+; GFX11-NEXT:    v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, s6
+; GFX11-NEXT:    v_dual_mov_b32 v2, s7 :: v_dual_mov_b32 v3, s8
+; GFX11-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s4
+; GFX11-NEXT:    s_cbranch_vccnz .LBB107_3
+; GFX11-NEXT:  .LBB107_2: ; %cmp.true
+; GFX11-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-NEXT:    v_add_nc_u32_e32 v4, 3, v68
+; GFX11-NEXT:    v_add_nc_u32_e32 v5, 3, v67
+; GFX11-NEXT:    v_add_nc_u32_e32 v6, 3, v30
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_add_nc_u32_e32 v7, 3, v64
+; GFX11-NEXT:    v_add_nc_u32_e32 v8, 3, v22
+; GFX11-NEXT:    v_and_b32_e32 v4, 0xff, v4
+; GFX11-NEXT:    v_and_b32_e32 v5, 0xff, v5
+; GFX11-NEXT:    v_and_b32_e32 v6, 0xff, v6
+; GFX11-NEXT:    v_add_nc_u32_e32 v9, 3, v16
+; GFX11-NEXT:    s_add_i32 s28, s28, 3
+; GFX11-NEXT:    v_or_b32_e32 v4, v70, v4
+; GFX11-NEXT:    v_or_b32_e32 v5, v71, v5
+; GFX11-NEXT:    s_and_b32 s4, s28, 0xff
+; GFX11-NEXT:    s_lshl_b32 s5, s29, 8
+; GFX11-NEXT:    s_add_i32 s24, s24, 3
+; GFX11-NEXT:    v_add_nc_u32_e32 v11, 0x300, v4
+; GFX11-NEXT:    v_add_nc_u32_e32 v13, 0x300, v5
+; GFX11-NEXT:    v_or_b32_e32 v4, v66, v6
+; GFX11-NEXT:    v_add_nc_u32_e32 v5, 3, v26
+; GFX11-NEXT:    v_and_b32_e32 v6, 0xff, v7
+; GFX11-NEXT:    v_add_nc_u32_e32 v7, 3, v28
+; GFX11-NEXT:    s_or_b32 s4, s5, s4
+; GFX11-NEXT:    v_add_nc_u32_e32 v12, 0x300, v4
+; GFX11-NEXT:    v_and_b32_e32 v4, 0xff, v5
+; GFX11-NEXT:    v_or_b32_e32 v5, v29, v6
+; GFX11-NEXT:    v_and_b32_e32 v6, 0xff, v7
+; GFX11-NEXT:    v_and_b32_e32 v7, 0xff, v8
+; GFX11-NEXT:    v_add_nc_u32_e32 v8, 3, v24
+; GFX11-NEXT:    v_or_b32_e32 v4, v27, v4
+; GFX11-NEXT:    v_add_nc_u32_e32 v14, 0x300, v5
+; GFX11-NEXT:    v_or_b32_e32 v5, v25, v6
+; GFX11-NEXT:    v_or_b32_e32 v6, v23, v7
+; GFX11-NEXT:    v_and_b32_e32 v7, 0xff, v8
+; GFX11-NEXT:    v_add_nc_u32_e32 v15, 0x300, v4
+; GFX11-NEXT:    v_add_nc_u32_e32 v4, 3, v18
+; GFX11-NEXT:    v_add_nc_u32_e32 v18, 0x300, v5
+; GFX11-NEXT:    v_add_nc_u32_e32 v8, 3, v34
+; GFX11-NEXT:    v_or_b32_e32 v5, v21, v7
+; GFX11-NEXT:    v_add_nc_u32_e32 v7, 3, v20
+; GFX11-NEXT:    v_and_b32_e32 v4, 0xff, v4
+; GFX11-NEXT:    s_and_b32 s5, s24, 0xff
+; GFX11-NEXT:    s_lshl_b32 s6, s25, 8
+; GFX11-NEXT:    v_add_nc_u32_e32 v10, 0x300, v5
+; GFX11-NEXT:    v_and_b32_e32 v5, 0xff, v7
+; GFX11-NEXT:    v_or_b32_e32 v4, v55, v4
+; GFX11-NEXT:    v_and_b32_e32 v7, 0xff, v8
+; GFX11-NEXT:    v_and_b32_e32 v8, 0xff, v9
+; GFX11-NEXT:    v_add_nc_u32_e32 v9, 3, v32
+; GFX11-NEXT:    v_or_b32_e32 v5, v19, v5
+; GFX11-NEXT:    v_add_nc_u32_e32 v16, 0x300, v4
+; GFX11-NEXT:    v_or_b32_e32 v4, v54, v7
+; GFX11-NEXT:    v_or_b32_e32 v7, v17, v8
+; GFX11-NEXT:    v_and_b32_e32 v8, 0xff, v9
+; GFX11-NEXT:    s_add_i32 s26, s26, 3
+; GFX11-NEXT:    v_add_nc_u32_e32 v9, 0x300, v5
+; GFX11-NEXT:    v_add_nc_u32_e32 v5, 3, v36
+; GFX11-NEXT:    v_add_nc_u32_e32 v17, 0x300, v4
+; GFX11-NEXT:    v_add_nc_u32_e32 v19, 0x300, v7
+; GFX11-NEXT:    v_or_b32_e32 v4, v53, v8
+; GFX11-NEXT:    v_add_nc_u32_e32 v7, 3, v33
+; GFX11-NEXT:    s_or_b32 s5, s6, s5
+; GFX11-NEXT:    s_and_b32 s6, s26, 0xff
+; GFX11-NEXT:    s_lshl_b32 s7, s27, 8
+; GFX11-NEXT:    s_add_i32 s20, s20, 3
+; GFX11-NEXT:    s_or_b32 s6, s7, s6
+; GFX11-NEXT:    s_and_b32 s7, s20, 0xff
+; GFX11-NEXT:    s_lshl_b32 s8, s21, 8
+; GFX11-NEXT:    s_add_i32 s22, s22, 3
+; GFX11-NEXT:    v_and_b32_e32 v5, 0xff, v5
+; GFX11-NEXT:    v_add_nc_u32_e32 v8, 3, v31
+; GFX11-NEXT:    v_add_nc_u32_e32 v20, 3, v37
+; GFX11-NEXT:    v_add_nc_u32_e32 v21, 0x300, v4
+; GFX11-NEXT:    v_and_b32_e32 v4, 0xff, v7
+; GFX11-NEXT:    s_or_b32 s7, s8, s7
+; GFX11-NEXT:    s_and_b32 s8, s22, 0xff
+; GFX11-NEXT:    s_lshl_b32 s9, s23, 8
+; GFX11-NEXT:    s_add_i32 s16, s16, 3
+; GFX11-NEXT:    s_or_b32 s8, s9, s8
+; GFX11-NEXT:    s_and_b32 s9, s16, 0xff
+; GFX11-NEXT:    s_lshl_b32 s10, s17, 8
+; GFX11-NEXT:    s_add_i32 s18, s18, 3
+; GFX11-NEXT:    s_add_i32 s0, s0, 3
+; GFX11-NEXT:    s_add_i32 s2, s2, 3
+; GFX11-NEXT:    s_or_b32 s9, s10, s9
+; GFX11-NEXT:    s_and_b32 s10, s18, 0xff
+; GFX11-NEXT:    s_lshl_b32 s11, s19, 8
+; GFX11-NEXT:    s_and_b32 s0, s0, 0xff
+; GFX11-NEXT:    s_lshl_b32 s1, s1, 8
+; GFX11-NEXT:    s_and_b32 s2, s2, 0xff
+; GFX11-NEXT:    s_lshl_b32 s3, s3, 8
+; GFX11-NEXT:    v_or_b32_e32 v5, v52, v5
+; GFX11-NEXT:    v_and_b32_e32 v7, 0xff, v8
+; GFX11-NEXT:    v_and_b32_e32 v8, 0xff, v20
+; GFX11-NEXT:    v_add_nc_u32_e32 v20, 3, v38
+; GFX11-NEXT:    v_or_b32_e32 v4, v51, v4
+; GFX11-NEXT:    s_or_b32 s10, s11, s10
+; GFX11-NEXT:    s_or_b32 s0, s1, s0
+; GFX11-NEXT:    s_or_b32 s1, s3, s2
+; GFX11-NEXT:    s_addk_i32 s5, 0x300
+; GFX11-NEXT:    s_addk_i32 s6, 0x300
+; GFX11-NEXT:    s_addk_i32 s9, 0x300
+; GFX11-NEXT:    s_addk_i32 s0, 0x300
+; GFX11-NEXT:    s_addk_i32 s1, 0x300
+; GFX11-NEXT:    s_addk_i32 s10, 0x300
+; GFX11-NEXT:    v_add_nc_u32_e32 v22, 0x300, v5
+; GFX11-NEXT:    v_or_b32_e32 v5, v50, v7
+; GFX11-NEXT:    v_or_b32_e32 v7, v49, v8
+; GFX11-NEXT:    v_and_b32_e32 v8, 0xff, v20
+; GFX11-NEXT:    v_add_nc_u32_e32 v20, 0x300, v4
+; GFX11-NEXT:    v_add_nc_u32_e32 v4, 3, v35
+; GFX11-NEXT:    s_pack_ll_b32_b16 s0, s0, s1
+; GFX11-NEXT:    s_pack_ll_b32_b16 s1, s9, s10
+; GFX11-NEXT:    v_add_nc_u32_e32 v0, 3, v82
+; GFX11-NEXT:    s_pack_ll_b32_b16 s3, s5, s6
+; GFX11-NEXT:    v_add_nc_u32_e32 v2, 3, v80
+; GFX11-NEXT:    s_addk_i32 s7, 0x300
+; GFX11-NEXT:    s_addk_i32 s8, 0x300
+; GFX11-NEXT:    v_add_nc_u32_e32 v1, 3, v65
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GFX11-NEXT:    v_and_b32_e32 v4, 0xff, v4
+; GFX11-NEXT:    s_pack_ll_b32_b16 s2, s7, s8
+; GFX11-NEXT:    v_add_nc_u32_e32 v3, 3, v69
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX11-NEXT:    v_or_b32_e32 v0, v84, v0
+; GFX11-NEXT:    v_or_b32_e32 v2, v83, v2
+; GFX11-NEXT:    v_add_nc_u32_e32 v23, 0x300, v5
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v3
+; GFX11-NEXT:    v_add_nc_u32_e32 v5, 0x300, v7
+; GFX11-NEXT:    v_or_b32_e32 v7, v48, v8
+; GFX11-NEXT:    v_or_b32_e32 v4, v39, v4
+; GFX11-NEXT:    s_addk_i32 s4, 0x300
+; GFX11-NEXT:    v_or_b32_e32 v1, v85, v1
+; GFX11-NEXT:    v_add_nc_u32_e32 v0, 0x300, v0
+; GFX11-NEXT:    v_or_b32_e32 v3, v81, v3
+; GFX11-NEXT:    v_add_nc_u32_e32 v2, 0x300, v2
+; GFX11-NEXT:    v_add_nc_u32_e32 v6, 0x300, v6
+; GFX11-NEXT:    v_add_nc_u32_e32 v7, 0x300, v7
+; GFX11-NEXT:    v_add_nc_u32_e32 v4, 0x300, v4
+; GFX11-NEXT:    v_and_b32_e64 v8, 0xffff, s4
+; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff, v5
+; GFX11-NEXT:    v_and_b32_e32 v16, 0xffff, v16
+; GFX11-NEXT:    v_add_nc_u32_e32 v1, 0x300, v1
+; GFX11-NEXT:    v_add_nc_u32_e32 v3, 0x300, v3
+; GFX11-NEXT:    v_lshl_or_b32 v4, v4, 16, v8
+; GFX11-NEXT:    v_lshl_or_b32 v5, v7, 16, v5
+; GFX11-NEXT:    v_and_b32_e32 v7, 0xffff, v20
+; GFX11-NEXT:    v_and_b32_e32 v8, 0xffff, v21
+; GFX11-NEXT:    v_and_b32_e32 v17, 0xffff, v17
+; GFX11-NEXT:    v_and_b32_e32 v20, 0xffff, v6
+; GFX11-NEXT:    v_lshl_or_b32 v9, v9, 16, v16
+; GFX11-NEXT:    v_and_b32_e32 v15, 0xffff, v15
+; GFX11-NEXT:    v_and_b32_e32 v12, 0xffff, v12
+; GFX11-NEXT:    v_and_b32_e32 v16, 0xffff, v11
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    v_lshl_or_b32 v6, v23, 16, v7
+; GFX11-NEXT:    v_lshl_or_b32 v7, v22, 16, v8
+; GFX11-NEXT:    v_lshl_or_b32 v8, v19, 16, v17
+; GFX11-NEXT:    v_lshl_or_b32 v10, v10, 16, v20
+; GFX11-NEXT:    v_lshl_or_b32 v11, v18, 16, v15
+; GFX11-NEXT:    v_lshl_or_b32 v12, v14, 16, v12
+; GFX11-NEXT:    v_lshl_or_b32 v13, v13, 16, v16
+; GFX11-NEXT:    v_lshl_or_b32 v14, v3, 16, v2
+; GFX11-NEXT:    v_lshl_or_b32 v15, v1, 16, v0
+; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-NEXT:  .LBB107_3: ; %end
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-NEXT:  .LBB107_4:
+; GFX11-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+; GFX11-NEXT:    s_branch .LBB107_2
   %cmp = icmp eq i32 %b, 0
   br i1 %cmp, label %cmp.true, label %cmp.false
 
@@ -91142,45 +88902,45 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) {
 ; GFX11-TRUE16:       ; %bb.0:
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-TRUE16-NEXT:    s_clause 0x1f
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v55, off, s32 offset:128
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v32, off, s32 offset:124
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v55, off, s32 offset:120
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v33, off, s32 offset:116
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v53, off, s32 offset:112
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v31, off, s32 offset:108
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v53, off, s32 offset:104
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v32, off, s32 offset:100
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v52, off, s32 offset:96
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v31, off, s32 offset:92
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v54, off, s32 offset:88
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v66, off, s32 offset:132
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v39, off, s32
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v48, off, s32 offset:8
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v48, off, s32 offset:16
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v50, off, s32 offset:24
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v50, off, s32 offset:32
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v51, off, s32 offset:40
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v52, off, s32 offset:48
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v54, off, s32 offset:56
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v64, off, s32 offset:64
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v64, off, s32 offset:72
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v65, off, s32 offset:80
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v38, off, s32 offset:84
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v36, off, s32 offset:76
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v38, off, s32 offset:68
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v35, off, s32 offset:60
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v37, off, s32 offset:52
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v36, off, s32 offset:44
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v37, off, s32 offset:36
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v34, off, s32 offset:28
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v34, off, s32 offset:20
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v80, off, s32 offset:128
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v48, off, s32 offset:124
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v39, off, s32 offset:120
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v51, off, s32 offset:116
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v81, off, s32 offset:112
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v49, off, s32 offset:108
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v37, off, s32 offset:104
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v52, off, s32 offset:100
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v82, off, s32 offset:96
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v50, off, s32 offset:92
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v83, off, s32 offset:88
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v32, off, s32 offset:132
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v34, off, s32
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v35, off, s32 offset:8
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v36, off, s32 offset:16
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v38, off, s32 offset:24
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v84, off, s32 offset:32
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v85, off, s32 offset:40
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v86, off, s32 offset:48
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v87, off, s32 offset:56
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v96, off, s32 offset:64
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v97, off, s32 offset:72
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v98, off, s32 offset:80
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v70, off, s32 offset:84
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v54, off, s32 offset:76
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v71, off, s32 offset:68
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v64, off, s32 offset:60
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v67, off, s32 offset:52
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v66, off, s32 offset:44
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v69, off, s32 offset:36
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v53, off, s32 offset:28
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v65, off, s32 offset:20
 ; GFX11-TRUE16-NEXT:    s_clause 0x1
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v33, off, s32 offset:12
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v35, off, s32 offset:4
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v55, off, s32 offset:12
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v68, off, s32 offset:4
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v29.h, v29.l
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v29.l, v27.l
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v28.h, v25.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v39.l, v23.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v31.l, v23.l
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v30.h, v21.l
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v26.h, v19.l
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v25.h, v17.l
@@ -91206,47 +88966,42 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) {
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v25.h, 8, v25.h
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v26.h, 8, v26.h
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v30.h, 8, v30.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v39.l, 8, v39.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v31.l, 8, v31.l
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v28.h, 8, v28.h
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v29.l, 8, v29.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v49.l, 8, v29.h
-; GFX11-TRUE16-NEXT:    s_mov_b32 s0, exec_lo
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v33.l, 8, v29.h
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(33)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v55.h, 8, v55.h
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(31)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v55.l, 8, v55.l
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(29)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v53.h, 8, v53.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v39.h, 8, v80.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v39.l, 8, v39.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(27)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v53.l, 8, v53.l
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(25)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v52.h, 8, v52.h
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(23)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v54.h, 8, v54.h
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(21)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v29.h, 8, v39.h
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(20)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v49.h, 8, v48.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v37.h, 8, v81.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v37.l, 8, v37.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(19)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v39.h, 8, v48.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v36.h, 8, v82.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(18)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v48.l, 8, v50.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v38.h, 8, v83.l
+; GFX11-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v32
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v29.h, 8, v34.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v33.h, 8, v35.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v31.h, 8, v36.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v32.l, 8, v38.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(17)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v48.h, 8, v50.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v32.h, 8, v84.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(16)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v51.h, 8, v51.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v35.h, 8, v85.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(15)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v52.l, 8, v52.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v36.l, 8, v86.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(14)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v50.l, 8, v54.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v34.l, 8, v87.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(13)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v50.h, 8, v64.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v34.h, 8, v96.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(12)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v54.l, 8, v64.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v38.l, 8, v97.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(11)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v51.l, 8, v65.l
-; GFX11-TRUE16-NEXT:    v_cmpx_ne_u32_e32 0, v66
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v35.l, 8, v98.l
+; GFX11-TRUE16-NEXT:    s_and_saveexec_b32 s0, vcc_lo
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-TRUE16-NEXT:    s_xor_b32 s0, exec_lo, s0
 ; GFX11-TRUE16-NEXT:    s_cbranch_execnz .LBB110_3
 ; GFX11-TRUE16-NEXT:  ; %bb.1: ; %Flow
@@ -91274,22 +89029,22 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) {
 ; GFX11-TRUE16-NEXT:    v_and_b16 v7.l, 0xff, v28.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v7.h, 0xff, v30.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    v_and_b16 v8.l, 0xff, v35.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v8.h, 0xff, v33.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v9.l, 0xff, v34.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v9.h, 0xff, v34.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v10.l, 0xff, v37.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v10.h, 0xff, v36.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v11.l, 0xff, v37.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v11.h, 0xff, v35.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v12.l, 0xff, v38.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v12.h, 0xff, v36.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v13.l, 0xff, v38.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v13.h, 0xff, v31.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v14.l, 0xff, v32.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v14.h, 0xff, v31.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v15.l, 0xff, v33.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v15.h, 0xff, v32.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v8.l, 0xff, v68.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v8.h, 0xff, v55.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v9.l, 0xff, v65.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v9.h, 0xff, v53.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v10.l, 0xff, v69.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v10.h, 0xff, v66.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v11.l, 0xff, v67.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v11.h, 0xff, v64.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v12.l, 0xff, v71.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v12.h, 0xff, v54.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v13.l, 0xff, v70.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v13.h, 0xff, v50.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v14.l, 0xff, v52.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v14.h, 0xff, v49.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v15.l, 0xff, v51.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v15.h, 0xff, v48.l
 ; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v0.l, v23.l
 ; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v0.h, v23.h
 ; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v1.l, v20.l
@@ -91301,27 +89056,27 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) {
 ; GFX11-TRUE16-NEXT:    v_or_b16 v4.l, v4.l, v25.h
 ; GFX11-TRUE16-NEXT:    v_or_b16 v4.h, v4.h, v26.h
 ; GFX11-TRUE16-NEXT:    v_or_b16 v5.l, v5.l, v30.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v5.h, v5.h, v39.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v5.h, v5.h, v31.l
 ; GFX11-TRUE16-NEXT:    v_or_b16 v6.l, v6.l, v28.h
 ; GFX11-TRUE16-NEXT:    v_or_b16 v6.h, v6.h, v29.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v7.l, v7.l, v49.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v7.l, v7.l, v33.l
 ; GFX11-TRUE16-NEXT:    v_or_b16 v7.h, v7.h, v29.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v8.l, v8.l, v49.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v8.h, v8.h, v39.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v9.l, v9.l, v48.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v9.h, v9.h, v48.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v10.l, v10.l, v51.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v10.h, v10.h, v52.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v11.l, v11.l, v50.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v11.h, v11.h, v50.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v12.l, v12.l, v54.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v12.h, v12.h, v51.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v13.l, v13.l, v54.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v13.h, v13.h, v52.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v14.l, v14.l, v53.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v14.h, v14.h, v53.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v15.l, v15.l, v55.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v15.h, v15.h, v55.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v8.l, v8.l, v33.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v8.h, v8.h, v31.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v9.l, v9.l, v32.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v9.h, v9.h, v32.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v10.l, v10.l, v35.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v10.h, v10.h, v36.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v11.l, v11.l, v34.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v11.h, v11.h, v34.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v12.l, v12.l, v38.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v12.h, v12.h, v35.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v13.l, v13.l, v38.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v13.h, v13.h, v36.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v14.l, v14.l, v37.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v14.h, v14.h, v37.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v15.l, v15.l, v39.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v15.h, v15.h, v39.h
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr18_hi16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr17_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr19_lo16
@@ -91338,22 +89093,22 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) {
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr26_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr28_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr30_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr35_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr33_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr34_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr34_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr37_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr36_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr37_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr35_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr38_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr36_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr38_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr31_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr32_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr31_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr33_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr32_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr48
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr51
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr49
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr52
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr50
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr70
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr54
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr71
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr64
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr67
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr66
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr69
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr53
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr65
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr55
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr68
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr23_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr23_hi16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr20_lo16
@@ -91365,46 +89120,46 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) {
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr25_hi16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr26_hi16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr30_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr39_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr31_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr28_hi16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr29_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr49_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr33_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr29_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr49_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr33_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr31_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr32_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr32_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr35_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr36_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr34_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr34_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr38_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr35_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr38_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr36_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr37_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr37_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr39_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr39_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr48_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr48_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr51_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr52_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr50_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr50_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr54_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr51_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr54_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr52_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr53_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr53_hi16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr55_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr55_hi16
 ; GFX11-TRUE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
 ; GFX11-TRUE16-NEXT:    s_cbranch_execz .LBB110_2
 ; GFX11-TRUE16-NEXT:  .LBB110_4: ; %cmp.true
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v33.l, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, v32.l, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, v32.h, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.h, v31.h, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.h, v31.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v51.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, v48.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, v52.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.h, v49.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.h, v50.l, 3
 ; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v0.h
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(8)
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v3.l, v38.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v3.l, v71.l, 3
 ; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v1.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v1.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v55.l, v0.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v55.h, v0.h
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, v38.h, 3
-; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v53.l, v1.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, v53.h, v1.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v39.l, v0.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v39.h, v0.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, v70.l, 3
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v37.l, v1.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, v37.h, v1.h
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v15.l, 0x300, v0.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v15.h, 0x300, v0.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v2.h
@@ -91412,46 +89167,46 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) {
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v14.l, 0x300, v1.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v14.h, 0x300, v1.h
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(7)
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, v35.h, 3
-; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v52.h, v0.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v54.l, v0.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, v64.l, 3
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v36.h, v0.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v38.l, v0.h
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(4)
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.h, v37.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.h, v69.l, 3
 ; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v2.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v1.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v13.h, 0x300, v0.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v12.l, 0x300, v0.h
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v36.h, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, v37.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v54.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, v67.l, 3
 ; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v1.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v54.h, v2.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v50.h, v1.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v38.h, v2.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v34.h, v1.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v0.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, v51.h, v1.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, v35.h, v1.h
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v13.l, 0x300, v2.l
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, v36.l, 3
-; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v51.l, v0.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v50.l, v0.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, v66.l, 3
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v35.l, v0.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v34.l, v0.h
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v11.h, 0x300, v1.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v10.l, 0x300, v1.h
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, v35.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, v68.l, 3
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v12.h, 0x300, v0.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v11.l, 0x300, v0.h
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v34.h, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, v34.l, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.h, v33.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v65.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, v53.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.h, v55.l, 3
 ; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v2.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v1.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v0.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v1.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v52.l, v2.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v49.h, v1.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v48.l, v0.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v48.h, v0.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, v39.h, v1.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v36.l, v2.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v33.h, v1.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v32.l, v0.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v32.h, v0.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, v31.h, v1.h
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v10.h, 0x300, v2.l
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, v28.l, 3
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v9.l, 0x300, v0.l
@@ -91467,7 +89222,7 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) {
 ; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v0.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v1.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v1.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v49.l, v2.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v33.l, v2.l
 ; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v29.h, v0.l
 ; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v28.h, v0.h
 ; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v29.l, v1.l
@@ -91487,7 +89242,7 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) {
 ; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v0.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v1.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v1.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v39.l, v2.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v31.l, v2.l
 ; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v25.h, v0.l
 ; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v26.h, v0.h
 ; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v27.h, v1.l
@@ -93194,695 +90949,350 @@ define inreg <32 x bfloat> @bitcast_v64i8_to_v32bf16_scalar(<64 x i8> inreg %a,
 ; GFX9-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
 ; GFX9-NEXT:    s_branch .LBB111_2
 ;
-; GFX11-TRUE16-LABEL: bitcast_v64i8_to_v32bf16_scalar:
-; GFX11-TRUE16:       ; %bb.0:
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v34, v14 :: v_dual_mov_b32 v31, v8
-; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v36, v12 :: v_dual_mov_b32 v33, v6
-; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v32, v10 :: v_dual_mov_b32 v35, v0
-; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v38, v4 :: v_dual_mov_b32 v37, v2
-; GFX11-TRUE16-NEXT:    s_clause 0xf
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v0, off, s32 offset:56
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v64, off, s32 offset:52
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v2, off, s32 offset:60
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v4, off, s32
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v6, off, s32 offset:8
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v8, off, s32 offset:16
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v10, off, s32 offset:24
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v12, off, s32 offset:32
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v14, off, s32 offset:40
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v84, off, s32 offset:48
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v82, off, s32 offset:44
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v69, off, s32 offset:36
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v80, off, s32 offset:28
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v67, off, s32 offset:20
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v68, off, s32 offset:12
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v65, off, s32 offset:4
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v39, 8, v1
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v49, 8, v3
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v48, 8, v5
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v51, 8, v7
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v50, 8, v9
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v53, 8, v11
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v52, 8, v13
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v54, 8, v15
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v17, 8, v17
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v55, 8, v19
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v19, 8, v21
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v23, 8, v23
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v21, 8, v25
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v27, 8, v27
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v25, 8, v29
-; GFX11-TRUE16-NEXT:    s_mov_b32 s4, 0
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(15)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v85, 8, v0
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(13)
-; GFX11-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v2
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(12)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v66, 8, v4
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(11)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v29, 8, v6
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(10)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v70, 8, v8
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(9)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v71, 8, v10
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(8)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v83, 8, v12
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(7)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v81, 8, v14
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(6)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v84, 8, v84
-; GFX11-TRUE16-NEXT:    s_and_b32 s5, vcc_lo, exec_lo
-; GFX11-TRUE16-NEXT:    s_cbranch_scc0 .LBB111_4
-; GFX11-TRUE16-NEXT:  ; %bb.1: ; %cmp.false
-; GFX11-TRUE16-NEXT:    s_and_b32 s5, s0, 0xff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s6, s1, 8
-; GFX11-TRUE16-NEXT:    s_and_b32 s7, s2, 0xff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s8, s3, 8
-; GFX11-TRUE16-NEXT:    s_or_b32 s5, s5, s6
-; GFX11-TRUE16-NEXT:    s_or_b32 s6, s7, s8
-; GFX11-TRUE16-NEXT:    s_and_b32 s7, s16, 0xff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s8, s17, 8
-; GFX11-TRUE16-NEXT:    s_and_b32 s9, s18, 0xff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s10, s19, 8
-; GFX11-TRUE16-NEXT:    s_or_b32 s7, s7, s8
-; GFX11-TRUE16-NEXT:    s_or_b32 s8, s9, s10
-; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s5, s5, s6
-; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s6, s7, s8
-; GFX11-TRUE16-NEXT:    s_and_b32 s7, s20, 0xff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s8, s21, 8
-; GFX11-TRUE16-NEXT:    s_and_b32 s9, s22, 0xff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s10, s23, 8
-; GFX11-TRUE16-NEXT:    s_or_b32 s7, s7, s8
-; GFX11-TRUE16-NEXT:    s_or_b32 s8, s9, s10
-; GFX11-TRUE16-NEXT:    s_and_b32 s9, s24, 0xff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s10, s25, 8
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v35
-; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s7, s7, s8
-; GFX11-TRUE16-NEXT:    s_or_b32 s8, s9, s10
-; GFX11-TRUE16-NEXT:    s_and_b32 s10, s28, 0xff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s11, s29, 8
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v0, v39
-; GFX11-TRUE16-NEXT:    s_or_b32 s10, s10, s11
-; GFX11-TRUE16-NEXT:    s_and_b32 s9, s26, 0xff
-; GFX11-TRUE16-NEXT:    v_and_b32_e64 v1, 0xffff, s10
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s12, s27, 8
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v3, 0xff, v38
-; GFX11-TRUE16-NEXT:    s_or_b32 s9, s9, s12
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v5, 0xff, v31
-; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s8, s8, s9
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v2, 0xff, v37
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v4, v0, 16, v1
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v33
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v3, v3, v48
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v6, 0xff, v36
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v2, v49
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v2, 0xff, v32
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v0, v51
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v7, v5, v50
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v8, v6, v52
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v2, v2, v53
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v9, 0xff, v24
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(1)
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v11, 0xff, v68
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v5, v3, 16, v1
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff, v34
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v3, 0xff, v16
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v6, v7, 16, v0
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v18
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v7, v8, 16, v2
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v1, v54
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v2, v3, v17
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v3, 0xff, v20
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v8, 0xff, v22
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v0, v55
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v12, 0xff, v67
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v3, v3, v19
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v10, v8, v23
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v8, v2, 16, v1
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v9, v21
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v2, 0xff, v26
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v10, 0xffff, v10
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v9, v3, 16, v0
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v3, 0xff, v30
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v13, 0xff, v80
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v14, 0xff, v82
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v28
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v2, v2, v27
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v10, v1, 16, v10
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff, v65
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v3, v3, v66
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v11, v11, v70
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v15, v12, v71
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v12, 0xff, v69
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v13, v13, v83
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v86, 0xff, v64
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v14, v14, v84
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v0, v25
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v1, v29
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v3, 0xffff, v3
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v87, 0xffff, v11
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v96, v12, v81
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v97, 0xffff, v13
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v86, v86, v85
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v98, 0xffff, v14
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v11, v0, 16, v2
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v12, v1, 16, v3
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v13, v15, 16, v87
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v14, v96, 16, v97
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v15, v86, 16, v98
-; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, s6
-; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v2, s7 :: v_dual_mov_b32 v3, s8
-; GFX11-TRUE16-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s4
-; GFX11-TRUE16-NEXT:    s_cbranch_vccnz .LBB111_3
-; GFX11-TRUE16-NEXT:  .LBB111_2: ; %cmp.true
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(1)
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v4, 3, v68
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v5, 3, v67
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v6, 3, v30
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v7, 3, v65
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v8, 3, v22
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v5, 0xff, v5
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v6, 0xff, v6
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v9, 3, v16
-; GFX11-TRUE16-NEXT:    s_add_i32 s28, s28, 3
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v4, v70, v4
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v5, v71, v5
-; GFX11-TRUE16-NEXT:    s_and_b32 s4, s28, 0xff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s5, s29, 8
-; GFX11-TRUE16-NEXT:    s_add_i32 s24, s24, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v11, 0x300, v4
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v13, 0x300, v5
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v4, v66, v6
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v5, 3, v26
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v6, 0xff, v7
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v7, 3, v28
-; GFX11-TRUE16-NEXT:    s_or_b32 s4, s5, s4
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v12, 0x300, v4
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v4, 0xff, v5
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v5, v29, v6
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v6, 0xff, v7
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v7, 0xff, v8
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v8, 3, v24
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v4, v27, v4
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v14, 0x300, v5
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v5, v25, v6
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v6, v23, v7
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v7, 0xff, v8
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v15, 0x300, v4
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v4, 3, v18
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v18, 0x300, v5
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v8, 3, v34
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v5, v21, v7
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v7, 3, v20
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; GFX11-TRUE16-NEXT:    s_and_b32 s5, s24, 0xff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s6, s25, 8
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v10, 0x300, v5
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v5, 0xff, v7
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v4, v55, v4
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v7, 0xff, v8
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v8, 0xff, v9
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v9, 3, v32
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v5, v19, v5
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v16, 0x300, v4
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v4, v54, v7
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v7, v17, v8
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v8, 0xff, v9
-; GFX11-TRUE16-NEXT:    s_add_i32 s26, s26, 3
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v9, 0x300, v5
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v5, 3, v36
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v17, 0x300, v4
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v19, 0x300, v7
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v4, v53, v8
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v7, 3, v33
-; GFX11-TRUE16-NEXT:    s_or_b32 s5, s6, s5
-; GFX11-TRUE16-NEXT:    s_and_b32 s6, s26, 0xff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s7, s27, 8
-; GFX11-TRUE16-NEXT:    s_add_i32 s20, s20, 3
-; GFX11-TRUE16-NEXT:    s_or_b32 s6, s7, s6
-; GFX11-TRUE16-NEXT:    s_and_b32 s7, s20, 0xff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s8, s21, 8
-; GFX11-TRUE16-NEXT:    s_add_i32 s22, s22, 3
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v5, 0xff, v5
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v8, 3, v31
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v20, 3, v37
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v21, 0x300, v4
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v4, 0xff, v7
-; GFX11-TRUE16-NEXT:    s_or_b32 s7, s8, s7
-; GFX11-TRUE16-NEXT:    s_and_b32 s8, s22, 0xff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s9, s23, 8
-; GFX11-TRUE16-NEXT:    s_add_i32 s16, s16, 3
-; GFX11-TRUE16-NEXT:    s_or_b32 s8, s9, s8
-; GFX11-TRUE16-NEXT:    s_and_b32 s9, s16, 0xff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s10, s17, 8
-; GFX11-TRUE16-NEXT:    s_add_i32 s18, s18, 3
-; GFX11-TRUE16-NEXT:    s_add_i32 s0, s0, 3
-; GFX11-TRUE16-NEXT:    s_add_i32 s2, s2, 3
-; GFX11-TRUE16-NEXT:    s_or_b32 s9, s10, s9
-; GFX11-TRUE16-NEXT:    s_and_b32 s10, s18, 0xff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s11, s19, 8
-; GFX11-TRUE16-NEXT:    s_and_b32 s0, s0, 0xff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s1, s1, 8
-; GFX11-TRUE16-NEXT:    s_and_b32 s2, s2, 0xff
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s3, s3, 8
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v5, v52, v5
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v7, 0xff, v8
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v8, 0xff, v20
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v20, 3, v38
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v4, v51, v4
-; GFX11-TRUE16-NEXT:    s_or_b32 s10, s11, s10
-; GFX11-TRUE16-NEXT:    s_or_b32 s0, s1, s0
-; GFX11-TRUE16-NEXT:    s_or_b32 s1, s3, s2
-; GFX11-TRUE16-NEXT:    s_addk_i32 s5, 0x300
-; GFX11-TRUE16-NEXT:    s_addk_i32 s6, 0x300
-; GFX11-TRUE16-NEXT:    s_addk_i32 s9, 0x300
-; GFX11-TRUE16-NEXT:    s_addk_i32 s0, 0x300
-; GFX11-TRUE16-NEXT:    s_addk_i32 s1, 0x300
-; GFX11-TRUE16-NEXT:    s_addk_i32 s10, 0x300
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v22, 0x300, v5
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v5, v50, v7
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v7, v49, v8
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v8, 0xff, v20
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v20, 0x300, v4
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v4, 3, v35
-; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s0, s0, s1
-; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s1, s9, s10
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v0, 3, v82
-; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s3, s5, s6
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v2, 3, v80
-; GFX11-TRUE16-NEXT:    s_addk_i32 s7, 0x300
-; GFX11-TRUE16-NEXT:    s_addk_i32 s8, 0x300
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v1, 3, v64
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s2, s7, s8
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v3, 3, v69
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v84, v0
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v2, v83, v2
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v23, 0x300, v5
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v5, 0x300, v7
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v7, v48, v8
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v4, v39, v4
-; GFX11-TRUE16-NEXT:    s_addk_i32 s4, 0x300
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v85, v1
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v0, 0x300, v0
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v3, v81, v3
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v2, 0x300, v2
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v6, 0x300, v6
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v7, 0x300, v7
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v4, 0x300, v4
-; GFX11-TRUE16-NEXT:    v_and_b32_e64 v8, 0xffff, s4
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v5, 0xffff, v5
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v16, 0xffff, v16
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v1, 0x300, v1
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v3, 0x300, v3
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v4, v4, 16, v8
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v5, v7, 16, v5
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v7, 0xffff, v20
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v8, 0xffff, v21
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v17, 0xffff, v17
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v20, 0xffff, v6
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v9, v9, 16, v16
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v15, 0xffff, v15
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v12, 0xffff, v12
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v16, 0xffff, v11
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v6, v23, 16, v7
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v7, v22, 16, v8
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v8, v19, 16, v17
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v10, v10, 16, v20
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v11, v18, 16, v15
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v12, v14, 16, v12
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v13, v13, 16, v16
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v14, v3, 16, v2
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v15, v1, 16, v0
-; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX11-TRUE16-NEXT:  .LBB111_3: ; %end
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
-; GFX11-TRUE16-NEXT:  .LBB111_4:
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
-; GFX11-TRUE16-NEXT:    s_branch .LBB111_2
-;
-; GFX11-FAKE16-LABEL: bitcast_v64i8_to_v32bf16_scalar:
-; GFX11-FAKE16:       ; %bb.0:
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v34, v14 :: v_dual_mov_b32 v31, v8
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v36, v12 :: v_dual_mov_b32 v33, v6
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v32, v10 :: v_dual_mov_b32 v35, v0
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v38, v4 :: v_dual_mov_b32 v37, v2
-; GFX11-FAKE16-NEXT:    s_clause 0xf
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v0, off, s32 offset:56
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v65, off, s32 offset:52
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v2, off, s32 offset:60
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v4, off, s32
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v6, off, s32 offset:8
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v8, off, s32 offset:16
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v10, off, s32 offset:24
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v12, off, s32 offset:32
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v14, off, s32 offset:40
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v84, off, s32 offset:48
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v82, off, s32 offset:44
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v69, off, s32 offset:36
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v80, off, s32 offset:28
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v67, off, s32 offset:20
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v68, off, s32 offset:12
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v64, off, s32 offset:4
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v39, 8, v1
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v49, 8, v3
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v48, 8, v5
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v51, 8, v7
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v50, 8, v9
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v53, 8, v11
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v52, 8, v13
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v54, 8, v15
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v17, 8, v17
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v55, 8, v19
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v19, 8, v21
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v23, 8, v23
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v21, 8, v25
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v27, 8, v27
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 8, v29
-; GFX11-FAKE16-NEXT:    s_mov_b32 s4, 0
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(15)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v85, 8, v0
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(13)
-; GFX11-FAKE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v2
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(12)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v66, 8, v4
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(11)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v29, 8, v6
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(10)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v70, 8, v8
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(9)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v71, 8, v10
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(8)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v83, 8, v12
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(7)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v81, 8, v14
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(6)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v84, 8, v84
-; GFX11-FAKE16-NEXT:    s_and_b32 s5, vcc_lo, exec_lo
-; GFX11-FAKE16-NEXT:    s_cbranch_scc0 .LBB111_4
-; GFX11-FAKE16-NEXT:  ; %bb.1: ; %cmp.false
-; GFX11-FAKE16-NEXT:    s_and_b32 s5, s0, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s6, s1, 8
-; GFX11-FAKE16-NEXT:    s_and_b32 s7, s2, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s8, s3, 8
-; GFX11-FAKE16-NEXT:    s_or_b32 s5, s5, s6
-; GFX11-FAKE16-NEXT:    s_or_b32 s6, s7, s8
-; GFX11-FAKE16-NEXT:    s_and_b32 s7, s16, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s8, s17, 8
-; GFX11-FAKE16-NEXT:    s_and_b32 s9, s18, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s10, s19, 8
-; GFX11-FAKE16-NEXT:    s_or_b32 s7, s7, s8
-; GFX11-FAKE16-NEXT:    s_or_b32 s8, s9, s10
-; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s5, s5, s6
-; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s6, s7, s8
-; GFX11-FAKE16-NEXT:    s_and_b32 s7, s20, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s8, s21, 8
-; GFX11-FAKE16-NEXT:    s_and_b32 s9, s22, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s10, s23, 8
-; GFX11-FAKE16-NEXT:    s_or_b32 s7, s7, s8
-; GFX11-FAKE16-NEXT:    s_or_b32 s8, s9, s10
-; GFX11-FAKE16-NEXT:    s_and_b32 s9, s24, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s10, s25, 8
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v35
-; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s7, s7, s8
-; GFX11-FAKE16-NEXT:    s_or_b32 s8, s9, s10
-; GFX11-FAKE16-NEXT:    s_and_b32 s10, s28, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s11, s29, 8
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v39
-; GFX11-FAKE16-NEXT:    s_or_b32 s10, s10, s11
-; GFX11-FAKE16-NEXT:    s_and_b32 s9, s26, 0xff
-; GFX11-FAKE16-NEXT:    v_and_b32_e64 v1, 0xffff, s10
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s12, s27, 8
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v38
-; GFX11-FAKE16-NEXT:    s_or_b32 s9, s9, s12
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v5, 0xff, v31
-; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s8, s8, s9
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v37
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v4, v0, 16, v1
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v33
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v3, v48
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xff, v36
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v2, v49
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v32
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v51
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v7, v5, v50
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v8, v6, v52
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v2, v53
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v9, 0xff, v24
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(1)
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v11, 0xff, v68
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v5, v3, 16, v1
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v34
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v16
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v6, v7, 16, v0
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v18
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v7, v8, 16, v2
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v54
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v3, v17
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v20
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 0xff, v22
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v55
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v12, 0xff, v67
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v3, v19
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v10, v8, v23
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v8, v2, 16, v1
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v9, v21
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v26
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v10, 0xffff, v10
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v9, v3, 16, v0
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v30
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v13, 0xff, v80
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v14, 0xff, v82
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v28
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v2, v27
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v10, v1, 16, v10
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v64
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v3, v66
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v11, v11, v70
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v15, v12, v71
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v12, 0xff, v69
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v13, v13, v83
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v86, 0xff, v65
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v14, v14, v84
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v25
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v29
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xffff, v3
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v87, 0xffff, v11
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v96, v12, v81
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v97, 0xffff, v13
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v86, v86, v85
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v98, 0xffff, v14
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v11, v0, 16, v2
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v12, v1, 16, v3
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v13, v15, 16, v87
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v14, v96, 16, v97
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v15, v86, 16, v98
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, s6
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v2, s7 :: v_dual_mov_b32 v3, s8
-; GFX11-FAKE16-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s4
-; GFX11-FAKE16-NEXT:    s_cbranch_vccnz .LBB111_3
-; GFX11-FAKE16-NEXT:  .LBB111_2: ; %cmp.true
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(1)
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v4, 3, v68
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v5, 3, v67
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v6, 3, v30
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v7, 3, v64
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v8, 3, v22
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v5, 0xff, v5
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xff, v6
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v9, 3, v16
-; GFX11-FAKE16-NEXT:    s_add_i32 s28, s28, 3
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, v70, v4
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v71, v5
-; GFX11-FAKE16-NEXT:    s_and_b32 s4, s28, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s5, s29, 8
-; GFX11-FAKE16-NEXT:    s_add_i32 s24, s24, 3
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v11, 0x300, v4
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v13, 0x300, v5
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, v66, v6
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v5, 3, v26
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xff, v7
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v7, 3, v28
-; GFX11-FAKE16-NEXT:    s_or_b32 s4, s5, s4
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v12, 0x300, v4
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xff, v5
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v29, v6
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xff, v7
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xff, v8
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v8, 3, v24
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, v27, v4
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v14, 0x300, v5
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v25, v6
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v6, v23, v7
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xff, v8
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v15, 0x300, v4
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v4, 3, v18
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v18, 0x300, v5
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v8, 3, v34
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v21, v7
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v7, 3, v20
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; GFX11-FAKE16-NEXT:    s_and_b32 s5, s24, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s6, s25, 8
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v10, 0x300, v5
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v5, 0xff, v7
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, v55, v4
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xff, v8
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 0xff, v9
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v9, 3, v32
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v19, v5
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v16, 0x300, v4
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, v54, v7
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v7, v17, v8
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 0xff, v9
-; GFX11-FAKE16-NEXT:    s_add_i32 s26, s26, 3
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v9, 0x300, v5
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v5, 3, v36
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v17, 0x300, v4
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v19, 0x300, v7
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, v53, v8
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v7, 3, v33
-; GFX11-FAKE16-NEXT:    s_or_b32 s5, s6, s5
-; GFX11-FAKE16-NEXT:    s_and_b32 s6, s26, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s7, s27, 8
-; GFX11-FAKE16-NEXT:    s_add_i32 s20, s20, 3
-; GFX11-FAKE16-NEXT:    s_or_b32 s6, s7, s6
-; GFX11-FAKE16-NEXT:    s_and_b32 s7, s20, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s8, s21, 8
-; GFX11-FAKE16-NEXT:    s_add_i32 s22, s22, 3
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v5, 0xff, v5
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v8, 3, v31
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v20, 3, v37
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v21, 0x300, v4
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xff, v7
-; GFX11-FAKE16-NEXT:    s_or_b32 s7, s8, s7
-; GFX11-FAKE16-NEXT:    s_and_b32 s8, s22, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s9, s23, 8
-; GFX11-FAKE16-NEXT:    s_add_i32 s16, s16, 3
-; GFX11-FAKE16-NEXT:    s_or_b32 s8, s9, s8
-; GFX11-FAKE16-NEXT:    s_and_b32 s9, s16, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s10, s17, 8
-; GFX11-FAKE16-NEXT:    s_add_i32 s18, s18, 3
-; GFX11-FAKE16-NEXT:    s_add_i32 s0, s0, 3
-; GFX11-FAKE16-NEXT:    s_add_i32 s2, s2, 3
-; GFX11-FAKE16-NEXT:    s_or_b32 s9, s10, s9
-; GFX11-FAKE16-NEXT:    s_and_b32 s10, s18, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s11, s19, 8
-; GFX11-FAKE16-NEXT:    s_and_b32 s0, s0, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s1, s1, 8
-; GFX11-FAKE16-NEXT:    s_and_b32 s2, s2, 0xff
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s3, s3, 8
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v52, v5
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xff, v8
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 0xff, v20
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v20, 3, v38
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, v51, v4
-; GFX11-FAKE16-NEXT:    s_or_b32 s10, s11, s10
-; GFX11-FAKE16-NEXT:    s_or_b32 s0, s1, s0
-; GFX11-FAKE16-NEXT:    s_or_b32 s1, s3, s2
-; GFX11-FAKE16-NEXT:    s_addk_i32 s5, 0x300
-; GFX11-FAKE16-NEXT:    s_addk_i32 s6, 0x300
-; GFX11-FAKE16-NEXT:    s_addk_i32 s9, 0x300
-; GFX11-FAKE16-NEXT:    s_addk_i32 s0, 0x300
-; GFX11-FAKE16-NEXT:    s_addk_i32 s1, 0x300
-; GFX11-FAKE16-NEXT:    s_addk_i32 s10, 0x300
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v22, 0x300, v5
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v50, v7
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v7, v49, v8
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 0xff, v20
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v20, 0x300, v4
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v4, 3, v35
-; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s0, s0, s1
-; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s1, s9, s10
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v0, 3, v82
-; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s3, s5, s6
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v2, 3, v80
-; GFX11-FAKE16-NEXT:    s_addk_i32 s7, 0x300
-; GFX11-FAKE16-NEXT:    s_addk_i32 s8, 0x300
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v1, 3, v65
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s2, s7, s8
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v3, 3, v69
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v84, v0
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v83, v2
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v23, 0x300, v5
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v5, 0x300, v7
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v7, v48, v8
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, v39, v4
-; GFX11-FAKE16-NEXT:    s_addk_i32 s4, 0x300
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v85, v1
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v0, 0x300, v0
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v81, v3
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v2, 0x300, v2
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v6, 0x300, v6
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v7, 0x300, v7
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v4, 0x300, v4
-; GFX11-FAKE16-NEXT:    v_and_b32_e64 v8, 0xffff, s4
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v5, 0xffff, v5
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v16, 0xffff, v16
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v1, 0x300, v1
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v3, 0x300, v3
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v4, v4, 16, v8
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v5, v7, 16, v5
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xffff, v20
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 0xffff, v21
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v17, 0xffff, v17
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v20, 0xffff, v6
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v9, v9, 16, v16
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v15, 0xffff, v15
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v12, 0xffff, v12
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v16, 0xffff, v11
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v6, v23, 16, v7
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v7, v22, 16, v8
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v8, v19, 16, v17
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v10, v10, 16, v20
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v11, v18, 16, v15
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v12, v14, 16, v12
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v13, v13, 16, v16
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v14, v3, 16, v2
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v15, v1, 16, v0
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX11-FAKE16-NEXT:  .LBB111_3: ; %end
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
-; GFX11-FAKE16-NEXT:  .LBB111_4:
-; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
-; GFX11-FAKE16-NEXT:    s_branch .LBB111_2
+; GFX11-LABEL: bitcast_v64i8_to_v32bf16_scalar:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_dual_mov_b32 v34, v14 :: v_dual_mov_b32 v31, v8
+; GFX11-NEXT:    v_dual_mov_b32 v36, v12 :: v_dual_mov_b32 v33, v6
+; GFX11-NEXT:    v_dual_mov_b32 v32, v10 :: v_dual_mov_b32 v35, v0
+; GFX11-NEXT:    v_dual_mov_b32 v38, v4 :: v_dual_mov_b32 v37, v2
+; GFX11-NEXT:    s_clause 0xf
+; GFX11-NEXT:    scratch_load_u16 v0, off, s32 offset:56
+; GFX11-NEXT:    scratch_load_u16 v65, off, s32 offset:52
+; GFX11-NEXT:    scratch_load_b32 v2, off, s32 offset:60
+; GFX11-NEXT:    scratch_load_u16 v4, off, s32
+; GFX11-NEXT:    scratch_load_u16 v6, off, s32 offset:8
+; GFX11-NEXT:    scratch_load_u16 v8, off, s32 offset:16
+; GFX11-NEXT:    scratch_load_u16 v10, off, s32 offset:24
+; GFX11-NEXT:    scratch_load_u16 v12, off, s32 offset:32
+; GFX11-NEXT:    scratch_load_u16 v14, off, s32 offset:40
+; GFX11-NEXT:    scratch_load_u16 v84, off, s32 offset:48
+; GFX11-NEXT:    scratch_load_u16 v82, off, s32 offset:44
+; GFX11-NEXT:    scratch_load_u16 v69, off, s32 offset:36
+; GFX11-NEXT:    scratch_load_u16 v80, off, s32 offset:28
+; GFX11-NEXT:    scratch_load_u16 v67, off, s32 offset:20
+; GFX11-NEXT:    scratch_load_u16 v68, off, s32 offset:12
+; GFX11-NEXT:    scratch_load_u16 v64, off, s32 offset:4
+; GFX11-NEXT:    v_lshlrev_b32_e32 v39, 8, v1
+; GFX11-NEXT:    v_lshlrev_b32_e32 v49, 8, v3
+; GFX11-NEXT:    v_lshlrev_b32_e32 v48, 8, v5
+; GFX11-NEXT:    v_lshlrev_b32_e32 v51, 8, v7
+; GFX11-NEXT:    v_lshlrev_b32_e32 v50, 8, v9
+; GFX11-NEXT:    v_lshlrev_b32_e32 v53, 8, v11
+; GFX11-NEXT:    v_lshlrev_b32_e32 v52, 8, v13
+; GFX11-NEXT:    v_lshlrev_b32_e32 v54, 8, v15
+; GFX11-NEXT:    v_lshlrev_b32_e32 v17, 8, v17
+; GFX11-NEXT:    v_lshlrev_b32_e32 v55, 8, v19
+; GFX11-NEXT:    v_lshlrev_b32_e32 v19, 8, v21
+; GFX11-NEXT:    v_lshlrev_b32_e32 v23, 8, v23
+; GFX11-NEXT:    v_lshlrev_b32_e32 v21, 8, v25
+; GFX11-NEXT:    v_lshlrev_b32_e32 v27, 8, v27
+; GFX11-NEXT:    v_lshlrev_b32_e32 v25, 8, v29
+; GFX11-NEXT:    s_mov_b32 s4, 0
+; GFX11-NEXT:    s_waitcnt vmcnt(15)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v85, 8, v0
+; GFX11-NEXT:    s_waitcnt vmcnt(13)
+; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v2
+; GFX11-NEXT:    s_waitcnt vmcnt(12)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v66, 8, v4
+; GFX11-NEXT:    s_waitcnt vmcnt(11)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v29, 8, v6
+; GFX11-NEXT:    s_waitcnt vmcnt(10)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v70, 8, v8
+; GFX11-NEXT:    s_waitcnt vmcnt(9)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v71, 8, v10
+; GFX11-NEXT:    s_waitcnt vmcnt(8)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v83, 8, v12
+; GFX11-NEXT:    s_waitcnt vmcnt(7)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v81, 8, v14
+; GFX11-NEXT:    s_waitcnt vmcnt(6)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v84, 8, v84
+; GFX11-NEXT:    s_and_b32 s5, vcc_lo, exec_lo
+; GFX11-NEXT:    s_cbranch_scc0 .LBB111_4
+; GFX11-NEXT:  ; %bb.1: ; %cmp.false
+; GFX11-NEXT:    s_and_b32 s5, s0, 0xff
+; GFX11-NEXT:    s_lshl_b32 s6, s1, 8
+; GFX11-NEXT:    s_and_b32 s7, s2, 0xff
+; GFX11-NEXT:    s_lshl_b32 s8, s3, 8
+; GFX11-NEXT:    s_or_b32 s5, s5, s6
+; GFX11-NEXT:    s_or_b32 s6, s7, s8
+; GFX11-NEXT:    s_and_b32 s7, s16, 0xff
+; GFX11-NEXT:    s_lshl_b32 s8, s17, 8
+; GFX11-NEXT:    s_and_b32 s9, s18, 0xff
+; GFX11-NEXT:    s_lshl_b32 s10, s19, 8
+; GFX11-NEXT:    s_or_b32 s7, s7, s8
+; GFX11-NEXT:    s_or_b32 s8, s9, s10
+; GFX11-NEXT:    s_pack_ll_b32_b16 s5, s5, s6
+; GFX11-NEXT:    s_pack_ll_b32_b16 s6, s7, s8
+; GFX11-NEXT:    s_and_b32 s7, s20, 0xff
+; GFX11-NEXT:    s_lshl_b32 s8, s21, 8
+; GFX11-NEXT:    s_and_b32 s9, s22, 0xff
+; GFX11-NEXT:    s_lshl_b32 s10, s23, 8
+; GFX11-NEXT:    s_or_b32 s7, s7, s8
+; GFX11-NEXT:    s_or_b32 s8, s9, s10
+; GFX11-NEXT:    s_and_b32 s9, s24, 0xff
+; GFX11-NEXT:    s_lshl_b32 s10, s25, 8
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v35
+; GFX11-NEXT:    s_pack_ll_b32_b16 s7, s7, s8
+; GFX11-NEXT:    s_or_b32 s8, s9, s10
+; GFX11-NEXT:    s_and_b32 s10, s28, 0xff
+; GFX11-NEXT:    s_lshl_b32 s11, s29, 8
+; GFX11-NEXT:    v_or_b32_e32 v0, v0, v39
+; GFX11-NEXT:    s_or_b32 s10, s10, s11
+; GFX11-NEXT:    s_and_b32 s9, s26, 0xff
+; GFX11-NEXT:    v_and_b32_e64 v1, 0xffff, s10
+; GFX11-NEXT:    s_lshl_b32 s12, s27, 8
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v38
+; GFX11-NEXT:    s_or_b32 s9, s9, s12
+; GFX11-NEXT:    v_and_b32_e32 v5, 0xff, v31
+; GFX11-NEXT:    s_pack_ll_b32_b16 s8, s8, s9
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v37
+; GFX11-NEXT:    v_lshl_or_b32 v4, v0, 16, v1
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v33
+; GFX11-NEXT:    v_or_b32_e32 v3, v3, v48
+; GFX11-NEXT:    v_and_b32_e32 v6, 0xff, v36
+; GFX11-NEXT:    v_or_b32_e32 v1, v2, v49
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v32
+; GFX11-NEXT:    v_or_b32_e32 v0, v0, v51
+; GFX11-NEXT:    v_or_b32_e32 v7, v5, v50
+; GFX11-NEXT:    v_or_b32_e32 v8, v6, v52
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX11-NEXT:    v_or_b32_e32 v2, v2, v53
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    v_and_b32_e32 v9, 0xff, v24
+; GFX11-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-NEXT:    v_and_b32_e32 v11, 0xff, v68
+; GFX11-NEXT:    v_lshl_or_b32 v5, v3, 16, v1
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v34
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v16
+; GFX11-NEXT:    v_lshl_or_b32 v6, v7, 16, v0
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v18
+; GFX11-NEXT:    v_lshl_or_b32 v7, v8, 16, v2
+; GFX11-NEXT:    v_or_b32_e32 v1, v1, v54
+; GFX11-NEXT:    v_or_b32_e32 v2, v3, v17
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v20
+; GFX11-NEXT:    v_and_b32_e32 v8, 0xff, v22
+; GFX11-NEXT:    v_or_b32_e32 v0, v0, v55
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX11-NEXT:    v_and_b32_e32 v12, 0xff, v67
+; GFX11-NEXT:    v_or_b32_e32 v3, v3, v19
+; GFX11-NEXT:    v_or_b32_e32 v10, v8, v23
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    v_lshl_or_b32 v8, v2, 16, v1
+; GFX11-NEXT:    v_or_b32_e32 v1, v9, v21
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v26
+; GFX11-NEXT:    v_and_b32_e32 v10, 0xffff, v10
+; GFX11-NEXT:    v_lshl_or_b32 v9, v3, 16, v0
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v30
+; GFX11-NEXT:    v_and_b32_e32 v13, 0xff, v80
+; GFX11-NEXT:    v_and_b32_e32 v14, 0xff, v82
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v28
+; GFX11-NEXT:    v_or_b32_e32 v2, v2, v27
+; GFX11-NEXT:    v_lshl_or_b32 v10, v1, 16, v10
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v64
+; GFX11-NEXT:    v_or_b32_e32 v3, v3, v66
+; GFX11-NEXT:    v_or_b32_e32 v11, v11, v70
+; GFX11-NEXT:    v_or_b32_e32 v15, v12, v71
+; GFX11-NEXT:    v_and_b32_e32 v12, 0xff, v69
+; GFX11-NEXT:    v_or_b32_e32 v13, v13, v83
+; GFX11-NEXT:    v_and_b32_e32 v86, 0xff, v65
+; GFX11-NEXT:    v_or_b32_e32 v14, v14, v84
+; GFX11-NEXT:    v_or_b32_e32 v0, v0, v25
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX11-NEXT:    v_or_b32_e32 v1, v1, v29
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff, v3
+; GFX11-NEXT:    v_and_b32_e32 v87, 0xffff, v11
+; GFX11-NEXT:    v_or_b32_e32 v96, v12, v81
+; GFX11-NEXT:    v_and_b32_e32 v97, 0xffff, v13
+; GFX11-NEXT:    v_or_b32_e32 v86, v86, v85
+; GFX11-NEXT:    v_and_b32_e32 v98, 0xffff, v14
+; GFX11-NEXT:    v_lshl_or_b32 v11, v0, 16, v2
+; GFX11-NEXT:    v_lshl_or_b32 v12, v1, 16, v3
+; GFX11-NEXT:    v_lshl_or_b32 v13, v15, 16, v87
+; GFX11-NEXT:    v_lshl_or_b32 v14, v96, 16, v97
+; GFX11-NEXT:    v_lshl_or_b32 v15, v86, 16, v98
+; GFX11-NEXT:    v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, s6
+; GFX11-NEXT:    v_dual_mov_b32 v2, s7 :: v_dual_mov_b32 v3, s8
+; GFX11-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s4
+; GFX11-NEXT:    s_cbranch_vccnz .LBB111_3
+; GFX11-NEXT:  .LBB111_2: ; %cmp.true
+; GFX11-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-NEXT:    v_add_nc_u32_e32 v4, 3, v68
+; GFX11-NEXT:    v_add_nc_u32_e32 v5, 3, v67
+; GFX11-NEXT:    v_add_nc_u32_e32 v6, 3, v30
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_add_nc_u32_e32 v7, 3, v64
+; GFX11-NEXT:    v_add_nc_u32_e32 v8, 3, v22
+; GFX11-NEXT:    v_and_b32_e32 v4, 0xff, v4
+; GFX11-NEXT:    v_and_b32_e32 v5, 0xff, v5
+; GFX11-NEXT:    v_and_b32_e32 v6, 0xff, v6
+; GFX11-NEXT:    v_add_nc_u32_e32 v9, 3, v16
+; GFX11-NEXT:    s_add_i32 s28, s28, 3
+; GFX11-NEXT:    v_or_b32_e32 v4, v70, v4
+; GFX11-NEXT:    v_or_b32_e32 v5, v71, v5
+; GFX11-NEXT:    s_and_b32 s4, s28, 0xff
+; GFX11-NEXT:    s_lshl_b32 s5, s29, 8
+; GFX11-NEXT:    s_add_i32 s24, s24, 3
+; GFX11-NEXT:    v_add_nc_u32_e32 v11, 0x300, v4
+; GFX11-NEXT:    v_add_nc_u32_e32 v13, 0x300, v5
+; GFX11-NEXT:    v_or_b32_e32 v4, v66, v6
+; GFX11-NEXT:    v_add_nc_u32_e32 v5, 3, v26
+; GFX11-NEXT:    v_and_b32_e32 v6, 0xff, v7
+; GFX11-NEXT:    v_add_nc_u32_e32 v7, 3, v28
+; GFX11-NEXT:    s_or_b32 s4, s5, s4
+; GFX11-NEXT:    v_add_nc_u32_e32 v12, 0x300, v4
+; GFX11-NEXT:    v_and_b32_e32 v4, 0xff, v5
+; GFX11-NEXT:    v_or_b32_e32 v5, v29, v6
+; GFX11-NEXT:    v_and_b32_e32 v6, 0xff, v7
+; GFX11-NEXT:    v_and_b32_e32 v7, 0xff, v8
+; GFX11-NEXT:    v_add_nc_u32_e32 v8, 3, v24
+; GFX11-NEXT:    v_or_b32_e32 v4, v27, v4
+; GFX11-NEXT:    v_add_nc_u32_e32 v14, 0x300, v5
+; GFX11-NEXT:    v_or_b32_e32 v5, v25, v6
+; GFX11-NEXT:    v_or_b32_e32 v6, v23, v7
+; GFX11-NEXT:    v_and_b32_e32 v7, 0xff, v8
+; GFX11-NEXT:    v_add_nc_u32_e32 v15, 0x300, v4
+; GFX11-NEXT:    v_add_nc_u32_e32 v4, 3, v18
+; GFX11-NEXT:    v_add_nc_u32_e32 v18, 0x300, v5
+; GFX11-NEXT:    v_add_nc_u32_e32 v8, 3, v34
+; GFX11-NEXT:    v_or_b32_e32 v5, v21, v7
+; GFX11-NEXT:    v_add_nc_u32_e32 v7, 3, v20
+; GFX11-NEXT:    v_and_b32_e32 v4, 0xff, v4
+; GFX11-NEXT:    s_and_b32 s5, s24, 0xff
+; GFX11-NEXT:    s_lshl_b32 s6, s25, 8
+; GFX11-NEXT:    v_add_nc_u32_e32 v10, 0x300, v5
+; GFX11-NEXT:    v_and_b32_e32 v5, 0xff, v7
+; GFX11-NEXT:    v_or_b32_e32 v4, v55, v4
+; GFX11-NEXT:    v_and_b32_e32 v7, 0xff, v8
+; GFX11-NEXT:    v_and_b32_e32 v8, 0xff, v9
+; GFX11-NEXT:    v_add_nc_u32_e32 v9, 3, v32
+; GFX11-NEXT:    v_or_b32_e32 v5, v19, v5
+; GFX11-NEXT:    v_add_nc_u32_e32 v16, 0x300, v4
+; GFX11-NEXT:    v_or_b32_e32 v4, v54, v7
+; GFX11-NEXT:    v_or_b32_e32 v7, v17, v8
+; GFX11-NEXT:    v_and_b32_e32 v8, 0xff, v9
+; GFX11-NEXT:    s_add_i32 s26, s26, 3
+; GFX11-NEXT:    v_add_nc_u32_e32 v9, 0x300, v5
+; GFX11-NEXT:    v_add_nc_u32_e32 v5, 3, v36
+; GFX11-NEXT:    v_add_nc_u32_e32 v17, 0x300, v4
+; GFX11-NEXT:    v_add_nc_u32_e32 v19, 0x300, v7
+; GFX11-NEXT:    v_or_b32_e32 v4, v53, v8
+; GFX11-NEXT:    v_add_nc_u32_e32 v7, 3, v33
+; GFX11-NEXT:    s_or_b32 s5, s6, s5
+; GFX11-NEXT:    s_and_b32 s6, s26, 0xff
+; GFX11-NEXT:    s_lshl_b32 s7, s27, 8
+; GFX11-NEXT:    s_add_i32 s20, s20, 3
+; GFX11-NEXT:    s_or_b32 s6, s7, s6
+; GFX11-NEXT:    s_and_b32 s7, s20, 0xff
+; GFX11-NEXT:    s_lshl_b32 s8, s21, 8
+; GFX11-NEXT:    s_add_i32 s22, s22, 3
+; GFX11-NEXT:    v_and_b32_e32 v5, 0xff, v5
+; GFX11-NEXT:    v_add_nc_u32_e32 v8, 3, v31
+; GFX11-NEXT:    v_add_nc_u32_e32 v20, 3, v37
+; GFX11-NEXT:    v_add_nc_u32_e32 v21, 0x300, v4
+; GFX11-NEXT:    v_and_b32_e32 v4, 0xff, v7
+; GFX11-NEXT:    s_or_b32 s7, s8, s7
+; GFX11-NEXT:    s_and_b32 s8, s22, 0xff
+; GFX11-NEXT:    s_lshl_b32 s9, s23, 8
+; GFX11-NEXT:    s_add_i32 s16, s16, 3
+; GFX11-NEXT:    s_or_b32 s8, s9, s8
+; GFX11-NEXT:    s_and_b32 s9, s16, 0xff
+; GFX11-NEXT:    s_lshl_b32 s10, s17, 8
+; GFX11-NEXT:    s_add_i32 s18, s18, 3
+; GFX11-NEXT:    s_add_i32 s0, s0, 3
+; GFX11-NEXT:    s_add_i32 s2, s2, 3
+; GFX11-NEXT:    s_or_b32 s9, s10, s9
+; GFX11-NEXT:    s_and_b32 s10, s18, 0xff
+; GFX11-NEXT:    s_lshl_b32 s11, s19, 8
+; GFX11-NEXT:    s_and_b32 s0, s0, 0xff
+; GFX11-NEXT:    s_lshl_b32 s1, s1, 8
+; GFX11-NEXT:    s_and_b32 s2, s2, 0xff
+; GFX11-NEXT:    s_lshl_b32 s3, s3, 8
+; GFX11-NEXT:    v_or_b32_e32 v5, v52, v5
+; GFX11-NEXT:    v_and_b32_e32 v7, 0xff, v8
+; GFX11-NEXT:    v_and_b32_e32 v8, 0xff, v20
+; GFX11-NEXT:    v_add_nc_u32_e32 v20, 3, v38
+; GFX11-NEXT:    v_or_b32_e32 v4, v51, v4
+; GFX11-NEXT:    s_or_b32 s10, s11, s10
+; GFX11-NEXT:    s_or_b32 s0, s1, s0
+; GFX11-NEXT:    s_or_b32 s1, s3, s2
+; GFX11-NEXT:    s_addk_i32 s5, 0x300
+; GFX11-NEXT:    s_addk_i32 s6, 0x300
+; GFX11-NEXT:    s_addk_i32 s9, 0x300
+; GFX11-NEXT:    s_addk_i32 s0, 0x300
+; GFX11-NEXT:    s_addk_i32 s1, 0x300
+; GFX11-NEXT:    s_addk_i32 s10, 0x300
+; GFX11-NEXT:    v_add_nc_u32_e32 v22, 0x300, v5
+; GFX11-NEXT:    v_or_b32_e32 v5, v50, v7
+; GFX11-NEXT:    v_or_b32_e32 v7, v49, v8
+; GFX11-NEXT:    v_and_b32_e32 v8, 0xff, v20
+; GFX11-NEXT:    v_add_nc_u32_e32 v20, 0x300, v4
+; GFX11-NEXT:    v_add_nc_u32_e32 v4, 3, v35
+; GFX11-NEXT:    s_pack_ll_b32_b16 s0, s0, s1
+; GFX11-NEXT:    s_pack_ll_b32_b16 s1, s9, s10
+; GFX11-NEXT:    v_add_nc_u32_e32 v0, 3, v82
+; GFX11-NEXT:    s_pack_ll_b32_b16 s3, s5, s6
+; GFX11-NEXT:    v_add_nc_u32_e32 v2, 3, v80
+; GFX11-NEXT:    s_addk_i32 s7, 0x300
+; GFX11-NEXT:    s_addk_i32 s8, 0x300
+; GFX11-NEXT:    v_add_nc_u32_e32 v1, 3, v65
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GFX11-NEXT:    v_and_b32_e32 v4, 0xff, v4
+; GFX11-NEXT:    s_pack_ll_b32_b16 s2, s7, s8
+; GFX11-NEXT:    v_add_nc_u32_e32 v3, 3, v69
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX11-NEXT:    v_or_b32_e32 v0, v84, v0
+; GFX11-NEXT:    v_or_b32_e32 v2, v83, v2
+; GFX11-NEXT:    v_add_nc_u32_e32 v23, 0x300, v5
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v3
+; GFX11-NEXT:    v_add_nc_u32_e32 v5, 0x300, v7
+; GFX11-NEXT:    v_or_b32_e32 v7, v48, v8
+; GFX11-NEXT:    v_or_b32_e32 v4, v39, v4
+; GFX11-NEXT:    s_addk_i32 s4, 0x300
+; GFX11-NEXT:    v_or_b32_e32 v1, v85, v1
+; GFX11-NEXT:    v_add_nc_u32_e32 v0, 0x300, v0
+; GFX11-NEXT:    v_or_b32_e32 v3, v81, v3
+; GFX11-NEXT:    v_add_nc_u32_e32 v2, 0x300, v2
+; GFX11-NEXT:    v_add_nc_u32_e32 v6, 0x300, v6
+; GFX11-NEXT:    v_add_nc_u32_e32 v7, 0x300, v7
+; GFX11-NEXT:    v_add_nc_u32_e32 v4, 0x300, v4
+; GFX11-NEXT:    v_and_b32_e64 v8, 0xffff, s4
+; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff, v5
+; GFX11-NEXT:    v_and_b32_e32 v16, 0xffff, v16
+; GFX11-NEXT:    v_add_nc_u32_e32 v1, 0x300, v1
+; GFX11-NEXT:    v_add_nc_u32_e32 v3, 0x300, v3
+; GFX11-NEXT:    v_lshl_or_b32 v4, v4, 16, v8
+; GFX11-NEXT:    v_lshl_or_b32 v5, v7, 16, v5
+; GFX11-NEXT:    v_and_b32_e32 v7, 0xffff, v20
+; GFX11-NEXT:    v_and_b32_e32 v8, 0xffff, v21
+; GFX11-NEXT:    v_and_b32_e32 v17, 0xffff, v17
+; GFX11-NEXT:    v_and_b32_e32 v20, 0xffff, v6
+; GFX11-NEXT:    v_lshl_or_b32 v9, v9, 16, v16
+; GFX11-NEXT:    v_and_b32_e32 v15, 0xffff, v15
+; GFX11-NEXT:    v_and_b32_e32 v12, 0xffff, v12
+; GFX11-NEXT:    v_and_b32_e32 v16, 0xffff, v11
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    v_lshl_or_b32 v6, v23, 16, v7
+; GFX11-NEXT:    v_lshl_or_b32 v7, v22, 16, v8
+; GFX11-NEXT:    v_lshl_or_b32 v8, v19, 16, v17
+; GFX11-NEXT:    v_lshl_or_b32 v10, v10, 16, v20
+; GFX11-NEXT:    v_lshl_or_b32 v11, v18, 16, v15
+; GFX11-NEXT:    v_lshl_or_b32 v12, v14, 16, v12
+; GFX11-NEXT:    v_lshl_or_b32 v13, v13, 16, v16
+; GFX11-NEXT:    v_lshl_or_b32 v14, v3, 16, v2
+; GFX11-NEXT:    v_lshl_or_b32 v15, v1, 16, v0
+; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-NEXT:  .LBB111_3: ; %end
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-NEXT:  .LBB111_4:
+; GFX11-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+; GFX11-NEXT:    s_branch .LBB111_2
   %cmp = icmp eq i32 %b, 0
   br i1 %cmp, label %cmp.true, label %cmp.false
 
diff --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll
index 10e523d1a0cf1..c81d847896476 100644
--- a/llvm/test/CodeGen/AMDGPU/bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/bf16.ll
@@ -61,21 +61,13 @@ define void @test_load_store(ptr addrspace(1) %in, ptr addrspace(1) %out) {
 ; GFX10-NEXT:    global_store_short v[2:3], v0, off
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11TRUE16-LABEL: test_load_store:
-; GFX11TRUE16:       ; %bb.0:
-; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11TRUE16-NEXT:    global_load_d16_b16 v0, v[0:1], off
-; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11TRUE16-NEXT:    global_store_b16 v[2:3], v0, off
-; GFX11TRUE16-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11FAKE16-LABEL: test_load_store:
-; GFX11FAKE16:       ; %bb.0:
-; GFX11FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11FAKE16-NEXT:    global_load_u16 v0, v[0:1], off
-; GFX11FAKE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11FAKE16-NEXT:    global_store_b16 v[2:3], v0, off
-; GFX11FAKE16-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-LABEL: test_load_store:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    global_load_u16 v0, v[0:1], off
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    global_store_b16 v[2:3], v0, off
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %val = load bfloat, ptr addrspace(1) %in
   store bfloat %val, ptr addrspace(1) %out
   ret void
@@ -3652,21 +3644,13 @@ define void @test_bitcast_from_bfloat(ptr addrspace(1) %in, ptr addrspace(1) %ou
 ; GFX10-NEXT:    global_store_short v[2:3], v0, off
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11TRUE16-LABEL: test_bitcast_from_bfloat:
-; GFX11TRUE16:       ; %bb.0:
-; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11TRUE16-NEXT:    global_load_d16_b16 v0, v[0:1], off
-; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11TRUE16-NEXT:    global_store_b16 v[2:3], v0, off
-; GFX11TRUE16-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11FAKE16-LABEL: test_bitcast_from_bfloat:
-; GFX11FAKE16:       ; %bb.0:
-; GFX11FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11FAKE16-NEXT:    global_load_u16 v0, v[0:1], off
-; GFX11FAKE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11FAKE16-NEXT:    global_store_b16 v[2:3], v0, off
-; GFX11FAKE16-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-LABEL: test_bitcast_from_bfloat:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    global_load_u16 v0, v[0:1], off
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    global_store_b16 v[2:3], v0, off
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %val = load bfloat, ptr addrspace(1) %in
   %val_int = bitcast bfloat %val to i16
   store i16 %val_int, ptr addrspace(1) %out
@@ -3726,21 +3710,13 @@ define void @test_bitcast_to_bfloat(ptr addrspace(1) %out, ptr addrspace(1) %in)
 ; GFX10-NEXT:    global_store_short v[0:1], v2, off
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11TRUE16-LABEL: test_bitcast_to_bfloat:
-; GFX11TRUE16:       ; %bb.0:
-; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11TRUE16-NEXT:    global_load_d16_b16 v2, v[2:3], off
-; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11TRUE16-NEXT:    global_store_b16 v[0:1], v2, off
-; GFX11TRUE16-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11FAKE16-LABEL: test_bitcast_to_bfloat:
-; GFX11FAKE16:       ; %bb.0:
-; GFX11FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11FAKE16-NEXT:    global_load_u16 v2, v[2:3], off
-; GFX11FAKE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11FAKE16-NEXT:    global_store_b16 v[0:1], v2, off
-; GFX11FAKE16-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-LABEL: test_bitcast_to_bfloat:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    global_load_u16 v2, v[2:3], off
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    global_store_b16 v[0:1], v2, off
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %val = load i16, ptr addrspace(1) %in
   %val_fp = bitcast i16 %val to bfloat
   store bfloat %val_fp, ptr addrspace(1) %out
@@ -5676,23 +5652,14 @@ define bfloat @test_alloca_load_store_ret(bfloat %in) {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11TRUE16-LABEL: test_alloca_load_store_ret:
-; GFX11TRUE16:       ; %bb.0: ; %entry
-; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11TRUE16-NEXT:    scratch_store_b16 off, v0, s32 dlc
-; GFX11TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11TRUE16-NEXT:    scratch_load_d16_b16 v0, off, s32 glc dlc
-; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11TRUE16-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11FAKE16-LABEL: test_alloca_load_store_ret:
-; GFX11FAKE16:       ; %bb.0: ; %entry
-; GFX11FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11FAKE16-NEXT:    scratch_store_b16 off, v0, s32 dlc
-; GFX11FAKE16-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11FAKE16-NEXT:    scratch_load_u16 v0, off, s32 glc dlc
-; GFX11FAKE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11FAKE16-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-LABEL: test_alloca_load_store_ret:
+; GFX11:       ; %bb.0: ; %entry
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    scratch_store_b16 off, v0, s32 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    scratch_load_u16 v0, off, s32 glc dlc
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %in.addr = alloca bfloat, align 2, addrspace(5)
   store volatile bfloat %in, ptr addrspace(5) %in.addr, align 2
@@ -45726,34 +45693,34 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x
 ; GFX11TRUE16:       ; %bb.0:
 ; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11TRUE16-NEXT:    s_clause 0x1f
-; GFX11TRUE16-NEXT:    scratch_load_d16_b16 v31, off, s32
+; GFX11TRUE16-NEXT:    scratch_load_u16 v31, off, s32
 ; GFX11TRUE16-NEXT:    scratch_load_b32 v32, off, s32 offset:68
 ; GFX11TRUE16-NEXT:    scratch_load_b32 v33, off, s32 offset:72
-; GFX11TRUE16-NEXT:    scratch_load_b32 v34, off, s32 offset:124
-; GFX11TRUE16-NEXT:    scratch_load_b32 v35, off, s32 offset:128
-; GFX11TRUE16-NEXT:    scratch_load_b32 v36, off, s32 offset:64
-; GFX11TRUE16-NEXT:    scratch_load_b32 v37, off, s32 offset:60
-; GFX11TRUE16-NEXT:    scratch_load_b32 v38, off, s32 offset:120
-; GFX11TRUE16-NEXT:    scratch_load_b32 v39, off, s32 offset:56
-; GFX11TRUE16-NEXT:    scratch_load_b32 v48, off, s32 offset:116
-; GFX11TRUE16-NEXT:    scratch_load_b32 v49, off, s32 offset:52
-; GFX11TRUE16-NEXT:    scratch_load_b32 v50, off, s32 offset:112
-; GFX11TRUE16-NEXT:    scratch_load_b32 v51, off, s32 offset:48
-; GFX11TRUE16-NEXT:    scratch_load_b32 v52, off, s32 offset:108
-; GFX11TRUE16-NEXT:    scratch_load_b32 v53, off, s32 offset:44
-; GFX11TRUE16-NEXT:    scratch_load_b32 v54, off, s32 offset:104
-; GFX11TRUE16-NEXT:    scratch_load_b32 v55, off, s32 offset:40
-; GFX11TRUE16-NEXT:    scratch_load_b32 v64, off, s32 offset:100
-; GFX11TRUE16-NEXT:    scratch_load_b32 v65, off, s32 offset:36
-; GFX11TRUE16-NEXT:    scratch_load_b32 v66, off, s32 offset:96
-; GFX11TRUE16-NEXT:    scratch_load_b32 v67, off, s32 offset:32
-; GFX11TRUE16-NEXT:    scratch_load_b32 v68, off, s32 offset:92
-; GFX11TRUE16-NEXT:    scratch_load_b32 v69, off, s32 offset:28
-; GFX11TRUE16-NEXT:    scratch_load_b32 v70, off, s32 offset:88
-; GFX11TRUE16-NEXT:    scratch_load_b32 v71, off, s32 offset:24
-; GFX11TRUE16-NEXT:    scratch_load_b32 v80, off, s32 offset:84
-; GFX11TRUE16-NEXT:    scratch_load_b32 v81, off, s32 offset:20
-; GFX11TRUE16-NEXT:    scratch_load_b32 v82, off, s32 offset:76
+; GFX11TRUE16-NEXT:    scratch_load_b32 v34, off, s32 offset:76
+; GFX11TRUE16-NEXT:    scratch_load_b32 v35, off, s32 offset:124
+; GFX11TRUE16-NEXT:    scratch_load_b32 v36, off, s32 offset:128
+; GFX11TRUE16-NEXT:    scratch_load_b32 v37, off, s32 offset:64
+; GFX11TRUE16-NEXT:    scratch_load_b32 v38, off, s32 offset:60
+; GFX11TRUE16-NEXT:    scratch_load_b32 v39, off, s32 offset:120
+; GFX11TRUE16-NEXT:    scratch_load_b32 v48, off, s32 offset:56
+; GFX11TRUE16-NEXT:    scratch_load_b32 v49, off, s32 offset:116
+; GFX11TRUE16-NEXT:    scratch_load_b32 v50, off, s32 offset:52
+; GFX11TRUE16-NEXT:    scratch_load_b32 v51, off, s32 offset:112
+; GFX11TRUE16-NEXT:    scratch_load_b32 v52, off, s32 offset:48
+; GFX11TRUE16-NEXT:    scratch_load_b32 v53, off, s32 offset:108
+; GFX11TRUE16-NEXT:    scratch_load_b32 v54, off, s32 offset:44
+; GFX11TRUE16-NEXT:    scratch_load_b32 v55, off, s32 offset:104
+; GFX11TRUE16-NEXT:    scratch_load_b32 v64, off, s32 offset:40
+; GFX11TRUE16-NEXT:    scratch_load_b32 v65, off, s32 offset:100
+; GFX11TRUE16-NEXT:    scratch_load_b32 v66, off, s32 offset:36
+; GFX11TRUE16-NEXT:    scratch_load_b32 v67, off, s32 offset:96
+; GFX11TRUE16-NEXT:    scratch_load_b32 v68, off, s32 offset:32
+; GFX11TRUE16-NEXT:    scratch_load_b32 v69, off, s32 offset:92
+; GFX11TRUE16-NEXT:    scratch_load_b32 v70, off, s32 offset:28
+; GFX11TRUE16-NEXT:    scratch_load_b32 v71, off, s32 offset:88
+; GFX11TRUE16-NEXT:    scratch_load_b32 v80, off, s32 offset:24
+; GFX11TRUE16-NEXT:    scratch_load_b32 v81, off, s32 offset:84
+; GFX11TRUE16-NEXT:    scratch_load_b32 v82, off, s32 offset:20
 ; GFX11TRUE16-NEXT:    scratch_load_b32 v83, off, s32 offset:80
 ; GFX11TRUE16-NEXT:    scratch_load_b32 v84, off, s32 offset:16
 ; GFX11TRUE16-NEXT:    scratch_load_b32 v85, off, s32 offset:12
@@ -45823,45 +45790,45 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x
 ; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e64 s29, 1, v26
 ; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(32)
 ; GFX11TRUE16-NEXT:    v_and_b32_e32 v16, 1, v31
-; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(27)
-; GFX11TRUE16-NEXT:    v_cndmask_b16 v15.l, v35.l, v36.l, s26
 ; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(26)
-; GFX11TRUE16-NEXT:    v_cndmask_b16 v14.l, v34.l, v37.l, s27
-; GFX11TRUE16-NEXT:    v_cndmask_b16 v14.h, v34.h, v37.h, s28
-; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(24)
-; GFX11TRUE16-NEXT:    v_cndmask_b16 v13.l, v38.l, v39.l, s29
-; GFX11TRUE16-NEXT:    v_cndmask_b16 v13.h, v38.h, v39.h, s25
-; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(22)
-; GFX11TRUE16-NEXT:    v_cndmask_b16 v12.l, v48.l, v49.l, s24
-; GFX11TRUE16-NEXT:    v_cndmask_b16 v12.h, v48.h, v49.h, s23
-; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(20)
-; GFX11TRUE16-NEXT:    v_cndmask_b16 v11.l, v50.l, v51.l, s22
-; GFX11TRUE16-NEXT:    v_cndmask_b16 v11.h, v50.h, v51.h, s21
-; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(18)
-; GFX11TRUE16-NEXT:    v_cndmask_b16 v10.l, v52.l, v53.l, s20
-; GFX11TRUE16-NEXT:    v_cndmask_b16 v10.h, v52.h, v53.h, s19
-; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(16)
-; GFX11TRUE16-NEXT:    v_cndmask_b16 v9.l, v54.l, v55.l, s18
-; GFX11TRUE16-NEXT:    v_cndmask_b16 v9.h, v54.h, v55.h, s17
-; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(14)
-; GFX11TRUE16-NEXT:    v_cndmask_b16 v8.l, v64.l, v65.l, s16
-; GFX11TRUE16-NEXT:    v_cndmask_b16 v8.h, v64.h, v65.h, s15
-; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(12)
-; GFX11TRUE16-NEXT:    v_cndmask_b16 v7.l, v66.l, v67.l, s14
-; GFX11TRUE16-NEXT:    v_cndmask_b16 v7.h, v66.h, v67.h, s13
-; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(10)
-; GFX11TRUE16-NEXT:    v_cndmask_b16 v6.l, v68.l, v69.l, s12
-; GFX11TRUE16-NEXT:    v_cndmask_b16 v6.h, v68.h, v69.h, s11
-; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(8)
-; GFX11TRUE16-NEXT:    v_cndmask_b16 v5.l, v70.l, v71.l, s10
-; GFX11TRUE16-NEXT:    v_cndmask_b16 v5.h, v70.h, v71.h, s9
-; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(6)
-; GFX11TRUE16-NEXT:    v_cndmask_b16 v4.l, v80.l, v81.l, s8
-; GFX11TRUE16-NEXT:    v_cndmask_b16 v4.h, v80.h, v81.h, s7
+; GFX11TRUE16-NEXT:    v_cndmask_b16 v15.l, v36.l, v37.l, s26
+; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(25)
+; GFX11TRUE16-NEXT:    v_cndmask_b16 v14.l, v35.l, v38.l, s27
+; GFX11TRUE16-NEXT:    v_cndmask_b16 v14.h, v35.h, v38.h, s28
+; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(23)
+; GFX11TRUE16-NEXT:    v_cndmask_b16 v13.l, v39.l, v48.l, s29
+; GFX11TRUE16-NEXT:    v_cndmask_b16 v13.h, v39.h, v48.h, s25
+; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(21)
+; GFX11TRUE16-NEXT:    v_cndmask_b16 v12.l, v49.l, v50.l, s24
+; GFX11TRUE16-NEXT:    v_cndmask_b16 v12.h, v49.h, v50.h, s23
+; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(19)
+; GFX11TRUE16-NEXT:    v_cndmask_b16 v11.l, v51.l, v52.l, s22
+; GFX11TRUE16-NEXT:    v_cndmask_b16 v11.h, v51.h, v52.h, s21
+; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(17)
+; GFX11TRUE16-NEXT:    v_cndmask_b16 v10.l, v53.l, v54.l, s20
+; GFX11TRUE16-NEXT:    v_cndmask_b16 v10.h, v53.h, v54.h, s19
+; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(15)
+; GFX11TRUE16-NEXT:    v_cndmask_b16 v9.l, v55.l, v64.l, s18
+; GFX11TRUE16-NEXT:    v_cndmask_b16 v9.h, v55.h, v64.h, s17
+; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(13)
+; GFX11TRUE16-NEXT:    v_cndmask_b16 v8.l, v65.l, v66.l, s16
+; GFX11TRUE16-NEXT:    v_cndmask_b16 v8.h, v65.h, v66.h, s15
+; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(11)
+; GFX11TRUE16-NEXT:    v_cndmask_b16 v7.l, v67.l, v68.l, s14
+; GFX11TRUE16-NEXT:    v_cndmask_b16 v7.h, v67.h, v68.h, s13
+; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(9)
+; GFX11TRUE16-NEXT:    v_cndmask_b16 v6.l, v69.l, v70.l, s12
+; GFX11TRUE16-NEXT:    v_cndmask_b16 v6.h, v69.h, v70.h, s11
+; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(7)
+; GFX11TRUE16-NEXT:    v_cndmask_b16 v5.l, v71.l, v80.l, s10
+; GFX11TRUE16-NEXT:    v_cndmask_b16 v5.h, v71.h, v80.h, s9
+; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(5)
+; GFX11TRUE16-NEXT:    v_cndmask_b16 v4.l, v81.l, v82.l, s8
+; GFX11TRUE16-NEXT:    v_cndmask_b16 v4.h, v81.h, v82.h, s7
 ; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(3)
 ; GFX11TRUE16-NEXT:    v_cndmask_b16 v3.l, v83.l, v84.l, s6
 ; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(2)
-; GFX11TRUE16-NEXT:    v_cndmask_b16 v2.l, v82.l, v85.l, s4
+; GFX11TRUE16-NEXT:    v_cndmask_b16 v2.l, v34.l, v85.l, s4
 ; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(1)
 ; GFX11TRUE16-NEXT:    v_cndmask_b16 v1.l, v33.l, v86.l, s2
 ; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(0)
@@ -45869,9 +45836,9 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x
 ; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e64 s0, 1, v16
 ; GFX11TRUE16-NEXT:    v_cndmask_b16 v0.h, v32.h, v87.h, vcc_lo
 ; GFX11TRUE16-NEXT:    v_cndmask_b16 v1.h, v33.h, v86.h, s1
-; GFX11TRUE16-NEXT:    v_cndmask_b16 v2.h, v82.h, v85.h, s3
+; GFX11TRUE16-NEXT:    v_cndmask_b16 v2.h, v34.h, v85.h, s3
 ; GFX11TRUE16-NEXT:    v_cndmask_b16 v3.h, v83.h, v84.h, s5
-; GFX11TRUE16-NEXT:    v_cndmask_b16 v15.h, v35.h, v36.h, s0
+; GFX11TRUE16-NEXT:    v_cndmask_b16 v15.h, v36.h, v37.h, s0
 ; GFX11TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11FAKE16-LABEL: v_vselect_v32bf16:
diff --git a/llvm/test/CodeGen/AMDGPU/branch-relaxation-inst-size-gfx11.ll b/llvm/test/CodeGen/AMDGPU/branch-relaxation-inst-size-gfx11.ll
index dd389375b0d77..2b6d9cc349278 100644
--- a/llvm/test/CodeGen/AMDGPU/branch-relaxation-inst-size-gfx11.ll
+++ b/llvm/test/CodeGen/AMDGPU/branch-relaxation-inst-size-gfx11.ll
@@ -18,15 +18,15 @@ define amdgpu_kernel void @long_forward_branch_gfx11plus(ptr addrspace(1) %in, p
 ; GFX11-NEXT:    s_setpc_b64 s[6:7]
 ; GFX11-NEXT:  .LBB0_1: ; %bb2
 ; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-NEXT:    v_mov_b32_e32 v1, 0
+; GFX11-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    global_load_d16_b16 v0, v1, s[0:1]
-; GFX11-NEXT:    global_load_d16_hi_b16 v0, v1, s[0:1] offset:2
+; GFX11-NEXT:    global_load_u16 v0, v2, s[0:1]
+; GFX11-NEXT:    global_load_u16 v1, v2, s[0:1] offset:2
 ; GFX11-NEXT:    s_waitcnt vmcnt(1)
-; GFX11-NEXT:    global_store_b16 v1, v0, s[2:3]
+; GFX11-NEXT:    global_store_b16 v2, v0, s[2:3]
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    global_store_d16_hi_b16 v1, v0, s[2:3] offset:2
+; GFX11-NEXT:    global_store_b16 v2, v1, s[2:3] offset:2
 ; GFX11-NEXT:  .LBB0_2: ; %bb3
 ; GFX11-NEXT:    s_endpgm
 bb0:
diff --git a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll
index b8dd377377dab..0eab82778c8db 100644
--- a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll
@@ -5091,7 +5091,7 @@ define amdgpu_kernel void @test_call_external_void_func_sret_struct_i8_i32_byval
 ; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v0, 8
 ; GFX11-TRUE16-NEXT:    s_swappc_b64 s[30:31], s[2:3]
 ; GFX11-TRUE16-NEXT:    s_clause 0x1
-; GFX11-TRUE16-NEXT:    scratch_load_d16_u8 v0, off, off offset:8
+; GFX11-TRUE16-NEXT:    scratch_load_u8 v0, off, off offset:8
 ; GFX11-TRUE16-NEXT:    scratch_load_b32 v1, off, off offset:12
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s2, -1
diff --git a/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll b/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll
index d4581672dab39..d07bce4ad45d5 100644
--- a/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll
+++ b/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll
@@ -48,25 +48,15 @@ define <2 x half> @chain_hi_to_lo_private() {
 ; FLATSCR_GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; FLATSCR_GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-TRUE16-LABEL: chain_hi_to_lo_private:
-; GFX11-TRUE16:       ; %bb.0: ; %bb
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT:    s_mov_b32 s0, 2
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v0, off, s0
-; GFX11-TRUE16-NEXT:    s_mov_b32 s0, 0
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v0, off, s0
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-FAKE16-LABEL: chain_hi_to_lo_private:
-; GFX11-FAKE16:       ; %bb.0: ; %bb
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-FAKE16-NEXT:    s_mov_b32 s0, 2
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v0, off, s0
-; GFX11-FAKE16-NEXT:    s_mov_b32 s0, 0
-; GFX11-FAKE16-NEXT:    scratch_load_d16_hi_b16 v0, off, s0
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-LABEL: chain_hi_to_lo_private:
+; GFX11:       ; %bb.0: ; %bb
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_mov_b32 s0, 2
+; GFX11-NEXT:    scratch_load_u16 v0, off, s0
+; GFX11-NEXT:    s_mov_b32 s0, 0
+; GFX11-NEXT:    scratch_load_d16_hi_b16 v0, off, s0
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
 bb:
   %gep_lo = getelementptr inbounds half, ptr addrspace(5) null, i64 1
   %load_lo = load half, ptr addrspace(5) %gep_lo
@@ -114,21 +104,13 @@ define <2 x half> @chain_hi_to_lo_private_different_bases(ptr addrspace(5) %base
 ; FLATSCR_GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; FLATSCR_GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-TRUE16-LABEL: chain_hi_to_lo_private_different_bases:
-; GFX11-TRUE16:       ; %bb.0: ; %bb
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v0, v0, off
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v0, v1, off
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-FAKE16-LABEL: chain_hi_to_lo_private_different_bases:
-; GFX11-FAKE16:       ; %bb.0: ; %bb
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v0, v0, off
-; GFX11-FAKE16-NEXT:    scratch_load_d16_hi_b16 v0, v1, off
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-LABEL: chain_hi_to_lo_private_different_bases:
+; GFX11:       ; %bb.0: ; %bb
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    scratch_load_u16 v0, v0, off
+; GFX11-NEXT:    scratch_load_d16_hi_b16 v0, v1, off
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
 bb:
   %load_lo = load half, ptr addrspace(5) %base_lo
   %load_hi = load half, ptr addrspace(5) %base_hi
@@ -325,29 +307,17 @@ define <2 x half> @chain_hi_to_lo_global() {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-TRUE16-LABEL: chain_hi_to_lo_global:
-; GFX11-TRUE16:       ; %bb.0: ; %bb
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v0, 2
-; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v1, 0
-; GFX11-TRUE16-NEXT:    global_load_d16_b16 v0, v[0:1], off
-; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v1, 0
-; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v2, 0
-; GFX11-TRUE16-NEXT:    global_load_d16_hi_b16 v0, v[1:2], off
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-FAKE16-LABEL: chain_hi_to_lo_global:
-; GFX11-FAKE16:       ; %bb.0: ; %bb
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v0, 2
-; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v1, 0
-; GFX11-FAKE16-NEXT:    global_load_u16 v0, v[0:1], off
-; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v1, 0
-; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v2, 0
-; GFX11-FAKE16-NEXT:    global_load_d16_hi_b16 v0, v[1:2], off
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-LABEL: chain_hi_to_lo_global:
+; GFX11:       ; %bb.0: ; %bb
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_mov_b32_e32 v0, 2
+; GFX11-NEXT:    v_mov_b32_e32 v1, 0
+; GFX11-NEXT:    global_load_u16 v0, v[0:1], off
+; GFX11-NEXT:    v_mov_b32_e32 v1, 0
+; GFX11-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-NEXT:    global_load_d16_hi_b16 v0, v[1:2], off
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
 bb:
   %gep_lo = getelementptr inbounds half, ptr addrspace(1) null, i64 1
   %load_lo = load half, ptr addrspace(1) %gep_lo
@@ -377,21 +347,13 @@ define <2 x half> @chain_hi_to_lo_global_different_bases(ptr addrspace(1) %base_
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-TRUE16-LABEL: chain_hi_to_lo_global_different_bases:
-; GFX11-TRUE16:       ; %bb.0: ; %bb
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT:    global_load_d16_b16 v0, v[0:1], off
-; GFX11-TRUE16-NEXT:    global_load_d16_hi_b16 v0, v[2:3], off
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-FAKE16-LABEL: chain_hi_to_lo_global_different_bases:
-; GFX11-FAKE16:       ; %bb.0: ; %bb
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-FAKE16-NEXT:    global_load_u16 v0, v[0:1], off
-; GFX11-FAKE16-NEXT:    global_load_d16_hi_b16 v0, v[2:3], off
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-LABEL: chain_hi_to_lo_global_different_bases:
+; GFX11:       ; %bb.0: ; %bb
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    global_load_u16 v0, v[0:1], off
+; GFX11-NEXT:    global_load_d16_hi_b16 v0, v[2:3], off
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
 bb:
   %load_lo = load half, ptr addrspace(1) %base_lo
   %load_hi = load half, ptr addrspace(1) %base_hi
@@ -459,29 +421,17 @@ define <2 x half> @chain_hi_to_lo_flat(ptr inreg %ptr) {
 ; FLATSCR_GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; FLATSCR_GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-TRUE16-LABEL: chain_hi_to_lo_flat:
-; GFX11-TRUE16:       ; %bb.0: ; %bb
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-TRUE16-NEXT:    flat_load_d16_b16 v0, v[0:1] offset:2
-; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v1, 0
-; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v2, 0
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT:    flat_load_d16_hi_b16 v0, v[1:2]
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-FAKE16-LABEL: chain_hi_to_lo_flat:
-; GFX11-FAKE16:       ; %bb.0: ; %bb
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-FAKE16-NEXT:    flat_load_u16 v0, v[0:1] offset:2
-; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v1, 0
-; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v2, 0
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-FAKE16-NEXT:    flat_load_d16_hi_b16 v0, v[1:2]
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-LABEL: chain_hi_to_lo_flat:
+; GFX11:       ; %bb.0: ; %bb
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT:    flat_load_u16 v0, v[0:1] offset:2
+; GFX11-NEXT:    v_mov_b32_e32 v1, 0
+; GFX11-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    flat_load_d16_hi_b16 v0, v[1:2]
+; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
 bb:
   %gep_lo = getelementptr inbounds half, ptr %ptr, i64 1
   %load_lo = load half, ptr %gep_lo
@@ -512,23 +462,14 @@ define <2 x half> @chain_hi_to_lo_flat_different_bases(ptr %base_lo, ptr %base_h
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-TRUE16-LABEL: chain_hi_to_lo_flat_different_bases:
-; GFX11-TRUE16:       ; %bb.0: ; %bb
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT:    flat_load_d16_b16 v0, v[0:1]
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT:    flat_load_d16_hi_b16 v0, v[2:3]
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-FAKE16-LABEL: chain_hi_to_lo_flat_different_bases:
-; GFX11-FAKE16:       ; %bb.0: ; %bb
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-FAKE16-NEXT:    flat_load_u16 v0, v[0:1]
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-FAKE16-NEXT:    flat_load_d16_hi_b16 v0, v[2:3]
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-LABEL: chain_hi_to_lo_flat_different_bases:
+; GFX11:       ; %bb.0: ; %bb
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    flat_load_u16 v0, v[0:1]
+; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    flat_load_d16_hi_b16 v0, v[2:3]
+; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
 bb:
   %load_lo = load half, ptr %base_lo
   %load_hi = load half, ptr %base_hi
@@ -677,23 +618,25 @@ define amdgpu_kernel void @vload2_private(ptr addrspace(1) nocapture readonly %i
 ; GFX11-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
 ; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT:    global_load_d16_b16 v0, v2, s[0:1]
+; GFX11-TRUE16-NEXT:    global_load_u16 v0, v2, s[0:1]
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-TRUE16-NEXT:    scratch_store_b16 off, v0, off dlc
 ; GFX11-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-TRUE16-NEXT:    global_load_d16_b16 v0, v2, s[0:1] offset:2
+; GFX11-TRUE16-NEXT:    global_load_u16 v0, v2, s[0:1] offset:2
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-TRUE16-NEXT:    scratch_store_b16 off, v0, off offset:2 dlc
 ; GFX11-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-TRUE16-NEXT:    global_load_d16_b16 v0, v2, s[0:1] offset:4
+; GFX11-TRUE16-NEXT:    global_load_u16 v0, v2, s[0:1] offset:4
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-TRUE16-NEXT:    scratch_store_b16 off, v0, off offset:4 dlc
 ; GFX11-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v0, off, off offset:2
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v0.h
 ; GFX11-TRUE16-NEXT:    s_clause 0x1
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v0, off, off
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v3, off, off offset:2
+; GFX11-TRUE16-NEXT:    scratch_load_u16 v0, off, off
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v1, v3
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.h, v3.l
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v1, off, off offset:4
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-TRUE16-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
@@ -962,13 +905,14 @@ define <2 x i16> @chain_hi_to_lo_global_other_dep(ptr addrspace(1) %ptr) {
 ; GFX11-TRUE16-LABEL: chain_hi_to_lo_global_other_dep:
 ; GFX11-TRUE16:       ; %bb.0: ; %bb
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT:    global_load_d16_b16 v2, v[0:1], off offset:2 glc dlc
+; GFX11-TRUE16-NEXT:    global_load_u16 v2, v[0:1], off offset:2 glc dlc
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-TRUE16-NEXT:    global_load_d16_hi_b16 v0, v[0:1], off glc dlc
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v2.l
 ; GFX11-TRUE16-NEXT:    v_pk_add_u16 v0, v0, 12 op_sel_hi:[1,0]
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_bfi_b32 v0, 0xffff, v2, v0
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v0, 0xffff, v1, v0
 ; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-FAKE16-LABEL: chain_hi_to_lo_global_other_dep:
@@ -1035,14 +979,15 @@ define <2 x i16> @chain_hi_to_lo_flat_other_dep(ptr addrspace(0) %ptr) {
 ; GFX11-TRUE16-LABEL: chain_hi_to_lo_flat_other_dep:
 ; GFX11-TRUE16:       ; %bb.0: ; %bb
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT:    flat_load_d16_b16 v2, v[0:1] offset:2 glc dlc
+; GFX11-TRUE16-NEXT:    flat_load_u16 v2, v[0:1] offset:2 glc dlc
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-TRUE16-NEXT:    flat_load_d16_hi_b16 v0, v[0:1] glc dlc
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(1)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v2.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-TRUE16-NEXT:    v_pk_add_u16 v0, v0, 12 op_sel_hi:[1,0]
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_bfi_b32 v0, 0xffff, v2, v0
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v0, 0xffff, v1, v0
 ; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-FAKE16-LABEL: chain_hi_to_lo_flat_other_dep:
diff --git a/llvm/test/CodeGen/AMDGPU/clamp-modifier.ll b/llvm/test/CodeGen/AMDGPU/clamp-modifier.ll
index ccdc0b1bf43c4..5d74fe3d3c470 100644
--- a/llvm/test/CodeGen/AMDGPU/clamp-modifier.ll
+++ b/llvm/test/CodeGen/AMDGPU/clamp-modifier.ll
@@ -490,7 +490,7 @@ define amdgpu_kernel void @v_clamp_add_src_f16_denorm(ptr addrspace(1) %out, ptr
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 1, v0
 ; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT:    global_load_d16_b16 v0, v1, s[2:3]
+; GFX11-TRUE16-NEXT:    global_load_u16 v0, v1, s[2:3]
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-TRUE16-NEXT:    v_add_f16_e64 v0.l, v0.l, 1.0 clamp
 ; GFX11-TRUE16-NEXT:    global_store_b16 v1, v0, s[0:1]
@@ -573,7 +573,7 @@ define amdgpu_kernel void @v_clamp_add_src_f16_no_denormals(ptr addrspace(1) %ou
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 1, v0
 ; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT:    global_load_d16_b16 v0, v1, s[2:3]
+; GFX11-TRUE16-NEXT:    global_load_u16 v0, v1, s[2:3]
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-TRUE16-NEXT:    v_add_f16_e64 v0.l, v0.l, 1.0 clamp
 ; GFX11-TRUE16-NEXT:    global_store_b16 v1, v0, s[0:1]
@@ -1555,18 +1555,18 @@ define amdgpu_kernel void @v_no_clamp_add_src_v2f16_f16_src(ptr addrspace(1) %ou
 ; GFX11-TRUE16-LABEL: v_no_clamp_add_src_v2f16_f16_src:
 ; GFX11-TRUE16:       ; %bb.0:
 ; GFX11-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0x3ff, v0
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v0, 1, v1
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 2, v1
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 1, v0
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT:    global_load_d16_b16 v0, v0, s[2:3]
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.h, 0
+; GFX11-TRUE16-NEXT:    global_load_u16 v1, v1, s[2:3]
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    v_add_f16_e32 v0.l, 1.0, v0.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v1.h, 0
+; GFX11-TRUE16-NEXT:    v_add_f16_e32 v1.l, 1.0, v1.l
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_pk_max_f16 v0, v0, v0 clamp
-; GFX11-TRUE16-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX11-TRUE16-NEXT:    v_pk_max_f16 v1, v1, v1 clamp
+; GFX11-TRUE16-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-TRUE16-NEXT:    s_endpgm
 ;
 ; GFX11-FAKE16-LABEL: v_no_clamp_add_src_v2f16_f16_src:
@@ -1969,7 +1969,7 @@ define amdgpu_kernel void @v_clamp_add_src_f16_denorm_minimumnum_maximumnum(ptr
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 1, v0
 ; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT:    global_load_d16_b16 v0, v1, s[2:3]
+; GFX11-TRUE16-NEXT:    global_load_u16 v0, v1, s[2:3]
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-TRUE16-NEXT:    v_add_f16_e64 v0.l, v0.l, 1.0 clamp
 ; GFX11-TRUE16-NEXT:    global_store_b16 v1, v0, s[0:1]
@@ -2052,7 +2052,7 @@ define amdgpu_kernel void @v_clamp_add_src_f16_no_denormals_minimumnum_maximumnu
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 1, v0
 ; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT:    global_load_d16_b16 v0, v1, s[2:3]
+; GFX11-TRUE16-NEXT:    global_load_u16 v0, v1, s[2:3]
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-TRUE16-NEXT:    v_add_f16_e64 v0.l, v0.l, 1.0 clamp
 ; GFX11-TRUE16-NEXT:    global_store_b16 v1, v0, s[0:1]
diff --git a/llvm/test/CodeGen/AMDGPU/clamp.ll b/llvm/test/CodeGen/AMDGPU/clamp.ll
index 5eb6b2f58474d..04f2a5ed1a5fd 100644
--- a/llvm/test/CodeGen/AMDGPU/clamp.ll
+++ b/llvm/test/CodeGen/AMDGPU/clamp.ll
@@ -594,7 +594,7 @@ define amdgpu_kernel void @v_clamp_f16(ptr addrspace(1) %out, ptr addrspace(1) %
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 1, v0
 ; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT:    global_load_d16_b16 v0, v1, s[2:3]
+; GFX11-TRUE16-NEXT:    global_load_u16 v0, v1, s[2:3]
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-TRUE16-NEXT:    v_max_f16_e64 v0.l, v0.l, v0.l clamp
 ; GFX11-TRUE16-NEXT:    global_store_b16 v1, v0, s[0:1]
@@ -702,7 +702,7 @@ define amdgpu_kernel void @v_clamp_neg_f16(ptr addrspace(1) %out, ptr addrspace(
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 1, v0
 ; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT:    global_load_d16_b16 v0, v1, s[2:3]
+; GFX11-TRUE16-NEXT:    global_load_u16 v0, v1, s[2:3]
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-TRUE16-NEXT:    v_max_f16_e64 v0.l, -v0.l, -v0.l clamp
 ; GFX11-TRUE16-NEXT:    global_store_b16 v1, v0, s[0:1]
@@ -811,7 +811,7 @@ define amdgpu_kernel void @v_clamp_negabs_f16(ptr addrspace(1) %out, ptr addrspa
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 1, v0
 ; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT:    global_load_d16_b16 v0, v1, s[2:3]
+; GFX11-TRUE16-NEXT:    global_load_u16 v0, v1, s[2:3]
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-TRUE16-NEXT:    v_max_f16_e64 v0.l, -|v0.l|, -|v0.l| clamp
 ; GFX11-TRUE16-NEXT:    global_store_b16 v1, v0, s[0:1]
diff --git a/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll b/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll
index 5fb50d0d89530..d896082e858c0 100644
--- a/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll
@@ -3,6 +3,7 @@
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
 ; RUN: llc -mtriple=amdgcn -mcpu=kaveri < %s | FileCheck -enable-var-scope -check-prefixes=CI %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11-TRUE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11-D16-TRUE16 %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11-FAKE16 %s
 
 declare half @llvm.fabs.f16(half) #0
@@ -57,6 +58,15 @@ define amdgpu_kernel void @test_fold_canonicalize_undef_value_f16(ptr addrspace(
 ; GFX11-TRUE16-NEXT:    global_store_b16 v1, v0, s[0:1]
 ; GFX11-TRUE16-NEXT:    s_endpgm
 ;
+; GFX11-D16-TRUE16-LABEL: test_fold_canonicalize_undef_value_f16:
+; GFX11-D16-TRUE16:       ; %bb.0:
+; GFX11-D16-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-D16-TRUE16-NEXT:    v_mov_b32_e32 v1, 0
+; GFX11-D16-TRUE16-NEXT:    v_mov_b16_e32 v0.l, 0
+; GFX11-D16-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-D16-TRUE16-NEXT:    global_store_b16 v1, v0, s[0:1]
+; GFX11-D16-TRUE16-NEXT:    s_endpgm
+;
 ; GFX11-FAKE16-LABEL: test_fold_canonicalize_undef_value_f16:
 ; GFX11-FAKE16:       ; %bb.0:
 ; GFX11-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
@@ -111,12 +121,23 @@ define amdgpu_kernel void @v_test_canonicalize_var_f16(ptr addrspace(1) %out) #1
 ; GFX11-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT:    global_load_d16_b16 v0, v0, s[0:1]
+; GFX11-TRUE16-NEXT:    global_load_u16 v0, v0, s[0:1]
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-TRUE16-NEXT:    v_max_f16_e32 v0.l, v0.l, v0.l
 ; GFX11-TRUE16-NEXT:    global_store_b16 v[0:1], v0, off
 ; GFX11-TRUE16-NEXT:    s_endpgm
 ;
+; GFX11-D16-TRUE16-LABEL: v_test_canonicalize_var_f16:
+; GFX11-D16-TRUE16:       ; %bb.0:
+; GFX11-D16-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-D16-TRUE16-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-D16-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-D16-TRUE16-NEXT:    global_load_u16 v0, v0, s[0:1]
+; GFX11-D16-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-D16-TRUE16-NEXT:    v_max_f16_e32 v0.l, v0.l, v0.l
+; GFX11-D16-TRUE16-NEXT:    global_store_b16 v[0:1], v0, off
+; GFX11-D16-TRUE16-NEXT:    s_endpgm
+;
 ; GFX11-FAKE16-LABEL: v_test_canonicalize_var_f16:
 ; GFX11-FAKE16:       ; %bb.0:
 ; GFX11-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
@@ -179,6 +200,17 @@ define amdgpu_kernel void @s_test_canonicalize_var_f16(ptr addrspace(1) %out, i1
 ; GFX11-TRUE16-NEXT:    global_store_b16 v1, v0, s[0:1]
 ; GFX11-TRUE16-NEXT:    s_endpgm
 ;
+; GFX11-D16-TRUE16-LABEL: s_test_canonicalize_var_f16:
+; GFX11-D16-TRUE16:       ; %bb.0:
+; GFX11-D16-TRUE16-NEXT:    s_clause 0x1
+; GFX11-D16-TRUE16-NEXT:    s_load_b32 s2, s[4:5], 0x2c
+; GFX11-D16-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-D16-TRUE16-NEXT:    v_mov_b32_e32 v1, 0
+; GFX11-D16-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-D16-TRUE16-NEXT:    v_max_f16_e64 v0.l, s2, s2
+; GFX11-D16-TRUE16-NEXT:    global_store_b16 v1, v0, s[0:1]
+; GFX11-D16-TRUE16-NEXT:    s_endpgm
+;
 ; GFX11-FAKE16-LABEL: s_test_canonicalize_var_f16:
 ; GFX11-FAKE16:       ; %bb.0:
 ; GFX11-FAKE16-NEXT:    s_clause 0x1
@@ -229,6 +261,14 @@ define <2 x half> @v_test_canonicalize_build_vector_v2f16(half %lo, half %hi) #1
 ; GFX11-TRUE16-NEXT:    v_pk_max_f16 v0, v0, v0
 ; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX11-D16-TRUE16-LABEL: v_test_canonicalize_build_vector_v2f16:
+; GFX11-D16-TRUE16:       ; %bb.0:
+; GFX11-D16-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-D16-TRUE16-NEXT:    v_mov_b16_e32 v0.h, v1.l
+; GFX11-D16-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-D16-TRUE16-NEXT:    v_pk_max_f16 v0, v0, v0
+; GFX11-D16-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX11-FAKE16-LABEL: v_test_canonicalize_build_vector_v2f16:
 ; GFX11-FAKE16:       ; %bb.0:
 ; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -284,12 +324,23 @@ define amdgpu_kernel void @v_test_canonicalize_fabs_var_f16(ptr addrspace(1) %ou
 ; GFX11-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT:    global_load_d16_b16 v0, v1, s[0:1]
+; GFX11-TRUE16-NEXT:    global_load_u16 v0, v1, s[0:1]
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-TRUE16-NEXT:    v_max_f16_e64 v0.l, |v0.l|, |v0.l|
 ; GFX11-TRUE16-NEXT:    global_store_b16 v1, v0, s[0:1]
 ; GFX11-TRUE16-NEXT:    s_endpgm
 ;
+; GFX11-D16-TRUE16-LABEL: v_test_canonicalize_fabs_var_f16:
+; GFX11-D16-TRUE16:       ; %bb.0:
+; GFX11-D16-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-D16-TRUE16-NEXT:    v_mov_b32_e32 v1, 0
+; GFX11-D16-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-D16-TRUE16-NEXT:    global_load_u16 v0, v1, s[0:1]
+; GFX11-D16-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-D16-TRUE16-NEXT:    v_max_f16_e64 v0.l, |v0.l|, |v0.l|
+; GFX11-D16-TRUE16-NEXT:    global_store_b16 v1, v0, s[0:1]
+; GFX11-D16-TRUE16-NEXT:    s_endpgm
+;
 ; GFX11-FAKE16-LABEL: v_test_canonicalize_fabs_var_f16:
 ; GFX11-FAKE16:       ; %bb.0:
 ; GFX11-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
@@ -349,12 +400,23 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_f16(ptr addrspace(1
 ; GFX11-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT:    global_load_d16_b16 v0, v1, s[0:1]
+; GFX11-TRUE16-NEXT:    global_load_u16 v0, v1, s[0:1]
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-TRUE16-NEXT:    v_max_f16_e64 v0.l, -|v0.l|, -|v0.l|
 ; GFX11-TRUE16-NEXT:    global_store_b16 v1, v0, s[0:1]
 ; GFX11-TRUE16-NEXT:    s_endpgm
 ;
+; GFX11-D16-TRUE16-LABEL: v_test_canonicalize_fneg_fabs_var_f16:
+; GFX11-D16-TRUE16:       ; %bb.0:
+; GFX11-D16-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-D16-TRUE16-NEXT:    v_mov_b32_e32 v1, 0
+; GFX11-D16-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-D16-TRUE16-NEXT:    global_load_u16 v0, v1, s[0:1]
+; GFX11-D16-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-D16-TRUE16-NEXT:    v_max_f16_e64 v0.l, -|v0.l|, -|v0.l|
+; GFX11-D16-TRUE16-NEXT:    global_store_b16 v1, v0, s[0:1]
+; GFX11-D16-TRUE16-NEXT:    s_endpgm
+;
 ; GFX11-FAKE16-LABEL: v_test_canonicalize_fneg_fabs_var_f16:
 ; GFX11-FAKE16:       ; %bb.0:
 ; GFX11-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
@@ -415,12 +477,23 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_var_f16(ptr addrspace(1) %ou
 ; GFX11-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT:    global_load_d16_b16 v0, v1, s[0:1]
+; GFX11-TRUE16-NEXT:    global_load_u16 v0, v1, s[0:1]
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-TRUE16-NEXT:    v_max_f16_e64 v0.l, -v0.l, -v0.l
 ; GFX11-TRUE16-NEXT:    global_store_b16 v1, v0, s[0:1]
 ; GFX11-TRUE16-NEXT:    s_endpgm
 ;
+; GFX11-D16-TRUE16-LABEL: v_test_canonicalize_fneg_var_f16:
+; GFX11-D16-TRUE16:       ; %bb.0:
+; GFX11-D16-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-D16-TRUE16-NEXT:    v_mov_b32_e32 v1, 0
+; GFX11-D16-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-D16-TRUE16-NEXT:    global_load_u16 v0, v1, s[0:1]
+; GFX11-D16-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-D16-TRUE16-NEXT:    v_max_f16_e64 v0.l, -v0.l, -v0.l
+; GFX11-D16-TRUE16-NEXT:    global_store_b16 v1, v0, s[0:1]
+; GFX11-D16-TRUE16-NEXT:    s_endpgm
+;
 ; GFX11-FAKE16-LABEL: v_test_canonicalize_fneg_var_f16:
 ; GFX11-FAKE16:       ; %bb.0:
 ; GFX11-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
@@ -480,12 +553,23 @@ define amdgpu_kernel void @v_test_no_denormals_canonicalize_fneg_var_f16(ptr add
 ; GFX11-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT:    global_load_d16_b16 v0, v1, s[0:1]
+; GFX11-TRUE16-NEXT:    global_load_u16 v0, v1, s[0:1]
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-TRUE16-NEXT:    v_max_f16_e64 v0.l, -v0.l, -v0.l
 ; GFX11-TRUE16-NEXT:    global_store_b16 v1, v0, s[0:1]
 ; GFX11-TRUE16-NEXT:    s_endpgm
 ;
+; GFX11-D16-TRUE16-LABEL: v_test_no_denormals_canonicalize_fneg_var_f16:
+; GFX11-D16-TRUE16:       ; %bb.0:
+; GFX11-D16-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-D16-TRUE16-NEXT:    v_mov_b32_e32 v1, 0
+; GFX11-D16-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-D16-TRUE16-NEXT:    global_load_u16 v0, v1, s[0:1]
+; GFX11-D16-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-D16-TRUE16-NEXT:    v_max_f16_e64 v0.l, -v0.l, -v0.l
+; GFX11-D16-TRUE16-NEXT:    global_store_b16 v1, v0, s[0:1]
+; GFX11-D16-TRUE16-NEXT:    s_endpgm
+;
 ; GFX11-FAKE16-LABEL: v_test_no_denormals_canonicalize_fneg_var_f16:
 ; GFX11-FAKE16:       ; %bb.0:
 ; GFX11-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
@@ -545,12 +629,23 @@ define amdgpu_kernel void @v_test_no_denormals_canonicalize_fneg_fabs_var_f16(pt
 ; GFX11-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT:    global_load_d16_b16 v0, v1, s[0:1]
+; GFX11-TRUE16-NEXT:    global_load_u16 v0, v1, s[0:1]
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-TRUE16-NEXT:    v_max_f16_e64 v0.l, -|v0.l|, -|v0.l|
 ; GFX11-TRUE16-NEXT:    global_store_b16 v1, v0, s[0:1]
 ; GFX11-TRUE16-NEXT:    s_endpgm
 ;
+; GFX11-D16-TRUE16-LABEL: v_test_no_denormals_canonicalize_fneg_fabs_var_f16:
+; GFX11-D16-TRUE16:       ; %bb.0:
+; GFX11-D16-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-D16-TRUE16-NEXT:    v_mov_b32_e32 v1, 0
+; GFX11-D16-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-D16-TRUE16-NEXT:    global_load_u16 v0, v1, s[0:1]
+; GFX11-D16-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-D16-TRUE16-NEXT:    v_max_f16_e64 v0.l, -|v0.l|, -|v0.l|
+; GFX11-D16-TRUE16-NEXT:    global_store_b16 v1, v0, s[0:1]
+; GFX11-D16-TRUE16-NEXT:    s_endpgm
+;
 ; GFX11-FAKE16-LABEL: v_test_no_denormals_canonicalize_fneg_fabs_var_f16:
 ; GFX11-FAKE16:       ; %bb.0:
 ; GFX11-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
@@ -607,6 +702,15 @@ define amdgpu_kernel void @test_fold_canonicalize_p0_f16(ptr addrspace(1) %out)
 ; GFX11-TRUE16-NEXT:    global_store_b16 v1, v0, s[0:1]
 ; GFX11-TRUE16-NEXT:    s_endpgm
 ;
+; GFX11-D16-TRUE16-LABEL: test_fold_canonicalize_p0_f16:
+; GFX11-D16-TRUE16:       ; %bb.0:
+; GFX11-D16-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-D16-TRUE16-NEXT:    v_mov_b32_e32 v1, 0
+; GFX11-D16-TRUE16-NEXT:    v_mov_b16_e32 v0.l, 0
+; GFX11-D16-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-D16-TRUE16-NEXT:    global_store_b16 v1, v0, s[0:1]
+; GFX11-D16-TRUE16-NEXT:    s_endpgm
+;
 ; GFX11-FAKE16-LABEL: test_fold_canonicalize_p0_f16:
 ; GFX11-FAKE16:       ; %bb.0:
 ; GFX11-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
@@ -658,6 +762,15 @@ define amdgpu_kernel void @test_fold_canonicalize_n0_f16(ptr addrspace(1) %out)
 ; GFX11-TRUE16-NEXT:    global_store_b16 v1, v0, s[0:1]
 ; GFX11-TRUE16-NEXT:    s_endpgm
 ;
+; GFX11-D16-TRUE16-LABEL: test_fold_canonicalize_n0_f16:
+; GFX11-D16-TRUE16:       ; %bb.0:
+; GFX11-D16-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-D16-TRUE16-NEXT:    v_mov_b32_e32 v1, 0
+; GFX11-D16-TRUE16-NEXT:    v_mov_b16_e32 v0.l, 0x8000
+; GFX11-D16-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-D16-TRUE16-NEXT:    global_store_b16 v1, v0, s[0:1]
+; GFX11-D16-TRUE16-NEXT:    s_endpgm
+;
 ; GFX11-FAKE16-LABEL: test_fold_canonicalize_n0_f16:
 ; GFX11-FAKE16:       ; %bb.0:
 ; GFX11-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
@@ -709,6 +822,15 @@ define amdgpu_kernel void @test_fold_canonicalize_p1_f16(ptr addrspace(1) %out)
 ; GFX11-TRUE16-NEXT:    global_store_b16 v1, v0, s[0:1]
 ; GFX11-TRUE16-NEXT:    s_endpgm
 ;
+; GFX11-D16-TRUE16-LABEL: test_fold_canonicalize_p1_f16:
+; GFX11-D16-TRUE16:       ; %bb.0:
+; GFX11-D16-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-D16-TRUE16-NEXT:    v_mov_b32_e32 v1, 0
+; GFX11-D16-TRUE16-NEXT:    v_mov_b16_e32 v0.l, 0x3c00
+; GFX11-D16-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-D16-TRUE16-NEXT:    global_store_b16 v1, v0, s[0:1]
+; GFX11-D16-TRUE16-NEXT:    s_endpgm
+;
 ; GFX11-FAKE16-LABEL: test_fold_canonicalize_p1_f16:
 ; GFX11-FAKE16:       ; %bb.0:
 ; GFX11-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
@@ -760,6 +882,15 @@ define amdgpu_kernel void @test_fold_canonicalize_n1_f16(ptr addrspace(1) %out)
 ; GFX11-TRUE16-NEXT:    global_store_b16 v1, v0, s[0:1]
 ; GFX11-TRUE16-NEXT:    s_endpgm
 ;
+; GFX11-D16-TRUE16-LABEL: test_fold_canonicalize_n1_f16:
+; GFX11-D16-TRUE16:       ; %bb.0:
+; GFX11-D16-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-D16-TRUE16-NEXT:    v_mov_b32_e32 v1, 0
+; GFX11-D16-TRUE16-NEXT:    v_mov_b16_e32 v0.l, 0xbc00
+; GFX11-D16-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-D16-TRUE16-NEXT:    global_store_b16 v1, v0, s[0:1]
+; GFX11-D16-TRUE16-NEXT:    s_endpgm
+;
 ; GFX11-FAKE16-LABEL: test_fold_canonicalize_n1_f16:
 ; GFX11-FAKE16:       ; %bb.0:
 ; GFX11-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
@@ -811,6 +942,15 @@ define amdgpu_kernel void @test_fold_canonicalize_literal_f16(ptr addrspace(1) %
 ; GFX11-TRUE16-NEXT:    global_store_b16 v1, v0, s[0:1]
 ; GFX11-TRUE16-NEXT:    s_endpgm
 ;
+; GFX11-D16-TRUE16-LABEL: test_fold_canonicalize_literal_f16:
+; GFX11-D16-TRUE16:       ; %bb.0:
+; GFX11-D16-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-D16-TRUE16-NEXT:    v_mov_b32_e32 v1, 0
+; GFX11-D16-TRUE16-NEXT:    v_mov_b16_e32 v0.l, 0x4c00
+; GFX11-D16-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-D16-TRUE16-NEXT:    global_store_b16 v1, v0, s[0:1]
+; GFX11-D16-TRUE16-NEXT:    s_endpgm
+;
 ; GFX11-FAKE16-LABEL: test_fold_canonicalize_literal_f16:
 ; GFX11-FAKE16:       ; %bb.0:
 ; GFX11-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
@@ -862,6 +1002,15 @@ define amdgpu_kernel void @test_default_denormals_fold_canonicalize_denormal0_f1
 ; GFX11-TRUE16-NEXT:    global_store_b16 v1, v0, s[0:1]
 ; GFX11-TRUE16-NEXT:    s_endpgm
 ;
+; GFX11-D16-TRUE16-LABEL: test_default_denormals_fold_canonicalize_denormal0_f16:
+; GFX11-D16-TRUE16:       ; %bb.0:
+; GFX11-D16-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-D16-TRUE16-NEXT:    v_mov_b32_e32 v1, 0
+; GFX11-D16-TRUE16-NEXT:    v_mov_b16_e32 v0.l, 0x3ff
+; GFX11-D16-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-D16-TRUE16-NEXT:    global_store_b16 v1, v0, s[0:1]
+; GFX11-D16-TRUE16-NEXT:    s_endpgm
+;
 ; GFX11-FAKE16-LABEL: test_default_denormals_fold_canonicalize_denormal0_f16:
 ; GFX11-FAKE16:       ; %bb.0:
 ; GFX11-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
@@ -913,6 +1062,15 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_f16(ptr ad
 ; GFX11-TRUE16-NEXT:    global_store_b16 v1, v0, s[0:1]
 ; GFX11-TRUE16-NEXT:    s_endpgm
 ;
+; GFX11-D16-TRUE16-LABEL: test_denormals_fold_canonicalize_denormal0_f16:
+; GFX11-D16-TRUE16:       ; %bb.0:
+; GFX11-D16-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-D16-TRUE16-NEXT:    v_mov_b32_e32 v1, 0
+; GFX11-D16-TRUE16-NEXT:    v_mov_b16_e32 v0.l, 0x3ff
+; GFX11-D16-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-D16-TRUE16-NEXT:    global_store_b16 v1, v0, s[0:1]
+; GFX11-D16-TRUE16-NEXT:    s_endpgm
+;
 ; GFX11-FAKE16-LABEL: test_denormals_fold_canonicalize_denormal0_f16:
 ; GFX11-FAKE16:       ; %bb.0:
 ; GFX11-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
@@ -964,6 +1122,15 @@ define amdgpu_kernel void @test_default_denormals_fold_canonicalize_denormal1_f1
 ; GFX11-TRUE16-NEXT:    global_store_b16 v1, v0, s[0:1]
 ; GFX11-TRUE16-NEXT:    s_endpgm
 ;
+; GFX11-D16-TRUE16-LABEL: test_default_denormals_fold_canonicalize_denormal1_f16:
+; GFX11-D16-TRUE16:       ; %bb.0:
+; GFX11-D16-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-D16-TRUE16-NEXT:    v_mov_b32_e32 v1, 0
+; GFX11-D16-TRUE16-NEXT:    v_mov_b16_e32 v0.l, 0x83ff
+; GFX11-D16-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-D16-TRUE16-NEXT:    global_store_b16 v1, v0, s[0:1]
+; GFX11-D16-TRUE16-NEXT:    s_endpgm
+;
 ; GFX11-FAKE16-LABEL: test_default_denormals_fold_canonicalize_denormal1_f16:
 ; GFX11-FAKE16:       ; %bb.0:
 ; GFX11-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
@@ -1015,6 +1182,15 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_f16(ptr ad
 ; GFX11-TRUE16-NEXT:    global_store_b16 v1, v0, s[0:1]
 ; GFX11-TRUE16-NEXT:    s_endpgm
 ;
+; GFX11-D16-TRUE16-LABEL: test_denormals_fold_canonicalize_denormal1_f16:
+; GFX11-D16-TRUE16:       ; %bb.0:
+; GFX11-D16-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-D16-TRUE16-NEXT:    v_mov_b32_e32 v1, 0
+; GFX11-D16-TRUE16-NEXT:    v_mov_b16_e32 v0.l, 0x83ff
+; GFX11-D16-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-D16-TRUE16-NEXT:    global_store_b16 v1, v0, s[0:1]
+; GFX11-D16-TRUE16-NEXT:    s_endpgm
+;
 ; GFX11-FAKE16-LABEL: test_denormals_fold_canonicalize_denormal1_f16:
 ; GFX11-FAKE16:       ; %bb.0:
 ; GFX11-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
@@ -1066,6 +1242,15 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_f16(ptr addrspace(1) %out
 ; GFX11-TRUE16-NEXT:    global_store_b16 v1, v0, s[0:1]
 ; GFX11-TRUE16-NEXT:    s_endpgm
 ;
+; GFX11-D16-TRUE16-LABEL: test_fold_canonicalize_qnan_f16:
+; GFX11-D16-TRUE16:       ; %bb.0:
+; GFX11-D16-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-D16-TRUE16-NEXT:    v_mov_b32_e32 v1, 0
+; GFX11-D16-TRUE16-NEXT:    v_mov_b16_e32 v0.l, 0x7c00
+; GFX11-D16-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-D16-TRUE16-NEXT:    global_store_b16 v1, v0, s[0:1]
+; GFX11-D16-TRUE16-NEXT:    s_endpgm
+;
 ; GFX11-FAKE16-LABEL: test_fold_canonicalize_qnan_f16:
 ; GFX11-FAKE16:       ; %bb.0:
 ; GFX11-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
@@ -1117,6 +1302,15 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_f16(ptr addrsp
 ; GFX11-TRUE16-NEXT:    global_store_b16 v1, v0, s[0:1]
 ; GFX11-TRUE16-NEXT:    s_endpgm
 ;
+; GFX11-D16-TRUE16-LABEL: test_fold_canonicalize_qnan_value_neg1_f16:
+; GFX11-D16-TRUE16:       ; %bb.0:
+; GFX11-D16-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-D16-TRUE16-NEXT:    v_mov_b32_e32 v1, 0
+; GFX11-D16-TRUE16-NEXT:    v_mov_b16_e32 v0.l, 0x7e00
+; GFX11-D16-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-D16-TRUE16-NEXT:    global_store_b16 v1, v0, s[0:1]
+; GFX11-D16-TRUE16-NEXT:    s_endpgm
+;
 ; GFX11-FAKE16-LABEL: test_fold_canonicalize_qnan_value_neg1_f16:
 ; GFX11-FAKE16:       ; %bb.0:
 ; GFX11-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
@@ -1168,6 +1362,15 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_f16(ptr addrsp
 ; GFX11-TRUE16-NEXT:    global_store_b16 v1, v0, s[0:1]
 ; GFX11-TRUE16-NEXT:    s_endpgm
 ;
+; GFX11-D16-TRUE16-LABEL: test_fold_canonicalize_qnan_value_neg2_f16:
+; GFX11-D16-TRUE16:       ; %bb.0:
+; GFX11-D16-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-D16-TRUE16-NEXT:    v_mov_b32_e32 v1, 0
+; GFX11-D16-TRUE16-NEXT:    v_mov_b16_e32 v0.l, 0x7e00
+; GFX11-D16-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-D16-TRUE16-NEXT:    global_store_b16 v1, v0, s[0:1]
+; GFX11-D16-TRUE16-NEXT:    s_endpgm
+;
 ; GFX11-FAKE16-LABEL: test_fold_canonicalize_qnan_value_neg2_f16:
 ; GFX11-FAKE16:       ; %bb.0:
 ; GFX11-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
@@ -1219,6 +1422,15 @@ define amdgpu_kernel void @test_fold_canonicalize_snan0_value_f16(ptr addrspace(
 ; GFX11-TRUE16-NEXT:    global_store_b16 v1, v0, s[0:1]
 ; GFX11-TRUE16-NEXT:    s_endpgm
 ;
+; GFX11-D16-TRUE16-LABEL: test_fold_canonicalize_snan0_value_f16:
+; GFX11-D16-TRUE16:       ; %bb.0:
+; GFX11-D16-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-D16-TRUE16-NEXT:    v_mov_b32_e32 v1, 0
+; GFX11-D16-TRUE16-NEXT:    v_mov_b16_e32 v0.l, 0x7e00
+; GFX11-D16-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-D16-TRUE16-NEXT:    global_store_b16 v1, v0, s[0:1]
+; GFX11-D16-TRUE16-NEXT:    s_endpgm
+;
 ; GFX11-FAKE16-LABEL: test_fold_canonicalize_snan0_value_f16:
 ; GFX11-FAKE16:       ; %bb.0:
 ; GFX11-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
@@ -1270,6 +1482,15 @@ define amdgpu_kernel void @test_fold_canonicalize_snan1_value_f16(ptr addrspace(
 ; GFX11-TRUE16-NEXT:    global_store_b16 v1, v0, s[0:1]
 ; GFX11-TRUE16-NEXT:    s_endpgm
 ;
+; GFX11-D16-TRUE16-LABEL: test_fold_canonicalize_snan1_value_f16:
+; GFX11-D16-TRUE16:       ; %bb.0:
+; GFX11-D16-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-D16-TRUE16-NEXT:    v_mov_b32_e32 v1, 0
+; GFX11-D16-TRUE16-NEXT:    v_mov_b16_e32 v0.l, 0x7e00
+; GFX11-D16-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-D16-TRUE16-NEXT:    global_store_b16 v1, v0, s[0:1]
+; GFX11-D16-TRUE16-NEXT:    s_endpgm
+;
 ; GFX11-FAKE16-LABEL: test_fold_canonicalize_snan1_value_f16:
 ; GFX11-FAKE16:       ; %bb.0:
 ; GFX11-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
@@ -1321,6 +1542,15 @@ define amdgpu_kernel void @test_fold_canonicalize_snan2_value_f16(ptr addrspace(
 ; GFX11-TRUE16-NEXT:    global_store_b16 v1, v0, s[0:1]
 ; GFX11-TRUE16-NEXT:    s_endpgm
 ;
+; GFX11-D16-TRUE16-LABEL: test_fold_canonicalize_snan2_value_f16:
+; GFX11-D16-TRUE16:       ; %bb.0:
+; GFX11-D16-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-D16-TRUE16-NEXT:    v_mov_b32_e32 v1, 0
+; GFX11-D16-TRUE16-NEXT:    v_mov_b16_e32 v0.l, 0x7e00
+; GFX11-D16-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-D16-TRUE16-NEXT:    global_store_b16 v1, v0, s[0:1]
+; GFX11-D16-TRUE16-NEXT:    s_endpgm
+;
 ; GFX11-FAKE16-LABEL: test_fold_canonicalize_snan2_value_f16:
 ; GFX11-FAKE16:       ; %bb.0:
 ; GFX11-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
@@ -1372,6 +1602,15 @@ define amdgpu_kernel void @test_fold_canonicalize_snan3_value_f16(ptr addrspace(
 ; GFX11-TRUE16-NEXT:    global_store_b16 v1, v0, s[0:1]
 ; GFX11-TRUE16-NEXT:    s_endpgm
 ;
+; GFX11-D16-TRUE16-LABEL: test_fold_canonicalize_snan3_value_f16:
+; GFX11-D16-TRUE16:       ; %bb.0:
+; GFX11-D16-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-D16-TRUE16-NEXT:    v_mov_b32_e32 v1, 0
+; GFX11-D16-TRUE16-NEXT:    v_mov_b16_e32 v0.l, 0x7e00
+; GFX11-D16-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-D16-TRUE16-NEXT:    global_store_b16 v1, v0, s[0:1]
+; GFX11-D16-TRUE16-NEXT:    s_endpgm
+;
 ; GFX11-FAKE16-LABEL: test_fold_canonicalize_snan3_value_f16:
 ; GFX11-FAKE16:       ; %bb.0:
 ; GFX11-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
@@ -2572,6 +2811,14 @@ define <2 x half> @v_test_canonicalize_reg_undef_v2f16(half %val) #1 {
 ; GFX11-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.l, 0
 ; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX11-D16-TRUE16-LABEL: v_test_canonicalize_reg_undef_v2f16:
+; GFX11-D16-TRUE16:       ; %bb.0:
+; GFX11-D16-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-D16-TRUE16-NEXT:    v_max_f16_e32 v0.l, v0.l, v0.l
+; GFX11-D16-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-D16-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.l, 0
+; GFX11-D16-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX11-FAKE16-LABEL: v_test_canonicalize_reg_undef_v2f16:
 ; GFX11-FAKE16:       ; %bb.0:
 ; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2611,6 +2858,12 @@ define <2 x half> @v_test_canonicalize_undef_reg_v2f16(half %val) #1 {
 ; GFX11-TRUE16-NEXT:    v_max_f16_e32 v0.h, v0.l, v0.l
 ; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX11-D16-TRUE16-LABEL: v_test_canonicalize_undef_reg_v2f16:
+; GFX11-D16-TRUE16:       ; %bb.0:
+; GFX11-D16-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-D16-TRUE16-NEXT:    v_max_f16_e32 v0.h, v0.l, v0.l
+; GFX11-D16-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX11-FAKE16-LABEL: v_test_canonicalize_undef_reg_v2f16:
 ; GFX11-FAKE16:       ; %bb.0:
 ; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2774,6 +3027,14 @@ define <2 x half> @v_test_canonicalize_reg_k_v2f16(half %val) #1 {
 ; GFX11-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.l, 2.0
 ; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX11-D16-TRUE16-LABEL: v_test_canonicalize_reg_k_v2f16:
+; GFX11-D16-TRUE16:       ; %bb.0:
+; GFX11-D16-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-D16-TRUE16-NEXT:    v_max_f16_e32 v0.l, v0.l, v0.l
+; GFX11-D16-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-D16-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.l, 2.0
+; GFX11-D16-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX11-FAKE16-LABEL: v_test_canonicalize_reg_k_v2f16:
 ; GFX11-FAKE16:       ; %bb.0:
 ; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2818,6 +3079,14 @@ define <2 x half> @v_test_canonicalize_k_reg_v2f16(half %val) #1 {
 ; GFX11-TRUE16-NEXT:    v_pack_b32_f16 v0, 2.0, v0.l
 ; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX11-D16-TRUE16-LABEL: v_test_canonicalize_k_reg_v2f16:
+; GFX11-D16-TRUE16:       ; %bb.0:
+; GFX11-D16-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-D16-TRUE16-NEXT:    v_max_f16_e32 v0.l, v0.l, v0.l
+; GFX11-D16-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-D16-TRUE16-NEXT:    v_pack_b32_f16 v0, 2.0, v0.l
+; GFX11-D16-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX11-FAKE16-LABEL: v_test_canonicalize_k_reg_v2f16:
 ; GFX11-FAKE16:       ; %bb.0:
 ; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2913,6 +3182,15 @@ define <4 x half> @v_test_canonicalize_reg_undef_undef_undef_v4f16(half %val) #1
 ; GFX11-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.l, 0
 ; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX11-D16-TRUE16-LABEL: v_test_canonicalize_reg_undef_undef_undef_v4f16:
+; GFX11-D16-TRUE16:       ; %bb.0:
+; GFX11-D16-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-D16-TRUE16-NEXT:    v_max_f16_e32 v0.l, v0.l, v0.l
+; GFX11-D16-TRUE16-NEXT:    v_mov_b32_e32 v1, 0x7e007e00
+; GFX11-D16-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-D16-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.l, 0
+; GFX11-D16-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX11-FAKE16-LABEL: v_test_canonicalize_reg_undef_undef_undef_v4f16:
 ; GFX11-FAKE16:       ; %bb.0:
 ; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2965,6 +3243,15 @@ define <4 x half> @v_test_canonicalize_reg_reg_undef_undef_v4f16(half %val0, hal
 ; GFX11-TRUE16-NEXT:    v_pk_max_f16 v0, v0, v0
 ; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX11-D16-TRUE16-LABEL: v_test_canonicalize_reg_reg_undef_undef_v4f16:
+; GFX11-D16-TRUE16:       ; %bb.0:
+; GFX11-D16-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-D16-TRUE16-NEXT:    v_mov_b16_e32 v0.h, v1.l
+; GFX11-D16-TRUE16-NEXT:    v_mov_b32_e32 v1, 0x7e007e00
+; GFX11-D16-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-D16-TRUE16-NEXT:    v_pk_max_f16 v0, v0, v0
+; GFX11-D16-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX11-FAKE16-LABEL: v_test_canonicalize_reg_reg_undef_undef_v4f16:
 ; GFX11-FAKE16:       ; %bb.0:
 ; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3022,6 +3309,16 @@ define <4 x half> @v_test_canonicalize_reg_undef_reg_reg_v4f16(half %val0, half
 ; GFX11-TRUE16-NEXT:    v_pk_max_f16 v1, v1, v1
 ; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX11-D16-TRUE16-LABEL: v_test_canonicalize_reg_undef_reg_reg_v4f16:
+; GFX11-D16-TRUE16:       ; %bb.0:
+; GFX11-D16-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-D16-TRUE16-NEXT:    v_max_f16_e32 v0.l, v0.l, v0.l
+; GFX11-D16-TRUE16-NEXT:    v_mov_b16_e32 v1.h, v2.l
+; GFX11-D16-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-D16-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.l, 0
+; GFX11-D16-TRUE16-NEXT:    v_pk_max_f16 v1, v1, v1
+; GFX11-D16-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX11-FAKE16-LABEL: v_test_canonicalize_reg_undef_reg_reg_v4f16:
 ; GFX11-FAKE16:       ; %bb.0:
 ; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/fcanonicalize.ll b/llvm/test/CodeGen/AMDGPU/fcanonicalize.ll
index d32b528d13276..f9a3a55fe69fd 100644
--- a/llvm/test/CodeGen/AMDGPU/fcanonicalize.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcanonicalize.ll
@@ -2410,7 +2410,7 @@ define amdgpu_kernel void @test_canonicalize_value_f16_flush(ptr addrspace(1) %a
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 1, v0
 ; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT:    global_load_d16_b16 v0, v1, s[0:1]
+; GFX11-TRUE16-NEXT:    global_load_u16 v0, v1, s[0:1]
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-TRUE16-NEXT:    v_max_f16_e32 v0.l, v0.l, v0.l
 ; GFX11-TRUE16-NEXT:    global_store_b16 v1, v0, s[2:3]
@@ -2792,7 +2792,7 @@ define amdgpu_kernel void @test_canonicalize_value_f16_denorm(ptr addrspace(1) %
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 1, v0
 ; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT:    global_load_d16_b16 v0, v1, s[0:1]
+; GFX11-TRUE16-NEXT:    global_load_u16 v0, v1, s[0:1]
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-TRUE16-NEXT:    v_max_f16_e32 v0.l, v0.l, v0.l
 ; GFX11-TRUE16-NEXT:    global_store_b16 v1, v0, s[2:3]
diff --git a/llvm/test/CodeGen/AMDGPU/fdiv.f16.ll b/llvm/test/CodeGen/AMDGPU/fdiv.f16.ll
index 210e09fd9169a..784363035e7de 100644
--- a/llvm/test/CodeGen/AMDGPU/fdiv.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fdiv.f16.ll
@@ -137,33 +137,33 @@ define amdgpu_kernel void @v_fdiv_f16(
 ; GFX11-TRUE16-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v2, 1, v0
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 1, v0
 ; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT:    global_load_d16_b16 v0, v2, s[2:3] glc dlc
+; GFX11-TRUE16-NEXT:    global_load_u16 v2, v1, s[2:3] glc dlc
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    global_load_d16_b16 v1, v2, s[4:5] glc dlc
+; GFX11-TRUE16-NEXT:    global_load_u16 v3, v1, s[4:5] glc dlc
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e32 v4, v0.l
-; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e32 v3, v1.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v1.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v6.l, v0.l
+; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e32 v4, v2.l
+; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e32 v0, v3.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v3.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v6.l, v2.l
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_rcp_f32_e32 v3, v3
+; GFX11-TRUE16-NEXT:    v_rcp_f32_e32 v0, v0
 ; GFX11-TRUE16-NEXT:    s_waitcnt_depctr 0xfff
-; GFX11-TRUE16-NEXT:    v_mul_f32_e32 v4, v4, v3
+; GFX11-TRUE16-NEXT:    v_mul_f32_e32 v4, v4, v0
 ; GFX11-TRUE16-NEXT:    v_fma_mix_f32 v7, -v5, v4, v6 op_sel_hi:[1,0,1]
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_fmac_f32_e32 v4, v7, v3
+; GFX11-TRUE16-NEXT:    v_fmac_f32_e32 v4, v7, v0
 ; GFX11-TRUE16-NEXT:    v_fma_mix_f32 v5, -v5, v4, v6 op_sel_hi:[1,0,1]
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_mul_f32_e32 v3, v5, v3
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v3, 0xff800000, v3
+; GFX11-TRUE16-NEXT:    v_mul_f32_e32 v0, v5, v0
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff800000, v0
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_add_f32_e32 v3, v3, v4
-; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v0.h, v3
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v0, v0, v4
+; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v0.l, v0
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_div_fixup_f16 v0.l, v0.h, v1.l, v0.l
-; GFX11-TRUE16-NEXT:    global_store_b16 v2, v0, s[0:1]
+; GFX11-TRUE16-NEXT:    v_div_fixup_f16 v0.l, v0.l, v3.l, v2.l
+; GFX11-TRUE16-NEXT:    global_store_b16 v1, v0, s[0:1]
 ; GFX11-TRUE16-NEXT:    s_endpgm
 ;
 ; GFX11-FAKE16-LABEL: v_fdiv_f16:
@@ -293,7 +293,7 @@ define amdgpu_kernel void @v_rcp_f16(ptr addrspace(1) %r, ptr addrspace(1) %b) #
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 1, v0
 ; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT:    global_load_d16_b16 v0, v1, s[2:3] glc dlc
+; GFX11-TRUE16-NEXT:    global_load_u16 v0, v1, s[2:3] glc dlc
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-TRUE16-NEXT:    v_rcp_f16_e32 v0.l, v0.l
 ; GFX11-TRUE16-NEXT:    global_store_b16 v1, v0, s[0:1]
@@ -400,7 +400,7 @@ define amdgpu_kernel void @v_rcp_f16_abs(ptr addrspace(1) %r, ptr addrspace(1) %
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 1, v0
 ; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT:    global_load_d16_b16 v0, v1, s[2:3] glc dlc
+; GFX11-TRUE16-NEXT:    global_load_u16 v0, v1, s[2:3] glc dlc
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-TRUE16-NEXT:    v_rcp_f16_e64 v0.l, |v0.l|
 ; GFX11-TRUE16-NEXT:    global_store_b16 v1, v0, s[0:1]
@@ -510,7 +510,7 @@ define amdgpu_kernel void @reciprocal_f16_rounded(ptr addrspace(1) %r, ptr addrs
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 1, v0
 ; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT:    global_load_d16_b16 v0, v1, s[2:3] glc dlc
+; GFX11-TRUE16-NEXT:    global_load_u16 v0, v1, s[2:3] glc dlc
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-TRUE16-NEXT:    v_rcp_f16_e32 v0.l, v0.l
 ; GFX11-TRUE16-NEXT:    global_store_b16 v1, v0, s[0:1]
@@ -604,7 +604,7 @@ define amdgpu_kernel void @v_rcp_f16_afn(ptr addrspace(1) %r, ptr addrspace(1) %
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 1, v0
 ; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT:    global_load_d16_b16 v0, v1, s[2:3] glc dlc
+; GFX11-TRUE16-NEXT:    global_load_u16 v0, v1, s[2:3] glc dlc
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-TRUE16-NEXT:    v_rcp_f16_e32 v0.l, v0.l
 ; GFX11-TRUE16-NEXT:    global_store_b16 v1, v0, s[0:1]
@@ -711,7 +711,7 @@ define amdgpu_kernel void @v_rcp_f16_neg(ptr addrspace(1) %r, ptr addrspace(1) %
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 1, v0
 ; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT:    global_load_d16_b16 v0, v1, s[2:3] glc dlc
+; GFX11-TRUE16-NEXT:    global_load_u16 v0, v1, s[2:3] glc dlc
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-TRUE16-NEXT:    v_rcp_f16_e64 v0.l, -v0.l
 ; GFX11-TRUE16-NEXT:    global_store_b16 v1, v0, s[0:1]
@@ -821,7 +821,7 @@ define amdgpu_kernel void @v_rsq_f16(ptr addrspace(1) %r, ptr addrspace(1) %b) #
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 1, v0
 ; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT:    global_load_d16_b16 v0, v1, s[2:3] glc dlc
+; GFX11-TRUE16-NEXT:    global_load_u16 v0, v1, s[2:3] glc dlc
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-TRUE16-NEXT:    v_rsq_f16_e32 v0.l, v0.l
 ; GFX11-TRUE16-NEXT:    global_store_b16 v1, v0, s[0:1]
@@ -935,7 +935,7 @@ define amdgpu_kernel void @v_rsq_f16_neg(ptr addrspace(1) %r, ptr addrspace(1) %
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 1, v0
 ; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT:    global_load_d16_b16 v0, v1, s[2:3] glc dlc
+; GFX11-TRUE16-NEXT:    global_load_u16 v0, v1, s[2:3] glc dlc
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-TRUE16-NEXT:    v_rsq_f16_e32 v0.l, v0.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt_depctr 0xfff
@@ -1058,12 +1058,12 @@ define amdgpu_kernel void @v_rsq_f16_multi_use(ptr addrspace(1) %r, ptr addrspac
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 1, v0
 ; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT:    global_load_d16_b16 v0, v1, s[2:3] glc dlc
+; GFX11-TRUE16-NEXT:    global_load_u16 v2, v1, s[2:3] glc dlc
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    v_rsq_f16_e32 v0.h, v0.l
-; GFX11-TRUE16-NEXT:    global_store_b16 v1, v0, s[0:1] dlc
+; GFX11-TRUE16-NEXT:    v_rsq_f16_e32 v0.l, v2.l
+; GFX11-TRUE16-NEXT:    global_store_b16 v1, v2, s[0:1] dlc
 ; GFX11-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-TRUE16-NEXT:    global_store_d16_hi_b16 v1, v0, s[0:1]
+; GFX11-TRUE16-NEXT:    global_store_b16 v1, v0, s[0:1]
 ; GFX11-TRUE16-NEXT:    s_endpgm
 ;
 ; GFX11-FAKE16-LABEL: v_rsq_f16_multi_use:
@@ -1177,7 +1177,7 @@ define amdgpu_kernel void @v_rsq_f16_missing_contract0(ptr addrspace(1) %r, ptr
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 1, v0
 ; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT:    global_load_d16_b16 v0, v1, s[2:3] glc dlc
+; GFX11-TRUE16-NEXT:    global_load_u16 v0, v1, s[2:3] glc dlc
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-TRUE16-NEXT:    v_sqrt_f16_e32 v0.l, v0.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt_depctr 0xfff
@@ -1295,7 +1295,7 @@ define amdgpu_kernel void @v_rsq_f16_missing_contract1(ptr addrspace(1) %r, ptr
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 1, v0
 ; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT:    global_load_d16_b16 v0, v1, s[2:3] glc dlc
+; GFX11-TRUE16-NEXT:    global_load_u16 v0, v1, s[2:3] glc dlc
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-TRUE16-NEXT:    v_sqrt_f16_e32 v0.l, v0.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt_depctr 0xfff
@@ -1413,7 +1413,7 @@ define amdgpu_kernel void @v_neg_rsq_f16_missing_contract1(ptr addrspace(1) %r,
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 1, v0
 ; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT:    global_load_d16_b16 v0, v1, s[2:3] glc dlc
+; GFX11-TRUE16-NEXT:    global_load_u16 v0, v1, s[2:3] glc dlc
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-TRUE16-NEXT:    v_sqrt_f16_e32 v0.l, v0.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt_depctr 0xfff
@@ -1536,13 +1536,13 @@ define amdgpu_kernel void @v_fdiv_f16_afn(ptr addrspace(1) %r, ptr addrspace(1)
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 1, v0
 ; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT:    global_load_d16_b16 v0, v1, s[2:3] glc dlc
+; GFX11-TRUE16-NEXT:    global_load_u16 v2, v1, s[2:3] glc dlc
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    global_load_d16_hi_b16 v0, v1, s[4:5] glc dlc
+; GFX11-TRUE16-NEXT:    global_load_u16 v0, v1, s[4:5] glc dlc
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    v_rcp_f16_e32 v0.h, v0.h
+; GFX11-TRUE16-NEXT:    v_rcp_f16_e32 v0.l, v0.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt_depctr 0xfff
-; GFX11-TRUE16-NEXT:    v_mul_f16_e32 v0.l, v0.l, v0.h
+; GFX11-TRUE16-NEXT:    v_mul_f16_e32 v0.l, v2.l, v0.l
 ; GFX11-TRUE16-NEXT:    global_store_b16 v1, v0, s[0:1]
 ; GFX11-TRUE16-NEXT:    s_endpgm
 ;
@@ -1666,13 +1666,13 @@ define amdgpu_kernel void @v_fdiv_f16_unsafe(ptr addrspace(1) %r, ptr addrspace(
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 1, v0
 ; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT:    global_load_d16_b16 v0, v1, s[2:3] glc dlc
+; GFX11-TRUE16-NEXT:    global_load_u16 v2, v1, s[2:3] glc dlc
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    global_load_d16_hi_b16 v0, v1, s[4:5] glc dlc
+; GFX11-TRUE16-NEXT:    global_load_u16 v0, v1, s[4:5] glc dlc
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    v_rcp_f16_e32 v0.h, v0.h
+; GFX11-TRUE16-NEXT:    v_rcp_f16_e32 v0.l, v0.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt_depctr 0xfff
-; GFX11-TRUE16-NEXT:    v_mul_f16_e32 v0.l, v0.l, v0.h
+; GFX11-TRUE16-NEXT:    v_mul_f16_e32 v0.l, v2.l, v0.l
 ; GFX11-TRUE16-NEXT:    global_store_b16 v1, v0, s[0:1]
 ; GFX11-TRUE16-NEXT:    s_endpgm
 ;
@@ -1758,7 +1758,7 @@ define amdgpu_kernel void @div_afn_2_x_pat_f16(ptr addrspace(1) %out) #0 {
 ;
 ; GFX11-TRUE16-LABEL: div_afn_2_x_pat_f16:
 ; GFX11-TRUE16:       ; %bb.0:
-; GFX11-TRUE16-NEXT:    global_load_d16_b16 v0, v[0:1], off
+; GFX11-TRUE16-NEXT:    global_load_u16 v0, v[0:1], off
 ; GFX11-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
@@ -1834,7 +1834,7 @@ define amdgpu_kernel void @div_afn_k_x_pat_f16(ptr addrspace(1) %out) #0 {
 ;
 ; GFX11-TRUE16-LABEL: div_afn_k_x_pat_f16:
 ; GFX11-TRUE16:       ; %bb.0:
-; GFX11-TRUE16-NEXT:    global_load_d16_b16 v0, v[0:1], off
+; GFX11-TRUE16-NEXT:    global_load_u16 v0, v[0:1], off
 ; GFX11-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
@@ -1910,7 +1910,7 @@ define amdgpu_kernel void @div_afn_neg_k_x_pat_f16(ptr addrspace(1) %out) #0 {
 ;
 ; GFX11-TRUE16-LABEL: div_afn_neg_k_x_pat_f16:
 ; GFX11-TRUE16:       ; %bb.0:
-; GFX11-TRUE16-NEXT:    global_load_d16_b16 v0, v[0:1], off
+; GFX11-TRUE16-NEXT:    global_load_u16 v0, v[0:1], off
 ; GFX11-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/flat-address-space.ll b/llvm/test/CodeGen/AMDGPU/flat-address-space.ll
index 91f9aa1c5fe3b..14e58a2637390 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-address-space.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-address-space.ll
@@ -225,7 +225,7 @@ define amdgpu_kernel void @store_flat_i8_neg_offset(ptr %fptr, i8 %x) #0 {
 ; CIVI: flat_load_ubyte v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
 ; GFX9: flat_load_ubyte v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}} offset:4095 glc{{$}}
 ; GFX10: flat_load_ubyte v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}} glc dlc{{$}}
-; GFX11-TRUE16: flat_load_d16_u8 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}} offset:4095 glc dlc{{$}}
+; GFX11-TRUE16: flat_load_u8 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}} offset:4095 glc dlc{{$}}
 ; GFX11-FAKE16: flat_load_u8 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}} offset:4095 glc dlc{{$}}
 define amdgpu_kernel void @load_flat_i8_max_offset(ptr %fptr) #0 {
   %fptr.offset = getelementptr inbounds i8, ptr %fptr, i64 4095
@@ -237,7 +237,7 @@ define amdgpu_kernel void @load_flat_i8_max_offset(ptr %fptr) #0 {
 ; CIVI: flat_load_ubyte v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
 ; GFX9: flat_load_ubyte v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
 ; GFX10: flat_load_ubyte v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}} glc dlc{{$}}
-; GFX11-TRUE16: flat_load_d16_u8 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}} glc dlc{{$}}
+; GFX11-TRUE16: flat_load_u8 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}} glc dlc{{$}}
 ; GFX11-FAKE16: flat_load_u8 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}} glc dlc{{$}}
 define amdgpu_kernel void @load_flat_i8_max_offset_p1(ptr %fptr) #0 {
   %fptr.offset = getelementptr inbounds i8, ptr %fptr, i64 4096
diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll
index fc8883924dfbc..1f8cee42686bd 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll
@@ -4585,7 +4585,7 @@ define void @store_load_i32_negative_unaligned(ptr addrspace(5) nocapture %arg)
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v1.l, 1
 ; GFX11-TRUE16-NEXT:    scratch_store_b8 v0, v1, off offset:-1 dlc
 ; GFX11-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-TRUE16-NEXT:    scratch_load_d16_u8 v0, v0, off offset:-1 glc dlc
+; GFX11-TRUE16-NEXT:    scratch_load_u8 v0, v0, off offset:-1 glc dlc
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -4678,7 +4678,7 @@ define void @store_load_i32_negative_unaligned(ptr addrspace(5) nocapture %arg)
 ; GFX11-PAL-TRUE16-NEXT:    v_mov_b16_e32 v1.l, 1
 ; GFX11-PAL-TRUE16-NEXT:    scratch_store_b8 v0, v1, off offset:-1 dlc
 ; GFX11-PAL-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-PAL-TRUE16-NEXT:    scratch_load_d16_u8 v0, v0, off offset:-1 glc dlc
+; GFX11-PAL-TRUE16-NEXT:    scratch_load_u8 v0, v0, off offset:-1 glc dlc
 ; GFX11-PAL-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-PAL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -4758,7 +4758,7 @@ define void @store_load_i32_large_negative_unaligned(ptr addrspace(5) nocapture
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, 1
 ; GFX11-TRUE16-NEXT:    scratch_store_b8 v1, v0, off offset:-129 dlc
 ; GFX11-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-TRUE16-NEXT:    scratch_load_d16_u8 v0, v1, off offset:-129 glc dlc
+; GFX11-TRUE16-NEXT:    scratch_load_u8 v0, v1, off offset:-129 glc dlc
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -4853,7 +4853,7 @@ define void @store_load_i32_large_negative_unaligned(ptr addrspace(5) nocapture
 ; GFX11-PAL-TRUE16-NEXT:    v_mov_b16_e32 v0.l, 1
 ; GFX11-PAL-TRUE16-NEXT:    scratch_store_b8 v1, v0, off offset:-129 dlc
 ; GFX11-PAL-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-PAL-TRUE16-NEXT:    scratch_load_d16_u8 v0, v1, off offset:-129 glc dlc
+; GFX11-PAL-TRUE16-NEXT:    scratch_load_u8 v0, v1, off offset:-129 glc dlc
 ; GFX11-PAL-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-PAL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
diff --git a/llvm/test/CodeGen/AMDGPU/flat_atomics.ll b/llvm/test/CodeGen/AMDGPU/flat_atomics.ll
index 57be2907da4a0..d20fa41837e3a 100644
--- a/llvm/test/CodeGen/AMDGPU/flat_atomics.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat_atomics.ll
@@ -7950,7 +7950,7 @@ define amdgpu_kernel void @atomic_load_i8_offset(ptr %in, ptr %out) {
 ; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v2, s3
-; GFX11-TRUE16-NEXT:    flat_load_d16_u8 v0, v[0:1] offset:16 glc
+; GFX11-TRUE16-NEXT:    flat_load_u8 v0, v[0:1] offset:16 glc
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-TRUE16-NEXT:    buffer_gl1_inv
 ; GFX11-TRUE16-NEXT:    buffer_gl0_inv
@@ -8026,7 +8026,7 @@ define amdgpu_kernel void @atomic_load_i8(ptr %in, ptr %out) {
 ; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v2, s3
-; GFX11-TRUE16-NEXT:    flat_load_d16_u8 v0, v[0:1] glc
+; GFX11-TRUE16-NEXT:    flat_load_u8 v0, v[0:1] glc
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-TRUE16-NEXT:    buffer_gl1_inv
 ; GFX11-TRUE16-NEXT:    buffer_gl0_inv
@@ -8119,7 +8119,7 @@ define amdgpu_kernel void @atomic_load_i8_addr64_offset(ptr %in, ptr %out, i64 %
 ; GFX11-TRUE16-NEXT:    s_addc_u32 s1, s1, s5
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-TRUE16-NEXT:    flat_load_d16_u8 v0, v[0:1] offset:16 glc
+; GFX11-TRUE16-NEXT:    flat_load_u8 v0, v[0:1] offset:16 glc
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-TRUE16-NEXT:    buffer_gl1_inv
 ; GFX11-TRUE16-NEXT:    buffer_gl0_inv
@@ -8407,7 +8407,7 @@ define amdgpu_kernel void @atomic_load_i16_offset(ptr %in, ptr %out) {
 ; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v2, s3
-; GFX11-TRUE16-NEXT:    flat_load_d16_b16 v0, v[0:1] offset:16 glc
+; GFX11-TRUE16-NEXT:    flat_load_u16 v0, v[0:1] offset:16 glc
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-TRUE16-NEXT:    buffer_gl1_inv
 ; GFX11-TRUE16-NEXT:    buffer_gl0_inv
@@ -8483,7 +8483,7 @@ define amdgpu_kernel void @atomic_load_i16(ptr %in, ptr %out) {
 ; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v2, s3
-; GFX11-TRUE16-NEXT:    flat_load_d16_b16 v0, v[0:1] glc
+; GFX11-TRUE16-NEXT:    flat_load_u16 v0, v[0:1] glc
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-TRUE16-NEXT:    buffer_gl1_inv
 ; GFX11-TRUE16-NEXT:    buffer_gl0_inv
@@ -8580,7 +8580,7 @@ define amdgpu_kernel void @atomic_load_i16_addr64_offset(ptr %in, ptr %out, i64
 ; GFX11-TRUE16-NEXT:    s_addc_u32 s1, s1, s5
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-TRUE16-NEXT:    flat_load_d16_b16 v0, v[0:1] offset:16 glc
+; GFX11-TRUE16-NEXT:    flat_load_u16 v0, v[0:1] offset:16 glc
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-TRUE16-NEXT:    buffer_gl1_inv
 ; GFX11-TRUE16-NEXT:    buffer_gl0_inv
@@ -10599,7 +10599,7 @@ define amdgpu_kernel void @atomic_load_f16_offset(ptr %in, ptr %out) {
 ; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v2, s3
-; GFX11-TRUE16-NEXT:    flat_load_d16_b16 v0, v[0:1] offset:16 glc
+; GFX11-TRUE16-NEXT:    flat_load_u16 v0, v[0:1] offset:16 glc
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-TRUE16-NEXT:    buffer_gl1_inv
 ; GFX11-TRUE16-NEXT:    buffer_gl0_inv
@@ -10674,7 +10674,7 @@ define amdgpu_kernel void @atomic_load_f16(ptr %in, ptr %out) {
 ; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v2, s3
-; GFX11-TRUE16-NEXT:    flat_load_d16_b16 v0, v[0:1] glc
+; GFX11-TRUE16-NEXT:    flat_load_u16 v0, v[0:1] glc
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-TRUE16-NEXT:    buffer_gl1_inv
 ; GFX11-TRUE16-NEXT:    buffer_gl0_inv
@@ -10752,7 +10752,7 @@ define amdgpu_kernel void @atomic_load_bf16_offset(ptr %in, ptr %out) {
 ; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v2, s3
-; GFX11-TRUE16-NEXT:    flat_load_d16_b16 v0, v[0:1] offset:16 glc
+; GFX11-TRUE16-NEXT:    flat_load_u16 v0, v[0:1] offset:16 glc
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-TRUE16-NEXT:    buffer_gl1_inv
 ; GFX11-TRUE16-NEXT:    buffer_gl0_inv
@@ -10827,7 +10827,7 @@ define amdgpu_kernel void @atomic_load_bf16(ptr %in, ptr %out) {
 ; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v2, s3
-; GFX11-TRUE16-NEXT:    flat_load_d16_b16 v0, v[0:1] glc
+; GFX11-TRUE16-NEXT:    flat_load_u16 v0, v[0:1] glc
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-TRUE16-NEXT:    buffer_gl1_inv
 ; GFX11-TRUE16-NEXT:    buffer_gl0_inv
diff --git a/llvm/test/CodeGen/AMDGPU/fma.f16.gfx11plus.ll b/llvm/test/CodeGen/AMDGPU/fma.f16.gfx11plus.ll
index 5f86f2e48137d..de2e01f1cdf04 100644
--- a/llvm/test/CodeGen/AMDGPU/fma.f16.gfx11plus.ll
+++ b/llvm/test/CodeGen/AMDGPU/fma.f16.gfx11plus.ll
@@ -29,13 +29,14 @@ define amdgpu_kernel void @fma_v2f16_divergent(
   ; GFX11-NEXT:   [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY1]](s32), killed [[S_MOV_B32_]], implicit $exec
   ; GFX11-NEXT:   [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 1
   ; GFX11-NEXT:   [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = nuw nsw V_LSHLREV_B32_e64 killed [[S_MOV_B32_1]], killed [[V_AND_B32_e64_]], implicit $exec
-  ; GFX11-NEXT:   [[GLOBAL_LOAD_SHORT_D16_SADDR_t16_:%[0-9]+]]:vgpr_16 = GLOBAL_LOAD_SHORT_D16_SADDR_t16 killed [[REG_SEQUENCE1]], killed [[V_LSHLREV_B32_e64_]], 0, 0, implicit $exec :: (load (s16) from %ir.f.gep, addrspace 1)
-  ; GFX11-NEXT:   [[V_AND_B16_t16_e64_:%[0-9]+]]:vgpr_16 = V_AND_B16_t16_e64 0, 32767, 0, [[GLOBAL_LOAD_SHORT_D16_SADDR_t16_]], 0, implicit $exec
-  ; GFX11-NEXT:   [[V_XOR_B16_t16_e64_:%[0-9]+]]:vgpr_16 = V_XOR_B16_t16_e64 0, -32768, 0, [[GLOBAL_LOAD_SHORT_D16_SADDR_t16_]], 0, implicit $exec
+  ; GFX11-NEXT:   [[GLOBAL_LOAD_USHORT_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_USHORT_SADDR killed [[REG_SEQUENCE1]], killed [[V_LSHLREV_B32_e64_]], 0, 0, implicit $exec :: (load (s16) from %ir.f.gep, addrspace 1)
+  ; GFX11-NEXT:   [[COPY10:%[0-9]+]]:vgpr_16 = COPY [[GLOBAL_LOAD_USHORT_SADDR]].lo16
+  ; GFX11-NEXT:   [[V_AND_B16_t16_e64_:%[0-9]+]]:vgpr_16 = V_AND_B16_t16_e64 0, 32767, 0, [[COPY10]], 0, implicit $exec
+  ; GFX11-NEXT:   [[V_XOR_B16_t16_e64_:%[0-9]+]]:vgpr_16 = V_XOR_B16_t16_e64 0, -32768, 0, [[COPY10]], 0, implicit $exec
   ; GFX11-NEXT:   [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 0
   ; GFX11-NEXT:   S_CMP_LG_U32 killed [[S_LOAD_DWORD_IMM]], killed [[S_MOV_B32_2]], implicit-def $scc
-  ; GFX11-NEXT:   [[COPY10:%[0-9]+]]:sreg_32_xm0_xexec = COPY $scc
-  ; GFX11-NEXT:   [[V_CNDMASK_B16_t16_e64_:%[0-9]+]]:vgpr_16 = V_CNDMASK_B16_t16_e64 0, killed [[V_XOR_B16_t16_e64_]], 0, killed [[V_AND_B16_t16_e64_]], killed [[COPY10]], 0, implicit $exec
+  ; GFX11-NEXT:   [[COPY11:%[0-9]+]]:sreg_32_xm0_xexec = COPY $scc
+  ; GFX11-NEXT:   [[V_CNDMASK_B16_t16_e64_:%[0-9]+]]:vgpr_16 = V_CNDMASK_B16_t16_e64 0, killed [[V_XOR_B16_t16_e64_]], 0, killed [[V_AND_B16_t16_e64_]], killed [[COPY11]], 0, implicit $exec
   ; GFX11-NEXT:   [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
   ; GFX11-NEXT:   [[DEF1:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
   ; GFX11-NEXT:   [[REG_SEQUENCE4:%[0-9]+]]:vgpr_32 = REG_SEQUENCE killed [[V_CNDMASK_B16_t16_e64_]], %subreg.lo16, killed [[DEF]], %subreg.hi16
@@ -88,8 +89,8 @@ define amdgpu_kernel void @fma_v2f16_uniform(
   ; GFX11-NEXT:   [[COPY7:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX8_IMM]].sub7
   ; GFX11-NEXT:   [[COPY8:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX8_IMM]].sub6
   ; GFX11-NEXT:   [[REG_SEQUENCE3:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[COPY8]], %subreg.sub0, killed [[COPY7]], %subreg.sub1
-  ; GFX11-NEXT:   [[GLOBAL_LOAD_SHORT_D16_SADDR_t16_:%[0-9]+]]:vgpr_16 = GLOBAL_LOAD_SHORT_D16_SADDR_t16 killed [[REG_SEQUENCE1]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: ("amdgpu-noclobber" load (s16) from %ir.3, addrspace 1)
-  ; GFX11-NEXT:   [[COPY9:%[0-9]+]]:sreg_32 = COPY [[GLOBAL_LOAD_SHORT_D16_SADDR_t16_]]
+  ; GFX11-NEXT:   [[GLOBAL_LOAD_USHORT_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_USHORT_SADDR killed [[REG_SEQUENCE1]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: ("amdgpu-noclobber" load (s16) from %ir.3, addrspace 1)
+  ; GFX11-NEXT:   [[COPY9:%[0-9]+]]:sreg_32 = COPY [[GLOBAL_LOAD_USHORT_SADDR]].lo16
   ; GFX11-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 killed [[COPY9]]
   ; GFX11-NEXT:   [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM killed [[REG_SEQUENCE2]], 0, 0 :: ("amdgpu-noclobber" load (s32) from %ir.4, addrspace 1)
   ; GFX11-NEXT:   [[S_LOAD_DWORD_IMM1:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM killed [[REG_SEQUENCE3]], 0, 0 :: ("amdgpu-noclobber" load (s32) from %ir.5, addrspace 1)
diff --git a/llvm/test/CodeGen/AMDGPU/fmaximum.ll b/llvm/test/CodeGen/AMDGPU/fmaximum.ll
index e59fbada6793d..066230e683c3e 100644
--- a/llvm/test/CodeGen/AMDGPU/fmaximum.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmaximum.ll
@@ -1,7 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX12-SDAG,GFX12-SDAG-TRUE16 %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX12-SDAG,GFX12-SDAG-FAKE16 %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX12-GISEL,GFX12-GISEL-TRUE16 %s
+; FIXME-TRUE16 enable gisel
+; XUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX12-GISEL,GFX12-GISEL-TRUE16 %s
 ; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX12-GISEL,GFX12-GISEL-FAKE16 %s
 
 define amdgpu_ps float @test_fmaximum_f32_vv(float %a, float %b) {
@@ -122,15 +123,14 @@ define amdgpu_ps half @test_fmaximum_f16_vv(half %a, half %b) {
 ; GFX12-SDAG-FAKE16-NEXT:    v_maximum_f16 v0, v0, v1
 ; GFX12-SDAG-FAKE16-NEXT:    ; return to shader part epilog
 ;
+; GFX12-GISEL-LABEL: test_fmaximum_f16_vv:
+; GFX12-GISEL:       ; %bb.0:
+; GFX12-GISEL-NEXT:    v_maximum_f16 v0, v0, v1
+; GFX12-GISEL-NEXT:    ; return to shader part epilog
 ; GFX12-GISEL-TRUE16-LABEL: test_fmaximum_f16_vv:
 ; GFX12-GISEL-TRUE16:       ; %bb.0:
 ; GFX12-GISEL-TRUE16-NEXT:    v_maximum_f16 v0.l, v0.l, v1.l
 ; GFX12-GISEL-TRUE16-NEXT:    ; return to shader part epilog
-;
-; GFX12-GISEL-FAKE16-LABEL: test_fmaximum_f16_vv:
-; GFX12-GISEL-FAKE16:       ; %bb.0:
-; GFX12-GISEL-FAKE16-NEXT:    v_maximum_f16 v0, v0, v1
-; GFX12-GISEL-FAKE16-NEXT:    ; return to shader part epilog
   %val = call half @llvm.maximum.f16(half %a, half %b)
   ret half %val
 }
@@ -171,17 +171,16 @@ define amdgpu_ps <3 x half> @test_fmaximum_v3f16_vv(<3 x half> %a, <3 x half> %b
 ; GFX12-SDAG-NEXT:    v_pk_maximum_f16 v1, v1, v3
 ; GFX12-SDAG-NEXT:    ; return to shader part epilog
 ;
+; GFX12-GISEL-LABEL: test_fmaximum_v3f16_vv:
+; GFX12-GISEL:       ; %bb.0:
+; GFX12-GISEL-NEXT:    v_pk_maximum_f16 v0, v0, v2
+; GFX12-GISEL-NEXT:    v_maximum_f16 v1, v1, v3
+; GFX12-GISEL-NEXT:    ; return to shader part epilog
 ; GFX12-GISEL-TRUE16-LABEL: test_fmaximum_v3f16_vv:
 ; GFX12-GISEL-TRUE16:       ; %bb.0:
 ; GFX12-GISEL-TRUE16-NEXT:    v_pk_maximum_f16 v0, v0, v2
 ; GFX12-GISEL-TRUE16-NEXT:    v_maximum_f16 v1.l, v1.l, v3.l
 ; GFX12-GISEL-TRUE16-NEXT:    ; return to shader part epilog
-;
-; GFX12-GISEL-FAKE16-LABEL: test_fmaximum_v3f16_vv:
-; GFX12-GISEL-FAKE16:       ; %bb.0:
-; GFX12-GISEL-FAKE16-NEXT:    v_pk_maximum_f16 v0, v0, v2
-; GFX12-GISEL-FAKE16-NEXT:    v_maximum_f16 v1, v1, v3
-; GFX12-GISEL-FAKE16-NEXT:    ; return to shader part epilog
   %val = call <3 x half> @llvm.maximum.v3f16(<3 x half> %a, <3 x half> %b)
   ret <3 x half> %val
 }
@@ -335,6 +334,20 @@ define amdgpu_kernel void @fmaximum_f16_move_to_valu(ptr addrspace(1) %out, ptr
 ; GFX12-SDAG-FAKE16-NEXT:    global_store_b16 v0, v1, s[0:1]
 ; GFX12-SDAG-FAKE16-NEXT:    s_endpgm
 ;
+; GFX12-GISEL-LABEL: fmaximum_f16_move_to_valu:
+; GFX12-GISEL:       ; %bb.0:
+; GFX12-GISEL-NEXT:    s_clause 0x1
+; GFX12-GISEL-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-GISEL-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
+; GFX12-GISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT:    global_load_u16 v1, v0, s[2:3] scope:SCOPE_SYS
+; GFX12-GISEL-NEXT:    s_wait_loadcnt 0x0
+; GFX12-GISEL-NEXT:    global_load_u16 v2, v0, s[4:5] scope:SCOPE_SYS
+; GFX12-GISEL-NEXT:    s_wait_loadcnt 0x0
+; GFX12-GISEL-NEXT:    v_maximum_f16 v1, v1, v2
+; GFX12-GISEL-NEXT:    global_store_b16 v0, v1, s[0:1]
+; GFX12-GISEL-NEXT:    s_endpgm
 ; GFX12-GISEL-TRUE16-LABEL: fmaximum_f16_move_to_valu:
 ; GFX12-GISEL-TRUE16:       ; %bb.0:
 ; GFX12-GISEL-TRUE16-NEXT:    s_clause 0x1
@@ -349,21 +362,6 @@ define amdgpu_kernel void @fmaximum_f16_move_to_valu(ptr addrspace(1) %out, ptr
 ; GFX12-GISEL-TRUE16-NEXT:    v_maximum_f16 v0.l, v0.l, v0.h
 ; GFX12-GISEL-TRUE16-NEXT:    global_store_b16 v1, v0, s[0:1]
 ; GFX12-GISEL-TRUE16-NEXT:    s_endpgm
-;
-; GFX12-GISEL-FAKE16-LABEL: fmaximum_f16_move_to_valu:
-; GFX12-GISEL-FAKE16:       ; %bb.0:
-; GFX12-GISEL-FAKE16-NEXT:    s_clause 0x1
-; GFX12-GISEL-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX12-GISEL-FAKE16-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
-; GFX12-GISEL-FAKE16-NEXT:    v_mov_b32_e32 v0, 0
-; GFX12-GISEL-FAKE16-NEXT:    s_wait_kmcnt 0x0
-; GFX12-GISEL-FAKE16-NEXT:    global_load_u16 v1, v0, s[2:3] scope:SCOPE_SYS
-; GFX12-GISEL-FAKE16-NEXT:    s_wait_loadcnt 0x0
-; GFX12-GISEL-FAKE16-NEXT:    global_load_u16 v2, v0, s[4:5] scope:SCOPE_SYS
-; GFX12-GISEL-FAKE16-NEXT:    s_wait_loadcnt 0x0
-; GFX12-GISEL-FAKE16-NEXT:    v_maximum_f16 v1, v1, v2
-; GFX12-GISEL-FAKE16-NEXT:    global_store_b16 v0, v1, s[0:1]
-; GFX12-GISEL-FAKE16-NEXT:    s_endpgm
   %a = load volatile half, ptr addrspace(1) %aptr, align 4
   %b = load volatile half, ptr addrspace(1) %bptr, align 4
   %v = call half @llvm.maximum.f16(half %a, half %b)
@@ -383,3 +381,5 @@ declare <4 x half> @llvm.maximum.v4f16(<4 x half>, <4 x half>)
 declare double @llvm.maximum.f64(double, double)
 declare <2 x double> @llvm.maximum.v2f64(<2 x double>, <2 x double>)
 declare <4 x double> @llvm.maximum.v4f64(<4 x double>, <4 x double>)
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GFX12-GISEL-FAKE16: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/fmed3.ll b/llvm/test/CodeGen/AMDGPU/fmed3.ll
index 9233f8059a202..76f6693199281 100644
--- a/llvm/test/CodeGen/AMDGPU/fmed3.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmed3.ll
@@ -8,7 +8,8 @@
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 -mattr=-real-true16 < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11-SDAG,GFX11-SDAG-FAKE16 %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 -mattr=-real-true16 < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11-GISEL,GFX11-GISEL-FAKE16 %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 -mattr=+real-true16 < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11-SDAG,GFX11-SDAG-TRUE16 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 -mattr=+real-true16 < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11-GISEL,GFX11-GISEL-TRUE16 %s
+; FIXME-TRUE16 enable gisel
+; XUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 -mattr=+real-true16 < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11-GISEL,GFX11-GISEL-TRUE16 %s
 
 define amdgpu_kernel void @v_test_nnan_input_fmed3_r_i_i_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #1 {
 ; SI-SDAG-LABEL: v_test_nnan_input_fmed3_r_i_i_f32:
@@ -7547,19 +7548,19 @@ define amdgpu_kernel void @v_test_nnan_input_fmed3_r_i_i_f16(ptr addrspace(1) %o
 ; GFX11-SDAG-FAKE16-NEXT:    global_store_b16 v0, v1, s[0:1]
 ; GFX11-SDAG-FAKE16-NEXT:    s_endpgm
 ;
-; GFX11-GISEL-FAKE16-LABEL: v_test_nnan_input_fmed3_r_i_i_f16:
-; GFX11-GISEL-FAKE16:       ; %bb.0:
-; GFX11-GISEL-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-GISEL-FAKE16-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
-; GFX11-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
-; GFX11-GISEL-FAKE16-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
-; GFX11-GISEL-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-FAKE16-NEXT:    global_load_u16 v1, v0, s[2:3]
-; GFX11-GISEL-FAKE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-GISEL-FAKE16-NEXT:    v_add_f16_e32 v1, 1.0, v1
-; GFX11-GISEL-FAKE16-NEXT:    v_med3_f16 v1, v1, 2.0, 4.0
-; GFX11-GISEL-FAKE16-NEXT:    global_store_b16 v0, v1, s[0:1]
-; GFX11-GISEL-FAKE16-NEXT:    s_endpgm
+; GFX11-GISEL-LABEL: v_test_nnan_input_fmed3_r_i_i_f16:
+; GFX11-GISEL:       ; %bb.0:
+; GFX11-GISEL-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
+; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT:    global_load_u16 v1, v0, s[2:3]
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-GISEL-NEXT:    v_add_f16_e32 v1, 1.0, v1
+; GFX11-GISEL-NEXT:    v_med3_f16 v1, v1, 2.0, 4.0
+; GFX11-GISEL-NEXT:    global_store_b16 v0, v1, s[0:1]
+; GFX11-GISEL-NEXT:    s_endpgm
 ;
 ; GFX11-SDAG-TRUE16-LABEL: v_test_nnan_input_fmed3_r_i_i_f16:
 ; GFX11-SDAG-TRUE16:       ; %bb.0:
@@ -7568,13 +7569,12 @@ define amdgpu_kernel void @v_test_nnan_input_fmed3_r_i_i_f16(ptr addrspace(1) %o
 ; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
 ; GFX11-SDAG-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 1, v0
 ; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    global_load_d16_b16 v0, v1, s[2:3]
+; GFX11-SDAG-TRUE16-NEXT:    global_load_u16 v0, v1, s[2:3]
 ; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-SDAG-TRUE16-NEXT:    v_add_f16_e32 v0.l, 1.0, v0.l
 ; GFX11-SDAG-TRUE16-NEXT:    v_med3_f16 v0.l, v0.l, 2.0, 4.0
 ; GFX11-SDAG-TRUE16-NEXT:    global_store_b16 v1, v0, s[0:1]
 ; GFX11-SDAG-TRUE16-NEXT:    s_endpgm
-;
 ; GFX11-GISEL-TRUE16-LABEL: v_test_nnan_input_fmed3_r_i_i_f16:
 ; GFX11-GISEL-TRUE16:       ; %bb.0:
 ; GFX11-GISEL-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
@@ -7788,26 +7788,26 @@ define amdgpu_kernel void @v_nnan_inputs_med3_f16_pat0(ptr addrspace(1) %out, pt
 ; GFX11-SDAG-FAKE16-NEXT:    global_store_b16 v0, v1, s[0:1]
 ; GFX11-SDAG-FAKE16-NEXT:    s_endpgm
 ;
-; GFX11-GISEL-FAKE16-LABEL: v_nnan_inputs_med3_f16_pat0:
-; GFX11-GISEL-FAKE16:       ; %bb.0:
-; GFX11-GISEL-FAKE16-NEXT:    s_load_b256 s[0:7], s[4:5], 0x24
-; GFX11-GISEL-FAKE16-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
-; GFX11-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-GISEL-FAKE16-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
-; GFX11-GISEL-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-FAKE16-NEXT:    global_load_u16 v1, v0, s[2:3] glc dlc
-; GFX11-GISEL-FAKE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-GISEL-FAKE16-NEXT:    global_load_u16 v2, v0, s[4:5] glc dlc
-; GFX11-GISEL-FAKE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-GISEL-FAKE16-NEXT:    global_load_u16 v3, v0, s[6:7] glc dlc
-; GFX11-GISEL-FAKE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-GISEL-FAKE16-NEXT:    v_add_f16_e32 v1, 1.0, v1
-; GFX11-GISEL-FAKE16-NEXT:    v_add_f16_e32 v2, 2.0, v2
-; GFX11-GISEL-FAKE16-NEXT:    v_add_f16_e32 v3, 4.0, v3
-; GFX11-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-GISEL-FAKE16-NEXT:    v_med3_f16 v1, v1, v2, v3
-; GFX11-GISEL-FAKE16-NEXT:    global_store_b16 v0, v1, s[0:1]
-; GFX11-GISEL-FAKE16-NEXT:    s_endpgm
+; GFX11-GISEL-LABEL: v_nnan_inputs_med3_f16_pat0:
+; GFX11-GISEL:       ; %bb.0:
+; GFX11-GISEL-NEXT:    s_load_b256 s[0:7], s[4:5], 0x24
+; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
+; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT:    global_load_u16 v1, v0, s[2:3] glc dlc
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-GISEL-NEXT:    global_load_u16 v2, v0, s[4:5] glc dlc
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-GISEL-NEXT:    global_load_u16 v3, v0, s[6:7] glc dlc
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-GISEL-NEXT:    v_add_f16_e32 v1, 1.0, v1
+; GFX11-GISEL-NEXT:    v_add_f16_e32 v2, 2.0, v2
+; GFX11-GISEL-NEXT:    v_add_f16_e32 v3, 4.0, v3
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_med3_f16 v1, v1, v2, v3
+; GFX11-GISEL-NEXT:    global_store_b16 v0, v1, s[0:1]
+; GFX11-GISEL-NEXT:    s_endpgm
 ;
 ; GFX11-SDAG-TRUE16-LABEL: v_nnan_inputs_med3_f16_pat0:
 ; GFX11-SDAG-TRUE16:       ; %bb.0:
@@ -7816,20 +7816,19 @@ define amdgpu_kernel void @v_nnan_inputs_med3_f16_pat0(ptr addrspace(1) %out, pt
 ; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-SDAG-TRUE16-NEXT:    v_lshlrev_b32_e32 v2, 1, v0
 ; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    global_load_d16_b16 v0, v2, s[2:3] glc dlc
+; GFX11-SDAG-TRUE16-NEXT:    global_load_u16 v0, v2, s[2:3] glc dlc
 ; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    global_load_d16_hi_b16 v0, v2, s[4:5] glc dlc
+; GFX11-SDAG-TRUE16-NEXT:    global_load_u16 v1, v2, s[4:5] glc dlc
 ; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    global_load_d16_b16 v1, v2, s[6:7] glc dlc
+; GFX11-SDAG-TRUE16-NEXT:    global_load_u16 v3, v2, s[6:7] glc dlc
 ; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-SDAG-TRUE16-NEXT:    v_add_f16_e32 v0.l, 1.0, v0.l
-; GFX11-SDAG-TRUE16-NEXT:    v_add_f16_e32 v0.h, 2.0, v0.h
-; GFX11-SDAG-TRUE16-NEXT:    v_add_f16_e32 v1.l, 4.0, v1.l
+; GFX11-SDAG-TRUE16-NEXT:    v_add_f16_e32 v0.h, 2.0, v1.l
+; GFX11-SDAG-TRUE16-NEXT:    v_add_f16_e32 v1.l, 4.0, v3.l
 ; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-SDAG-TRUE16-NEXT:    v_med3_f16 v0.l, v0.l, v0.h, v1.l
 ; GFX11-SDAG-TRUE16-NEXT:    global_store_b16 v2, v0, s[0:1]
 ; GFX11-SDAG-TRUE16-NEXT:    s_endpgm
-;
 ; GFX11-GISEL-TRUE16-LABEL: v_nnan_inputs_med3_f16_pat0:
 ; GFX11-GISEL-TRUE16:       ; %bb.0:
 ; GFX11-GISEL-TRUE16-NEXT:    s_load_b256 s[0:7], s[4:5], 0x24
@@ -8752,13 +8751,13 @@ define half @v_test_fmed3_r_i_i_f16_minimumnum_maximumnum(half %a) #1 {
 ; GFX11-SDAG-FAKE16-NEXT:    v_med3_f16 v0, v0, 2.0, 4.0
 ; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-GISEL-FAKE16-LABEL: v_test_fmed3_r_i_i_f16_minimumnum_maximumnum:
-; GFX11-GISEL-FAKE16:       ; %bb.0:
-; GFX11-GISEL-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-FAKE16-NEXT:    v_max_f16_e32 v0, v0, v0
-; GFX11-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-GISEL-FAKE16-NEXT:    v_med3_f16 v0, v0, 2.0, 4.0
-; GFX11-GISEL-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-GISEL-LABEL: v_test_fmed3_r_i_i_f16_minimumnum_maximumnum:
+; GFX11-GISEL:       ; %bb.0:
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT:    v_max_f16_e32 v0, v0, v0
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_med3_f16 v0, v0, 2.0, 4.0
+; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-TRUE16-LABEL: v_test_fmed3_r_i_i_f16_minimumnum_maximumnum:
 ; GFX11-SDAG-TRUE16:       ; %bb.0:
@@ -8767,7 +8766,6 @@ define half @v_test_fmed3_r_i_i_f16_minimumnum_maximumnum(half %a) #1 {
 ; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-SDAG-TRUE16-NEXT:    v_med3_f16 v0.l, v0.l, 2.0, 4.0
 ; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX11-GISEL-TRUE16-LABEL: v_test_fmed3_r_i_i_f16_minimumnum_maximumnum:
 ; GFX11-GISEL-TRUE16:       ; %bb.0:
 ; GFX11-GISEL-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -8917,3 +8915,5 @@ declare half @llvm.maxnum.f16(half, half) #0
 attributes #0 = { nounwind readnone }
 attributes #1 = { nounwind "unsafe-fp-math"="false" "no-nans-fp-math"="false" }
 attributes #2 = { nounwind "unsafe-fp-math"="false" "no-nans-fp-math"="true" }
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GFX11-GISEL-FAKE16: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/fminimum.ll b/llvm/test/CodeGen/AMDGPU/fminimum.ll
index b25120f2ece6f..e6936e4bdff6c 100644
--- a/llvm/test/CodeGen/AMDGPU/fminimum.ll
+++ b/llvm/test/CodeGen/AMDGPU/fminimum.ll
@@ -1,7 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX12-SDAG,GFX12-SDAG-TRUE16 %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX12-SDAG,GFX12-SDAG-FAKE16 %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX12-GISEL,GFX12-GISEL-TRUE16 %s
+; FIXME-TRUE16 enable gisel
+; XUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX12-GISEL,GFX12-GISEL-TRUE16 %s
 ; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX12-GISEL,GFX12-GISEL-FAKE16 %s
 
 define amdgpu_ps float @test_fminimum_f32_vv(float %a, float %b) {
@@ -122,15 +123,14 @@ define amdgpu_ps half @test_fminimum_f16_vv(half %a, half %b) {
 ; GFX12-SDAG-FAKE16-NEXT:    v_minimum_f16 v0, v0, v1
 ; GFX12-SDAG-FAKE16-NEXT:    ; return to shader part epilog
 ;
+; GFX12-GISEL-LABEL: test_fminimum_f16_vv:
+; GFX12-GISEL:       ; %bb.0:
+; GFX12-GISEL-NEXT:    v_minimum_f16 v0, v0, v1
+; GFX12-GISEL-NEXT:    ; return to shader part epilog
 ; GFX12-GISEL-TRUE16-LABEL: test_fminimum_f16_vv:
 ; GFX12-GISEL-TRUE16:       ; %bb.0:
 ; GFX12-GISEL-TRUE16-NEXT:    v_minimum_f16 v0.l, v0.l, v1.l
 ; GFX12-GISEL-TRUE16-NEXT:    ; return to shader part epilog
-;
-; GFX12-GISEL-FAKE16-LABEL: test_fminimum_f16_vv:
-; GFX12-GISEL-FAKE16:       ; %bb.0:
-; GFX12-GISEL-FAKE16-NEXT:    v_minimum_f16 v0, v0, v1
-; GFX12-GISEL-FAKE16-NEXT:    ; return to shader part epilog
   %val = call half @llvm.minimum.f16(half %a, half %b)
   ret half %val
 }
@@ -171,17 +171,16 @@ define amdgpu_ps <3 x half> @test_fminimum_v3f16_vv(<3 x half> %a, <3 x half> %b
 ; GFX12-SDAG-NEXT:    v_pk_minimum_f16 v1, v1, v3
 ; GFX12-SDAG-NEXT:    ; return to shader part epilog
 ;
+; GFX12-GISEL-LABEL: test_fminimum_v3f16_vv:
+; GFX12-GISEL:       ; %bb.0:
+; GFX12-GISEL-NEXT:    v_pk_minimum_f16 v0, v0, v2
+; GFX12-GISEL-NEXT:    v_minimum_f16 v1, v1, v3
+; GFX12-GISEL-NEXT:    ; return to shader part epilog
 ; GFX12-GISEL-TRUE16-LABEL: test_fminimum_v3f16_vv:
 ; GFX12-GISEL-TRUE16:       ; %bb.0:
 ; GFX12-GISEL-TRUE16-NEXT:    v_pk_minimum_f16 v0, v0, v2
 ; GFX12-GISEL-TRUE16-NEXT:    v_minimum_f16 v1.l, v1.l, v3.l
 ; GFX12-GISEL-TRUE16-NEXT:    ; return to shader part epilog
-;
-; GFX12-GISEL-FAKE16-LABEL: test_fminimum_v3f16_vv:
-; GFX12-GISEL-FAKE16:       ; %bb.0:
-; GFX12-GISEL-FAKE16-NEXT:    v_pk_minimum_f16 v0, v0, v2
-; GFX12-GISEL-FAKE16-NEXT:    v_minimum_f16 v1, v1, v3
-; GFX12-GISEL-FAKE16-NEXT:    ; return to shader part epilog
   %val = call <3 x half> @llvm.minimum.v3f16(<3 x half> %a, <3 x half> %b)
   ret <3 x half> %val
 }
@@ -335,6 +334,20 @@ define amdgpu_kernel void @fminimum_f16_move_to_valu(ptr addrspace(1) %out, ptr
 ; GFX12-SDAG-FAKE16-NEXT:    global_store_b16 v0, v1, s[0:1]
 ; GFX12-SDAG-FAKE16-NEXT:    s_endpgm
 ;
+; GFX12-GISEL-LABEL: fminimum_f16_move_to_valu:
+; GFX12-GISEL:       ; %bb.0:
+; GFX12-GISEL-NEXT:    s_clause 0x1
+; GFX12-GISEL-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-GISEL-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
+; GFX12-GISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT:    global_load_u16 v1, v0, s[2:3] scope:SCOPE_SYS
+; GFX12-GISEL-NEXT:    s_wait_loadcnt 0x0
+; GFX12-GISEL-NEXT:    global_load_u16 v2, v0, s[4:5] scope:SCOPE_SYS
+; GFX12-GISEL-NEXT:    s_wait_loadcnt 0x0
+; GFX12-GISEL-NEXT:    v_minimum_f16 v1, v1, v2
+; GFX12-GISEL-NEXT:    global_store_b16 v0, v1, s[0:1]
+; GFX12-GISEL-NEXT:    s_endpgm
 ; GFX12-GISEL-TRUE16-LABEL: fminimum_f16_move_to_valu:
 ; GFX12-GISEL-TRUE16:       ; %bb.0:
 ; GFX12-GISEL-TRUE16-NEXT:    s_clause 0x1
@@ -349,21 +362,6 @@ define amdgpu_kernel void @fminimum_f16_move_to_valu(ptr addrspace(1) %out, ptr
 ; GFX12-GISEL-TRUE16-NEXT:    v_minimum_f16 v0.l, v0.l, v0.h
 ; GFX12-GISEL-TRUE16-NEXT:    global_store_b16 v1, v0, s[0:1]
 ; GFX12-GISEL-TRUE16-NEXT:    s_endpgm
-;
-; GFX12-GISEL-FAKE16-LABEL: fminimum_f16_move_to_valu:
-; GFX12-GISEL-FAKE16:       ; %bb.0:
-; GFX12-GISEL-FAKE16-NEXT:    s_clause 0x1
-; GFX12-GISEL-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX12-GISEL-FAKE16-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
-; GFX12-GISEL-FAKE16-NEXT:    v_mov_b32_e32 v0, 0
-; GFX12-GISEL-FAKE16-NEXT:    s_wait_kmcnt 0x0
-; GFX12-GISEL-FAKE16-NEXT:    global_load_u16 v1, v0, s[2:3] scope:SCOPE_SYS
-; GFX12-GISEL-FAKE16-NEXT:    s_wait_loadcnt 0x0
-; GFX12-GISEL-FAKE16-NEXT:    global_load_u16 v2, v0, s[4:5] scope:SCOPE_SYS
-; GFX12-GISEL-FAKE16-NEXT:    s_wait_loadcnt 0x0
-; GFX12-GISEL-FAKE16-NEXT:    v_minimum_f16 v1, v1, v2
-; GFX12-GISEL-FAKE16-NEXT:    global_store_b16 v0, v1, s[0:1]
-; GFX12-GISEL-FAKE16-NEXT:    s_endpgm
   %a = load volatile half, ptr addrspace(1) %aptr, align 4
   %b = load volatile half, ptr addrspace(1) %bptr, align 4
   %v = call half @llvm.minimum.f16(half %a, half %b)
@@ -383,3 +381,5 @@ declare <4 x half> @llvm.minimum.v4f16(<4 x half>, <4 x half>)
 declare double @llvm.minimum.f64(double, double)
 declare <2 x double> @llvm.minimum.v2f64(<2 x double>, <2 x double>)
 declare <4 x double> @llvm.minimum.v4f64(<4 x double>, <4 x double>)
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GFX12-GISEL-FAKE16: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/fmuladd.f16.ll b/llvm/test/CodeGen/AMDGPU/fmuladd.f16.ll
index 51b6d17312ed7..d258329128994 100644
--- a/llvm/test/CodeGen/AMDGPU/fmuladd.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmuladd.f16.ll
@@ -97,18 +97,18 @@ define amdgpu_kernel void @fmuladd_f16(ptr addrspace(1) %out, ptr addrspace(1) %
 ; GFX11-FLUSH-TRUE16-LABEL: fmuladd_f16:
 ; GFX11-FLUSH-TRUE16:       ; %bb.0:
 ; GFX11-FLUSH-TRUE16-NEXT:    s_load_b256 s[0:7], s[4:5], 0x24
-; GFX11-FLUSH-TRUE16-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-FLUSH-TRUE16-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX11-FLUSH-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-FLUSH-TRUE16-NEXT:    s_clause 0x2
-; GFX11-FLUSH-TRUE16-NEXT:    global_load_d16_b16 v0, v2, s[2:3]
-; GFX11-FLUSH-TRUE16-NEXT:    global_load_d16_hi_b16 v0, v2, s[4:5]
-; GFX11-FLUSH-TRUE16-NEXT:    global_load_d16_b16 v1, v2, s[6:7]
+; GFX11-FLUSH-TRUE16-NEXT:    global_load_u16 v0, v1, s[2:3]
+; GFX11-FLUSH-TRUE16-NEXT:    global_load_u16 v2, v1, s[4:5]
+; GFX11-FLUSH-TRUE16-NEXT:    global_load_u16 v3, v1, s[6:7]
 ; GFX11-FLUSH-TRUE16-NEXT:    s_waitcnt vmcnt(1)
-; GFX11-FLUSH-TRUE16-NEXT:    v_mul_f16_e32 v0.l, v0.l, v0.h
+; GFX11-FLUSH-TRUE16-NEXT:    v_mul_f16_e32 v0.l, v0.l, v2.l
 ; GFX11-FLUSH-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-FLUSH-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FLUSH-TRUE16-NEXT:    v_add_f16_e32 v0.l, v0.l, v1.l
-; GFX11-FLUSH-TRUE16-NEXT:    global_store_b16 v2, v0, s[0:1]
+; GFX11-FLUSH-TRUE16-NEXT:    v_add_f16_e32 v0.l, v0.l, v3.l
+; GFX11-FLUSH-TRUE16-NEXT:    global_store_b16 v1, v0, s[0:1]
 ; GFX11-FLUSH-TRUE16-NEXT:    s_endpgm
 ;
 ; GFX11-FLUSH-FAKE16-LABEL: fmuladd_f16:
@@ -131,15 +131,15 @@ define amdgpu_kernel void @fmuladd_f16(ptr addrspace(1) %out, ptr addrspace(1) %
 ; GFX11-DENORM-STRICT-TRUE16-LABEL: fmuladd_f16:
 ; GFX11-DENORM-STRICT-TRUE16:       ; %bb.0:
 ; GFX11-DENORM-STRICT-TRUE16-NEXT:    s_load_b256 s[0:7], s[4:5], 0x24
-; GFX11-DENORM-STRICT-TRUE16-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-DENORM-STRICT-TRUE16-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX11-DENORM-STRICT-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-DENORM-STRICT-TRUE16-NEXT:    s_clause 0x2
-; GFX11-DENORM-STRICT-TRUE16-NEXT:    global_load_d16_b16 v0, v2, s[2:3]
-; GFX11-DENORM-STRICT-TRUE16-NEXT:    global_load_d16_hi_b16 v0, v2, s[4:5]
-; GFX11-DENORM-STRICT-TRUE16-NEXT:    global_load_d16_b16 v1, v2, s[6:7]
+; GFX11-DENORM-STRICT-TRUE16-NEXT:    global_load_u16 v0, v1, s[6:7]
+; GFX11-DENORM-STRICT-TRUE16-NEXT:    global_load_u16 v2, v1, s[2:3]
+; GFX11-DENORM-STRICT-TRUE16-NEXT:    global_load_u16 v3, v1, s[4:5]
 ; GFX11-DENORM-STRICT-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-DENORM-STRICT-TRUE16-NEXT:    v_fmac_f16_e32 v1.l, v0.l, v0.h
-; GFX11-DENORM-STRICT-TRUE16-NEXT:    global_store_b16 v2, v1, s[0:1]
+; GFX11-DENORM-STRICT-TRUE16-NEXT:    v_fmac_f16_e32 v0.l, v2.l, v3.l
+; GFX11-DENORM-STRICT-TRUE16-NEXT:    global_store_b16 v1, v0, s[0:1]
 ; GFX11-DENORM-STRICT-TRUE16-NEXT:    s_endpgm
 ;
 ; GFX11-DENORM-STRICT-FAKE16-LABEL: fmuladd_f16:
@@ -159,15 +159,15 @@ define amdgpu_kernel void @fmuladd_f16(ptr addrspace(1) %out, ptr addrspace(1) %
 ; GFX11-DENORM-CONTRACT-TRUE16-LABEL: fmuladd_f16:
 ; GFX11-DENORM-CONTRACT-TRUE16:       ; %bb.0:
 ; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    s_load_b256 s[0:7], s[4:5], 0x24
-; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    s_clause 0x2
-; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    global_load_d16_b16 v0, v2, s[2:3]
-; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    global_load_d16_hi_b16 v0, v2, s[4:5]
-; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    global_load_d16_b16 v1, v2, s[6:7]
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    global_load_u16 v0, v1, s[6:7]
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    global_load_u16 v2, v1, s[2:3]
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    global_load_u16 v3, v1, s[4:5]
 ; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    v_fmac_f16_e32 v1.l, v0.l, v0.h
-; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    global_store_b16 v2, v1, s[0:1]
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    v_fmac_f16_e32 v0.l, v2.l, v3.l
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    global_store_b16 v1, v0, s[0:1]
 ; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    s_endpgm
 ;
 ; GFX11-DENORM-CONTRACT-FAKE16-LABEL: fmuladd_f16:
@@ -282,18 +282,18 @@ define amdgpu_kernel void @fmul_fadd_f16(ptr addrspace(1) %out, ptr addrspace(1)
 ; GFX11-FLUSH-TRUE16-LABEL: fmul_fadd_f16:
 ; GFX11-FLUSH-TRUE16:       ; %bb.0:
 ; GFX11-FLUSH-TRUE16-NEXT:    s_load_b256 s[0:7], s[4:5], 0x24
-; GFX11-FLUSH-TRUE16-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-FLUSH-TRUE16-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX11-FLUSH-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-FLUSH-TRUE16-NEXT:    s_clause 0x2
-; GFX11-FLUSH-TRUE16-NEXT:    global_load_d16_b16 v0, v2, s[2:3]
-; GFX11-FLUSH-TRUE16-NEXT:    global_load_d16_hi_b16 v0, v2, s[4:5]
-; GFX11-FLUSH-TRUE16-NEXT:    global_load_d16_b16 v1, v2, s[6:7]
+; GFX11-FLUSH-TRUE16-NEXT:    global_load_u16 v0, v1, s[2:3]
+; GFX11-FLUSH-TRUE16-NEXT:    global_load_u16 v2, v1, s[4:5]
+; GFX11-FLUSH-TRUE16-NEXT:    global_load_u16 v3, v1, s[6:7]
 ; GFX11-FLUSH-TRUE16-NEXT:    s_waitcnt vmcnt(1)
-; GFX11-FLUSH-TRUE16-NEXT:    v_mul_f16_e32 v0.l, v0.l, v0.h
+; GFX11-FLUSH-TRUE16-NEXT:    v_mul_f16_e32 v0.l, v0.l, v2.l
 ; GFX11-FLUSH-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-FLUSH-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FLUSH-TRUE16-NEXT:    v_add_f16_e32 v0.l, v0.l, v1.l
-; GFX11-FLUSH-TRUE16-NEXT:    global_store_b16 v2, v0, s[0:1]
+; GFX11-FLUSH-TRUE16-NEXT:    v_add_f16_e32 v0.l, v0.l, v3.l
+; GFX11-FLUSH-TRUE16-NEXT:    global_store_b16 v1, v0, s[0:1]
 ; GFX11-FLUSH-TRUE16-NEXT:    s_endpgm
 ;
 ; GFX11-FLUSH-FAKE16-LABEL: fmul_fadd_f16:
@@ -316,18 +316,18 @@ define amdgpu_kernel void @fmul_fadd_f16(ptr addrspace(1) %out, ptr addrspace(1)
 ; GFX11-DENORM-STRICT-TRUE16-LABEL: fmul_fadd_f16:
 ; GFX11-DENORM-STRICT-TRUE16:       ; %bb.0:
 ; GFX11-DENORM-STRICT-TRUE16-NEXT:    s_load_b256 s[0:7], s[4:5], 0x24
-; GFX11-DENORM-STRICT-TRUE16-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-DENORM-STRICT-TRUE16-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX11-DENORM-STRICT-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-DENORM-STRICT-TRUE16-NEXT:    s_clause 0x2
-; GFX11-DENORM-STRICT-TRUE16-NEXT:    global_load_d16_b16 v0, v2, s[2:3]
-; GFX11-DENORM-STRICT-TRUE16-NEXT:    global_load_d16_hi_b16 v0, v2, s[4:5]
-; GFX11-DENORM-STRICT-TRUE16-NEXT:    global_load_d16_b16 v1, v2, s[6:7]
+; GFX11-DENORM-STRICT-TRUE16-NEXT:    global_load_u16 v0, v1, s[2:3]
+; GFX11-DENORM-STRICT-TRUE16-NEXT:    global_load_u16 v2, v1, s[4:5]
+; GFX11-DENORM-STRICT-TRUE16-NEXT:    global_load_u16 v3, v1, s[6:7]
 ; GFX11-DENORM-STRICT-TRUE16-NEXT:    s_waitcnt vmcnt(1)
-; GFX11-DENORM-STRICT-TRUE16-NEXT:    v_mul_f16_e32 v0.l, v0.l, v0.h
+; GFX11-DENORM-STRICT-TRUE16-NEXT:    v_mul_f16_e32 v0.l, v0.l, v2.l
 ; GFX11-DENORM-STRICT-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-DENORM-STRICT-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-DENORM-STRICT-TRUE16-NEXT:    v_add_f16_e32 v0.l, v0.l, v1.l
-; GFX11-DENORM-STRICT-TRUE16-NEXT:    global_store_b16 v2, v0, s[0:1]
+; GFX11-DENORM-STRICT-TRUE16-NEXT:    v_add_f16_e32 v0.l, v0.l, v3.l
+; GFX11-DENORM-STRICT-TRUE16-NEXT:    global_store_b16 v1, v0, s[0:1]
 ; GFX11-DENORM-STRICT-TRUE16-NEXT:    s_endpgm
 ;
 ; GFX11-DENORM-STRICT-FAKE16-LABEL: fmul_fadd_f16:
@@ -350,15 +350,15 @@ define amdgpu_kernel void @fmul_fadd_f16(ptr addrspace(1) %out, ptr addrspace(1)
 ; GFX11-DENORM-CONTRACT-TRUE16-LABEL: fmul_fadd_f16:
 ; GFX11-DENORM-CONTRACT-TRUE16:       ; %bb.0:
 ; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    s_load_b256 s[0:7], s[4:5], 0x24
-; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    s_clause 0x2
-; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    global_load_d16_b16 v0, v2, s[2:3]
-; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    global_load_d16_hi_b16 v0, v2, s[4:5]
-; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    global_load_d16_b16 v1, v2, s[6:7]
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    global_load_u16 v0, v1, s[6:7]
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    global_load_u16 v2, v1, s[2:3]
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    global_load_u16 v3, v1, s[4:5]
 ; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    v_fmac_f16_e32 v1.l, v0.l, v0.h
-; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    global_store_b16 v2, v1, s[0:1]
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    v_fmac_f16_e32 v0.l, v2.l, v3.l
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    global_store_b16 v1, v0, s[0:1]
 ; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    s_endpgm
 ;
 ; GFX11-DENORM-CONTRACT-FAKE16-LABEL: fmul_fadd_f16:
@@ -458,18 +458,18 @@ define amdgpu_kernel void @fmul_fadd_contract_f16(ptr addrspace(1) %out, ptr add
 ; GFX11-FLUSH-TRUE16-LABEL: fmul_fadd_contract_f16:
 ; GFX11-FLUSH-TRUE16:       ; %bb.0:
 ; GFX11-FLUSH-TRUE16-NEXT:    s_load_b256 s[0:7], s[4:5], 0x24
-; GFX11-FLUSH-TRUE16-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-FLUSH-TRUE16-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX11-FLUSH-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-FLUSH-TRUE16-NEXT:    s_clause 0x2
-; GFX11-FLUSH-TRUE16-NEXT:    global_load_d16_b16 v0, v2, s[2:3]
-; GFX11-FLUSH-TRUE16-NEXT:    global_load_d16_hi_b16 v0, v2, s[4:5]
-; GFX11-FLUSH-TRUE16-NEXT:    global_load_d16_b16 v1, v2, s[6:7]
+; GFX11-FLUSH-TRUE16-NEXT:    global_load_u16 v0, v1, s[2:3]
+; GFX11-FLUSH-TRUE16-NEXT:    global_load_u16 v2, v1, s[4:5]
+; GFX11-FLUSH-TRUE16-NEXT:    global_load_u16 v3, v1, s[6:7]
 ; GFX11-FLUSH-TRUE16-NEXT:    s_waitcnt vmcnt(1)
-; GFX11-FLUSH-TRUE16-NEXT:    v_mul_f16_e32 v0.l, v0.l, v0.h
+; GFX11-FLUSH-TRUE16-NEXT:    v_mul_f16_e32 v0.l, v0.l, v2.l
 ; GFX11-FLUSH-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-FLUSH-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FLUSH-TRUE16-NEXT:    v_add_f16_e32 v0.l, v0.l, v1.l
-; GFX11-FLUSH-TRUE16-NEXT:    global_store_b16 v2, v0, s[0:1]
+; GFX11-FLUSH-TRUE16-NEXT:    v_add_f16_e32 v0.l, v0.l, v3.l
+; GFX11-FLUSH-TRUE16-NEXT:    global_store_b16 v1, v0, s[0:1]
 ; GFX11-FLUSH-TRUE16-NEXT:    s_endpgm
 ;
 ; GFX11-FLUSH-FAKE16-LABEL: fmul_fadd_contract_f16:
@@ -492,15 +492,15 @@ define amdgpu_kernel void @fmul_fadd_contract_f16(ptr addrspace(1) %out, ptr add
 ; GFX11-DENORM-STRICT-TRUE16-LABEL: fmul_fadd_contract_f16:
 ; GFX11-DENORM-STRICT-TRUE16:       ; %bb.0:
 ; GFX11-DENORM-STRICT-TRUE16-NEXT:    s_load_b256 s[0:7], s[4:5], 0x24
-; GFX11-DENORM-STRICT-TRUE16-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-DENORM-STRICT-TRUE16-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX11-DENORM-STRICT-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-DENORM-STRICT-TRUE16-NEXT:    s_clause 0x2
-; GFX11-DENORM-STRICT-TRUE16-NEXT:    global_load_d16_b16 v0, v2, s[2:3]
-; GFX11-DENORM-STRICT-TRUE16-NEXT:    global_load_d16_hi_b16 v0, v2, s[4:5]
-; GFX11-DENORM-STRICT-TRUE16-NEXT:    global_load_d16_b16 v1, v2, s[6:7]
+; GFX11-DENORM-STRICT-TRUE16-NEXT:    global_load_u16 v0, v1, s[6:7]
+; GFX11-DENORM-STRICT-TRUE16-NEXT:    global_load_u16 v2, v1, s[2:3]
+; GFX11-DENORM-STRICT-TRUE16-NEXT:    global_load_u16 v3, v1, s[4:5]
 ; GFX11-DENORM-STRICT-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-DENORM-STRICT-TRUE16-NEXT:    v_fmac_f16_e32 v1.l, v0.l, v0.h
-; GFX11-DENORM-STRICT-TRUE16-NEXT:    global_store_b16 v2, v1, s[0:1]
+; GFX11-DENORM-STRICT-TRUE16-NEXT:    v_fmac_f16_e32 v0.l, v2.l, v3.l
+; GFX11-DENORM-STRICT-TRUE16-NEXT:    global_store_b16 v1, v0, s[0:1]
 ; GFX11-DENORM-STRICT-TRUE16-NEXT:    s_endpgm
 ;
 ; GFX11-DENORM-STRICT-FAKE16-LABEL: fmul_fadd_contract_f16:
@@ -520,15 +520,15 @@ define amdgpu_kernel void @fmul_fadd_contract_f16(ptr addrspace(1) %out, ptr add
 ; GFX11-DENORM-CONTRACT-TRUE16-LABEL: fmul_fadd_contract_f16:
 ; GFX11-DENORM-CONTRACT-TRUE16:       ; %bb.0:
 ; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    s_load_b256 s[0:7], s[4:5], 0x24
-; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    s_clause 0x2
-; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    global_load_d16_b16 v0, v2, s[2:3]
-; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    global_load_d16_hi_b16 v0, v2, s[4:5]
-; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    global_load_d16_b16 v1, v2, s[6:7]
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    global_load_u16 v0, v1, s[6:7]
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    global_load_u16 v2, v1, s[2:3]
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    global_load_u16 v3, v1, s[4:5]
 ; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    v_fmac_f16_e32 v1.l, v0.l, v0.h
-; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    global_store_b16 v2, v1, s[0:1]
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    v_fmac_f16_e32 v0.l, v2.l, v3.l
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    global_store_b16 v1, v0, s[0:1]
 ; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    s_endpgm
 ;
 ; GFX11-DENORM-CONTRACT-FAKE16-LABEL: fmul_fadd_contract_f16:
@@ -625,13 +625,13 @@ define amdgpu_kernel void @fmuladd_2.0_a_b_f16(ptr addrspace(1) %out, ptr addrsp
 ; GFX11-FLUSH-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-FLUSH-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 1, v0
 ; GFX11-FLUSH-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-FLUSH-TRUE16-NEXT:    global_load_d16_b16 v0, v1, s[0:1] glc dlc
+; GFX11-FLUSH-TRUE16-NEXT:    global_load_u16 v0, v1, s[0:1] glc dlc
 ; GFX11-FLUSH-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-FLUSH-TRUE16-NEXT:    global_load_d16_hi_b16 v0, v1, s[0:1] offset:2 glc dlc
+; GFX11-FLUSH-TRUE16-NEXT:    global_load_u16 v2, v1, s[0:1] offset:2 glc dlc
 ; GFX11-FLUSH-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-FLUSH-TRUE16-NEXT:    v_add_f16_e32 v0.l, v0.l, v0.l
 ; GFX11-FLUSH-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FLUSH-TRUE16-NEXT:    v_add_f16_e32 v0.l, v0.l, v0.h
+; GFX11-FLUSH-TRUE16-NEXT:    v_add_f16_e32 v0.l, v0.l, v2.l
 ; GFX11-FLUSH-TRUE16-NEXT:    global_store_b16 v1, v0, s[0:1]
 ; GFX11-FLUSH-TRUE16-NEXT:    s_endpgm
 ;
@@ -657,14 +657,14 @@ define amdgpu_kernel void @fmuladd_2.0_a_b_f16(ptr addrspace(1) %out, ptr addrsp
 ; GFX11-DENORM-STRICT-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX11-DENORM-STRICT-TRUE16-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
 ; GFX11-DENORM-STRICT-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-DENORM-STRICT-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 1, v0
+; GFX11-DENORM-STRICT-TRUE16-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX11-DENORM-STRICT-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-DENORM-STRICT-TRUE16-NEXT:    global_load_d16_b16 v0, v1, s[0:1] glc dlc
+; GFX11-DENORM-STRICT-TRUE16-NEXT:    global_load_u16 v1, v0, s[0:1] glc dlc
 ; GFX11-DENORM-STRICT-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-DENORM-STRICT-TRUE16-NEXT:    global_load_d16_hi_b16 v0, v1, s[0:1] offset:2 glc dlc
+; GFX11-DENORM-STRICT-TRUE16-NEXT:    global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
 ; GFX11-DENORM-STRICT-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-DENORM-STRICT-TRUE16-NEXT:    v_fmac_f16_e32 v0.h, 2.0, v0.l
-; GFX11-DENORM-STRICT-TRUE16-NEXT:    global_store_d16_hi_b16 v1, v0, s[0:1]
+; GFX11-DENORM-STRICT-TRUE16-NEXT:    v_fmac_f16_e32 v2.l, 2.0, v1.l
+; GFX11-DENORM-STRICT-TRUE16-NEXT:    global_store_b16 v0, v2, s[0:1]
 ; GFX11-DENORM-STRICT-TRUE16-NEXT:    s_endpgm
 ;
 ; GFX11-DENORM-STRICT-FAKE16-LABEL: fmuladd_2.0_a_b_f16:
@@ -687,14 +687,14 @@ define amdgpu_kernel void @fmuladd_2.0_a_b_f16(ptr addrspace(1) %out, ptr addrsp
 ; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
 ; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 1, v0
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    global_load_d16_b16 v0, v1, s[0:1] glc dlc
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    global_load_u16 v1, v0, s[0:1] glc dlc
 ; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    global_load_d16_hi_b16 v0, v1, s[0:1] offset:2 glc dlc
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
 ; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    v_fmac_f16_e32 v0.h, 2.0, v0.l
-; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    global_store_d16_hi_b16 v1, v0, s[0:1]
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    v_fmac_f16_e32 v2.l, 2.0, v1.l
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    global_store_b16 v0, v2, s[0:1]
 ; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    s_endpgm
 ;
 ; GFX11-DENORM-CONTRACT-FAKE16-LABEL: fmuladd_2.0_a_b_f16:
@@ -795,13 +795,13 @@ define amdgpu_kernel void @fmuladd_a_2.0_b_f16(ptr addrspace(1) %out, ptr addrsp
 ; GFX11-FLUSH-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-FLUSH-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 1, v0
 ; GFX11-FLUSH-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-FLUSH-TRUE16-NEXT:    global_load_d16_b16 v0, v1, s[0:1] glc dlc
+; GFX11-FLUSH-TRUE16-NEXT:    global_load_u16 v0, v1, s[0:1] glc dlc
 ; GFX11-FLUSH-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-FLUSH-TRUE16-NEXT:    global_load_d16_hi_b16 v0, v1, s[0:1] offset:2 glc dlc
+; GFX11-FLUSH-TRUE16-NEXT:    global_load_u16 v2, v1, s[0:1] offset:2 glc dlc
 ; GFX11-FLUSH-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-FLUSH-TRUE16-NEXT:    v_add_f16_e32 v0.l, v0.l, v0.l
 ; GFX11-FLUSH-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FLUSH-TRUE16-NEXT:    v_add_f16_e32 v0.l, v0.l, v0.h
+; GFX11-FLUSH-TRUE16-NEXT:    v_add_f16_e32 v0.l, v0.l, v2.l
 ; GFX11-FLUSH-TRUE16-NEXT:    global_store_b16 v1, v0, s[0:1]
 ; GFX11-FLUSH-TRUE16-NEXT:    s_endpgm
 ;
@@ -827,14 +827,14 @@ define amdgpu_kernel void @fmuladd_a_2.0_b_f16(ptr addrspace(1) %out, ptr addrsp
 ; GFX11-DENORM-STRICT-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX11-DENORM-STRICT-TRUE16-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
 ; GFX11-DENORM-STRICT-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-DENORM-STRICT-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 1, v0
+; GFX11-DENORM-STRICT-TRUE16-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX11-DENORM-STRICT-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-DENORM-STRICT-TRUE16-NEXT:    global_load_d16_b16 v0, v1, s[0:1] glc dlc
+; GFX11-DENORM-STRICT-TRUE16-NEXT:    global_load_u16 v1, v0, s[0:1] glc dlc
 ; GFX11-DENORM-STRICT-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-DENORM-STRICT-TRUE16-NEXT:    global_load_d16_hi_b16 v0, v1, s[0:1] offset:2 glc dlc
+; GFX11-DENORM-STRICT-TRUE16-NEXT:    global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
 ; GFX11-DENORM-STRICT-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-DENORM-STRICT-TRUE16-NEXT:    v_fmac_f16_e32 v0.h, 2.0, v0.l
-; GFX11-DENORM-STRICT-TRUE16-NEXT:    global_store_d16_hi_b16 v1, v0, s[0:1]
+; GFX11-DENORM-STRICT-TRUE16-NEXT:    v_fmac_f16_e32 v2.l, 2.0, v1.l
+; GFX11-DENORM-STRICT-TRUE16-NEXT:    global_store_b16 v0, v2, s[0:1]
 ; GFX11-DENORM-STRICT-TRUE16-NEXT:    s_endpgm
 ;
 ; GFX11-DENORM-STRICT-FAKE16-LABEL: fmuladd_a_2.0_b_f16:
@@ -857,14 +857,14 @@ define amdgpu_kernel void @fmuladd_a_2.0_b_f16(ptr addrspace(1) %out, ptr addrsp
 ; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
 ; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 1, v0
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    global_load_d16_b16 v0, v1, s[0:1] glc dlc
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    global_load_u16 v1, v0, s[0:1] glc dlc
 ; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    global_load_d16_hi_b16 v0, v1, s[0:1] offset:2 glc dlc
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
 ; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    v_fmac_f16_e32 v0.h, 2.0, v0.l
-; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    global_store_d16_hi_b16 v1, v0, s[0:1]
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    v_fmac_f16_e32 v2.l, 2.0, v1.l
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    global_store_b16 v0, v2, s[0:1]
 ; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    s_endpgm
 ;
 ; GFX11-DENORM-CONTRACT-FAKE16-LABEL: fmuladd_a_2.0_b_f16:
@@ -979,13 +979,13 @@ define amdgpu_kernel void @fadd_a_a_b_f16(ptr addrspace(1) %out,
 ; GFX11-FLUSH-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-FLUSH-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 1, v0
 ; GFX11-FLUSH-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-FLUSH-TRUE16-NEXT:    global_load_d16_b16 v0, v1, s[0:1] glc dlc
+; GFX11-FLUSH-TRUE16-NEXT:    global_load_u16 v0, v1, s[0:1] glc dlc
 ; GFX11-FLUSH-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-FLUSH-TRUE16-NEXT:    global_load_d16_hi_b16 v0, v1, s[0:1] offset:2 glc dlc
+; GFX11-FLUSH-TRUE16-NEXT:    global_load_u16 v2, v1, s[0:1] offset:2 glc dlc
 ; GFX11-FLUSH-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-FLUSH-TRUE16-NEXT:    v_add_f16_e32 v0.l, v0.l, v0.l
 ; GFX11-FLUSH-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FLUSH-TRUE16-NEXT:    v_add_f16_e32 v0.l, v0.l, v0.h
+; GFX11-FLUSH-TRUE16-NEXT:    v_add_f16_e32 v0.l, v0.l, v2.l
 ; GFX11-FLUSH-TRUE16-NEXT:    global_store_b16 v1, v0, s[0:1]
 ; GFX11-FLUSH-TRUE16-NEXT:    s_endpgm
 ;
@@ -1013,13 +1013,13 @@ define amdgpu_kernel void @fadd_a_a_b_f16(ptr addrspace(1) %out,
 ; GFX11-DENORM-STRICT-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-DENORM-STRICT-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 1, v0
 ; GFX11-DENORM-STRICT-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-DENORM-STRICT-TRUE16-NEXT:    global_load_d16_b16 v0, v1, s[0:1] glc dlc
+; GFX11-DENORM-STRICT-TRUE16-NEXT:    global_load_u16 v0, v1, s[0:1] glc dlc
 ; GFX11-DENORM-STRICT-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-DENORM-STRICT-TRUE16-NEXT:    global_load_d16_hi_b16 v0, v1, s[0:1] offset:2 glc dlc
+; GFX11-DENORM-STRICT-TRUE16-NEXT:    global_load_u16 v2, v1, s[0:1] offset:2 glc dlc
 ; GFX11-DENORM-STRICT-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-DENORM-STRICT-TRUE16-NEXT:    v_add_f16_e32 v0.l, v0.l, v0.l
 ; GFX11-DENORM-STRICT-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-DENORM-STRICT-TRUE16-NEXT:    v_add_f16_e32 v0.l, v0.l, v0.h
+; GFX11-DENORM-STRICT-TRUE16-NEXT:    v_add_f16_e32 v0.l, v0.l, v2.l
 ; GFX11-DENORM-STRICT-TRUE16-NEXT:    global_store_b16 v1, v0, s[0:1]
 ; GFX11-DENORM-STRICT-TRUE16-NEXT:    s_endpgm
 ;
@@ -1045,14 +1045,14 @@ define amdgpu_kernel void @fadd_a_a_b_f16(ptr addrspace(1) %out,
 ; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
 ; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 1, v0
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    global_load_d16_b16 v0, v1, s[0:1] glc dlc
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    global_load_u16 v1, v0, s[0:1] glc dlc
 ; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    global_load_d16_hi_b16 v0, v1, s[0:1] offset:2 glc dlc
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
 ; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    v_fmac_f16_e32 v0.h, 2.0, v0.l
-; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    global_store_d16_hi_b16 v1, v0, s[0:1]
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    v_fmac_f16_e32 v2.l, 2.0, v1.l
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    global_store_b16 v0, v2, s[0:1]
 ; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    s_endpgm
 ;
 ; GFX11-DENORM-CONTRACT-FAKE16-LABEL: fadd_a_a_b_f16:
@@ -1170,13 +1170,13 @@ define amdgpu_kernel void @fadd_b_a_a_f16(ptr addrspace(1) %out,
 ; GFX11-FLUSH-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-FLUSH-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 1, v0
 ; GFX11-FLUSH-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-FLUSH-TRUE16-NEXT:    global_load_d16_b16 v0, v1, s[0:1] glc dlc
+; GFX11-FLUSH-TRUE16-NEXT:    global_load_u16 v0, v1, s[0:1] glc dlc
 ; GFX11-FLUSH-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-FLUSH-TRUE16-NEXT:    global_load_d16_hi_b16 v0, v1, s[0:1] offset:2 glc dlc
+; GFX11-FLUSH-TRUE16-NEXT:    global_load_u16 v2, v1, s[0:1] offset:2 glc dlc
 ; GFX11-FLUSH-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-FLUSH-TRUE16-NEXT:    v_add_f16_e32 v0.l, v0.l, v0.l
 ; GFX11-FLUSH-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FLUSH-TRUE16-NEXT:    v_add_f16_e32 v0.l, v0.h, v0.l
+; GFX11-FLUSH-TRUE16-NEXT:    v_add_f16_e32 v0.l, v2.l, v0.l
 ; GFX11-FLUSH-TRUE16-NEXT:    global_store_b16 v1, v0, s[0:1]
 ; GFX11-FLUSH-TRUE16-NEXT:    s_endpgm
 ;
@@ -1204,13 +1204,13 @@ define amdgpu_kernel void @fadd_b_a_a_f16(ptr addrspace(1) %out,
 ; GFX11-DENORM-STRICT-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-DENORM-STRICT-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 1, v0
 ; GFX11-DENORM-STRICT-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-DENORM-STRICT-TRUE16-NEXT:    global_load_d16_b16 v0, v1, s[0:1] glc dlc
+; GFX11-DENORM-STRICT-TRUE16-NEXT:    global_load_u16 v0, v1, s[0:1] glc dlc
 ; GFX11-DENORM-STRICT-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-DENORM-STRICT-TRUE16-NEXT:    global_load_d16_hi_b16 v0, v1, s[0:1] offset:2 glc dlc
+; GFX11-DENORM-STRICT-TRUE16-NEXT:    global_load_u16 v2, v1, s[0:1] offset:2 glc dlc
 ; GFX11-DENORM-STRICT-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-DENORM-STRICT-TRUE16-NEXT:    v_add_f16_e32 v0.l, v0.l, v0.l
 ; GFX11-DENORM-STRICT-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-DENORM-STRICT-TRUE16-NEXT:    v_add_f16_e32 v0.l, v0.h, v0.l
+; GFX11-DENORM-STRICT-TRUE16-NEXT:    v_add_f16_e32 v0.l, v2.l, v0.l
 ; GFX11-DENORM-STRICT-TRUE16-NEXT:    global_store_b16 v1, v0, s[0:1]
 ; GFX11-DENORM-STRICT-TRUE16-NEXT:    s_endpgm
 ;
@@ -1236,14 +1236,14 @@ define amdgpu_kernel void @fadd_b_a_a_f16(ptr addrspace(1) %out,
 ; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
 ; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 1, v0
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    global_load_d16_b16 v0, v1, s[0:1] glc dlc
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    global_load_u16 v1, v0, s[0:1] glc dlc
 ; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    global_load_d16_hi_b16 v0, v1, s[0:1] offset:2 glc dlc
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
 ; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    v_fmac_f16_e32 v0.h, 2.0, v0.l
-; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    global_store_d16_hi_b16 v1, v0, s[0:1]
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    v_fmac_f16_e32 v2.l, 2.0, v1.l
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    global_store_b16 v0, v2, s[0:1]
 ; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    s_endpgm
 ;
 ; GFX11-DENORM-CONTRACT-FAKE16-LABEL: fadd_b_a_a_f16:
@@ -1347,13 +1347,13 @@ define amdgpu_kernel void @fmuladd_neg_2.0_a_b_f16(ptr addrspace(1) %out, ptr ad
 ; GFX11-FLUSH-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-FLUSH-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 1, v0
 ; GFX11-FLUSH-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-FLUSH-TRUE16-NEXT:    global_load_d16_b16 v0, v1, s[0:1] glc dlc
+; GFX11-FLUSH-TRUE16-NEXT:    global_load_u16 v0, v1, s[0:1] glc dlc
 ; GFX11-FLUSH-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-FLUSH-TRUE16-NEXT:    global_load_d16_hi_b16 v0, v1, s[0:1] offset:2 glc dlc
+; GFX11-FLUSH-TRUE16-NEXT:    global_load_u16 v2, v1, s[0:1] offset:2 glc dlc
 ; GFX11-FLUSH-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-FLUSH-TRUE16-NEXT:    v_add_f16_e32 v0.l, v0.l, v0.l
 ; GFX11-FLUSH-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FLUSH-TRUE16-NEXT:    v_sub_f16_e32 v0.l, v0.h, v0.l
+; GFX11-FLUSH-TRUE16-NEXT:    v_sub_f16_e32 v0.l, v2.l, v0.l
 ; GFX11-FLUSH-TRUE16-NEXT:    global_store_b16 v1, v0, s[0:1]
 ; GFX11-FLUSH-TRUE16-NEXT:    s_endpgm
 ;
@@ -1379,14 +1379,14 @@ define amdgpu_kernel void @fmuladd_neg_2.0_a_b_f16(ptr addrspace(1) %out, ptr ad
 ; GFX11-DENORM-STRICT-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX11-DENORM-STRICT-TRUE16-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
 ; GFX11-DENORM-STRICT-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-DENORM-STRICT-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 1, v0
+; GFX11-DENORM-STRICT-TRUE16-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX11-DENORM-STRICT-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-DENORM-STRICT-TRUE16-NEXT:    global_load_d16_b16 v0, v1, s[0:1] glc dlc
+; GFX11-DENORM-STRICT-TRUE16-NEXT:    global_load_u16 v1, v0, s[0:1] glc dlc
 ; GFX11-DENORM-STRICT-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-DENORM-STRICT-TRUE16-NEXT:    global_load_d16_hi_b16 v0, v1, s[0:1] offset:2 glc dlc
+; GFX11-DENORM-STRICT-TRUE16-NEXT:    global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
 ; GFX11-DENORM-STRICT-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-DENORM-STRICT-TRUE16-NEXT:    v_fmac_f16_e32 v0.h, -2.0, v0.l
-; GFX11-DENORM-STRICT-TRUE16-NEXT:    global_store_d16_hi_b16 v1, v0, s[0:1]
+; GFX11-DENORM-STRICT-TRUE16-NEXT:    v_fmac_f16_e32 v2.l, -2.0, v1.l
+; GFX11-DENORM-STRICT-TRUE16-NEXT:    global_store_b16 v0, v2, s[0:1]
 ; GFX11-DENORM-STRICT-TRUE16-NEXT:    s_endpgm
 ;
 ; GFX11-DENORM-STRICT-FAKE16-LABEL: fmuladd_neg_2.0_a_b_f16:
@@ -1409,14 +1409,14 @@ define amdgpu_kernel void @fmuladd_neg_2.0_a_b_f16(ptr addrspace(1) %out, ptr ad
 ; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
 ; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 1, v0
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    global_load_d16_b16 v0, v1, s[0:1] glc dlc
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    global_load_u16 v1, v0, s[0:1] glc dlc
 ; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    global_load_d16_hi_b16 v0, v1, s[0:1] offset:2 glc dlc
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
 ; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    v_fmac_f16_e32 v0.h, -2.0, v0.l
-; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    global_store_d16_hi_b16 v1, v0, s[0:1]
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    v_fmac_f16_e32 v2.l, -2.0, v1.l
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    global_store_b16 v0, v2, s[0:1]
 ; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    s_endpgm
 ;
 ; GFX11-DENORM-CONTRACT-FAKE16-LABEL: fmuladd_neg_2.0_a_b_f16:
@@ -1517,13 +1517,13 @@ define amdgpu_kernel void @fmuladd_neg_2.0_neg_a_b_f16(ptr addrspace(1) %out, pt
 ; GFX11-FLUSH-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-FLUSH-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 1, v0
 ; GFX11-FLUSH-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-FLUSH-TRUE16-NEXT:    global_load_d16_b16 v0, v1, s[0:1] glc dlc
+; GFX11-FLUSH-TRUE16-NEXT:    global_load_u16 v0, v1, s[0:1] glc dlc
 ; GFX11-FLUSH-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-FLUSH-TRUE16-NEXT:    global_load_d16_hi_b16 v0, v1, s[0:1] offset:2 glc dlc
+; GFX11-FLUSH-TRUE16-NEXT:    global_load_u16 v2, v1, s[0:1] offset:2 glc dlc
 ; GFX11-FLUSH-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-FLUSH-TRUE16-NEXT:    v_add_f16_e32 v0.l, v0.l, v0.l
 ; GFX11-FLUSH-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FLUSH-TRUE16-NEXT:    v_add_f16_e32 v0.l, v0.h, v0.l
+; GFX11-FLUSH-TRUE16-NEXT:    v_add_f16_e32 v0.l, v2.l, v0.l
 ; GFX11-FLUSH-TRUE16-NEXT:    global_store_b16 v1, v0, s[0:1]
 ; GFX11-FLUSH-TRUE16-NEXT:    s_endpgm
 ;
@@ -1549,14 +1549,14 @@ define amdgpu_kernel void @fmuladd_neg_2.0_neg_a_b_f16(ptr addrspace(1) %out, pt
 ; GFX11-DENORM-STRICT-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX11-DENORM-STRICT-TRUE16-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
 ; GFX11-DENORM-STRICT-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-DENORM-STRICT-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 1, v0
+; GFX11-DENORM-STRICT-TRUE16-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX11-DENORM-STRICT-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-DENORM-STRICT-TRUE16-NEXT:    global_load_d16_b16 v0, v1, s[0:1] glc dlc
+; GFX11-DENORM-STRICT-TRUE16-NEXT:    global_load_u16 v1, v0, s[0:1] glc dlc
 ; GFX11-DENORM-STRICT-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-DENORM-STRICT-TRUE16-NEXT:    global_load_d16_hi_b16 v0, v1, s[0:1] offset:2 glc dlc
+; GFX11-DENORM-STRICT-TRUE16-NEXT:    global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
 ; GFX11-DENORM-STRICT-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-DENORM-STRICT-TRUE16-NEXT:    v_fmac_f16_e32 v0.h, 2.0, v0.l
-; GFX11-DENORM-STRICT-TRUE16-NEXT:    global_store_d16_hi_b16 v1, v0, s[0:1]
+; GFX11-DENORM-STRICT-TRUE16-NEXT:    v_fmac_f16_e32 v2.l, 2.0, v1.l
+; GFX11-DENORM-STRICT-TRUE16-NEXT:    global_store_b16 v0, v2, s[0:1]
 ; GFX11-DENORM-STRICT-TRUE16-NEXT:    s_endpgm
 ;
 ; GFX11-DENORM-STRICT-FAKE16-LABEL: fmuladd_neg_2.0_neg_a_b_f16:
@@ -1579,14 +1579,14 @@ define amdgpu_kernel void @fmuladd_neg_2.0_neg_a_b_f16(ptr addrspace(1) %out, pt
 ; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
 ; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 1, v0
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    global_load_d16_b16 v0, v1, s[0:1] glc dlc
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    global_load_u16 v1, v0, s[0:1] glc dlc
 ; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    global_load_d16_hi_b16 v0, v1, s[0:1] offset:2 glc dlc
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
 ; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    v_fmac_f16_e32 v0.h, 2.0, v0.l
-; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    global_store_d16_hi_b16 v1, v0, s[0:1]
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    v_fmac_f16_e32 v2.l, 2.0, v1.l
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    global_store_b16 v0, v2, s[0:1]
 ; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    s_endpgm
 ;
 ; GFX11-DENORM-CONTRACT-FAKE16-LABEL: fmuladd_neg_2.0_neg_a_b_f16:
@@ -1689,13 +1689,13 @@ define amdgpu_kernel void @fmuladd_2.0_neg_a_b_f16(ptr addrspace(1) %out, ptr ad
 ; GFX11-FLUSH-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-FLUSH-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 1, v0
 ; GFX11-FLUSH-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-FLUSH-TRUE16-NEXT:    global_load_d16_b16 v0, v1, s[0:1] glc dlc
+; GFX11-FLUSH-TRUE16-NEXT:    global_load_u16 v0, v1, s[0:1] glc dlc
 ; GFX11-FLUSH-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-FLUSH-TRUE16-NEXT:    global_load_d16_hi_b16 v0, v1, s[0:1] offset:2 glc dlc
+; GFX11-FLUSH-TRUE16-NEXT:    global_load_u16 v2, v1, s[0:1] offset:2 glc dlc
 ; GFX11-FLUSH-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-FLUSH-TRUE16-NEXT:    v_add_f16_e32 v0.l, v0.l, v0.l
 ; GFX11-FLUSH-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FLUSH-TRUE16-NEXT:    v_sub_f16_e32 v0.l, v0.h, v0.l
+; GFX11-FLUSH-TRUE16-NEXT:    v_sub_f16_e32 v0.l, v2.l, v0.l
 ; GFX11-FLUSH-TRUE16-NEXT:    global_store_b16 v1, v0, s[0:1]
 ; GFX11-FLUSH-TRUE16-NEXT:    s_endpgm
 ;
@@ -1721,14 +1721,14 @@ define amdgpu_kernel void @fmuladd_2.0_neg_a_b_f16(ptr addrspace(1) %out, ptr ad
 ; GFX11-DENORM-STRICT-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX11-DENORM-STRICT-TRUE16-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
 ; GFX11-DENORM-STRICT-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-DENORM-STRICT-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 1, v0
+; GFX11-DENORM-STRICT-TRUE16-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX11-DENORM-STRICT-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-DENORM-STRICT-TRUE16-NEXT:    global_load_d16_b16 v0, v1, s[0:1] glc dlc
+; GFX11-DENORM-STRICT-TRUE16-NEXT:    global_load_u16 v1, v0, s[0:1] glc dlc
 ; GFX11-DENORM-STRICT-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-DENORM-STRICT-TRUE16-NEXT:    global_load_d16_hi_b16 v0, v1, s[0:1] offset:2 glc dlc
+; GFX11-DENORM-STRICT-TRUE16-NEXT:    global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
 ; GFX11-DENORM-STRICT-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-DENORM-STRICT-TRUE16-NEXT:    v_fmac_f16_e32 v0.h, -2.0, v0.l
-; GFX11-DENORM-STRICT-TRUE16-NEXT:    global_store_d16_hi_b16 v1, v0, s[0:1]
+; GFX11-DENORM-STRICT-TRUE16-NEXT:    v_fmac_f16_e32 v2.l, -2.0, v1.l
+; GFX11-DENORM-STRICT-TRUE16-NEXT:    global_store_b16 v0, v2, s[0:1]
 ; GFX11-DENORM-STRICT-TRUE16-NEXT:    s_endpgm
 ;
 ; GFX11-DENORM-STRICT-FAKE16-LABEL: fmuladd_2.0_neg_a_b_f16:
@@ -1751,14 +1751,14 @@ define amdgpu_kernel void @fmuladd_2.0_neg_a_b_f16(ptr addrspace(1) %out, ptr ad
 ; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
 ; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 1, v0
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    global_load_d16_b16 v0, v1, s[0:1] glc dlc
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    global_load_u16 v1, v0, s[0:1] glc dlc
 ; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    global_load_d16_hi_b16 v0, v1, s[0:1] offset:2 glc dlc
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
 ; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    v_fmac_f16_e32 v0.h, -2.0, v0.l
-; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    global_store_d16_hi_b16 v1, v0, s[0:1]
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    v_fmac_f16_e32 v2.l, -2.0, v1.l
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    global_store_b16 v0, v2, s[0:1]
 ; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    s_endpgm
 ;
 ; GFX11-DENORM-CONTRACT-FAKE16-LABEL: fmuladd_2.0_neg_a_b_f16:
@@ -1861,13 +1861,13 @@ define amdgpu_kernel void @fmuladd_2.0_a_neg_b_f16(ptr addrspace(1) %out, ptr ad
 ; GFX11-FLUSH-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-FLUSH-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 1, v0
 ; GFX11-FLUSH-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-FLUSH-TRUE16-NEXT:    global_load_d16_b16 v0, v1, s[0:1] glc dlc
+; GFX11-FLUSH-TRUE16-NEXT:    global_load_u16 v0, v1, s[0:1] glc dlc
 ; GFX11-FLUSH-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-FLUSH-TRUE16-NEXT:    global_load_d16_hi_b16 v0, v1, s[0:1] offset:2 glc dlc
+; GFX11-FLUSH-TRUE16-NEXT:    global_load_u16 v2, v1, s[0:1] offset:2 glc dlc
 ; GFX11-FLUSH-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-FLUSH-TRUE16-NEXT:    v_add_f16_e32 v0.l, v0.l, v0.l
 ; GFX11-FLUSH-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FLUSH-TRUE16-NEXT:    v_sub_f16_e32 v0.l, v0.l, v0.h
+; GFX11-FLUSH-TRUE16-NEXT:    v_sub_f16_e32 v0.l, v0.l, v2.l
 ; GFX11-FLUSH-TRUE16-NEXT:    global_store_b16 v1, v0, s[0:1]
 ; GFX11-FLUSH-TRUE16-NEXT:    s_endpgm
 ;
@@ -1895,11 +1895,11 @@ define amdgpu_kernel void @fmuladd_2.0_a_neg_b_f16(ptr addrspace(1) %out, ptr ad
 ; GFX11-DENORM-STRICT-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-DENORM-STRICT-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 1, v0
 ; GFX11-DENORM-STRICT-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-DENORM-STRICT-TRUE16-NEXT:    global_load_d16_b16 v0, v1, s[0:1] glc dlc
+; GFX11-DENORM-STRICT-TRUE16-NEXT:    global_load_u16 v0, v1, s[0:1] glc dlc
 ; GFX11-DENORM-STRICT-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-DENORM-STRICT-TRUE16-NEXT:    global_load_d16_hi_b16 v0, v1, s[0:1] offset:2 glc dlc
+; GFX11-DENORM-STRICT-TRUE16-NEXT:    global_load_u16 v2, v1, s[0:1] offset:2 glc dlc
 ; GFX11-DENORM-STRICT-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-DENORM-STRICT-TRUE16-NEXT:    v_fma_f16 v0.l, v0.l, 2.0, -v0.h
+; GFX11-DENORM-STRICT-TRUE16-NEXT:    v_fma_f16 v0.l, v0.l, 2.0, -v2.l
 ; GFX11-DENORM-STRICT-TRUE16-NEXT:    global_store_b16 v1, v0, s[0:1]
 ; GFX11-DENORM-STRICT-TRUE16-NEXT:    s_endpgm
 ;
@@ -1925,11 +1925,11 @@ define amdgpu_kernel void @fmuladd_2.0_a_neg_b_f16(ptr addrspace(1) %out, ptr ad
 ; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 1, v0
 ; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    global_load_d16_b16 v0, v1, s[0:1] glc dlc
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    global_load_u16 v0, v1, s[0:1] glc dlc
 ; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    global_load_d16_hi_b16 v0, v1, s[0:1] offset:2 glc dlc
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    global_load_u16 v2, v1, s[0:1] offset:2 glc dlc
 ; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    v_fma_f16 v0.l, v0.l, 2.0, -v0.h
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    v_fma_f16 v0.l, v0.l, 2.0, -v2.l
 ; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    global_store_b16 v1, v0, s[0:1]
 ; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    s_endpgm
 ;
@@ -2065,18 +2065,18 @@ define amdgpu_kernel void @mad_sub_f16(ptr addrspace(1) noalias nocapture %out,
 ; GFX11-FLUSH-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX11-FLUSH-TRUE16-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
 ; GFX11-FLUSH-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FLUSH-TRUE16-NEXT:    v_lshlrev_b32_e32 v2, 1, v0
+; GFX11-FLUSH-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 1, v0
 ; GFX11-FLUSH-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-FLUSH-TRUE16-NEXT:    global_load_d16_b16 v0, v2, s[2:3] glc dlc
+; GFX11-FLUSH-TRUE16-NEXT:    global_load_u16 v0, v1, s[2:3] glc dlc
 ; GFX11-FLUSH-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-FLUSH-TRUE16-NEXT:    global_load_d16_hi_b16 v0, v2, s[2:3] offset:2 glc dlc
+; GFX11-FLUSH-TRUE16-NEXT:    global_load_u16 v2, v1, s[2:3] offset:2 glc dlc
 ; GFX11-FLUSH-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-FLUSH-TRUE16-NEXT:    global_load_d16_b16 v1, v2, s[2:3] offset:4 glc dlc
+; GFX11-FLUSH-TRUE16-NEXT:    global_load_u16 v3, v1, s[2:3] offset:4 glc dlc
 ; GFX11-FLUSH-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-FLUSH-TRUE16-NEXT:    v_mul_f16_e32 v0.l, v0.l, v0.h
+; GFX11-FLUSH-TRUE16-NEXT:    v_mul_f16_e32 v0.l, v0.l, v2.l
 ; GFX11-FLUSH-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FLUSH-TRUE16-NEXT:    v_sub_f16_e32 v0.l, v0.l, v1.l
-; GFX11-FLUSH-TRUE16-NEXT:    global_store_b16 v2, v0, s[0:1]
+; GFX11-FLUSH-TRUE16-NEXT:    v_sub_f16_e32 v0.l, v0.l, v3.l
+; GFX11-FLUSH-TRUE16-NEXT:    global_store_b16 v1, v0, s[0:1]
 ; GFX11-FLUSH-TRUE16-NEXT:    s_endpgm
 ;
 ; GFX11-FLUSH-FAKE16-LABEL: mad_sub_f16:
@@ -2103,18 +2103,18 @@ define amdgpu_kernel void @mad_sub_f16(ptr addrspace(1) noalias nocapture %out,
 ; GFX11-DENORM-STRICT-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX11-DENORM-STRICT-TRUE16-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
 ; GFX11-DENORM-STRICT-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-DENORM-STRICT-TRUE16-NEXT:    v_lshlrev_b32_e32 v2, 1, v0
+; GFX11-DENORM-STRICT-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 1, v0
 ; GFX11-DENORM-STRICT-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-DENORM-STRICT-TRUE16-NEXT:    global_load_d16_b16 v0, v2, s[2:3] glc dlc
+; GFX11-DENORM-STRICT-TRUE16-NEXT:    global_load_u16 v0, v1, s[2:3] glc dlc
 ; GFX11-DENORM-STRICT-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-DENORM-STRICT-TRUE16-NEXT:    global_load_d16_hi_b16 v0, v2, s[2:3] offset:2 glc dlc
+; GFX11-DENORM-STRICT-TRUE16-NEXT:    global_load_u16 v2, v1, s[2:3] offset:2 glc dlc
 ; GFX11-DENORM-STRICT-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-DENORM-STRICT-TRUE16-NEXT:    global_load_d16_b16 v1, v2, s[2:3] offset:4 glc dlc
+; GFX11-DENORM-STRICT-TRUE16-NEXT:    global_load_u16 v3, v1, s[2:3] offset:4 glc dlc
 ; GFX11-DENORM-STRICT-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-DENORM-STRICT-TRUE16-NEXT:    v_mul_f16_e32 v0.l, v0.l, v0.h
+; GFX11-DENORM-STRICT-TRUE16-NEXT:    v_mul_f16_e32 v0.l, v0.l, v2.l
 ; GFX11-DENORM-STRICT-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-DENORM-STRICT-TRUE16-NEXT:    v_sub_f16_e32 v0.l, v0.l, v1.l
-; GFX11-DENORM-STRICT-TRUE16-NEXT:    global_store_b16 v2, v0, s[0:1]
+; GFX11-DENORM-STRICT-TRUE16-NEXT:    v_sub_f16_e32 v0.l, v0.l, v3.l
+; GFX11-DENORM-STRICT-TRUE16-NEXT:    global_store_b16 v1, v0, s[0:1]
 ; GFX11-DENORM-STRICT-TRUE16-NEXT:    s_endpgm
 ;
 ; GFX11-DENORM-STRICT-FAKE16-LABEL: mad_sub_f16:
@@ -2141,16 +2141,16 @@ define amdgpu_kernel void @mad_sub_f16(ptr addrspace(1) noalias nocapture %out,
 ; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
 ; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    v_lshlrev_b32_e32 v2, 1, v0
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 1, v0
 ; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    global_load_d16_b16 v0, v2, s[2:3] glc dlc
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    global_load_u16 v0, v1, s[2:3] glc dlc
 ; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    global_load_d16_hi_b16 v0, v2, s[2:3] offset:2 glc dlc
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    global_load_u16 v2, v1, s[2:3] offset:2 glc dlc
 ; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    global_load_d16_b16 v1, v2, s[2:3] offset:4 glc dlc
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    global_load_u16 v3, v1, s[2:3] offset:4 glc dlc
 ; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    v_fma_f16 v0.l, v0.l, v0.h, -v1.l
-; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    global_store_b16 v2, v0, s[0:1]
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    v_fma_f16 v0.l, v0.l, v2.l, -v3.l
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    global_store_b16 v1, v0, s[0:1]
 ; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    s_endpgm
 ;
 ; GFX11-DENORM-CONTRACT-FAKE16-LABEL: mad_sub_f16:
@@ -2289,18 +2289,18 @@ define amdgpu_kernel void @mad_sub_inv_f16(ptr addrspace(1) noalias nocapture %o
 ; GFX11-FLUSH-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX11-FLUSH-TRUE16-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
 ; GFX11-FLUSH-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FLUSH-TRUE16-NEXT:    v_lshlrev_b32_e32 v2, 1, v0
+; GFX11-FLUSH-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 1, v0
 ; GFX11-FLUSH-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-FLUSH-TRUE16-NEXT:    global_load_d16_b16 v0, v2, s[2:3] glc dlc
+; GFX11-FLUSH-TRUE16-NEXT:    global_load_u16 v0, v1, s[2:3] glc dlc
 ; GFX11-FLUSH-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-FLUSH-TRUE16-NEXT:    global_load_d16_hi_b16 v0, v2, s[2:3] offset:2 glc dlc
+; GFX11-FLUSH-TRUE16-NEXT:    global_load_u16 v2, v1, s[2:3] offset:2 glc dlc
 ; GFX11-FLUSH-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-FLUSH-TRUE16-NEXT:    global_load_d16_b16 v1, v2, s[2:3] offset:4 glc dlc
+; GFX11-FLUSH-TRUE16-NEXT:    global_load_u16 v3, v1, s[2:3] offset:4 glc dlc
 ; GFX11-FLUSH-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-FLUSH-TRUE16-NEXT:    v_mul_f16_e32 v0.l, v0.l, v0.h
+; GFX11-FLUSH-TRUE16-NEXT:    v_mul_f16_e32 v0.l, v0.l, v2.l
 ; GFX11-FLUSH-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FLUSH-TRUE16-NEXT:    v_sub_f16_e32 v0.l, v1.l, v0.l
-; GFX11-FLUSH-TRUE16-NEXT:    global_store_b16 v2, v0, s[0:1]
+; GFX11-FLUSH-TRUE16-NEXT:    v_sub_f16_e32 v0.l, v3.l, v0.l
+; GFX11-FLUSH-TRUE16-NEXT:    global_store_b16 v1, v0, s[0:1]
 ; GFX11-FLUSH-TRUE16-NEXT:    s_endpgm
 ;
 ; GFX11-FLUSH-FAKE16-LABEL: mad_sub_inv_f16:
@@ -2327,18 +2327,18 @@ define amdgpu_kernel void @mad_sub_inv_f16(ptr addrspace(1) noalias nocapture %o
 ; GFX11-DENORM-STRICT-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX11-DENORM-STRICT-TRUE16-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
 ; GFX11-DENORM-STRICT-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-DENORM-STRICT-TRUE16-NEXT:    v_lshlrev_b32_e32 v2, 1, v0
+; GFX11-DENORM-STRICT-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 1, v0
 ; GFX11-DENORM-STRICT-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-DENORM-STRICT-TRUE16-NEXT:    global_load_d16_b16 v0, v2, s[2:3] glc dlc
+; GFX11-DENORM-STRICT-TRUE16-NEXT:    global_load_u16 v0, v1, s[2:3] glc dlc
 ; GFX11-DENORM-STRICT-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-DENORM-STRICT-TRUE16-NEXT:    global_load_d16_hi_b16 v0, v2, s[2:3] offset:2 glc dlc
+; GFX11-DENORM-STRICT-TRUE16-NEXT:    global_load_u16 v2, v1, s[2:3] offset:2 glc dlc
 ; GFX11-DENORM-STRICT-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-DENORM-STRICT-TRUE16-NEXT:    global_load_d16_b16 v1, v2, s[2:3] offset:4 glc dlc
+; GFX11-DENORM-STRICT-TRUE16-NEXT:    global_load_u16 v3, v1, s[2:3] offset:4 glc dlc
 ; GFX11-DENORM-STRICT-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-DENORM-STRICT-TRUE16-NEXT:    v_mul_f16_e32 v0.l, v0.l, v0.h
+; GFX11-DENORM-STRICT-TRUE16-NEXT:    v_mul_f16_e32 v0.l, v0.l, v2.l
 ; GFX11-DENORM-STRICT-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-DENORM-STRICT-TRUE16-NEXT:    v_sub_f16_e32 v0.l, v1.l, v0.l
-; GFX11-DENORM-STRICT-TRUE16-NEXT:    global_store_b16 v2, v0, s[0:1]
+; GFX11-DENORM-STRICT-TRUE16-NEXT:    v_sub_f16_e32 v0.l, v3.l, v0.l
+; GFX11-DENORM-STRICT-TRUE16-NEXT:    global_store_b16 v1, v0, s[0:1]
 ; GFX11-DENORM-STRICT-TRUE16-NEXT:    s_endpgm
 ;
 ; GFX11-DENORM-STRICT-FAKE16-LABEL: mad_sub_inv_f16:
@@ -2365,16 +2365,16 @@ define amdgpu_kernel void @mad_sub_inv_f16(ptr addrspace(1) noalias nocapture %o
 ; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
 ; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    v_lshlrev_b32_e32 v2, 1, v0
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 1, v0
 ; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    global_load_d16_b16 v0, v2, s[2:3] glc dlc
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    global_load_u16 v0, v1, s[2:3] glc dlc
 ; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    global_load_d16_hi_b16 v0, v2, s[2:3] offset:2 glc dlc
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    global_load_u16 v2, v1, s[2:3] offset:2 glc dlc
 ; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    global_load_d16_b16 v1, v2, s[2:3] offset:4 glc dlc
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    global_load_u16 v3, v1, s[2:3] offset:4 glc dlc
 ; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    v_fma_f16 v0.l, -v0.l, v0.h, v1.l
-; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    global_store_b16 v2, v0, s[0:1]
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    v_fma_f16 v0.l, -v0.l, v2.l, v3.l
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    global_store_b16 v1, v0, s[0:1]
 ; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    s_endpgm
 ;
 ; GFX11-DENORM-CONTRACT-FAKE16-LABEL: mad_sub_inv_f16:
@@ -2513,18 +2513,18 @@ define amdgpu_kernel void @mad_sub_fabs_f16(ptr addrspace(1) noalias nocapture %
 ; GFX11-FLUSH-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX11-FLUSH-TRUE16-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
 ; GFX11-FLUSH-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FLUSH-TRUE16-NEXT:    v_lshlrev_b32_e32 v2, 1, v0
+; GFX11-FLUSH-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 1, v0
 ; GFX11-FLUSH-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-FLUSH-TRUE16-NEXT:    global_load_d16_b16 v0, v2, s[2:3] glc dlc
+; GFX11-FLUSH-TRUE16-NEXT:    global_load_u16 v0, v1, s[2:3] glc dlc
 ; GFX11-FLUSH-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-FLUSH-TRUE16-NEXT:    global_load_d16_hi_b16 v0, v2, s[2:3] offset:2 glc dlc
+; GFX11-FLUSH-TRUE16-NEXT:    global_load_u16 v2, v1, s[2:3] offset:2 glc dlc
 ; GFX11-FLUSH-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-FLUSH-TRUE16-NEXT:    global_load_d16_b16 v1, v2, s[2:3] offset:4 glc dlc
+; GFX11-FLUSH-TRUE16-NEXT:    global_load_u16 v3, v1, s[2:3] offset:4 glc dlc
 ; GFX11-FLUSH-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-FLUSH-TRUE16-NEXT:    v_mul_f16_e32 v0.l, v0.l, v0.h
+; GFX11-FLUSH-TRUE16-NEXT:    v_mul_f16_e32 v0.l, v0.l, v2.l
 ; GFX11-FLUSH-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FLUSH-TRUE16-NEXT:    v_sub_f16_e64 v0.l, v0.l, |v1.l|
-; GFX11-FLUSH-TRUE16-NEXT:    global_store_b16 v2, v0, s[0:1]
+; GFX11-FLUSH-TRUE16-NEXT:    v_sub_f16_e64 v0.l, v0.l, |v3.l|
+; GFX11-FLUSH-TRUE16-NEXT:    global_store_b16 v1, v0, s[0:1]
 ; GFX11-FLUSH-TRUE16-NEXT:    s_endpgm
 ;
 ; GFX11-FLUSH-FAKE16-LABEL: mad_sub_fabs_f16:
@@ -2551,18 +2551,18 @@ define amdgpu_kernel void @mad_sub_fabs_f16(ptr addrspace(1) noalias nocapture %
 ; GFX11-DENORM-STRICT-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX11-DENORM-STRICT-TRUE16-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
 ; GFX11-DENORM-STRICT-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-DENORM-STRICT-TRUE16-NEXT:    v_lshlrev_b32_e32 v2, 1, v0
+; GFX11-DENORM-STRICT-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 1, v0
 ; GFX11-DENORM-STRICT-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-DENORM-STRICT-TRUE16-NEXT:    global_load_d16_b16 v0, v2, s[2:3] glc dlc
+; GFX11-DENORM-STRICT-TRUE16-NEXT:    global_load_u16 v0, v1, s[2:3] glc dlc
 ; GFX11-DENORM-STRICT-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-DENORM-STRICT-TRUE16-NEXT:    global_load_d16_hi_b16 v0, v2, s[2:3] offset:2 glc dlc
+; GFX11-DENORM-STRICT-TRUE16-NEXT:    global_load_u16 v2, v1, s[2:3] offset:2 glc dlc
 ; GFX11-DENORM-STRICT-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-DENORM-STRICT-TRUE16-NEXT:    global_load_d16_b16 v1, v2, s[2:3] offset:4 glc dlc
+; GFX11-DENORM-STRICT-TRUE16-NEXT:    global_load_u16 v3, v1, s[2:3] offset:4 glc dlc
 ; GFX11-DENORM-STRICT-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-DENORM-STRICT-TRUE16-NEXT:    v_mul_f16_e32 v0.l, v0.l, v0.h
+; GFX11-DENORM-STRICT-TRUE16-NEXT:    v_mul_f16_e32 v0.l, v0.l, v2.l
 ; GFX11-DENORM-STRICT-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-DENORM-STRICT-TRUE16-NEXT:    v_sub_f16_e64 v0.l, v0.l, |v1.l|
-; GFX11-DENORM-STRICT-TRUE16-NEXT:    global_store_b16 v2, v0, s[0:1]
+; GFX11-DENORM-STRICT-TRUE16-NEXT:    v_sub_f16_e64 v0.l, v0.l, |v3.l|
+; GFX11-DENORM-STRICT-TRUE16-NEXT:    global_store_b16 v1, v0, s[0:1]
 ; GFX11-DENORM-STRICT-TRUE16-NEXT:    s_endpgm
 ;
 ; GFX11-DENORM-STRICT-FAKE16-LABEL: mad_sub_fabs_f16:
@@ -2589,16 +2589,16 @@ define amdgpu_kernel void @mad_sub_fabs_f16(ptr addrspace(1) noalias nocapture %
 ; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
 ; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    v_lshlrev_b32_e32 v2, 1, v0
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 1, v0
 ; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    global_load_d16_b16 v0, v2, s[2:3] glc dlc
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    global_load_u16 v0, v1, s[2:3] glc dlc
 ; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    global_load_d16_hi_b16 v0, v2, s[2:3] offset:2 glc dlc
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    global_load_u16 v2, v1, s[2:3] offset:2 glc dlc
 ; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    global_load_d16_b16 v1, v2, s[2:3] offset:4 glc dlc
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    global_load_u16 v3, v1, s[2:3] offset:4 glc dlc
 ; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    v_fma_f16 v0.l, v0.l, v0.h, -|v1.l|
-; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    global_store_b16 v2, v0, s[0:1]
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    v_fma_f16 v0.l, v0.l, v2.l, -|v3.l|
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    global_store_b16 v1, v0, s[0:1]
 ; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    s_endpgm
 ;
 ; GFX11-DENORM-CONTRACT-FAKE16-LABEL: mad_sub_fabs_f16:
@@ -2738,18 +2738,18 @@ define amdgpu_kernel void @mad_sub_fabs_inv_f16(ptr addrspace(1) noalias nocaptu
 ; GFX11-FLUSH-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX11-FLUSH-TRUE16-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
 ; GFX11-FLUSH-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FLUSH-TRUE16-NEXT:    v_lshlrev_b32_e32 v2, 1, v0
+; GFX11-FLUSH-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 1, v0
 ; GFX11-FLUSH-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-FLUSH-TRUE16-NEXT:    global_load_d16_b16 v0, v2, s[2:3] glc dlc
+; GFX11-FLUSH-TRUE16-NEXT:    global_load_u16 v0, v1, s[2:3] glc dlc
 ; GFX11-FLUSH-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-FLUSH-TRUE16-NEXT:    global_load_d16_hi_b16 v0, v2, s[2:3] offset:2 glc dlc
+; GFX11-FLUSH-TRUE16-NEXT:    global_load_u16 v2, v1, s[2:3] offset:2 glc dlc
 ; GFX11-FLUSH-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-FLUSH-TRUE16-NEXT:    global_load_d16_b16 v1, v2, s[2:3] offset:4 glc dlc
+; GFX11-FLUSH-TRUE16-NEXT:    global_load_u16 v3, v1, s[2:3] offset:4 glc dlc
 ; GFX11-FLUSH-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-FLUSH-TRUE16-NEXT:    v_mul_f16_e32 v0.l, v0.l, v0.h
+; GFX11-FLUSH-TRUE16-NEXT:    v_mul_f16_e32 v0.l, v0.l, v2.l
 ; GFX11-FLUSH-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FLUSH-TRUE16-NEXT:    v_sub_f16_e64 v0.l, |v1.l|, v0.l
-; GFX11-FLUSH-TRUE16-NEXT:    global_store_b16 v2, v0, s[0:1]
+; GFX11-FLUSH-TRUE16-NEXT:    v_sub_f16_e64 v0.l, |v3.l|, v0.l
+; GFX11-FLUSH-TRUE16-NEXT:    global_store_b16 v1, v0, s[0:1]
 ; GFX11-FLUSH-TRUE16-NEXT:    s_endpgm
 ;
 ; GFX11-FLUSH-FAKE16-LABEL: mad_sub_fabs_inv_f16:
@@ -2776,18 +2776,18 @@ define amdgpu_kernel void @mad_sub_fabs_inv_f16(ptr addrspace(1) noalias nocaptu
 ; GFX11-DENORM-STRICT-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX11-DENORM-STRICT-TRUE16-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
 ; GFX11-DENORM-STRICT-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-DENORM-STRICT-TRUE16-NEXT:    v_lshlrev_b32_e32 v2, 1, v0
+; GFX11-DENORM-STRICT-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 1, v0
 ; GFX11-DENORM-STRICT-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-DENORM-STRICT-TRUE16-NEXT:    global_load_d16_b16 v0, v2, s[2:3] glc dlc
+; GFX11-DENORM-STRICT-TRUE16-NEXT:    global_load_u16 v0, v1, s[2:3] glc dlc
 ; GFX11-DENORM-STRICT-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-DENORM-STRICT-TRUE16-NEXT:    global_load_d16_hi_b16 v0, v2, s[2:3] offset:2 glc dlc
+; GFX11-DENORM-STRICT-TRUE16-NEXT:    global_load_u16 v2, v1, s[2:3] offset:2 glc dlc
 ; GFX11-DENORM-STRICT-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-DENORM-STRICT-TRUE16-NEXT:    global_load_d16_b16 v1, v2, s[2:3] offset:4 glc dlc
+; GFX11-DENORM-STRICT-TRUE16-NEXT:    global_load_u16 v3, v1, s[2:3] offset:4 glc dlc
 ; GFX11-DENORM-STRICT-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-DENORM-STRICT-TRUE16-NEXT:    v_mul_f16_e32 v0.l, v0.l, v0.h
+; GFX11-DENORM-STRICT-TRUE16-NEXT:    v_mul_f16_e32 v0.l, v0.l, v2.l
 ; GFX11-DENORM-STRICT-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-DENORM-STRICT-TRUE16-NEXT:    v_sub_f16_e64 v0.l, |v1.l|, v0.l
-; GFX11-DENORM-STRICT-TRUE16-NEXT:    global_store_b16 v2, v0, s[0:1]
+; GFX11-DENORM-STRICT-TRUE16-NEXT:    v_sub_f16_e64 v0.l, |v3.l|, v0.l
+; GFX11-DENORM-STRICT-TRUE16-NEXT:    global_store_b16 v1, v0, s[0:1]
 ; GFX11-DENORM-STRICT-TRUE16-NEXT:    s_endpgm
 ;
 ; GFX11-DENORM-STRICT-FAKE16-LABEL: mad_sub_fabs_inv_f16:
@@ -2814,16 +2814,16 @@ define amdgpu_kernel void @mad_sub_fabs_inv_f16(ptr addrspace(1) noalias nocaptu
 ; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
 ; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    v_lshlrev_b32_e32 v2, 1, v0
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 1, v0
 ; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    global_load_d16_b16 v0, v2, s[2:3] glc dlc
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    global_load_u16 v0, v1, s[2:3] glc dlc
 ; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    global_load_d16_hi_b16 v0, v2, s[2:3] offset:2 glc dlc
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    global_load_u16 v2, v1, s[2:3] offset:2 glc dlc
 ; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    global_load_d16_b16 v1, v2, s[2:3] offset:4 glc dlc
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    global_load_u16 v3, v1, s[2:3] offset:4 glc dlc
 ; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    v_fma_f16 v0.l, -v0.l, v0.h, |v1.l|
-; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    global_store_b16 v2, v0, s[0:1]
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    v_fma_f16 v0.l, -v0.l, v2.l, |v3.l|
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    global_store_b16 v1, v0, s[0:1]
 ; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    s_endpgm
 ;
 ; GFX11-DENORM-CONTRACT-FAKE16-LABEL: mad_sub_fabs_inv_f16:
@@ -2963,18 +2963,18 @@ define amdgpu_kernel void @neg_neg_mad_f16(ptr addrspace(1) noalias nocapture %o
 ; GFX11-FLUSH-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX11-FLUSH-TRUE16-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
 ; GFX11-FLUSH-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FLUSH-TRUE16-NEXT:    v_lshlrev_b32_e32 v2, 1, v0
+; GFX11-FLUSH-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 1, v0
 ; GFX11-FLUSH-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-FLUSH-TRUE16-NEXT:    global_load_d16_b16 v0, v2, s[2:3] glc dlc
+; GFX11-FLUSH-TRUE16-NEXT:    global_load_u16 v0, v1, s[2:3] glc dlc
 ; GFX11-FLUSH-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-FLUSH-TRUE16-NEXT:    global_load_d16_hi_b16 v0, v2, s[2:3] offset:2 glc dlc
+; GFX11-FLUSH-TRUE16-NEXT:    global_load_u16 v2, v1, s[2:3] offset:2 glc dlc
 ; GFX11-FLUSH-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-FLUSH-TRUE16-NEXT:    global_load_d16_b16 v1, v2, s[2:3] offset:4 glc dlc
+; GFX11-FLUSH-TRUE16-NEXT:    global_load_u16 v3, v1, s[2:3] offset:4 glc dlc
 ; GFX11-FLUSH-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-FLUSH-TRUE16-NEXT:    v_mul_f16_e32 v0.l, v0.l, v0.h
+; GFX11-FLUSH-TRUE16-NEXT:    v_mul_f16_e32 v0.l, v0.l, v2.l
 ; GFX11-FLUSH-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FLUSH-TRUE16-NEXT:    v_add_f16_e32 v0.l, v1.l, v0.l
-; GFX11-FLUSH-TRUE16-NEXT:    global_store_b16 v2, v0, s[0:1]
+; GFX11-FLUSH-TRUE16-NEXT:    v_add_f16_e32 v0.l, v3.l, v0.l
+; GFX11-FLUSH-TRUE16-NEXT:    global_store_b16 v1, v0, s[0:1]
 ; GFX11-FLUSH-TRUE16-NEXT:    s_endpgm
 ;
 ; GFX11-FLUSH-FAKE16-LABEL: neg_neg_mad_f16:
@@ -3001,18 +3001,18 @@ define amdgpu_kernel void @neg_neg_mad_f16(ptr addrspace(1) noalias nocapture %o
 ; GFX11-DENORM-STRICT-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX11-DENORM-STRICT-TRUE16-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
 ; GFX11-DENORM-STRICT-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-DENORM-STRICT-TRUE16-NEXT:    v_lshlrev_b32_e32 v2, 1, v0
+; GFX11-DENORM-STRICT-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 1, v0
 ; GFX11-DENORM-STRICT-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-DENORM-STRICT-TRUE16-NEXT:    global_load_d16_b16 v0, v2, s[2:3] glc dlc
+; GFX11-DENORM-STRICT-TRUE16-NEXT:    global_load_u16 v0, v1, s[2:3] glc dlc
 ; GFX11-DENORM-STRICT-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-DENORM-STRICT-TRUE16-NEXT:    global_load_d16_hi_b16 v0, v2, s[2:3] offset:2 glc dlc
+; GFX11-DENORM-STRICT-TRUE16-NEXT:    global_load_u16 v2, v1, s[2:3] offset:2 glc dlc
 ; GFX11-DENORM-STRICT-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-DENORM-STRICT-TRUE16-NEXT:    global_load_d16_b16 v1, v2, s[2:3] offset:4 glc dlc
+; GFX11-DENORM-STRICT-TRUE16-NEXT:    global_load_u16 v3, v1, s[2:3] offset:4 glc dlc
 ; GFX11-DENORM-STRICT-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-DENORM-STRICT-TRUE16-NEXT:    v_mul_f16_e32 v0.l, v0.l, v0.h
+; GFX11-DENORM-STRICT-TRUE16-NEXT:    v_mul_f16_e32 v0.l, v0.l, v2.l
 ; GFX11-DENORM-STRICT-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-DENORM-STRICT-TRUE16-NEXT:    v_add_f16_e32 v0.l, v1.l, v0.l
-; GFX11-DENORM-STRICT-TRUE16-NEXT:    global_store_b16 v2, v0, s[0:1]
+; GFX11-DENORM-STRICT-TRUE16-NEXT:    v_add_f16_e32 v0.l, v3.l, v0.l
+; GFX11-DENORM-STRICT-TRUE16-NEXT:    global_store_b16 v1, v0, s[0:1]
 ; GFX11-DENORM-STRICT-TRUE16-NEXT:    s_endpgm
 ;
 ; GFX11-DENORM-STRICT-FAKE16-LABEL: neg_neg_mad_f16:
@@ -3039,16 +3039,16 @@ define amdgpu_kernel void @neg_neg_mad_f16(ptr addrspace(1) noalias nocapture %o
 ; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
 ; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    v_lshlrev_b32_e32 v2, 1, v0
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    global_load_d16_b16 v0, v2, s[2:3] glc dlc
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    global_load_u16 v1, v0, s[2:3] glc dlc
 ; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    global_load_d16_hi_b16 v0, v2, s[2:3] offset:2 glc dlc
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    global_load_u16 v2, v0, s[2:3] offset:2 glc dlc
 ; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    global_load_d16_b16 v1, v2, s[2:3] offset:4 glc dlc
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    global_load_u16 v3, v0, s[2:3] offset:4 glc dlc
 ; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    v_fmac_f16_e32 v1.l, v0.l, v0.h
-; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    global_store_b16 v2, v1, s[0:1]
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    v_fmac_f16_e32 v3.l, v1.l, v2.l
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    global_store_b16 v0, v3, s[0:1]
 ; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    s_endpgm
 ;
 ; GFX11-DENORM-CONTRACT-FAKE16-LABEL: neg_neg_mad_f16:
@@ -3189,18 +3189,18 @@ define amdgpu_kernel void @mad_fabs_sub_f16(ptr addrspace(1) noalias nocapture %
 ; GFX11-FLUSH-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX11-FLUSH-TRUE16-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
 ; GFX11-FLUSH-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FLUSH-TRUE16-NEXT:    v_lshlrev_b32_e32 v2, 1, v0
+; GFX11-FLUSH-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 1, v0
 ; GFX11-FLUSH-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-FLUSH-TRUE16-NEXT:    global_load_d16_b16 v0, v2, s[2:3] glc dlc
+; GFX11-FLUSH-TRUE16-NEXT:    global_load_u16 v0, v1, s[2:3] glc dlc
 ; GFX11-FLUSH-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-FLUSH-TRUE16-NEXT:    global_load_d16_hi_b16 v0, v2, s[2:3] offset:2 glc dlc
+; GFX11-FLUSH-TRUE16-NEXT:    global_load_u16 v2, v1, s[2:3] offset:2 glc dlc
 ; GFX11-FLUSH-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-FLUSH-TRUE16-NEXT:    global_load_d16_b16 v1, v2, s[2:3] offset:4 glc dlc
+; GFX11-FLUSH-TRUE16-NEXT:    global_load_u16 v3, v1, s[2:3] offset:4 glc dlc
 ; GFX11-FLUSH-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-FLUSH-TRUE16-NEXT:    v_mul_f16_e64 v0.l, v0.l, |v0.h|
+; GFX11-FLUSH-TRUE16-NEXT:    v_mul_f16_e64 v0.l, v0.l, |v2.l|
 ; GFX11-FLUSH-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FLUSH-TRUE16-NEXT:    v_sub_f16_e32 v0.l, v0.l, v1.l
-; GFX11-FLUSH-TRUE16-NEXT:    global_store_b16 v2, v0, s[0:1]
+; GFX11-FLUSH-TRUE16-NEXT:    v_sub_f16_e32 v0.l, v0.l, v3.l
+; GFX11-FLUSH-TRUE16-NEXT:    global_store_b16 v1, v0, s[0:1]
 ; GFX11-FLUSH-TRUE16-NEXT:    s_endpgm
 ;
 ; GFX11-FLUSH-FAKE16-LABEL: mad_fabs_sub_f16:
@@ -3227,18 +3227,18 @@ define amdgpu_kernel void @mad_fabs_sub_f16(ptr addrspace(1) noalias nocapture %
 ; GFX11-DENORM-STRICT-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX11-DENORM-STRICT-TRUE16-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
 ; GFX11-DENORM-STRICT-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-DENORM-STRICT-TRUE16-NEXT:    v_lshlrev_b32_e32 v2, 1, v0
+; GFX11-DENORM-STRICT-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 1, v0
 ; GFX11-DENORM-STRICT-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-DENORM-STRICT-TRUE16-NEXT:    global_load_d16_b16 v0, v2, s[2:3] glc dlc
+; GFX11-DENORM-STRICT-TRUE16-NEXT:    global_load_u16 v0, v1, s[2:3] glc dlc
 ; GFX11-DENORM-STRICT-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-DENORM-STRICT-TRUE16-NEXT:    global_load_d16_hi_b16 v0, v2, s[2:3] offset:2 glc dlc
+; GFX11-DENORM-STRICT-TRUE16-NEXT:    global_load_u16 v2, v1, s[2:3] offset:2 glc dlc
 ; GFX11-DENORM-STRICT-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-DENORM-STRICT-TRUE16-NEXT:    global_load_d16_b16 v1, v2, s[2:3] offset:4 glc dlc
+; GFX11-DENORM-STRICT-TRUE16-NEXT:    global_load_u16 v3, v1, s[2:3] offset:4 glc dlc
 ; GFX11-DENORM-STRICT-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-DENORM-STRICT-TRUE16-NEXT:    v_mul_f16_e64 v0.l, v0.l, |v0.h|
+; GFX11-DENORM-STRICT-TRUE16-NEXT:    v_mul_f16_e64 v0.l, v0.l, |v2.l|
 ; GFX11-DENORM-STRICT-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-DENORM-STRICT-TRUE16-NEXT:    v_sub_f16_e32 v0.l, v0.l, v1.l
-; GFX11-DENORM-STRICT-TRUE16-NEXT:    global_store_b16 v2, v0, s[0:1]
+; GFX11-DENORM-STRICT-TRUE16-NEXT:    v_sub_f16_e32 v0.l, v0.l, v3.l
+; GFX11-DENORM-STRICT-TRUE16-NEXT:    global_store_b16 v1, v0, s[0:1]
 ; GFX11-DENORM-STRICT-TRUE16-NEXT:    s_endpgm
 ;
 ; GFX11-DENORM-STRICT-FAKE16-LABEL: mad_fabs_sub_f16:
@@ -3265,16 +3265,16 @@ define amdgpu_kernel void @mad_fabs_sub_f16(ptr addrspace(1) noalias nocapture %
 ; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
 ; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    v_lshlrev_b32_e32 v2, 1, v0
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 1, v0
 ; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    global_load_d16_b16 v0, v2, s[2:3] glc dlc
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    global_load_u16 v0, v1, s[2:3] glc dlc
 ; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    global_load_d16_hi_b16 v0, v2, s[2:3] offset:2 glc dlc
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    global_load_u16 v2, v1, s[2:3] offset:2 glc dlc
 ; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    global_load_d16_b16 v1, v2, s[2:3] offset:4 glc dlc
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    global_load_u16 v3, v1, s[2:3] offset:4 glc dlc
 ; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    v_fma_f16 v0.l, v0.l, |v0.h|, -v1.l
-; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    global_store_b16 v2, v0, s[0:1]
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    v_fma_f16 v0.l, v0.l, |v2.l|, -v3.l
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    global_store_b16 v1, v0, s[0:1]
 ; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    s_endpgm
 ;
 ; GFX11-DENORM-CONTRACT-FAKE16-LABEL: mad_fabs_sub_f16:
@@ -3396,13 +3396,13 @@ define amdgpu_kernel void @fsub_c_fadd_a_a_f16(ptr addrspace(1) %out, ptr addrsp
 ; GFX11-FLUSH-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-FLUSH-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 1, v0
 ; GFX11-FLUSH-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-FLUSH-TRUE16-NEXT:    global_load_d16_b16 v0, v1, s[0:1] glc dlc
+; GFX11-FLUSH-TRUE16-NEXT:    global_load_u16 v0, v1, s[0:1] glc dlc
 ; GFX11-FLUSH-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-FLUSH-TRUE16-NEXT:    global_load_d16_hi_b16 v0, v1, s[0:1] offset:2 glc dlc
+; GFX11-FLUSH-TRUE16-NEXT:    global_load_u16 v2, v1, s[0:1] offset:2 glc dlc
 ; GFX11-FLUSH-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-FLUSH-TRUE16-NEXT:    v_add_f16_e32 v0.l, v0.l, v0.l
 ; GFX11-FLUSH-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FLUSH-TRUE16-NEXT:    v_sub_f16_e32 v0.l, v0.h, v0.l
+; GFX11-FLUSH-TRUE16-NEXT:    v_sub_f16_e32 v0.l, v2.l, v0.l
 ; GFX11-FLUSH-TRUE16-NEXT:    global_store_b16 v1, v0, s[0:1]
 ; GFX11-FLUSH-TRUE16-NEXT:    s_endpgm
 ;
@@ -3430,13 +3430,13 @@ define amdgpu_kernel void @fsub_c_fadd_a_a_f16(ptr addrspace(1) %out, ptr addrsp
 ; GFX11-DENORM-STRICT-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-DENORM-STRICT-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 1, v0
 ; GFX11-DENORM-STRICT-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-DENORM-STRICT-TRUE16-NEXT:    global_load_d16_b16 v0, v1, s[0:1] glc dlc
+; GFX11-DENORM-STRICT-TRUE16-NEXT:    global_load_u16 v0, v1, s[0:1] glc dlc
 ; GFX11-DENORM-STRICT-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-DENORM-STRICT-TRUE16-NEXT:    global_load_d16_hi_b16 v0, v1, s[0:1] offset:2 glc dlc
+; GFX11-DENORM-STRICT-TRUE16-NEXT:    global_load_u16 v2, v1, s[0:1] offset:2 glc dlc
 ; GFX11-DENORM-STRICT-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-DENORM-STRICT-TRUE16-NEXT:    v_add_f16_e32 v0.l, v0.l, v0.l
 ; GFX11-DENORM-STRICT-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-DENORM-STRICT-TRUE16-NEXT:    v_sub_f16_e32 v0.l, v0.h, v0.l
+; GFX11-DENORM-STRICT-TRUE16-NEXT:    v_sub_f16_e32 v0.l, v2.l, v0.l
 ; GFX11-DENORM-STRICT-TRUE16-NEXT:    global_store_b16 v1, v0, s[0:1]
 ; GFX11-DENORM-STRICT-TRUE16-NEXT:    s_endpgm
 ;
@@ -3462,14 +3462,14 @@ define amdgpu_kernel void @fsub_c_fadd_a_a_f16(ptr addrspace(1) %out, ptr addrsp
 ; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
 ; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 1, v0
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    global_load_d16_b16 v0, v1, s[0:1] glc dlc
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    global_load_u16 v1, v0, s[0:1] glc dlc
 ; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    global_load_d16_hi_b16 v0, v1, s[0:1] offset:2 glc dlc
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
 ; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    v_fmac_f16_e32 v0.h, -2.0, v0.l
-; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    global_store_d16_hi_b16 v1, v0, s[0:1]
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    v_fmac_f16_e32 v2.l, -2.0, v1.l
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    global_store_b16 v0, v2, s[0:1]
 ; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    s_endpgm
 ;
 ; GFX11-DENORM-CONTRACT-FAKE16-LABEL: fsub_c_fadd_a_a_f16:
@@ -3586,13 +3586,13 @@ define amdgpu_kernel void @fsub_fadd_a_a_c_f16(ptr addrspace(1) %out, ptr addrsp
 ; GFX11-FLUSH-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-FLUSH-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 1, v0
 ; GFX11-FLUSH-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-FLUSH-TRUE16-NEXT:    global_load_d16_b16 v0, v1, s[0:1] glc dlc
+; GFX11-FLUSH-TRUE16-NEXT:    global_load_u16 v0, v1, s[0:1] glc dlc
 ; GFX11-FLUSH-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-FLUSH-TRUE16-NEXT:    global_load_d16_hi_b16 v0, v1, s[0:1] offset:2 glc dlc
+; GFX11-FLUSH-TRUE16-NEXT:    global_load_u16 v2, v1, s[0:1] offset:2 glc dlc
 ; GFX11-FLUSH-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-FLUSH-TRUE16-NEXT:    v_add_f16_e32 v0.l, v0.l, v0.l
 ; GFX11-FLUSH-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FLUSH-TRUE16-NEXT:    v_sub_f16_e32 v0.l, v0.l, v0.h
+; GFX11-FLUSH-TRUE16-NEXT:    v_sub_f16_e32 v0.l, v0.l, v2.l
 ; GFX11-FLUSH-TRUE16-NEXT:    global_store_b16 v1, v0, s[0:1]
 ; GFX11-FLUSH-TRUE16-NEXT:    s_endpgm
 ;
@@ -3620,13 +3620,13 @@ define amdgpu_kernel void @fsub_fadd_a_a_c_f16(ptr addrspace(1) %out, ptr addrsp
 ; GFX11-DENORM-STRICT-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-DENORM-STRICT-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 1, v0
 ; GFX11-DENORM-STRICT-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-DENORM-STRICT-TRUE16-NEXT:    global_load_d16_b16 v0, v1, s[0:1] glc dlc
+; GFX11-DENORM-STRICT-TRUE16-NEXT:    global_load_u16 v0, v1, s[0:1] glc dlc
 ; GFX11-DENORM-STRICT-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-DENORM-STRICT-TRUE16-NEXT:    global_load_d16_hi_b16 v0, v1, s[0:1] offset:2 glc dlc
+; GFX11-DENORM-STRICT-TRUE16-NEXT:    global_load_u16 v2, v1, s[0:1] offset:2 glc dlc
 ; GFX11-DENORM-STRICT-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-DENORM-STRICT-TRUE16-NEXT:    v_add_f16_e32 v0.l, v0.l, v0.l
 ; GFX11-DENORM-STRICT-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-DENORM-STRICT-TRUE16-NEXT:    v_sub_f16_e32 v0.l, v0.l, v0.h
+; GFX11-DENORM-STRICT-TRUE16-NEXT:    v_sub_f16_e32 v0.l, v0.l, v2.l
 ; GFX11-DENORM-STRICT-TRUE16-NEXT:    global_store_b16 v1, v0, s[0:1]
 ; GFX11-DENORM-STRICT-TRUE16-NEXT:    s_endpgm
 ;
@@ -3654,11 +3654,11 @@ define amdgpu_kernel void @fsub_fadd_a_a_c_f16(ptr addrspace(1) %out, ptr addrsp
 ; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 1, v0
 ; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    global_load_d16_b16 v0, v1, s[0:1] glc dlc
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    global_load_u16 v0, v1, s[0:1] glc dlc
 ; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    global_load_d16_hi_b16 v0, v1, s[0:1] offset:2 glc dlc
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    global_load_u16 v2, v1, s[0:1] offset:2 glc dlc
 ; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    v_fma_f16 v0.l, v0.l, 2.0, -v0.h
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    v_fma_f16 v0.l, v0.l, 2.0, -v2.l
 ; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    global_store_b16 v1, v0, s[0:1]
 ; GFX11-DENORM-CONTRACT-TRUE16-NEXT:    s_endpgm
 ;
diff --git a/llvm/test/CodeGen/AMDGPU/fneg-fabs.bf16.ll b/llvm/test/CodeGen/AMDGPU/fneg-fabs.bf16.ll
index 64a9727330cfd..a3fa2f46538d2 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg-fabs.bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg-fabs.bf16.ll
@@ -399,7 +399,7 @@ define amdgpu_kernel void @v_fneg_fabs_bf16(ptr addrspace(1) %out, ptr addrspace
 ; GFX11-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
 ; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT:    global_load_d16_b16 v0, v1, s[2:3]
+; GFX11-TRUE16-NEXT:    global_load_u16 v0, v1, s[2:3]
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, 0x8000, v0
 ; GFX11-TRUE16-NEXT:    global_store_b16 v1, v0, s[0:1]
diff --git a/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll b/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll
index 9d9a851a5507e..46212d8312d90 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll
@@ -348,7 +348,7 @@ define amdgpu_kernel void @v_fneg_fabs_f16(ptr addrspace(1) %out, ptr addrspace(
 ; GFX11-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
 ; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT:    global_load_d16_b16 v0, v1, s[2:3]
+; GFX11-TRUE16-NEXT:    global_load_u16 v0, v1, s[2:3]
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, 0x8000, v0
 ; GFX11-TRUE16-NEXT:    global_store_b16 v1, v0, s[0:1]
diff --git a/llvm/test/CodeGen/AMDGPU/fneg.bf16.ll b/llvm/test/CodeGen/AMDGPU/fneg.bf16.ll
index d232693b46ad9..eada9d55a75b2 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg.bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg.bf16.ll
@@ -134,7 +134,7 @@ define amdgpu_kernel void @v_fneg_bf16(ptr addrspace(1) %out, ptr addrspace(1) %
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 1, v0
 ; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT:    global_load_d16_b16 v0, v1, s[0:1]
+; GFX11-TRUE16-NEXT:    global_load_u16 v0, v1, s[0:1]
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-TRUE16-NEXT:    v_xor_b16 v0.l, 0x8000, v0.l
 ; GFX11-TRUE16-NEXT:    global_store_b16 v1, v0, s[0:1]
@@ -303,51 +303,28 @@ define amdgpu_kernel void @v_fneg_fold_bf16(ptr addrspace(1) %out, ptr addrspace
 ; GFX9-NEXT:    global_store_short_d16_hi v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
-; GFX11-TRUE16-LABEL: v_fneg_fold_bf16:
-; GFX11-TRUE16:       ; %bb.0:
-; GFX11-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
-; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v0, 0
-; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT:    global_load_d16_b16 v1, v0, s[2:3]
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    v_xor_b32_e32 v2, 0x8000, v1
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-TRUE16-NEXT:    v_mul_f32_e32 v1, v2, v1
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_bfe_u32 v2, v1, 16, 1
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v2, v2, v1
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v2, 0x7fff, v2
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v3, 0x400000, v1
-; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v1, v2, v3, vcc_lo
-; GFX11-TRUE16-NEXT:    global_store_d16_hi_b16 v0, v1, s[0:1]
-; GFX11-TRUE16-NEXT:    s_endpgm
-;
-; GFX11-FAKE16-LABEL: v_fneg_fold_bf16:
-; GFX11-FAKE16:       ; %bb.0:
-; GFX11-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
-; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v0, 0
-; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-FAKE16-NEXT:    global_load_u16 v1, v0, s[2:3]
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT:    v_xor_b32_e32 v2, 0x8000, v1
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-FAKE16-NEXT:    v_mul_f32_e32 v1, v2, v1
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_bfe_u32 v2, v1, 16, 1
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v2, v2, v1
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v2, 0x7fff, v2
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, 0x400000, v1
-; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v2, v3, vcc_lo
-; GFX11-FAKE16-NEXT:    global_store_d16_hi_b16 v0, v1, s[0:1]
-; GFX11-FAKE16-NEXT:    s_endpgm
+; GFX11-LABEL: v_fneg_fold_bf16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
+; GFX11-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    global_load_u16 v1, v0, s[2:3]
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_xor_b32_e32 v2, 0x8000, v1
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-NEXT:    v_mul_f32_e32 v1, v2, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_bfe_u32 v2, v1, 16, 1
+; GFX11-NEXT:    v_add_nc_u32_e32 v2, v2, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-NEXT:    v_or_b32_e32 v3, 0x400000, v1
+; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-NEXT:    global_store_d16_hi_b16 v0, v1, s[0:1]
+; GFX11-NEXT:    s_endpgm
   %val = load bfloat, ptr addrspace(1) %in
   %fsub = fsub bfloat -0.0, %val
   %fmul = fmul bfloat %fsub, %val
diff --git a/llvm/test/CodeGen/AMDGPU/fneg.f16.ll b/llvm/test/CodeGen/AMDGPU/fneg.f16.ll
index cab27fca5ab0a..b0213dd33ee36 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg.f16.ll
@@ -132,7 +132,7 @@ define amdgpu_kernel void @v_fneg_f16(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 1, v0
 ; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT:    global_load_d16_b16 v0, v1, s[0:1]
+; GFX11-TRUE16-NEXT:    global_load_u16 v0, v1, s[0:1]
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-TRUE16-NEXT:    v_xor_b16 v0.l, 0x8000, v0.l
 ; GFX11-TRUE16-NEXT:    global_store_b16 v1, v0, s[0:1]
@@ -285,7 +285,7 @@ define amdgpu_kernel void @v_fneg_fold_f16(ptr addrspace(1) %out, ptr addrspace(
 ; GFX11-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
 ; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT:    global_load_d16_b16 v0, v1, s[2:3]
+; GFX11-TRUE16-NEXT:    global_load_u16 v0, v1, s[2:3]
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-TRUE16-NEXT:    v_mul_f16_e64 v0.l, -v0.l, v0.l
 ; GFX11-TRUE16-NEXT:    global_store_b16 v1, v0, s[0:1]
diff --git a/llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll b/llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll
index f2fe61f5376e4..9ff5b19d711e9 100644
--- a/llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll
+++ b/llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll
@@ -306,7 +306,7 @@ ret:
 
 ; GFX11-TRUE16-LABEL: tied_operand_test:
 ; GFX11-TRUE16:       ; %bb.0: ; %entry
-; GFX11-TRUE16:     scratch_load_d16_b16 [[LDRESULT:v[0-9]+]], off, off
+; GFX11-TRUE16:     scratch_load_u16 [[LDRESULT:v[0-9]+]], off, off
 ; GFX11-TRUE16:     v_mov_b16_e32 [[C:v[0-9]]].{{(l|h)}}, 0x7b
 ; GFX11-TRUE16-DAG:     ds_store_b16 v{{[0-9]+}}, [[LDRESULT]]  offset:10
 ; GFX11-TRUE16-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/freeze.ll b/llvm/test/CodeGen/AMDGPU/freeze.ll
index 308e86bbaf8fd..350d93e3e55e8 100644
--- a/llvm/test/CodeGen/AMDGPU/freeze.ll
+++ b/llvm/test/CodeGen/AMDGPU/freeze.ll
@@ -5563,29 +5563,13 @@ define void @freeze_i16(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) {
 ; GFX10-NEXT:    global_store_short v[2:3], v0, off
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-TRUE16-LABEL: freeze_i16:
-; GFX11-SDAG-TRUE16:       ; %bb.0:
-; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    global_load_d16_b16 v0, v[0:1], off
-; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    global_store_b16 v[2:3], v0, off
-; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-SDAG-FAKE16-LABEL: freeze_i16:
-; GFX11-SDAG-FAKE16:       ; %bb.0:
-; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT:    global_load_u16 v0, v[0:1], off
-; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT:    global_store_b16 v[2:3], v0, off
-; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-GISEL-LABEL: freeze_i16:
-; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    global_load_u16 v0, v[0:1], off
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-GISEL-NEXT:    global_store_b16 v[2:3], v0, off
-; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-LABEL: freeze_i16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    global_load_u16 v0, v[0:1], off
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    global_store_b16 v[2:3], v0, off
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %a = load i16, ptr addrspace(1) %ptra
   %freeze = freeze i16 %a
   store i16 %freeze, ptr addrspace(1) %ptrb
@@ -6214,29 +6198,13 @@ define void @freeze_f16(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) {
 ; GFX10-NEXT:    global_store_short v[2:3], v0, off
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-TRUE16-LABEL: freeze_f16:
-; GFX11-SDAG-TRUE16:       ; %bb.0:
-; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    global_load_d16_b16 v0, v[0:1], off
-; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    global_store_b16 v[2:3], v0, off
-; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-SDAG-FAKE16-LABEL: freeze_f16:
-; GFX11-SDAG-FAKE16:       ; %bb.0:
-; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT:    global_load_u16 v0, v[0:1], off
-; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT:    global_store_b16 v[2:3], v0, off
-; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-GISEL-LABEL: freeze_f16:
-; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    global_load_u16 v0, v[0:1], off
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-GISEL-NEXT:    global_store_b16 v[2:3], v0, off
-; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-LABEL: freeze_f16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    global_load_u16 v0, v[0:1], off
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    global_store_b16 v[2:3], v0, off
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %a = load half, ptr addrspace(1) %ptra
   %freeze = freeze half %a
   store half %freeze, ptr addrspace(1) %ptrb
@@ -6871,29 +6839,13 @@ define void @freeze_bf16(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) {
 ; GFX10-NEXT:    global_store_short v[2:3], v0, off
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-TRUE16-LABEL: freeze_bf16:
-; GFX11-SDAG-TRUE16:       ; %bb.0:
-; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    global_load_d16_b16 v0, v[0:1], off
-; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    global_store_b16 v[2:3], v0, off
-; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-SDAG-FAKE16-LABEL: freeze_bf16:
-; GFX11-SDAG-FAKE16:       ; %bb.0:
-; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT:    global_load_u16 v0, v[0:1], off
-; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT:    global_store_b16 v[2:3], v0, off
-; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-GISEL-LABEL: freeze_bf16:
-; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    global_load_u16 v0, v[0:1], off
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-GISEL-NEXT:    global_store_b16 v[2:3], v0, off
-; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-LABEL: freeze_bf16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    global_load_u16 v0, v[0:1], off
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    global_store_b16 v[2:3], v0, off
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %a = load bfloat, ptr addrspace(1) %ptra
   %freeze = freeze bfloat %a
   store bfloat %freeze, ptr addrspace(1) %ptrb
@@ -12151,29 +12103,13 @@ define void @freeze_i8(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) {
 ; GFX10-NEXT:    global_store_byte v[2:3], v0, off
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-TRUE16-LABEL: freeze_i8:
-; GFX11-SDAG-TRUE16:       ; %bb.0:
-; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    global_load_d16_u8 v0, v[0:1], off
-; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    global_store_b8 v[2:3], v0, off
-; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-SDAG-FAKE16-LABEL: freeze_i8:
-; GFX11-SDAG-FAKE16:       ; %bb.0:
-; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT:    global_load_u8 v0, v[0:1], off
-; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT:    global_store_b8 v[2:3], v0, off
-; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-GISEL-LABEL: freeze_i8:
-; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    global_load_u8 v0, v[0:1], off
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-GISEL-NEXT:    global_store_b8 v[2:3], v0, off
-; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-LABEL: freeze_i8:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    global_load_u8 v0, v[0:1], off
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    global_store_b8 v[2:3], v0, off
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %a = load i8, ptr addrspace(1) %ptra
   %freeze = freeze i8 %a
   store i8 %freeze, ptr addrspace(1) %ptrb
@@ -12287,21 +12223,13 @@ define void @freeze_v2i8(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) {
 ; GFX10-GISEL-NEXT:    global_store_short v[2:3], v0, off
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-TRUE16-LABEL: freeze_v2i8:
-; GFX11-SDAG-TRUE16:       ; %bb.0:
-; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    global_load_d16_b16 v0, v[0:1], off
-; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    global_store_b16 v[2:3], v0, off
-; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-SDAG-FAKE16-LABEL: freeze_v2i8:
-; GFX11-SDAG-FAKE16:       ; %bb.0:
-; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT:    global_load_u16 v0, v[0:1], off
-; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT:    global_store_b16 v[2:3], v0, off
-; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-LABEL: freeze_v2i8:
+; GFX11-SDAG:       ; %bb.0:
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT:    global_load_u16 v0, v[0:1], off
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-SDAG-NEXT:    global_store_b16 v[2:3], v0, off
+; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: freeze_v2i8:
 ; GFX11-GISEL:       ; %bb.0:
@@ -13451,7 +13379,7 @@ define void @freeze_v2i1(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) {
 ; GFX11-SDAG-TRUE16-LABEL: freeze_v2i1:
 ; GFX11-SDAG-TRUE16:       ; %bb.0:
 ; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    global_load_d16_u8 v0, v[0:1], off
+; GFX11-SDAG-TRUE16-NEXT:    global_load_u8 v0, v[0:1], off
 ; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-SDAG-TRUE16-NEXT:    v_and_b16 v0.l, v0.l, 3
 ; GFX11-SDAG-TRUE16-NEXT:    global_store_b8 v[2:3], v0, off
@@ -13626,7 +13554,7 @@ define void @freeze_v3i1(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) {
 ; GFX11-SDAG-TRUE16-LABEL: freeze_v3i1:
 ; GFX11-SDAG-TRUE16:       ; %bb.0:
 ; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    global_load_d16_u8 v0, v[0:1], off
+; GFX11-SDAG-TRUE16-NEXT:    global_load_u8 v0, v[0:1], off
 ; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-SDAG-TRUE16-NEXT:    v_and_b16 v0.l, v0.l, 7
 ; GFX11-SDAG-TRUE16-NEXT:    global_store_b8 v[2:3], v0, off
diff --git a/llvm/test/CodeGen/AMDGPU/frem.ll b/llvm/test/CodeGen/AMDGPU/frem.ll
index 20009aee6e7ff..b4098fb96fed6 100644
--- a/llvm/test/CodeGen/AMDGPU/frem.ll
+++ b/llvm/test/CodeGen/AMDGPU/frem.ll
@@ -525,23 +525,23 @@ define amdgpu_kernel void @frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1
 ; GFX11-TRUE16-NEXT:    s_clause 0x1
 ; GFX11-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX11-TRUE16-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
-; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-TRUE16-NEXT:    s_clause 0x1
-; GFX11-TRUE16-NEXT:    global_load_d16_b16 v1, v0, s[2:3]
-; GFX11-TRUE16-NEXT:    global_load_d16_b16 v0, v0, s[4:5] offset:8
+; GFX11-TRUE16-NEXT:    global_load_u16 v0, v1, s[2:3]
+; GFX11-TRUE16-NEXT:    global_load_u16 v1, v1, s[4:5] offset:8
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(1)
-; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e64 v3, |v1.l|
+; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e64 v3, |v0.l|
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e64 v2, |v0.l|
+; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e64 v2, |v1.l|
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-TRUE16-NEXT:    v_cmp_ngt_f32_e32 vcc_lo, v3, v2
 ; GFX11-TRUE16-NEXT:    s_cbranch_vccz .LBB0_2
 ; GFX11-TRUE16-NEXT:  ; %bb.1: ; %frem.else
-; GFX11-TRUE16-NEXT:    v_bfi_b32 v4, 0x7fff, 0, v1
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v4, 0x7fff, 0, v0
 ; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, v3, v2
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v4.l, v1.l, v4.l, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v4.l, v0.l, v4.l, vcc_lo
 ; GFX11-TRUE16-NEXT:    s_cbranch_execz .LBB0_3
 ; GFX11-TRUE16-NEXT:    s_branch .LBB0_8
 ; GFX11-TRUE16-NEXT:  .LBB0_2:
@@ -622,10 +622,10 @@ define amdgpu_kernel void @frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1
 ; GFX11-TRUE16-NEXT:    v_ldexp_f32 v2, v3, v2
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v2.l, v2
-; GFX11-TRUE16-NEXT:    v_bfi_b32 v4, 0x7fff, v2, v1
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v4, 0x7fff, v2, v0
 ; GFX11-TRUE16-NEXT:  .LBB0_8: ; %Flow19
-; GFX11-TRUE16-NEXT:    v_cmp_lg_f16_e32 vcc_lo, 0, v0.l
-; GFX11-TRUE16-NEXT:    v_cmp_nle_f16_e64 s2, 0x7c00, |v1.l|
+; GFX11-TRUE16-NEXT:    v_cmp_lg_f16_e32 vcc_lo, 0, v1.l
+; GFX11-TRUE16-NEXT:    v_cmp_nle_f16_e64 s2, 0x7c00, |v0.l|
 ; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX11-TRUE16-NEXT:    s_and_b32 s2, s2, vcc_lo
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
@@ -748,122 +748,124 @@ define amdgpu_kernel void @frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1
 ; GFX1150-TRUE16-LABEL: frem_f16:
 ; GFX1150-TRUE16:       ; %bb.0:
 ; GFX1150-TRUE16-NEXT:    s_clause 0x1
-; GFX1150-TRUE16-NEXT:    s_load_b128 s[8:11], s[4:5], 0x24
-; GFX1150-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x34
-; GFX1150-TRUE16-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1150-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1150-TRUE16-NEXT:    s_load_b64 s[6:7], s[4:5], 0x34
+; GFX1150-TRUE16-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1150-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1150-TRUE16-NEXT:    s_clause 0x1
-; GFX1150-TRUE16-NEXT:    global_load_d16_b16 v0, v1, s[10:11]
-; GFX1150-TRUE16-NEXT:    global_load_d16_b16 v1, v1, s[0:1] offset:8
+; GFX1150-TRUE16-NEXT:    global_load_u16 v1, v0, s[2:3]
+; GFX1150-TRUE16-NEXT:    global_load_u16 v0, v0, s[6:7] offset:8
 ; GFX1150-TRUE16-NEXT:    s_waitcnt vmcnt(1)
-; GFX1150-TRUE16-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX1150-TRUE16-NEXT:    v_readfirstlane_b32 s4, v1
 ; GFX1150-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX1150-TRUE16-NEXT:    v_readfirstlane_b32 s1, v1
-; GFX1150-TRUE16-NEXT:    s_and_b32 s0, s0, 0x7fff
-; GFX1150-TRUE16-NEXT:    s_and_b32 s2, s1, 0x7fff
-; GFX1150-TRUE16-NEXT:    s_cvt_f32_f16 s1, s0
-; GFX1150-TRUE16-NEXT:    s_cvt_f32_f16 s0, s2
+; GFX1150-TRUE16-NEXT:    v_readfirstlane_b32 s3, v0
+; GFX1150-TRUE16-NEXT:    s_and_b32 s2, s4, 0x7fff
+; GFX1150-TRUE16-NEXT:    s_and_b32 s5, s3, 0x7fff
+; GFX1150-TRUE16-NEXT:    s_cvt_f32_f16 s6, s2
+; GFX1150-TRUE16-NEXT:    s_cvt_f32_f16 s5, s5
 ; GFX1150-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_3)
-; GFX1150-TRUE16-NEXT:    s_cmp_ngt_f32 s1, s0
+; GFX1150-TRUE16-NEXT:    s_cmp_ngt_f32 s6, s5
 ; GFX1150-TRUE16-NEXT:    s_cbranch_scc0 .LBB0_2
 ; GFX1150-TRUE16-NEXT:  ; %bb.1: ; %frem.else
-; GFX1150-TRUE16-NEXT:    s_cmp_eq_f32 s1, s0
-; GFX1150-TRUE16-NEXT:    v_bfi_b32 v2, 0x7fff, 0, v0
-; GFX1150-TRUE16-NEXT:    s_cselect_b32 s3, -1, 0
+; GFX1150-TRUE16-NEXT:    s_cmp_eq_f32 s6, s5
+; GFX1150-TRUE16-NEXT:    v_bfi_b32 v0, 0x7fff, 0, s4
+; GFX1150-TRUE16-NEXT:    s_cselect_b32 s8, -1, 0
 ; GFX1150-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX1150-TRUE16-NEXT:    v_cndmask_b16 v2.l, v0.l, v2.l, s3
+; GFX1150-TRUE16-NEXT:    v_cndmask_b16 v0.l, s4, v0.l, s8
 ; GFX1150-TRUE16-NEXT:    s_cbranch_execz .LBB0_3
 ; GFX1150-TRUE16-NEXT:    s_branch .LBB0_8
 ; GFX1150-TRUE16-NEXT:  .LBB0_2:
-; GFX1150-TRUE16-NEXT:    ; implicit-def: $vgpr2
+; GFX1150-TRUE16-NEXT:    ; implicit-def: $vgpr0
 ; GFX1150-TRUE16-NEXT:  .LBB0_3: ; %frem.compute
-; GFX1150-TRUE16-NEXT:    v_frexp_mant_f32_e32 v3, s0
-; GFX1150-TRUE16-NEXT:    v_frexp_mant_f32_e32 v2, s1
-; GFX1150-TRUE16-NEXT:    v_frexp_exp_i32_f32_e32 v5, s1
+; GFX1150-TRUE16-NEXT:    v_frexp_mant_f32_e32 v1, s5
+; GFX1150-TRUE16-NEXT:    v_frexp_mant_f32_e32 v0, s6
+; GFX1150-TRUE16-NEXT:    v_frexp_exp_i32_f32_e32 v3, s6
 ; GFX1150-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX1150-TRUE16-NEXT:    v_ldexp_f32 v3, v3, 1
-; GFX1150-TRUE16-NEXT:    v_ldexp_f32 v4, v2, 11
-; GFX1150-TRUE16-NEXT:    v_frexp_exp_i32_f32_e32 v2, s0
+; GFX1150-TRUE16-NEXT:    v_ldexp_f32 v1, v1, 1
+; GFX1150-TRUE16-NEXT:    v_ldexp_f32 v2, v0, 11
+; GFX1150-TRUE16-NEXT:    v_frexp_exp_i32_f32_e32 v0, s5
 ; GFX1150-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX1150-TRUE16-NEXT:    v_readfirstlane_b32 s1, v5
-; GFX1150-TRUE16-NEXT:    v_div_scale_f32 v7, null, v3, v3, 1.0
+; GFX1150-TRUE16-NEXT:    v_readfirstlane_b32 s6, v3
+; GFX1150-TRUE16-NEXT:    v_div_scale_f32 v5, null, v1, v1, 1.0
 ; GFX1150-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX1150-TRUE16-NEXT:    v_readfirstlane_b32 s0, v2
-; GFX1150-TRUE16-NEXT:    v_add_nc_u32_e32 v2, -1, v2
-; GFX1150-TRUE16-NEXT:    v_rcp_f32_e32 v8, v7
+; GFX1150-TRUE16-NEXT:    v_readfirstlane_b32 s5, v0
+; GFX1150-TRUE16-NEXT:    v_add_nc_u32_e32 v0, -1, v0
+; GFX1150-TRUE16-NEXT:    v_rcp_f32_e32 v6, v5
 ; GFX1150-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-TRUE16-NEXT:    v_not_b32_e32 v6, v2
-; GFX1150-TRUE16-NEXT:    v_add_nc_u32_e32 v6, v6, v5
-; GFX1150-TRUE16-NEXT:    v_div_scale_f32 v5, vcc_lo, 1.0, v3, 1.0
+; GFX1150-TRUE16-NEXT:    v_not_b32_e32 v4, v0
+; GFX1150-TRUE16-NEXT:    v_add_nc_u32_e32 v4, v4, v3
+; GFX1150-TRUE16-NEXT:    v_div_scale_f32 v3, vcc_lo, 1.0, v1, 1.0
 ; GFX1150-TRUE16-NEXT:    s_denorm_mode 15
 ; GFX1150-TRUE16-NEXT:    s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-TRUE16-NEXT:    v_fma_f32 v9, -v7, v8, 1.0
-; GFX1150-TRUE16-NEXT:    v_fmac_f32_e32 v8, v9, v8
+; GFX1150-TRUE16-NEXT:    v_fma_f32 v7, -v5, v6, 1.0
+; GFX1150-TRUE16-NEXT:    v_fmac_f32_e32 v6, v7, v6
 ; GFX1150-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-TRUE16-NEXT:    v_mul_f32_e32 v9, v5, v8
-; GFX1150-TRUE16-NEXT:    v_fma_f32 v10, -v7, v9, v5
+; GFX1150-TRUE16-NEXT:    v_mul_f32_e32 v7, v3, v6
+; GFX1150-TRUE16-NEXT:    v_fma_f32 v8, -v5, v7, v3
 ; GFX1150-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-TRUE16-NEXT:    v_fmac_f32_e32 v9, v10, v8
-; GFX1150-TRUE16-NEXT:    v_fma_f32 v5, -v7, v9, v5
+; GFX1150-TRUE16-NEXT:    v_fmac_f32_e32 v7, v8, v6
+; GFX1150-TRUE16-NEXT:    v_fma_f32 v3, -v5, v7, v3
 ; GFX1150-TRUE16-NEXT:    s_denorm_mode 12
 ; GFX1150-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1150-TRUE16-NEXT:    v_div_fmas_f32 v5, v5, v8, v9
-; GFX1150-TRUE16-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 12, v6
-; GFX1150-TRUE16-NEXT:    v_div_fixup_f32 v5, v5, v3, 1.0
+; GFX1150-TRUE16-NEXT:    v_div_fmas_f32 v3, v3, v6, v7
+; GFX1150-TRUE16-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 12, v4
+; GFX1150-TRUE16-NEXT:    v_div_fixup_f32 v3, v3, v1, 1.0
 ; GFX1150-TRUE16-NEXT:    s_cbranch_vccnz .LBB0_7
 ; GFX1150-TRUE16-NEXT:  ; %bb.4: ; %frem.loop_body.preheader
-; GFX1150-TRUE16-NEXT:    s_sub_i32 s0, s1, s0
+; GFX1150-TRUE16-NEXT:    s_sub_i32 s5, s6, s5
 ; GFX1150-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1150-TRUE16-NEXT:    s_add_i32 s0, s0, 11
+; GFX1150-TRUE16-NEXT:    s_add_i32 s5, s5, 11
 ; GFX1150-TRUE16-NEXT:  .LBB0_5: ; %frem.loop_body
 ; GFX1150-TRUE16-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1150-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1150-TRUE16-NEXT:    v_mov_b32_e32 v7, v4
-; GFX1150-TRUE16-NEXT:    s_add_i32 s0, s0, -11
-; GFX1150-TRUE16-NEXT:    s_cmp_gt_i32 s0, 11
+; GFX1150-TRUE16-NEXT:    v_mov_b32_e32 v5, v2
+; GFX1150-TRUE16-NEXT:    s_add_i32 s5, s5, -11
+; GFX1150-TRUE16-NEXT:    s_cmp_gt_i32 s5, 11
 ; GFX1150-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-TRUE16-NEXT:    v_mul_f32_e32 v4, v7, v5
-; GFX1150-TRUE16-NEXT:    v_rndne_f32_e32 v4, v4
+; GFX1150-TRUE16-NEXT:    v_mul_f32_e32 v2, v5, v3
+; GFX1150-TRUE16-NEXT:    v_rndne_f32_e32 v2, v2
 ; GFX1150-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-TRUE16-NEXT:    v_xor_b32_e32 v4, 0x80000000, v4
-; GFX1150-TRUE16-NEXT:    v_fma_f32 v4, v4, v3, v7
+; GFX1150-TRUE16-NEXT:    v_xor_b32_e32 v2, 0x80000000, v2
+; GFX1150-TRUE16-NEXT:    v_fma_f32 v2, v2, v1, v5
 ; GFX1150-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX1150-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0, v4
-; GFX1150-TRUE16-NEXT:    v_add_f32_e32 v6, v4, v3
-; GFX1150-TRUE16-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc_lo
+; GFX1150-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0, v2
+; GFX1150-TRUE16-NEXT:    v_add_f32_e32 v4, v2, v1
+; GFX1150-TRUE16-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc_lo
 ; GFX1150-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1150-TRUE16-NEXT:    v_ldexp_f32 v4, v4, 11
+; GFX1150-TRUE16-NEXT:    v_ldexp_f32 v2, v2, 11
 ; GFX1150-TRUE16-NEXT:    s_cbranch_scc1 .LBB0_5
 ; GFX1150-TRUE16-NEXT:  ; %bb.6: ; %Flow
-; GFX1150-TRUE16-NEXT:    v_mov_b32_e32 v6, s0
-; GFX1150-TRUE16-NEXT:    v_mov_b32_e32 v4, v7
+; GFX1150-TRUE16-NEXT:    v_mov_b32_e32 v4, s5
+; GFX1150-TRUE16-NEXT:    v_mov_b32_e32 v2, v5
 ; GFX1150-TRUE16-NEXT:  .LBB0_7: ; %frem.loop_exit
 ; GFX1150-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-TRUE16-NEXT:    v_add_nc_u32_e32 v6, -10, v6
-; GFX1150-TRUE16-NEXT:    v_ldexp_f32 v4, v4, v6
+; GFX1150-TRUE16-NEXT:    v_add_nc_u32_e32 v4, -10, v4
+; GFX1150-TRUE16-NEXT:    v_ldexp_f32 v2, v2, v4
 ; GFX1150-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-TRUE16-NEXT:    v_mul_f32_e32 v5, v4, v5
-; GFX1150-TRUE16-NEXT:    v_rndne_f32_e32 v5, v5
+; GFX1150-TRUE16-NEXT:    v_mul_f32_e32 v3, v2, v3
+; GFX1150-TRUE16-NEXT:    v_rndne_f32_e32 v3, v3
 ; GFX1150-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-TRUE16-NEXT:    v_xor_b32_e32 v5, 0x80000000, v5
-; GFX1150-TRUE16-NEXT:    v_fmac_f32_e32 v4, v5, v3
+; GFX1150-TRUE16-NEXT:    v_xor_b32_e32 v3, 0x80000000, v3
+; GFX1150-TRUE16-NEXT:    v_fmac_f32_e32 v2, v3, v1
 ; GFX1150-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX1150-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0, v4
-; GFX1150-TRUE16-NEXT:    v_add_f32_e32 v3, v4, v3
-; GFX1150-TRUE16-NEXT:    v_cndmask_b32_e32 v3, v4, v3, vcc_lo
+; GFX1150-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0, v2
+; GFX1150-TRUE16-NEXT:    v_add_f32_e32 v1, v2, v1
+; GFX1150-TRUE16-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc_lo
 ; GFX1150-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-TRUE16-NEXT:    v_ldexp_f32 v2, v3, v2
-; GFX1150-TRUE16-NEXT:    v_cvt_f16_f32_e32 v2.l, v2
+; GFX1150-TRUE16-NEXT:    v_ldexp_f32 v0, v1, v0
+; GFX1150-TRUE16-NEXT:    v_cvt_f16_f32_e32 v0.l, v0
 ; GFX1150-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1150-TRUE16-NEXT:    v_bfi_b32 v2, 0x7fff, v2, v0
+; GFX1150-TRUE16-NEXT:    v_bfi_b32 v0, 0x7fff, v0, s4
 ; GFX1150-TRUE16-NEXT:  .LBB0_8: ; %Flow19
-; GFX1150-TRUE16-NEXT:    v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v0, 0x7fff, v0
-; GFX1150-TRUE16-NEXT:    v_cmp_lg_f16_e32 vcc_lo, 0, v1.l
-; GFX1150-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1150-TRUE16-NEXT:    v_cmp_nle_f16_e64 s0, 0x7c00, v0.l
-; GFX1150-TRUE16-NEXT:    s_and_b32 s0, s0, vcc_lo
-; GFX1150-TRUE16-NEXT:    v_cndmask_b16 v0.l, 0x7e00, v2.l, s0
-; GFX1150-TRUE16-NEXT:    global_store_b16 v3, v0, s[8:9]
+; GFX1150-TRUE16-NEXT:    s_cmp_lg_f16 s3, 0
+; GFX1150-TRUE16-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1150-TRUE16-NEXT:    s_cselect_b32 s3, -1, 0
+; GFX1150-TRUE16-NEXT:    s_cmp_nge_f16 s2, 0x7c00
+; GFX1150-TRUE16-NEXT:    s_cselect_b32 s2, -1, 0
+; GFX1150-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1150-TRUE16-NEXT:    s_and_b32 s2, s2, s3
+; GFX1150-TRUE16-NEXT:    v_cndmask_b16 v0.l, 0x7e00, v0.l, s2
+; GFX1150-TRUE16-NEXT:    global_store_b16 v1, v0, s[0:1]
 ; GFX1150-TRUE16-NEXT:    s_endpgm
 ;
 ; GFX1150-FAKE16-LABEL: frem_f16:
@@ -1424,37 +1426,35 @@ define amdgpu_kernel void @fast_frem_f16(ptr addrspace(1) %out, ptr addrspace(1)
 ; GFX11-TRUE16-NEXT:    s_clause 0x1
 ; GFX11-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX11-TRUE16-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
-; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-TRUE16-NEXT:    s_clause 0x1
-; GFX11-TRUE16-NEXT:    global_load_d16_b16 v0, v2, s[2:3]
-; GFX11-TRUE16-NEXT:    global_load_d16_b16 v1, v2, s[4:5] offset:8
+; GFX11-TRUE16-NEXT:    global_load_u16 v2, v1, s[2:3]
+; GFX11-TRUE16-NEXT:    global_load_u16 v3, v1, s[4:5] offset:8
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(1)
-; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e32 v3, v0.l
+; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e32 v0, v2.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e32 v4, v1.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v1.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v6.l, v0.l
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e32 v4, v3.l
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
 ; GFX11-TRUE16-NEXT:    v_rcp_f32_e32 v4, v4
 ; GFX11-TRUE16-NEXT:    s_waitcnt_depctr 0xfff
-; GFX11-TRUE16-NEXT:    v_mul_f32_e32 v3, v3, v4
-; GFX11-TRUE16-NEXT:    v_fma_mix_f32 v7, -v5, v3, v6 op_sel_hi:[1,0,1]
+; GFX11-TRUE16-NEXT:    v_mul_f32_e32 v0, v0, v4
+; GFX11-TRUE16-NEXT:    v_fma_mix_f32 v5, -v3, v0, v2 op_sel_hi:[1,0,1]
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_fmac_f32_e32 v3, v7, v4
-; GFX11-TRUE16-NEXT:    v_fma_mix_f32 v5, -v5, v3, v6 op_sel_hi:[1,0,1]
+; GFX11-TRUE16-NEXT:    v_fmac_f32_e32 v0, v5, v4
+; GFX11-TRUE16-NEXT:    v_fma_mix_f32 v5, -v3, v0, v2 op_sel_hi:[1,0,1]
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-TRUE16-NEXT:    v_mul_f32_e32 v4, v5, v4
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v4, 0xff800000, v4
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_add_f32_e32 v3, v4, v3
-; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v0.h, v3
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v0, v4, v0
+; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v0.l, v0
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_div_fixup_f16 v0.h, v0.h, v1.l, v0.l
-; GFX11-TRUE16-NEXT:    v_trunc_f16_e32 v0.h, v0.h
+; GFX11-TRUE16-NEXT:    v_div_fixup_f16 v0.l, v0.l, v3.l, v2.l
+; GFX11-TRUE16-NEXT:    v_trunc_f16_e32 v0.l, v0.l
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_fma_f16 v0.l, -v0.h, v1.l, v0.l
-; GFX11-TRUE16-NEXT:    global_store_b16 v2, v0, s[0:1]
+; GFX11-TRUE16-NEXT:    v_fma_f16 v0.l, -v0.l, v3.l, v2.l
+; GFX11-TRUE16-NEXT:    global_store_b16 v1, v0, s[0:1]
 ; GFX11-TRUE16-NEXT:    s_endpgm
 ;
 ; GFX11-FAKE16-LABEL: fast_frem_f16:
@@ -1501,35 +1501,34 @@ define amdgpu_kernel void @fast_frem_f16(ptr addrspace(1) %out, ptr addrspace(1)
 ; GFX1150-TRUE16-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1150-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1150-TRUE16-NEXT:    s_clause 0x1
-; GFX1150-TRUE16-NEXT:    global_load_d16_b16 v0, v2, s[2:3]
-; GFX1150-TRUE16-NEXT:    global_load_d16_b16 v1, v2, s[4:5] offset:8
+; GFX1150-TRUE16-NEXT:    global_load_u16 v3, v2, s[2:3]
+; GFX1150-TRUE16-NEXT:    global_load_u16 v4, v2, s[4:5] offset:8
 ; GFX1150-TRUE16-NEXT:    s_waitcnt vmcnt(1)
-; GFX1150-TRUE16-NEXT:    v_cvt_f32_f16_e32 v3, v0.l
+; GFX1150-TRUE16-NEXT:    v_cvt_f32_f16_e32 v0, v3.l
 ; GFX1150-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX1150-TRUE16-NEXT:    v_cvt_f32_f16_e32 v4, v1.l
-; GFX1150-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v1.l
-; GFX1150-TRUE16-NEXT:    v_mov_b16_e32 v6.l, v0.l
-; GFX1150-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(TRANS32_DEP_1)
-; GFX1150-TRUE16-NEXT:    v_rcp_f32_e32 v4, v4
-; GFX1150-TRUE16-NEXT:    v_mul_f32_e32 v3, v3, v4
+; GFX1150-TRUE16-NEXT:    v_cvt_f32_f16_e32 v1, v4.l
+; GFX1150-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1)
+; GFX1150-TRUE16-NEXT:    v_rcp_f32_e32 v1, v1
+; GFX1150-TRUE16-NEXT:    v_mul_f32_e32 v0, v0, v1
 ; GFX1150-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-TRUE16-NEXT:    v_fma_mix_f32 v7, -v5, v3, v6 op_sel_hi:[1,0,1]
-; GFX1150-TRUE16-NEXT:    v_fmac_f32_e32 v3, v7, v4
+; GFX1150-TRUE16-NEXT:    v_fma_mix_f32 v5, -v4, v0, v3 op_sel_hi:[1,0,1]
+; GFX1150-TRUE16-NEXT:    v_fmac_f32_e32 v0, v5, v1
 ; GFX1150-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-TRUE16-NEXT:    v_fma_mix_f32 v5, -v5, v3, v6 op_sel_hi:[1,0,1]
-; GFX1150-TRUE16-NEXT:    v_mul_f32_e32 v4, v5, v4
+; GFX1150-TRUE16-NEXT:    v_fma_mix_f32 v5, -v4, v0, v3 op_sel_hi:[1,0,1]
+; GFX1150-TRUE16-NEXT:    v_mul_f32_e32 v1, v5, v1
 ; GFX1150-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-TRUE16-NEXT:    v_and_b32_e32 v4, 0xff800000, v4
-; GFX1150-TRUE16-NEXT:    v_add_f32_e32 v3, v4, v3
-; GFX1150-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-TRUE16-NEXT:    v_cvt_f16_f32_e32 v0.h, v3
-; GFX1150-TRUE16-NEXT:    v_div_fixup_f16 v0.h, v0.h, v1.l, v0.l
+; GFX1150-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff800000, v1
+; GFX1150-TRUE16-NEXT:    v_add_f32_e32 v0, v1, v0
+; GFX1150-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v3.l
+; GFX1150-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT:    v_cvt_f16_f32_e32 v0.l, v0
+; GFX1150-TRUE16-NEXT:    v_div_fixup_f16 v0.l, v0.l, v4.l, v1.l
 ; GFX1150-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-TRUE16-NEXT:    v_trunc_f16_e32 v3.l, v0.h
-; GFX1150-TRUE16-NEXT:    v_xor_b32_e32 v3, 0x8000, v3
+; GFX1150-TRUE16-NEXT:    v_trunc_f16_e32 v0.l, v0.l
+; GFX1150-TRUE16-NEXT:    v_xor_b32_e32 v0, 0x8000, v0
 ; GFX1150-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1150-TRUE16-NEXT:    v_fmac_f16_e32 v0.l, v3.l, v1.l
-; GFX1150-TRUE16-NEXT:    global_store_b16 v2, v0, s[0:1]
+; GFX1150-TRUE16-NEXT:    v_fmac_f16_e32 v3.l, v0.l, v4.l
+; GFX1150-TRUE16-NEXT:    global_store_b16 v2, v3, s[0:1]
 ; GFX1150-TRUE16-NEXT:    s_endpgm
 ;
 ; GFX1150-FAKE16-LABEL: fast_frem_f16:
@@ -1831,37 +1830,35 @@ define amdgpu_kernel void @unsafe_frem_f16(ptr addrspace(1) %out, ptr addrspace(
 ; GFX11-TRUE16-NEXT:    s_clause 0x1
 ; GFX11-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX11-TRUE16-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
-; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-TRUE16-NEXT:    s_clause 0x1
-; GFX11-TRUE16-NEXT:    global_load_d16_b16 v0, v2, s[2:3]
-; GFX11-TRUE16-NEXT:    global_load_d16_b16 v1, v2, s[4:5] offset:8
+; GFX11-TRUE16-NEXT:    global_load_u16 v2, v1, s[2:3]
+; GFX11-TRUE16-NEXT:    global_load_u16 v3, v1, s[4:5] offset:8
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(1)
-; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e32 v3, v0.l
+; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e32 v0, v2.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e32 v4, v1.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v1.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v6.l, v0.l
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e32 v4, v3.l
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
 ; GFX11-TRUE16-NEXT:    v_rcp_f32_e32 v4, v4
 ; GFX11-TRUE16-NEXT:    s_waitcnt_depctr 0xfff
-; GFX11-TRUE16-NEXT:    v_mul_f32_e32 v3, v3, v4
-; GFX11-TRUE16-NEXT:    v_fma_mix_f32 v7, -v5, v3, v6 op_sel_hi:[1,0,1]
+; GFX11-TRUE16-NEXT:    v_mul_f32_e32 v0, v0, v4
+; GFX11-TRUE16-NEXT:    v_fma_mix_f32 v5, -v3, v0, v2 op_sel_hi:[1,0,1]
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_fmac_f32_e32 v3, v7, v4
-; GFX11-TRUE16-NEXT:    v_fma_mix_f32 v5, -v5, v3, v6 op_sel_hi:[1,0,1]
+; GFX11-TRUE16-NEXT:    v_fmac_f32_e32 v0, v5, v4
+; GFX11-TRUE16-NEXT:    v_fma_mix_f32 v5, -v3, v0, v2 op_sel_hi:[1,0,1]
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-TRUE16-NEXT:    v_mul_f32_e32 v4, v5, v4
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v4, 0xff800000, v4
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_add_f32_e32 v3, v4, v3
-; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v0.h, v3
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v0, v4, v0
+; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v0.l, v0
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_div_fixup_f16 v0.h, v0.h, v1.l, v0.l
-; GFX11-TRUE16-NEXT:    v_trunc_f16_e32 v0.h, v0.h
+; GFX11-TRUE16-NEXT:    v_div_fixup_f16 v0.l, v0.l, v3.l, v2.l
+; GFX11-TRUE16-NEXT:    v_trunc_f16_e32 v0.l, v0.l
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_fma_f16 v0.l, -v0.h, v1.l, v0.l
-; GFX11-TRUE16-NEXT:    global_store_b16 v2, v0, s[0:1]
+; GFX11-TRUE16-NEXT:    v_fma_f16 v0.l, -v0.l, v3.l, v2.l
+; GFX11-TRUE16-NEXT:    global_store_b16 v1, v0, s[0:1]
 ; GFX11-TRUE16-NEXT:    s_endpgm
 ;
 ; GFX11-FAKE16-LABEL: unsafe_frem_f16:
@@ -1908,35 +1905,34 @@ define amdgpu_kernel void @unsafe_frem_f16(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1150-TRUE16-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1150-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1150-TRUE16-NEXT:    s_clause 0x1
-; GFX1150-TRUE16-NEXT:    global_load_d16_b16 v0, v2, s[2:3]
-; GFX1150-TRUE16-NEXT:    global_load_d16_b16 v1, v2, s[4:5] offset:8
+; GFX1150-TRUE16-NEXT:    global_load_u16 v3, v2, s[2:3]
+; GFX1150-TRUE16-NEXT:    global_load_u16 v4, v2, s[4:5] offset:8
 ; GFX1150-TRUE16-NEXT:    s_waitcnt vmcnt(1)
-; GFX1150-TRUE16-NEXT:    v_cvt_f32_f16_e32 v3, v0.l
+; GFX1150-TRUE16-NEXT:    v_cvt_f32_f16_e32 v0, v3.l
 ; GFX1150-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX1150-TRUE16-NEXT:    v_cvt_f32_f16_e32 v4, v1.l
-; GFX1150-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v1.l
-; GFX1150-TRUE16-NEXT:    v_mov_b16_e32 v6.l, v0.l
-; GFX1150-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(TRANS32_DEP_1)
-; GFX1150-TRUE16-NEXT:    v_rcp_f32_e32 v4, v4
-; GFX1150-TRUE16-NEXT:    v_mul_f32_e32 v3, v3, v4
+; GFX1150-TRUE16-NEXT:    v_cvt_f32_f16_e32 v1, v4.l
+; GFX1150-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1)
+; GFX1150-TRUE16-NEXT:    v_rcp_f32_e32 v1, v1
+; GFX1150-TRUE16-NEXT:    v_mul_f32_e32 v0, v0, v1
 ; GFX1150-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-TRUE16-NEXT:    v_fma_mix_f32 v7, -v5, v3, v6 op_sel_hi:[1,0,1]
-; GFX1150-TRUE16-NEXT:    v_fmac_f32_e32 v3, v7, v4
+; GFX1150-TRUE16-NEXT:    v_fma_mix_f32 v5, -v4, v0, v3 op_sel_hi:[1,0,1]
+; GFX1150-TRUE16-NEXT:    v_fmac_f32_e32 v0, v5, v1
 ; GFX1150-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-TRUE16-NEXT:    v_fma_mix_f32 v5, -v5, v3, v6 op_sel_hi:[1,0,1]
-; GFX1150-TRUE16-NEXT:    v_mul_f32_e32 v4, v5, v4
+; GFX1150-TRUE16-NEXT:    v_fma_mix_f32 v5, -v4, v0, v3 op_sel_hi:[1,0,1]
+; GFX1150-TRUE16-NEXT:    v_mul_f32_e32 v1, v5, v1
 ; GFX1150-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-TRUE16-NEXT:    v_and_b32_e32 v4, 0xff800000, v4
-; GFX1150-TRUE16-NEXT:    v_add_f32_e32 v3, v4, v3
-; GFX1150-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-TRUE16-NEXT:    v_cvt_f16_f32_e32 v0.h, v3
-; GFX1150-TRUE16-NEXT:    v_div_fixup_f16 v0.h, v0.h, v1.l, v0.l
+; GFX1150-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff800000, v1
+; GFX1150-TRUE16-NEXT:    v_add_f32_e32 v0, v1, v0
+; GFX1150-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v3.l
+; GFX1150-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT:    v_cvt_f16_f32_e32 v0.l, v0
+; GFX1150-TRUE16-NEXT:    v_div_fixup_f16 v0.l, v0.l, v4.l, v1.l
 ; GFX1150-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-TRUE16-NEXT:    v_trunc_f16_e32 v3.l, v0.h
-; GFX1150-TRUE16-NEXT:    v_xor_b32_e32 v3, 0x8000, v3
+; GFX1150-TRUE16-NEXT:    v_trunc_f16_e32 v0.l, v0.l
+; GFX1150-TRUE16-NEXT:    v_xor_b32_e32 v0, 0x8000, v0
 ; GFX1150-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1150-TRUE16-NEXT:    v_fmac_f16_e32 v0.l, v3.l, v1.l
-; GFX1150-TRUE16-NEXT:    global_store_b16 v2, v0, s[0:1]
+; GFX1150-TRUE16-NEXT:    v_fmac_f16_e32 v3.l, v0.l, v4.l
+; GFX1150-TRUE16-NEXT:    global_store_b16 v2, v3, s[0:1]
 ; GFX1150-TRUE16-NEXT:    s_endpgm
 ;
 ; GFX1150-FAKE16-LABEL: unsafe_frem_f16:
diff --git a/llvm/test/CodeGen/AMDGPU/function-args.ll b/llvm/test/CodeGen/AMDGPU/function-args.ll
index 3c41cc43a089e..5f083fcd2b6d0 100644
--- a/llvm/test/CodeGen/AMDGPU/function-args.ll
+++ b/llvm/test/CodeGen/AMDGPU/function-args.ll
@@ -1648,81 +1648,86 @@ define void @void_func_v32i8(<32 x i8> %arg0) #0 {
 ; GFX11-TRUE16-LABEL: void_func_v32i8:
 ; GFX11-TRUE16:       ; %bb.0:
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT:    scratch_load_d16_u8 v31, off, s32
+; GFX11-TRUE16-NEXT:    scratch_load_u8 v31, off, s32
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v32.l, 0
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v0.h, 8, v15.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v14.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v2.h, 8, v13.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v6.h, v1.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v8.h, v0.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v0.l, 8, v27.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v26.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v1.l, 8, v25.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v24.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v24.h, v32.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v4.h, v3.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v5.h, v2.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v3.l, 8, v23.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v3.h, 0xff, v22.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v2.l, 8, v21.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.h, 0xff, v20.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v32.h, v0.h, v0.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v24.l, v1.h, v1.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v20.h, v32.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v7.h, 8, v19.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v9.h, 0xff, v18.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v10.h, 8, v17.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v11.h, 0xff, v16.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v20.l, v2.h, v2.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v2, v24, v32
+; GFX11-TRUE16-NEXT:    v_or_b16 v32.h, v3.h, v3.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v16.h, v32.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v12.h, 8, v29.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v28.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v13.h, 0xff, v30.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v16.l, v11.h, v10.h
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v20, v32
+; GFX11-TRUE16-NEXT:    v_or_b16 v32.h, v9.h, v7.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v17.h, v32.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v14.h, 8, v15.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v14.l, 0xff, v14.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v3.l, 8, v13.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v3.h, 0xff, v12.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v12.h, v32.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v4.h, 8, v11.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v5.h, 0xff, v10.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v6.h, 8, v9.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v7.h, 0xff, v8.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v32.h, v1.h, v0.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v12.l, v3.h, v2.h
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v9.h, v32.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v7.l, 8, v7.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v6.l, 0xff, v6.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v17.l, v0.l, v12.h
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v16, v32
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v15.h, v32.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v9.h, 8, v11.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v10.l, 0xff, v10.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v8.l, 0xff, v8.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v15.l, v3.h, v3.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v18.h, v32.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v10.h, 0xff, v6.l
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v5.l, 8, v5.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v4.l, 0xff, v4.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v8.l, 8, v3.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v9.l, v7.h, v6.h
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v3, v12, v32
-; GFX11-TRUE16-NEXT:    v_or_b16 v32.h, v5.h, v4.h
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v10.h, v32.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v2.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v1.l, 8, v1.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v10.l, v4.l, v5.l
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v2, v9, v32
-; GFX11-TRUE16-NEXT:    v_or_b16 v32.h, v6.l, v7.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v11.h, v32.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v4.h, 0xff, v30.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v5.h, 8, v29.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v6.h, 0xff, v28.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v11.l, v0.l, v1.l
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v10, v32
-; GFX11-TRUE16-NEXT:    v_or_b16 v32.h, v0.h, v8.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v13.h, v32.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v4.l, 8, v27.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v5.l, 0xff, v26.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v6.l, 8, v25.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v7.h, 0xff, v24.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v13.l, v6.h, v5.h
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v11, v32
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v14.h, v32.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v8.l, 8, v23.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v5.h, 8, v21.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v6.h, 0xff, v20.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v14.l, v7.h, v6.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v15.h, v32.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v8.h, 0xff, v16.l
-; GFX11-TRUE16-NEXT:    s_mov_b64 s[0:1], 16
-; GFX11-TRUE16-NEXT:    v_or_b16 v15.l, v6.h, v5.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v19.h, v32.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v4.h, 8, v4.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v8.h, 0xff, v8.h
+; GFX11-TRUE16-NEXT:    s_mov_b64 s[0:1], 0
+; GFX11-TRUE16-NEXT:    v_or_b16 v19.l, v4.l, v5.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v4.l, 0xff, v5.h
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s2, -1
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v7.l, 8, v31.l
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_or_b16 v32.h, v4.h, v7.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v4.h, 0xff, v22.l
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v7, v13, v32
-; GFX11-TRUE16-NEXT:    v_or_b16 v32.h, v5.l, v4.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v4.l, 8, v19.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v5.l, 8, v17.l
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v6, v14, v32
-; GFX11-TRUE16-NEXT:    v_or_b16 v32.h, v4.h, v8.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v4.h, 0xff, v18.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v9.l, v8.h, v5.l
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v5, v15, v32
-; GFX11-TRUE16-NEXT:    v_or_b16 v32.h, v4.h, v4.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v7.h, 8, v31.l
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_or_b16 v32.h, v13.h, v7.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v7.h, 8, v9.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v9.l, 8, v7.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v3, v17, v32
+; GFX11-TRUE16-NEXT:    v_or_b16 v32.h, v14.l, v14.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_or_b16 v18.l, v8.l, v7.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v8.l, 8, v6.h
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v7, v15, v32
+; GFX11-TRUE16-NEXT:    v_or_b16 v32.h, v10.l, v9.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v9.h, v32.l
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v6, v18, v32
+; GFX11-TRUE16-NEXT:    v_or_b16 v32.h, v10.h, v9.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v9.l, v8.h, v8.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v5, v19, v32
+; GFX11-TRUE16-NEXT:    v_or_b16 v32.h, v4.l, v4.h
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v4, v9, v32
 ; GFX11-TRUE16-NEXT:    buffer_store_b128 v[4:7], off, s[0:3], 0
-; GFX11-TRUE16-NEXT:    s_mov_b64 s[0:1], 0
+; GFX11-TRUE16-NEXT:    s_mov_b64 s[0:1], 16
 ; GFX11-TRUE16-NEXT:    buffer_store_b128 v[0:3], off, s[0:3], 0
 ; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -2669,33 +2674,19 @@ define void @void_func_byval_struct_i8_i32(ptr addrspace(5) byval({ i8, i32 }) %
 ; CIGFX89-NEXT:    s_waitcnt vmcnt(0)
 ; CIGFX89-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-TRUE16-LABEL: void_func_byval_struct_i8_i32:
-; GFX11-TRUE16:       ; %bb.0:
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT:    s_clause 0x1
-; GFX11-TRUE16-NEXT:    scratch_load_d16_u8 v0, off, s32
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v1, off, s32 offset:4
-; GFX11-TRUE16-NEXT:    s_mov_b32 s3, 0x31016000
-; GFX11-TRUE16-NEXT:    s_mov_b32 s2, -1
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    s_clause 0x1
-; GFX11-TRUE16-NEXT:    buffer_store_b32 v1, off, s[0:3], 0
-; GFX11-TRUE16-NEXT:    buffer_store_b8 v0, off, s[0:3], 0
-; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-FAKE16-LABEL: void_func_byval_struct_i8_i32:
-; GFX11-FAKE16:       ; %bb.0:
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-FAKE16-NEXT:    s_clause 0x1
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v0, off, s32 offset:4
-; GFX11-FAKE16-NEXT:    scratch_load_u8 v1, off, s32
-; GFX11-FAKE16-NEXT:    s_mov_b32 s3, 0x31016000
-; GFX11-FAKE16-NEXT:    s_mov_b32 s2, -1
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(1)
-; GFX11-FAKE16-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT:    buffer_store_b8 v1, off, s[0:3], 0
-; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-LABEL: void_func_byval_struct_i8_i32:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_load_b32 v0, off, s32 offset:4
+; GFX11-NEXT:    scratch_load_u8 v1, off, s32
+; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-NEXT:    s_mov_b32 s2, -1
+; GFX11-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    buffer_store_b8 v1, off, s[0:3], 0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %arg0.load = load { i8, i32 }, ptr addrspace(5) %arg0
   store { i8, i32 } %arg0.load, ptr addrspace(1) poison
   ret void
@@ -2779,55 +2770,30 @@ define void @void_func_byval_struct_i8_i32_x2(ptr addrspace(5) byval({ i8, i32 }
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-TRUE16-LABEL: void_func_byval_struct_i8_i32_x2:
-; GFX11-TRUE16:       ; %bb.0:
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT:    scratch_load_d16_u8 v1, off, s32 glc dlc
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v3, off, s32 offset:4 glc dlc
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    scratch_load_d16_u8 v2, off, s32 offset:8 glc dlc
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v4, off, s32 offset:12 glc dlc
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    s_mov_b32 s3, 0x31016000
-; GFX11-TRUE16-NEXT:    s_mov_b32 s2, -1
-; GFX11-TRUE16-NEXT:    buffer_store_b32 v3, off, s[0:3], 0 dlc
-; GFX11-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-TRUE16-NEXT:    buffer_store_b8 v1, off, s[0:3], 0 dlc
-; GFX11-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-TRUE16-NEXT:    buffer_store_b32 v4, off, s[0:3], 0 dlc
-; GFX11-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-TRUE16-NEXT:    buffer_store_b8 v2, off, s[0:3], 0 dlc
-; GFX11-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-TRUE16-NEXT:    ds_store_b32 v0, v0
-; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-FAKE16-LABEL: void_func_byval_struct_i8_i32_x2:
-; GFX11-FAKE16:       ; %bb.0:
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-FAKE16-NEXT:    scratch_load_u8 v1, off, s32 glc dlc
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v2, off, s32 offset:4 glc dlc
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT:    scratch_load_u8 v3, off, s32 offset:8 glc dlc
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v4, off, s32 offset:12 glc dlc
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT:    s_mov_b32 s3, 0x31016000
-; GFX11-FAKE16-NEXT:    s_mov_b32 s2, -1
-; GFX11-FAKE16-NEXT:    buffer_store_b32 v2, off, s[0:3], 0 dlc
-; GFX11-FAKE16-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-FAKE16-NEXT:    buffer_store_b8 v1, off, s[0:3], 0 dlc
-; GFX11-FAKE16-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-FAKE16-NEXT:    buffer_store_b32 v4, off, s[0:3], 0 dlc
-; GFX11-FAKE16-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-FAKE16-NEXT:    buffer_store_b8 v3, off, s[0:3], 0 dlc
-; GFX11-FAKE16-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-FAKE16-NEXT:    ds_store_b32 v0, v0
-; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-LABEL: void_func_byval_struct_i8_i32_x2:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    scratch_load_u8 v1, off, s32 glc dlc
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    scratch_load_b32 v2, off, s32 offset:4 glc dlc
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    scratch_load_u8 v3, off, s32 offset:8 glc dlc
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    scratch_load_b32 v4, off, s32 offset:12 glc dlc
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-NEXT:    s_mov_b32 s2, -1
+; GFX11-NEXT:    buffer_store_b32 v2, off, s[0:3], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    buffer_store_b8 v1, off, s[0:3], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    buffer_store_b32 v4, off, s[0:3], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    buffer_store_b8 v3, off, s[0:3], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    ds_store_b32 v0, v0
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %arg0.load = load volatile { i8, i32 }, ptr addrspace(5) %arg0
   %arg1.load = load volatile { i8, i32 }, ptr addrspace(5) %arg1
   store volatile { i8, i32 } %arg0.load, ptr addrspace(1) poison
@@ -3032,99 +2998,52 @@ define void @void_func_v32i32_i1_i8_i16_bf16(<32 x i32> %arg0, i1 %arg1, i8 %arg
 ; GFX89-NEXT:    s_waitcnt vmcnt(0)
 ; GFX89-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-TRUE16-LABEL: void_func_v32i32_i1_i8_i16_bf16:
-; GFX11-TRUE16:       ; %bb.0:
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT:    s_clause 0x5
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v31, off, s32
-; GFX11-TRUE16-NEXT:    scratch_load_u8 v36, off, s32 offset:4
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v32, off, s32 offset:8
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v33, off, s32 offset:12
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v34, off, s32 offset:16
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v35, off, s32 offset:20
-; GFX11-TRUE16-NEXT:    s_mov_b32 s3, 0x31016000
-; GFX11-TRUE16-NEXT:    s_mov_b32 s2, -1
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(5)
-; GFX11-TRUE16-NEXT:    buffer_store_b128 v[28:31], off, s[0:3], 0 dlc
-; GFX11-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-TRUE16-NEXT:    buffer_store_b128 v[24:27], off, s[0:3], 0 dlc
-; GFX11-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-TRUE16-NEXT:    buffer_store_b128 v[20:23], off, s[0:3], 0 dlc
-; GFX11-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-TRUE16-NEXT:    buffer_store_b128 v[16:19], off, s[0:3], 0 dlc
-; GFX11-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-TRUE16-NEXT:    buffer_store_b128 v[12:15], off, s[0:3], 0 dlc
-; GFX11-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-TRUE16-NEXT:    buffer_store_b128 v[8:11], off, s[0:3], 0 dlc
-; GFX11-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-TRUE16-NEXT:    buffer_store_b128 v[4:7], off, s[0:3], 0 dlc
-; GFX11-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-TRUE16-NEXT:    buffer_store_b128 v[0:3], off, s[0:3], 0 dlc
-; GFX11-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(4)
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 1, v36
-; GFX11-TRUE16-NEXT:    buffer_store_b8 v0, off, s[0:3], 0 dlc
-; GFX11-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(3)
-; GFX11-TRUE16-NEXT:    buffer_store_b8 v32, off, s[0:3], 0 dlc
-; GFX11-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(2)
-; GFX11-TRUE16-NEXT:    buffer_store_b16 v33, off, s[0:3], 0 dlc
-; GFX11-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(1)
-; GFX11-TRUE16-NEXT:    buffer_store_b16 v34, off, s[0:3], 0 dlc
-; GFX11-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    buffer_store_b16 v35, off, s[0:3], 0 dlc
-; GFX11-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-FAKE16-LABEL: void_func_v32i32_i1_i8_i16_bf16:
-; GFX11-FAKE16:       ; %bb.0:
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-FAKE16-NEXT:    s_clause 0x5
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v31, off, s32
-; GFX11-FAKE16-NEXT:    scratch_load_u8 v32, off, s32 offset:4
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v33, off, s32 offset:8
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v34, off, s32 offset:12
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v35, off, s32 offset:16
-; GFX11-FAKE16-NEXT:    scratch_load_u16 v36, off, s32 offset:20
-; GFX11-FAKE16-NEXT:    s_mov_b32 s3, 0x31016000
-; GFX11-FAKE16-NEXT:    s_mov_b32 s2, -1
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(5)
-; GFX11-FAKE16-NEXT:    buffer_store_b128 v[28:31], off, s[0:3], 0 dlc
-; GFX11-FAKE16-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-FAKE16-NEXT:    buffer_store_b128 v[24:27], off, s[0:3], 0 dlc
-; GFX11-FAKE16-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-FAKE16-NEXT:    buffer_store_b128 v[20:23], off, s[0:3], 0 dlc
-; GFX11-FAKE16-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-FAKE16-NEXT:    buffer_store_b128 v[16:19], off, s[0:3], 0 dlc
-; GFX11-FAKE16-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(4)
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v16, 1, v32
-; GFX11-FAKE16-NEXT:    buffer_store_b128 v[12:15], off, s[0:3], 0 dlc
-; GFX11-FAKE16-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-FAKE16-NEXT:    buffer_store_b128 v[8:11], off, s[0:3], 0 dlc
-; GFX11-FAKE16-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-FAKE16-NEXT:    buffer_store_b128 v[4:7], off, s[0:3], 0 dlc
-; GFX11-FAKE16-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-FAKE16-NEXT:    buffer_store_b128 v[0:3], off, s[0:3], 0 dlc
-; GFX11-FAKE16-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-FAKE16-NEXT:    buffer_store_b8 v16, off, s[0:3], 0 dlc
-; GFX11-FAKE16-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(3)
-; GFX11-FAKE16-NEXT:    buffer_store_b8 v33, off, s[0:3], 0 dlc
-; GFX11-FAKE16-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(2)
-; GFX11-FAKE16-NEXT:    buffer_store_b16 v34, off, s[0:3], 0 dlc
-; GFX11-FAKE16-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(1)
-; GFX11-FAKE16-NEXT:    buffer_store_b16 v35, off, s[0:3], 0 dlc
-; GFX11-FAKE16-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT:    buffer_store_b16 v36, off, s[0:3], 0 dlc
-; GFX11-FAKE16-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-LABEL: void_func_v32i32_i1_i8_i16_bf16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_clause 0x5
+; GFX11-NEXT:    scratch_load_b32 v31, off, s32
+; GFX11-NEXT:    scratch_load_u8 v32, off, s32 offset:4
+; GFX11-NEXT:    scratch_load_u16 v33, off, s32 offset:8
+; GFX11-NEXT:    scratch_load_u16 v34, off, s32 offset:12
+; GFX11-NEXT:    scratch_load_u16 v35, off, s32 offset:16
+; GFX11-NEXT:    scratch_load_u16 v36, off, s32 offset:20
+; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-NEXT:    s_mov_b32 s2, -1
+; GFX11-NEXT:    s_waitcnt vmcnt(5)
+; GFX11-NEXT:    buffer_store_b128 v[28:31], off, s[0:3], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    buffer_store_b128 v[24:27], off, s[0:3], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    buffer_store_b128 v[20:23], off, s[0:3], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    buffer_store_b128 v[16:19], off, s[0:3], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_waitcnt vmcnt(4)
+; GFX11-NEXT:    v_and_b32_e32 v16, 1, v32
+; GFX11-NEXT:    buffer_store_b128 v[12:15], off, s[0:3], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    buffer_store_b128 v[8:11], off, s[0:3], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    buffer_store_b128 v[4:7], off, s[0:3], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    buffer_store_b128 v[0:3], off, s[0:3], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    buffer_store_b8 v16, off, s[0:3], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_waitcnt vmcnt(3)
+; GFX11-NEXT:    buffer_store_b8 v33, off, s[0:3], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_waitcnt vmcnt(2)
+; GFX11-NEXT:    buffer_store_b16 v34, off, s[0:3], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-NEXT:    buffer_store_b16 v35, off, s[0:3], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    buffer_store_b16 v36, off, s[0:3], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   store volatile <32 x i32> %arg0, ptr addrspace(1) poison
   store volatile i1 %arg1, ptr addrspace(1) poison
   store volatile i8 %arg2, ptr addrspace(1) poison
@@ -4536,185 +4455,95 @@ define void @void_func_v32i32_v16i8(<32 x i32> %arg0, <16 x i8> %arg1) #0 {
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-TRUE16-LABEL: void_func_v32i32_v16i8:
-; GFX11-TRUE16:       ; %bb.0:
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT:    s_clause 0x10
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v31, off, s32
-; GFX11-TRUE16-NEXT:    scratch_load_d16_u8 v32, off, s32 offset:64
-; GFX11-TRUE16-NEXT:    scratch_load_d16_u8 v33, off, s32 offset:60
-; GFX11-TRUE16-NEXT:    scratch_load_d16_u8 v34, off, s32 offset:56
-; GFX11-TRUE16-NEXT:    scratch_load_d16_u8 v35, off, s32 offset:52
-; GFX11-TRUE16-NEXT:    scratch_load_d16_u8 v36, off, s32 offset:48
-; GFX11-TRUE16-NEXT:    scratch_load_d16_u8 v37, off, s32 offset:44
-; GFX11-TRUE16-NEXT:    scratch_load_d16_u8 v38, off, s32 offset:40
-; GFX11-TRUE16-NEXT:    scratch_load_d16_u8 v39, off, s32 offset:36
-; GFX11-TRUE16-NEXT:    scratch_load_d16_u8 v48, off, s32 offset:32
-; GFX11-TRUE16-NEXT:    scratch_load_d16_u8 v49, off, s32 offset:28
-; GFX11-TRUE16-NEXT:    scratch_load_d16_u8 v50, off, s32 offset:24
-; GFX11-TRUE16-NEXT:    scratch_load_d16_u8 v51, off, s32 offset:20
-; GFX11-TRUE16-NEXT:    scratch_load_d16_u8 v52, off, s32 offset:16
-; GFX11-TRUE16-NEXT:    scratch_load_d16_u8 v53, off, s32 offset:12
-; GFX11-TRUE16-NEXT:    scratch_load_d16_u8 v54, off, s32 offset:8
-; GFX11-TRUE16-NEXT:    scratch_load_d16_u8 v55, off, s32 offset:4
-; GFX11-TRUE16-NEXT:    s_mov_b32 s3, 0x31016000
-; GFX11-TRUE16-NEXT:    s_mov_b32 s2, -1
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(16)
-; GFX11-TRUE16-NEXT:    buffer_store_b128 v[28:31], off, s[0:3], 0 dlc
-; GFX11-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-TRUE16-NEXT:    buffer_store_b128 v[24:27], off, s[0:3], 0 dlc
-; GFX11-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-TRUE16-NEXT:    buffer_store_b128 v[20:23], off, s[0:3], 0 dlc
-; GFX11-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-TRUE16-NEXT:    buffer_store_b128 v[16:19], off, s[0:3], 0 dlc
-; GFX11-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-TRUE16-NEXT:    buffer_store_b128 v[12:15], off, s[0:3], 0 dlc
-; GFX11-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-TRUE16-NEXT:    buffer_store_b128 v[8:11], off, s[0:3], 0 dlc
-; GFX11-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-TRUE16-NEXT:    buffer_store_b128 v[4:7], off, s[0:3], 0 dlc
-; GFX11-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-TRUE16-NEXT:    buffer_store_b128 v[0:3], off, s[0:3], 0 dlc
-; GFX11-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(15)
-; GFX11-TRUE16-NEXT:    buffer_store_b8 v32, off, s[0:3], 0 dlc
-; GFX11-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(14)
-; GFX11-TRUE16-NEXT:    buffer_store_b8 v33, off, s[0:3], 0 dlc
-; GFX11-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(13)
-; GFX11-TRUE16-NEXT:    buffer_store_b8 v34, off, s[0:3], 0 dlc
-; GFX11-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(12)
-; GFX11-TRUE16-NEXT:    buffer_store_b8 v35, off, s[0:3], 0 dlc
-; GFX11-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(11)
-; GFX11-TRUE16-NEXT:    buffer_store_b8 v36, off, s[0:3], 0 dlc
-; GFX11-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(10)
-; GFX11-TRUE16-NEXT:    buffer_store_b8 v37, off, s[0:3], 0 dlc
-; GFX11-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(9)
-; GFX11-TRUE16-NEXT:    buffer_store_b8 v38, off, s[0:3], 0 dlc
-; GFX11-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(8)
-; GFX11-TRUE16-NEXT:    buffer_store_b8 v39, off, s[0:3], 0 dlc
-; GFX11-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(7)
-; GFX11-TRUE16-NEXT:    buffer_store_b8 v48, off, s[0:3], 0 dlc
-; GFX11-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(6)
-; GFX11-TRUE16-NEXT:    buffer_store_b8 v49, off, s[0:3], 0 dlc
-; GFX11-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(5)
-; GFX11-TRUE16-NEXT:    buffer_store_b8 v50, off, s[0:3], 0 dlc
-; GFX11-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(4)
-; GFX11-TRUE16-NEXT:    buffer_store_b8 v51, off, s[0:3], 0 dlc
-; GFX11-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(3)
-; GFX11-TRUE16-NEXT:    buffer_store_b8 v52, off, s[0:3], 0 dlc
-; GFX11-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(2)
-; GFX11-TRUE16-NEXT:    buffer_store_b8 v53, off, s[0:3], 0 dlc
-; GFX11-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(1)
-; GFX11-TRUE16-NEXT:    buffer_store_b8 v54, off, s[0:3], 0 dlc
-; GFX11-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    buffer_store_b8 v55, off, s[0:3], 0 dlc
-; GFX11-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-FAKE16-LABEL: void_func_v32i32_v16i8:
-; GFX11-FAKE16:       ; %bb.0:
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-FAKE16-NEXT:    s_clause 0x10
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v31, off, s32
-; GFX11-FAKE16-NEXT:    scratch_load_u8 v32, off, s32 offset:64
-; GFX11-FAKE16-NEXT:    scratch_load_u8 v33, off, s32 offset:60
-; GFX11-FAKE16-NEXT:    scratch_load_u8 v34, off, s32 offset:56
-; GFX11-FAKE16-NEXT:    scratch_load_u8 v35, off, s32 offset:52
-; GFX11-FAKE16-NEXT:    scratch_load_u8 v36, off, s32 offset:48
-; GFX11-FAKE16-NEXT:    scratch_load_u8 v37, off, s32 offset:44
-; GFX11-FAKE16-NEXT:    scratch_load_u8 v38, off, s32 offset:40
-; GFX11-FAKE16-NEXT:    scratch_load_u8 v39, off, s32 offset:36
-; GFX11-FAKE16-NEXT:    scratch_load_u8 v48, off, s32 offset:32
-; GFX11-FAKE16-NEXT:    scratch_load_u8 v49, off, s32 offset:28
-; GFX11-FAKE16-NEXT:    scratch_load_u8 v50, off, s32 offset:24
-; GFX11-FAKE16-NEXT:    scratch_load_u8 v51, off, s32 offset:20
-; GFX11-FAKE16-NEXT:    scratch_load_u8 v52, off, s32 offset:16
-; GFX11-FAKE16-NEXT:    scratch_load_u8 v53, off, s32 offset:12
-; GFX11-FAKE16-NEXT:    scratch_load_u8 v54, off, s32 offset:8
-; GFX11-FAKE16-NEXT:    scratch_load_u8 v55, off, s32 offset:4
-; GFX11-FAKE16-NEXT:    s_mov_b32 s3, 0x31016000
-; GFX11-FAKE16-NEXT:    s_mov_b32 s2, -1
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(16)
-; GFX11-FAKE16-NEXT:    buffer_store_b128 v[28:31], off, s[0:3], 0 dlc
-; GFX11-FAKE16-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-FAKE16-NEXT:    buffer_store_b128 v[24:27], off, s[0:3], 0 dlc
-; GFX11-FAKE16-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-FAKE16-NEXT:    buffer_store_b128 v[20:23], off, s[0:3], 0 dlc
-; GFX11-FAKE16-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-FAKE16-NEXT:    buffer_store_b128 v[16:19], off, s[0:3], 0 dlc
-; GFX11-FAKE16-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-FAKE16-NEXT:    buffer_store_b128 v[12:15], off, s[0:3], 0 dlc
-; GFX11-FAKE16-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-FAKE16-NEXT:    buffer_store_b128 v[8:11], off, s[0:3], 0 dlc
-; GFX11-FAKE16-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-FAKE16-NEXT:    buffer_store_b128 v[4:7], off, s[0:3], 0 dlc
-; GFX11-FAKE16-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-FAKE16-NEXT:    buffer_store_b128 v[0:3], off, s[0:3], 0 dlc
-; GFX11-FAKE16-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(15)
-; GFX11-FAKE16-NEXT:    buffer_store_b8 v32, off, s[0:3], 0 dlc
-; GFX11-FAKE16-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(14)
-; GFX11-FAKE16-NEXT:    buffer_store_b8 v33, off, s[0:3], 0 dlc
-; GFX11-FAKE16-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(13)
-; GFX11-FAKE16-NEXT:    buffer_store_b8 v34, off, s[0:3], 0 dlc
-; GFX11-FAKE16-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(12)
-; GFX11-FAKE16-NEXT:    buffer_store_b8 v35, off, s[0:3], 0 dlc
-; GFX11-FAKE16-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(11)
-; GFX11-FAKE16-NEXT:    buffer_store_b8 v36, off, s[0:3], 0 dlc
-; GFX11-FAKE16-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(10)
-; GFX11-FAKE16-NEXT:    buffer_store_b8 v37, off, s[0:3], 0 dlc
-; GFX11-FAKE16-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(9)
-; GFX11-FAKE16-NEXT:    buffer_store_b8 v38, off, s[0:3], 0 dlc
-; GFX11-FAKE16-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(8)
-; GFX11-FAKE16-NEXT:    buffer_store_b8 v39, off, s[0:3], 0 dlc
-; GFX11-FAKE16-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(7)
-; GFX11-FAKE16-NEXT:    buffer_store_b8 v48, off, s[0:3], 0 dlc
-; GFX11-FAKE16-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(6)
-; GFX11-FAKE16-NEXT:    buffer_store_b8 v49, off, s[0:3], 0 dlc
-; GFX11-FAKE16-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(5)
-; GFX11-FAKE16-NEXT:    buffer_store_b8 v50, off, s[0:3], 0 dlc
-; GFX11-FAKE16-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(4)
-; GFX11-FAKE16-NEXT:    buffer_store_b8 v51, off, s[0:3], 0 dlc
-; GFX11-FAKE16-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(3)
-; GFX11-FAKE16-NEXT:    buffer_store_b8 v52, off, s[0:3], 0 dlc
-; GFX11-FAKE16-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(2)
-; GFX11-FAKE16-NEXT:    buffer_store_b8 v53, off, s[0:3], 0 dlc
-; GFX11-FAKE16-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(1)
-; GFX11-FAKE16-NEXT:    buffer_store_b8 v54, off, s[0:3], 0 dlc
-; GFX11-FAKE16-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT:    buffer_store_b8 v55, off, s[0:3], 0 dlc
-; GFX11-FAKE16-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-LABEL: void_func_v32i32_v16i8:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_clause 0x10
+; GFX11-NEXT:    scratch_load_b32 v31, off, s32
+; GFX11-NEXT:    scratch_load_u8 v32, off, s32 offset:64
+; GFX11-NEXT:    scratch_load_u8 v33, off, s32 offset:60
+; GFX11-NEXT:    scratch_load_u8 v34, off, s32 offset:56
+; GFX11-NEXT:    scratch_load_u8 v35, off, s32 offset:52
+; GFX11-NEXT:    scratch_load_u8 v36, off, s32 offset:48
+; GFX11-NEXT:    scratch_load_u8 v37, off, s32 offset:44
+; GFX11-NEXT:    scratch_load_u8 v38, off, s32 offset:40
+; GFX11-NEXT:    scratch_load_u8 v39, off, s32 offset:36
+; GFX11-NEXT:    scratch_load_u8 v48, off, s32 offset:32
+; GFX11-NEXT:    scratch_load_u8 v49, off, s32 offset:28
+; GFX11-NEXT:    scratch_load_u8 v50, off, s32 offset:24
+; GFX11-NEXT:    scratch_load_u8 v51, off, s32 offset:20
+; GFX11-NEXT:    scratch_load_u8 v52, off, s32 offset:16
+; GFX11-NEXT:    scratch_load_u8 v53, off, s32 offset:12
+; GFX11-NEXT:    scratch_load_u8 v54, off, s32 offset:8
+; GFX11-NEXT:    scratch_load_u8 v55, off, s32 offset:4
+; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-NEXT:    s_mov_b32 s2, -1
+; GFX11-NEXT:    s_waitcnt vmcnt(16)
+; GFX11-NEXT:    buffer_store_b128 v[28:31], off, s[0:3], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    buffer_store_b128 v[24:27], off, s[0:3], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    buffer_store_b128 v[20:23], off, s[0:3], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    buffer_store_b128 v[16:19], off, s[0:3], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    buffer_store_b128 v[12:15], off, s[0:3], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    buffer_store_b128 v[8:11], off, s[0:3], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    buffer_store_b128 v[4:7], off, s[0:3], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    buffer_store_b128 v[0:3], off, s[0:3], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_waitcnt vmcnt(15)
+; GFX11-NEXT:    buffer_store_b8 v32, off, s[0:3], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_waitcnt vmcnt(14)
+; GFX11-NEXT:    buffer_store_b8 v33, off, s[0:3], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_waitcnt vmcnt(13)
+; GFX11-NEXT:    buffer_store_b8 v34, off, s[0:3], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_waitcnt vmcnt(12)
+; GFX11-NEXT:    buffer_store_b8 v35, off, s[0:3], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_waitcnt vmcnt(11)
+; GFX11-NEXT:    buffer_store_b8 v36, off, s[0:3], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_waitcnt vmcnt(10)
+; GFX11-NEXT:    buffer_store_b8 v37, off, s[0:3], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_waitcnt vmcnt(9)
+; GFX11-NEXT:    buffer_store_b8 v38, off, s[0:3], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_waitcnt vmcnt(8)
+; GFX11-NEXT:    buffer_store_b8 v39, off, s[0:3], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_waitcnt vmcnt(7)
+; GFX11-NEXT:    buffer_store_b8 v48, off, s[0:3], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_waitcnt vmcnt(6)
+; GFX11-NEXT:    buffer_store_b8 v49, off, s[0:3], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_waitcnt vmcnt(5)
+; GFX11-NEXT:    buffer_store_b8 v50, off, s[0:3], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_waitcnt vmcnt(4)
+; GFX11-NEXT:    buffer_store_b8 v51, off, s[0:3], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_waitcnt vmcnt(3)
+; GFX11-NEXT:    buffer_store_b8 v52, off, s[0:3], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_waitcnt vmcnt(2)
+; GFX11-NEXT:    buffer_store_b8 v53, off, s[0:3], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-NEXT:    buffer_store_b8 v54, off, s[0:3], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    buffer_store_b8 v55, off, s[0:3], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   store volatile <32 x i32> %arg0, ptr addrspace(1) poison
   store volatile <16 x i8> %arg1, ptr addrspace(1) poison
   ret void
diff --git a/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll b/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll
index f67ab18dd8ef1..5883e807964e4 100644
--- a/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll
+++ b/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll
@@ -706,63 +706,34 @@ define amdgpu_gfx void @test_call_external_void_func_i8_signext(i32) #0 {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-TRUE16-LABEL: test_call_external_void_func_i8_signext:
-; GFX11-TRUE16:       ; %bb.0:
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT:    s_mov_b32 s0, s33
-; GFX11-TRUE16-NEXT:    s_mov_b32 s33, s32
-; GFX11-TRUE16-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
-; GFX11-TRUE16-NEXT:    s_mov_b32 exec_lo, s1
-; GFX11-TRUE16-NEXT:    global_load_d16_i8 v0, v[0:1], off glc dlc
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v40, s0, 2
-; GFX11-TRUE16-NEXT:    s_mov_b32 s1, external_void_func_i8_signext@abs32@hi
-; GFX11-TRUE16-NEXT:    s_mov_b32 s0, external_void_func_i8_signext@abs32@lo
-; GFX11-TRUE16-NEXT:    s_add_i32 s32, s32, 16
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX11-TRUE16-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_readlane_b32 s31, v40, 1
-; GFX11-TRUE16-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX11-TRUE16-NEXT:    s_mov_b32 s32, s33
-; GFX11-TRUE16-NEXT:    v_readlane_b32 s0, v40, 2
-; GFX11-TRUE16-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
-; GFX11-TRUE16-NEXT:    s_mov_b32 exec_lo, s1
-; GFX11-TRUE16-NEXT:    s_mov_b32 s33, s0
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-FAKE16-LABEL: test_call_external_void_func_i8_signext:
-; GFX11-FAKE16:       ; %bb.0:
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-FAKE16-NEXT:    s_mov_b32 s0, s33
-; GFX11-FAKE16-NEXT:    s_mov_b32 s33, s32
-; GFX11-FAKE16-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
-; GFX11-FAKE16-NEXT:    s_mov_b32 exec_lo, s1
-; GFX11-FAKE16-NEXT:    global_load_i8 v0, v[0:1], off glc dlc
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v40, s0, 2
-; GFX11-FAKE16-NEXT:    s_mov_b32 s1, external_void_func_i8_signext@abs32@hi
-; GFX11-FAKE16-NEXT:    s_mov_b32 s0, external_void_func_i8_signext@abs32@lo
-; GFX11-FAKE16-NEXT:    s_add_i32 s32, s32, 16
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX11-FAKE16-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s31, v40, 1
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX11-FAKE16-NEXT:    s_mov_b32 s32, s33
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s0, v40, 2
-; GFX11-FAKE16-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
-; GFX11-FAKE16-NEXT:    s_mov_b32 exec_lo, s1
-; GFX11-FAKE16-NEXT:    s_mov_b32 s33, s0
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-LABEL: test_call_external_void_func_i8_signext:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_mov_b32 s0, s33
+; GFX11-NEXT:    s_mov_b32 s33, s32
+; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
+; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
+; GFX11-NEXT:    s_mov_b32 exec_lo, s1
+; GFX11-NEXT:    global_load_i8 v0, v[0:1], off glc dlc
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_writelane_b32 v40, s0, 2
+; GFX11-NEXT:    s_mov_b32 s1, external_void_func_i8_signext@abs32@hi
+; GFX11-NEXT:    s_mov_b32 s0, external_void_func_i8_signext@abs32@lo
+; GFX11-NEXT:    s_add_i32 s32, s32, 16
+; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    s_mov_b32 s32, s33
+; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
+; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
+; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
+; GFX11-NEXT:    s_mov_b32 exec_lo, s1
+; GFX11-NEXT:    s_mov_b32 s33, s0
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SCRATCH-LABEL: test_call_external_void_func_i8_signext:
 ; GFX10-SCRATCH:       ; %bb.0:
@@ -857,63 +828,34 @@ define amdgpu_gfx void @test_call_external_void_func_i8_zeroext(i32) #0 {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-TRUE16-LABEL: test_call_external_void_func_i8_zeroext:
-; GFX11-TRUE16:       ; %bb.0:
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT:    s_mov_b32 s0, s33
-; GFX11-TRUE16-NEXT:    s_mov_b32 s33, s32
-; GFX11-TRUE16-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
-; GFX11-TRUE16-NEXT:    s_mov_b32 exec_lo, s1
-; GFX11-TRUE16-NEXT:    global_load_d16_u8 v0, v[0:1], off glc dlc
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v40, s0, 2
-; GFX11-TRUE16-NEXT:    s_mov_b32 s1, external_void_func_i8_zeroext@abs32@hi
-; GFX11-TRUE16-NEXT:    s_mov_b32 s0, external_void_func_i8_zeroext@abs32@lo
-; GFX11-TRUE16-NEXT:    s_add_i32 s32, s32, 16
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX11-TRUE16-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_readlane_b32 s31, v40, 1
-; GFX11-TRUE16-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX11-TRUE16-NEXT:    s_mov_b32 s32, s33
-; GFX11-TRUE16-NEXT:    v_readlane_b32 s0, v40, 2
-; GFX11-TRUE16-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
-; GFX11-TRUE16-NEXT:    s_mov_b32 exec_lo, s1
-; GFX11-TRUE16-NEXT:    s_mov_b32 s33, s0
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-FAKE16-LABEL: test_call_external_void_func_i8_zeroext:
-; GFX11-FAKE16:       ; %bb.0:
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-FAKE16-NEXT:    s_mov_b32 s0, s33
-; GFX11-FAKE16-NEXT:    s_mov_b32 s33, s32
-; GFX11-FAKE16-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
-; GFX11-FAKE16-NEXT:    s_mov_b32 exec_lo, s1
-; GFX11-FAKE16-NEXT:    global_load_u8 v0, v[0:1], off glc dlc
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v40, s0, 2
-; GFX11-FAKE16-NEXT:    s_mov_b32 s1, external_void_func_i8_zeroext@abs32@hi
-; GFX11-FAKE16-NEXT:    s_mov_b32 s0, external_void_func_i8_zeroext@abs32@lo
-; GFX11-FAKE16-NEXT:    s_add_i32 s32, s32, 16
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX11-FAKE16-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s31, v40, 1
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX11-FAKE16-NEXT:    s_mov_b32 s32, s33
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s0, v40, 2
-; GFX11-FAKE16-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
-; GFX11-FAKE16-NEXT:    s_mov_b32 exec_lo, s1
-; GFX11-FAKE16-NEXT:    s_mov_b32 s33, s0
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-LABEL: test_call_external_void_func_i8_zeroext:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_mov_b32 s0, s33
+; GFX11-NEXT:    s_mov_b32 s33, s32
+; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
+; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
+; GFX11-NEXT:    s_mov_b32 exec_lo, s1
+; GFX11-NEXT:    global_load_u8 v0, v[0:1], off glc dlc
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_writelane_b32 v40, s0, 2
+; GFX11-NEXT:    s_mov_b32 s1, external_void_func_i8_zeroext@abs32@hi
+; GFX11-NEXT:    s_mov_b32 s0, external_void_func_i8_zeroext@abs32@lo
+; GFX11-NEXT:    s_add_i32 s32, s32, 16
+; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    s_mov_b32 s32, s33
+; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
+; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
+; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
+; GFX11-NEXT:    s_mov_b32 exec_lo, s1
+; GFX11-NEXT:    s_mov_b32 s33, s0
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SCRATCH-LABEL: test_call_external_void_func_i8_zeroext:
 ; GFX10-SCRATCH:       ; %bb.0:
@@ -1153,63 +1095,34 @@ define amdgpu_gfx void @test_call_external_void_func_i16_signext(i32) #0 {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-TRUE16-LABEL: test_call_external_void_func_i16_signext:
-; GFX11-TRUE16:       ; %bb.0:
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT:    s_mov_b32 s0, s33
-; GFX11-TRUE16-NEXT:    s_mov_b32 s33, s32
-; GFX11-TRUE16-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
-; GFX11-TRUE16-NEXT:    s_mov_b32 exec_lo, s1
-; GFX11-TRUE16-NEXT:    global_load_d16_b16 v0, v[0:1], off glc dlc
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v40, s0, 2
-; GFX11-TRUE16-NEXT:    s_mov_b32 s1, external_void_func_i16_signext@abs32@hi
-; GFX11-TRUE16-NEXT:    s_mov_b32 s0, external_void_func_i16_signext@abs32@lo
-; GFX11-TRUE16-NEXT:    s_add_i32 s32, s32, 16
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX11-TRUE16-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_readlane_b32 s31, v40, 1
-; GFX11-TRUE16-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX11-TRUE16-NEXT:    s_mov_b32 s32, s33
-; GFX11-TRUE16-NEXT:    v_readlane_b32 s0, v40, 2
-; GFX11-TRUE16-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
-; GFX11-TRUE16-NEXT:    s_mov_b32 exec_lo, s1
-; GFX11-TRUE16-NEXT:    s_mov_b32 s33, s0
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-FAKE16-LABEL: test_call_external_void_func_i16_signext:
-; GFX11-FAKE16:       ; %bb.0:
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-FAKE16-NEXT:    s_mov_b32 s0, s33
-; GFX11-FAKE16-NEXT:    s_mov_b32 s33, s32
-; GFX11-FAKE16-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
-; GFX11-FAKE16-NEXT:    s_mov_b32 exec_lo, s1
-; GFX11-FAKE16-NEXT:    global_load_u16 v0, v[0:1], off glc dlc
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v40, s0, 2
-; GFX11-FAKE16-NEXT:    s_mov_b32 s1, external_void_func_i16_signext@abs32@hi
-; GFX11-FAKE16-NEXT:    s_mov_b32 s0, external_void_func_i16_signext@abs32@lo
-; GFX11-FAKE16-NEXT:    s_add_i32 s32, s32, 16
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX11-FAKE16-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s31, v40, 1
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX11-FAKE16-NEXT:    s_mov_b32 s32, s33
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s0, v40, 2
-; GFX11-FAKE16-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
-; GFX11-FAKE16-NEXT:    s_mov_b32 exec_lo, s1
-; GFX11-FAKE16-NEXT:    s_mov_b32 s33, s0
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-LABEL: test_call_external_void_func_i16_signext:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_mov_b32 s0, s33
+; GFX11-NEXT:    s_mov_b32 s33, s32
+; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
+; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
+; GFX11-NEXT:    s_mov_b32 exec_lo, s1
+; GFX11-NEXT:    global_load_u16 v0, v[0:1], off glc dlc
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_writelane_b32 v40, s0, 2
+; GFX11-NEXT:    s_mov_b32 s1, external_void_func_i16_signext@abs32@hi
+; GFX11-NEXT:    s_mov_b32 s0, external_void_func_i16_signext@abs32@lo
+; GFX11-NEXT:    s_add_i32 s32, s32, 16
+; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    s_mov_b32 s32, s33
+; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
+; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
+; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
+; GFX11-NEXT:    s_mov_b32 exec_lo, s1
+; GFX11-NEXT:    s_mov_b32 s33, s0
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SCRATCH-LABEL: test_call_external_void_func_i16_signext:
 ; GFX10-SCRATCH:       ; %bb.0:
@@ -1304,63 +1217,34 @@ define amdgpu_gfx void @test_call_external_void_func_i16_zeroext(i32) #0 {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-TRUE16-LABEL: test_call_external_void_func_i16_zeroext:
-; GFX11-TRUE16:       ; %bb.0:
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT:    s_mov_b32 s0, s33
-; GFX11-TRUE16-NEXT:    s_mov_b32 s33, s32
-; GFX11-TRUE16-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
-; GFX11-TRUE16-NEXT:    s_mov_b32 exec_lo, s1
-; GFX11-TRUE16-NEXT:    global_load_d16_b16 v0, v[0:1], off glc dlc
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v40, s0, 2
-; GFX11-TRUE16-NEXT:    s_mov_b32 s1, external_void_func_i16_zeroext@abs32@hi
-; GFX11-TRUE16-NEXT:    s_mov_b32 s0, external_void_func_i16_zeroext@abs32@lo
-; GFX11-TRUE16-NEXT:    s_add_i32 s32, s32, 16
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX11-TRUE16-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_readlane_b32 s31, v40, 1
-; GFX11-TRUE16-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX11-TRUE16-NEXT:    s_mov_b32 s32, s33
-; GFX11-TRUE16-NEXT:    v_readlane_b32 s0, v40, 2
-; GFX11-TRUE16-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
-; GFX11-TRUE16-NEXT:    s_mov_b32 exec_lo, s1
-; GFX11-TRUE16-NEXT:    s_mov_b32 s33, s0
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-FAKE16-LABEL: test_call_external_void_func_i16_zeroext:
-; GFX11-FAKE16:       ; %bb.0:
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-FAKE16-NEXT:    s_mov_b32 s0, s33
-; GFX11-FAKE16-NEXT:    s_mov_b32 s33, s32
-; GFX11-FAKE16-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
-; GFX11-FAKE16-NEXT:    s_mov_b32 exec_lo, s1
-; GFX11-FAKE16-NEXT:    global_load_u16 v0, v[0:1], off glc dlc
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v40, s0, 2
-; GFX11-FAKE16-NEXT:    s_mov_b32 s1, external_void_func_i16_zeroext@abs32@hi
-; GFX11-FAKE16-NEXT:    s_mov_b32 s0, external_void_func_i16_zeroext@abs32@lo
-; GFX11-FAKE16-NEXT:    s_add_i32 s32, s32, 16
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX11-FAKE16-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s31, v40, 1
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX11-FAKE16-NEXT:    s_mov_b32 s32, s33
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s0, v40, 2
-; GFX11-FAKE16-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
-; GFX11-FAKE16-NEXT:    s_mov_b32 exec_lo, s1
-; GFX11-FAKE16-NEXT:    s_mov_b32 s33, s0
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-LABEL: test_call_external_void_func_i16_zeroext:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_mov_b32 s0, s33
+; GFX11-NEXT:    s_mov_b32 s33, s32
+; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
+; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
+; GFX11-NEXT:    s_mov_b32 exec_lo, s1
+; GFX11-NEXT:    global_load_u16 v0, v[0:1], off glc dlc
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_writelane_b32 v40, s0, 2
+; GFX11-NEXT:    s_mov_b32 s1, external_void_func_i16_zeroext@abs32@hi
+; GFX11-NEXT:    s_mov_b32 s0, external_void_func_i16_zeroext@abs32@lo
+; GFX11-NEXT:    s_add_i32 s32, s32, 16
+; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    s_mov_b32 s32, s33
+; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
+; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
+; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
+; GFX11-NEXT:    s_mov_b32 exec_lo, s1
+; GFX11-NEXT:    s_mov_b32 s33, s0
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SCRATCH-LABEL: test_call_external_void_func_i16_zeroext:
 ; GFX10-SCRATCH:       ; %bb.0:
@@ -3245,71 +3129,38 @@ define amdgpu_gfx void @test_call_external_void_func_v2i8() #0 {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-TRUE16-LABEL: test_call_external_void_func_v2i8:
-; GFX11-TRUE16:       ; %bb.0:
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT:    s_mov_b32 s0, s33
-; GFX11-TRUE16-NEXT:    s_mov_b32 s33, s32
-; GFX11-TRUE16-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
-; GFX11-TRUE16-NEXT:    s_mov_b32 exec_lo, s1
-; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v0, 0
-; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v1, 0
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v40, s0, 2
-; GFX11-TRUE16-NEXT:    s_mov_b32 s1, external_void_func_v2i8@abs32@hi
-; GFX11-TRUE16-NEXT:    s_mov_b32 s0, external_void_func_v2i8@abs32@lo
-; GFX11-TRUE16-NEXT:    s_add_i32 s32, s32, 16
-; GFX11-TRUE16-NEXT:    global_load_d16_b16 v0, v[0:1], off
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xffff, v0
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
-; GFX11-TRUE16-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX11-TRUE16-NEXT:    v_readlane_b32 s31, v40, 1
-; GFX11-TRUE16-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX11-TRUE16-NEXT:    s_mov_b32 s32, s33
-; GFX11-TRUE16-NEXT:    v_readlane_b32 s0, v40, 2
-; GFX11-TRUE16-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
-; GFX11-TRUE16-NEXT:    s_mov_b32 exec_lo, s1
-; GFX11-TRUE16-NEXT:    s_mov_b32 s33, s0
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-FAKE16-LABEL: test_call_external_void_func_v2i8:
-; GFX11-FAKE16:       ; %bb.0:
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-FAKE16-NEXT:    s_mov_b32 s0, s33
-; GFX11-FAKE16-NEXT:    s_mov_b32 s33, s32
-; GFX11-FAKE16-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
-; GFX11-FAKE16-NEXT:    s_mov_b32 exec_lo, s1
-; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v0, 0
-; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v1, 0
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v40, s0, 2
-; GFX11-FAKE16-NEXT:    s_mov_b32 s1, external_void_func_v2i8@abs32@hi
-; GFX11-FAKE16-NEXT:    s_mov_b32 s0, external_void_func_v2i8@abs32@lo
-; GFX11-FAKE16-NEXT:    s_add_i32 s32, s32, 16
-; GFX11-FAKE16-NEXT:    global_load_u16 v0, v[0:1], off
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xffff, v0
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
-; GFX11-FAKE16-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s31, v40, 1
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX11-FAKE16-NEXT:    s_mov_b32 s32, s33
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s0, v40, 2
-; GFX11-FAKE16-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
-; GFX11-FAKE16-NEXT:    s_mov_b32 exec_lo, s1
-; GFX11-FAKE16-NEXT:    s_mov_b32 s33, s0
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-LABEL: test_call_external_void_func_v2i8:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_mov_b32 s0, s33
+; GFX11-NEXT:    s_mov_b32 s33, s32
+; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
+; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
+; GFX11-NEXT:    s_mov_b32 exec_lo, s1
+; GFX11-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-NEXT:    v_mov_b32_e32 v1, 0
+; GFX11-NEXT:    v_writelane_b32 v40, s0, 2
+; GFX11-NEXT:    s_mov_b32 s1, external_void_func_v2i8@abs32@hi
+; GFX11-NEXT:    s_mov_b32 s0, external_void_func_v2i8@abs32@lo
+; GFX11-NEXT:    s_add_i32 s32, s32, 16
+; GFX11-NEXT:    global_load_u16 v0, v[0:1], off
+; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
+; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    s_mov_b32 s32, s33
+; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
+; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
+; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
+; GFX11-NEXT:    s_mov_b32 exec_lo, s1
+; GFX11-NEXT:    s_mov_b32 s33, s0
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v2i8:
 ; GFX10-SCRATCH:       ; %bb.0:
@@ -4297,77 +4148,41 @@ define amdgpu_gfx void @test_call_external_void_func_i8_ret() #0 {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-TRUE16-LABEL: test_call_external_void_func_i8_ret:
-; GFX11-TRUE16:       ; %bb.0:
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT:    s_mov_b32 s0, s33
-; GFX11-TRUE16-NEXT:    s_mov_b32 s33, s32
-; GFX11-TRUE16-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v42, s33 offset:8 ; 4-byte Folded Spill
-; GFX11-TRUE16-NEXT:    s_mov_b32 exec_lo, s1
-; GFX11-TRUE16-NEXT:    s_clause 0x1
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v40, s33 offset:4
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v41, s33
-; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v40, 0
-; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v41, 0
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v42, s0, 2
-; GFX11-TRUE16-NEXT:    s_mov_b32 s1, external_void_func_i8_ret@abs32@hi
-; GFX11-TRUE16-NEXT:    s_mov_b32 s0, external_void_func_i8_ret@abs32@lo
-; GFX11-TRUE16-NEXT:    s_add_i32 s32, s32, 16
-; GFX11-TRUE16-NEXT:    global_load_d16_u8 v0, v[40:41], off
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v42, s30, 0
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v42, s31, 1
-; GFX11-TRUE16-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX11-TRUE16-NEXT:    global_store_b8 v[40:41], v0, off
-; GFX11-TRUE16-NEXT:    s_clause 0x1
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v41, off, s33
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v40, off, s33 offset:4
-; GFX11-TRUE16-NEXT:    v_readlane_b32 s31, v42, 1
-; GFX11-TRUE16-NEXT:    v_readlane_b32 s30, v42, 0
-; GFX11-TRUE16-NEXT:    s_mov_b32 s32, s33
-; GFX11-TRUE16-NEXT:    v_readlane_b32 s0, v42, 2
-; GFX11-TRUE16-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v42, off, s33 offset:8 ; 4-byte Folded Reload
-; GFX11-TRUE16-NEXT:    s_mov_b32 exec_lo, s1
-; GFX11-TRUE16-NEXT:    s_mov_b32 s33, s0
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-FAKE16-LABEL: test_call_external_void_func_i8_ret:
-; GFX11-FAKE16:       ; %bb.0:
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-FAKE16-NEXT:    s_mov_b32 s0, s33
-; GFX11-FAKE16-NEXT:    s_mov_b32 s33, s32
-; GFX11-FAKE16-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v42, s33 offset:8 ; 4-byte Folded Spill
-; GFX11-FAKE16-NEXT:    s_mov_b32 exec_lo, s1
-; GFX11-FAKE16-NEXT:    s_clause 0x1
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v40, s33 offset:4
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v41, s33
-; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v40, 0
-; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v41, 0
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v42, s0, 2
-; GFX11-FAKE16-NEXT:    s_mov_b32 s1, external_void_func_i8_ret@abs32@hi
-; GFX11-FAKE16-NEXT:    s_mov_b32 s0, external_void_func_i8_ret@abs32@lo
-; GFX11-FAKE16-NEXT:    s_add_i32 s32, s32, 16
-; GFX11-FAKE16-NEXT:    global_load_u8 v0, v[40:41], off
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v42, s30, 0
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v42, s31, 1
-; GFX11-FAKE16-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX11-FAKE16-NEXT:    global_store_b8 v[40:41], v0, off
-; GFX11-FAKE16-NEXT:    s_clause 0x1
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v41, off, s33
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v40, off, s33 offset:4
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s31, v42, 1
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s30, v42, 0
-; GFX11-FAKE16-NEXT:    s_mov_b32 s32, s33
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s0, v42, 2
-; GFX11-FAKE16-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v42, off, s33 offset:8 ; 4-byte Folded Reload
-; GFX11-FAKE16-NEXT:    s_mov_b32 exec_lo, s1
-; GFX11-FAKE16-NEXT:    s_mov_b32 s33, s0
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-LABEL: test_call_external_void_func_i8_ret:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_mov_b32 s0, s33
+; GFX11-NEXT:    s_mov_b32 s33, s32
+; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
+; GFX11-NEXT:    scratch_store_b32 off, v42, s33 offset:8 ; 4-byte Folded Spill
+; GFX11-NEXT:    s_mov_b32 exec_lo, s1
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_store_b32 off, v40, s33 offset:4
+; GFX11-NEXT:    scratch_store_b32 off, v41, s33
+; GFX11-NEXT:    v_mov_b32_e32 v40, 0
+; GFX11-NEXT:    v_mov_b32_e32 v41, 0
+; GFX11-NEXT:    v_writelane_b32 v42, s0, 2
+; GFX11-NEXT:    s_mov_b32 s1, external_void_func_i8_ret@abs32@hi
+; GFX11-NEXT:    s_mov_b32 s0, external_void_func_i8_ret@abs32@lo
+; GFX11-NEXT:    s_add_i32 s32, s32, 16
+; GFX11-NEXT:    global_load_u8 v0, v[40:41], off
+; GFX11-NEXT:    v_writelane_b32 v42, s30, 0
+; GFX11-NEXT:    v_writelane_b32 v42, s31, 1
+; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; GFX11-NEXT:    global_store_b8 v[40:41], v0, off
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_load_b32 v41, off, s33
+; GFX11-NEXT:    scratch_load_b32 v40, off, s33 offset:4
+; GFX11-NEXT:    v_readlane_b32 s31, v42, 1
+; GFX11-NEXT:    v_readlane_b32 s30, v42, 0
+; GFX11-NEXT:    s_mov_b32 s32, s33
+; GFX11-NEXT:    v_readlane_b32 s0, v42, 2
+; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
+; GFX11-NEXT:    scratch_load_b32 v42, off, s33 offset:8 ; 4-byte Folded Reload
+; GFX11-NEXT:    s_mov_b32 exec_lo, s1
+; GFX11-NEXT:    s_mov_b32 s33, s0
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SCRATCH-LABEL: test_call_external_void_func_i8_ret:
 ; GFX10-SCRATCH:       ; %bb.0:
@@ -4512,7 +4327,7 @@ define amdgpu_gfx void @test_call_external_void_func_v2i8_ret() #0 {
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s1, external_void_func_v2i8_ret@abs32@hi
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s0, external_void_func_v2i8_ret@abs32@lo
 ; GFX11-TRUE16-NEXT:    s_add_i32 s32, s32, 16
-; GFX11-TRUE16-NEXT:    global_load_d16_b16 v0, v[40:41], off
+; GFX11-TRUE16-NEXT:    global_load_u16 v0, v[40:41], off
 ; GFX11-TRUE16-NEXT:    v_writelane_b32 v42, s30, 0
 ; GFX11-TRUE16-NEXT:    v_writelane_b32 v42, s31, 1
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
@@ -9157,71 +8972,38 @@ define amdgpu_gfx void @test_call_external_void_func_struct_i8_i32() #0 {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-TRUE16-LABEL: test_call_external_void_func_struct_i8_i32:
-; GFX11-TRUE16:       ; %bb.0:
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT:    s_mov_b32 s0, s33
-; GFX11-TRUE16-NEXT:    s_mov_b32 s33, s32
-; GFX11-TRUE16-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
-; GFX11-TRUE16-NEXT:    s_mov_b32 exec_lo, s1
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v40, s0, 2
-; GFX11-TRUE16-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
-; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v1, 0
-; GFX11-TRUE16-NEXT:    s_add_i32 s32, s32, 16
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT:    s_clause 0x1
-; GFX11-TRUE16-NEXT:    global_load_d16_u8 v0, v1, s[0:1]
-; GFX11-TRUE16-NEXT:    global_load_b32 v1, v1, s[0:1] offset:4
-; GFX11-TRUE16-NEXT:    s_mov_b32 s1, external_void_func_struct_i8_i32@abs32@hi
-; GFX11-TRUE16-NEXT:    s_mov_b32 s0, external_void_func_struct_i8_i32@abs32@lo
-; GFX11-TRUE16-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX11-TRUE16-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_readlane_b32 s31, v40, 1
-; GFX11-TRUE16-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX11-TRUE16-NEXT:    s_mov_b32 s32, s33
-; GFX11-TRUE16-NEXT:    v_readlane_b32 s0, v40, 2
-; GFX11-TRUE16-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
-; GFX11-TRUE16-NEXT:    s_mov_b32 exec_lo, s1
-; GFX11-TRUE16-NEXT:    s_mov_b32 s33, s0
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-FAKE16-LABEL: test_call_external_void_func_struct_i8_i32:
-; GFX11-FAKE16:       ; %bb.0:
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-FAKE16-NEXT:    s_mov_b32 s0, s33
-; GFX11-FAKE16-NEXT:    s_mov_b32 s33, s32
-; GFX11-FAKE16-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
-; GFX11-FAKE16-NEXT:    s_mov_b32 exec_lo, s1
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v40, s0, 2
-; GFX11-FAKE16-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
-; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v1, 0
-; GFX11-FAKE16-NEXT:    s_add_i32 s32, s32, 16
-; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-FAKE16-NEXT:    s_clause 0x1
-; GFX11-FAKE16-NEXT:    global_load_u8 v0, v1, s[0:1]
-; GFX11-FAKE16-NEXT:    global_load_b32 v1, v1, s[0:1] offset:4
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX11-FAKE16-NEXT:    s_mov_b32 s1, external_void_func_struct_i8_i32@abs32@hi
-; GFX11-FAKE16-NEXT:    s_mov_b32 s0, external_void_func_struct_i8_i32@abs32@lo
-; GFX11-FAKE16-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX11-FAKE16-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s31, v40, 1
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX11-FAKE16-NEXT:    s_mov_b32 s32, s33
-; GFX11-FAKE16-NEXT:    v_readlane_b32 s0, v40, 2
-; GFX11-FAKE16-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-FAKE16-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
-; GFX11-FAKE16-NEXT:    s_mov_b32 exec_lo, s1
-; GFX11-FAKE16-NEXT:    s_mov_b32 s33, s0
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-LABEL: test_call_external_void_func_struct_i8_i32:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_mov_b32 s0, s33
+; GFX11-NEXT:    s_mov_b32 s33, s32
+; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
+; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
+; GFX11-NEXT:    s_mov_b32 exec_lo, s1
+; GFX11-NEXT:    v_writelane_b32 v40, s0, 2
+; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-NEXT:    v_mov_b32_e32 v1, 0
+; GFX11-NEXT:    s_add_i32 s32, s32, 16
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    global_load_u8 v0, v1, s[0:1]
+; GFX11-NEXT:    global_load_b32 v1, v1, s[0:1] offset:4
+; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX11-NEXT:    s_mov_b32 s1, external_void_func_struct_i8_i32@abs32@hi
+; GFX11-NEXT:    s_mov_b32 s0, external_void_func_struct_i8_i32@abs32@lo
+; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    s_mov_b32 s32, s33
+; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
+; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
+; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
+; GFX11-NEXT:    s_mov_b32 exec_lo, s1
+; GFX11-NEXT:    s_mov_b32 s33, s0
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SCRATCH-LABEL: test_call_external_void_func_struct_i8_i32:
 ; GFX10-SCRATCH:       ; %bb.0:
@@ -9540,7 +9322,7 @@ define amdgpu_gfx void @test_call_external_void_func_sret_struct_i8_i32_byval_st
 ; GFX11-TRUE16-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-TRUE16-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-TRUE16-NEXT:    s_clause 0x1
-; GFX11-TRUE16-NEXT:    scratch_load_d16_u8 v0, off, s33 offset:8
+; GFX11-TRUE16-NEXT:    scratch_load_u8 v0, off, s33 offset:8
 ; GFX11-TRUE16-NEXT:    scratch_load_b32 v1, off, s33 offset:12
 ; GFX11-TRUE16-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-TRUE16-NEXT:    v_readlane_b32 s30, v40, 0
diff --git a/llvm/test/CodeGen/AMDGPU/gfx11-user-sgpr-init16-bug.ll b/llvm/test/CodeGen/AMDGPU/gfx11-user-sgpr-init16-bug.ll
index 63376def3d7e1..b361b85cfbd4a 100644
--- a/llvm/test/CodeGen/AMDGPU/gfx11-user-sgpr-init16-bug.ll
+++ b/llvm/test/CodeGen/AMDGPU/gfx11-user-sgpr-init16-bug.ll
@@ -87,7 +87,7 @@ define amdgpu_kernel void @minimal_kernel_inputs_with_stack() #0 {
 }
 
 ; GCN-LABEL: {{^}}queue_ptr:
-; WORKAROUND-TRUE16-SDAG: global_load_d16_u8
+; WORKAROUND-TRUE16-SDAG: global_load_u8
 ; WORKAROUND-FAKE16: global_load_u8 v{{[0-9]+}},
 
 ; WORKAROUND: v_mov_b32_e32 [[V:v[0-9]+]], s15
@@ -129,9 +129,9 @@ define amdgpu_kernel void @queue_ptr() #1 {
 ; NOWORKAROUND: v_mov_b32_e32 [[V_Y:v[0-9]+]], s9
 ; NOWORKAROUND: v_mov_b32_e32 [[V_Z:v[0-9]+]], s10
 
-; WORKAROUND-TRUE16-SDAG: global_load_d16_u8 v{{[0-9]+}}, v{{[0-9]+}}, s[0:1]
-; WORKAROUND-TRUE16-SDAG: global_load_d16_u8 v{{[0-9]+}},
-; WORKAROUND-TRUE16-SDAG: global_load_d16_u8 v{{[0-9]+}}, v{{[0-9]+}}, s[4:5]
+; WORKAROUND-TRUE16-SDAG: global_load_u8 v{{[0-9]+}}, v{{[0-9]+}}, s[0:1]
+; WORKAROUND-TRUE16-SDAG: global_load_u8 v{{[0-9]+}},
+; WORKAROUND-TRUE16-SDAG: global_load_u8 v{{[0-9]+}}, v{{[0-9]+}}, s[4:5]
 
 ; WORKAROUND-FAKE16: global_load_u8 v{{[0-9]+}}, v{{[0-9]+}}, s[0:1]
 ; WORKAROUND-FAKE16: global_load_u8 v{{[0-9]+}},
diff --git a/llvm/test/CodeGen/AMDGPU/global-extload-gfx11plus.ll b/llvm/test/CodeGen/AMDGPU/global-extload-gfx11plus.ll
index f92ba7a8978b9..e36ee94ad7cd8 100644
--- a/llvm/test/CodeGen/AMDGPU/global-extload-gfx11plus.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-extload-gfx11plus.ll
@@ -8,7 +8,7 @@ define amdgpu_kernel void @zextload_global_i8_to_i16(ptr addrspace(1) %out, ptr
 ; GFX11-REAL16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX11-REAL16-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX11-REAL16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-REAL16-NEXT:    global_load_d16_u8 v0, v1, s[2:3]
+; GFX11-REAL16-NEXT:    global_load_u8 v0, v1, s[2:3]
 ; GFX11-REAL16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-REAL16-NEXT:    global_store_b16 v1, v0, s[0:1]
 ; GFX11-REAL16-NEXT:    s_endpgm
@@ -34,7 +34,7 @@ define amdgpu_kernel void @sextload_global_i8_to_i16(ptr addrspace(1) %out, ptr
 ; GFX11-REAL16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX11-REAL16-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX11-REAL16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-REAL16-NEXT:    global_load_d16_i8 v0, v1, s[2:3]
+; GFX11-REAL16-NEXT:    global_load_i8 v0, v1, s[2:3]
 ; GFX11-REAL16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-REAL16-NEXT:    global_store_b16 v1, v0, s[0:1]
 ; GFX11-REAL16-NEXT:    s_endpgm
@@ -55,27 +55,16 @@ define amdgpu_kernel void @sextload_global_i8_to_i16(ptr addrspace(1) %out, ptr
 }
 
 define amdgpu_kernel void @zextload_global_i8_to_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind {
-; GFX11-REAL16-LABEL: zextload_global_i8_to_i64:
-; GFX11-REAL16:       ; %bb.0:
-; GFX11-REAL16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-REAL16-NEXT:    v_mov_b32_e32 v1, 0
-; GFX11-REAL16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-REAL16-NEXT:    global_load_d16_u8 v0, v1, s[2:3]
-; GFX11-REAL16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-REAL16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-REAL16-NEXT:    global_store_b64 v1, v[0:1], s[0:1]
-; GFX11-REAL16-NEXT:    s_endpgm
-;
-; GFX11-FAKE16-LABEL: zextload_global_i8_to_i64:
-; GFX11-FAKE16:       ; %bb.0:
-; GFX11-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v1, 0
-; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-FAKE16-NEXT:    global_load_u8 v0, v1, s[2:3]
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT:    global_store_b64 v1, v[0:1], s[0:1]
-; GFX11-FAKE16-NEXT:    s_endpgm
+; GFX11-LABEL: zextload_global_i8_to_i64:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT:    v_mov_b32_e32 v1, 0
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    global_load_u8 v0, v1, s[2:3]
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    global_store_b64 v1, v[0:1], s[0:1]
+; GFX11-NEXT:    s_endpgm
   %a = load i8, ptr addrspace(1) %in
   %ext = zext i8 %a to i64
   store i64 %ext, ptr addrspace(1) %out
@@ -83,31 +72,18 @@ define amdgpu_kernel void @zextload_global_i8_to_i64(ptr addrspace(1) %out, ptr
 }
 
 define amdgpu_kernel void @sextload_global_i8_to_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind {
-; GFX11-REAL16-LABEL: sextload_global_i8_to_i64:
-; GFX11-REAL16:       ; %bb.0:
-; GFX11-REAL16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-REAL16-NEXT:    v_mov_b32_e32 v2, 0
-; GFX11-REAL16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-REAL16-NEXT:    global_load_d16_i8 v0, v2, s[2:3]
-; GFX11-REAL16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-REAL16-NEXT:    v_bfe_i32 v0, v0, 0, 16
-; GFX11-REAL16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-REAL16-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
-; GFX11-REAL16-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-REAL16-NEXT:    s_endpgm
-;
-; GFX11-FAKE16-LABEL: sextload_global_i8_to_i64:
-; GFX11-FAKE16:       ; %bb.0:
-; GFX11-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v2, 0
-; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-FAKE16-NEXT:    global_load_i8 v0, v2, s[2:3]
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT:    v_bfe_i32 v0, v0, 0, 16
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
-; GFX11-FAKE16-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-FAKE16-NEXT:    s_endpgm
+; GFX11-LABEL: sextload_global_i8_to_i64:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    global_load_i8 v0, v2, s[2:3]
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_bfe_i32 v0, v0, 0, 16
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
+; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT:    s_endpgm
   %a = load i8, ptr addrspace(1) %in
   %ext = sext i8 %a to i64
   store i64 %ext, ptr addrspace(1) %out
@@ -147,27 +123,16 @@ define amdgpu_kernel void @sextload_global_i16_to_i32(ptr addrspace(1) %out, ptr
 }
 
 define amdgpu_kernel void @zextload_global_i16_to_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind {
-; GFX11-REAL16-LABEL: zextload_global_i16_to_i64:
-; GFX11-REAL16:       ; %bb.0:
-; GFX11-REAL16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-REAL16-NEXT:    v_mov_b32_e32 v1, 0
-; GFX11-REAL16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-REAL16-NEXT:    global_load_d16_b16 v0, v1, s[2:3]
-; GFX11-REAL16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-REAL16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-REAL16-NEXT:    global_store_b64 v1, v[0:1], s[0:1]
-; GFX11-REAL16-NEXT:    s_endpgm
-;
-; GFX11-FAKE16-LABEL: zextload_global_i16_to_i64:
-; GFX11-FAKE16:       ; %bb.0:
-; GFX11-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v1, 0
-; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-FAKE16-NEXT:    global_load_u16 v0, v1, s[2:3]
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT:    global_store_b64 v1, v[0:1], s[0:1]
-; GFX11-FAKE16-NEXT:    s_endpgm
+; GFX11-LABEL: zextload_global_i16_to_i64:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT:    v_mov_b32_e32 v1, 0
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    global_load_u16 v0, v1, s[2:3]
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    global_store_b64 v1, v[0:1], s[0:1]
+; GFX11-NEXT:    s_endpgm
   %a = load i16, ptr addrspace(1) %in
   %ext = zext i16 %a to i64
   store i64 %ext, ptr addrspace(1) %out
@@ -175,31 +140,18 @@ define amdgpu_kernel void @zextload_global_i16_to_i64(ptr addrspace(1) %out, ptr
 }
 
 define amdgpu_kernel void @sextload_global_i16_to_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind {
-; GFX11-REAL16-LABEL: sextload_global_i16_to_i64:
-; GFX11-REAL16:       ; %bb.0:
-; GFX11-REAL16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-REAL16-NEXT:    v_mov_b32_e32 v2, 0
-; GFX11-REAL16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-REAL16-NEXT:    global_load_d16_b16 v0, v2, s[2:3]
-; GFX11-REAL16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-REAL16-NEXT:    v_bfe_i32 v0, v0, 0, 16
-; GFX11-REAL16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-REAL16-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
-; GFX11-REAL16-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-REAL16-NEXT:    s_endpgm
-;
-; GFX11-FAKE16-LABEL: sextload_global_i16_to_i64:
-; GFX11-FAKE16:       ; %bb.0:
-; GFX11-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v2, 0
-; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-FAKE16-NEXT:    global_load_u16 v0, v2, s[2:3]
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT:    v_bfe_i32 v0, v0, 0, 16
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
-; GFX11-FAKE16-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-FAKE16-NEXT:    s_endpgm
+; GFX11-LABEL: sextload_global_i16_to_i64:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    global_load_u16 v0, v2, s[2:3]
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_bfe_i32 v0, v0, 0, 16
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
+; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT:    s_endpgm
   %a = load i16, ptr addrspace(1) %in
   %ext = sext i16 %a to i64
   store i64 %ext, ptr addrspace(1) %out
diff --git a/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll b/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll
index 1602e31d6147c..329e1c3831525 100644
--- a/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll
@@ -5,7 +5,8 @@
 ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -mattr=-real-true16 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
 ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 -mattr=+real-true16 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX12,GFX12-SDAG,GFX12-SDAG-TRUE16 %s
 ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 -mattr=-real-true16 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX12,GFX12-SDAG,GFX12-SDAG-FAKE16 %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 -mattr=+real-true16 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL,GFX12-GISEL-TRUE16 %s
+; FIXME-TRUE16 enable gisel
+; XUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 -mattr=+real-true16 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL,GFX12-GISEL-TRUE16 %s
 ; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 -mattr=-real-true16 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL,GFX12-GISEL-FAKE16 %s
 
 ; Test using saddr addressing mode of global_*load_* flat instructions.
@@ -2290,17 +2291,11 @@ define amdgpu_ps half @global_load_saddr_i16(ptr addrspace(1) inreg %sbase, i32
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    ; return to shader part epilog
 ;
-; GFX11-TRUE16-LABEL: global_load_saddr_i16:
-; GFX11-TRUE16:       ; %bb.0:
-; GFX11-TRUE16-NEXT:    global_load_d16_b16 v0, v0, s[2:3]
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    ; return to shader part epilog
-;
-; GFX11-FAKE16-LABEL: global_load_saddr_i16:
-; GFX11-FAKE16:       ; %bb.0:
-; GFX11-FAKE16-NEXT:    global_load_u16 v0, v0, s[2:3]
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT:    ; return to shader part epilog
+; GFX11-LABEL: global_load_saddr_i16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    global_load_u16 v0, v0, s[2:3]
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    ; return to shader part epilog
 ;
 ; GFX12-SDAG-TRUE16-LABEL: global_load_saddr_i16:
 ; GFX12-SDAG-TRUE16:       ; %bb.0:
@@ -2333,17 +2328,11 @@ define amdgpu_ps half @global_load_saddr_i16_immneg128(ptr addrspace(1) inreg %s
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    ; return to shader part epilog
 ;
-; GFX11-TRUE16-LABEL: global_load_saddr_i16_immneg128:
-; GFX11-TRUE16:       ; %bb.0:
-; GFX11-TRUE16-NEXT:    global_load_d16_b16 v0, v0, s[2:3] offset:-128
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    ; return to shader part epilog
-;
-; GFX11-FAKE16-LABEL: global_load_saddr_i16_immneg128:
-; GFX11-FAKE16:       ; %bb.0:
-; GFX11-FAKE16-NEXT:    global_load_u16 v0, v0, s[2:3] offset:-128
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT:    ; return to shader part epilog
+; GFX11-LABEL: global_load_saddr_i16_immneg128:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    global_load_u16 v0, v0, s[2:3] offset:-128
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    ; return to shader part epilog
 ;
 ; GFX12-SDAG-TRUE16-LABEL: global_load_saddr_i16_immneg128:
 ; GFX12-SDAG-TRUE16:       ; %bb.0:
@@ -2377,17 +2366,11 @@ define amdgpu_ps half @global_load_saddr_f16(ptr addrspace(1) inreg %sbase, i32
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    ; return to shader part epilog
 ;
-; GFX11-TRUE16-LABEL: global_load_saddr_f16:
-; GFX11-TRUE16:       ; %bb.0:
-; GFX11-TRUE16-NEXT:    global_load_d16_b16 v0, v0, s[2:3]
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    ; return to shader part epilog
-;
-; GFX11-FAKE16-LABEL: global_load_saddr_f16:
-; GFX11-FAKE16:       ; %bb.0:
-; GFX11-FAKE16-NEXT:    global_load_u16 v0, v0, s[2:3]
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT:    ; return to shader part epilog
+; GFX11-LABEL: global_load_saddr_f16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    global_load_u16 v0, v0, s[2:3]
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    ; return to shader part epilog
 ;
 ; GFX12-SDAG-TRUE16-LABEL: global_load_saddr_f16:
 ; GFX12-SDAG-TRUE16:       ; %bb.0:
@@ -2419,17 +2402,11 @@ define amdgpu_ps half @global_load_saddr_f16_immneg128(ptr addrspace(1) inreg %s
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    ; return to shader part epilog
 ;
-; GFX11-TRUE16-LABEL: global_load_saddr_f16_immneg128:
-; GFX11-TRUE16:       ; %bb.0:
-; GFX11-TRUE16-NEXT:    global_load_d16_b16 v0, v0, s[2:3] offset:-128
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    ; return to shader part epilog
-;
-; GFX11-FAKE16-LABEL: global_load_saddr_f16_immneg128:
-; GFX11-FAKE16:       ; %bb.0:
-; GFX11-FAKE16-NEXT:    global_load_u16 v0, v0, s[2:3] offset:-128
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT:    ; return to shader part epilog
+; GFX11-LABEL: global_load_saddr_f16_immneg128:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    global_load_u16 v0, v0, s[2:3] offset:-128
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    ; return to shader part epilog
 ;
 ; GFX12-SDAG-TRUE16-LABEL: global_load_saddr_f16_immneg128:
 ; GFX12-SDAG-TRUE16:       ; %bb.0:
@@ -3919,17 +3896,16 @@ define amdgpu_ps <2 x half> @global_load_saddr_i16_d16lo_undef_hi(ptr addrspace(
 ; GFX12-SDAG-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-SDAG-NEXT:    ; return to shader part epilog
 ;
+; GFX12-GISEL-LABEL: global_load_saddr_i16_d16lo_undef_hi:
+; GFX12-GISEL:       ; %bb.0:
+; GFX12-GISEL-NEXT:    global_load_u16 v0, v0, s[2:3]
+; GFX12-GISEL-NEXT:    s_wait_loadcnt 0x0
+; GFX12-GISEL-NEXT:    ; return to shader part epilog
 ; GFX12-GISEL-TRUE16-LABEL: global_load_saddr_i16_d16lo_undef_hi:
 ; GFX12-GISEL-TRUE16:       ; %bb.0:
 ; GFX12-GISEL-TRUE16-NEXT:    global_load_d16_b16 v0, v0, s[2:3]
 ; GFX12-GISEL-TRUE16-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-GISEL-TRUE16-NEXT:    ; return to shader part epilog
-;
-; GFX12-GISEL-FAKE16-LABEL: global_load_saddr_i16_d16lo_undef_hi:
-; GFX12-GISEL-FAKE16:       ; %bb.0:
-; GFX12-GISEL-FAKE16-NEXT:    global_load_u16 v0, v0, s[2:3]
-; GFX12-GISEL-FAKE16-NEXT:    s_wait_loadcnt 0x0
-; GFX12-GISEL-FAKE16-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
   %load = load i16, ptr addrspace(1) %gep0
@@ -3957,17 +3933,16 @@ define amdgpu_ps <2 x half> @global_load_saddr_i16_d16lo_undef_hi_immneg128(ptr
 ; GFX12-SDAG-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-SDAG-NEXT:    ; return to shader part epilog
 ;
+; GFX12-GISEL-LABEL: global_load_saddr_i16_d16lo_undef_hi_immneg128:
+; GFX12-GISEL:       ; %bb.0:
+; GFX12-GISEL-NEXT:    global_load_u16 v0, v0, s[2:3] offset:-128
+; GFX12-GISEL-NEXT:    s_wait_loadcnt 0x0
+; GFX12-GISEL-NEXT:    ; return to shader part epilog
 ; GFX12-GISEL-TRUE16-LABEL: global_load_saddr_i16_d16lo_undef_hi_immneg128:
 ; GFX12-GISEL-TRUE16:       ; %bb.0:
 ; GFX12-GISEL-TRUE16-NEXT:    global_load_d16_b16 v0, v0, s[2:3] offset:-128
 ; GFX12-GISEL-TRUE16-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-GISEL-TRUE16-NEXT:    ; return to shader part epilog
-;
-; GFX12-GISEL-FAKE16-LABEL: global_load_saddr_i16_d16lo_undef_hi_immneg128:
-; GFX12-GISEL-FAKE16:       ; %bb.0:
-; GFX12-GISEL-FAKE16-NEXT:    global_load_u16 v0, v0, s[2:3] offset:-128
-; GFX12-GISEL-FAKE16-NEXT:    s_wait_loadcnt 0x0
-; GFX12-GISEL-FAKE16-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
   %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
@@ -4002,6 +3977,12 @@ define amdgpu_ps <2 x half> @global_load_saddr_i16_d16lo_zero_hi(ptr addrspace(1
 ; GFX12-SDAG-NEXT:    v_mov_b32_e32 v0, v1
 ; GFX12-SDAG-NEXT:    ; return to shader part epilog
 ;
+; GFX12-GISEL-LABEL: global_load_saddr_i16_d16lo_zero_hi:
+; GFX12-GISEL:       ; %bb.0:
+; GFX12-GISEL-NEXT:    global_load_u16 v0, v0, s[2:3]
+; GFX12-GISEL-NEXT:    s_wait_loadcnt 0x0
+; GFX12-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX12-GISEL-NEXT:    ; return to shader part epilog
 ; GFX12-GISEL-TRUE16-LABEL: global_load_saddr_i16_d16lo_zero_hi:
 ; GFX12-GISEL-TRUE16:       ; %bb.0:
 ; GFX12-GISEL-TRUE16-NEXT:    global_load_d16_b16 v0, v0, s[2:3]
@@ -4010,13 +3991,6 @@ define amdgpu_ps <2 x half> @global_load_saddr_i16_d16lo_zero_hi(ptr addrspace(1
 ; GFX12-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-GISEL-TRUE16-NEXT:    v_lshl_or_b32 v0, 0, 16, v0
 ; GFX12-GISEL-TRUE16-NEXT:    ; return to shader part epilog
-;
-; GFX12-GISEL-FAKE16-LABEL: global_load_saddr_i16_d16lo_zero_hi:
-; GFX12-GISEL-FAKE16:       ; %bb.0:
-; GFX12-GISEL-FAKE16-NEXT:    global_load_u16 v0, v0, s[2:3]
-; GFX12-GISEL-FAKE16-NEXT:    s_wait_loadcnt 0x0
-; GFX12-GISEL-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX12-GISEL-FAKE16-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
   %load = load i16, ptr addrspace(1) %gep0
@@ -4050,6 +4024,12 @@ define amdgpu_ps <2 x half> @global_load_saddr_i16_d16lo_zero_hi_immneg128(ptr a
 ; GFX12-SDAG-NEXT:    v_mov_b32_e32 v0, v1
 ; GFX12-SDAG-NEXT:    ; return to shader part epilog
 ;
+; GFX12-GISEL-LABEL: global_load_saddr_i16_d16lo_zero_hi_immneg128:
+; GFX12-GISEL:       ; %bb.0:
+; GFX12-GISEL-NEXT:    global_load_u16 v0, v0, s[2:3] offset:-128
+; GFX12-GISEL-NEXT:    s_wait_loadcnt 0x0
+; GFX12-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX12-GISEL-NEXT:    ; return to shader part epilog
 ; GFX12-GISEL-TRUE16-LABEL: global_load_saddr_i16_d16lo_zero_hi_immneg128:
 ; GFX12-GISEL-TRUE16:       ; %bb.0:
 ; GFX12-GISEL-TRUE16-NEXT:    global_load_d16_b16 v0, v0, s[2:3] offset:-128
@@ -4058,13 +4038,6 @@ define amdgpu_ps <2 x half> @global_load_saddr_i16_d16lo_zero_hi_immneg128(ptr a
 ; GFX12-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-GISEL-TRUE16-NEXT:    v_lshl_or_b32 v0, 0, 16, v0
 ; GFX12-GISEL-TRUE16-NEXT:    ; return to shader part epilog
-;
-; GFX12-GISEL-FAKE16-LABEL: global_load_saddr_i16_d16lo_zero_hi_immneg128:
-; GFX12-GISEL-FAKE16:       ; %bb.0:
-; GFX12-GISEL-FAKE16-NEXT:    global_load_u16 v0, v0, s[2:3] offset:-128
-; GFX12-GISEL-FAKE16-NEXT:    s_wait_loadcnt 0x0
-; GFX12-GISEL-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX12-GISEL-FAKE16-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
   %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
@@ -4324,18 +4297,17 @@ define amdgpu_ps <2 x half> @global_load_saddr_i16_d16hi_undef_hi(ptr addrspace(
 ; GFX12-SDAG-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-SDAG-NEXT:    ; return to shader part epilog
 ;
+; GFX12-GISEL-LABEL: global_load_saddr_i16_d16hi_undef_hi:
+; GFX12-GISEL:       ; %bb.0:
+; GFX12-GISEL-NEXT:    global_load_u16 v0, v0, s[2:3]
+; GFX12-GISEL-NEXT:    s_wait_loadcnt 0x0
+; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX12-GISEL-NEXT:    ; return to shader part epilog
 ; GFX12-GISEL-TRUE16-LABEL: global_load_saddr_i16_d16hi_undef_hi:
 ; GFX12-GISEL-TRUE16:       ; %bb.0:
 ; GFX12-GISEL-TRUE16-NEXT:    global_load_d16_hi_b16 v0, v0, s[2:3]
 ; GFX12-GISEL-TRUE16-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-GISEL-TRUE16-NEXT:    ; return to shader part epilog
-;
-; GFX12-GISEL-FAKE16-LABEL: global_load_saddr_i16_d16hi_undef_hi:
-; GFX12-GISEL-FAKE16:       ; %bb.0:
-; GFX12-GISEL-FAKE16-NEXT:    global_load_u16 v0, v0, s[2:3]
-; GFX12-GISEL-FAKE16-NEXT:    s_wait_loadcnt 0x0
-; GFX12-GISEL-FAKE16-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX12-GISEL-FAKE16-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
   %load = load i16, ptr addrspace(1) %gep0
@@ -4363,18 +4335,17 @@ define amdgpu_ps <2 x half> @global_load_saddr_i16_d16hi_undef_hi_immneg128(ptr
 ; GFX12-SDAG-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-SDAG-NEXT:    ; return to shader part epilog
 ;
+; GFX12-GISEL-LABEL: global_load_saddr_i16_d16hi_undef_hi_immneg128:
+; GFX12-GISEL:       ; %bb.0:
+; GFX12-GISEL-NEXT:    global_load_u16 v0, v0, s[2:3] offset:-128
+; GFX12-GISEL-NEXT:    s_wait_loadcnt 0x0
+; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX12-GISEL-NEXT:    ; return to shader part epilog
 ; GFX12-GISEL-TRUE16-LABEL: global_load_saddr_i16_d16hi_undef_hi_immneg128:
 ; GFX12-GISEL-TRUE16:       ; %bb.0:
 ; GFX12-GISEL-TRUE16-NEXT:    global_load_d16_hi_b16 v0, v0, s[2:3] offset:-128
 ; GFX12-GISEL-TRUE16-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-GISEL-TRUE16-NEXT:    ; return to shader part epilog
-;
-; GFX12-GISEL-FAKE16-LABEL: global_load_saddr_i16_d16hi_undef_hi_immneg128:
-; GFX12-GISEL-FAKE16:       ; %bb.0:
-; GFX12-GISEL-FAKE16-NEXT:    global_load_u16 v0, v0, s[2:3] offset:-128
-; GFX12-GISEL-FAKE16-NEXT:    s_wait_loadcnt 0x0
-; GFX12-GISEL-FAKE16-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX12-GISEL-FAKE16-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
   %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
@@ -4409,19 +4380,18 @@ define amdgpu_ps <2 x half> @global_load_saddr_i16_d16hi_zero_hi(ptr addrspace(1
 ; GFX12-SDAG-NEXT:    v_mov_b32_e32 v0, v1
 ; GFX12-SDAG-NEXT:    ; return to shader part epilog
 ;
+; GFX12-GISEL-LABEL: global_load_saddr_i16_d16hi_zero_hi:
+; GFX12-GISEL:       ; %bb.0:
+; GFX12-GISEL-NEXT:    global_load_u16 v0, v0, s[2:3]
+; GFX12-GISEL-NEXT:    s_wait_loadcnt 0x0
+; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX12-GISEL-NEXT:    ; return to shader part epilog
 ; GFX12-GISEL-TRUE16-LABEL: global_load_saddr_i16_d16hi_zero_hi:
 ; GFX12-GISEL-TRUE16:       ; %bb.0:
 ; GFX12-GISEL-TRUE16-NEXT:    global_load_d16_b16 v0, v0, s[2:3]
 ; GFX12-GISEL-TRUE16-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-GISEL-TRUE16-NEXT:    v_lshl_or_b32 v0, v0, 16, 0
 ; GFX12-GISEL-TRUE16-NEXT:    ; return to shader part epilog
-;
-; GFX12-GISEL-FAKE16-LABEL: global_load_saddr_i16_d16hi_zero_hi:
-; GFX12-GISEL-FAKE16:       ; %bb.0:
-; GFX12-GISEL-FAKE16-NEXT:    global_load_u16 v0, v0, s[2:3]
-; GFX12-GISEL-FAKE16-NEXT:    s_wait_loadcnt 0x0
-; GFX12-GISEL-FAKE16-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX12-GISEL-FAKE16-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
   %load = load i16, ptr addrspace(1) %gep0
@@ -4455,19 +4425,18 @@ define amdgpu_ps <2 x half> @global_load_saddr_i16_d16hi_zero_hi_immneg128(ptr a
 ; GFX12-SDAG-NEXT:    v_mov_b32_e32 v0, v1
 ; GFX12-SDAG-NEXT:    ; return to shader part epilog
 ;
+; GFX12-GISEL-LABEL: global_load_saddr_i16_d16hi_zero_hi_immneg128:
+; GFX12-GISEL:       ; %bb.0:
+; GFX12-GISEL-NEXT:    global_load_u16 v0, v0, s[2:3] offset:-128
+; GFX12-GISEL-NEXT:    s_wait_loadcnt 0x0
+; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX12-GISEL-NEXT:    ; return to shader part epilog
 ; GFX12-GISEL-TRUE16-LABEL: global_load_saddr_i16_d16hi_zero_hi_immneg128:
 ; GFX12-GISEL-TRUE16:       ; %bb.0:
 ; GFX12-GISEL-TRUE16-NEXT:    global_load_d16_b16 v0, v0, s[2:3] offset:-128
 ; GFX12-GISEL-TRUE16-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-GISEL-TRUE16-NEXT:    v_lshl_or_b32 v0, v0, 16, 0
 ; GFX12-GISEL-TRUE16-NEXT:    ; return to shader part epilog
-;
-; GFX12-GISEL-FAKE16-LABEL: global_load_saddr_i16_d16hi_zero_hi_immneg128:
-; GFX12-GISEL-FAKE16:       ; %bb.0:
-; GFX12-GISEL-FAKE16-NEXT:    global_load_u16 v0, v0, s[2:3] offset:-128
-; GFX12-GISEL-FAKE16-NEXT:    s_wait_loadcnt 0x0
-; GFX12-GISEL-FAKE16-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX12-GISEL-FAKE16-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
   %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
@@ -5039,3 +5008,7 @@ bb3:                                              ; preds = %bb3, %bb
 
 !0 = !{i32 0, i32 1073741824} ; (1 << 30)
 !1 = !{i32 0, i32 1073741825} ; (1 << 30) + 1
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GFX11-FAKE16: {{.*}}
+; GFX11-TRUE16: {{.*}}
+; GFX12-GISEL-FAKE16: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics.ll b/llvm/test/CodeGen/AMDGPU/global_atomics.ll
index da132d0269e6b..cc653a5b4bd97 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics.ll
@@ -7451,7 +7451,7 @@ define amdgpu_kernel void @atomic_load_i8_offset(ptr addrspace(1) %in, ptr addrs
 ; GFX11-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT:    global_load_d16_u8 v0, v1, s[0:1] offset:16 glc
+; GFX11-TRUE16-NEXT:    global_load_u8 v0, v1, s[0:1] offset:16 glc
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-TRUE16-NEXT:    buffer_gl1_inv
 ; GFX11-TRUE16-NEXT:    buffer_gl0_inv
@@ -7529,7 +7529,7 @@ define amdgpu_kernel void @atomic_load_i8_negoffset(ptr addrspace(1) %in, ptr ad
 ; GFX11-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT:    global_load_d16_u8 v0, v1, s[0:1] offset:-512 glc
+; GFX11-TRUE16-NEXT:    global_load_u8 v0, v1, s[0:1] offset:-512 glc
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-TRUE16-NEXT:    buffer_gl1_inv
 ; GFX11-TRUE16-NEXT:    buffer_gl0_inv
@@ -7722,7 +7722,7 @@ define amdgpu_kernel void @atomic_load_i16_offset(ptr addrspace(1) %in, ptr addr
 ; GFX11-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT:    global_load_d16_b16 v0, v1, s[0:1] offset:16 glc
+; GFX11-TRUE16-NEXT:    global_load_u16 v0, v1, s[0:1] offset:16 glc
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-TRUE16-NEXT:    buffer_gl1_inv
 ; GFX11-TRUE16-NEXT:    buffer_gl0_inv
@@ -7800,7 +7800,7 @@ define amdgpu_kernel void @atomic_load_i16_negoffset(ptr addrspace(1) %in, ptr a
 ; GFX11-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT:    global_load_d16_b16 v0, v1, s[0:1] offset:-512 glc
+; GFX11-TRUE16-NEXT:    global_load_u16 v0, v1, s[0:1] offset:-512 glc
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-TRUE16-NEXT:    buffer_gl1_inv
 ; GFX11-TRUE16-NEXT:    buffer_gl0_inv
@@ -9187,7 +9187,7 @@ define amdgpu_kernel void @atomic_load_f16_offset(ptr addrspace(1) %in, ptr addr
 ; GFX11-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT:    global_load_d16_b16 v0, v1, s[0:1] offset:16 glc
+; GFX11-TRUE16-NEXT:    global_load_u16 v0, v1, s[0:1] offset:16 glc
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-TRUE16-NEXT:    buffer_gl1_inv
 ; GFX11-TRUE16-NEXT:    buffer_gl0_inv
@@ -9264,7 +9264,7 @@ define amdgpu_kernel void @atomic_load_f16_negoffset(ptr addrspace(1) %in, ptr a
 ; GFX11-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT:    global_load_d16_b16 v0, v1, s[0:1] offset:-512 glc
+; GFX11-TRUE16-NEXT:    global_load_u16 v0, v1, s[0:1] offset:-512 glc
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-TRUE16-NEXT:    buffer_gl1_inv
 ; GFX11-TRUE16-NEXT:    buffer_gl0_inv
@@ -9337,7 +9337,7 @@ define amdgpu_kernel void @atomic_load_bf16_offset(ptr addrspace(1) %in, ptr add
 ; GFX11-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT:    global_load_d16_b16 v0, v1, s[0:1] offset:16 glc
+; GFX11-TRUE16-NEXT:    global_load_u16 v0, v1, s[0:1] offset:16 glc
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-TRUE16-NEXT:    buffer_gl1_inv
 ; GFX11-TRUE16-NEXT:    buffer_gl0_inv
@@ -9414,7 +9414,7 @@ define amdgpu_kernel void @atomic_load_bf16_negoffset(ptr addrspace(1) %in, ptr
 ; GFX11-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT:    global_load_d16_b16 v0, v1, s[0:1] offset:-512 glc
+; GFX11-TRUE16-NEXT:    global_load_u16 v0, v1, s[0:1] offset:-512 glc
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-TRUE16-NEXT:    buffer_gl1_inv
 ; GFX11-TRUE16-NEXT:    buffer_gl0_inv
@@ -9552,6 +9552,47 @@ define amdgpu_kernel void @atomic_sub_i16_soffset__amdgpu_no_remote_memory(ptr a
 ; GFX9-NEXT:    s_cbranch_execnz .LBB136_1
 ; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX9-NEXT:    s_endpgm
+;
+; GFX11-LABEL: atomic_sub_i16_soffset__amdgpu_no_remote_memory:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-NEXT:    s_load_b32 s2, s[4:5], 0x2c
+; GFX11-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    s_add_u32 s3, s0, 0x4650
+; GFX11-NEXT:    s_addc_u32 s1, s1, 0
+; GFX11-NEXT:    s_and_b32 s0, s3, -4
+; GFX11-NEXT:    s_and_b32 s3, s3, 3
+; GFX11-NEXT:    s_load_b32 s4, s[0:1], 0x0
+; GFX11-NEXT:    s_lshl_b32 s5, s3, 3
+; GFX11-NEXT:    s_and_b32 s6, s2, 0xffff
+; GFX11-NEXT:    s_lshl_b32 s2, 0xffff, s5
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_not_b32 s3, s2
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    v_mov_b32_e32 v1, s4
+; GFX11-NEXT:    s_lshl_b32 s4, s6, s5
+; GFX11-NEXT:    s_mov_b32 s5, 0
+; GFX11-NEXT:  .LBB136_1: ; %atomicrmw.start
+; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_subrev_nc_u32_e32 v0, s4, v1
+; GFX11-NEXT:    v_and_b32_e32 v0, s2, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_and_or_b32 v0, v1, s3, v0
+; GFX11-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    buffer_gl1_inv
+; GFX11-NEXT:    buffer_gl0_inv
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
+; GFX11-NEXT:    v_mov_b32_e32 v1, v0
+; GFX11-NEXT:    s_or_b32 s5, vcc_lo, s5
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s5
+; GFX11-NEXT:    s_cbranch_execnz .LBB136_1
+; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX11-NEXT:    s_endpgm
   %gep = getelementptr i16, ptr addrspace(1) %out, i64 9000
   %val = atomicrmw sub ptr addrspace(1) %gep, i16 %in syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
   ret void
@@ -9671,6 +9712,47 @@ define amdgpu_kernel void @atomic_sub_i8_soffset__amdgpu_no_remote_memory(ptr ad
 ; GFX9-NEXT:    s_cbranch_execnz .LBB137_1
 ; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX9-NEXT:    s_endpgm
+;
+; GFX11-LABEL: atomic_sub_i8_soffset__amdgpu_no_remote_memory:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-NEXT:    s_load_b32 s2, s[4:5], 0x2c
+; GFX11-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    s_add_u32 s3, s0, 0x2328
+; GFX11-NEXT:    s_addc_u32 s1, s1, 0
+; GFX11-NEXT:    s_and_b32 s0, s3, -4
+; GFX11-NEXT:    s_and_b32 s3, s3, 3
+; GFX11-NEXT:    s_load_b32 s4, s[0:1], 0x0
+; GFX11-NEXT:    s_lshl_b32 s5, s3, 3
+; GFX11-NEXT:    s_and_b32 s6, s2, 0xff
+; GFX11-NEXT:    s_lshl_b32 s2, 0xff, s5
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_not_b32 s3, s2
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    v_mov_b32_e32 v1, s4
+; GFX11-NEXT:    s_lshl_b32 s4, s6, s5
+; GFX11-NEXT:    s_mov_b32 s5, 0
+; GFX11-NEXT:  .LBB137_1: ; %atomicrmw.start
+; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_subrev_nc_u32_e32 v0, s4, v1
+; GFX11-NEXT:    v_and_b32_e32 v0, s2, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_and_or_b32 v0, v1, s3, v0
+; GFX11-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    buffer_gl1_inv
+; GFX11-NEXT:    buffer_gl0_inv
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
+; GFX11-NEXT:    v_mov_b32_e32 v1, v0
+; GFX11-NEXT:    s_or_b32 s5, vcc_lo, s5
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s5
+; GFX11-NEXT:    s_cbranch_execnz .LBB137_1
+; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX11-NEXT:    s_endpgm
   %gep = getelementptr i8, ptr addrspace(1) %out, i64 9000
   %val = atomicrmw sub ptr addrspace(1) %gep, i8 %in syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
   ret void
diff --git a/llvm/test/CodeGen/AMDGPU/half.ll b/llvm/test/CodeGen/AMDGPU/half.ll
index 8e427a6ef2023..1bb4fb30465f3 100644
--- a/llvm/test/CodeGen/AMDGPU/half.ll
+++ b/llvm/test/CodeGen/AMDGPU/half.ll
@@ -967,7 +967,7 @@ define amdgpu_kernel void @global_load_store_f16(ptr addrspace(1) %out, ptr addr
 ; GFX11-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
 ; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT:    global_load_d16_b16 v0, v1, s[2:3]
+; GFX11-TRUE16-NEXT:    global_load_u16 v0, v1, s[2:3]
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-TRUE16-NEXT:    global_store_b16 v1, v0, s[0:1]
 ; GFX11-TRUE16-NEXT:    s_endpgm
@@ -1100,12 +1100,12 @@ define amdgpu_kernel void @global_extload_f16_to_f32(ptr addrspace(1) %out, ptr
 ; GFX11-TRUE16-LABEL: global_extload_f16_to_f32:
 ; GFX11-TRUE16:       ; %bb.0:
 ; GFX11-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
-; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v1, 0
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT:    global_load_d16_b16 v0, v1, s[2:3]
+; GFX11-TRUE16-NEXT:    global_load_u16 v1, v0, s[2:3]
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e32 v0, v0.l
-; GFX11-TRUE16-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e32 v1, v1.l
+; GFX11-TRUE16-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-TRUE16-NEXT:    s_endpgm
 ;
 ; GFX11-FAKE16-LABEL: global_extload_f16_to_f32:
@@ -1694,7 +1694,7 @@ define amdgpu_kernel void @global_extload_f16_to_f64(ptr addrspace(1) %out, ptr
 ; GFX11-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
 ; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT:    global_load_d16_b16 v0, v2, s[2:3]
+; GFX11-TRUE16-NEXT:    global_load_u16 v0, v2, s[2:3]
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e32 v0, v0.l
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
@@ -3512,7 +3512,7 @@ define amdgpu_kernel void @test_bitcast_from_half(ptr addrspace(1) %in, ptr addr
 ; GFX11-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
 ; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT:    global_load_d16_b16 v0, v1, s[0:1]
+; GFX11-TRUE16-NEXT:    global_load_u16 v0, v1, s[0:1]
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-TRUE16-NEXT:    global_store_b16 v1, v0, s[2:3]
 ; GFX11-TRUE16-NEXT:    s_endpgm
@@ -3554,7 +3554,7 @@ define amdgpu_kernel void @test_bitcast_to_half(ptr addrspace(1) %out, ptr addrs
 ; GFX11-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
 ; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT:    global_load_d16_b16 v0, v1, s[2:3]
+; GFX11-TRUE16-NEXT:    global_load_u16 v0, v1, s[2:3]
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-TRUE16-NEXT:    global_store_b16 v1, v0, s[0:1]
 ; GFX11-TRUE16-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/icmp.i16.ll b/llvm/test/CodeGen/AMDGPU/icmp.i16.ll
index fc4cdcda99ae4..f7d90cbf45bcb 100644
--- a/llvm/test/CodeGen/AMDGPU/icmp.i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/icmp.i16.ll
@@ -77,18 +77,18 @@ define amdgpu_kernel void @i16_eq(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr
 ; GFX11-TRUE16-NEXT:    s_clause 0x1
 ; GFX11-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX11-TRUE16-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0x3ff, v0
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v2, 1, v1
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 2, v1
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 1, v0
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-TRUE16-NEXT:    s_clause 0x1
-; GFX11-TRUE16-NEXT:    global_load_d16_b16 v0, v2, s[2:3]
-; GFX11-TRUE16-NEXT:    global_load_d16_hi_b16 v0, v2, s[4:5]
+; GFX11-TRUE16-NEXT:    global_load_u16 v2, v1, s[2:3]
+; GFX11-TRUE16-NEXT:    global_load_u16 v1, v1, s[4:5]
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, v0.l, v0.h
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
-; GFX11-TRUE16-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, v2.l, v1.l
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc_lo
+; GFX11-TRUE16-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-TRUE16-NEXT:    s_endpgm
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -174,18 +174,18 @@ define amdgpu_kernel void @i16_ne(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr
 ; GFX11-TRUE16-NEXT:    s_clause 0x1
 ; GFX11-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX11-TRUE16-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0x3ff, v0
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v2, 1, v1
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 2, v1
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 1, v0
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-TRUE16-NEXT:    s_clause 0x1
-; GFX11-TRUE16-NEXT:    global_load_d16_b16 v0, v2, s[2:3]
-; GFX11-TRUE16-NEXT:    global_load_d16_hi_b16 v0, v2, s[4:5]
+; GFX11-TRUE16-NEXT:    global_load_u16 v2, v1, s[2:3]
+; GFX11-TRUE16-NEXT:    global_load_u16 v1, v1, s[4:5]
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    v_cmp_ne_u16_e32 vcc_lo, v0.l, v0.h
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
-; GFX11-TRUE16-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX11-TRUE16-NEXT:    v_cmp_ne_u16_e32 vcc_lo, v2.l, v1.l
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc_lo
+; GFX11-TRUE16-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-TRUE16-NEXT:    s_endpgm
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -271,18 +271,18 @@ define amdgpu_kernel void @i16_ugt(ptr addrspace(1) %out, ptr addrspace(1) %a.pt
 ; GFX11-TRUE16-NEXT:    s_clause 0x1
 ; GFX11-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX11-TRUE16-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0x3ff, v0
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v2, 1, v1
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 2, v1
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 1, v0
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-TRUE16-NEXT:    s_clause 0x1
-; GFX11-TRUE16-NEXT:    global_load_d16_b16 v0, v2, s[2:3]
-; GFX11-TRUE16-NEXT:    global_load_d16_hi_b16 v0, v2, s[4:5]
+; GFX11-TRUE16-NEXT:    global_load_u16 v2, v1, s[2:3]
+; GFX11-TRUE16-NEXT:    global_load_u16 v1, v1, s[4:5]
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    v_cmp_gt_u16_e32 vcc_lo, v0.l, v0.h
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
-; GFX11-TRUE16-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX11-TRUE16-NEXT:    v_cmp_gt_u16_e32 vcc_lo, v2.l, v1.l
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc_lo
+; GFX11-TRUE16-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-TRUE16-NEXT:    s_endpgm
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -368,18 +368,18 @@ define amdgpu_kernel void @i16_uge(ptr addrspace(1) %out, ptr addrspace(1) %a.pt
 ; GFX11-TRUE16-NEXT:    s_clause 0x1
 ; GFX11-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX11-TRUE16-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0x3ff, v0
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v2, 1, v1
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 2, v1
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 1, v0
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-TRUE16-NEXT:    s_clause 0x1
-; GFX11-TRUE16-NEXT:    global_load_d16_b16 v0, v2, s[2:3]
-; GFX11-TRUE16-NEXT:    global_load_d16_hi_b16 v0, v2, s[4:5]
+; GFX11-TRUE16-NEXT:    global_load_u16 v2, v1, s[2:3]
+; GFX11-TRUE16-NEXT:    global_load_u16 v1, v1, s[4:5]
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    v_cmp_ge_u16_e32 vcc_lo, v0.l, v0.h
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
-; GFX11-TRUE16-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX11-TRUE16-NEXT:    v_cmp_ge_u16_e32 vcc_lo, v2.l, v1.l
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc_lo
+; GFX11-TRUE16-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-TRUE16-NEXT:    s_endpgm
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -465,18 +465,18 @@ define amdgpu_kernel void @i16_ult(ptr addrspace(1) %out, ptr addrspace(1) %a.pt
 ; GFX11-TRUE16-NEXT:    s_clause 0x1
 ; GFX11-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX11-TRUE16-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0x3ff, v0
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v2, 1, v1
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 2, v1
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 1, v0
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-TRUE16-NEXT:    s_clause 0x1
-; GFX11-TRUE16-NEXT:    global_load_d16_b16 v0, v2, s[2:3]
-; GFX11-TRUE16-NEXT:    global_load_d16_hi_b16 v0, v2, s[4:5]
+; GFX11-TRUE16-NEXT:    global_load_u16 v2, v1, s[2:3]
+; GFX11-TRUE16-NEXT:    global_load_u16 v1, v1, s[4:5]
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    v_cmp_lt_u16_e32 vcc_lo, v0.l, v0.h
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
-; GFX11-TRUE16-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX11-TRUE16-NEXT:    v_cmp_lt_u16_e32 vcc_lo, v2.l, v1.l
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc_lo
+; GFX11-TRUE16-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-TRUE16-NEXT:    s_endpgm
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -562,18 +562,18 @@ define amdgpu_kernel void @i16_ule(ptr addrspace(1) %out, ptr addrspace(1) %a.pt
 ; GFX11-TRUE16-NEXT:    s_clause 0x1
 ; GFX11-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX11-TRUE16-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0x3ff, v0
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v2, 1, v1
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 2, v1
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 1, v0
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-TRUE16-NEXT:    s_clause 0x1
-; GFX11-TRUE16-NEXT:    global_load_d16_b16 v0, v2, s[2:3]
-; GFX11-TRUE16-NEXT:    global_load_d16_hi_b16 v0, v2, s[4:5]
+; GFX11-TRUE16-NEXT:    global_load_u16 v2, v1, s[2:3]
+; GFX11-TRUE16-NEXT:    global_load_u16 v1, v1, s[4:5]
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    v_cmp_le_u16_e32 vcc_lo, v0.l, v0.h
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
-; GFX11-TRUE16-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX11-TRUE16-NEXT:    v_cmp_le_u16_e32 vcc_lo, v2.l, v1.l
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc_lo
+; GFX11-TRUE16-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-TRUE16-NEXT:    s_endpgm
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -660,18 +660,18 @@ define amdgpu_kernel void @i16_sgt(ptr addrspace(1) %out, ptr addrspace(1) %a.pt
 ; GFX11-TRUE16-NEXT:    s_clause 0x1
 ; GFX11-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX11-TRUE16-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0x3ff, v0
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v2, 1, v1
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 2, v1
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 1, v0
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-TRUE16-NEXT:    s_clause 0x1
-; GFX11-TRUE16-NEXT:    global_load_d16_b16 v0, v2, s[2:3]
-; GFX11-TRUE16-NEXT:    global_load_d16_hi_b16 v0, v2, s[4:5]
+; GFX11-TRUE16-NEXT:    global_load_u16 v2, v1, s[2:3]
+; GFX11-TRUE16-NEXT:    global_load_u16 v1, v1, s[4:5]
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    v_cmp_gt_i16_e32 vcc_lo, v0.l, v0.h
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
-; GFX11-TRUE16-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX11-TRUE16-NEXT:    v_cmp_gt_i16_e32 vcc_lo, v2.l, v1.l
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc_lo
+; GFX11-TRUE16-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-TRUE16-NEXT:    s_endpgm
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -757,18 +757,18 @@ define amdgpu_kernel void @i16_sge(ptr addrspace(1) %out, ptr addrspace(1) %a.pt
 ; GFX11-TRUE16-NEXT:    s_clause 0x1
 ; GFX11-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX11-TRUE16-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0x3ff, v0
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v2, 1, v1
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 2, v1
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 1, v0
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-TRUE16-NEXT:    s_clause 0x1
-; GFX11-TRUE16-NEXT:    global_load_d16_b16 v0, v2, s[2:3]
-; GFX11-TRUE16-NEXT:    global_load_d16_hi_b16 v0, v2, s[4:5]
+; GFX11-TRUE16-NEXT:    global_load_u16 v2, v1, s[2:3]
+; GFX11-TRUE16-NEXT:    global_load_u16 v1, v1, s[4:5]
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    v_cmp_ge_i16_e32 vcc_lo, v0.l, v0.h
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
-; GFX11-TRUE16-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX11-TRUE16-NEXT:    v_cmp_ge_i16_e32 vcc_lo, v2.l, v1.l
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc_lo
+; GFX11-TRUE16-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-TRUE16-NEXT:    s_endpgm
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -854,18 +854,18 @@ define amdgpu_kernel void @i16_slt(ptr addrspace(1) %out, ptr addrspace(1) %a.pt
 ; GFX11-TRUE16-NEXT:    s_clause 0x1
 ; GFX11-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX11-TRUE16-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0x3ff, v0
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v2, 1, v1
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 2, v1
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 1, v0
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-TRUE16-NEXT:    s_clause 0x1
-; GFX11-TRUE16-NEXT:    global_load_d16_b16 v0, v2, s[2:3]
-; GFX11-TRUE16-NEXT:    global_load_d16_hi_b16 v0, v2, s[4:5]
+; GFX11-TRUE16-NEXT:    global_load_u16 v2, v1, s[2:3]
+; GFX11-TRUE16-NEXT:    global_load_u16 v1, v1, s[4:5]
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    v_cmp_lt_i16_e32 vcc_lo, v0.l, v0.h
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
-; GFX11-TRUE16-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX11-TRUE16-NEXT:    v_cmp_lt_i16_e32 vcc_lo, v2.l, v1.l
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc_lo
+; GFX11-TRUE16-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-TRUE16-NEXT:    s_endpgm
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -951,18 +951,18 @@ define amdgpu_kernel void @i16_sle(ptr addrspace(1) %out, ptr addrspace(1) %a.pt
 ; GFX11-TRUE16-NEXT:    s_clause 0x1
 ; GFX11-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX11-TRUE16-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0x3ff, v0
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v2, 1, v1
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 2, v1
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 1, v0
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-TRUE16-NEXT:    s_clause 0x1
-; GFX11-TRUE16-NEXT:    global_load_d16_b16 v0, v2, s[2:3]
-; GFX11-TRUE16-NEXT:    global_load_d16_hi_b16 v0, v2, s[4:5]
+; GFX11-TRUE16-NEXT:    global_load_u16 v2, v1, s[2:3]
+; GFX11-TRUE16-NEXT:    global_load_u16 v1, v1, s[4:5]
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    v_cmp_le_i16_e32 vcc_lo, v0.l, v0.h
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
-; GFX11-TRUE16-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX11-TRUE16-NEXT:    v_cmp_le_i16_e32 vcc_lo, v2.l, v1.l
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc_lo
+; GFX11-TRUE16-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-TRUE16-NEXT:    s_endpgm
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -1039,17 +1039,17 @@ define amdgpu_kernel void @i16_eq_v_s(ptr addrspace(1) %out, ptr addrspace(1) %a
 ; GFX11-TRUE16-LABEL: i16_eq_v_s:
 ; GFX11-TRUE16:       ; %bb.0: ; %entry
 ; GFX11-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0x3ff, v0
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
 ; GFX11-TRUE16-NEXT:    s_load_b32 s4, s[4:5], 0x34
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v0, 1, v1
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 2, v1
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 1, v0
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT:    global_load_d16_b16 v0, v0, s[2:3]
+; GFX11-TRUE16-NEXT:    global_load_u16 v1, v1, s[2:3]
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, s4, v0.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
-; GFX11-TRUE16-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, s4, v1.l
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc_lo
+; GFX11-TRUE16-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-TRUE16-NEXT:    s_endpgm
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -1123,17 +1123,17 @@ define amdgpu_kernel void @i16_ne_v_s(ptr addrspace(1) %out, ptr addrspace(1) %a
 ; GFX11-TRUE16-LABEL: i16_ne_v_s:
 ; GFX11-TRUE16:       ; %bb.0: ; %entry
 ; GFX11-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0x3ff, v0
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
 ; GFX11-TRUE16-NEXT:    s_load_b32 s4, s[4:5], 0x34
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v0, 1, v1
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 2, v1
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 1, v0
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT:    global_load_d16_b16 v0, v0, s[2:3]
+; GFX11-TRUE16-NEXT:    global_load_u16 v1, v1, s[2:3]
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    v_cmp_ne_u16_e32 vcc_lo, s4, v0.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
-; GFX11-TRUE16-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX11-TRUE16-NEXT:    v_cmp_ne_u16_e32 vcc_lo, s4, v1.l
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc_lo
+; GFX11-TRUE16-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-TRUE16-NEXT:    s_endpgm
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -1207,17 +1207,17 @@ define amdgpu_kernel void @i16_ugt_v_s(ptr addrspace(1) %out, ptr addrspace(1) %
 ; GFX11-TRUE16-LABEL: i16_ugt_v_s:
 ; GFX11-TRUE16:       ; %bb.0: ; %entry
 ; GFX11-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0x3ff, v0
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
 ; GFX11-TRUE16-NEXT:    s_load_b32 s4, s[4:5], 0x34
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v0, 1, v1
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 2, v1
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 1, v0
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT:    global_load_d16_b16 v0, v0, s[2:3]
+; GFX11-TRUE16-NEXT:    global_load_u16 v1, v1, s[2:3]
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    v_cmp_lt_u16_e32 vcc_lo, s4, v0.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
-; GFX11-TRUE16-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX11-TRUE16-NEXT:    v_cmp_lt_u16_e32 vcc_lo, s4, v1.l
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc_lo
+; GFX11-TRUE16-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-TRUE16-NEXT:    s_endpgm
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -1291,17 +1291,17 @@ define amdgpu_kernel void @i16_uge_v_s(ptr addrspace(1) %out, ptr addrspace(1) %
 ; GFX11-TRUE16-LABEL: i16_uge_v_s:
 ; GFX11-TRUE16:       ; %bb.0: ; %entry
 ; GFX11-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0x3ff, v0
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
 ; GFX11-TRUE16-NEXT:    s_load_b32 s4, s[4:5], 0x34
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v0, 1, v1
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 2, v1
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 1, v0
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT:    global_load_d16_b16 v0, v0, s[2:3]
+; GFX11-TRUE16-NEXT:    global_load_u16 v1, v1, s[2:3]
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    v_cmp_le_u16_e32 vcc_lo, s4, v0.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
-; GFX11-TRUE16-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX11-TRUE16-NEXT:    v_cmp_le_u16_e32 vcc_lo, s4, v1.l
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc_lo
+; GFX11-TRUE16-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-TRUE16-NEXT:    s_endpgm
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -1375,17 +1375,17 @@ define amdgpu_kernel void @i16_ult_v_s(ptr addrspace(1) %out, ptr addrspace(1) %
 ; GFX11-TRUE16-LABEL: i16_ult_v_s:
 ; GFX11-TRUE16:       ; %bb.0: ; %entry
 ; GFX11-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0x3ff, v0
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
 ; GFX11-TRUE16-NEXT:    s_load_b32 s4, s[4:5], 0x34
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v0, 1, v1
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 2, v1
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 1, v0
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT:    global_load_d16_b16 v0, v0, s[2:3]
+; GFX11-TRUE16-NEXT:    global_load_u16 v1, v1, s[2:3]
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    v_cmp_gt_u16_e32 vcc_lo, s4, v0.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
-; GFX11-TRUE16-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX11-TRUE16-NEXT:    v_cmp_gt_u16_e32 vcc_lo, s4, v1.l
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc_lo
+; GFX11-TRUE16-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-TRUE16-NEXT:    s_endpgm
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -1459,17 +1459,17 @@ define amdgpu_kernel void @i16_ule_v_s(ptr addrspace(1) %out, ptr addrspace(1) %
 ; GFX11-TRUE16-LABEL: i16_ule_v_s:
 ; GFX11-TRUE16:       ; %bb.0: ; %entry
 ; GFX11-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0x3ff, v0
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
 ; GFX11-TRUE16-NEXT:    s_load_b32 s4, s[4:5], 0x34
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v0, 1, v1
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 2, v1
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 1, v0
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT:    global_load_d16_b16 v0, v0, s[2:3]
+; GFX11-TRUE16-NEXT:    global_load_u16 v1, v1, s[2:3]
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    v_cmp_ge_u16_e32 vcc_lo, s4, v0.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
-; GFX11-TRUE16-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX11-TRUE16-NEXT:    v_cmp_ge_u16_e32 vcc_lo, s4, v1.l
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc_lo
+; GFX11-TRUE16-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-TRUE16-NEXT:    s_endpgm
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -1543,17 +1543,17 @@ define amdgpu_kernel void @i16_sgt_v_s(ptr addrspace(1) %out, ptr addrspace(1) %
 ; GFX11-TRUE16-LABEL: i16_sgt_v_s:
 ; GFX11-TRUE16:       ; %bb.0: ; %entry
 ; GFX11-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0x3ff, v0
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
 ; GFX11-TRUE16-NEXT:    s_load_b32 s4, s[4:5], 0x34
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v0, 1, v1
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 2, v1
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 1, v0
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT:    global_load_d16_b16 v0, v0, s[2:3]
+; GFX11-TRUE16-NEXT:    global_load_u16 v1, v1, s[2:3]
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    v_cmp_lt_i16_e32 vcc_lo, s4, v0.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
-; GFX11-TRUE16-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX11-TRUE16-NEXT:    v_cmp_lt_i16_e32 vcc_lo, s4, v1.l
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc_lo
+; GFX11-TRUE16-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-TRUE16-NEXT:    s_endpgm
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -1627,17 +1627,17 @@ define amdgpu_kernel void @i16_sge_v_s(ptr addrspace(1) %out, ptr addrspace(1) %
 ; GFX11-TRUE16-LABEL: i16_sge_v_s:
 ; GFX11-TRUE16:       ; %bb.0: ; %entry
 ; GFX11-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0x3ff, v0
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
 ; GFX11-TRUE16-NEXT:    s_load_b32 s4, s[4:5], 0x34
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v0, 1, v1
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 2, v1
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 1, v0
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT:    global_load_d16_b16 v0, v0, s[2:3]
+; GFX11-TRUE16-NEXT:    global_load_u16 v1, v1, s[2:3]
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    v_cmp_le_i16_e32 vcc_lo, s4, v0.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
-; GFX11-TRUE16-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX11-TRUE16-NEXT:    v_cmp_le_i16_e32 vcc_lo, s4, v1.l
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc_lo
+; GFX11-TRUE16-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-TRUE16-NEXT:    s_endpgm
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -1711,17 +1711,17 @@ define amdgpu_kernel void @i16_slt_v_s(ptr addrspace(1) %out, ptr addrspace(1) %
 ; GFX11-TRUE16-LABEL: i16_slt_v_s:
 ; GFX11-TRUE16:       ; %bb.0: ; %entry
 ; GFX11-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0x3ff, v0
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
 ; GFX11-TRUE16-NEXT:    s_load_b32 s4, s[4:5], 0x34
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v0, 1, v1
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 2, v1
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 1, v0
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT:    global_load_d16_b16 v0, v0, s[2:3]
+; GFX11-TRUE16-NEXT:    global_load_u16 v1, v1, s[2:3]
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    v_cmp_gt_i16_e32 vcc_lo, s4, v0.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
-; GFX11-TRUE16-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX11-TRUE16-NEXT:    v_cmp_gt_i16_e32 vcc_lo, s4, v1.l
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc_lo
+; GFX11-TRUE16-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-TRUE16-NEXT:    s_endpgm
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -1795,17 +1795,17 @@ define amdgpu_kernel void @i16_sle_v_s(ptr addrspace(1) %out, ptr addrspace(1) %
 ; GFX11-TRUE16-LABEL: i16_sle_v_s:
 ; GFX11-TRUE16:       ; %bb.0: ; %entry
 ; GFX11-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0x3ff, v0
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
 ; GFX11-TRUE16-NEXT:    s_load_b32 s4, s[4:5], 0x34
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v0, 1, v1
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 2, v1
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 1, v0
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT:    global_load_d16_b16 v0, v0, s[2:3]
+; GFX11-TRUE16-NEXT:    global_load_u16 v1, v1, s[2:3]
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    v_cmp_ge_i16_e32 vcc_lo, s4, v0.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
-; GFX11-TRUE16-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX11-TRUE16-NEXT:    v_cmp_ge_i16_e32 vcc_lo, s4, v1.l
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc_lo
+; GFX11-TRUE16-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-TRUE16-NEXT:    s_endpgm
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
diff --git a/llvm/test/CodeGen/AMDGPU/idot4s.ll b/llvm/test/CodeGen/AMDGPU/idot4s.ll
index ab38bd21994ec..46e803245433a 100644
--- a/llvm/test/CodeGen/AMDGPU/idot4s.ll
+++ b/llvm/test/CodeGen/AMDGPU/idot4s.ll
@@ -1159,42 +1159,43 @@ define amdgpu_kernel void @idot4_acc16_vecMul(ptr addrspace(1) %src1,
 ; GFX11-DL-TRUE16-LABEL: idot4_acc16_vecMul:
 ; GFX11-DL-TRUE16:       ; %bb.0: ; %entry
 ; GFX11-DL-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-DL-TRUE16-NEXT:    v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v0, 0x3ff, v0
+; GFX11-DL-TRUE16-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
 ; GFX11-DL-TRUE16-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
-; GFX11-DL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-DL-TRUE16-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-DL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-DL-TRUE16-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX11-DL-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-DL-TRUE16-NEXT:    s_clause 0x1
 ; GFX11-DL-TRUE16-NEXT:    global_load_b32 v1, v0, s[0:1]
-; GFX11-DL-TRUE16-NEXT:    global_load_b32 v2, v0, s[2:3]
-; GFX11-DL-TRUE16-NEXT:    global_load_d16_b16 v0, v3, s[4:5]
+; GFX11-DL-TRUE16-NEXT:    global_load_b32 v0, v0, s[2:3]
+; GFX11-DL-TRUE16-NEXT:    global_load_u16 v3, v2, s[4:5]
 ; GFX11-DL-TRUE16-NEXT:    s_waitcnt vmcnt(2)
 ; GFX11-DL-TRUE16-NEXT:    v_bfe_i32 v6, v1, 0, 8
 ; GFX11-DL-TRUE16-NEXT:    s_waitcnt vmcnt(1)
-; GFX11-DL-TRUE16-NEXT:    v_bfe_i32 v5, v2, 0, 8
+; GFX11-DL-TRUE16-NEXT:    v_bfe_i32 v5, v0, 0, 8
 ; GFX11-DL-TRUE16-NEXT:    v_ashrrev_i16 v4.h, 8, v1.l
 ; GFX11-DL-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v1.h
-; GFX11-DL-TRUE16-NEXT:    v_ashrrev_i16 v7.h, 8, v2.l
-; GFX11-DL-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v2.h
+; GFX11-DL-TRUE16-NEXT:    v_ashrrev_i16 v7.h, 8, v0.l
+; GFX11-DL-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v0.h
 ; GFX11-DL-TRUE16-NEXT:    v_mov_b16_e32 v7.l, v5.l
 ; GFX11-DL-TRUE16-NEXT:    v_mov_b16_e32 v4.l, v6.l
 ; GFX11-DL-TRUE16-NEXT:    v_bfe_i32 v6, v1, 0, 8
 ; GFX11-DL-TRUE16-NEXT:    v_ashrrev_i16 v1.h, 8, v1.h
-; GFX11-DL-TRUE16-NEXT:    v_bfe_i32 v5, v2, 0, 8
-; GFX11-DL-TRUE16-NEXT:    v_ashrrev_i16 v2.h, 8, v2.h
+; GFX11-DL-TRUE16-NEXT:    v_bfe_i32 v5, v0, 0, 8
+; GFX11-DL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
 ; GFX11-DL-TRUE16-NEXT:    v_pk_mul_lo_u16 v4, v4, v7
+; GFX11-DL-TRUE16-NEXT:    v_ashrrev_i16 v7.h, 8, v0.h
 ; GFX11-DL-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v6.l
-; GFX11-DL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-DL-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v5.l
+; GFX11-DL-TRUE16-NEXT:    v_mov_b16_e32 v7.l, v5.l
 ; GFX11-DL-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-DL-TRUE16-NEXT:    v_add_nc_u16 v0.l, v4.l, v0.l
+; GFX11-DL-TRUE16-NEXT:    v_add_nc_u16 v0.l, v4.l, v3.l
 ; GFX11-DL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-DL-TRUE16-NEXT:    v_pk_mul_lo_u16 v1, v1, v2
+; GFX11-DL-TRUE16-NEXT:    v_pk_mul_lo_u16 v1, v1, v7
 ; GFX11-DL-TRUE16-NEXT:    v_add_nc_u16 v0.l, v0.l, v4.h
 ; GFX11-DL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-DL-TRUE16-NEXT:    v_add_nc_u16 v0.l, v0.l, v1.l
 ; GFX11-DL-TRUE16-NEXT:    v_add_nc_u16 v0.l, v0.l, v1.h
-; GFX11-DL-TRUE16-NEXT:    global_store_b16 v3, v0, s[4:5]
+; GFX11-DL-TRUE16-NEXT:    global_store_b16 v2, v0, s[4:5]
 ; GFX11-DL-TRUE16-NEXT:    s_endpgm
 ;
 ; GFX11-DL-FAKE16-LABEL: idot4_acc16_vecMul:
diff --git a/llvm/test/CodeGen/AMDGPU/idot4u.ll b/llvm/test/CodeGen/AMDGPU/idot4u.ll
index 305461ed6b208..22060a2d63749 100644
--- a/llvm/test/CodeGen/AMDGPU/idot4u.ll
+++ b/llvm/test/CodeGen/AMDGPU/idot4u.ll
@@ -1669,37 +1669,35 @@ define amdgpu_kernel void @notdot4_mixedtypes(ptr addrspace(1) %src1,
 ; GFX11-DL-TRUE16-LABEL: notdot4_mixedtypes:
 ; GFX11-DL-TRUE16:       ; %bb.0: ; %entry
 ; GFX11-DL-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-DL-TRUE16-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-DL-TRUE16-NEXT:    v_dual_mov_b32 v5, 0 :: v_dual_and_b32 v0, 0x3ff, v0
 ; GFX11-DL-TRUE16-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
-; GFX11-DL-TRUE16-NEXT:    v_mov_b32_e32 v6, 0
-; GFX11-DL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-DL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-DL-TRUE16-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX11-DL-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-DL-TRUE16-NEXT:    s_clause 0x1
-; GFX11-DL-TRUE16-NEXT:    global_load_b32 v4, v0, s[0:1]
-; GFX11-DL-TRUE16-NEXT:    global_load_b32 v5, v0, s[2:3]
-; GFX11-DL-TRUE16-NEXT:    global_load_d16_b16 v0, v6, s[4:5]
+; GFX11-DL-TRUE16-NEXT:    global_load_b32 v3, v0, s[0:1]
+; GFX11-DL-TRUE16-NEXT:    global_load_b32 v4, v0, s[2:3]
+; GFX11-DL-TRUE16-NEXT:    global_load_u16 v6, v5, s[4:5]
 ; GFX11-DL-TRUE16-NEXT:    s_waitcnt vmcnt(2)
-; GFX11-DL-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v4
+; GFX11-DL-TRUE16-NEXT:    v_lshrrev_b32_e32 v0, 8, v3
 ; GFX11-DL-TRUE16-NEXT:    s_waitcnt vmcnt(1)
-; GFX11-DL-TRUE16-NEXT:    v_lshrrev_b32_e32 v2, 8, v5
-; GFX11-DL-TRUE16-NEXT:    v_bfe_i32 v3, v4, 0, 8
-; GFX11-DL-TRUE16-NEXT:    v_bfe_i32 v7, v5, 0, 8
+; GFX11-DL-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v4
+; GFX11-DL-TRUE16-NEXT:    v_bfe_i32 v2, v3, 0, 8
+; GFX11-DL-TRUE16-NEXT:    v_bfe_i32 v7, v4, 0, 8
+; GFX11-DL-TRUE16-NEXT:    v_perm_b32 v4, v4, v4, 0xc0c0302
+; GFX11-DL-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
 ; GFX11-DL-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v1.l
-; GFX11-DL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-DL-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v2.l
-; GFX11-DL-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v3.l
-; GFX11-DL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-DL-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v7.l
+; GFX11-DL-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v2.l
+; GFX11-DL-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v7.l
 ; GFX11-DL-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-DL-TRUE16-NEXT:    v_mad_u16 v0.l, v0.h, v1.l, v0.l
+; GFX11-DL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-DL-TRUE16-NEXT:    v_mad_u16 v0.l, v0.l, v0.h, v6.l
 ; GFX11-DL-TRUE16-NEXT:    v_mov_b16_e32 v0.h, 0
-; GFX11-DL-TRUE16-NEXT:    v_perm_b32 v1, v5, v5, 0xc0c0302
-; GFX11-DL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-DL-TRUE16-NEXT:    v_mad_u16 v0.l, v2.l, v3.l, v0.l
-; GFX11-DL-TRUE16-NEXT:    v_perm_b32 v2, v4, v4, 0xc0c0302
-; GFX11-DL-TRUE16-NEXT:    v_dot4_u32_u8 v0, v2, v1, v0
-; GFX11-DL-TRUE16-NEXT:    global_store_b16 v6, v0, s[4:5]
+; GFX11-DL-TRUE16-NEXT:    v_mad_u16 v0.l, v1.l, v2.l, v0.l
+; GFX11-DL-TRUE16-NEXT:    v_perm_b32 v1, v3, v3, 0xc0c0302
+; GFX11-DL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-DL-TRUE16-NEXT:    v_dot4_u32_u8 v0, v1, v4, v0
+; GFX11-DL-TRUE16-NEXT:    global_store_b16 v5, v0, s[4:5]
 ; GFX11-DL-TRUE16-NEXT:    s_endpgm
 ;
 ; GFX11-DL-FAKE16-LABEL: notdot4_mixedtypes:
@@ -1969,32 +1967,34 @@ define amdgpu_kernel void @notdot4_mixedtypes2(ptr addrspace(1) %src1,
 ; GFX11-DL-TRUE16-NEXT:    s_clause 0x1
 ; GFX11-DL-TRUE16-NEXT:    global_load_b32 v3, v0, s[2:3]
 ; GFX11-DL-TRUE16-NEXT:    global_load_b32 v4, v0, s[0:1]
-; GFX11-DL-TRUE16-NEXT:    global_load_d16_b16 v0, v5, s[4:5]
+; GFX11-DL-TRUE16-NEXT:    global_load_u16 v6, v5, s[4:5]
 ; GFX11-DL-TRUE16-NEXT:    s_waitcnt vmcnt(2)
-; GFX11-DL-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v3
+; GFX11-DL-TRUE16-NEXT:    v_lshrrev_b32_e32 v0, 8, v3
 ; GFX11-DL-TRUE16-NEXT:    s_waitcnt vmcnt(1)
-; GFX11-DL-TRUE16-NEXT:    v_lshrrev_b32_e32 v2, 8, v4
-; GFX11-DL-TRUE16-NEXT:    v_bfe_i32 v6, v4, 0, 8
-; GFX11-DL-TRUE16-NEXT:    v_mov_b16_e32 v7.l, v4.h
-; GFX11-DL-TRUE16-NEXT:    v_bfe_i32 v1, v1, 0, 8
-; GFX11-DL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-DL-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v2.l
-; GFX11-DL-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v3.l
-; GFX11-DL-TRUE16-NEXT:    v_bfe_i32 v7, v7, 0, 8
-; GFX11-DL-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v6.l
-; GFX11-DL-TRUE16-NEXT:    v_lshrrev_b32_e32 v6, 24, v3
+; GFX11-DL-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v4
+; GFX11-DL-TRUE16-NEXT:    v_bfe_i32 v7, v4, 0, 8
+; GFX11-DL-TRUE16-NEXT:    v_mov_b16_e32 v8.l, v4.h
+; GFX11-DL-TRUE16-NEXT:    v_lshrrev_b32_e32 v4, 24, v4
+; GFX11-DL-TRUE16-NEXT:    v_bfe_i32 v2, v0, 0, 8
+; GFX11-DL-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v1.l
+; GFX11-DL-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v3.l
+; GFX11-DL-TRUE16-NEXT:    v_bfe_i32 v8, v8, 0, 8
+; GFX11-DL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-DL-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v2.l
+; GFX11-DL-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v7.l
 ; GFX11-DL-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-DL-TRUE16-NEXT:    v_mad_u16 v0.l, v0.h, v1.l, v0.l
-; GFX11-DL-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v3.h
-; GFX11-DL-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v7.l
-; GFX11-DL-TRUE16-NEXT:    v_lshrrev_b32_e32 v3, 24, v4
-; GFX11-DL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-DL-TRUE16-NEXT:    v_mad_u16 v0.l, v2.l, v1.h, v0.l
+; GFX11-DL-TRUE16-NEXT:    v_mad_u16 v0.l, v0.l, v1.l, v6.l
+; GFX11-DL-TRUE16-NEXT:    v_lshrrev_b32_e32 v6, 24, v3
+; GFX11-DL-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v3.h
+; GFX11-DL-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v8.l
+; GFX11-DL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-DL-TRUE16-NEXT:    v_mad_u16 v0.l, v2.l, v0.h, v0.l
 ; GFX11-DL-TRUE16-NEXT:    v_bfe_i32 v2, v6, 0, 8
-; GFX11-DL-TRUE16-NEXT:    v_mad_u16 v0.l, v1.l, v0.h, v0.l
-; GFX11-DL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-DL-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v2.l
+; GFX11-DL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-DL-TRUE16-NEXT:    v_mad_u16 v0.l, v3.l, v1.l, v0.l
+; GFX11-DL-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v2.l
+; GFX11-DL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-DL-TRUE16-NEXT:    v_mad_u16 v0.l, v4.l, v1.l, v0.l
 ; GFX11-DL-TRUE16-NEXT:    global_store_b16 v5, v0, s[4:5]
 ; GFX11-DL-TRUE16-NEXT:    s_endpgm
 ;
@@ -2438,38 +2438,39 @@ define amdgpu_kernel void @udot4_acc16_vecMul(ptr addrspace(1) %src1,
 ; GFX11-DL-TRUE16-LABEL: udot4_acc16_vecMul:
 ; GFX11-DL-TRUE16:       ; %bb.0: ; %entry
 ; GFX11-DL-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-DL-TRUE16-NEXT:    v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v0, 0x3ff, v0
+; GFX11-DL-TRUE16-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
 ; GFX11-DL-TRUE16-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
-; GFX11-DL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-DL-TRUE16-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-DL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-DL-TRUE16-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX11-DL-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-DL-TRUE16-NEXT:    s_clause 0x1
 ; GFX11-DL-TRUE16-NEXT:    global_load_b32 v1, v0, s[0:1]
-; GFX11-DL-TRUE16-NEXT:    global_load_b32 v2, v0, s[2:3]
-; GFX11-DL-TRUE16-NEXT:    global_load_d16_b16 v0, v3, s[4:5]
+; GFX11-DL-TRUE16-NEXT:    global_load_b32 v0, v0, s[2:3]
+; GFX11-DL-TRUE16-NEXT:    global_load_u16 v3, v2, s[4:5]
 ; GFX11-DL-TRUE16-NEXT:    s_waitcnt vmcnt(2)
 ; GFX11-DL-TRUE16-NEXT:    v_lshrrev_b16 v4.h, 8, v1.l
 ; GFX11-DL-TRUE16-NEXT:    s_waitcnt vmcnt(1)
-; GFX11-DL-TRUE16-NEXT:    v_lshrrev_b16 v5.h, 8, v2.l
+; GFX11-DL-TRUE16-NEXT:    v_lshrrev_b16 v5.h, 8, v0.l
 ; GFX11-DL-TRUE16-NEXT:    v_and_b16 v4.l, 0xff, v1.l
-; GFX11-DL-TRUE16-NEXT:    v_and_b16 v5.l, 0xff, v2.l
-; GFX11-DL-TRUE16-NEXT:    v_lshrrev_b32_e32 v6, 24, v2
+; GFX11-DL-TRUE16-NEXT:    v_and_b16 v5.l, 0xff, v0.l
+; GFX11-DL-TRUE16-NEXT:    v_lshrrev_b32_e32 v6, 24, v0
 ; GFX11-DL-TRUE16-NEXT:    v_lshrrev_b32_e32 v7, 24, v1
 ; GFX11-DL-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v1.h
-; GFX11-DL-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v2.h
+; GFX11-DL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
 ; GFX11-DL-TRUE16-NEXT:    v_pk_mul_lo_u16 v4, v4, v5
-; GFX11-DL-TRUE16-NEXT:    v_mov_b16_e32 v2.h, v6.l
+; GFX11-DL-TRUE16-NEXT:    v_and_b16 v5.l, 0xff, v0.h
+; GFX11-DL-TRUE16-NEXT:    v_mov_b16_e32 v5.h, v6.l
 ; GFX11-DL-TRUE16-NEXT:    v_mov_b16_e32 v1.h, v7.l
 ; GFX11-DL-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-DL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-DL-TRUE16-NEXT:    v_add_nc_u16 v0.l, v4.l, v0.l
-; GFX11-DL-TRUE16-NEXT:    v_pk_mul_lo_u16 v1, v1, v2
-; GFX11-DL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-DL-TRUE16-NEXT:    v_add_nc_u16 v0.l, v4.l, v3.l
+; GFX11-DL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-DL-TRUE16-NEXT:    v_pk_mul_lo_u16 v1, v1, v5
 ; GFX11-DL-TRUE16-NEXT:    v_add_nc_u16 v0.l, v0.l, v4.h
+; GFX11-DL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-DL-TRUE16-NEXT:    v_add_nc_u16 v0.l, v0.l, v1.l
-; GFX11-DL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-DL-TRUE16-NEXT:    v_add_nc_u16 v0.l, v0.l, v1.h
-; GFX11-DL-TRUE16-NEXT:    global_store_b16 v3, v0, s[4:5]
+; GFX11-DL-TRUE16-NEXT:    global_store_b16 v2, v0, s[4:5]
 ; GFX11-DL-TRUE16-NEXT:    s_endpgm
 ;
 ; GFX11-DL-FAKE16-LABEL: udot4_acc16_vecMul:
@@ -2713,44 +2714,46 @@ define amdgpu_kernel void @udot4_acc8_vecMul(ptr addrspace(1) %src1,
 ; GFX11-DL-TRUE16-LABEL: udot4_acc8_vecMul:
 ; GFX11-DL-TRUE16:       ; %bb.0: ; %entry
 ; GFX11-DL-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-DL-TRUE16-NEXT:    v_dual_mov_b32 v5, 0 :: v_dual_and_b32 v0, 0x3ff, v0
+; GFX11-DL-TRUE16-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
 ; GFX11-DL-TRUE16-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
-; GFX11-DL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-DL-TRUE16-NEXT:    v_mov_b32_e32 v4, 0
+; GFX11-DL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-DL-TRUE16-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX11-DL-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-DL-TRUE16-NEXT:    s_clause 0x1
-; GFX11-DL-TRUE16-NEXT:    global_load_b32 v3, v0, s[0:1]
-; GFX11-DL-TRUE16-NEXT:    global_load_b32 v4, v0, s[2:3]
-; GFX11-DL-TRUE16-NEXT:    global_load_d16_u8 v0, v5, s[4:5]
+; GFX11-DL-TRUE16-NEXT:    global_load_b32 v2, v0, s[0:1]
+; GFX11-DL-TRUE16-NEXT:    global_load_b32 v3, v0, s[2:3]
+; GFX11-DL-TRUE16-NEXT:    global_load_u8 v5, v4, s[4:5]
 ; GFX11-DL-TRUE16-NEXT:    s_waitcnt vmcnt(2)
-; GFX11-DL-TRUE16-NEXT:    v_lshrrev_b32_e32 v2, 24, v3
+; GFX11-DL-TRUE16-NEXT:    v_lshrrev_b32_e32 v6, 24, v2
 ; GFX11-DL-TRUE16-NEXT:    s_waitcnt vmcnt(1)
-; GFX11-DL-TRUE16-NEXT:    v_lshrrev_b32_e32 v6, 24, v4
-; GFX11-DL-TRUE16-NEXT:    v_lshrrev_b16 v0.h, 8, v3.l
-; GFX11-DL-TRUE16-NEXT:    v_mul_lo_u16 v1.l, v3.h, v4.h
-; GFX11-DL-TRUE16-NEXT:    v_lshrrev_b16 v1.h, 8, v4.l
-; GFX11-DL-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-DL-TRUE16-NEXT:    v_mad_u16 v0.l, v3.l, v4.l, v0.l
-; GFX11-DL-TRUE16-NEXT:    v_mul_lo_u16 v2.l, v2.l, v6.l
+; GFX11-DL-TRUE16-NEXT:    v_lshrrev_b32_e32 v7, 24, v3
+; GFX11-DL-TRUE16-NEXT:    v_lshrrev_b16 v0.l, 8, v2.l
+; GFX11-DL-TRUE16-NEXT:    v_mul_lo_u16 v0.h, v2.h, v3.h
+; GFX11-DL-TRUE16-NEXT:    v_lshrrev_b16 v1.l, 8, v3.l
+; GFX11-DL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-DL-TRUE16-NEXT:    v_mul_lo_u16 v1.h, v6.l, v7.l
 ; GFX11-DL-TRUE16-NEXT:    v_mov_b16_e32 v6.l, 0
-; GFX11-DL-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v1.l
-; GFX11-DL-TRUE16-NEXT:    v_mul_lo_u16 v0.h, v0.h, v1.h
+; GFX11-DL-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v0.h
 ; GFX11-DL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-DL-TRUE16-NEXT:    v_lshlrev_b16 v2.l, 8, v2.l
+; GFX11-DL-TRUE16-NEXT:    v_mul_lo_u16 v0.l, v0.l, v1.l
+; GFX11-DL-TRUE16-NEXT:    v_lshlrev_b16 v1.l, 8, v1.h
+; GFX11-DL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX11-DL-TRUE16-NEXT:    v_mov_b16_e32 v7.h, v6.l
-; GFX11-DL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-DL-TRUE16-NEXT:    v_lshlrev_b16 v7.l, 8, v0.h
-; GFX11-DL-TRUE16-NEXT:    v_or_b16 v6.h, v1.l, v2.l
-; GFX11-DL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-DL-TRUE16-NEXT:    v_or_b32_e32 v1, v7, v6
+; GFX11-DL-TRUE16-NEXT:    v_lshlrev_b16 v7.l, 8, v0.l
+; GFX11-DL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-DL-TRUE16-NEXT:    v_or_b16 v6.h, v0.h, v1.l
 ; GFX11-DL-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
-; GFX11-DL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-DL-TRUE16-NEXT:    v_add_nc_u16 v0.l, v0.l, v1.l
-; GFX11-DL-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v2
-; GFX11-DL-TRUE16-NEXT:    v_mad_u16 v0.l, v3.h, v4.h, v0.l
-; GFX11-DL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-DL-TRUE16-NEXT:    v_or_b32_e32 v0, v7, v6
+; GFX11-DL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-DL-TRUE16-NEXT:    v_lshrrev_b32_e32 v6, 8, v0
+; GFX11-DL-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-DL-TRUE16-NEXT:    v_mad_u16 v0.l, v2.l, v3.l, v5.l
+; GFX11-DL-TRUE16-NEXT:    v_add_nc_u16 v0.l, v0.l, v6.l
+; GFX11-DL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-DL-TRUE16-NEXT:    v_mad_u16 v0.l, v2.h, v3.h, v0.l
 ; GFX11-DL-TRUE16-NEXT:    v_add_nc_u16 v0.l, v0.l, v1.l
-; GFX11-DL-TRUE16-NEXT:    global_store_b8 v5, v0, s[4:5]
+; GFX11-DL-TRUE16-NEXT:    global_store_b8 v4, v0, s[4:5]
 ; GFX11-DL-TRUE16-NEXT:    s_endpgm
 ;
 ; GFX11-DL-FAKE16-LABEL: udot4_acc8_vecMul:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.bf16.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.bf16.bf16.ll
index 4419b8c6f9862..57db2c94ce908 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.bf16.bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.bf16.bf16.ll
@@ -13,7 +13,7 @@ define amdgpu_kernel void @test_llvm_amdgcn_fdot2_bf16_bf16(
 ; SDAG-GFX11-TRUE16-NEXT:    s_load_b256 s[0:7], s[4:5], 0x24
 ; SDAG-GFX11-TRUE16-NEXT:    v_mov_b32_e32 v1, 0
 ; SDAG-GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-TRUE16-NEXT:    global_load_d16_b16 v0, v1, s[6:7]
+; SDAG-GFX11-TRUE16-NEXT:    global_load_u16 v0, v1, s[6:7]
 ; SDAG-GFX11-TRUE16-NEXT:    s_load_b32 s2, s[2:3], 0x0
 ; SDAG-GFX11-TRUE16-NEXT:    s_load_b32 s3, s[4:5], 0x0
 ; SDAG-GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -51,14 +51,14 @@ define amdgpu_kernel void @test_llvm_amdgcn_fdot2_bf16_bf16_dpp(
 ; SDAG-GFX11-TRUE16:       ; %bb.0: ; %entry
 ; SDAG-GFX11-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; SDAG-GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-TRUE16-NEXT:    scratch_load_b32 v1, off, s1
-; SDAG-GFX11-TRUE16-NEXT:    scratch_load_b32 v2, off, s2
-; SDAG-GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v0, off, s3
+; SDAG-GFX11-TRUE16-NEXT:    scratch_load_b32 v0, off, s1
+; SDAG-GFX11-TRUE16-NEXT:    scratch_load_b32 v1, off, s2
+; SDAG-GFX11-TRUE16-NEXT:    scratch_load_u16 v2, off, s3
 ; SDAG-GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(2)
-; SDAG-GFX11-TRUE16-NEXT:    v_mov_b32_dpp v1, v1 quad_perm:[1,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1
+; SDAG-GFX11-TRUE16-NEXT:    v_mov_b32_dpp v0, v0 quad_perm:[1,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; SDAG-GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX11-TRUE16-NEXT:    v_dot2_bf16_bf16 v0.l, v1, v2, v0.l
+; SDAG-GFX11-TRUE16-NEXT:    v_dot2_bf16_bf16 v0.l, v0, v1, v2.l
 ; SDAG-GFX11-TRUE16-NEXT:    scratch_store_b16 off, v0, s0
 ; SDAG-GFX11-TRUE16-NEXT:    s_endpgm
 ;
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f16.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f16.f16.ll
index 0194d25a99cdc..61941e29495cc 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f16.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f16.f16.ll
@@ -1,7 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck %s --check-prefixes=GFX11,SDAG-GFX11,SDAG-GFX11-TRUE16
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck %s --check-prefixes=GFX11,SDAG-GFX11,SDAG-GFX11-FAKE16
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck %s --check-prefixes=GFX11,GISEL-GFX11,GISEL-GFX11-TRUE16
+; FIXME-TRUE16 enable gisel
+; XUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck %s --check-prefixes=GFX11,GISEL-GFX11,GISEL-GFX11-TRUE16
 ; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck %s --check-prefixes=GFX11,GISEL-GFX11,GISEL-GFX11-FAKE16
 
 declare half @llvm.amdgcn.fdot2.f16.f16(<2 x half> %a, <2 x half> %b, half %c)
@@ -12,7 +13,7 @@ define amdgpu_kernel void @test_llvm_amdgcn_fdot2_f16_f16(
 ; SDAG-GFX11-TRUE16-NEXT:    s_load_b256 s[0:7], s[4:5], 0x24
 ; SDAG-GFX11-TRUE16-NEXT:    v_mov_b32_e32 v1, 0
 ; SDAG-GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-TRUE16-NEXT:    global_load_d16_b16 v0, v1, s[6:7]
+; SDAG-GFX11-TRUE16-NEXT:    global_load_u16 v0, v1, s[6:7]
 ; SDAG-GFX11-TRUE16-NEXT:    s_load_b32 s2, s[2:3], 0x0
 ; SDAG-GFX11-TRUE16-NEXT:    s_load_b32 s3, s[4:5], 0x0
 ; SDAG-GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -33,6 +34,18 @@ define amdgpu_kernel void @test_llvm_amdgcn_fdot2_f16_f16(
 ; SDAG-GFX11-FAKE16-NEXT:    global_store_b16 v0, v1, s[0:1]
 ; SDAG-GFX11-FAKE16-NEXT:    s_endpgm
 ;
+; GISEL-GFX11-LABEL: test_llvm_amdgcn_fdot2_f16_f16:
+; GISEL-GFX11:       ; %bb.0: ; %entry
+; GISEL-GFX11-NEXT:    s_load_b256 s[0:7], s[4:5], 0x24
+; GISEL-GFX11-NEXT:    v_mov_b32_e32 v0, 0
+; GISEL-GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GISEL-GFX11-NEXT:    global_load_u16 v1, v0, s[6:7]
+; GISEL-GFX11-NEXT:    s_load_b32 s2, s[2:3], 0x0
+; GISEL-GFX11-NEXT:    s_load_b32 s3, s[4:5], 0x0
+; GISEL-GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GISEL-GFX11-NEXT:    v_dot2_f16_f16 v1, s2, s3, v1
+; GISEL-GFX11-NEXT:    global_store_b16 v0, v1, s[0:1]
+; GISEL-GFX11-NEXT:    s_endpgm
 ; GISEL-GFX11-TRUE16-LABEL: test_llvm_amdgcn_fdot2_f16_f16:
 ; GISEL-GFX11-TRUE16:       ; %bb.0: ; %entry
 ; GISEL-GFX11-TRUE16-NEXT:    s_load_b256 s[0:7], s[4:5], 0x24
@@ -45,19 +58,6 @@ define amdgpu_kernel void @test_llvm_amdgcn_fdot2_f16_f16(
 ; GISEL-GFX11-TRUE16-NEXT:    v_dot2_f16_f16 v0.l, s2, s3, v0.l
 ; GISEL-GFX11-TRUE16-NEXT:    global_store_b16 v1, v0, s[0:1]
 ; GISEL-GFX11-TRUE16-NEXT:    s_endpgm
-;
-; GISEL-GFX11-FAKE16-LABEL: test_llvm_amdgcn_fdot2_f16_f16:
-; GISEL-GFX11-FAKE16:       ; %bb.0: ; %entry
-; GISEL-GFX11-FAKE16-NEXT:    s_load_b256 s[0:7], s[4:5], 0x24
-; GISEL-GFX11-FAKE16-NEXT:    v_mov_b32_e32 v0, 0
-; GISEL-GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-FAKE16-NEXT:    global_load_u16 v1, v0, s[6:7]
-; GISEL-GFX11-FAKE16-NEXT:    s_load_b32 s2, s[2:3], 0x0
-; GISEL-GFX11-FAKE16-NEXT:    s_load_b32 s3, s[4:5], 0x0
-; GISEL-GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GISEL-GFX11-FAKE16-NEXT:    v_dot2_f16_f16 v1, s2, s3, v1
-; GISEL-GFX11-FAKE16-NEXT:    global_store_b16 v0, v1, s[0:1]
-; GISEL-GFX11-FAKE16-NEXT:    s_endpgm
     ptr addrspace(1) %r,
     ptr addrspace(1) %a,
     ptr addrspace(1) %b,
@@ -76,14 +76,14 @@ define amdgpu_kernel void @test_llvm_amdgcn_fdot2_f16_f16_dpp(
 ; SDAG-GFX11-TRUE16:       ; %bb.0: ; %entry
 ; SDAG-GFX11-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; SDAG-GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-TRUE16-NEXT:    scratch_load_b32 v1, off, s1
-; SDAG-GFX11-TRUE16-NEXT:    scratch_load_b32 v2, off, s2
-; SDAG-GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v0, off, s3
+; SDAG-GFX11-TRUE16-NEXT:    scratch_load_b32 v0, off, s1
+; SDAG-GFX11-TRUE16-NEXT:    scratch_load_b32 v1, off, s2
+; SDAG-GFX11-TRUE16-NEXT:    scratch_load_u16 v2, off, s3
 ; SDAG-GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(2)
-; SDAG-GFX11-TRUE16-NEXT:    v_mov_b32_dpp v1, v1 quad_perm:[1,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1
+; SDAG-GFX11-TRUE16-NEXT:    v_mov_b32_dpp v0, v0 quad_perm:[1,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; SDAG-GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX11-TRUE16-NEXT:    v_dot2_f16_f16 v0.l, v1, v2, v0.l
+; SDAG-GFX11-TRUE16-NEXT:    v_dot2_f16_f16 v0.l, v0, v1, v2.l
 ; SDAG-GFX11-TRUE16-NEXT:    scratch_store_b16 off, v0, s0
 ; SDAG-GFX11-TRUE16-NEXT:    s_endpgm
 ;
@@ -99,6 +99,17 @@ define amdgpu_kernel void @test_llvm_amdgcn_fdot2_f16_f16_dpp(
 ; SDAG-GFX11-FAKE16-NEXT:    scratch_store_b16 off, v0, s0
 ; SDAG-GFX11-FAKE16-NEXT:    s_endpgm
 ;
+; GISEL-GFX11-LABEL: test_llvm_amdgcn_fdot2_f16_f16_dpp:
+; GISEL-GFX11:       ; %bb.0: ; %entry
+; GISEL-GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GISEL-GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GISEL-GFX11-NEXT:    scratch_load_b32 v0, off, s1
+; GISEL-GFX11-NEXT:    scratch_load_b32 v1, off, s2
+; GISEL-GFX11-NEXT:    scratch_load_u16 v2, off, s3
+; GISEL-GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-GFX11-NEXT:    v_dot2_f16_f16_e64_dpp v0, v0, v1, v2 quad_perm:[1,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GISEL-GFX11-NEXT:    scratch_store_b16 off, v0, s0
+; GISEL-GFX11-NEXT:    s_endpgm
 ; GISEL-GFX11-TRUE16-LABEL: test_llvm_amdgcn_fdot2_f16_f16_dpp:
 ; GISEL-GFX11-TRUE16:       ; %bb.0: ; %entry
 ; GISEL-GFX11-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
@@ -113,18 +124,6 @@ define amdgpu_kernel void @test_llvm_amdgcn_fdot2_f16_f16_dpp(
 ; GISEL-GFX11-TRUE16-NEXT:    v_dot2_f16_f16 v0.l, v1, v2, v0.l
 ; GISEL-GFX11-TRUE16-NEXT:    scratch_store_b16 off, v0, s0
 ; GISEL-GFX11-TRUE16-NEXT:    s_endpgm
-;
-; GISEL-GFX11-FAKE16-LABEL: test_llvm_amdgcn_fdot2_f16_f16_dpp:
-; GISEL-GFX11-FAKE16:       ; %bb.0: ; %entry
-; GISEL-GFX11-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GISEL-GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-FAKE16-NEXT:    scratch_load_b32 v0, off, s1
-; GISEL-GFX11-FAKE16-NEXT:    scratch_load_b32 v1, off, s2
-; GISEL-GFX11-FAKE16-NEXT:    scratch_load_u16 v2, off, s3
-; GISEL-GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
-; GISEL-GFX11-FAKE16-NEXT:    v_dot2_f16_f16_e64_dpp v0, v0, v1, v2 quad_perm:[1,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GISEL-GFX11-FAKE16-NEXT:    scratch_store_b16 off, v0, s0
-; GISEL-GFX11-FAKE16-NEXT:    s_endpgm
     ptr addrspace(5) %r,
     ptr addrspace(5) %a,
     ptr addrspace(5) %b,
@@ -144,5 +143,5 @@ entry:
 declare i32 @llvm.amdgcn.update.dpp.i32(i32, i32, i32, i32, i32, i1)
 ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
 ; GFX11: {{.*}}
-; GISEL-GFX11: {{.*}}
+; GISEL-GFX11-FAKE16: {{.*}}
 ; SDAG-GFX11: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll
index 7d63e22d84b72..47693767e7d6c 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll
@@ -76,7 +76,7 @@ define amdgpu_kernel void @cos_f16(ptr addrspace(1) %r, ptr addrspace(1) %a) {
 ; GFX11-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT:    global_load_d16_b16 v0, v1, s[2:3]
+; GFX11-TRUE16-NEXT:    global_load_u16 v0, v1, s[2:3]
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-TRUE16-NEXT:    v_mul_f16_e32 v0.l, 0.15915494, v0.l
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll
index ba03115c51536..86fcb29776240 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll
@@ -76,7 +76,7 @@ define amdgpu_kernel void @sin_f16(ptr addrspace(1) %r, ptr addrspace(1) %a) {
 ; GFX11-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT:    global_load_d16_b16 v0, v1, s[2:3]
+; GFX11-TRUE16-NEXT:    global_load_u16 v0, v1, s[2:3]
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-TRUE16-NEXT:    v_mul_f16_e32 v0.l, 0.15915494, v0.l
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
diff --git a/llvm/test/CodeGen/AMDGPU/mad.u16.ll b/llvm/test/CodeGen/AMDGPU/mad.u16.ll
index fbf8011fd40c9..3915ece69f366 100644
--- a/llvm/test/CodeGen/AMDGPU/mad.u16.ll
+++ b/llvm/test/CodeGen/AMDGPU/mad.u16.ll
@@ -69,18 +69,18 @@ define amdgpu_kernel void @mad_u16(
 ; GFX11-TRUE16-LABEL: mad_u16:
 ; GFX11-TRUE16:       ; %bb.0: ; %entry
 ; GFX11-TRUE16-NEXT:    s_load_b256 s[0:7], s[4:5], 0x24
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v0, 0x3ff, v0
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v1, 1, v0
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT:    global_load_d16_b16 v0, v1, s[2:3] glc dlc
+; GFX11-TRUE16-NEXT:    global_load_u16 v1, v0, s[2:3] glc dlc
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    global_load_d16_hi_b16 v0, v1, s[4:5] glc dlc
+; GFX11-TRUE16-NEXT:    global_load_u16 v2, v0, s[4:5] glc dlc
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    global_load_d16_b16 v1, v1, s[6:7] glc dlc
+; GFX11-TRUE16-NEXT:    global_load_u16 v0, v0, s[6:7] glc dlc
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    v_mad_u16 v0.l, v0.l, v0.h, v1.l
-; GFX11-TRUE16-NEXT:    global_store_b16 v2, v0, s[0:1]
+; GFX11-TRUE16-NEXT:    v_mad_u16 v0.l, v1.l, v2.l, v0.l
+; GFX11-TRUE16-NEXT:    global_store_b16 v3, v0, s[0:1]
 ; GFX11-TRUE16-NEXT:    s_endpgm
 ;
 ; GFX11-FAKE16-LABEL: mad_u16:
diff --git a/llvm/test/CodeGen/AMDGPU/min.ll b/llvm/test/CodeGen/AMDGPU/min.ll
index 311527d5d04cc..39d73a293647a 100644
--- a/llvm/test/CodeGen/AMDGPU/min.ll
+++ b/llvm/test/CodeGen/AMDGPU/min.ll
@@ -1458,10 +1458,10 @@ define amdgpu_kernel void @v_test_imin_slt_i16(ptr addrspace(1) %out, ptr addrsp
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 1, v0
 ; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-TRUE16-NEXT:    s_clause 0x1
-; GFX11-TRUE16-NEXT:    global_load_d16_b16 v0, v1, s[2:3]
-; GFX11-TRUE16-NEXT:    global_load_d16_hi_b16 v0, v1, s[4:5]
+; GFX11-TRUE16-NEXT:    global_load_u16 v0, v1, s[2:3]
+; GFX11-TRUE16-NEXT:    global_load_u16 v2, v1, s[4:5]
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    v_min_i16 v0.l, v0.l, v0.h
+; GFX11-TRUE16-NEXT:    v_min_i16 v0.l, v0.l, v2.l
 ; GFX11-TRUE16-NEXT:    global_store_b16 v1, v0, s[0:1]
 ; GFX11-TRUE16-NEXT:    s_endpgm
 ;
@@ -2747,10 +2747,10 @@ define amdgpu_kernel void @v_test_umin_ult_i8(ptr addrspace(1) %out, ptr addrspa
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0x3ff, v0
 ; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-TRUE16-NEXT:    s_clause 0x1
-; GFX11-TRUE16-NEXT:    global_load_d16_u8 v0, v1, s[2:3]
-; GFX11-TRUE16-NEXT:    global_load_d16_hi_u8 v0, v1, s[4:5]
+; GFX11-TRUE16-NEXT:    global_load_u8 v0, v1, s[2:3]
+; GFX11-TRUE16-NEXT:    global_load_u8 v2, v1, s[4:5]
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    v_min_u16 v0.l, v0.l, v0.h
+; GFX11-TRUE16-NEXT:    v_min_u16 v0.l, v0.l, v2.l
 ; GFX11-TRUE16-NEXT:    global_store_b8 v1, v0, s[0:1]
 ; GFX11-TRUE16-NEXT:    s_endpgm
 ;
@@ -3176,47 +3176,26 @@ define amdgpu_kernel void @v_test_umin_ult_i16_multi_use(ptr addrspace(1) %out0,
 ; GFX10-NEXT:    global_store_byte v0, v2, s[2:3]
 ; GFX10-NEXT:    s_endpgm
 ;
-; GFX11-TRUE16-LABEL: v_test_umin_ult_i16_multi_use:
-; GFX11-TRUE16:       ; %bb.0:
-; GFX11-TRUE16-NEXT:    s_load_b256 s[0:7], s[4:5], 0x0
-; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v0, 0
-; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT:    s_clause 0x1
-; GFX11-TRUE16-NEXT:    global_load_d16_b16 v1, v0, s[6:7]
-; GFX11-TRUE16-NEXT:    global_load_d16_b16 v2, v0, s[4:5]
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(1)
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v3, 0xffff, v1
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v4, 0xffff, v2
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_cmp_lt_u32_e32 vcc_lo, v4, v3
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc_lo
-; GFX11-TRUE16-NEXT:    s_clause 0x1
-; GFX11-TRUE16-NEXT:    global_store_b16 v0, v1, s[0:1]
-; GFX11-TRUE16-NEXT:    global_store_b8 v0, v2, s[2:3]
-; GFX11-TRUE16-NEXT:    s_endpgm
-;
-; GFX11-FAKE16-LABEL: v_test_umin_ult_i16_multi_use:
-; GFX11-FAKE16:       ; %bb.0:
-; GFX11-FAKE16-NEXT:    s_load_b256 s[0:7], s[4:5], 0x0
-; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v0, 0
-; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-FAKE16-NEXT:    s_clause 0x1
-; GFX11-FAKE16-NEXT:    global_load_u16 v1, v0, s[6:7]
-; GFX11-FAKE16-NEXT:    global_load_u16 v2, v0, s[4:5]
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(1)
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xffff, v1
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xffff, v2
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_cmp_lt_u32_e32 vcc_lo, v4, v3
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc_lo
-; GFX11-FAKE16-NEXT:    s_clause 0x1
-; GFX11-FAKE16-NEXT:    global_store_b16 v0, v1, s[0:1]
-; GFX11-FAKE16-NEXT:    global_store_b8 v0, v2, s[2:3]
-; GFX11-FAKE16-NEXT:    s_endpgm
+; GFX11-LABEL: v_test_umin_ult_i16_multi_use:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_load_b256 s[0:7], s[4:5], 0x0
+; GFX11-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    global_load_u16 v1, v0, s[6:7]
+; GFX11-NEXT:    global_load_u16 v2, v0, s[4:5]
+; GFX11-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff, v1
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_and_b32_e32 v4, 0xffff, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_cmp_lt_u32_e32 vcc_lo, v4, v3
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc_lo
+; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc_lo
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    global_store_b16 v0, v1, s[0:1]
+; GFX11-NEXT:    global_store_b8 v0, v2, s[2:3]
+; GFX11-NEXT:    s_endpgm
 ;
 ; GFX1250-LABEL: v_test_umin_ult_i16_multi_use:
 ; GFX1250:       ; %bb.0:
diff --git a/llvm/test/CodeGen/AMDGPU/offset-split-flat.ll b/llvm/test/CodeGen/AMDGPU/offset-split-flat.ll
index a3c38b17abf00..08a9ea985e459 100644
--- a/llvm/test/CodeGen/AMDGPU/offset-split-flat.ll
+++ b/llvm/test/CodeGen/AMDGPU/offset-split-flat.ll
@@ -32,19 +32,12 @@ define i8 @flat_inst_valu_offset_1(ptr %p) {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-TRUE16-LABEL: flat_inst_valu_offset_1:
-; GFX11-SDAG-TRUE16:       ; %bb.0:
-; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    flat_load_d16_u8 v0, v[0:1] offset:1
-; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-SDAG-FAKE16-LABEL: flat_inst_valu_offset_1:
-; GFX11-SDAG-FAKE16:       ; %bb.0:
-; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT:    flat_load_u8 v0, v[0:1] offset:1
-; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-LABEL: flat_inst_valu_offset_1:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    flat_load_u8 v0, v[0:1] offset:1
+; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-SDAG-TRUE16-LABEL: flat_inst_valu_offset_1:
 ; GFX12-SDAG-TRUE16:       ; %bb.0:
@@ -68,13 +61,6 @@ define i8 @flat_inst_valu_offset_1(ptr %p) {
 ; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-GISEL-LABEL: flat_inst_valu_offset_1:
-; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    flat_load_u8 v0, v[0:1] offset:1
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX12-GISEL-LABEL: flat_inst_valu_offset_1:
 ; GFX12-GISEL:       ; %bb.0:
 ; GFX12-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
@@ -107,19 +93,12 @@ define i8 @flat_inst_valu_offset_11bit_max(ptr %p) {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-TRUE16-LABEL: flat_inst_valu_offset_11bit_max:
-; GFX11-SDAG-TRUE16:       ; %bb.0:
-; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    flat_load_d16_u8 v0, v[0:1] offset:2047
-; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-SDAG-FAKE16-LABEL: flat_inst_valu_offset_11bit_max:
-; GFX11-SDAG-FAKE16:       ; %bb.0:
-; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT:    flat_load_u8 v0, v[0:1] offset:2047
-; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-LABEL: flat_inst_valu_offset_11bit_max:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    flat_load_u8 v0, v[0:1] offset:2047
+; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-SDAG-TRUE16-LABEL: flat_inst_valu_offset_11bit_max:
 ; GFX12-SDAG-TRUE16:       ; %bb.0:
@@ -143,13 +122,6 @@ define i8 @flat_inst_valu_offset_11bit_max(ptr %p) {
 ; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-GISEL-LABEL: flat_inst_valu_offset_11bit_max:
-; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    flat_load_u8 v0, v[0:1] offset:2047
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX12-GISEL-LABEL: flat_inst_valu_offset_11bit_max:
 ; GFX12-GISEL:       ; %bb.0:
 ; GFX12-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
@@ -182,19 +154,12 @@ define i8 @flat_inst_valu_offset_12bit_max(ptr %p) {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-TRUE16-LABEL: flat_inst_valu_offset_12bit_max:
-; GFX11-SDAG-TRUE16:       ; %bb.0:
-; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    flat_load_d16_u8 v0, v[0:1] offset:4095
-; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-SDAG-FAKE16-LABEL: flat_inst_valu_offset_12bit_max:
-; GFX11-SDAG-FAKE16:       ; %bb.0:
-; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT:    flat_load_u8 v0, v[0:1] offset:4095
-; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-LABEL: flat_inst_valu_offset_12bit_max:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    flat_load_u8 v0, v[0:1] offset:4095
+; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-SDAG-TRUE16-LABEL: flat_inst_valu_offset_12bit_max:
 ; GFX12-SDAG-TRUE16:       ; %bb.0:
@@ -218,13 +183,6 @@ define i8 @flat_inst_valu_offset_12bit_max(ptr %p) {
 ; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-GISEL-LABEL: flat_inst_valu_offset_12bit_max:
-; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    flat_load_u8 v0, v[0:1] offset:4095
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX12-GISEL-LABEL: flat_inst_valu_offset_12bit_max:
 ; GFX12-GISEL:       ; %bb.0:
 ; GFX12-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
@@ -259,25 +217,15 @@ define i8 @flat_inst_valu_offset_13bit_max(ptr %p) {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-TRUE16-LABEL: flat_inst_valu_offset_13bit_max:
-; GFX11-SDAG-TRUE16:       ; %bb.0:
-; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0x1000, v0
-; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT:    flat_load_d16_u8 v0, v[0:1] offset:4095
-; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-SDAG-FAKE16-LABEL: flat_inst_valu_offset_13bit_max:
-; GFX11-SDAG-FAKE16:       ; %bb.0:
-; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0x1000, v0
-; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-FAKE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX11-SDAG-FAKE16-NEXT:    flat_load_u8 v0, v[0:1] offset:4095
-; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-LABEL: flat_inst_valu_offset_13bit_max:
+; GFX11-SDAG:       ; %bb.0:
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0x1000, v0
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX11-SDAG-NEXT:    flat_load_u8 v0, v[0:1] offset:4095
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-SDAG-TRUE16-LABEL: flat_inst_valu_offset_13bit_max:
 ; GFX12-SDAG-TRUE16:       ; %bb.0:
@@ -354,25 +302,15 @@ define i8 @flat_inst_valu_offset_24bit_max(ptr %p) {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-TRUE16-LABEL: flat_inst_valu_offset_24bit_max:
-; GFX11-SDAG-TRUE16:       ; %bb.0:
-; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0x7ff000, v0
-; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT:    flat_load_d16_u8 v0, v[0:1] offset:4095
-; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-SDAG-FAKE16-LABEL: flat_inst_valu_offset_24bit_max:
-; GFX11-SDAG-FAKE16:       ; %bb.0:
-; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0x7ff000, v0
-; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-FAKE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX11-SDAG-FAKE16-NEXT:    flat_load_u8 v0, v[0:1] offset:4095
-; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-LABEL: flat_inst_valu_offset_24bit_max:
+; GFX11-SDAG:       ; %bb.0:
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0x7ff000, v0
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX11-SDAG-NEXT:    flat_load_u8 v0, v[0:1] offset:4095
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-SDAG-TRUE16-LABEL: flat_inst_valu_offset_24bit_max:
 ; GFX12-SDAG-TRUE16:       ; %bb.0:
@@ -449,25 +387,15 @@ define i8 @flat_inst_valu_offset_neg_11bit_max(ptr %p) {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-TRUE16-LABEL: flat_inst_valu_offset_neg_11bit_max:
-; GFX11-SDAG-TRUE16:       ; %bb.0:
-; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0xfffff800, v0
-; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT:    v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT:    flat_load_d16_u8 v0, v[0:1]
-; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-SDAG-FAKE16-LABEL: flat_inst_valu_offset_neg_11bit_max:
-; GFX11-SDAG-FAKE16:       ; %bb.0:
-; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0xfffff800, v0
-; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-FAKE16-NEXT:    v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
-; GFX11-SDAG-FAKE16-NEXT:    flat_load_u8 v0, v[0:1]
-; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-LABEL: flat_inst_valu_offset_neg_11bit_max:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, 0xfffff800, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
+; GFX11-NEXT:    flat_load_u8 v0, v[0:1]
+; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-SDAG-TRUE16-LABEL: flat_inst_valu_offset_neg_11bit_max:
 ; GFX12-SDAG-TRUE16:       ; %bb.0:
@@ -491,16 +419,6 @@ define i8 @flat_inst_valu_offset_neg_11bit_max(ptr %p) {
 ; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-GISEL-LABEL: flat_inst_valu_offset_neg_11bit_max:
-; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, 0xfffff800, v0
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
-; GFX11-GISEL-NEXT:    flat_load_u8 v0, v[0:1]
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX12-GISEL-LABEL: flat_inst_valu_offset_neg_11bit_max:
 ; GFX12-GISEL:       ; %bb.0:
 ; GFX12-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
@@ -535,25 +453,15 @@ define i8 @flat_inst_valu_offset_neg_12bit_max(ptr %p) {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-TRUE16-LABEL: flat_inst_valu_offset_neg_12bit_max:
-; GFX11-SDAG-TRUE16:       ; %bb.0:
-; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0xfffff000, v0
-; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT:    v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT:    flat_load_d16_u8 v0, v[0:1]
-; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-SDAG-FAKE16-LABEL: flat_inst_valu_offset_neg_12bit_max:
-; GFX11-SDAG-FAKE16:       ; %bb.0:
-; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0xfffff000, v0
-; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-FAKE16-NEXT:    v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
-; GFX11-SDAG-FAKE16-NEXT:    flat_load_u8 v0, v[0:1]
-; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-LABEL: flat_inst_valu_offset_neg_12bit_max:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, 0xfffff000, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
+; GFX11-NEXT:    flat_load_u8 v0, v[0:1]
+; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-SDAG-TRUE16-LABEL: flat_inst_valu_offset_neg_12bit_max:
 ; GFX12-SDAG-TRUE16:       ; %bb.0:
@@ -577,16 +485,6 @@ define i8 @flat_inst_valu_offset_neg_12bit_max(ptr %p) {
 ; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-GISEL-LABEL: flat_inst_valu_offset_neg_12bit_max:
-; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, 0xfffff000, v0
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
-; GFX11-GISEL-NEXT:    flat_load_u8 v0, v[0:1]
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX12-GISEL-LABEL: flat_inst_valu_offset_neg_12bit_max:
 ; GFX12-GISEL:       ; %bb.0:
 ; GFX12-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
@@ -621,25 +519,15 @@ define i8 @flat_inst_valu_offset_neg_13bit_max(ptr %p) {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-TRUE16-LABEL: flat_inst_valu_offset_neg_13bit_max:
-; GFX11-SDAG-TRUE16:       ; %bb.0:
-; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0xffffe000, v0
-; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT:    v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT:    flat_load_d16_u8 v0, v[0:1]
-; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-SDAG-FAKE16-LABEL: flat_inst_valu_offset_neg_13bit_max:
-; GFX11-SDAG-FAKE16:       ; %bb.0:
-; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0xffffe000, v0
-; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-FAKE16-NEXT:    v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
-; GFX11-SDAG-FAKE16-NEXT:    flat_load_u8 v0, v[0:1]
-; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-LABEL: flat_inst_valu_offset_neg_13bit_max:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, 0xffffe000, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
+; GFX11-NEXT:    flat_load_u8 v0, v[0:1]
+; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-SDAG-TRUE16-LABEL: flat_inst_valu_offset_neg_13bit_max:
 ; GFX12-SDAG-TRUE16:       ; %bb.0:
@@ -663,16 +551,6 @@ define i8 @flat_inst_valu_offset_neg_13bit_max(ptr %p) {
 ; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-GISEL-LABEL: flat_inst_valu_offset_neg_13bit_max:
-; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, 0xffffe000, v0
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
-; GFX11-GISEL-NEXT:    flat_load_u8 v0, v[0:1]
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX12-GISEL-LABEL: flat_inst_valu_offset_neg_13bit_max:
 ; GFX12-GISEL:       ; %bb.0:
 ; GFX12-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
@@ -707,25 +585,15 @@ define i8 @flat_inst_valu_offset_neg_24bit_max(ptr %p) {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-TRUE16-LABEL: flat_inst_valu_offset_neg_24bit_max:
-; GFX11-SDAG-TRUE16:       ; %bb.0:
-; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0xff800000, v0
-; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT:    v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT:    flat_load_d16_u8 v0, v[0:1]
-; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-SDAG-FAKE16-LABEL: flat_inst_valu_offset_neg_24bit_max:
-; GFX11-SDAG-FAKE16:       ; %bb.0:
-; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0xff800000, v0
-; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-FAKE16-NEXT:    v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
-; GFX11-SDAG-FAKE16-NEXT:    flat_load_u8 v0, v[0:1]
-; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-LABEL: flat_inst_valu_offset_neg_24bit_max:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, 0xff800000, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
+; GFX11-NEXT:    flat_load_u8 v0, v[0:1]
+; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-SDAG-TRUE16-LABEL: flat_inst_valu_offset_neg_24bit_max:
 ; GFX12-SDAG-TRUE16:       ; %bb.0:
@@ -749,16 +617,6 @@ define i8 @flat_inst_valu_offset_neg_24bit_max(ptr %p) {
 ; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-GISEL-LABEL: flat_inst_valu_offset_neg_24bit_max:
-; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, 0xff800000, v0
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
-; GFX11-GISEL-NEXT:    flat_load_u8 v0, v[0:1]
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX12-GISEL-LABEL: flat_inst_valu_offset_neg_24bit_max:
 ; GFX12-GISEL:       ; %bb.0:
 ; GFX12-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
@@ -792,19 +650,12 @@ define i8 @flat_inst_valu_offset_2x_11bit_max(ptr %p) {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-TRUE16-LABEL: flat_inst_valu_offset_2x_11bit_max:
-; GFX11-SDAG-TRUE16:       ; %bb.0:
-; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    flat_load_d16_u8 v0, v[0:1] offset:4095
-; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-SDAG-FAKE16-LABEL: flat_inst_valu_offset_2x_11bit_max:
-; GFX11-SDAG-FAKE16:       ; %bb.0:
-; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT:    flat_load_u8 v0, v[0:1] offset:4095
-; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-LABEL: flat_inst_valu_offset_2x_11bit_max:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    flat_load_u8 v0, v[0:1] offset:4095
+; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-SDAG-TRUE16-LABEL: flat_inst_valu_offset_2x_11bit_max:
 ; GFX12-SDAG-TRUE16:       ; %bb.0:
@@ -828,13 +679,6 @@ define i8 @flat_inst_valu_offset_2x_11bit_max(ptr %p) {
 ; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-GISEL-LABEL: flat_inst_valu_offset_2x_11bit_max:
-; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    flat_load_u8 v0, v[0:1] offset:4095
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX12-GISEL-LABEL: flat_inst_valu_offset_2x_11bit_max:
 ; GFX12-GISEL:       ; %bb.0:
 ; GFX12-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
@@ -869,25 +713,15 @@ define i8 @flat_inst_valu_offset_2x_12bit_max(ptr %p) {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-TRUE16-LABEL: flat_inst_valu_offset_2x_12bit_max:
-; GFX11-SDAG-TRUE16:       ; %bb.0:
-; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0x1000, v0
-; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT:    flat_load_d16_u8 v0, v[0:1] offset:4095
-; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-SDAG-FAKE16-LABEL: flat_inst_valu_offset_2x_12bit_max:
-; GFX11-SDAG-FAKE16:       ; %bb.0:
-; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0x1000, v0
-; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-FAKE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX11-SDAG-FAKE16-NEXT:    flat_load_u8 v0, v[0:1] offset:4095
-; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-LABEL: flat_inst_valu_offset_2x_12bit_max:
+; GFX11-SDAG:       ; %bb.0:
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0x1000, v0
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX11-SDAG-NEXT:    flat_load_u8 v0, v[0:1] offset:4095
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-SDAG-TRUE16-LABEL: flat_inst_valu_offset_2x_12bit_max:
 ; GFX12-SDAG-TRUE16:       ; %bb.0:
@@ -964,25 +798,15 @@ define i8 @flat_inst_valu_offset_2x_13bit_max(ptr %p) {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-TRUE16-LABEL: flat_inst_valu_offset_2x_13bit_max:
-; GFX11-SDAG-TRUE16:       ; %bb.0:
-; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0x3000, v0
-; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT:    flat_load_d16_u8 v0, v[0:1] offset:4095
-; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-SDAG-FAKE16-LABEL: flat_inst_valu_offset_2x_13bit_max:
-; GFX11-SDAG-FAKE16:       ; %bb.0:
-; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0x3000, v0
-; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-FAKE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX11-SDAG-FAKE16-NEXT:    flat_load_u8 v0, v[0:1] offset:4095
-; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-LABEL: flat_inst_valu_offset_2x_13bit_max:
+; GFX11-SDAG:       ; %bb.0:
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0x3000, v0
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX11-SDAG-NEXT:    flat_load_u8 v0, v[0:1] offset:4095
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-SDAG-TRUE16-LABEL: flat_inst_valu_offset_2x_13bit_max:
 ; GFX12-SDAG-TRUE16:       ; %bb.0:
@@ -1059,25 +883,15 @@ define i8 @flat_inst_valu_offset_2x_24bit_max(ptr %p) {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-TRUE16-LABEL: flat_inst_valu_offset_2x_24bit_max:
-; GFX11-SDAG-TRUE16:       ; %bb.0:
-; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0xfff000, v0
-; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT:    flat_load_d16_u8 v0, v[0:1] offset:4094
-; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-SDAG-FAKE16-LABEL: flat_inst_valu_offset_2x_24bit_max:
-; GFX11-SDAG-FAKE16:       ; %bb.0:
-; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0xfff000, v0
-; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-FAKE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX11-SDAG-FAKE16-NEXT:    flat_load_u8 v0, v[0:1] offset:4094
-; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-LABEL: flat_inst_valu_offset_2x_24bit_max:
+; GFX11-SDAG:       ; %bb.0:
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0xfff000, v0
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX11-SDAG-NEXT:    flat_load_u8 v0, v[0:1] offset:4094
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-SDAG-TRUE16-LABEL: flat_inst_valu_offset_2x_24bit_max:
 ; GFX12-SDAG-TRUE16:       ; %bb.0:
@@ -1163,25 +977,15 @@ define i8 @flat_inst_valu_offset_2x_neg_11bit_max(ptr %p) {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-TRUE16-LABEL: flat_inst_valu_offset_2x_neg_11bit_max:
-; GFX11-SDAG-TRUE16:       ; %bb.0:
-; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0xfffff000, v0
-; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT:    v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT:    flat_load_d16_u8 v0, v[0:1]
-; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-SDAG-FAKE16-LABEL: flat_inst_valu_offset_2x_neg_11bit_max:
-; GFX11-SDAG-FAKE16:       ; %bb.0:
-; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0xfffff000, v0
-; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-FAKE16-NEXT:    v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
-; GFX11-SDAG-FAKE16-NEXT:    flat_load_u8 v0, v[0:1]
-; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-LABEL: flat_inst_valu_offset_2x_neg_11bit_max:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, 0xfffff000, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
+; GFX11-NEXT:    flat_load_u8 v0, v[0:1]
+; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-SDAG-TRUE16-LABEL: flat_inst_valu_offset_2x_neg_11bit_max:
 ; GFX12-SDAG-TRUE16:       ; %bb.0:
@@ -1205,16 +1009,6 @@ define i8 @flat_inst_valu_offset_2x_neg_11bit_max(ptr %p) {
 ; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-GISEL-LABEL: flat_inst_valu_offset_2x_neg_11bit_max:
-; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, 0xfffff000, v0
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
-; GFX11-GISEL-NEXT:    flat_load_u8 v0, v[0:1]
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX12-GISEL-LABEL: flat_inst_valu_offset_2x_neg_11bit_max:
 ; GFX12-GISEL:       ; %bb.0:
 ; GFX12-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
@@ -1249,25 +1043,15 @@ define i8 @flat_inst_valu_offset_2x_neg_12bit_max(ptr %p) {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-TRUE16-LABEL: flat_inst_valu_offset_2x_neg_12bit_max:
-; GFX11-SDAG-TRUE16:       ; %bb.0:
-; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0xffffe000, v0
-; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT:    v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT:    flat_load_d16_u8 v0, v[0:1]
-; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-SDAG-FAKE16-LABEL: flat_inst_valu_offset_2x_neg_12bit_max:
-; GFX11-SDAG-FAKE16:       ; %bb.0:
-; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0xffffe000, v0
-; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-FAKE16-NEXT:    v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
-; GFX11-SDAG-FAKE16-NEXT:    flat_load_u8 v0, v[0:1]
-; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-LABEL: flat_inst_valu_offset_2x_neg_12bit_max:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, 0xffffe000, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
+; GFX11-NEXT:    flat_load_u8 v0, v[0:1]
+; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-SDAG-TRUE16-LABEL: flat_inst_valu_offset_2x_neg_12bit_max:
 ; GFX12-SDAG-TRUE16:       ; %bb.0:
@@ -1291,16 +1075,6 @@ define i8 @flat_inst_valu_offset_2x_neg_12bit_max(ptr %p) {
 ; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-GISEL-LABEL: flat_inst_valu_offset_2x_neg_12bit_max:
-; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, 0xffffe000, v0
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
-; GFX11-GISEL-NEXT:    flat_load_u8 v0, v[0:1]
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX12-GISEL-LABEL: flat_inst_valu_offset_2x_neg_12bit_max:
 ; GFX12-GISEL:       ; %bb.0:
 ; GFX12-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
@@ -1335,25 +1109,15 @@ define i8 @flat_inst_valu_offset_2x_neg_13bit_max(ptr %p) {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-TRUE16-LABEL: flat_inst_valu_offset_2x_neg_13bit_max:
-; GFX11-SDAG-TRUE16:       ; %bb.0:
-; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0xffffc000, v0
-; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT:    v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT:    flat_load_d16_u8 v0, v[0:1]
-; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-SDAG-FAKE16-LABEL: flat_inst_valu_offset_2x_neg_13bit_max:
-; GFX11-SDAG-FAKE16:       ; %bb.0:
-; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0xffffc000, v0
-; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-FAKE16-NEXT:    v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
-; GFX11-SDAG-FAKE16-NEXT:    flat_load_u8 v0, v[0:1]
-; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-LABEL: flat_inst_valu_offset_2x_neg_13bit_max:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, 0xffffc000, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
+; GFX11-NEXT:    flat_load_u8 v0, v[0:1]
+; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-SDAG-TRUE16-LABEL: flat_inst_valu_offset_2x_neg_13bit_max:
 ; GFX12-SDAG-TRUE16:       ; %bb.0:
@@ -1377,16 +1141,6 @@ define i8 @flat_inst_valu_offset_2x_neg_13bit_max(ptr %p) {
 ; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-GISEL-LABEL: flat_inst_valu_offset_2x_neg_13bit_max:
-; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, 0xffffc000, v0
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
-; GFX11-GISEL-NEXT:    flat_load_u8 v0, v[0:1]
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX12-GISEL-LABEL: flat_inst_valu_offset_2x_neg_13bit_max:
 ; GFX12-GISEL:       ; %bb.0:
 ; GFX12-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
@@ -1421,25 +1175,15 @@ define i8 @flat_inst_valu_offset_2x_neg_24bit_max(ptr %p) {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-TRUE16-LABEL: flat_inst_valu_offset_2x_neg_24bit_max:
-; GFX11-SDAG-TRUE16:       ; %bb.0:
-; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0xff000001, v0
-; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT:    v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT:    flat_load_d16_u8 v0, v[0:1]
-; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-SDAG-FAKE16-LABEL: flat_inst_valu_offset_2x_neg_24bit_max:
-; GFX11-SDAG-FAKE16:       ; %bb.0:
-; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0xff000001, v0
-; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-FAKE16-NEXT:    v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
-; GFX11-SDAG-FAKE16-NEXT:    flat_load_u8 v0, v[0:1]
-; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-LABEL: flat_inst_valu_offset_2x_neg_24bit_max:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, 0xff000001, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
+; GFX11-NEXT:    flat_load_u8 v0, v[0:1]
+; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-SDAG-TRUE16-LABEL: flat_inst_valu_offset_2x_neg_24bit_max:
 ; GFX12-SDAG-TRUE16:       ; %bb.0:
@@ -1469,16 +1213,6 @@ define i8 @flat_inst_valu_offset_2x_neg_24bit_max(ptr %p) {
 ; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-GISEL-LABEL: flat_inst_valu_offset_2x_neg_24bit_max:
-; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, 0xff000001, v0
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
-; GFX11-GISEL-NEXT:    flat_load_u8 v0, v[0:1]
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX12-GISEL-LABEL: flat_inst_valu_offset_2x_neg_24bit_max:
 ; GFX12-GISEL:       ; %bb.0:
 ; GFX12-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
@@ -1517,25 +1251,15 @@ define i8 @flat_inst_valu_offset_64bit_11bit_split0(ptr %p) {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-TRUE16-LABEL: flat_inst_valu_offset_64bit_11bit_split0:
-; GFX11-SDAG-TRUE16:       ; %bb.0:
-; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0, v0
-; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, v1, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT:    flat_load_d16_u8 v0, v[0:1] offset:2047
-; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-SDAG-FAKE16-LABEL: flat_inst_valu_offset_64bit_11bit_split0:
-; GFX11-SDAG-FAKE16:       ; %bb.0:
-; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0, v0
-; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-FAKE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, v1, vcc_lo
-; GFX11-SDAG-FAKE16-NEXT:    flat_load_u8 v0, v[0:1] offset:2047
-; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-LABEL: flat_inst_valu_offset_64bit_11bit_split0:
+; GFX11-SDAG:       ; %bb.0:
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0, v0
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, v1, vcc_lo
+; GFX11-SDAG-NEXT:    flat_load_u8 v0, v[0:1] offset:2047
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-SDAG-TRUE16-LABEL: flat_inst_valu_offset_64bit_11bit_split0:
 ; GFX12-SDAG-TRUE16:       ; %bb.0:
@@ -1622,25 +1346,15 @@ define i8 @flat_inst_valu_offset_64bit_11bit_split1(ptr %p) {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-TRUE16-LABEL: flat_inst_valu_offset_64bit_11bit_split1:
-; GFX11-SDAG-TRUE16:       ; %bb.0:
-; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0, v0
-; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, v1, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT:    flat_load_d16_u8 v0, v[0:1] offset:2048
-; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-SDAG-FAKE16-LABEL: flat_inst_valu_offset_64bit_11bit_split1:
-; GFX11-SDAG-FAKE16:       ; %bb.0:
-; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0, v0
-; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-FAKE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, v1, vcc_lo
-; GFX11-SDAG-FAKE16-NEXT:    flat_load_u8 v0, v[0:1] offset:2048
-; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-LABEL: flat_inst_valu_offset_64bit_11bit_split1:
+; GFX11-SDAG:       ; %bb.0:
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0, v0
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, v1, vcc_lo
+; GFX11-SDAG-NEXT:    flat_load_u8 v0, v[0:1] offset:2048
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-SDAG-TRUE16-LABEL: flat_inst_valu_offset_64bit_11bit_split1:
 ; GFX12-SDAG-TRUE16:       ; %bb.0:
@@ -1727,25 +1441,15 @@ define i8 @flat_inst_valu_offset_64bit_12bit_split0(ptr %p) {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-TRUE16-LABEL: flat_inst_valu_offset_64bit_12bit_split0:
-; GFX11-SDAG-TRUE16:       ; %bb.0:
-; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0, v0
-; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, v1, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT:    flat_load_d16_u8 v0, v[0:1] offset:4095
-; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-SDAG-FAKE16-LABEL: flat_inst_valu_offset_64bit_12bit_split0:
-; GFX11-SDAG-FAKE16:       ; %bb.0:
-; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0, v0
-; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-FAKE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, v1, vcc_lo
-; GFX11-SDAG-FAKE16-NEXT:    flat_load_u8 v0, v[0:1] offset:4095
-; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-LABEL: flat_inst_valu_offset_64bit_12bit_split0:
+; GFX11-SDAG:       ; %bb.0:
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0, v0
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, v1, vcc_lo
+; GFX11-SDAG-NEXT:    flat_load_u8 v0, v[0:1] offset:4095
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-SDAG-TRUE16-LABEL: flat_inst_valu_offset_64bit_12bit_split0:
 ; GFX12-SDAG-TRUE16:       ; %bb.0:
@@ -1832,25 +1536,15 @@ define i8 @flat_inst_valu_offset_64bit_12bit_split1(ptr %p) {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-TRUE16-LABEL: flat_inst_valu_offset_64bit_12bit_split1:
-; GFX11-SDAG-TRUE16:       ; %bb.0:
-; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0x1000, v0
-; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, v1, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT:    flat_load_d16_u8 v0, v[0:1]
-; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-SDAG-FAKE16-LABEL: flat_inst_valu_offset_64bit_12bit_split1:
-; GFX11-SDAG-FAKE16:       ; %bb.0:
-; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0x1000, v0
-; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-FAKE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, v1, vcc_lo
-; GFX11-SDAG-FAKE16-NEXT:    flat_load_u8 v0, v[0:1]
-; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-LABEL: flat_inst_valu_offset_64bit_12bit_split1:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, 0x1000, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, v1, vcc_lo
+; GFX11-NEXT:    flat_load_u8 v0, v[0:1]
+; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-SDAG-TRUE16-LABEL: flat_inst_valu_offset_64bit_12bit_split1:
 ; GFX12-SDAG-TRUE16:       ; %bb.0:
@@ -1880,16 +1574,6 @@ define i8 @flat_inst_valu_offset_64bit_12bit_split1(ptr %p) {
 ; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-GISEL-LABEL: flat_inst_valu_offset_64bit_12bit_split1:
-; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, 0x1000, v0
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, v1, vcc_lo
-; GFX11-GISEL-NEXT:    flat_load_u8 v0, v[0:1]
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX12-GISEL-LABEL: flat_inst_valu_offset_64bit_12bit_split1:
 ; GFX12-GISEL:       ; %bb.0:
 ; GFX12-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
@@ -1928,25 +1612,15 @@ define i8 @flat_inst_valu_offset_64bit_13bit_split0(ptr %p) {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-TRUE16-LABEL: flat_inst_valu_offset_64bit_13bit_split0:
-; GFX11-SDAG-TRUE16:       ; %bb.0:
-; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0x1000, v0
-; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, v1, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT:    flat_load_d16_u8 v0, v[0:1] offset:4095
-; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-SDAG-FAKE16-LABEL: flat_inst_valu_offset_64bit_13bit_split0:
-; GFX11-SDAG-FAKE16:       ; %bb.0:
-; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0x1000, v0
-; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-FAKE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, v1, vcc_lo
-; GFX11-SDAG-FAKE16-NEXT:    flat_load_u8 v0, v[0:1] offset:4095
-; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-LABEL: flat_inst_valu_offset_64bit_13bit_split0:
+; GFX11-SDAG:       ; %bb.0:
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0x1000, v0
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, v1, vcc_lo
+; GFX11-SDAG-NEXT:    flat_load_u8 v0, v[0:1] offset:4095
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-SDAG-TRUE16-LABEL: flat_inst_valu_offset_64bit_13bit_split0:
 ; GFX12-SDAG-TRUE16:       ; %bb.0:
@@ -2033,25 +1707,15 @@ define i8 @flat_inst_valu_offset_64bit_13bit_split1(ptr %p) {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-TRUE16-LABEL: flat_inst_valu_offset_64bit_13bit_split1:
-; GFX11-SDAG-TRUE16:       ; %bb.0:
-; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0x2000, v0
-; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, v1, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT:    flat_load_d16_u8 v0, v[0:1]
-; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-SDAG-FAKE16-LABEL: flat_inst_valu_offset_64bit_13bit_split1:
-; GFX11-SDAG-FAKE16:       ; %bb.0:
-; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0x2000, v0
-; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-FAKE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, v1, vcc_lo
-; GFX11-SDAG-FAKE16-NEXT:    flat_load_u8 v0, v[0:1]
-; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-LABEL: flat_inst_valu_offset_64bit_13bit_split1:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, 0x2000, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, v1, vcc_lo
+; GFX11-NEXT:    flat_load_u8 v0, v[0:1]
+; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-SDAG-TRUE16-LABEL: flat_inst_valu_offset_64bit_13bit_split1:
 ; GFX12-SDAG-TRUE16:       ; %bb.0:
@@ -2081,16 +1745,6 @@ define i8 @flat_inst_valu_offset_64bit_13bit_split1(ptr %p) {
 ; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-GISEL-LABEL: flat_inst_valu_offset_64bit_13bit_split1:
-; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, 0x2000, v0
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, v1, vcc_lo
-; GFX11-GISEL-NEXT:    flat_load_u8 v0, v[0:1]
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX12-GISEL-LABEL: flat_inst_valu_offset_64bit_13bit_split1:
 ; GFX12-GISEL:       ; %bb.0:
 ; GFX12-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
@@ -2130,25 +1784,15 @@ define i8 @flat_inst_valu_offset_64bit_11bit_neg_high_split0(ptr %p) {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-TRUE16-LABEL: flat_inst_valu_offset_64bit_11bit_neg_high_split0:
-; GFX11-SDAG-TRUE16:       ; %bb.0:
-; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0x7ff, v0
-; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT:    flat_load_d16_u8 v0, v[0:1]
-; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-SDAG-FAKE16-LABEL: flat_inst_valu_offset_64bit_11bit_neg_high_split0:
-; GFX11-SDAG-FAKE16:       ; %bb.0:
-; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0x7ff, v0
-; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-FAKE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
-; GFX11-SDAG-FAKE16-NEXT:    flat_load_u8 v0, v[0:1]
-; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-LABEL: flat_inst_valu_offset_64bit_11bit_neg_high_split0:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, 0x7ff, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
+; GFX11-NEXT:    flat_load_u8 v0, v[0:1]
+; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-SDAG-TRUE16-LABEL: flat_inst_valu_offset_64bit_11bit_neg_high_split0:
 ; GFX12-SDAG-TRUE16:       ; %bb.0:
@@ -2188,16 +1832,6 @@ define i8 @flat_inst_valu_offset_64bit_11bit_neg_high_split0(ptr %p) {
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-GISEL-LABEL: flat_inst_valu_offset_64bit_11bit_neg_high_split0:
-; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, 0x7ff, v0
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
-; GFX11-GISEL-NEXT:    flat_load_u8 v0, v[0:1]
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX12-GISEL-LABEL: flat_inst_valu_offset_64bit_11bit_neg_high_split0:
 ; GFX12-GISEL:       ; %bb.0:
 ; GFX12-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
@@ -2237,25 +1871,15 @@ define i8 @flat_inst_valu_offset_64bit_11bit_neg_high_split1(ptr %p) {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-TRUE16-LABEL: flat_inst_valu_offset_64bit_11bit_neg_high_split1:
-; GFX11-SDAG-TRUE16:       ; %bb.0:
-; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0x800, v0
-; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT:    flat_load_d16_u8 v0, v[0:1]
-; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-SDAG-FAKE16-LABEL: flat_inst_valu_offset_64bit_11bit_neg_high_split1:
-; GFX11-SDAG-FAKE16:       ; %bb.0:
-; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0x800, v0
-; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-FAKE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
-; GFX11-SDAG-FAKE16-NEXT:    flat_load_u8 v0, v[0:1]
-; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-LABEL: flat_inst_valu_offset_64bit_11bit_neg_high_split1:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, 0x800, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
+; GFX11-NEXT:    flat_load_u8 v0, v[0:1]
+; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-SDAG-TRUE16-LABEL: flat_inst_valu_offset_64bit_11bit_neg_high_split1:
 ; GFX12-SDAG-TRUE16:       ; %bb.0:
@@ -2295,16 +1919,6 @@ define i8 @flat_inst_valu_offset_64bit_11bit_neg_high_split1(ptr %p) {
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-GISEL-LABEL: flat_inst_valu_offset_64bit_11bit_neg_high_split1:
-; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, 0x800, v0
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
-; GFX11-GISEL-NEXT:    flat_load_u8 v0, v[0:1]
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX12-GISEL-LABEL: flat_inst_valu_offset_64bit_11bit_neg_high_split1:
 ; GFX12-GISEL:       ; %bb.0:
 ; GFX12-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
@@ -2344,25 +1958,15 @@ define i8 @flat_inst_valu_offset_64bit_12bit_neg_high_split0(ptr %p) {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-TRUE16-LABEL: flat_inst_valu_offset_64bit_12bit_neg_high_split0:
-; GFX11-SDAG-TRUE16:       ; %bb.0:
-; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0xfff, v0
-; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT:    flat_load_d16_u8 v0, v[0:1]
-; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-SDAG-FAKE16-LABEL: flat_inst_valu_offset_64bit_12bit_neg_high_split0:
-; GFX11-SDAG-FAKE16:       ; %bb.0:
-; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0xfff, v0
-; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-FAKE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
-; GFX11-SDAG-FAKE16-NEXT:    flat_load_u8 v0, v[0:1]
-; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-LABEL: flat_inst_valu_offset_64bit_12bit_neg_high_split0:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, 0xfff, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
+; GFX11-NEXT:    flat_load_u8 v0, v[0:1]
+; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-SDAG-TRUE16-LABEL: flat_inst_valu_offset_64bit_12bit_neg_high_split0:
 ; GFX12-SDAG-TRUE16:       ; %bb.0:
@@ -2402,16 +2006,6 @@ define i8 @flat_inst_valu_offset_64bit_12bit_neg_high_split0(ptr %p) {
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-GISEL-LABEL: flat_inst_valu_offset_64bit_12bit_neg_high_split0:
-; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, 0xfff, v0
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
-; GFX11-GISEL-NEXT:    flat_load_u8 v0, v[0:1]
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX12-GISEL-LABEL: flat_inst_valu_offset_64bit_12bit_neg_high_split0:
 ; GFX12-GISEL:       ; %bb.0:
 ; GFX12-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
@@ -2451,25 +2045,15 @@ define i8 @flat_inst_valu_offset_64bit_12bit_neg_high_split1(ptr %p) {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-TRUE16-LABEL: flat_inst_valu_offset_64bit_12bit_neg_high_split1:
-; GFX11-SDAG-TRUE16:       ; %bb.0:
-; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0x1000, v0
-; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT:    flat_load_d16_u8 v0, v[0:1]
-; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-SDAG-FAKE16-LABEL: flat_inst_valu_offset_64bit_12bit_neg_high_split1:
-; GFX11-SDAG-FAKE16:       ; %bb.0:
-; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0x1000, v0
-; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-FAKE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
-; GFX11-SDAG-FAKE16-NEXT:    flat_load_u8 v0, v[0:1]
-; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-LABEL: flat_inst_valu_offset_64bit_12bit_neg_high_split1:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, 0x1000, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
+; GFX11-NEXT:    flat_load_u8 v0, v[0:1]
+; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-SDAG-TRUE16-LABEL: flat_inst_valu_offset_64bit_12bit_neg_high_split1:
 ; GFX12-SDAG-TRUE16:       ; %bb.0:
@@ -2509,16 +2093,6 @@ define i8 @flat_inst_valu_offset_64bit_12bit_neg_high_split1(ptr %p) {
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-GISEL-LABEL: flat_inst_valu_offset_64bit_12bit_neg_high_split1:
-; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, 0x1000, v0
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
-; GFX11-GISEL-NEXT:    flat_load_u8 v0, v[0:1]
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX12-GISEL-LABEL: flat_inst_valu_offset_64bit_12bit_neg_high_split1:
 ; GFX12-GISEL:       ; %bb.0:
 ; GFX12-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
@@ -2558,25 +2132,15 @@ define i8 @flat_inst_valu_offset_64bit_13bit_neg_high_split0(ptr %p) {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-TRUE16-LABEL: flat_inst_valu_offset_64bit_13bit_neg_high_split0:
-; GFX11-SDAG-TRUE16:       ; %bb.0:
-; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0x1fff, v0
-; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT:    flat_load_d16_u8 v0, v[0:1]
-; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-SDAG-FAKE16-LABEL: flat_inst_valu_offset_64bit_13bit_neg_high_split0:
-; GFX11-SDAG-FAKE16:       ; %bb.0:
-; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0x1fff, v0
-; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-FAKE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
-; GFX11-SDAG-FAKE16-NEXT:    flat_load_u8 v0, v[0:1]
-; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-LABEL: flat_inst_valu_offset_64bit_13bit_neg_high_split0:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, 0x1fff, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
+; GFX11-NEXT:    flat_load_u8 v0, v[0:1]
+; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-SDAG-TRUE16-LABEL: flat_inst_valu_offset_64bit_13bit_neg_high_split0:
 ; GFX12-SDAG-TRUE16:       ; %bb.0:
@@ -2616,16 +2180,6 @@ define i8 @flat_inst_valu_offset_64bit_13bit_neg_high_split0(ptr %p) {
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-GISEL-LABEL: flat_inst_valu_offset_64bit_13bit_neg_high_split0:
-; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, 0x1fff, v0
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
-; GFX11-GISEL-NEXT:    flat_load_u8 v0, v[0:1]
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX12-GISEL-LABEL: flat_inst_valu_offset_64bit_13bit_neg_high_split0:
 ; GFX12-GISEL:       ; %bb.0:
 ; GFX12-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
@@ -2665,25 +2219,15 @@ define i8 @flat_inst_valu_offset_64bit_13bit_neg_high_split1(ptr %p) {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-TRUE16-LABEL: flat_inst_valu_offset_64bit_13bit_neg_high_split1:
-; GFX11-SDAG-TRUE16:       ; %bb.0:
-; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0x2000, v0
-; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT:    flat_load_d16_u8 v0, v[0:1]
-; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-SDAG-FAKE16-LABEL: flat_inst_valu_offset_64bit_13bit_neg_high_split1:
-; GFX11-SDAG-FAKE16:       ; %bb.0:
-; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0x2000, v0
-; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-FAKE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
-; GFX11-SDAG-FAKE16-NEXT:    flat_load_u8 v0, v[0:1]
-; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-LABEL: flat_inst_valu_offset_64bit_13bit_neg_high_split1:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, 0x2000, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
+; GFX11-NEXT:    flat_load_u8 v0, v[0:1]
+; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-SDAG-TRUE16-LABEL: flat_inst_valu_offset_64bit_13bit_neg_high_split1:
 ; GFX12-SDAG-TRUE16:       ; %bb.0:
@@ -2723,16 +2267,6 @@ define i8 @flat_inst_valu_offset_64bit_13bit_neg_high_split1(ptr %p) {
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-GISEL-LABEL: flat_inst_valu_offset_64bit_13bit_neg_high_split1:
-; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, 0x2000, v0
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
-; GFX11-GISEL-NEXT:    flat_load_u8 v0, v[0:1]
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX12-GISEL-LABEL: flat_inst_valu_offset_64bit_13bit_neg_high_split1:
 ; GFX12-GISEL:       ; %bb.0:
 ; GFX12-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
@@ -2776,25 +2310,15 @@ define amdgpu_kernel void @flat_inst_salu_offset_1(ptr %p) {
 ; GFX10-NEXT:    flat_store_byte v[0:1], v0
 ; GFX10-NEXT:    s_endpgm
 ;
-; GFX11-SDAG-TRUE16-LABEL: flat_inst_salu_offset_1:
-; GFX11-SDAG-TRUE16:       ; %bb.0:
-; GFX11-SDAG-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-SDAG-TRUE16-NEXT:    flat_load_d16_u8 v0, v[0:1] offset:1 glc dlc
-; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    flat_store_b8 v[0:1], v0
-; GFX11-SDAG-TRUE16-NEXT:    s_endpgm
-;
-; GFX11-SDAG-FAKE16-LABEL: flat_inst_salu_offset_1:
-; GFX11-SDAG-FAKE16:       ; %bb.0:
-; GFX11-SDAG-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-SDAG-FAKE16-NEXT:    flat_load_u8 v0, v[0:1] offset:1 glc dlc
-; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT:    flat_store_b8 v[0:1], v0
-; GFX11-SDAG-FAKE16-NEXT:    s_endpgm
+; GFX11-LABEL: flat_inst_salu_offset_1:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT:    flat_load_u8 v0, v[0:1] offset:1 glc dlc
+; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    flat_store_b8 v[0:1], v0
+; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-SDAG-TRUE16-LABEL: flat_inst_salu_offset_1:
 ; GFX12-SDAG-TRUE16:       ; %bb.0:
@@ -2816,16 +2340,6 @@ define amdgpu_kernel void @flat_inst_salu_offset_1(ptr %p) {
 ; GFX12-SDAG-FAKE16-NEXT:    flat_store_b8 v[0:1], v0
 ; GFX12-SDAG-FAKE16-NEXT:    s_endpgm
 ;
-; GFX11-GISEL-LABEL: flat_inst_salu_offset_1:
-; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-GISEL-NEXT:    flat_load_u8 v0, v[0:1] offset:1 glc dlc
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    flat_store_b8 v[0:1], v0
-; GFX11-GISEL-NEXT:    s_endpgm
-;
 ; GFX12-GISEL-LABEL: flat_inst_salu_offset_1:
 ; GFX12-GISEL:       ; %bb.0:
 ; GFX12-GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
@@ -2866,25 +2380,15 @@ define amdgpu_kernel void @flat_inst_salu_offset_11bit_max(ptr %p) {
 ; GFX10-NEXT:    flat_store_byte v[0:1], v0
 ; GFX10-NEXT:    s_endpgm
 ;
-; GFX11-SDAG-TRUE16-LABEL: flat_inst_salu_offset_11bit_max:
-; GFX11-SDAG-TRUE16:       ; %bb.0:
-; GFX11-SDAG-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-SDAG-TRUE16-NEXT:    flat_load_d16_u8 v0, v[0:1] offset:2047 glc dlc
-; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    flat_store_b8 v[0:1], v0
-; GFX11-SDAG-TRUE16-NEXT:    s_endpgm
-;
-; GFX11-SDAG-FAKE16-LABEL: flat_inst_salu_offset_11bit_max:
-; GFX11-SDAG-FAKE16:       ; %bb.0:
-; GFX11-SDAG-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-SDAG-FAKE16-NEXT:    flat_load_u8 v0, v[0:1] offset:2047 glc dlc
-; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT:    flat_store_b8 v[0:1], v0
-; GFX11-SDAG-FAKE16-NEXT:    s_endpgm
+; GFX11-LABEL: flat_inst_salu_offset_11bit_max:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT:    flat_load_u8 v0, v[0:1] offset:2047 glc dlc
+; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    flat_store_b8 v[0:1], v0
+; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-SDAG-TRUE16-LABEL: flat_inst_salu_offset_11bit_max:
 ; GFX12-SDAG-TRUE16:       ; %bb.0:
@@ -2906,16 +2410,6 @@ define amdgpu_kernel void @flat_inst_salu_offset_11bit_max(ptr %p) {
 ; GFX12-SDAG-FAKE16-NEXT:    flat_store_b8 v[0:1], v0
 ; GFX12-SDAG-FAKE16-NEXT:    s_endpgm
 ;
-; GFX11-GISEL-LABEL: flat_inst_salu_offset_11bit_max:
-; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-GISEL-NEXT:    flat_load_u8 v0, v[0:1] offset:2047 glc dlc
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    flat_store_b8 v[0:1], v0
-; GFX11-GISEL-NEXT:    s_endpgm
-;
 ; GFX12-GISEL-LABEL: flat_inst_salu_offset_11bit_max:
 ; GFX12-GISEL:       ; %bb.0:
 ; GFX12-GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
@@ -2956,25 +2450,15 @@ define amdgpu_kernel void @flat_inst_salu_offset_12bit_max(ptr %p) {
 ; GFX10-NEXT:    flat_store_byte v[0:1], v0
 ; GFX10-NEXT:    s_endpgm
 ;
-; GFX11-SDAG-TRUE16-LABEL: flat_inst_salu_offset_12bit_max:
-; GFX11-SDAG-TRUE16:       ; %bb.0:
-; GFX11-SDAG-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-SDAG-TRUE16-NEXT:    flat_load_d16_u8 v0, v[0:1] offset:4095 glc dlc
-; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    flat_store_b8 v[0:1], v0
-; GFX11-SDAG-TRUE16-NEXT:    s_endpgm
-;
-; GFX11-SDAG-FAKE16-LABEL: flat_inst_salu_offset_12bit_max:
-; GFX11-SDAG-FAKE16:       ; %bb.0:
-; GFX11-SDAG-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-SDAG-FAKE16-NEXT:    flat_load_u8 v0, v[0:1] offset:4095 glc dlc
-; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT:    flat_store_b8 v[0:1], v0
-; GFX11-SDAG-FAKE16-NEXT:    s_endpgm
+; GFX11-LABEL: flat_inst_salu_offset_12bit_max:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT:    flat_load_u8 v0, v[0:1] offset:4095 glc dlc
+; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    flat_store_b8 v[0:1], v0
+; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-SDAG-TRUE16-LABEL: flat_inst_salu_offset_12bit_max:
 ; GFX12-SDAG-TRUE16:       ; %bb.0:
@@ -2996,16 +2480,6 @@ define amdgpu_kernel void @flat_inst_salu_offset_12bit_max(ptr %p) {
 ; GFX12-SDAG-FAKE16-NEXT:    flat_store_b8 v[0:1], v0
 ; GFX12-SDAG-FAKE16-NEXT:    s_endpgm
 ;
-; GFX11-GISEL-LABEL: flat_inst_salu_offset_12bit_max:
-; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-GISEL-NEXT:    flat_load_u8 v0, v[0:1] offset:4095 glc dlc
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    flat_store_b8 v[0:1], v0
-; GFX11-GISEL-NEXT:    s_endpgm
-;
 ; GFX12-GISEL-LABEL: flat_inst_salu_offset_12bit_max:
 ; GFX12-GISEL:       ; %bb.0:
 ; GFX12-GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
@@ -3048,29 +2522,17 @@ define amdgpu_kernel void @flat_inst_salu_offset_13bit_max(ptr %p) {
 ; GFX10-NEXT:    flat_store_byte v[0:1], v0
 ; GFX10-NEXT:    s_endpgm
 ;
-; GFX11-SDAG-TRUE16-LABEL: flat_inst_salu_offset_13bit_max:
-; GFX11-SDAG-TRUE16:       ; %bb.0:
-; GFX11-SDAG-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    v_add_co_u32 v0, s0, 0x1000, s0
-; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 0, s1, s0
-; GFX11-SDAG-TRUE16-NEXT:    flat_load_d16_u8 v0, v[0:1] offset:4095 glc dlc
-; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    flat_store_b8 v[0:1], v0
-; GFX11-SDAG-TRUE16-NEXT:    s_endpgm
-;
-; GFX11-SDAG-FAKE16-LABEL: flat_inst_salu_offset_13bit_max:
-; GFX11-SDAG-FAKE16:       ; %bb.0:
-; GFX11-SDAG-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT:    v_add_co_u32 v0, s0, 0x1000, s0
-; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-FAKE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 0, s1, s0
-; GFX11-SDAG-FAKE16-NEXT:    flat_load_u8 v0, v[0:1] offset:4095 glc dlc
-; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT:    flat_store_b8 v[0:1], v0
-; GFX11-SDAG-FAKE16-NEXT:    s_endpgm
+; GFX11-SDAG-LABEL: flat_inst_salu_offset_13bit_max:
+; GFX11-SDAG:       ; %bb.0:
+; GFX11-SDAG-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT:    v_add_co_u32 v0, s0, 0x1000, s0
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_add_co_ci_u32_e64 v1, null, 0, s1, s0
+; GFX11-SDAG-NEXT:    flat_load_u8 v0, v[0:1] offset:4095 glc dlc
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT:    flat_store_b8 v[0:1], v0
+; GFX11-SDAG-NEXT:    s_endpgm
 ;
 ; GFX12-SDAG-TRUE16-LABEL: flat_inst_salu_offset_13bit_max:
 ; GFX12-SDAG-TRUE16:       ; %bb.0:
@@ -3160,29 +2622,17 @@ define amdgpu_kernel void @flat_inst_salu_offset_neg_11bit_max(ptr %p) {
 ; GFX10-NEXT:    flat_store_byte v[0:1], v0
 ; GFX10-NEXT:    s_endpgm
 ;
-; GFX11-SDAG-TRUE16-LABEL: flat_inst_salu_offset_neg_11bit_max:
-; GFX11-SDAG-TRUE16:       ; %bb.0:
-; GFX11-SDAG-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    v_add_co_u32 v0, s0, 0xfffff800, s0
-; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT:    v_add_co_ci_u32_e64 v1, null, -1, s1, s0
-; GFX11-SDAG-TRUE16-NEXT:    flat_load_d16_u8 v0, v[0:1] glc dlc
-; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    flat_store_b8 v[0:1], v0
-; GFX11-SDAG-TRUE16-NEXT:    s_endpgm
-;
-; GFX11-SDAG-FAKE16-LABEL: flat_inst_salu_offset_neg_11bit_max:
-; GFX11-SDAG-FAKE16:       ; %bb.0:
-; GFX11-SDAG-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT:    v_add_co_u32 v0, s0, 0xfffff800, s0
-; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-FAKE16-NEXT:    v_add_co_ci_u32_e64 v1, null, -1, s1, s0
-; GFX11-SDAG-FAKE16-NEXT:    flat_load_u8 v0, v[0:1] glc dlc
-; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT:    flat_store_b8 v[0:1], v0
-; GFX11-SDAG-FAKE16-NEXT:    s_endpgm
+; GFX11-SDAG-LABEL: flat_inst_salu_offset_neg_11bit_max:
+; GFX11-SDAG:       ; %bb.0:
+; GFX11-SDAG-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT:    v_add_co_u32 v0, s0, 0xfffff800, s0
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_add_co_ci_u32_e64 v1, null, -1, s1, s0
+; GFX11-SDAG-NEXT:    flat_load_u8 v0, v[0:1] glc dlc
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT:    flat_store_b8 v[0:1], v0
+; GFX11-SDAG-NEXT:    s_endpgm
 ;
 ; GFX12-SDAG-TRUE16-LABEL: flat_inst_salu_offset_neg_11bit_max:
 ; GFX12-SDAG-TRUE16:       ; %bb.0:
@@ -3272,29 +2722,17 @@ define amdgpu_kernel void @flat_inst_salu_offset_neg_12bit_max(ptr %p) {
 ; GFX10-NEXT:    flat_store_byte v[0:1], v0
 ; GFX10-NEXT:    s_endpgm
 ;
-; GFX11-SDAG-TRUE16-LABEL: flat_inst_salu_offset_neg_12bit_max:
-; GFX11-SDAG-TRUE16:       ; %bb.0:
-; GFX11-SDAG-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    v_add_co_u32 v0, s0, 0xfffff000, s0
-; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT:    v_add_co_ci_u32_e64 v1, null, -1, s1, s0
-; GFX11-SDAG-TRUE16-NEXT:    flat_load_d16_u8 v0, v[0:1] glc dlc
-; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    flat_store_b8 v[0:1], v0
-; GFX11-SDAG-TRUE16-NEXT:    s_endpgm
-;
-; GFX11-SDAG-FAKE16-LABEL: flat_inst_salu_offset_neg_12bit_max:
-; GFX11-SDAG-FAKE16:       ; %bb.0:
-; GFX11-SDAG-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT:    v_add_co_u32 v0, s0, 0xfffff000, s0
-; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-FAKE16-NEXT:    v_add_co_ci_u32_e64 v1, null, -1, s1, s0
-; GFX11-SDAG-FAKE16-NEXT:    flat_load_u8 v0, v[0:1] glc dlc
-; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT:    flat_store_b8 v[0:1], v0
-; GFX11-SDAG-FAKE16-NEXT:    s_endpgm
+; GFX11-SDAG-LABEL: flat_inst_salu_offset_neg_12bit_max:
+; GFX11-SDAG:       ; %bb.0:
+; GFX11-SDAG-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT:    v_add_co_u32 v0, s0, 0xfffff000, s0
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_add_co_ci_u32_e64 v1, null, -1, s1, s0
+; GFX11-SDAG-NEXT:    flat_load_u8 v0, v[0:1] glc dlc
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT:    flat_store_b8 v[0:1], v0
+; GFX11-SDAG-NEXT:    s_endpgm
 ;
 ; GFX12-SDAG-TRUE16-LABEL: flat_inst_salu_offset_neg_12bit_max:
 ; GFX12-SDAG-TRUE16:       ; %bb.0:
@@ -3384,29 +2822,17 @@ define amdgpu_kernel void @flat_inst_salu_offset_neg_13bit_max(ptr %p) {
 ; GFX10-NEXT:    flat_store_byte v[0:1], v0
 ; GFX10-NEXT:    s_endpgm
 ;
-; GFX11-SDAG-TRUE16-LABEL: flat_inst_salu_offset_neg_13bit_max:
-; GFX11-SDAG-TRUE16:       ; %bb.0:
-; GFX11-SDAG-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    v_add_co_u32 v0, s0, 0xffffe000, s0
-; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT:    v_add_co_ci_u32_e64 v1, null, -1, s1, s0
-; GFX11-SDAG-TRUE16-NEXT:    flat_load_d16_u8 v0, v[0:1] glc dlc
-; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    flat_store_b8 v[0:1], v0
-; GFX11-SDAG-TRUE16-NEXT:    s_endpgm
-;
-; GFX11-SDAG-FAKE16-LABEL: flat_inst_salu_offset_neg_13bit_max:
-; GFX11-SDAG-FAKE16:       ; %bb.0:
-; GFX11-SDAG-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT:    v_add_co_u32 v0, s0, 0xffffe000, s0
-; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-FAKE16-NEXT:    v_add_co_ci_u32_e64 v1, null, -1, s1, s0
-; GFX11-SDAG-FAKE16-NEXT:    flat_load_u8 v0, v[0:1] glc dlc
-; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT:    flat_store_b8 v[0:1], v0
-; GFX11-SDAG-FAKE16-NEXT:    s_endpgm
+; GFX11-SDAG-LABEL: flat_inst_salu_offset_neg_13bit_max:
+; GFX11-SDAG:       ; %bb.0:
+; GFX11-SDAG-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT:    v_add_co_u32 v0, s0, 0xffffe000, s0
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_add_co_ci_u32_e64 v1, null, -1, s1, s0
+; GFX11-SDAG-NEXT:    flat_load_u8 v0, v[0:1] glc dlc
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT:    flat_store_b8 v[0:1], v0
+; GFX11-SDAG-NEXT:    s_endpgm
 ;
 ; GFX12-SDAG-TRUE16-LABEL: flat_inst_salu_offset_neg_13bit_max:
 ; GFX12-SDAG-TRUE16:       ; %bb.0:
@@ -3494,25 +2920,15 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_11bit_max(ptr %p) {
 ; GFX10-NEXT:    flat_store_byte v[0:1], v0
 ; GFX10-NEXT:    s_endpgm
 ;
-; GFX11-SDAG-TRUE16-LABEL: flat_inst_salu_offset_2x_11bit_max:
-; GFX11-SDAG-TRUE16:       ; %bb.0:
-; GFX11-SDAG-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-SDAG-TRUE16-NEXT:    flat_load_d16_u8 v0, v[0:1] offset:4095 glc dlc
-; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    flat_store_b8 v[0:1], v0
-; GFX11-SDAG-TRUE16-NEXT:    s_endpgm
-;
-; GFX11-SDAG-FAKE16-LABEL: flat_inst_salu_offset_2x_11bit_max:
-; GFX11-SDAG-FAKE16:       ; %bb.0:
-; GFX11-SDAG-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-SDAG-FAKE16-NEXT:    flat_load_u8 v0, v[0:1] offset:4095 glc dlc
-; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT:    flat_store_b8 v[0:1], v0
-; GFX11-SDAG-FAKE16-NEXT:    s_endpgm
+; GFX11-LABEL: flat_inst_salu_offset_2x_11bit_max:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT:    flat_load_u8 v0, v[0:1] offset:4095 glc dlc
+; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    flat_store_b8 v[0:1], v0
+; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-SDAG-TRUE16-LABEL: flat_inst_salu_offset_2x_11bit_max:
 ; GFX12-SDAG-TRUE16:       ; %bb.0:
@@ -3534,16 +2950,6 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_11bit_max(ptr %p) {
 ; GFX12-SDAG-FAKE16-NEXT:    flat_store_b8 v[0:1], v0
 ; GFX12-SDAG-FAKE16-NEXT:    s_endpgm
 ;
-; GFX11-GISEL-LABEL: flat_inst_salu_offset_2x_11bit_max:
-; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-GISEL-NEXT:    flat_load_u8 v0, v[0:1] offset:4095 glc dlc
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    flat_store_b8 v[0:1], v0
-; GFX11-GISEL-NEXT:    s_endpgm
-;
 ; GFX12-GISEL-LABEL: flat_inst_salu_offset_2x_11bit_max:
 ; GFX12-GISEL:       ; %bb.0:
 ; GFX12-GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
@@ -3586,29 +2992,17 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_12bit_max(ptr %p) {
 ; GFX10-NEXT:    flat_store_byte v[0:1], v0
 ; GFX10-NEXT:    s_endpgm
 ;
-; GFX11-SDAG-TRUE16-LABEL: flat_inst_salu_offset_2x_12bit_max:
-; GFX11-SDAG-TRUE16:       ; %bb.0:
-; GFX11-SDAG-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    v_add_co_u32 v0, s0, 0x1000, s0
-; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 0, s1, s0
-; GFX11-SDAG-TRUE16-NEXT:    flat_load_d16_u8 v0, v[0:1] offset:4095 glc dlc
-; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    flat_store_b8 v[0:1], v0
-; GFX11-SDAG-TRUE16-NEXT:    s_endpgm
-;
-; GFX11-SDAG-FAKE16-LABEL: flat_inst_salu_offset_2x_12bit_max:
-; GFX11-SDAG-FAKE16:       ; %bb.0:
-; GFX11-SDAG-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT:    v_add_co_u32 v0, s0, 0x1000, s0
-; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-FAKE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 0, s1, s0
-; GFX11-SDAG-FAKE16-NEXT:    flat_load_u8 v0, v[0:1] offset:4095 glc dlc
-; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT:    flat_store_b8 v[0:1], v0
-; GFX11-SDAG-FAKE16-NEXT:    s_endpgm
+; GFX11-SDAG-LABEL: flat_inst_salu_offset_2x_12bit_max:
+; GFX11-SDAG:       ; %bb.0:
+; GFX11-SDAG-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT:    v_add_co_u32 v0, s0, 0x1000, s0
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_add_co_ci_u32_e64 v1, null, 0, s1, s0
+; GFX11-SDAG-NEXT:    flat_load_u8 v0, v[0:1] offset:4095 glc dlc
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT:    flat_store_b8 v[0:1], v0
+; GFX11-SDAG-NEXT:    s_endpgm
 ;
 ; GFX12-SDAG-TRUE16-LABEL: flat_inst_salu_offset_2x_12bit_max:
 ; GFX12-SDAG-TRUE16:       ; %bb.0:
@@ -3698,29 +3092,17 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_13bit_max(ptr %p) {
 ; GFX10-NEXT:    flat_store_byte v[0:1], v0
 ; GFX10-NEXT:    s_endpgm
 ;
-; GFX11-SDAG-TRUE16-LABEL: flat_inst_salu_offset_2x_13bit_max:
-; GFX11-SDAG-TRUE16:       ; %bb.0:
-; GFX11-SDAG-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    v_add_co_u32 v0, s0, 0x3000, s0
-; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 0, s1, s0
-; GFX11-SDAG-TRUE16-NEXT:    flat_load_d16_u8 v0, v[0:1] offset:4095 glc dlc
-; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    flat_store_b8 v[0:1], v0
-; GFX11-SDAG-TRUE16-NEXT:    s_endpgm
-;
-; GFX11-SDAG-FAKE16-LABEL: flat_inst_salu_offset_2x_13bit_max:
-; GFX11-SDAG-FAKE16:       ; %bb.0:
-; GFX11-SDAG-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT:    v_add_co_u32 v0, s0, 0x3000, s0
-; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-FAKE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 0, s1, s0
-; GFX11-SDAG-FAKE16-NEXT:    flat_load_u8 v0, v[0:1] offset:4095 glc dlc
-; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT:    flat_store_b8 v[0:1], v0
-; GFX11-SDAG-FAKE16-NEXT:    s_endpgm
+; GFX11-SDAG-LABEL: flat_inst_salu_offset_2x_13bit_max:
+; GFX11-SDAG:       ; %bb.0:
+; GFX11-SDAG-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT:    v_add_co_u32 v0, s0, 0x3000, s0
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_add_co_ci_u32_e64 v1, null, 0, s1, s0
+; GFX11-SDAG-NEXT:    flat_load_u8 v0, v[0:1] offset:4095 glc dlc
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT:    flat_store_b8 v[0:1], v0
+; GFX11-SDAG-NEXT:    s_endpgm
 ;
 ; GFX12-SDAG-TRUE16-LABEL: flat_inst_salu_offset_2x_13bit_max:
 ; GFX12-SDAG-TRUE16:       ; %bb.0:
@@ -3810,29 +3192,17 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_neg_11bit_max(ptr %p) {
 ; GFX10-NEXT:    flat_store_byte v[0:1], v0
 ; GFX10-NEXT:    s_endpgm
 ;
-; GFX11-SDAG-TRUE16-LABEL: flat_inst_salu_offset_2x_neg_11bit_max:
-; GFX11-SDAG-TRUE16:       ; %bb.0:
-; GFX11-SDAG-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    v_add_co_u32 v0, s0, 0xfffff000, s0
-; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT:    v_add_co_ci_u32_e64 v1, null, -1, s1, s0
-; GFX11-SDAG-TRUE16-NEXT:    flat_load_d16_u8 v0, v[0:1] glc dlc
-; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    flat_store_b8 v[0:1], v0
-; GFX11-SDAG-TRUE16-NEXT:    s_endpgm
-;
-; GFX11-SDAG-FAKE16-LABEL: flat_inst_salu_offset_2x_neg_11bit_max:
-; GFX11-SDAG-FAKE16:       ; %bb.0:
-; GFX11-SDAG-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT:    v_add_co_u32 v0, s0, 0xfffff000, s0
-; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-FAKE16-NEXT:    v_add_co_ci_u32_e64 v1, null, -1, s1, s0
-; GFX11-SDAG-FAKE16-NEXT:    flat_load_u8 v0, v[0:1] glc dlc
-; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT:    flat_store_b8 v[0:1], v0
-; GFX11-SDAG-FAKE16-NEXT:    s_endpgm
+; GFX11-SDAG-LABEL: flat_inst_salu_offset_2x_neg_11bit_max:
+; GFX11-SDAG:       ; %bb.0:
+; GFX11-SDAG-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT:    v_add_co_u32 v0, s0, 0xfffff000, s0
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_add_co_ci_u32_e64 v1, null, -1, s1, s0
+; GFX11-SDAG-NEXT:    flat_load_u8 v0, v[0:1] glc dlc
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT:    flat_store_b8 v[0:1], v0
+; GFX11-SDAG-NEXT:    s_endpgm
 ;
 ; GFX12-SDAG-TRUE16-LABEL: flat_inst_salu_offset_2x_neg_11bit_max:
 ; GFX12-SDAG-TRUE16:       ; %bb.0:
@@ -3922,29 +3292,17 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_neg_12bit_max(ptr %p) {
 ; GFX10-NEXT:    flat_store_byte v[0:1], v0
 ; GFX10-NEXT:    s_endpgm
 ;
-; GFX11-SDAG-TRUE16-LABEL: flat_inst_salu_offset_2x_neg_12bit_max:
-; GFX11-SDAG-TRUE16:       ; %bb.0:
-; GFX11-SDAG-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    v_add_co_u32 v0, s0, 0xffffe000, s0
-; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT:    v_add_co_ci_u32_e64 v1, null, -1, s1, s0
-; GFX11-SDAG-TRUE16-NEXT:    flat_load_d16_u8 v0, v[0:1] glc dlc
-; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    flat_store_b8 v[0:1], v0
-; GFX11-SDAG-TRUE16-NEXT:    s_endpgm
-;
-; GFX11-SDAG-FAKE16-LABEL: flat_inst_salu_offset_2x_neg_12bit_max:
-; GFX11-SDAG-FAKE16:       ; %bb.0:
-; GFX11-SDAG-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT:    v_add_co_u32 v0, s0, 0xffffe000, s0
-; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-FAKE16-NEXT:    v_add_co_ci_u32_e64 v1, null, -1, s1, s0
-; GFX11-SDAG-FAKE16-NEXT:    flat_load_u8 v0, v[0:1] glc dlc
-; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT:    flat_store_b8 v[0:1], v0
-; GFX11-SDAG-FAKE16-NEXT:    s_endpgm
+; GFX11-SDAG-LABEL: flat_inst_salu_offset_2x_neg_12bit_max:
+; GFX11-SDAG:       ; %bb.0:
+; GFX11-SDAG-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT:    v_add_co_u32 v0, s0, 0xffffe000, s0
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_add_co_ci_u32_e64 v1, null, -1, s1, s0
+; GFX11-SDAG-NEXT:    flat_load_u8 v0, v[0:1] glc dlc
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT:    flat_store_b8 v[0:1], v0
+; GFX11-SDAG-NEXT:    s_endpgm
 ;
 ; GFX12-SDAG-TRUE16-LABEL: flat_inst_salu_offset_2x_neg_12bit_max:
 ; GFX12-SDAG-TRUE16:       ; %bb.0:
@@ -4034,29 +3392,17 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_neg_13bit_max(ptr %p) {
 ; GFX10-NEXT:    flat_store_byte v[0:1], v0
 ; GFX10-NEXT:    s_endpgm
 ;
-; GFX11-SDAG-TRUE16-LABEL: flat_inst_salu_offset_2x_neg_13bit_max:
-; GFX11-SDAG-TRUE16:       ; %bb.0:
-; GFX11-SDAG-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    v_add_co_u32 v0, s0, 0xffffc000, s0
-; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT:    v_add_co_ci_u32_e64 v1, null, -1, s1, s0
-; GFX11-SDAG-TRUE16-NEXT:    flat_load_d16_u8 v0, v[0:1] glc dlc
-; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    flat_store_b8 v[0:1], v0
-; GFX11-SDAG-TRUE16-NEXT:    s_endpgm
-;
-; GFX11-SDAG-FAKE16-LABEL: flat_inst_salu_offset_2x_neg_13bit_max:
-; GFX11-SDAG-FAKE16:       ; %bb.0:
-; GFX11-SDAG-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT:    v_add_co_u32 v0, s0, 0xffffc000, s0
-; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-FAKE16-NEXT:    v_add_co_ci_u32_e64 v1, null, -1, s1, s0
-; GFX11-SDAG-FAKE16-NEXT:    flat_load_u8 v0, v[0:1] glc dlc
-; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT:    flat_store_b8 v[0:1], v0
-; GFX11-SDAG-FAKE16-NEXT:    s_endpgm
+; GFX11-SDAG-LABEL: flat_inst_salu_offset_2x_neg_13bit_max:
+; GFX11-SDAG:       ; %bb.0:
+; GFX11-SDAG-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT:    v_add_co_u32 v0, s0, 0xffffc000, s0
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_add_co_ci_u32_e64 v1, null, -1, s1, s0
+; GFX11-SDAG-NEXT:    flat_load_u8 v0, v[0:1] glc dlc
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT:    flat_store_b8 v[0:1], v0
+; GFX11-SDAG-NEXT:    s_endpgm
 ;
 ; GFX12-SDAG-TRUE16-LABEL: flat_inst_salu_offset_2x_neg_13bit_max:
 ; GFX12-SDAG-TRUE16:       ; %bb.0:
@@ -4146,29 +3492,17 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_split0(ptr %p) {
 ; GFX10-NEXT:    flat_store_byte v[0:1], v0
 ; GFX10-NEXT:    s_endpgm
 ;
-; GFX11-SDAG-TRUE16-LABEL: flat_inst_salu_offset_64bit_11bit_split0:
-; GFX11-SDAG-TRUE16:       ; %bb.0:
-; GFX11-SDAG-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    v_add_co_u32 v0, s0, 0, s0
-; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, s1, s0
-; GFX11-SDAG-TRUE16-NEXT:    flat_load_d16_u8 v0, v[0:1] offset:2047 glc dlc
-; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    flat_store_b8 v[0:1], v0
-; GFX11-SDAG-TRUE16-NEXT:    s_endpgm
-;
-; GFX11-SDAG-FAKE16-LABEL: flat_inst_salu_offset_64bit_11bit_split0:
-; GFX11-SDAG-FAKE16:       ; %bb.0:
-; GFX11-SDAG-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT:    v_add_co_u32 v0, s0, 0, s0
-; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-FAKE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, s1, s0
-; GFX11-SDAG-FAKE16-NEXT:    flat_load_u8 v0, v[0:1] offset:2047 glc dlc
-; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT:    flat_store_b8 v[0:1], v0
-; GFX11-SDAG-FAKE16-NEXT:    s_endpgm
+; GFX11-SDAG-LABEL: flat_inst_salu_offset_64bit_11bit_split0:
+; GFX11-SDAG:       ; %bb.0:
+; GFX11-SDAG-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT:    v_add_co_u32 v0, s0, 0, s0
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, s1, s0
+; GFX11-SDAG-NEXT:    flat_load_u8 v0, v[0:1] offset:2047 glc dlc
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT:    flat_store_b8 v[0:1], v0
+; GFX11-SDAG-NEXT:    s_endpgm
 ;
 ; GFX12-SDAG-TRUE16-LABEL: flat_inst_salu_offset_64bit_11bit_split0:
 ; GFX12-SDAG-TRUE16:       ; %bb.0:
@@ -4265,29 +3599,17 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_split1(ptr %p) {
 ; GFX10-NEXT:    flat_store_byte v[0:1], v0
 ; GFX10-NEXT:    s_endpgm
 ;
-; GFX11-SDAG-TRUE16-LABEL: flat_inst_salu_offset_64bit_11bit_split1:
-; GFX11-SDAG-TRUE16:       ; %bb.0:
-; GFX11-SDAG-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    v_add_co_u32 v0, s0, 0, s0
-; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, s1, s0
-; GFX11-SDAG-TRUE16-NEXT:    flat_load_d16_u8 v0, v[0:1] offset:2048 glc dlc
-; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    flat_store_b8 v[0:1], v0
-; GFX11-SDAG-TRUE16-NEXT:    s_endpgm
-;
-; GFX11-SDAG-FAKE16-LABEL: flat_inst_salu_offset_64bit_11bit_split1:
-; GFX11-SDAG-FAKE16:       ; %bb.0:
-; GFX11-SDAG-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT:    v_add_co_u32 v0, s0, 0, s0
-; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-FAKE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, s1, s0
-; GFX11-SDAG-FAKE16-NEXT:    flat_load_u8 v0, v[0:1] offset:2048 glc dlc
-; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT:    flat_store_b8 v[0:1], v0
-; GFX11-SDAG-FAKE16-NEXT:    s_endpgm
+; GFX11-SDAG-LABEL: flat_inst_salu_offset_64bit_11bit_split1:
+; GFX11-SDAG:       ; %bb.0:
+; GFX11-SDAG-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT:    v_add_co_u32 v0, s0, 0, s0
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, s1, s0
+; GFX11-SDAG-NEXT:    flat_load_u8 v0, v[0:1] offset:2048 glc dlc
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT:    flat_store_b8 v[0:1], v0
+; GFX11-SDAG-NEXT:    s_endpgm
 ;
 ; GFX12-SDAG-TRUE16-LABEL: flat_inst_salu_offset_64bit_11bit_split1:
 ; GFX12-SDAG-TRUE16:       ; %bb.0:
@@ -4384,29 +3706,17 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_split0(ptr %p) {
 ; GFX10-NEXT:    flat_store_byte v[0:1], v0
 ; GFX10-NEXT:    s_endpgm
 ;
-; GFX11-SDAG-TRUE16-LABEL: flat_inst_salu_offset_64bit_12bit_split0:
-; GFX11-SDAG-TRUE16:       ; %bb.0:
-; GFX11-SDAG-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    v_add_co_u32 v0, s0, 0, s0
-; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, s1, s0
-; GFX11-SDAG-TRUE16-NEXT:    flat_load_d16_u8 v0, v[0:1] offset:4095 glc dlc
-; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    flat_store_b8 v[0:1], v0
-; GFX11-SDAG-TRUE16-NEXT:    s_endpgm
-;
-; GFX11-SDAG-FAKE16-LABEL: flat_inst_salu_offset_64bit_12bit_split0:
-; GFX11-SDAG-FAKE16:       ; %bb.0:
-; GFX11-SDAG-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT:    v_add_co_u32 v0, s0, 0, s0
-; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-FAKE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, s1, s0
-; GFX11-SDAG-FAKE16-NEXT:    flat_load_u8 v0, v[0:1] offset:4095 glc dlc
-; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT:    flat_store_b8 v[0:1], v0
-; GFX11-SDAG-FAKE16-NEXT:    s_endpgm
+; GFX11-SDAG-LABEL: flat_inst_salu_offset_64bit_12bit_split0:
+; GFX11-SDAG:       ; %bb.0:
+; GFX11-SDAG-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT:    v_add_co_u32 v0, s0, 0, s0
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, s1, s0
+; GFX11-SDAG-NEXT:    flat_load_u8 v0, v[0:1] offset:4095 glc dlc
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT:    flat_store_b8 v[0:1], v0
+; GFX11-SDAG-NEXT:    s_endpgm
 ;
 ; GFX12-SDAG-TRUE16-LABEL: flat_inst_salu_offset_64bit_12bit_split0:
 ; GFX12-SDAG-TRUE16:       ; %bb.0:
@@ -4504,29 +3814,17 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_split1(ptr %p) {
 ; GFX10-NEXT:    flat_store_byte v[0:1], v0
 ; GFX10-NEXT:    s_endpgm
 ;
-; GFX11-SDAG-TRUE16-LABEL: flat_inst_salu_offset_64bit_12bit_split1:
-; GFX11-SDAG-TRUE16:       ; %bb.0:
-; GFX11-SDAG-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    v_add_co_u32 v0, s0, 0x1000, s0
-; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, s1, s0
-; GFX11-SDAG-TRUE16-NEXT:    flat_load_d16_u8 v0, v[0:1] glc dlc
-; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    flat_store_b8 v[0:1], v0
-; GFX11-SDAG-TRUE16-NEXT:    s_endpgm
-;
-; GFX11-SDAG-FAKE16-LABEL: flat_inst_salu_offset_64bit_12bit_split1:
-; GFX11-SDAG-FAKE16:       ; %bb.0:
-; GFX11-SDAG-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT:    v_add_co_u32 v0, s0, 0x1000, s0
-; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-FAKE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, s1, s0
-; GFX11-SDAG-FAKE16-NEXT:    flat_load_u8 v0, v[0:1] glc dlc
-; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT:    flat_store_b8 v[0:1], v0
-; GFX11-SDAG-FAKE16-NEXT:    s_endpgm
+; GFX11-SDAG-LABEL: flat_inst_salu_offset_64bit_12bit_split1:
+; GFX11-SDAG:       ; %bb.0:
+; GFX11-SDAG-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT:    v_add_co_u32 v0, s0, 0x1000, s0
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, s1, s0
+; GFX11-SDAG-NEXT:    flat_load_u8 v0, v[0:1] glc dlc
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT:    flat_store_b8 v[0:1], v0
+; GFX11-SDAG-NEXT:    s_endpgm
 ;
 ; GFX12-SDAG-TRUE16-LABEL: flat_inst_salu_offset_64bit_12bit_split1:
 ; GFX12-SDAG-TRUE16:       ; %bb.0:
@@ -4624,29 +3922,17 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_split0(ptr %p) {
 ; GFX10-NEXT:    flat_store_byte v[0:1], v0
 ; GFX10-NEXT:    s_endpgm
 ;
-; GFX11-SDAG-TRUE16-LABEL: flat_inst_salu_offset_64bit_13bit_split0:
-; GFX11-SDAG-TRUE16:       ; %bb.0:
-; GFX11-SDAG-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    v_add_co_u32 v0, s0, 0x1000, s0
-; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, s1, s0
-; GFX11-SDAG-TRUE16-NEXT:    flat_load_d16_u8 v0, v[0:1] offset:4095 glc dlc
-; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    flat_store_b8 v[0:1], v0
-; GFX11-SDAG-TRUE16-NEXT:    s_endpgm
-;
-; GFX11-SDAG-FAKE16-LABEL: flat_inst_salu_offset_64bit_13bit_split0:
-; GFX11-SDAG-FAKE16:       ; %bb.0:
-; GFX11-SDAG-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT:    v_add_co_u32 v0, s0, 0x1000, s0
-; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-FAKE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, s1, s0
-; GFX11-SDAG-FAKE16-NEXT:    flat_load_u8 v0, v[0:1] offset:4095 glc dlc
-; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT:    flat_store_b8 v[0:1], v0
-; GFX11-SDAG-FAKE16-NEXT:    s_endpgm
+; GFX11-SDAG-LABEL: flat_inst_salu_offset_64bit_13bit_split0:
+; GFX11-SDAG:       ; %bb.0:
+; GFX11-SDAG-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT:    v_add_co_u32 v0, s0, 0x1000, s0
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, s1, s0
+; GFX11-SDAG-NEXT:    flat_load_u8 v0, v[0:1] offset:4095 glc dlc
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT:    flat_store_b8 v[0:1], v0
+; GFX11-SDAG-NEXT:    s_endpgm
 ;
 ; GFX12-SDAG-TRUE16-LABEL: flat_inst_salu_offset_64bit_13bit_split0:
 ; GFX12-SDAG-TRUE16:       ; %bb.0:
@@ -4744,29 +4030,17 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_split1(ptr %p) {
 ; GFX10-NEXT:    flat_store_byte v[0:1], v0
 ; GFX10-NEXT:    s_endpgm
 ;
-; GFX11-SDAG-TRUE16-LABEL: flat_inst_salu_offset_64bit_13bit_split1:
-; GFX11-SDAG-TRUE16:       ; %bb.0:
-; GFX11-SDAG-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    v_add_co_u32 v0, s0, 0x2000, s0
-; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, s1, s0
-; GFX11-SDAG-TRUE16-NEXT:    flat_load_d16_u8 v0, v[0:1] glc dlc
-; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    flat_store_b8 v[0:1], v0
-; GFX11-SDAG-TRUE16-NEXT:    s_endpgm
-;
-; GFX11-SDAG-FAKE16-LABEL: flat_inst_salu_offset_64bit_13bit_split1:
-; GFX11-SDAG-FAKE16:       ; %bb.0:
-; GFX11-SDAG-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT:    v_add_co_u32 v0, s0, 0x2000, s0
-; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-FAKE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, s1, s0
-; GFX11-SDAG-FAKE16-NEXT:    flat_load_u8 v0, v[0:1] glc dlc
-; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT:    flat_store_b8 v[0:1], v0
-; GFX11-SDAG-FAKE16-NEXT:    s_endpgm
+; GFX11-SDAG-LABEL: flat_inst_salu_offset_64bit_13bit_split1:
+; GFX11-SDAG:       ; %bb.0:
+; GFX11-SDAG-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT:    v_add_co_u32 v0, s0, 0x2000, s0
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, s1, s0
+; GFX11-SDAG-NEXT:    flat_load_u8 v0, v[0:1] glc dlc
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT:    flat_store_b8 v[0:1], v0
+; GFX11-SDAG-NEXT:    s_endpgm
 ;
 ; GFX12-SDAG-TRUE16-LABEL: flat_inst_salu_offset_64bit_13bit_split1:
 ; GFX12-SDAG-TRUE16:       ; %bb.0:
@@ -4865,31 +4139,18 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_neg_high_split0(ptr
 ; GFX10-NEXT:    flat_store_byte v[0:1], v0
 ; GFX10-NEXT:    s_endpgm
 ;
-; GFX11-SDAG-TRUE16-LABEL: flat_inst_salu_offset_64bit_11bit_neg_high_split0:
-; GFX11-SDAG-TRUE16:       ; %bb.0:
-; GFX11-SDAG-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    v_mov_b32_e32 v1, s1
-; GFX11-SDAG-TRUE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0x7ff, s0
-; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT:    flat_load_d16_u8 v0, v[0:1] glc dlc
-; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    flat_store_b8 v[0:1], v0
-; GFX11-SDAG-TRUE16-NEXT:    s_endpgm
-;
-; GFX11-SDAG-FAKE16-LABEL: flat_inst_salu_offset_64bit_11bit_neg_high_split0:
-; GFX11-SDAG-FAKE16:       ; %bb.0:
-; GFX11-SDAG-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT:    v_mov_b32_e32 v1, s1
-; GFX11-SDAG-FAKE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0x7ff, s0
-; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-FAKE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
-; GFX11-SDAG-FAKE16-NEXT:    flat_load_u8 v0, v[0:1] glc dlc
-; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT:    flat_store_b8 v[0:1], v0
-; GFX11-SDAG-FAKE16-NEXT:    s_endpgm
+; GFX11-SDAG-LABEL: flat_inst_salu_offset_64bit_11bit_neg_high_split0:
+; GFX11-SDAG:       ; %bb.0:
+; GFX11-SDAG-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0x7ff, s0
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
+; GFX11-SDAG-NEXT:    flat_load_u8 v0, v[0:1] glc dlc
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT:    flat_store_b8 v[0:1], v0
+; GFX11-SDAG-NEXT:    s_endpgm
 ;
 ; GFX12-SDAG-TRUE16-LABEL: flat_inst_salu_offset_64bit_11bit_neg_high_split0:
 ; GFX12-SDAG-TRUE16:       ; %bb.0:
@@ -4990,31 +4251,18 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_neg_high_split1(ptr
 ; GFX10-NEXT:    flat_store_byte v[0:1], v0
 ; GFX10-NEXT:    s_endpgm
 ;
-; GFX11-SDAG-TRUE16-LABEL: flat_inst_salu_offset_64bit_11bit_neg_high_split1:
-; GFX11-SDAG-TRUE16:       ; %bb.0:
-; GFX11-SDAG-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    v_mov_b32_e32 v1, s1
-; GFX11-SDAG-TRUE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0x800, s0
-; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT:    flat_load_d16_u8 v0, v[0:1] glc dlc
-; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    flat_store_b8 v[0:1], v0
-; GFX11-SDAG-TRUE16-NEXT:    s_endpgm
-;
-; GFX11-SDAG-FAKE16-LABEL: flat_inst_salu_offset_64bit_11bit_neg_high_split1:
-; GFX11-SDAG-FAKE16:       ; %bb.0:
-; GFX11-SDAG-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT:    v_mov_b32_e32 v1, s1
-; GFX11-SDAG-FAKE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0x800, s0
-; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-FAKE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
-; GFX11-SDAG-FAKE16-NEXT:    flat_load_u8 v0, v[0:1] glc dlc
-; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT:    flat_store_b8 v[0:1], v0
-; GFX11-SDAG-FAKE16-NEXT:    s_endpgm
+; GFX11-SDAG-LABEL: flat_inst_salu_offset_64bit_11bit_neg_high_split1:
+; GFX11-SDAG:       ; %bb.0:
+; GFX11-SDAG-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0x800, s0
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
+; GFX11-SDAG-NEXT:    flat_load_u8 v0, v[0:1] glc dlc
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT:    flat_store_b8 v[0:1], v0
+; GFX11-SDAG-NEXT:    s_endpgm
 ;
 ; GFX12-SDAG-TRUE16-LABEL: flat_inst_salu_offset_64bit_11bit_neg_high_split1:
 ; GFX12-SDAG-TRUE16:       ; %bb.0:
@@ -5115,31 +4363,18 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_neg_high_split0(ptr
 ; GFX10-NEXT:    flat_store_byte v[0:1], v0
 ; GFX10-NEXT:    s_endpgm
 ;
-; GFX11-SDAG-TRUE16-LABEL: flat_inst_salu_offset_64bit_12bit_neg_high_split0:
-; GFX11-SDAG-TRUE16:       ; %bb.0:
-; GFX11-SDAG-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    v_mov_b32_e32 v1, s1
-; GFX11-SDAG-TRUE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0xfff, s0
-; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT:    flat_load_d16_u8 v0, v[0:1] glc dlc
-; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    flat_store_b8 v[0:1], v0
-; GFX11-SDAG-TRUE16-NEXT:    s_endpgm
-;
-; GFX11-SDAG-FAKE16-LABEL: flat_inst_salu_offset_64bit_12bit_neg_high_split0:
-; GFX11-SDAG-FAKE16:       ; %bb.0:
-; GFX11-SDAG-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT:    v_mov_b32_e32 v1, s1
-; GFX11-SDAG-FAKE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0xfff, s0
-; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-FAKE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
-; GFX11-SDAG-FAKE16-NEXT:    flat_load_u8 v0, v[0:1] glc dlc
-; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT:    flat_store_b8 v[0:1], v0
-; GFX11-SDAG-FAKE16-NEXT:    s_endpgm
+; GFX11-SDAG-LABEL: flat_inst_salu_offset_64bit_12bit_neg_high_split0:
+; GFX11-SDAG:       ; %bb.0:
+; GFX11-SDAG-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0xfff, s0
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
+; GFX11-SDAG-NEXT:    flat_load_u8 v0, v[0:1] glc dlc
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT:    flat_store_b8 v[0:1], v0
+; GFX11-SDAG-NEXT:    s_endpgm
 ;
 ; GFX12-SDAG-TRUE16-LABEL: flat_inst_salu_offset_64bit_12bit_neg_high_split0:
 ; GFX12-SDAG-TRUE16:       ; %bb.0:
@@ -5240,31 +4475,18 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_neg_high_split1(ptr
 ; GFX10-NEXT:    flat_store_byte v[0:1], v0
 ; GFX10-NEXT:    s_endpgm
 ;
-; GFX11-SDAG-TRUE16-LABEL: flat_inst_salu_offset_64bit_12bit_neg_high_split1:
-; GFX11-SDAG-TRUE16:       ; %bb.0:
-; GFX11-SDAG-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    v_mov_b32_e32 v1, s1
-; GFX11-SDAG-TRUE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0x1000, s0
-; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT:    flat_load_d16_u8 v0, v[0:1] glc dlc
-; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    flat_store_b8 v[0:1], v0
-; GFX11-SDAG-TRUE16-NEXT:    s_endpgm
-;
-; GFX11-SDAG-FAKE16-LABEL: flat_inst_salu_offset_64bit_12bit_neg_high_split1:
-; GFX11-SDAG-FAKE16:       ; %bb.0:
-; GFX11-SDAG-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT:    v_mov_b32_e32 v1, s1
-; GFX11-SDAG-FAKE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0x1000, s0
-; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-FAKE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
-; GFX11-SDAG-FAKE16-NEXT:    flat_load_u8 v0, v[0:1] glc dlc
-; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT:    flat_store_b8 v[0:1], v0
-; GFX11-SDAG-FAKE16-NEXT:    s_endpgm
+; GFX11-SDAG-LABEL: flat_inst_salu_offset_64bit_12bit_neg_high_split1:
+; GFX11-SDAG:       ; %bb.0:
+; GFX11-SDAG-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0x1000, s0
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
+; GFX11-SDAG-NEXT:    flat_load_u8 v0, v[0:1] glc dlc
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT:    flat_store_b8 v[0:1], v0
+; GFX11-SDAG-NEXT:    s_endpgm
 ;
 ; GFX12-SDAG-TRUE16-LABEL: flat_inst_salu_offset_64bit_12bit_neg_high_split1:
 ; GFX12-SDAG-TRUE16:       ; %bb.0:
@@ -5365,31 +4587,18 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_neg_high_split0(ptr
 ; GFX10-NEXT:    flat_store_byte v[0:1], v0
 ; GFX10-NEXT:    s_endpgm
 ;
-; GFX11-SDAG-TRUE16-LABEL: flat_inst_salu_offset_64bit_13bit_neg_high_split0:
-; GFX11-SDAG-TRUE16:       ; %bb.0:
-; GFX11-SDAG-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    v_mov_b32_e32 v1, s1
-; GFX11-SDAG-TRUE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0x1fff, s0
-; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT:    flat_load_d16_u8 v0, v[0:1] glc dlc
-; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    flat_store_b8 v[0:1], v0
-; GFX11-SDAG-TRUE16-NEXT:    s_endpgm
-;
-; GFX11-SDAG-FAKE16-LABEL: flat_inst_salu_offset_64bit_13bit_neg_high_split0:
-; GFX11-SDAG-FAKE16:       ; %bb.0:
-; GFX11-SDAG-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT:    v_mov_b32_e32 v1, s1
-; GFX11-SDAG-FAKE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0x1fff, s0
-; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-FAKE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
-; GFX11-SDAG-FAKE16-NEXT:    flat_load_u8 v0, v[0:1] glc dlc
-; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT:    flat_store_b8 v[0:1], v0
-; GFX11-SDAG-FAKE16-NEXT:    s_endpgm
+; GFX11-SDAG-LABEL: flat_inst_salu_offset_64bit_13bit_neg_high_split0:
+; GFX11-SDAG:       ; %bb.0:
+; GFX11-SDAG-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0x1fff, s0
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
+; GFX11-SDAG-NEXT:    flat_load_u8 v0, v[0:1] glc dlc
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT:    flat_store_b8 v[0:1], v0
+; GFX11-SDAG-NEXT:    s_endpgm
 ;
 ; GFX12-SDAG-TRUE16-LABEL: flat_inst_salu_offset_64bit_13bit_neg_high_split0:
 ; GFX12-SDAG-TRUE16:       ; %bb.0:
@@ -5490,31 +4699,18 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_neg_high_split1(ptr
 ; GFX10-NEXT:    flat_store_byte v[0:1], v0
 ; GFX10-NEXT:    s_endpgm
 ;
-; GFX11-SDAG-TRUE16-LABEL: flat_inst_salu_offset_64bit_13bit_neg_high_split1:
-; GFX11-SDAG-TRUE16:       ; %bb.0:
-; GFX11-SDAG-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    v_mov_b32_e32 v1, s1
-; GFX11-SDAG-TRUE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0x2000, s0
-; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT:    flat_load_d16_u8 v0, v[0:1] glc dlc
-; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    flat_store_b8 v[0:1], v0
-; GFX11-SDAG-TRUE16-NEXT:    s_endpgm
-;
-; GFX11-SDAG-FAKE16-LABEL: flat_inst_salu_offset_64bit_13bit_neg_high_split1:
-; GFX11-SDAG-FAKE16:       ; %bb.0:
-; GFX11-SDAG-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT:    v_mov_b32_e32 v1, s1
-; GFX11-SDAG-FAKE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0x2000, s0
-; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-FAKE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
-; GFX11-SDAG-FAKE16-NEXT:    flat_load_u8 v0, v[0:1] glc dlc
-; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT:    flat_store_b8 v[0:1], v0
-; GFX11-SDAG-FAKE16-NEXT:    s_endpgm
+; GFX11-SDAG-LABEL: flat_inst_salu_offset_64bit_13bit_neg_high_split1:
+; GFX11-SDAG:       ; %bb.0:
+; GFX11-SDAG-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0x2000, s0
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
+; GFX11-SDAG-NEXT:    flat_load_u8 v0, v[0:1] glc dlc
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT:    flat_store_b8 v[0:1], v0
+; GFX11-SDAG-NEXT:    s_endpgm
 ;
 ; GFX12-SDAG-TRUE16-LABEL: flat_inst_salu_offset_64bit_13bit_neg_high_split1:
 ; GFX12-SDAG-TRUE16:       ; %bb.0:
@@ -5588,10 +4784,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_neg_high_split1(ptr
 ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
 ; GFX10-GISEL: {{.*}}
 ; GFX10-SDAG: {{.*}}
-; GFX11: {{.*}}
 ; GFX11-GISEL-FAKE16: {{.*}}
 ; GFX11-GISEL-TRUE16: {{.*}}
-; GFX11-SDAG: {{.*}}
+; GFX11-SDAG-FAKE16: {{.*}}
+; GFX11-SDAG-TRUE16: {{.*}}
 ; GFX12: {{.*}}
 ; GFX12-GISEL-FAKE16: {{.*}}
 ; GFX12-GISEL-TRUE16: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/offset-split-global.ll b/llvm/test/CodeGen/AMDGPU/offset-split-global.ll
index 20916a9a51d9e..4f230b2231ef2 100644
--- a/llvm/test/CodeGen/AMDGPU/offset-split-global.ll
+++ b/llvm/test/CodeGen/AMDGPU/offset-split-global.ll
@@ -30,12 +30,12 @@ define i8 @global_inst_valu_offset_1(ptr addrspace(1) %p) {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-GISEL-LABEL: global_inst_valu_offset_1:
-; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    global_load_u8 v0, v[0:1], off offset:1
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-LABEL: global_inst_valu_offset_1:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    global_load_u8 v0, v[0:1], off offset:1
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-GISEL-LABEL: global_inst_valu_offset_1:
 ; GFX12-GISEL:       ; %bb.0:
@@ -48,20 +48,6 @@ define i8 @global_inst_valu_offset_1(ptr addrspace(1) %p) {
 ; GFX12-GISEL-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-TRUE16-LABEL: global_inst_valu_offset_1:
-; GFX11-SDAG-TRUE16:       ; %bb.0:
-; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    global_load_d16_u8 v0, v[0:1], off offset:1
-; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-SDAG-FAKE16-LABEL: global_inst_valu_offset_1:
-; GFX11-SDAG-FAKE16:       ; %bb.0:
-; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT:    global_load_u8 v0, v[0:1], off offset:1
-; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX12-SDAG-TRUE16-LABEL: global_inst_valu_offset_1:
 ; GFX12-SDAG-TRUE16:       ; %bb.0:
 ; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
@@ -103,12 +89,12 @@ define i8 @global_inst_valu_offset_11bit_max(ptr addrspace(1) %p) {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-GISEL-LABEL: global_inst_valu_offset_11bit_max:
-; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    global_load_u8 v0, v[0:1], off offset:2047
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-LABEL: global_inst_valu_offset_11bit_max:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    global_load_u8 v0, v[0:1], off offset:2047
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-GISEL-LABEL: global_inst_valu_offset_11bit_max:
 ; GFX12-GISEL:       ; %bb.0:
@@ -121,20 +107,6 @@ define i8 @global_inst_valu_offset_11bit_max(ptr addrspace(1) %p) {
 ; GFX12-GISEL-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-TRUE16-LABEL: global_inst_valu_offset_11bit_max:
-; GFX11-SDAG-TRUE16:       ; %bb.0:
-; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    global_load_d16_u8 v0, v[0:1], off offset:2047
-; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-SDAG-FAKE16-LABEL: global_inst_valu_offset_11bit_max:
-; GFX11-SDAG-FAKE16:       ; %bb.0:
-; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT:    global_load_u8 v0, v[0:1], off offset:2047
-; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX12-SDAG-TRUE16-LABEL: global_inst_valu_offset_11bit_max:
 ; GFX12-SDAG-TRUE16:       ; %bb.0:
 ; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
@@ -178,12 +150,12 @@ define i8 @global_inst_valu_offset_12bit_max(ptr addrspace(1) %p) {
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-GISEL-LABEL: global_inst_valu_offset_12bit_max:
-; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    global_load_u8 v0, v[0:1], off offset:4095
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-LABEL: global_inst_valu_offset_12bit_max:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    global_load_u8 v0, v[0:1], off offset:4095
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-GISEL-LABEL: global_inst_valu_offset_12bit_max:
 ; GFX12-GISEL:       ; %bb.0:
@@ -205,20 +177,6 @@ define i8 @global_inst_valu_offset_12bit_max(ptr addrspace(1) %p) {
 ; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-TRUE16-LABEL: global_inst_valu_offset_12bit_max:
-; GFX11-SDAG-TRUE16:       ; %bb.0:
-; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    global_load_d16_u8 v0, v[0:1], off offset:4095
-; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-SDAG-FAKE16-LABEL: global_inst_valu_offset_12bit_max:
-; GFX11-SDAG-FAKE16:       ; %bb.0:
-; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT:    global_load_u8 v0, v[0:1], off offset:4095
-; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX12-SDAG-TRUE16-LABEL: global_inst_valu_offset_12bit_max:
 ; GFX12-SDAG-TRUE16:       ; %bb.0:
 ; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
@@ -303,25 +261,15 @@ define i8 @global_inst_valu_offset_13bit_max(ptr addrspace(1) %p) {
 ; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-TRUE16-LABEL: global_inst_valu_offset_13bit_max:
-; GFX11-SDAG-TRUE16:       ; %bb.0:
-; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0x1000, v0
-; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT:    global_load_d16_u8 v0, v[0:1], off offset:4095
-; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-SDAG-FAKE16-LABEL: global_inst_valu_offset_13bit_max:
-; GFX11-SDAG-FAKE16:       ; %bb.0:
-; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0x1000, v0
-; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-FAKE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX11-SDAG-FAKE16-NEXT:    global_load_u8 v0, v[0:1], off offset:4095
-; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-LABEL: global_inst_valu_offset_13bit_max:
+; GFX11-SDAG:       ; %bb.0:
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0x1000, v0
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX11-SDAG-NEXT:    global_load_u8 v0, v[0:1], off offset:4095
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-SDAG-TRUE16-LABEL: global_inst_valu_offset_13bit_max:
 ; GFX12-SDAG-TRUE16:       ; %bb.0:
@@ -407,25 +355,15 @@ define i8 @global_inst_valu_offset_24bit_max(ptr addrspace(1) %p) {
 ; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-TRUE16-LABEL: global_inst_valu_offset_24bit_max:
-; GFX11-SDAG-TRUE16:       ; %bb.0:
-; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0x7ff000, v0
-; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT:    global_load_d16_u8 v0, v[0:1], off offset:4095
-; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-SDAG-FAKE16-LABEL: global_inst_valu_offset_24bit_max:
-; GFX11-SDAG-FAKE16:       ; %bb.0:
-; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0x7ff000, v0
-; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-FAKE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX11-SDAG-FAKE16-NEXT:    global_load_u8 v0, v[0:1], off offset:4095
-; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-LABEL: global_inst_valu_offset_24bit_max:
+; GFX11-SDAG:       ; %bb.0:
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0x7ff000, v0
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX11-SDAG-NEXT:    global_load_u8 v0, v[0:1], off offset:4095
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-SDAG-TRUE16-LABEL: global_inst_valu_offset_24bit_max:
 ; GFX12-SDAG-TRUE16:       ; %bb.0:
@@ -468,12 +406,12 @@ define i8 @global_inst_valu_offset_neg_11bit_max(ptr addrspace(1) %p) {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-GISEL-LABEL: global_inst_valu_offset_neg_11bit_max:
-; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    global_load_u8 v0, v[0:1], off offset:-2048
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-LABEL: global_inst_valu_offset_neg_11bit_max:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    global_load_u8 v0, v[0:1], off offset:-2048
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-GISEL-LABEL: global_inst_valu_offset_neg_11bit_max:
 ; GFX12-GISEL:       ; %bb.0:
@@ -486,20 +424,6 @@ define i8 @global_inst_valu_offset_neg_11bit_max(ptr addrspace(1) %p) {
 ; GFX12-GISEL-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-TRUE16-LABEL: global_inst_valu_offset_neg_11bit_max:
-; GFX11-SDAG-TRUE16:       ; %bb.0:
-; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    global_load_d16_u8 v0, v[0:1], off offset:-2048
-; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-SDAG-FAKE16-LABEL: global_inst_valu_offset_neg_11bit_max:
-; GFX11-SDAG-FAKE16:       ; %bb.0:
-; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT:    global_load_u8 v0, v[0:1], off offset:-2048
-; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX12-SDAG-TRUE16-LABEL: global_inst_valu_offset_neg_11bit_max:
 ; GFX12-SDAG-TRUE16:       ; %bb.0:
 ; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
@@ -543,12 +467,12 @@ define i8 @global_inst_valu_offset_neg_12bit_max(ptr addrspace(1) %p) {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-GISEL-LABEL: global_inst_valu_offset_neg_12bit_max:
-; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    global_load_u8 v0, v[0:1], off offset:-4096
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-LABEL: global_inst_valu_offset_neg_12bit_max:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    global_load_u8 v0, v[0:1], off offset:-4096
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-GISEL-LABEL: global_inst_valu_offset_neg_12bit_max:
 ; GFX12-GISEL:       ; %bb.0:
@@ -561,20 +485,6 @@ define i8 @global_inst_valu_offset_neg_12bit_max(ptr addrspace(1) %p) {
 ; GFX12-GISEL-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-TRUE16-LABEL: global_inst_valu_offset_neg_12bit_max:
-; GFX11-SDAG-TRUE16:       ; %bb.0:
-; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    global_load_d16_u8 v0, v[0:1], off offset:-4096
-; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-SDAG-FAKE16-LABEL: global_inst_valu_offset_neg_12bit_max:
-; GFX11-SDAG-FAKE16:       ; %bb.0:
-; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT:    global_load_u8 v0, v[0:1], off offset:-4096
-; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX12-SDAG-TRUE16-LABEL: global_inst_valu_offset_neg_12bit_max:
 ; GFX12-SDAG-TRUE16:       ; %bb.0:
 ; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
@@ -620,15 +530,15 @@ define i8 @global_inst_valu_offset_neg_13bit_max(ptr addrspace(1) %p) {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-GISEL-LABEL: global_inst_valu_offset_neg_13bit_max:
-; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, 0xffffe000, v0
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
-; GFX11-GISEL-NEXT:    global_load_u8 v0, v[0:1], off
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-LABEL: global_inst_valu_offset_neg_13bit_max:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, 0xffffe000, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
+; GFX11-NEXT:    global_load_u8 v0, v[0:1], off
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-GISEL-LABEL: global_inst_valu_offset_neg_13bit_max:
 ; GFX12-GISEL:       ; %bb.0:
@@ -641,26 +551,6 @@ define i8 @global_inst_valu_offset_neg_13bit_max(ptr addrspace(1) %p) {
 ; GFX12-GISEL-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-TRUE16-LABEL: global_inst_valu_offset_neg_13bit_max:
-; GFX11-SDAG-TRUE16:       ; %bb.0:
-; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0xffffe000, v0
-; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT:    v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT:    global_load_d16_u8 v0, v[0:1], off
-; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-SDAG-FAKE16-LABEL: global_inst_valu_offset_neg_13bit_max:
-; GFX11-SDAG-FAKE16:       ; %bb.0:
-; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0xffffe000, v0
-; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-FAKE16-NEXT:    v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
-; GFX11-SDAG-FAKE16-NEXT:    global_load_u8 v0, v[0:1], off
-; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX12-SDAG-TRUE16-LABEL: global_inst_valu_offset_neg_13bit_max:
 ; GFX12-SDAG-TRUE16:       ; %bb.0:
 ; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
@@ -706,15 +596,15 @@ define i8 @global_inst_valu_offset_neg_24bit_max(ptr addrspace(1) %p) {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-GISEL-LABEL: global_inst_valu_offset_neg_24bit_max:
-; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, 0xff800000, v0
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
-; GFX11-GISEL-NEXT:    global_load_u8 v0, v[0:1], off
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-LABEL: global_inst_valu_offset_neg_24bit_max:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, 0xff800000, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
+; GFX11-NEXT:    global_load_u8 v0, v[0:1], off
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-GISEL-LABEL: global_inst_valu_offset_neg_24bit_max:
 ; GFX12-GISEL:       ; %bb.0:
@@ -727,26 +617,6 @@ define i8 @global_inst_valu_offset_neg_24bit_max(ptr addrspace(1) %p) {
 ; GFX12-GISEL-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-TRUE16-LABEL: global_inst_valu_offset_neg_24bit_max:
-; GFX11-SDAG-TRUE16:       ; %bb.0:
-; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0xff800000, v0
-; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT:    v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT:    global_load_d16_u8 v0, v[0:1], off
-; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-SDAG-FAKE16-LABEL: global_inst_valu_offset_neg_24bit_max:
-; GFX11-SDAG-FAKE16:       ; %bb.0:
-; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0xff800000, v0
-; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-FAKE16-NEXT:    v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
-; GFX11-SDAG-FAKE16-NEXT:    global_load_u8 v0, v[0:1], off
-; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX12-SDAG-TRUE16-LABEL: global_inst_valu_offset_neg_24bit_max:
 ; GFX12-SDAG-TRUE16:       ; %bb.0:
 ; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
@@ -790,12 +660,12 @@ define i8 @global_inst_valu_offset_2x_11bit_max(ptr addrspace(1) %p) {
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-GISEL-LABEL: global_inst_valu_offset_2x_11bit_max:
-; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    global_load_u8 v0, v[0:1], off offset:4095
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-LABEL: global_inst_valu_offset_2x_11bit_max:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    global_load_u8 v0, v[0:1], off offset:4095
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-GISEL-LABEL: global_inst_valu_offset_2x_11bit_max:
 ; GFX12-GISEL:       ; %bb.0:
@@ -817,20 +687,6 @@ define i8 @global_inst_valu_offset_2x_11bit_max(ptr addrspace(1) %p) {
 ; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-TRUE16-LABEL: global_inst_valu_offset_2x_11bit_max:
-; GFX11-SDAG-TRUE16:       ; %bb.0:
-; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    global_load_d16_u8 v0, v[0:1], off offset:4095
-; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-SDAG-FAKE16-LABEL: global_inst_valu_offset_2x_11bit_max:
-; GFX11-SDAG-FAKE16:       ; %bb.0:
-; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT:    global_load_u8 v0, v[0:1], off offset:4095
-; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX12-SDAG-TRUE16-LABEL: global_inst_valu_offset_2x_11bit_max:
 ; GFX12-SDAG-TRUE16:       ; %bb.0:
 ; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
@@ -915,25 +771,15 @@ define i8 @global_inst_valu_offset_2x_12bit_max(ptr addrspace(1) %p) {
 ; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-TRUE16-LABEL: global_inst_valu_offset_2x_12bit_max:
-; GFX11-SDAG-TRUE16:       ; %bb.0:
-; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0x1000, v0
-; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT:    global_load_d16_u8 v0, v[0:1], off offset:4095
-; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-SDAG-FAKE16-LABEL: global_inst_valu_offset_2x_12bit_max:
-; GFX11-SDAG-FAKE16:       ; %bb.0:
-; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0x1000, v0
-; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-FAKE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX11-SDAG-FAKE16-NEXT:    global_load_u8 v0, v[0:1], off offset:4095
-; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-LABEL: global_inst_valu_offset_2x_12bit_max:
+; GFX11-SDAG:       ; %bb.0:
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0x1000, v0
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX11-SDAG-NEXT:    global_load_u8 v0, v[0:1], off offset:4095
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-SDAG-TRUE16-LABEL: global_inst_valu_offset_2x_12bit_max:
 ; GFX12-SDAG-TRUE16:       ; %bb.0:
@@ -1019,25 +865,15 @@ define i8 @global_inst_valu_offset_2x_13bit_max(ptr addrspace(1) %p) {
 ; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-TRUE16-LABEL: global_inst_valu_offset_2x_13bit_max:
-; GFX11-SDAG-TRUE16:       ; %bb.0:
-; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0x3000, v0
-; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT:    global_load_d16_u8 v0, v[0:1], off offset:4095
-; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-SDAG-FAKE16-LABEL: global_inst_valu_offset_2x_13bit_max:
-; GFX11-SDAG-FAKE16:       ; %bb.0:
-; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0x3000, v0
-; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-FAKE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX11-SDAG-FAKE16-NEXT:    global_load_u8 v0, v[0:1], off offset:4095
-; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-LABEL: global_inst_valu_offset_2x_13bit_max:
+; GFX11-SDAG:       ; %bb.0:
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0x3000, v0
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX11-SDAG-NEXT:    global_load_u8 v0, v[0:1], off offset:4095
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-SDAG-TRUE16-LABEL: global_inst_valu_offset_2x_13bit_max:
 ; GFX12-SDAG-TRUE16:       ; %bb.0:
@@ -1126,25 +962,15 @@ define i8 @global_inst_valu_offset_2x_24bit_max(ptr addrspace(1) %p) {
 ; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-TRUE16-LABEL: global_inst_valu_offset_2x_24bit_max:
-; GFX11-SDAG-TRUE16:       ; %bb.0:
-; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0xfff000, v0
-; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT:    global_load_d16_u8 v0, v[0:1], off offset:4094
-; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-SDAG-FAKE16-LABEL: global_inst_valu_offset_2x_24bit_max:
-; GFX11-SDAG-FAKE16:       ; %bb.0:
-; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0xfff000, v0
-; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-FAKE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX11-SDAG-FAKE16-NEXT:    global_load_u8 v0, v[0:1], off offset:4094
-; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-LABEL: global_inst_valu_offset_2x_24bit_max:
+; GFX11-SDAG:       ; %bb.0:
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0xfff000, v0
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX11-SDAG-NEXT:    global_load_u8 v0, v[0:1], off offset:4094
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-SDAG-TRUE16-LABEL: global_inst_valu_offset_2x_24bit_max:
 ; GFX12-SDAG-TRUE16:       ; %bb.0:
@@ -1195,12 +1021,12 @@ define i8 @global_inst_valu_offset_2x_neg_11bit_max(ptr addrspace(1) %p) {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-GISEL-LABEL: global_inst_valu_offset_2x_neg_11bit_max:
-; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    global_load_u8 v0, v[0:1], off offset:-4096
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-LABEL: global_inst_valu_offset_2x_neg_11bit_max:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    global_load_u8 v0, v[0:1], off offset:-4096
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-GISEL-LABEL: global_inst_valu_offset_2x_neg_11bit_max:
 ; GFX12-GISEL:       ; %bb.0:
@@ -1213,20 +1039,6 @@ define i8 @global_inst_valu_offset_2x_neg_11bit_max(ptr addrspace(1) %p) {
 ; GFX12-GISEL-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-TRUE16-LABEL: global_inst_valu_offset_2x_neg_11bit_max:
-; GFX11-SDAG-TRUE16:       ; %bb.0:
-; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    global_load_d16_u8 v0, v[0:1], off offset:-4096
-; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-SDAG-FAKE16-LABEL: global_inst_valu_offset_2x_neg_11bit_max:
-; GFX11-SDAG-FAKE16:       ; %bb.0:
-; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT:    global_load_u8 v0, v[0:1], off offset:-4096
-; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX12-SDAG-TRUE16-LABEL: global_inst_valu_offset_2x_neg_11bit_max:
 ; GFX12-SDAG-TRUE16:       ; %bb.0:
 ; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
@@ -1272,15 +1084,15 @@ define i8 @global_inst_valu_offset_2x_neg_12bit_max(ptr addrspace(1) %p) {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-GISEL-LABEL: global_inst_valu_offset_2x_neg_12bit_max:
-; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, 0xffffe000, v0
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
-; GFX11-GISEL-NEXT:    global_load_u8 v0, v[0:1], off
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-LABEL: global_inst_valu_offset_2x_neg_12bit_max:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, 0xffffe000, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
+; GFX11-NEXT:    global_load_u8 v0, v[0:1], off
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-GISEL-LABEL: global_inst_valu_offset_2x_neg_12bit_max:
 ; GFX12-GISEL:       ; %bb.0:
@@ -1293,26 +1105,6 @@ define i8 @global_inst_valu_offset_2x_neg_12bit_max(ptr addrspace(1) %p) {
 ; GFX12-GISEL-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-TRUE16-LABEL: global_inst_valu_offset_2x_neg_12bit_max:
-; GFX11-SDAG-TRUE16:       ; %bb.0:
-; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0xffffe000, v0
-; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT:    v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT:    global_load_d16_u8 v0, v[0:1], off
-; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-SDAG-FAKE16-LABEL: global_inst_valu_offset_2x_neg_12bit_max:
-; GFX11-SDAG-FAKE16:       ; %bb.0:
-; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0xffffe000, v0
-; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-FAKE16-NEXT:    v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
-; GFX11-SDAG-FAKE16-NEXT:    global_load_u8 v0, v[0:1], off
-; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX12-SDAG-TRUE16-LABEL: global_inst_valu_offset_2x_neg_12bit_max:
 ; GFX12-SDAG-TRUE16:       ; %bb.0:
 ; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
@@ -1358,15 +1150,15 @@ define i8 @global_inst_valu_offset_2x_neg_13bit_max(ptr addrspace(1) %p) {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-GISEL-LABEL: global_inst_valu_offset_2x_neg_13bit_max:
-; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, 0xffffc000, v0
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
-; GFX11-GISEL-NEXT:    global_load_u8 v0, v[0:1], off
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-LABEL: global_inst_valu_offset_2x_neg_13bit_max:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, 0xffffc000, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
+; GFX11-NEXT:    global_load_u8 v0, v[0:1], off
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-GISEL-LABEL: global_inst_valu_offset_2x_neg_13bit_max:
 ; GFX12-GISEL:       ; %bb.0:
@@ -1379,26 +1171,6 @@ define i8 @global_inst_valu_offset_2x_neg_13bit_max(ptr addrspace(1) %p) {
 ; GFX12-GISEL-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-TRUE16-LABEL: global_inst_valu_offset_2x_neg_13bit_max:
-; GFX11-SDAG-TRUE16:       ; %bb.0:
-; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0xffffc000, v0
-; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT:    v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT:    global_load_d16_u8 v0, v[0:1], off
-; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-SDAG-FAKE16-LABEL: global_inst_valu_offset_2x_neg_13bit_max:
-; GFX11-SDAG-FAKE16:       ; %bb.0:
-; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0xffffc000, v0
-; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-FAKE16-NEXT:    v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
-; GFX11-SDAG-FAKE16-NEXT:    global_load_u8 v0, v[0:1], off
-; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX12-SDAG-TRUE16-LABEL: global_inst_valu_offset_2x_neg_13bit_max:
 ; GFX12-SDAG-TRUE16:       ; %bb.0:
 ; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
@@ -1486,25 +1258,15 @@ define i8 @global_inst_valu_offset_2x_neg_24bit_max(ptr addrspace(1) %p) {
 ; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-TRUE16-LABEL: global_inst_valu_offset_2x_neg_24bit_max:
-; GFX11-SDAG-TRUE16:       ; %bb.0:
-; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0xff001000, v0
-; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT:    v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT:    global_load_d16_u8 v0, v[0:1], off offset:-4095
-; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-SDAG-FAKE16-LABEL: global_inst_valu_offset_2x_neg_24bit_max:
-; GFX11-SDAG-FAKE16:       ; %bb.0:
-; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0xff001000, v0
-; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-FAKE16-NEXT:    v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
-; GFX11-SDAG-FAKE16-NEXT:    global_load_u8 v0, v[0:1], off offset:-4095
-; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-LABEL: global_inst_valu_offset_2x_neg_24bit_max:
+; GFX11-SDAG:       ; %bb.0:
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0xff001000, v0
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
+; GFX11-SDAG-NEXT:    global_load_u8 v0, v[0:1], off offset:-4095
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-SDAG-TRUE16-LABEL: global_inst_valu_offset_2x_neg_24bit_max:
 ; GFX12-SDAG-TRUE16:       ; %bb.0:
@@ -1601,25 +1363,15 @@ define i8 @global_inst_valu_offset_64bit_11bit_split0(ptr addrspace(1) %p) {
 ; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-TRUE16-LABEL: global_inst_valu_offset_64bit_11bit_split0:
-; GFX11-SDAG-TRUE16:       ; %bb.0:
-; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0, v0
-; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, v1, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT:    global_load_d16_u8 v0, v[0:1], off offset:2047
-; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-SDAG-FAKE16-LABEL: global_inst_valu_offset_64bit_11bit_split0:
-; GFX11-SDAG-FAKE16:       ; %bb.0:
-; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0, v0
-; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-FAKE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, v1, vcc_lo
-; GFX11-SDAG-FAKE16-NEXT:    global_load_u8 v0, v[0:1], off offset:2047
-; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-LABEL: global_inst_valu_offset_64bit_11bit_split0:
+; GFX11-SDAG:       ; %bb.0:
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0, v0
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, v1, vcc_lo
+; GFX11-SDAG-NEXT:    global_load_u8 v0, v[0:1], off offset:2047
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-SDAG-TRUE16-LABEL: global_inst_valu_offset_64bit_11bit_split0:
 ; GFX12-SDAG-TRUE16:       ; %bb.0:
@@ -1706,25 +1458,15 @@ define i8 @global_inst_valu_offset_64bit_11bit_split1(ptr addrspace(1) %p) {
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-TRUE16-LABEL: global_inst_valu_offset_64bit_11bit_split1:
-; GFX11-SDAG-TRUE16:       ; %bb.0:
-; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0, v0
-; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, v1, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT:    global_load_d16_u8 v0, v[0:1], off offset:2048
-; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-SDAG-FAKE16-LABEL: global_inst_valu_offset_64bit_11bit_split1:
-; GFX11-SDAG-FAKE16:       ; %bb.0:
-; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0, v0
-; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-FAKE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, v1, vcc_lo
-; GFX11-SDAG-FAKE16-NEXT:    global_load_u8 v0, v[0:1], off offset:2048
-; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-LABEL: global_inst_valu_offset_64bit_11bit_split1:
+; GFX11-SDAG:       ; %bb.0:
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0, v0
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, v1, vcc_lo
+; GFX11-SDAG-NEXT:    global_load_u8 v0, v[0:1], off offset:2048
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-SDAG-TRUE16-LABEL: global_inst_valu_offset_64bit_11bit_split1:
 ; GFX12-SDAG-TRUE16:       ; %bb.0:
@@ -1820,25 +1562,15 @@ define i8 @global_inst_valu_offset_64bit_12bit_split0(ptr addrspace(1) %p) {
 ; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-TRUE16-LABEL: global_inst_valu_offset_64bit_12bit_split0:
-; GFX11-SDAG-TRUE16:       ; %bb.0:
-; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0, v0
-; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, v1, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT:    global_load_d16_u8 v0, v[0:1], off offset:4095
-; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-SDAG-FAKE16-LABEL: global_inst_valu_offset_64bit_12bit_split0:
-; GFX11-SDAG-FAKE16:       ; %bb.0:
-; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0, v0
-; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-FAKE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, v1, vcc_lo
-; GFX11-SDAG-FAKE16-NEXT:    global_load_u8 v0, v[0:1], off offset:4095
-; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-LABEL: global_inst_valu_offset_64bit_12bit_split0:
+; GFX11-SDAG:       ; %bb.0:
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0, v0
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, v1, vcc_lo
+; GFX11-SDAG-NEXT:    global_load_u8 v0, v[0:1], off offset:4095
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-SDAG-TRUE16-LABEL: global_inst_valu_offset_64bit_12bit_split0:
 ; GFX12-SDAG-TRUE16:       ; %bb.0:
@@ -1892,15 +1624,15 @@ define i8 @global_inst_valu_offset_64bit_12bit_split1(ptr addrspace(1) %p) {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-GISEL-LABEL: global_inst_valu_offset_64bit_12bit_split1:
-; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, 0x1000, v0
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, v1, vcc_lo
-; GFX11-GISEL-NEXT:    global_load_u8 v0, v[0:1], off
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-LABEL: global_inst_valu_offset_64bit_12bit_split1:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, 0x1000, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, v1, vcc_lo
+; GFX11-NEXT:    global_load_u8 v0, v[0:1], off
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-GISEL-LABEL: global_inst_valu_offset_64bit_12bit_split1:
 ; GFX12-GISEL:       ; %bb.0:
@@ -1916,26 +1648,6 @@ define i8 @global_inst_valu_offset_64bit_12bit_split1(ptr addrspace(1) %p) {
 ; GFX12-GISEL-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-TRUE16-LABEL: global_inst_valu_offset_64bit_12bit_split1:
-; GFX11-SDAG-TRUE16:       ; %bb.0:
-; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0x1000, v0
-; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, v1, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT:    global_load_d16_u8 v0, v[0:1], off
-; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-SDAG-FAKE16-LABEL: global_inst_valu_offset_64bit_12bit_split1:
-; GFX11-SDAG-FAKE16:       ; %bb.0:
-; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0x1000, v0
-; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-FAKE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, v1, vcc_lo
-; GFX11-SDAG-FAKE16-NEXT:    global_load_u8 v0, v[0:1], off
-; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX12-SDAG-TRUE16-LABEL: global_inst_valu_offset_64bit_12bit_split1:
 ; GFX12-SDAG-TRUE16:       ; %bb.0:
 ; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
@@ -2030,25 +1742,15 @@ define i8 @global_inst_valu_offset_64bit_13bit_split0(ptr addrspace(1) %p) {
 ; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-TRUE16-LABEL: global_inst_valu_offset_64bit_13bit_split0:
-; GFX11-SDAG-TRUE16:       ; %bb.0:
-; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0x1000, v0
-; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, v1, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT:    global_load_d16_u8 v0, v[0:1], off offset:4095
-; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-SDAG-FAKE16-LABEL: global_inst_valu_offset_64bit_13bit_split0:
-; GFX11-SDAG-FAKE16:       ; %bb.0:
-; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0x1000, v0
-; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-FAKE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, v1, vcc_lo
-; GFX11-SDAG-FAKE16-NEXT:    global_load_u8 v0, v[0:1], off offset:4095
-; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-LABEL: global_inst_valu_offset_64bit_13bit_split0:
+; GFX11-SDAG:       ; %bb.0:
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0x1000, v0
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, v1, vcc_lo
+; GFX11-SDAG-NEXT:    global_load_u8 v0, v[0:1], off offset:4095
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-SDAG-TRUE16-LABEL: global_inst_valu_offset_64bit_13bit_split0:
 ; GFX12-SDAG-TRUE16:       ; %bb.0:
@@ -2102,15 +1804,15 @@ define i8 @global_inst_valu_offset_64bit_13bit_split1(ptr addrspace(1) %p) {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-GISEL-LABEL: global_inst_valu_offset_64bit_13bit_split1:
-; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, 0x2000, v0
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, v1, vcc_lo
-; GFX11-GISEL-NEXT:    global_load_u8 v0, v[0:1], off
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-LABEL: global_inst_valu_offset_64bit_13bit_split1:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, 0x2000, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, v1, vcc_lo
+; GFX11-NEXT:    global_load_u8 v0, v[0:1], off
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-GISEL-LABEL: global_inst_valu_offset_64bit_13bit_split1:
 ; GFX12-GISEL:       ; %bb.0:
@@ -2126,26 +1828,6 @@ define i8 @global_inst_valu_offset_64bit_13bit_split1(ptr addrspace(1) %p) {
 ; GFX12-GISEL-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-TRUE16-LABEL: global_inst_valu_offset_64bit_13bit_split1:
-; GFX11-SDAG-TRUE16:       ; %bb.0:
-; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0x2000, v0
-; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, v1, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT:    global_load_d16_u8 v0, v[0:1], off
-; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-SDAG-FAKE16-LABEL: global_inst_valu_offset_64bit_13bit_split1:
-; GFX11-SDAG-FAKE16:       ; %bb.0:
-; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0x2000, v0
-; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-FAKE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, v1, vcc_lo
-; GFX11-SDAG-FAKE16-NEXT:    global_load_u8 v0, v[0:1], off
-; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX12-SDAG-TRUE16-LABEL: global_inst_valu_offset_64bit_13bit_split1:
 ; GFX12-SDAG-TRUE16:       ; %bb.0:
 ; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
@@ -2242,25 +1924,15 @@ define i8 @global_inst_valu_offset_64bit_11bit_neg_high_split0(ptr addrspace(1)
 ; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-TRUE16-LABEL: global_inst_valu_offset_64bit_11bit_neg_high_split0:
-; GFX11-SDAG-TRUE16:       ; %bb.0:
-; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0x1000, v0
-; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT:    global_load_d16_u8 v0, v[0:1], off offset:-2049
-; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-SDAG-FAKE16-LABEL: global_inst_valu_offset_64bit_11bit_neg_high_split0:
-; GFX11-SDAG-FAKE16:       ; %bb.0:
-; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0x1000, v0
-; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-FAKE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
-; GFX11-SDAG-FAKE16-NEXT:    global_load_u8 v0, v[0:1], off offset:-2049
-; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-LABEL: global_inst_valu_offset_64bit_11bit_neg_high_split0:
+; GFX11-SDAG:       ; %bb.0:
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0x1000, v0
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
+; GFX11-SDAG-NEXT:    global_load_u8 v0, v[0:1], off offset:-2049
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-SDAG-TRUE16-LABEL: global_inst_valu_offset_64bit_11bit_neg_high_split0:
 ; GFX12-SDAG-TRUE16:       ; %bb.0:
@@ -2349,25 +2021,15 @@ define i8 @global_inst_valu_offset_64bit_11bit_neg_high_split1(ptr addrspace(1)
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-TRUE16-LABEL: global_inst_valu_offset_64bit_11bit_neg_high_split1:
-; GFX11-SDAG-TRUE16:       ; %bb.0:
-; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0x1000, v0
-; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT:    global_load_d16_u8 v0, v[0:1], off offset:-2048
-; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-SDAG-FAKE16-LABEL: global_inst_valu_offset_64bit_11bit_neg_high_split1:
-; GFX11-SDAG-FAKE16:       ; %bb.0:
-; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0x1000, v0
-; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-FAKE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
-; GFX11-SDAG-FAKE16-NEXT:    global_load_u8 v0, v[0:1], off offset:-2048
-; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-LABEL: global_inst_valu_offset_64bit_11bit_neg_high_split1:
+; GFX11-SDAG:       ; %bb.0:
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0x1000, v0
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
+; GFX11-SDAG-NEXT:    global_load_u8 v0, v[0:1], off offset:-2048
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-SDAG-TRUE16-LABEL: global_inst_valu_offset_64bit_11bit_neg_high_split1:
 ; GFX12-SDAG-TRUE16:       ; %bb.0:
@@ -2465,25 +2127,15 @@ define i8 @global_inst_valu_offset_64bit_12bit_neg_high_split0(ptr addrspace(1)
 ; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-TRUE16-LABEL: global_inst_valu_offset_64bit_12bit_neg_high_split0:
-; GFX11-SDAG-TRUE16:       ; %bb.0:
-; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0x1000, v0
-; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT:    global_load_d16_u8 v0, v[0:1], off offset:-1
-; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-SDAG-FAKE16-LABEL: global_inst_valu_offset_64bit_12bit_neg_high_split0:
-; GFX11-SDAG-FAKE16:       ; %bb.0:
-; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0x1000, v0
-; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-FAKE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
-; GFX11-SDAG-FAKE16-NEXT:    global_load_u8 v0, v[0:1], off offset:-1
-; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-LABEL: global_inst_valu_offset_64bit_12bit_neg_high_split0:
+; GFX11-SDAG:       ; %bb.0:
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0x1000, v0
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
+; GFX11-SDAG-NEXT:    global_load_u8 v0, v[0:1], off offset:-1
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-SDAG-TRUE16-LABEL: global_inst_valu_offset_64bit_12bit_neg_high_split0:
 ; GFX12-SDAG-TRUE16:       ; %bb.0:
@@ -2538,15 +2190,15 @@ define i8 @global_inst_valu_offset_64bit_12bit_neg_high_split1(ptr addrspace(1)
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-GISEL-LABEL: global_inst_valu_offset_64bit_12bit_neg_high_split1:
-; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, 0x1000, v0
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
-; GFX11-GISEL-NEXT:    global_load_u8 v0, v[0:1], off
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-LABEL: global_inst_valu_offset_64bit_12bit_neg_high_split1:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, 0x1000, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
+; GFX11-NEXT:    global_load_u8 v0, v[0:1], off
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-GISEL-LABEL: global_inst_valu_offset_64bit_12bit_neg_high_split1:
 ; GFX12-GISEL:       ; %bb.0:
@@ -2572,26 +2224,6 @@ define i8 @global_inst_valu_offset_64bit_12bit_neg_high_split1(ptr addrspace(1)
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-TRUE16-LABEL: global_inst_valu_offset_64bit_12bit_neg_high_split1:
-; GFX11-SDAG-TRUE16:       ; %bb.0:
-; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0x1000, v0
-; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT:    global_load_d16_u8 v0, v[0:1], off
-; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-SDAG-FAKE16-LABEL: global_inst_valu_offset_64bit_12bit_neg_high_split1:
-; GFX11-SDAG-FAKE16:       ; %bb.0:
-; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0x1000, v0
-; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-FAKE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
-; GFX11-SDAG-FAKE16-NEXT:    global_load_u8 v0, v[0:1], off
-; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX12-SDAG-TRUE16-LABEL: global_inst_valu_offset_64bit_12bit_neg_high_split1:
 ; GFX12-SDAG-TRUE16:       ; %bb.0:
 ; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
@@ -2688,25 +2320,15 @@ define i8 @global_inst_valu_offset_64bit_13bit_neg_high_split0(ptr addrspace(1)
 ; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-TRUE16-LABEL: global_inst_valu_offset_64bit_13bit_neg_high_split0:
-; GFX11-SDAG-TRUE16:       ; %bb.0:
-; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0x2000, v0
-; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT:    global_load_d16_u8 v0, v[0:1], off offset:-1
-; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-SDAG-FAKE16-LABEL: global_inst_valu_offset_64bit_13bit_neg_high_split0:
-; GFX11-SDAG-FAKE16:       ; %bb.0:
-; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0x2000, v0
-; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-FAKE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
-; GFX11-SDAG-FAKE16-NEXT:    global_load_u8 v0, v[0:1], off offset:-1
-; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-LABEL: global_inst_valu_offset_64bit_13bit_neg_high_split0:
+; GFX11-SDAG:       ; %bb.0:
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0x2000, v0
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
+; GFX11-SDAG-NEXT:    global_load_u8 v0, v[0:1], off offset:-1
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-SDAG-TRUE16-LABEL: global_inst_valu_offset_64bit_13bit_neg_high_split0:
 ; GFX12-SDAG-TRUE16:       ; %bb.0:
@@ -2761,15 +2383,15 @@ define i8 @global_inst_valu_offset_64bit_13bit_neg_high_split1(ptr addrspace(1)
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-GISEL-LABEL: global_inst_valu_offset_64bit_13bit_neg_high_split1:
-; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, 0x2000, v0
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
-; GFX11-GISEL-NEXT:    global_load_u8 v0, v[0:1], off
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-LABEL: global_inst_valu_offset_64bit_13bit_neg_high_split1:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, 0x2000, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
+; GFX11-NEXT:    global_load_u8 v0, v[0:1], off
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-GISEL-LABEL: global_inst_valu_offset_64bit_13bit_neg_high_split1:
 ; GFX12-GISEL:       ; %bb.0:
@@ -2795,26 +2417,6 @@ define i8 @global_inst_valu_offset_64bit_13bit_neg_high_split1(ptr addrspace(1)
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-TRUE16-LABEL: global_inst_valu_offset_64bit_13bit_neg_high_split1:
-; GFX11-SDAG-TRUE16:       ; %bb.0:
-; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0x2000, v0
-; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT:    global_load_d16_u8 v0, v[0:1], off
-; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-SDAG-FAKE16-LABEL: global_inst_valu_offset_64bit_13bit_neg_high_split1:
-; GFX11-SDAG-FAKE16:       ; %bb.0:
-; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0x2000, v0
-; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-FAKE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
-; GFX11-SDAG-FAKE16-NEXT:    global_load_u8 v0, v[0:1], off
-; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX12-SDAG-TRUE16-LABEL: global_inst_valu_offset_64bit_13bit_neg_high_split1:
 ; GFX12-SDAG-TRUE16:       ; %bb.0:
 ; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
@@ -2868,15 +2470,15 @@ define amdgpu_kernel void @global_inst_salu_offset_1(ptr addrspace(1) %p) {
 ; GFX10-NEXT:    global_store_byte v[0:1], v0, off
 ; GFX10-NEXT:    s_endpgm
 ;
-; GFX11-GISEL-LABEL: global_inst_salu_offset_1:
-; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-GISEL-NEXT:    v_mov_b32_e32 v0, 0
-; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT:    global_load_u8 v0, v0, s[0:1] offset:1 glc dlc
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-GISEL-NEXT:    global_store_b8 v[0:1], v0, off
-; GFX11-GISEL-NEXT:    s_endpgm
+; GFX11-LABEL: global_inst_salu_offset_1:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    global_load_u8 v0, v0, s[0:1] offset:1 glc dlc
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    global_store_b8 v[0:1], v0, off
+; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-GISEL-LABEL: global_inst_salu_offset_1:
 ; GFX12-GISEL:       ; %bb.0:
@@ -2888,26 +2490,6 @@ define amdgpu_kernel void @global_inst_salu_offset_1(ptr addrspace(1) %p) {
 ; GFX12-GISEL-NEXT:    global_store_b8 v[0:1], v0, off
 ; GFX12-GISEL-NEXT:    s_endpgm
 ;
-; GFX11-SDAG-TRUE16-LABEL: global_inst_salu_offset_1:
-; GFX11-SDAG-TRUE16:       ; %bb.0:
-; GFX11-SDAG-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-SDAG-TRUE16-NEXT:    v_mov_b32_e32 v0, 0
-; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    global_load_d16_u8 v0, v0, s[0:1] offset:1 glc dlc
-; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    global_store_b8 v[0:1], v0, off
-; GFX11-SDAG-TRUE16-NEXT:    s_endpgm
-;
-; GFX11-SDAG-FAKE16-LABEL: global_inst_salu_offset_1:
-; GFX11-SDAG-FAKE16:       ; %bb.0:
-; GFX11-SDAG-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-SDAG-FAKE16-NEXT:    v_mov_b32_e32 v0, 0
-; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT:    global_load_u8 v0, v0, s[0:1] offset:1 glc dlc
-; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT:    global_store_b8 v[0:1], v0, off
-; GFX11-SDAG-FAKE16-NEXT:    s_endpgm
-;
 ; GFX12-SDAG-TRUE16-LABEL: global_inst_salu_offset_1:
 ; GFX12-SDAG-TRUE16:       ; %bb.0:
 ; GFX12-SDAG-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
@@ -2954,15 +2536,15 @@ define amdgpu_kernel void @global_inst_salu_offset_11bit_max(ptr addrspace(1) %p
 ; GFX10-NEXT:    global_store_byte v[0:1], v0, off
 ; GFX10-NEXT:    s_endpgm
 ;
-; GFX11-GISEL-LABEL: global_inst_salu_offset_11bit_max:
-; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-GISEL-NEXT:    v_mov_b32_e32 v0, 0
-; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT:    global_load_u8 v0, v0, s[0:1] offset:2047 glc dlc
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-GISEL-NEXT:    global_store_b8 v[0:1], v0, off
-; GFX11-GISEL-NEXT:    s_endpgm
+; GFX11-LABEL: global_inst_salu_offset_11bit_max:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    global_load_u8 v0, v0, s[0:1] offset:2047 glc dlc
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    global_store_b8 v[0:1], v0, off
+; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-GISEL-LABEL: global_inst_salu_offset_11bit_max:
 ; GFX12-GISEL:       ; %bb.0:
@@ -2974,26 +2556,6 @@ define amdgpu_kernel void @global_inst_salu_offset_11bit_max(ptr addrspace(1) %p
 ; GFX12-GISEL-NEXT:    global_store_b8 v[0:1], v0, off
 ; GFX12-GISEL-NEXT:    s_endpgm
 ;
-; GFX11-SDAG-TRUE16-LABEL: global_inst_salu_offset_11bit_max:
-; GFX11-SDAG-TRUE16:       ; %bb.0:
-; GFX11-SDAG-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-SDAG-TRUE16-NEXT:    v_mov_b32_e32 v0, 0
-; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    global_load_d16_u8 v0, v0, s[0:1] offset:2047 glc dlc
-; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    global_store_b8 v[0:1], v0, off
-; GFX11-SDAG-TRUE16-NEXT:    s_endpgm
-;
-; GFX11-SDAG-FAKE16-LABEL: global_inst_salu_offset_11bit_max:
-; GFX11-SDAG-FAKE16:       ; %bb.0:
-; GFX11-SDAG-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-SDAG-FAKE16-NEXT:    v_mov_b32_e32 v0, 0
-; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT:    global_load_u8 v0, v0, s[0:1] offset:2047 glc dlc
-; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT:    global_store_b8 v[0:1], v0, off
-; GFX11-SDAG-FAKE16-NEXT:    s_endpgm
-;
 ; GFX12-SDAG-TRUE16-LABEL: global_inst_salu_offset_11bit_max:
 ; GFX12-SDAG-TRUE16:       ; %bb.0:
 ; GFX12-SDAG-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
@@ -3040,15 +2602,15 @@ define amdgpu_kernel void @global_inst_salu_offset_12bit_max(ptr addrspace(1) %p
 ; GFX10-NEXT:    global_store_byte v[0:1], v0, off
 ; GFX10-NEXT:    s_endpgm
 ;
-; GFX11-GISEL-LABEL: global_inst_salu_offset_12bit_max:
-; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-GISEL-NEXT:    v_mov_b32_e32 v0, 0
-; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT:    global_load_u8 v0, v0, s[0:1] offset:4095 glc dlc
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-GISEL-NEXT:    global_store_b8 v[0:1], v0, off
-; GFX11-GISEL-NEXT:    s_endpgm
+; GFX11-LABEL: global_inst_salu_offset_12bit_max:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    global_load_u8 v0, v0, s[0:1] offset:4095 glc dlc
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    global_store_b8 v[0:1], v0, off
+; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-GISEL-LABEL: global_inst_salu_offset_12bit_max:
 ; GFX12-GISEL:       ; %bb.0:
@@ -3060,26 +2622,6 @@ define amdgpu_kernel void @global_inst_salu_offset_12bit_max(ptr addrspace(1) %p
 ; GFX12-GISEL-NEXT:    global_store_b8 v[0:1], v0, off
 ; GFX12-GISEL-NEXT:    s_endpgm
 ;
-; GFX11-SDAG-TRUE16-LABEL: global_inst_salu_offset_12bit_max:
-; GFX11-SDAG-TRUE16:       ; %bb.0:
-; GFX11-SDAG-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-SDAG-TRUE16-NEXT:    v_mov_b32_e32 v0, 0
-; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    global_load_d16_u8 v0, v0, s[0:1] offset:4095 glc dlc
-; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    global_store_b8 v[0:1], v0, off
-; GFX11-SDAG-TRUE16-NEXT:    s_endpgm
-;
-; GFX11-SDAG-FAKE16-LABEL: global_inst_salu_offset_12bit_max:
-; GFX11-SDAG-FAKE16:       ; %bb.0:
-; GFX11-SDAG-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-SDAG-FAKE16-NEXT:    v_mov_b32_e32 v0, 0
-; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT:    global_load_u8 v0, v0, s[0:1] offset:4095 glc dlc
-; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT:    global_store_b8 v[0:1], v0, off
-; GFX11-SDAG-FAKE16-NEXT:    s_endpgm
-;
 ; GFX12-SDAG-TRUE16-LABEL: global_inst_salu_offset_12bit_max:
 ; GFX12-SDAG-TRUE16:       ; %bb.0:
 ; GFX12-SDAG-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
@@ -3126,15 +2668,15 @@ define amdgpu_kernel void @global_inst_salu_offset_13bit_max(ptr addrspace(1) %p
 ; GFX10-NEXT:    global_store_byte v[0:1], v0, off
 ; GFX10-NEXT:    s_endpgm
 ;
-; GFX11-GISEL-LABEL: global_inst_salu_offset_13bit_max:
-; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-GISEL-NEXT:    v_mov_b32_e32 v0, 0x1000
-; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT:    global_load_u8 v0, v0, s[0:1] offset:4095 glc dlc
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-GISEL-NEXT:    global_store_b8 v[0:1], v0, off
-; GFX11-GISEL-NEXT:    s_endpgm
+; GFX11-LABEL: global_inst_salu_offset_13bit_max:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-NEXT:    v_mov_b32_e32 v0, 0x1000
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    global_load_u8 v0, v0, s[0:1] offset:4095 glc dlc
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    global_store_b8 v[0:1], v0, off
+; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-GISEL-LABEL: global_inst_salu_offset_13bit_max:
 ; GFX12-GISEL:       ; %bb.0:
@@ -3146,26 +2688,6 @@ define amdgpu_kernel void @global_inst_salu_offset_13bit_max(ptr addrspace(1) %p
 ; GFX12-GISEL-NEXT:    global_store_b8 v[0:1], v0, off
 ; GFX12-GISEL-NEXT:    s_endpgm
 ;
-; GFX11-SDAG-TRUE16-LABEL: global_inst_salu_offset_13bit_max:
-; GFX11-SDAG-TRUE16:       ; %bb.0:
-; GFX11-SDAG-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-SDAG-TRUE16-NEXT:    v_mov_b32_e32 v0, 0x1000
-; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    global_load_d16_u8 v0, v0, s[0:1] offset:4095 glc dlc
-; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    global_store_b8 v[0:1], v0, off
-; GFX11-SDAG-TRUE16-NEXT:    s_endpgm
-;
-; GFX11-SDAG-FAKE16-LABEL: global_inst_salu_offset_13bit_max:
-; GFX11-SDAG-FAKE16:       ; %bb.0:
-; GFX11-SDAG-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-SDAG-FAKE16-NEXT:    v_mov_b32_e32 v0, 0x1000
-; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT:    global_load_u8 v0, v0, s[0:1] offset:4095 glc dlc
-; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT:    global_store_b8 v[0:1], v0, off
-; GFX11-SDAG-FAKE16-NEXT:    s_endpgm
-;
 ; GFX12-SDAG-TRUE16-LABEL: global_inst_salu_offset_13bit_max:
 ; GFX12-SDAG-TRUE16:       ; %bb.0:
 ; GFX12-SDAG-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
@@ -3212,15 +2734,15 @@ define amdgpu_kernel void @global_inst_salu_offset_neg_11bit_max(ptr addrspace(1
 ; GFX10-NEXT:    global_store_byte v[0:1], v0, off
 ; GFX10-NEXT:    s_endpgm
 ;
-; GFX11-GISEL-LABEL: global_inst_salu_offset_neg_11bit_max:
-; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-GISEL-NEXT:    v_mov_b32_e32 v0, 0
-; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT:    global_load_u8 v0, v0, s[0:1] offset:-2048 glc dlc
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-GISEL-NEXT:    global_store_b8 v[0:1], v0, off
-; GFX11-GISEL-NEXT:    s_endpgm
+; GFX11-LABEL: global_inst_salu_offset_neg_11bit_max:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    global_load_u8 v0, v0, s[0:1] offset:-2048 glc dlc
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    global_store_b8 v[0:1], v0, off
+; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-GISEL-LABEL: global_inst_salu_offset_neg_11bit_max:
 ; GFX12-GISEL:       ; %bb.0:
@@ -3232,26 +2754,6 @@ define amdgpu_kernel void @global_inst_salu_offset_neg_11bit_max(ptr addrspace(1
 ; GFX12-GISEL-NEXT:    global_store_b8 v[0:1], v0, off
 ; GFX12-GISEL-NEXT:    s_endpgm
 ;
-; GFX11-SDAG-TRUE16-LABEL: global_inst_salu_offset_neg_11bit_max:
-; GFX11-SDAG-TRUE16:       ; %bb.0:
-; GFX11-SDAG-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-SDAG-TRUE16-NEXT:    v_mov_b32_e32 v0, 0
-; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    global_load_d16_u8 v0, v0, s[0:1] offset:-2048 glc dlc
-; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    global_store_b8 v[0:1], v0, off
-; GFX11-SDAG-TRUE16-NEXT:    s_endpgm
-;
-; GFX11-SDAG-FAKE16-LABEL: global_inst_salu_offset_neg_11bit_max:
-; GFX11-SDAG-FAKE16:       ; %bb.0:
-; GFX11-SDAG-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-SDAG-FAKE16-NEXT:    v_mov_b32_e32 v0, 0
-; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT:    global_load_u8 v0, v0, s[0:1] offset:-2048 glc dlc
-; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT:    global_store_b8 v[0:1], v0, off
-; GFX11-SDAG-FAKE16-NEXT:    s_endpgm
-;
 ; GFX12-SDAG-TRUE16-LABEL: global_inst_salu_offset_neg_11bit_max:
 ; GFX12-SDAG-TRUE16:       ; %bb.0:
 ; GFX12-SDAG-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
@@ -3301,15 +2803,15 @@ define amdgpu_kernel void @global_inst_salu_offset_neg_12bit_max(ptr addrspace(1
 ; GFX10-GISEL-NEXT:    global_store_byte v[0:1], v0, off
 ; GFX10-GISEL-NEXT:    s_endpgm
 ;
-; GFX11-GISEL-LABEL: global_inst_salu_offset_neg_12bit_max:
-; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-GISEL-NEXT:    v_mov_b32_e32 v0, 0
-; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT:    global_load_u8 v0, v0, s[0:1] offset:-4096 glc dlc
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-GISEL-NEXT:    global_store_b8 v[0:1], v0, off
-; GFX11-GISEL-NEXT:    s_endpgm
+; GFX11-LABEL: global_inst_salu_offset_neg_12bit_max:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    global_load_u8 v0, v0, s[0:1] offset:-4096 glc dlc
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    global_store_b8 v[0:1], v0, off
+; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-GISEL-LABEL: global_inst_salu_offset_neg_12bit_max:
 ; GFX12-GISEL:       ; %bb.0:
@@ -3332,26 +2834,6 @@ define amdgpu_kernel void @global_inst_salu_offset_neg_12bit_max(ptr addrspace(1
 ; GFX10-SDAG-NEXT:    global_store_byte v[0:1], v0, off
 ; GFX10-SDAG-NEXT:    s_endpgm
 ;
-; GFX11-SDAG-TRUE16-LABEL: global_inst_salu_offset_neg_12bit_max:
-; GFX11-SDAG-TRUE16:       ; %bb.0:
-; GFX11-SDAG-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-SDAG-TRUE16-NEXT:    v_mov_b32_e32 v0, 0
-; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    global_load_d16_u8 v0, v0, s[0:1] offset:-4096 glc dlc
-; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    global_store_b8 v[0:1], v0, off
-; GFX11-SDAG-TRUE16-NEXT:    s_endpgm
-;
-; GFX11-SDAG-FAKE16-LABEL: global_inst_salu_offset_neg_12bit_max:
-; GFX11-SDAG-FAKE16:       ; %bb.0:
-; GFX11-SDAG-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-SDAG-FAKE16-NEXT:    v_mov_b32_e32 v0, 0
-; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT:    global_load_u8 v0, v0, s[0:1] offset:-4096 glc dlc
-; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT:    global_store_b8 v[0:1], v0, off
-; GFX11-SDAG-FAKE16-NEXT:    s_endpgm
-;
 ; GFX12-SDAG-TRUE16-LABEL: global_inst_salu_offset_neg_12bit_max:
 ; GFX12-SDAG-TRUE16:       ; %bb.0:
 ; GFX12-SDAG-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
@@ -3437,29 +2919,17 @@ define amdgpu_kernel void @global_inst_salu_offset_neg_13bit_max(ptr addrspace(1
 ; GFX10-SDAG-NEXT:    global_store_byte v[0:1], v0, off
 ; GFX10-SDAG-NEXT:    s_endpgm
 ;
-; GFX11-SDAG-TRUE16-LABEL: global_inst_salu_offset_neg_13bit_max:
-; GFX11-SDAG-TRUE16:       ; %bb.0:
-; GFX11-SDAG-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    v_add_co_u32 v0, s0, 0xffffe000, s0
-; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT:    v_add_co_ci_u32_e64 v1, null, -1, s1, s0
-; GFX11-SDAG-TRUE16-NEXT:    global_load_d16_u8 v0, v[0:1], off glc dlc
-; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    global_store_b8 v[0:1], v0, off
-; GFX11-SDAG-TRUE16-NEXT:    s_endpgm
-;
-; GFX11-SDAG-FAKE16-LABEL: global_inst_salu_offset_neg_13bit_max:
-; GFX11-SDAG-FAKE16:       ; %bb.0:
-; GFX11-SDAG-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT:    v_add_co_u32 v0, s0, 0xffffe000, s0
-; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-FAKE16-NEXT:    v_add_co_ci_u32_e64 v1, null, -1, s1, s0
-; GFX11-SDAG-FAKE16-NEXT:    global_load_u8 v0, v[0:1], off glc dlc
-; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT:    global_store_b8 v[0:1], v0, off
-; GFX11-SDAG-FAKE16-NEXT:    s_endpgm
+; GFX11-SDAG-LABEL: global_inst_salu_offset_neg_13bit_max:
+; GFX11-SDAG:       ; %bb.0:
+; GFX11-SDAG-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT:    v_add_co_u32 v0, s0, 0xffffe000, s0
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_add_co_ci_u32_e64 v1, null, -1, s1, s0
+; GFX11-SDAG-NEXT:    global_load_u8 v0, v[0:1], off glc dlc
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-SDAG-NEXT:    global_store_b8 v[0:1], v0, off
+; GFX11-SDAG-NEXT:    s_endpgm
 ;
 ; GFX12-SDAG-TRUE16-LABEL: global_inst_salu_offset_neg_13bit_max:
 ; GFX12-SDAG-TRUE16:       ; %bb.0:
@@ -3507,15 +2977,15 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_11bit_max(ptr addrspace(1)
 ; GFX10-NEXT:    global_store_byte v[0:1], v0, off
 ; GFX10-NEXT:    s_endpgm
 ;
-; GFX11-GISEL-LABEL: global_inst_salu_offset_2x_11bit_max:
-; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-GISEL-NEXT:    v_mov_b32_e32 v0, 0
-; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT:    global_load_u8 v0, v0, s[0:1] offset:4095 glc dlc
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-GISEL-NEXT:    global_store_b8 v[0:1], v0, off
-; GFX11-GISEL-NEXT:    s_endpgm
+; GFX11-LABEL: global_inst_salu_offset_2x_11bit_max:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    global_load_u8 v0, v0, s[0:1] offset:4095 glc dlc
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    global_store_b8 v[0:1], v0, off
+; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-GISEL-LABEL: global_inst_salu_offset_2x_11bit_max:
 ; GFX12-GISEL:       ; %bb.0:
@@ -3527,26 +2997,6 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_11bit_max(ptr addrspace(1)
 ; GFX12-GISEL-NEXT:    global_store_b8 v[0:1], v0, off
 ; GFX12-GISEL-NEXT:    s_endpgm
 ;
-; GFX11-SDAG-TRUE16-LABEL: global_inst_salu_offset_2x_11bit_max:
-; GFX11-SDAG-TRUE16:       ; %bb.0:
-; GFX11-SDAG-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-SDAG-TRUE16-NEXT:    v_mov_b32_e32 v0, 0
-; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    global_load_d16_u8 v0, v0, s[0:1] offset:4095 glc dlc
-; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    global_store_b8 v[0:1], v0, off
-; GFX11-SDAG-TRUE16-NEXT:    s_endpgm
-;
-; GFX11-SDAG-FAKE16-LABEL: global_inst_salu_offset_2x_11bit_max:
-; GFX11-SDAG-FAKE16:       ; %bb.0:
-; GFX11-SDAG-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-SDAG-FAKE16-NEXT:    v_mov_b32_e32 v0, 0
-; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT:    global_load_u8 v0, v0, s[0:1] offset:4095 glc dlc
-; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT:    global_store_b8 v[0:1], v0, off
-; GFX11-SDAG-FAKE16-NEXT:    s_endpgm
-;
 ; GFX12-SDAG-TRUE16-LABEL: global_inst_salu_offset_2x_11bit_max:
 ; GFX12-SDAG-TRUE16:       ; %bb.0:
 ; GFX12-SDAG-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
@@ -3593,15 +3043,15 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_12bit_max(ptr addrspace(1)
 ; GFX10-NEXT:    global_store_byte v[0:1], v0, off
 ; GFX10-NEXT:    s_endpgm
 ;
-; GFX11-GISEL-LABEL: global_inst_salu_offset_2x_12bit_max:
-; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-GISEL-NEXT:    v_mov_b32_e32 v0, 0x1000
-; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT:    global_load_u8 v0, v0, s[0:1] offset:4095 glc dlc
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-GISEL-NEXT:    global_store_b8 v[0:1], v0, off
-; GFX11-GISEL-NEXT:    s_endpgm
+; GFX11-LABEL: global_inst_salu_offset_2x_12bit_max:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-NEXT:    v_mov_b32_e32 v0, 0x1000
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    global_load_u8 v0, v0, s[0:1] offset:4095 glc dlc
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    global_store_b8 v[0:1], v0, off
+; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-GISEL-LABEL: global_inst_salu_offset_2x_12bit_max:
 ; GFX12-GISEL:       ; %bb.0:
@@ -3613,26 +3063,6 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_12bit_max(ptr addrspace(1)
 ; GFX12-GISEL-NEXT:    global_store_b8 v[0:1], v0, off
 ; GFX12-GISEL-NEXT:    s_endpgm
 ;
-; GFX11-SDAG-TRUE16-LABEL: global_inst_salu_offset_2x_12bit_max:
-; GFX11-SDAG-TRUE16:       ; %bb.0:
-; GFX11-SDAG-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-SDAG-TRUE16-NEXT:    v_mov_b32_e32 v0, 0x1000
-; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    global_load_d16_u8 v0, v0, s[0:1] offset:4095 glc dlc
-; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    global_store_b8 v[0:1], v0, off
-; GFX11-SDAG-TRUE16-NEXT:    s_endpgm
-;
-; GFX11-SDAG-FAKE16-LABEL: global_inst_salu_offset_2x_12bit_max:
-; GFX11-SDAG-FAKE16:       ; %bb.0:
-; GFX11-SDAG-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-SDAG-FAKE16-NEXT:    v_mov_b32_e32 v0, 0x1000
-; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT:    global_load_u8 v0, v0, s[0:1] offset:4095 glc dlc
-; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT:    global_store_b8 v[0:1], v0, off
-; GFX11-SDAG-FAKE16-NEXT:    s_endpgm
-;
 ; GFX12-SDAG-TRUE16-LABEL: global_inst_salu_offset_2x_12bit_max:
 ; GFX12-SDAG-TRUE16:       ; %bb.0:
 ; GFX12-SDAG-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
@@ -3679,15 +3109,15 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_13bit_max(ptr addrspace(1)
 ; GFX10-NEXT:    global_store_byte v[0:1], v0, off
 ; GFX10-NEXT:    s_endpgm
 ;
-; GFX11-GISEL-LABEL: global_inst_salu_offset_2x_13bit_max:
-; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-GISEL-NEXT:    v_mov_b32_e32 v0, 0x3000
-; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT:    global_load_u8 v0, v0, s[0:1] offset:4095 glc dlc
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-GISEL-NEXT:    global_store_b8 v[0:1], v0, off
-; GFX11-GISEL-NEXT:    s_endpgm
+; GFX11-LABEL: global_inst_salu_offset_2x_13bit_max:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-NEXT:    v_mov_b32_e32 v0, 0x3000
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    global_load_u8 v0, v0, s[0:1] offset:4095 glc dlc
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    global_store_b8 v[0:1], v0, off
+; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-GISEL-LABEL: global_inst_salu_offset_2x_13bit_max:
 ; GFX12-GISEL:       ; %bb.0:
@@ -3699,26 +3129,6 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_13bit_max(ptr addrspace(1)
 ; GFX12-GISEL-NEXT:    global_store_b8 v[0:1], v0, off
 ; GFX12-GISEL-NEXT:    s_endpgm
 ;
-; GFX11-SDAG-TRUE16-LABEL: global_inst_salu_offset_2x_13bit_max:
-; GFX11-SDAG-TRUE16:       ; %bb.0:
-; GFX11-SDAG-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-SDAG-TRUE16-NEXT:    v_mov_b32_e32 v0, 0x3000
-; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    global_load_d16_u8 v0, v0, s[0:1] offset:4095 glc dlc
-; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    global_store_b8 v[0:1], v0, off
-; GFX11-SDAG-TRUE16-NEXT:    s_endpgm
-;
-; GFX11-SDAG-FAKE16-LABEL: global_inst_salu_offset_2x_13bit_max:
-; GFX11-SDAG-FAKE16:       ; %bb.0:
-; GFX11-SDAG-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-SDAG-FAKE16-NEXT:    v_mov_b32_e32 v0, 0x3000
-; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT:    global_load_u8 v0, v0, s[0:1] offset:4095 glc dlc
-; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT:    global_store_b8 v[0:1], v0, off
-; GFX11-SDAG-FAKE16-NEXT:    s_endpgm
-;
 ; GFX12-SDAG-TRUE16-LABEL: global_inst_salu_offset_2x_13bit_max:
 ; GFX12-SDAG-TRUE16:       ; %bb.0:
 ; GFX12-SDAG-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
@@ -3768,15 +3178,15 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_neg_11bit_max(ptr addrspac
 ; GFX10-GISEL-NEXT:    global_store_byte v[0:1], v0, off
 ; GFX10-GISEL-NEXT:    s_endpgm
 ;
-; GFX11-GISEL-LABEL: global_inst_salu_offset_2x_neg_11bit_max:
-; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-GISEL-NEXT:    v_mov_b32_e32 v0, 0
-; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT:    global_load_u8 v0, v0, s[0:1] offset:-4096 glc dlc
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-GISEL-NEXT:    global_store_b8 v[0:1], v0, off
-; GFX11-GISEL-NEXT:    s_endpgm
+; GFX11-LABEL: global_inst_salu_offset_2x_neg_11bit_max:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    global_load_u8 v0, v0, s[0:1] offset:-4096 glc dlc
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    global_store_b8 v[0:1], v0, off
+; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-GISEL-LABEL: global_inst_salu_offset_2x_neg_11bit_max:
 ; GFX12-GISEL:       ; %bb.0:
@@ -3799,26 +3209,6 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_neg_11bit_max(ptr addrspac
 ; GFX10-SDAG-NEXT:    global_store_byte v[0:1], v0, off
 ; GFX10-SDAG-NEXT:    s_endpgm
 ;
-; GFX11-SDAG-TRUE16-LABEL: global_inst_salu_offset_2x_neg_11bit_max:
-; GFX11-SDAG-TRUE16:       ; %bb.0:
-; GFX11-SDAG-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-SDAG-TRUE16-NEXT:    v_mov_b32_e32 v0, 0
-; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    global_load_d16_u8 v0, v0, s[0:1] offset:-4096 glc dlc
-; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    global_store_b8 v[0:1], v0, off
-; GFX11-SDAG-TRUE16-NEXT:    s_endpgm
-;
-; GFX11-SDAG-FAKE16-LABEL: global_inst_salu_offset_2x_neg_11bit_max:
-; GFX11-SDAG-FAKE16:       ; %bb.0:
-; GFX11-SDAG-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-SDAG-FAKE16-NEXT:    v_mov_b32_e32 v0, 0
-; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT:    global_load_u8 v0, v0, s[0:1] offset:-4096 glc dlc
-; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT:    global_store_b8 v[0:1], v0, off
-; GFX11-SDAG-FAKE16-NEXT:    s_endpgm
-;
 ; GFX12-SDAG-TRUE16-LABEL: global_inst_salu_offset_2x_neg_11bit_max:
 ; GFX12-SDAG-TRUE16:       ; %bb.0:
 ; GFX12-SDAG-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
@@ -3904,29 +3294,17 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_neg_12bit_max(ptr addrspac
 ; GFX10-SDAG-NEXT:    global_store_byte v[0:1], v0, off
 ; GFX10-SDAG-NEXT:    s_endpgm
 ;
-; GFX11-SDAG-TRUE16-LABEL: global_inst_salu_offset_2x_neg_12bit_max:
-; GFX11-SDAG-TRUE16:       ; %bb.0:
-; GFX11-SDAG-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    v_add_co_u32 v0, s0, 0xffffe000, s0
-; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT:    v_add_co_ci_u32_e64 v1, null, -1, s1, s0
-; GFX11-SDAG-TRUE16-NEXT:    global_load_d16_u8 v0, v[0:1], off glc dlc
-; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    global_store_b8 v[0:1], v0, off
-; GFX11-SDAG-TRUE16-NEXT:    s_endpgm
-;
-; GFX11-SDAG-FAKE16-LABEL: global_inst_salu_offset_2x_neg_12bit_max:
-; GFX11-SDAG-FAKE16:       ; %bb.0:
-; GFX11-SDAG-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT:    v_add_co_u32 v0, s0, 0xffffe000, s0
-; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-FAKE16-NEXT:    v_add_co_ci_u32_e64 v1, null, -1, s1, s0
-; GFX11-SDAG-FAKE16-NEXT:    global_load_u8 v0, v[0:1], off glc dlc
-; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT:    global_store_b8 v[0:1], v0, off
-; GFX11-SDAG-FAKE16-NEXT:    s_endpgm
+; GFX11-SDAG-LABEL: global_inst_salu_offset_2x_neg_12bit_max:
+; GFX11-SDAG:       ; %bb.0:
+; GFX11-SDAG-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT:    v_add_co_u32 v0, s0, 0xffffe000, s0
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_add_co_ci_u32_e64 v1, null, -1, s1, s0
+; GFX11-SDAG-NEXT:    global_load_u8 v0, v[0:1], off glc dlc
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-SDAG-NEXT:    global_store_b8 v[0:1], v0, off
+; GFX11-SDAG-NEXT:    s_endpgm
 ;
 ; GFX12-SDAG-TRUE16-LABEL: global_inst_salu_offset_2x_neg_12bit_max:
 ; GFX12-SDAG-TRUE16:       ; %bb.0:
@@ -4013,29 +3391,17 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_neg_13bit_max(ptr addrspac
 ; GFX10-SDAG-NEXT:    global_store_byte v[0:1], v0, off
 ; GFX10-SDAG-NEXT:    s_endpgm
 ;
-; GFX11-SDAG-TRUE16-LABEL: global_inst_salu_offset_2x_neg_13bit_max:
-; GFX11-SDAG-TRUE16:       ; %bb.0:
-; GFX11-SDAG-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    v_add_co_u32 v0, s0, 0xffffc000, s0
-; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT:    v_add_co_ci_u32_e64 v1, null, -1, s1, s0
-; GFX11-SDAG-TRUE16-NEXT:    global_load_d16_u8 v0, v[0:1], off glc dlc
-; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    global_store_b8 v[0:1], v0, off
-; GFX11-SDAG-TRUE16-NEXT:    s_endpgm
-;
-; GFX11-SDAG-FAKE16-LABEL: global_inst_salu_offset_2x_neg_13bit_max:
-; GFX11-SDAG-FAKE16:       ; %bb.0:
-; GFX11-SDAG-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT:    v_add_co_u32 v0, s0, 0xffffc000, s0
-; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-FAKE16-NEXT:    v_add_co_ci_u32_e64 v1, null, -1, s1, s0
-; GFX11-SDAG-FAKE16-NEXT:    global_load_u8 v0, v[0:1], off glc dlc
-; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT:    global_store_b8 v[0:1], v0, off
-; GFX11-SDAG-FAKE16-NEXT:    s_endpgm
+; GFX11-SDAG-LABEL: global_inst_salu_offset_2x_neg_13bit_max:
+; GFX11-SDAG:       ; %bb.0:
+; GFX11-SDAG-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT:    v_add_co_u32 v0, s0, 0xffffc000, s0
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_add_co_ci_u32_e64 v1, null, -1, s1, s0
+; GFX11-SDAG-NEXT:    global_load_u8 v0, v[0:1], off glc dlc
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-SDAG-NEXT:    global_store_b8 v[0:1], v0, off
+; GFX11-SDAG-NEXT:    s_endpgm
 ;
 ; GFX12-SDAG-TRUE16-LABEL: global_inst_salu_offset_2x_neg_13bit_max:
 ; GFX12-SDAG-TRUE16:       ; %bb.0:
@@ -4126,29 +3492,17 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_split0(ptr addrsp
 ; GFX10-SDAG-NEXT:    global_store_byte v[0:1], v0, off
 ; GFX10-SDAG-NEXT:    s_endpgm
 ;
-; GFX11-SDAG-TRUE16-LABEL: global_inst_salu_offset_64bit_11bit_split0:
-; GFX11-SDAG-TRUE16:       ; %bb.0:
-; GFX11-SDAG-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    v_add_co_u32 v0, s0, 0, s0
-; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, s1, s0
-; GFX11-SDAG-TRUE16-NEXT:    global_load_d16_u8 v0, v[0:1], off offset:2047 glc dlc
-; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    global_store_b8 v[0:1], v0, off
-; GFX11-SDAG-TRUE16-NEXT:    s_endpgm
-;
-; GFX11-SDAG-FAKE16-LABEL: global_inst_salu_offset_64bit_11bit_split0:
-; GFX11-SDAG-FAKE16:       ; %bb.0:
-; GFX11-SDAG-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT:    v_add_co_u32 v0, s0, 0, s0
-; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-FAKE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, s1, s0
-; GFX11-SDAG-FAKE16-NEXT:    global_load_u8 v0, v[0:1], off offset:2047 glc dlc
-; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT:    global_store_b8 v[0:1], v0, off
-; GFX11-SDAG-FAKE16-NEXT:    s_endpgm
+; GFX11-SDAG-LABEL: global_inst_salu_offset_64bit_11bit_split0:
+; GFX11-SDAG:       ; %bb.0:
+; GFX11-SDAG-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT:    v_add_co_u32 v0, s0, 0, s0
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, s1, s0
+; GFX11-SDAG-NEXT:    global_load_u8 v0, v[0:1], off offset:2047 glc dlc
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-SDAG-NEXT:    global_store_b8 v[0:1], v0, off
+; GFX11-SDAG-NEXT:    s_endpgm
 ;
 ; GFX12-SDAG-TRUE16-LABEL: global_inst_salu_offset_64bit_11bit_split0:
 ; GFX12-SDAG-TRUE16:       ; %bb.0:
@@ -4243,29 +3597,17 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_split1(ptr addrsp
 ; GFX10-SDAG-NEXT:    global_store_byte v[0:1], v0, off
 ; GFX10-SDAG-NEXT:    s_endpgm
 ;
-; GFX11-SDAG-TRUE16-LABEL: global_inst_salu_offset_64bit_11bit_split1:
-; GFX11-SDAG-TRUE16:       ; %bb.0:
-; GFX11-SDAG-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    v_add_co_u32 v0, s0, 0, s0
-; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, s1, s0
-; GFX11-SDAG-TRUE16-NEXT:    global_load_d16_u8 v0, v[0:1], off offset:2048 glc dlc
-; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    global_store_b8 v[0:1], v0, off
-; GFX11-SDAG-TRUE16-NEXT:    s_endpgm
-;
-; GFX11-SDAG-FAKE16-LABEL: global_inst_salu_offset_64bit_11bit_split1:
-; GFX11-SDAG-FAKE16:       ; %bb.0:
-; GFX11-SDAG-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT:    v_add_co_u32 v0, s0, 0, s0
-; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-FAKE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, s1, s0
-; GFX11-SDAG-FAKE16-NEXT:    global_load_u8 v0, v[0:1], off offset:2048 glc dlc
-; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT:    global_store_b8 v[0:1], v0, off
-; GFX11-SDAG-FAKE16-NEXT:    s_endpgm
+; GFX11-SDAG-LABEL: global_inst_salu_offset_64bit_11bit_split1:
+; GFX11-SDAG:       ; %bb.0:
+; GFX11-SDAG-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT:    v_add_co_u32 v0, s0, 0, s0
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, s1, s0
+; GFX11-SDAG-NEXT:    global_load_u8 v0, v[0:1], off offset:2048 glc dlc
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-SDAG-NEXT:    global_store_b8 v[0:1], v0, off
+; GFX11-SDAG-NEXT:    s_endpgm
 ;
 ; GFX12-SDAG-TRUE16-LABEL: global_inst_salu_offset_64bit_11bit_split1:
 ; GFX12-SDAG-TRUE16:       ; %bb.0:
@@ -4360,29 +3702,17 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_split0(ptr addrsp
 ; GFX10-SDAG-NEXT:    global_store_byte v[0:1], v0, off
 ; GFX10-SDAG-NEXT:    s_endpgm
 ;
-; GFX11-SDAG-TRUE16-LABEL: global_inst_salu_offset_64bit_12bit_split0:
-; GFX11-SDAG-TRUE16:       ; %bb.0:
-; GFX11-SDAG-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    v_add_co_u32 v0, s0, 0, s0
-; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, s1, s0
-; GFX11-SDAG-TRUE16-NEXT:    global_load_d16_u8 v0, v[0:1], off offset:4095 glc dlc
-; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    global_store_b8 v[0:1], v0, off
-; GFX11-SDAG-TRUE16-NEXT:    s_endpgm
-;
-; GFX11-SDAG-FAKE16-LABEL: global_inst_salu_offset_64bit_12bit_split0:
-; GFX11-SDAG-FAKE16:       ; %bb.0:
-; GFX11-SDAG-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT:    v_add_co_u32 v0, s0, 0, s0
-; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-FAKE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, s1, s0
-; GFX11-SDAG-FAKE16-NEXT:    global_load_u8 v0, v[0:1], off offset:4095 glc dlc
-; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT:    global_store_b8 v[0:1], v0, off
-; GFX11-SDAG-FAKE16-NEXT:    s_endpgm
+; GFX11-SDAG-LABEL: global_inst_salu_offset_64bit_12bit_split0:
+; GFX11-SDAG:       ; %bb.0:
+; GFX11-SDAG-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT:    v_add_co_u32 v0, s0, 0, s0
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, s1, s0
+; GFX11-SDAG-NEXT:    global_load_u8 v0, v[0:1], off offset:4095 glc dlc
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-SDAG-NEXT:    global_store_b8 v[0:1], v0, off
+; GFX11-SDAG-NEXT:    s_endpgm
 ;
 ; GFX12-SDAG-TRUE16-LABEL: global_inst_salu_offset_64bit_12bit_split0:
 ; GFX12-SDAG-TRUE16:       ; %bb.0:
@@ -4477,29 +3807,17 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_split1(ptr addrsp
 ; GFX10-SDAG-NEXT:    global_store_byte v[0:1], v0, off
 ; GFX10-SDAG-NEXT:    s_endpgm
 ;
-; GFX11-SDAG-TRUE16-LABEL: global_inst_salu_offset_64bit_12bit_split1:
-; GFX11-SDAG-TRUE16:       ; %bb.0:
-; GFX11-SDAG-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    v_add_co_u32 v0, s0, 0x1000, s0
-; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, s1, s0
-; GFX11-SDAG-TRUE16-NEXT:    global_load_d16_u8 v0, v[0:1], off glc dlc
-; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    global_store_b8 v[0:1], v0, off
-; GFX11-SDAG-TRUE16-NEXT:    s_endpgm
-;
-; GFX11-SDAG-FAKE16-LABEL: global_inst_salu_offset_64bit_12bit_split1:
-; GFX11-SDAG-FAKE16:       ; %bb.0:
-; GFX11-SDAG-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT:    v_add_co_u32 v0, s0, 0x1000, s0
-; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-FAKE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, s1, s0
-; GFX11-SDAG-FAKE16-NEXT:    global_load_u8 v0, v[0:1], off glc dlc
-; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT:    global_store_b8 v[0:1], v0, off
-; GFX11-SDAG-FAKE16-NEXT:    s_endpgm
+; GFX11-SDAG-LABEL: global_inst_salu_offset_64bit_12bit_split1:
+; GFX11-SDAG:       ; %bb.0:
+; GFX11-SDAG-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT:    v_add_co_u32 v0, s0, 0x1000, s0
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, s1, s0
+; GFX11-SDAG-NEXT:    global_load_u8 v0, v[0:1], off glc dlc
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-SDAG-NEXT:    global_store_b8 v[0:1], v0, off
+; GFX11-SDAG-NEXT:    s_endpgm
 ;
 ; GFX12-SDAG-TRUE16-LABEL: global_inst_salu_offset_64bit_12bit_split1:
 ; GFX12-SDAG-TRUE16:       ; %bb.0:
@@ -4594,29 +3912,17 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_split0(ptr addrsp
 ; GFX10-SDAG-NEXT:    global_store_byte v[0:1], v0, off
 ; GFX10-SDAG-NEXT:    s_endpgm
 ;
-; GFX11-SDAG-TRUE16-LABEL: global_inst_salu_offset_64bit_13bit_split0:
-; GFX11-SDAG-TRUE16:       ; %bb.0:
-; GFX11-SDAG-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    v_add_co_u32 v0, s0, 0x1000, s0
-; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, s1, s0
-; GFX11-SDAG-TRUE16-NEXT:    global_load_d16_u8 v0, v[0:1], off offset:4095 glc dlc
-; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    global_store_b8 v[0:1], v0, off
-; GFX11-SDAG-TRUE16-NEXT:    s_endpgm
-;
-; GFX11-SDAG-FAKE16-LABEL: global_inst_salu_offset_64bit_13bit_split0:
-; GFX11-SDAG-FAKE16:       ; %bb.0:
-; GFX11-SDAG-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT:    v_add_co_u32 v0, s0, 0x1000, s0
-; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-FAKE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, s1, s0
-; GFX11-SDAG-FAKE16-NEXT:    global_load_u8 v0, v[0:1], off offset:4095 glc dlc
-; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT:    global_store_b8 v[0:1], v0, off
-; GFX11-SDAG-FAKE16-NEXT:    s_endpgm
+; GFX11-SDAG-LABEL: global_inst_salu_offset_64bit_13bit_split0:
+; GFX11-SDAG:       ; %bb.0:
+; GFX11-SDAG-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT:    v_add_co_u32 v0, s0, 0x1000, s0
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, s1, s0
+; GFX11-SDAG-NEXT:    global_load_u8 v0, v[0:1], off offset:4095 glc dlc
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-SDAG-NEXT:    global_store_b8 v[0:1], v0, off
+; GFX11-SDAG-NEXT:    s_endpgm
 ;
 ; GFX12-SDAG-TRUE16-LABEL: global_inst_salu_offset_64bit_13bit_split0:
 ; GFX12-SDAG-TRUE16:       ; %bb.0:
@@ -4711,29 +4017,17 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_split1(ptr addrsp
 ; GFX10-SDAG-NEXT:    global_store_byte v[0:1], v0, off
 ; GFX10-SDAG-NEXT:    s_endpgm
 ;
-; GFX11-SDAG-TRUE16-LABEL: global_inst_salu_offset_64bit_13bit_split1:
-; GFX11-SDAG-TRUE16:       ; %bb.0:
-; GFX11-SDAG-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    v_add_co_u32 v0, s0, 0x2000, s0
-; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, s1, s0
-; GFX11-SDAG-TRUE16-NEXT:    global_load_d16_u8 v0, v[0:1], off glc dlc
-; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    global_store_b8 v[0:1], v0, off
-; GFX11-SDAG-TRUE16-NEXT:    s_endpgm
-;
-; GFX11-SDAG-FAKE16-LABEL: global_inst_salu_offset_64bit_13bit_split1:
-; GFX11-SDAG-FAKE16:       ; %bb.0:
-; GFX11-SDAG-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT:    v_add_co_u32 v0, s0, 0x2000, s0
-; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-FAKE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, s1, s0
-; GFX11-SDAG-FAKE16-NEXT:    global_load_u8 v0, v[0:1], off glc dlc
-; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT:    global_store_b8 v[0:1], v0, off
-; GFX11-SDAG-FAKE16-NEXT:    s_endpgm
+; GFX11-SDAG-LABEL: global_inst_salu_offset_64bit_13bit_split1:
+; GFX11-SDAG:       ; %bb.0:
+; GFX11-SDAG-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT:    v_add_co_u32 v0, s0, 0x2000, s0
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, s1, s0
+; GFX11-SDAG-NEXT:    global_load_u8 v0, v[0:1], off glc dlc
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-SDAG-NEXT:    global_store_b8 v[0:1], v0, off
+; GFX11-SDAG-NEXT:    s_endpgm
 ;
 ; GFX12-SDAG-TRUE16-LABEL: global_inst_salu_offset_64bit_13bit_split1:
 ; GFX12-SDAG-TRUE16:       ; %bb.0:
@@ -4790,17 +4084,17 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_neg_high_split0(p
 ; GFX10-NEXT:    global_store_byte v[0:1], v0, off
 ; GFX10-NEXT:    s_endpgm
 ;
-; GFX11-GISEL-LABEL: global_inst_salu_offset_64bit_11bit_neg_high_split0:
-; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-GISEL-NEXT:    v_mov_b32_e32 v0, 0
-; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT:    s_add_u32 s0, s0, 0x7ff
-; GFX11-GISEL-NEXT:    s_addc_u32 s1, s1, 0x80000000
-; GFX11-GISEL-NEXT:    global_load_u8 v0, v0, s[0:1] glc dlc
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-GISEL-NEXT:    global_store_b8 v[0:1], v0, off
-; GFX11-GISEL-NEXT:    s_endpgm
+; GFX11-LABEL: global_inst_salu_offset_64bit_11bit_neg_high_split0:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    s_add_u32 s0, s0, 0x7ff
+; GFX11-NEXT:    s_addc_u32 s1, s1, 0x80000000
+; GFX11-NEXT:    global_load_u8 v0, v0, s[0:1] glc dlc
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    global_store_b8 v[0:1], v0, off
+; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-GISEL-LABEL: global_inst_salu_offset_64bit_11bit_neg_high_split0:
 ; GFX12-GISEL:       ; %bb.0:
@@ -4814,30 +4108,6 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_neg_high_split0(p
 ; GFX12-GISEL-NEXT:    global_store_b8 v[0:1], v0, off
 ; GFX12-GISEL-NEXT:    s_endpgm
 ;
-; GFX11-SDAG-TRUE16-LABEL: global_inst_salu_offset_64bit_11bit_neg_high_split0:
-; GFX11-SDAG-TRUE16:       ; %bb.0:
-; GFX11-SDAG-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-SDAG-TRUE16-NEXT:    v_mov_b32_e32 v0, 0
-; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    s_add_u32 s0, s0, 0x7ff
-; GFX11-SDAG-TRUE16-NEXT:    s_addc_u32 s1, s1, 0x80000000
-; GFX11-SDAG-TRUE16-NEXT:    global_load_d16_u8 v0, v0, s[0:1] glc dlc
-; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    global_store_b8 v[0:1], v0, off
-; GFX11-SDAG-TRUE16-NEXT:    s_endpgm
-;
-; GFX11-SDAG-FAKE16-LABEL: global_inst_salu_offset_64bit_11bit_neg_high_split0:
-; GFX11-SDAG-FAKE16:       ; %bb.0:
-; GFX11-SDAG-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-SDAG-FAKE16-NEXT:    v_mov_b32_e32 v0, 0
-; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT:    s_add_u32 s0, s0, 0x7ff
-; GFX11-SDAG-FAKE16-NEXT:    s_addc_u32 s1, s1, 0x80000000
-; GFX11-SDAG-FAKE16-NEXT:    global_load_u8 v0, v0, s[0:1] glc dlc
-; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT:    global_store_b8 v[0:1], v0, off
-; GFX11-SDAG-FAKE16-NEXT:    s_endpgm
-;
 ; GFX12-SDAG-TRUE16-LABEL: global_inst_salu_offset_64bit_11bit_neg_high_split0:
 ; GFX12-SDAG-TRUE16:       ; %bb.0:
 ; GFX12-SDAG-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
@@ -4895,17 +4165,17 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_neg_high_split1(p
 ; GFX10-NEXT:    global_store_byte v[0:1], v0, off
 ; GFX10-NEXT:    s_endpgm
 ;
-; GFX11-GISEL-LABEL: global_inst_salu_offset_64bit_11bit_neg_high_split1:
-; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-GISEL-NEXT:    v_mov_b32_e32 v0, 0
-; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT:    s_add_u32 s0, s0, 0x800
-; GFX11-GISEL-NEXT:    s_addc_u32 s1, s1, 0x80000000
-; GFX11-GISEL-NEXT:    global_load_u8 v0, v0, s[0:1] glc dlc
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-GISEL-NEXT:    global_store_b8 v[0:1], v0, off
-; GFX11-GISEL-NEXT:    s_endpgm
+; GFX11-LABEL: global_inst_salu_offset_64bit_11bit_neg_high_split1:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    s_add_u32 s0, s0, 0x800
+; GFX11-NEXT:    s_addc_u32 s1, s1, 0x80000000
+; GFX11-NEXT:    global_load_u8 v0, v0, s[0:1] glc dlc
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    global_store_b8 v[0:1], v0, off
+; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-GISEL-LABEL: global_inst_salu_offset_64bit_11bit_neg_high_split1:
 ; GFX12-GISEL:       ; %bb.0:
@@ -4919,30 +4189,6 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_neg_high_split1(p
 ; GFX12-GISEL-NEXT:    global_store_b8 v[0:1], v0, off
 ; GFX12-GISEL-NEXT:    s_endpgm
 ;
-; GFX11-SDAG-TRUE16-LABEL: global_inst_salu_offset_64bit_11bit_neg_high_split1:
-; GFX11-SDAG-TRUE16:       ; %bb.0:
-; GFX11-SDAG-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-SDAG-TRUE16-NEXT:    v_mov_b32_e32 v0, 0
-; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    s_add_u32 s0, s0, 0x800
-; GFX11-SDAG-TRUE16-NEXT:    s_addc_u32 s1, s1, 0x80000000
-; GFX11-SDAG-TRUE16-NEXT:    global_load_d16_u8 v0, v0, s[0:1] glc dlc
-; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    global_store_b8 v[0:1], v0, off
-; GFX11-SDAG-TRUE16-NEXT:    s_endpgm
-;
-; GFX11-SDAG-FAKE16-LABEL: global_inst_salu_offset_64bit_11bit_neg_high_split1:
-; GFX11-SDAG-FAKE16:       ; %bb.0:
-; GFX11-SDAG-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-SDAG-FAKE16-NEXT:    v_mov_b32_e32 v0, 0
-; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT:    s_add_u32 s0, s0, 0x800
-; GFX11-SDAG-FAKE16-NEXT:    s_addc_u32 s1, s1, 0x80000000
-; GFX11-SDAG-FAKE16-NEXT:    global_load_u8 v0, v0, s[0:1] glc dlc
-; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT:    global_store_b8 v[0:1], v0, off
-; GFX11-SDAG-FAKE16-NEXT:    s_endpgm
-;
 ; GFX12-SDAG-TRUE16-LABEL: global_inst_salu_offset_64bit_11bit_neg_high_split1:
 ; GFX12-SDAG-TRUE16:       ; %bb.0:
 ; GFX12-SDAG-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
@@ -5000,17 +4246,17 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_neg_high_split0(p
 ; GFX10-NEXT:    global_store_byte v[0:1], v0, off
 ; GFX10-NEXT:    s_endpgm
 ;
-; GFX11-GISEL-LABEL: global_inst_salu_offset_64bit_12bit_neg_high_split0:
-; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-GISEL-NEXT:    v_mov_b32_e32 v0, 0
-; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT:    s_add_u32 s0, s0, 0xfff
-; GFX11-GISEL-NEXT:    s_addc_u32 s1, s1, 0x80000000
-; GFX11-GISEL-NEXT:    global_load_u8 v0, v0, s[0:1] glc dlc
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-GISEL-NEXT:    global_store_b8 v[0:1], v0, off
-; GFX11-GISEL-NEXT:    s_endpgm
+; GFX11-LABEL: global_inst_salu_offset_64bit_12bit_neg_high_split0:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    s_add_u32 s0, s0, 0xfff
+; GFX11-NEXT:    s_addc_u32 s1, s1, 0x80000000
+; GFX11-NEXT:    global_load_u8 v0, v0, s[0:1] glc dlc
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    global_store_b8 v[0:1], v0, off
+; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-GISEL-LABEL: global_inst_salu_offset_64bit_12bit_neg_high_split0:
 ; GFX12-GISEL:       ; %bb.0:
@@ -5024,30 +4270,6 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_neg_high_split0(p
 ; GFX12-GISEL-NEXT:    global_store_b8 v[0:1], v0, off
 ; GFX12-GISEL-NEXT:    s_endpgm
 ;
-; GFX11-SDAG-TRUE16-LABEL: global_inst_salu_offset_64bit_12bit_neg_high_split0:
-; GFX11-SDAG-TRUE16:       ; %bb.0:
-; GFX11-SDAG-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-SDAG-TRUE16-NEXT:    v_mov_b32_e32 v0, 0
-; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    s_add_u32 s0, s0, 0xfff
-; GFX11-SDAG-TRUE16-NEXT:    s_addc_u32 s1, s1, 0x80000000
-; GFX11-SDAG-TRUE16-NEXT:    global_load_d16_u8 v0, v0, s[0:1] glc dlc
-; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    global_store_b8 v[0:1], v0, off
-; GFX11-SDAG-TRUE16-NEXT:    s_endpgm
-;
-; GFX11-SDAG-FAKE16-LABEL: global_inst_salu_offset_64bit_12bit_neg_high_split0:
-; GFX11-SDAG-FAKE16:       ; %bb.0:
-; GFX11-SDAG-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-SDAG-FAKE16-NEXT:    v_mov_b32_e32 v0, 0
-; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT:    s_add_u32 s0, s0, 0xfff
-; GFX11-SDAG-FAKE16-NEXT:    s_addc_u32 s1, s1, 0x80000000
-; GFX11-SDAG-FAKE16-NEXT:    global_load_u8 v0, v0, s[0:1] glc dlc
-; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT:    global_store_b8 v[0:1], v0, off
-; GFX11-SDAG-FAKE16-NEXT:    s_endpgm
-;
 ; GFX12-SDAG-TRUE16-LABEL: global_inst_salu_offset_64bit_12bit_neg_high_split0:
 ; GFX12-SDAG-TRUE16:       ; %bb.0:
 ; GFX12-SDAG-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
@@ -5105,17 +4327,17 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_neg_high_split1(p
 ; GFX10-NEXT:    global_store_byte v[0:1], v0, off
 ; GFX10-NEXT:    s_endpgm
 ;
-; GFX11-GISEL-LABEL: global_inst_salu_offset_64bit_12bit_neg_high_split1:
-; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-GISEL-NEXT:    v_mov_b32_e32 v0, 0
-; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT:    s_add_u32 s0, s0, 0x1000
-; GFX11-GISEL-NEXT:    s_addc_u32 s1, s1, 0x80000000
-; GFX11-GISEL-NEXT:    global_load_u8 v0, v0, s[0:1] glc dlc
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-GISEL-NEXT:    global_store_b8 v[0:1], v0, off
-; GFX11-GISEL-NEXT:    s_endpgm
+; GFX11-LABEL: global_inst_salu_offset_64bit_12bit_neg_high_split1:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    s_add_u32 s0, s0, 0x1000
+; GFX11-NEXT:    s_addc_u32 s1, s1, 0x80000000
+; GFX11-NEXT:    global_load_u8 v0, v0, s[0:1] glc dlc
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    global_store_b8 v[0:1], v0, off
+; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-GISEL-LABEL: global_inst_salu_offset_64bit_12bit_neg_high_split1:
 ; GFX12-GISEL:       ; %bb.0:
@@ -5129,30 +4351,6 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_neg_high_split1(p
 ; GFX12-GISEL-NEXT:    global_store_b8 v[0:1], v0, off
 ; GFX12-GISEL-NEXT:    s_endpgm
 ;
-; GFX11-SDAG-TRUE16-LABEL: global_inst_salu_offset_64bit_12bit_neg_high_split1:
-; GFX11-SDAG-TRUE16:       ; %bb.0:
-; GFX11-SDAG-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-SDAG-TRUE16-NEXT:    v_mov_b32_e32 v0, 0
-; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    s_add_u32 s0, s0, 0x1000
-; GFX11-SDAG-TRUE16-NEXT:    s_addc_u32 s1, s1, 0x80000000
-; GFX11-SDAG-TRUE16-NEXT:    global_load_d16_u8 v0, v0, s[0:1] glc dlc
-; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    global_store_b8 v[0:1], v0, off
-; GFX11-SDAG-TRUE16-NEXT:    s_endpgm
-;
-; GFX11-SDAG-FAKE16-LABEL: global_inst_salu_offset_64bit_12bit_neg_high_split1:
-; GFX11-SDAG-FAKE16:       ; %bb.0:
-; GFX11-SDAG-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-SDAG-FAKE16-NEXT:    v_mov_b32_e32 v0, 0
-; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT:    s_add_u32 s0, s0, 0x1000
-; GFX11-SDAG-FAKE16-NEXT:    s_addc_u32 s1, s1, 0x80000000
-; GFX11-SDAG-FAKE16-NEXT:    global_load_u8 v0, v0, s[0:1] glc dlc
-; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT:    global_store_b8 v[0:1], v0, off
-; GFX11-SDAG-FAKE16-NEXT:    s_endpgm
-;
 ; GFX12-SDAG-TRUE16-LABEL: global_inst_salu_offset_64bit_12bit_neg_high_split1:
 ; GFX12-SDAG-TRUE16:       ; %bb.0:
 ; GFX12-SDAG-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
@@ -5210,17 +4408,17 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_neg_high_split0(p
 ; GFX10-NEXT:    global_store_byte v[0:1], v0, off
 ; GFX10-NEXT:    s_endpgm
 ;
-; GFX11-GISEL-LABEL: global_inst_salu_offset_64bit_13bit_neg_high_split0:
-; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-GISEL-NEXT:    v_mov_b32_e32 v0, 0
-; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT:    s_add_u32 s0, s0, 0x1fff
-; GFX11-GISEL-NEXT:    s_addc_u32 s1, s1, 0x80000000
-; GFX11-GISEL-NEXT:    global_load_u8 v0, v0, s[0:1] glc dlc
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-GISEL-NEXT:    global_store_b8 v[0:1], v0, off
-; GFX11-GISEL-NEXT:    s_endpgm
+; GFX11-LABEL: global_inst_salu_offset_64bit_13bit_neg_high_split0:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    s_add_u32 s0, s0, 0x1fff
+; GFX11-NEXT:    s_addc_u32 s1, s1, 0x80000000
+; GFX11-NEXT:    global_load_u8 v0, v0, s[0:1] glc dlc
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    global_store_b8 v[0:1], v0, off
+; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-GISEL-LABEL: global_inst_salu_offset_64bit_13bit_neg_high_split0:
 ; GFX12-GISEL:       ; %bb.0:
@@ -5234,30 +4432,6 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_neg_high_split0(p
 ; GFX12-GISEL-NEXT:    global_store_b8 v[0:1], v0, off
 ; GFX12-GISEL-NEXT:    s_endpgm
 ;
-; GFX11-SDAG-TRUE16-LABEL: global_inst_salu_offset_64bit_13bit_neg_high_split0:
-; GFX11-SDAG-TRUE16:       ; %bb.0:
-; GFX11-SDAG-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-SDAG-TRUE16-NEXT:    v_mov_b32_e32 v0, 0
-; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    s_add_u32 s0, s0, 0x1fff
-; GFX11-SDAG-TRUE16-NEXT:    s_addc_u32 s1, s1, 0x80000000
-; GFX11-SDAG-TRUE16-NEXT:    global_load_d16_u8 v0, v0, s[0:1] glc dlc
-; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    global_store_b8 v[0:1], v0, off
-; GFX11-SDAG-TRUE16-NEXT:    s_endpgm
-;
-; GFX11-SDAG-FAKE16-LABEL: global_inst_salu_offset_64bit_13bit_neg_high_split0:
-; GFX11-SDAG-FAKE16:       ; %bb.0:
-; GFX11-SDAG-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-SDAG-FAKE16-NEXT:    v_mov_b32_e32 v0, 0
-; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT:    s_add_u32 s0, s0, 0x1fff
-; GFX11-SDAG-FAKE16-NEXT:    s_addc_u32 s1, s1, 0x80000000
-; GFX11-SDAG-FAKE16-NEXT:    global_load_u8 v0, v0, s[0:1] glc dlc
-; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT:    global_store_b8 v[0:1], v0, off
-; GFX11-SDAG-FAKE16-NEXT:    s_endpgm
-;
 ; GFX12-SDAG-TRUE16-LABEL: global_inst_salu_offset_64bit_13bit_neg_high_split0:
 ; GFX12-SDAG-TRUE16:       ; %bb.0:
 ; GFX12-SDAG-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
@@ -5315,17 +4489,17 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_neg_high_split1(p
 ; GFX10-NEXT:    global_store_byte v[0:1], v0, off
 ; GFX10-NEXT:    s_endpgm
 ;
-; GFX11-GISEL-LABEL: global_inst_salu_offset_64bit_13bit_neg_high_split1:
-; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-GISEL-NEXT:    v_mov_b32_e32 v0, 0
-; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT:    s_add_u32 s0, s0, 0x2000
-; GFX11-GISEL-NEXT:    s_addc_u32 s1, s1, 0x80000000
-; GFX11-GISEL-NEXT:    global_load_u8 v0, v0, s[0:1] glc dlc
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-GISEL-NEXT:    global_store_b8 v[0:1], v0, off
-; GFX11-GISEL-NEXT:    s_endpgm
+; GFX11-LABEL: global_inst_salu_offset_64bit_13bit_neg_high_split1:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    s_add_u32 s0, s0, 0x2000
+; GFX11-NEXT:    s_addc_u32 s1, s1, 0x80000000
+; GFX11-NEXT:    global_load_u8 v0, v0, s[0:1] glc dlc
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    global_store_b8 v[0:1], v0, off
+; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-GISEL-LABEL: global_inst_salu_offset_64bit_13bit_neg_high_split1:
 ; GFX12-GISEL:       ; %bb.0:
@@ -5339,30 +4513,6 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_neg_high_split1(p
 ; GFX12-GISEL-NEXT:    global_store_b8 v[0:1], v0, off
 ; GFX12-GISEL-NEXT:    s_endpgm
 ;
-; GFX11-SDAG-TRUE16-LABEL: global_inst_salu_offset_64bit_13bit_neg_high_split1:
-; GFX11-SDAG-TRUE16:       ; %bb.0:
-; GFX11-SDAG-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-SDAG-TRUE16-NEXT:    v_mov_b32_e32 v0, 0
-; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    s_add_u32 s0, s0, 0x2000
-; GFX11-SDAG-TRUE16-NEXT:    s_addc_u32 s1, s1, 0x80000000
-; GFX11-SDAG-TRUE16-NEXT:    global_load_d16_u8 v0, v0, s[0:1] glc dlc
-; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    global_store_b8 v[0:1], v0, off
-; GFX11-SDAG-TRUE16-NEXT:    s_endpgm
-;
-; GFX11-SDAG-FAKE16-LABEL: global_inst_salu_offset_64bit_13bit_neg_high_split1:
-; GFX11-SDAG-FAKE16:       ; %bb.0:
-; GFX11-SDAG-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-SDAG-FAKE16-NEXT:    v_mov_b32_e32 v0, 0
-; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT:    s_add_u32 s0, s0, 0x2000
-; GFX11-SDAG-FAKE16-NEXT:    s_addc_u32 s1, s1, 0x80000000
-; GFX11-SDAG-FAKE16-NEXT:    global_load_u8 v0, v0, s[0:1] glc dlc
-; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT:    global_store_b8 v[0:1], v0, off
-; GFX11-SDAG-FAKE16-NEXT:    s_endpgm
-;
 ; GFX12-SDAG-TRUE16-LABEL: global_inst_salu_offset_64bit_13bit_neg_high_split1:
 ; GFX12-SDAG-TRUE16:       ; %bb.0:
 ; GFX12-SDAG-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
@@ -5394,10 +4544,10 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_neg_high_split1(p
   ret void
 }
 ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; GFX11: {{.*}}
 ; GFX11-GISEL-FAKE16: {{.*}}
 ; GFX11-GISEL-TRUE16: {{.*}}
-; GFX11-SDAG: {{.*}}
+; GFX11-SDAG-FAKE16: {{.*}}
+; GFX11-SDAG-TRUE16: {{.*}}
 ; GFX12: {{.*}}
 ; GFX12-GISEL-FAKE16: {{.*}}
 ; GFX12-GISEL-TRUE16: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll
index b1e05158b6212..1419529644cfd 100644
--- a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll
@@ -2641,7 +2641,7 @@ define amdgpu_kernel void @negativeoffsetnullptr(ptr %buffer) {
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-TRUE16-NEXT:    v_add_co_ci_u32_e64 v1, null, -1, s1, s0
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s0, 0
-; GFX11-TRUE16-NEXT:    flat_load_d16_u8 v0, v[0:1]
+; GFX11-TRUE16-NEXT:    flat_load_u8 v0, v[0:1]
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v0.l
 ; GFX11-TRUE16-NEXT:  .LBB8_1: ; %branch
diff --git a/llvm/test/CodeGen/AMDGPU/rotl.ll b/llvm/test/CodeGen/AMDGPU/rotl.ll
index 25020673bce22..638cd0d0c5181 100644
--- a/llvm/test/CodeGen/AMDGPU/rotl.ll
+++ b/llvm/test/CodeGen/AMDGPU/rotl.ll
@@ -374,15 +374,15 @@ define void @test_rotl_i16(ptr addrspace(1) nocapture readonly %sourceA, ptr add
 ; GFX11-TRUE16-LABEL: test_rotl_i16:
 ; GFX11-TRUE16:       ; %bb.0: ; %entry
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT:    global_load_d16_b16 v2, v[2:3], off offset:48
-; GFX11-TRUE16-NEXT:    global_load_d16_b16 v0, v[0:1], off offset:32
+; GFX11-TRUE16-NEXT:    global_load_u16 v2, v[2:3], off offset:48
+; GFX11-TRUE16-NEXT:    global_load_u16 v1, v[0:1], off offset:32
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(1)
-; GFX11-TRUE16-NEXT:    v_sub_nc_u16 v0.h, 0, v2.l
+; GFX11-TRUE16-NEXT:    v_sub_nc_u16 v0.l, 0, v2.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v1.l, v2.l, v0.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v0.h, v2.l, v1.l
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_lshrrev_b16 v0.l, v0.h, v0.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v1.l, v0.l
+; GFX11-TRUE16-NEXT:    v_lshrrev_b16 v0.l, v0.l, v1.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v0.h, v0.l
 ; GFX11-TRUE16-NEXT:    global_store_b16 v[4:5], v0, off offset:8
 ; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
diff --git a/llvm/test/CodeGen/AMDGPU/rotr.ll b/llvm/test/CodeGen/AMDGPU/rotr.ll
index 74ac181c120b5..042b9f7c85d45 100644
--- a/llvm/test/CodeGen/AMDGPU/rotr.ll
+++ b/llvm/test/CodeGen/AMDGPU/rotr.ll
@@ -331,15 +331,15 @@ define void @test_rotr_i16(ptr addrspace(1) nocapture readonly %sourceA, ptr add
 ; GFX11-TRUE16-LABEL: test_rotr_i16:
 ; GFX11-TRUE16:       ; %bb.0: ; %entry
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT:    global_load_d16_b16 v2, v[2:3], off offset:48
-; GFX11-TRUE16-NEXT:    global_load_d16_b16 v0, v[0:1], off offset:32
+; GFX11-TRUE16-NEXT:    global_load_u16 v2, v[2:3], off offset:48
+; GFX11-TRUE16-NEXT:    global_load_u16 v1, v[0:1], off offset:32
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(1)
-; GFX11-TRUE16-NEXT:    v_sub_nc_u16 v0.h, 0, v2.l
+; GFX11-TRUE16-NEXT:    v_sub_nc_u16 v0.l, 0, v2.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    v_lshrrev_b16 v1.l, v2.l, v0.l
+; GFX11-TRUE16-NEXT:    v_lshrrev_b16 v0.h, v2.l, v1.l
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v0.l, v0.h, v0.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v1.l, v0.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v0.l, v0.l, v1.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v0.h, v0.l
 ; GFX11-TRUE16-NEXT:    global_store_b16 v[4:5], v0, off offset:8
 ; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
diff --git a/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll b/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll
index 91c88ec5e718c..9a0a2ee16db1a 100644
--- a/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll
+++ b/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll
@@ -9,8 +9,9 @@
 ; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck --check-prefixes=GFX10,GFX10-GISEL %s
 ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck --check-prefixes=GFX11,GFX11-SDAG,GFX11-SDAG-TRUE16 %s
 ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck --check-prefixes=GFX11,GFX11-SDAG,GFX11-SDAG-FAKE16 %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck --check-prefixes=GFX11,GFX11-GISEL,GFX11-GISEL-TRUE16 %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck --check-prefixes=GFX11,GFX11-GISEL,GFX11-GISEL-FAKE16 %s
+; FIXME-TRUE16 enable gisel
+; XUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck --check-prefixes=GFX11,GFX11-GISEL,GFX11-GISEL-TRUE16 %s
+; XUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck --check-prefixes=GFX11,GFX11-GISEL,GFX11-GISEL-FAKE16 %s
 
 ; Test that add/sub with a constant is swapped to sub/add with negated
 ; constant to minimize code size.
@@ -128,19 +129,18 @@ define amdgpu_kernel void @v_test_i32_x_sub_64(ptr addrspace(1) %out, ptr addrsp
 ; GFX10-GISEL-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX10-GISEL-NEXT:    s_endpgm
 ;
-; GFX11-SDAG-LABEL: v_test_i32_x_sub_64:
-; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-SDAG-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX11-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT:    global_load_b32 v1, v0, s[2:3]
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-SDAG-NEXT:    v_subrev_nc_u32_e32 v1, 64, v1
-; GFX11-SDAG-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-SDAG-NEXT:    s_endpgm
-;
+; GFX11-LABEL: v_test_i32_x_sub_64:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_subrev_nc_u32_e32 v1, 64, v1
+; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT:    s_endpgm
 ; GFX11-GISEL-LABEL: v_test_i32_x_sub_64:
 ; GFX11-GISEL:       ; %bb.0:
 ; GFX11-GISEL-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
@@ -324,25 +324,24 @@ define amdgpu_kernel void @v_test_i32_x_sub_64_multi_use(ptr addrspace(1) %out,
 ; GFX10-GISEL-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-GISEL-NEXT:    s_endpgm
 ;
-; GFX11-SDAG-LABEL: v_test_i32_x_sub_64_multi_use:
-; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-SDAG-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX11-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT:    global_load_b32 v1, v0, s[2:3] glc dlc
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-SDAG-NEXT:    global_load_b32 v2, v0, s[2:3] glc dlc
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-SDAG-NEXT:    v_subrev_nc_u32_e32 v1, 64, v1
-; GFX11-SDAG-NEXT:    v_subrev_nc_u32_e32 v2, 64, v2
-; GFX11-SDAG-NEXT:    global_store_b32 v0, v1, s[0:1] dlc
-; GFX11-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-SDAG-NEXT:    global_store_b32 v0, v2, s[0:1] dlc
-; GFX11-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-SDAG-NEXT:    s_endpgm
-;
+; GFX11-LABEL: v_test_i32_x_sub_64_multi_use:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3] glc dlc
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    global_load_b32 v2, v0, s[2:3] glc dlc
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_subrev_nc_u32_e32 v1, 64, v1
+; GFX11-NEXT:    v_subrev_nc_u32_e32 v2, 64, v2
+; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1] dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    global_store_b32 v0, v2, s[0:1] dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_endpgm
 ; GFX11-GISEL-LABEL: v_test_i32_x_sub_64_multi_use:
 ; GFX11-GISEL:       ; %bb.0:
 ; GFX11-GISEL-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
@@ -1340,7 +1339,7 @@ define amdgpu_kernel void @v_test_i16_x_sub_64(ptr addrspace(1) %out, ptr addrsp
 ; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-SDAG-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 1, v0
 ; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    global_load_d16_b16 v0, v1, s[2:3]
+; GFX11-SDAG-TRUE16-NEXT:    global_load_u16 v0, v1, s[2:3]
 ; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-SDAG-TRUE16-NEXT:    v_sub_nc_u16 v0.l, v0.l, 64
 ; GFX11-SDAG-TRUE16-NEXT:    global_store_b16 v1, v0, s[0:1]
@@ -1358,7 +1357,6 @@ define amdgpu_kernel void @v_test_i16_x_sub_64(ptr addrspace(1) %out, ptr addrsp
 ; GFX11-SDAG-FAKE16-NEXT:    v_sub_nc_u16 v1, v1, 64
 ; GFX11-SDAG-FAKE16-NEXT:    global_store_b16 v0, v1, s[0:1]
 ; GFX11-SDAG-FAKE16-NEXT:    s_endpgm
-;
 ; GFX11-GISEL-TRUE16-LABEL: v_test_i16_x_sub_64:
 ; GFX11-GISEL-TRUE16:       ; %bb.0:
 ; GFX11-GISEL-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
@@ -1371,7 +1369,6 @@ define amdgpu_kernel void @v_test_i16_x_sub_64(ptr addrspace(1) %out, ptr addrsp
 ; GFX11-GISEL-TRUE16-NEXT:    v_add_nc_u16 v0.l, 0xffc0, v0.l
 ; GFX11-GISEL-TRUE16-NEXT:    global_store_b16 v1, v0, s[0:1]
 ; GFX11-GISEL-TRUE16-NEXT:    s_endpgm
-;
 ; GFX11-GISEL-FAKE16-LABEL: v_test_i16_x_sub_64:
 ; GFX11-GISEL-FAKE16:       ; %bb.0:
 ; GFX11-GISEL-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
@@ -1522,16 +1519,16 @@ define amdgpu_kernel void @v_test_i16_x_sub_64_zext_to_i32(ptr addrspace(1) %out
 ; GFX11-SDAG-TRUE16-LABEL: v_test_i16_x_sub_64_zext_to_i32:
 ; GFX11-SDAG-TRUE16:       ; %bb.0:
 ; GFX11-SDAG-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-SDAG-TRUE16-NEXT:    v_and_b32_e32 v1, 0x3ff, v0
+; GFX11-SDAG-TRUE16-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
 ; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT:    v_lshlrev_b32_e32 v0, 1, v1
-; GFX11-SDAG-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 2, v1
+; GFX11-SDAG-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 1, v0
+; GFX11-SDAG-TRUE16-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    global_load_d16_b16 v0, v0, s[2:3]
-; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v0.h, 0
+; GFX11-SDAG-TRUE16-NEXT:    global_load_u16 v1, v1, s[2:3]
 ; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    v_sub_nc_u16 v0.l, v0.l, 64
-; GFX11-SDAG-TRUE16-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v1.h, 0
+; GFX11-SDAG-TRUE16-NEXT:    v_sub_nc_u16 v1.l, v1.l, 64
+; GFX11-SDAG-TRUE16-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-SDAG-TRUE16-NEXT:    s_endpgm
 ;
 ; GFX11-SDAG-FAKE16-LABEL: v_test_i16_x_sub_64_zext_to_i32:
@@ -1549,7 +1546,6 @@ define amdgpu_kernel void @v_test_i16_x_sub_64_zext_to_i32(ptr addrspace(1) %out
 ; GFX11-SDAG-FAKE16-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX11-SDAG-FAKE16-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-SDAG-FAKE16-NEXT:    s_endpgm
-;
 ; GFX11-GISEL-TRUE16-LABEL: v_test_i16_x_sub_64_zext_to_i32:
 ; GFX11-GISEL-TRUE16:       ; %bb.0:
 ; GFX11-GISEL-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
@@ -1564,7 +1560,6 @@ define amdgpu_kernel void @v_test_i16_x_sub_64_zext_to_i32(ptr addrspace(1) %out
 ; GFX11-GISEL-TRUE16-NEXT:    v_add_nc_u16 v0.l, 0xffc0, v0.l
 ; GFX11-GISEL-TRUE16-NEXT:    global_store_b32 v1, v0, s[0:1]
 ; GFX11-GISEL-TRUE16-NEXT:    s_endpgm
-;
 ; GFX11-GISEL-FAKE16-LABEL: v_test_i16_x_sub_64_zext_to_i32:
 ; GFX11-GISEL-FAKE16:       ; %bb.0:
 ; GFX11-GISEL-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
@@ -1759,12 +1754,12 @@ define amdgpu_kernel void @v_test_i16_x_sub_64_multi_use(ptr addrspace(1) %out,
 ; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-SDAG-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 1, v0
 ; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    global_load_d16_b16 v0, v1, s[2:3] glc dlc
+; GFX11-SDAG-TRUE16-NEXT:    global_load_u16 v0, v1, s[2:3] glc dlc
 ; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    global_load_d16_hi_b16 v0, v1, s[2:3] glc dlc
+; GFX11-SDAG-TRUE16-NEXT:    global_load_u16 v2, v1, s[2:3] glc dlc
 ; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-SDAG-TRUE16-NEXT:    v_sub_nc_u16 v0.l, v0.l, 64
-; GFX11-SDAG-TRUE16-NEXT:    v_sub_nc_u16 v0.h, v0.h, 64
+; GFX11-SDAG-TRUE16-NEXT:    v_sub_nc_u16 v0.h, v2.l, 64
 ; GFX11-SDAG-TRUE16-NEXT:    global_store_b16 v1, v0, s[0:1] dlc
 ; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-SDAG-TRUE16-NEXT:    global_store_d16_hi_b16 v1, v0, s[0:1] dlc
@@ -1789,7 +1784,6 @@ define amdgpu_kernel void @v_test_i16_x_sub_64_multi_use(ptr addrspace(1) %out,
 ; GFX11-SDAG-FAKE16-NEXT:    global_store_b16 v0, v2, s[0:1] dlc
 ; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-SDAG-FAKE16-NEXT:    s_endpgm
-;
 ; GFX11-GISEL-TRUE16-LABEL: v_test_i16_x_sub_64_multi_use:
 ; GFX11-GISEL-TRUE16:       ; %bb.0:
 ; GFX11-GISEL-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
@@ -1808,7 +1802,6 @@ define amdgpu_kernel void @v_test_i16_x_sub_64_multi_use(ptr addrspace(1) %out,
 ; GFX11-GISEL-TRUE16-NEXT:    global_store_d16_hi_b16 v1, v0, s[0:1] dlc
 ; GFX11-GISEL-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-GISEL-TRUE16-NEXT:    s_endpgm
-;
 ; GFX11-GISEL-FAKE16-LABEL: v_test_i16_x_sub_64_multi_use:
 ; GFX11-GISEL-FAKE16:       ; %bb.0:
 ; GFX11-GISEL-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
@@ -3648,19 +3641,18 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_fpone(ptr addrspace(1) %out, p
 ; GFX10-GISEL-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX10-GISEL-NEXT:    s_endpgm
 ;
-; GFX11-SDAG-LABEL: v_test_v2i16_x_add_neg_fpone:
-; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-SDAG-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX11-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT:    global_load_b32 v1, v0, s[2:3]
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-SDAG-NEXT:    v_pk_add_u16 v1, 0xffffc400, v1 op_sel_hi:[0,1]
-; GFX11-SDAG-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-SDAG-NEXT:    s_endpgm
-;
+; GFX11-LABEL: v_test_v2i16_x_add_neg_fpone:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_pk_add_u16 v1, 0xffffc400, v1 op_sel_hi:[0,1]
+; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT:    s_endpgm
 ; GFX11-GISEL-LABEL: v_test_v2i16_x_add_neg_fpone:
 ; GFX11-GISEL:       ; %bb.0:
 ; GFX11-GISEL-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
@@ -3813,19 +3805,18 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_negfpone(ptr addrspace(1) %out
 ; GFX10-GISEL-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX10-GISEL-NEXT:    s_endpgm
 ;
-; GFX11-SDAG-LABEL: v_test_v2i16_x_add_neg_negfpone:
-; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-SDAG-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX11-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT:    global_load_b32 v1, v0, s[2:3]
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-SDAG-NEXT:    v_pk_add_u16 v1, 0x4400, v1 op_sel_hi:[0,1]
-; GFX11-SDAG-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-SDAG-NEXT:    s_endpgm
-;
+; GFX11-LABEL: v_test_v2i16_x_add_neg_negfpone:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_pk_add_u16 v1, 0x4400, v1 op_sel_hi:[0,1]
+; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT:    s_endpgm
 ; GFX11-GISEL-LABEL: v_test_v2i16_x_add_neg_negfpone:
 ; GFX11-GISEL:       ; %bb.0:
 ; GFX11-GISEL-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
@@ -4203,19 +4194,18 @@ define amdgpu_kernel void @v_test_v2i16_x_add_undef_neg32(ptr addrspace(1) %out,
 ; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
-; GFX11-SDAG-LABEL: v_test_v2i16_x_add_undef_neg32:
-; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-SDAG-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX11-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT:    global_load_b32 v1, v0, s[2:3]
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-SDAG-NEXT:    v_pk_sub_u16 v1, v1, 32 op_sel:[0,1] op_sel_hi:[1,0]
-; GFX11-SDAG-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-SDAG-NEXT:    s_endpgm
-;
+; GFX11-LABEL: v_test_v2i16_x_add_undef_neg32:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_pk_sub_u16 v1, v1, 32 op_sel:[0,1] op_sel_hi:[1,0]
+; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT:    s_endpgm
 ; GFX11-GISEL-TRUE16-LABEL: v_test_v2i16_x_add_undef_neg32:
 ; GFX11-GISEL-TRUE16:       ; %bb.0:
 ; GFX11-GISEL-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
@@ -4229,7 +4219,6 @@ define amdgpu_kernel void @v_test_v2i16_x_add_undef_neg32(ptr addrspace(1) %out,
 ; GFX11-GISEL-TRUE16-NEXT:    v_pk_add_u16 v1, v1, s2
 ; GFX11-GISEL-TRUE16-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-GISEL-TRUE16-NEXT:    s_endpgm
-;
 ; GFX11-GISEL-FAKE16-LABEL: v_test_v2i16_x_add_undef_neg32:
 ; GFX11-GISEL-FAKE16:       ; %bb.0:
 ; GFX11-GISEL-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
@@ -4371,19 +4360,18 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg32_undef(ptr addrspace(1) %out,
 ; GFX10-GISEL-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX10-GISEL-NEXT:    s_endpgm
 ;
-; GFX11-SDAG-LABEL: v_test_v2i16_x_add_neg32_undef:
-; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-SDAG-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX11-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT:    global_load_b32 v1, v0, s[2:3]
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-SDAG-NEXT:    v_pk_sub_u16 v1, v1, 32
-; GFX11-SDAG-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-SDAG-NEXT:    s_endpgm
-;
+; GFX11-LABEL: v_test_v2i16_x_add_neg32_undef:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_pk_sub_u16 v1, v1, 32
+; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT:    s_endpgm
 ; GFX11-GISEL-LABEL: v_test_v2i16_x_add_neg32_undef:
 ; GFX11-GISEL:       ; %bb.0:
 ; GFX11-GISEL-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
@@ -4410,3 +4398,5 @@ declare i32 @llvm.amdgcn.workitem.id.x() #1
 
 attributes #0 = { nounwind }
 attributes #1 = { nounwind readnone }
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GFX11-SDAG: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/smed3.ll b/llvm/test/CodeGen/AMDGPU/smed3.ll
index a9fb77904c641..3c774c521e3e2 100644
--- a/llvm/test/CodeGen/AMDGPU/smed3.ll
+++ b/llvm/test/CodeGen/AMDGPU/smed3.ll
@@ -2,7 +2,7 @@
 ; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global,-real-true16 < %s | FileCheck -check-prefixes=GFX11-FAKE16 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global,+real-true16 < %s | FileCheck -check-prefixes=GFX11-TRUE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global,+real-true16,-d16-hw-bug < %s | FileCheck -check-prefixes=GFX11-TRUE16 %s
 
 declare i32 @llvm.amdgcn.workitem.id.x() #0
 
diff --git a/llvm/test/CodeGen/AMDGPU/spillv16.ll b/llvm/test/CodeGen/AMDGPU/spillv16.ll
index 3d21860e2af40..9708a359360b3 100644
--- a/llvm/test/CodeGen/AMDGPU/spillv16.ll
+++ b/llvm/test/CodeGen/AMDGPU/spillv16.ll
@@ -6,7 +6,7 @@ define void @spill_i16_alu() {
 ; GCN-TRUE16-LABEL: spill_i16_alu:
 ; GCN-TRUE16:       ; %bb.0: ; %entry
 ; GCN-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-TRUE16-NEXT:    scratch_load_d16_b16 v0, off, s32 glc dlc
+; GCN-TRUE16-NEXT:    scratch_load_u16 v0, off, s32 glc dlc
 ; GCN-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-TRUE16-NEXT:    v_add_nc_u16 v0.l, 0x7b, v0.l
 ; GCN-TRUE16-NEXT:    scratch_store_b16 off, v0, s32 offset:2 ; 2-byte Folded Spill
@@ -52,13 +52,13 @@ define void @spill_i16_alu_two_vals() {
 ; GCN-TRUE16-LABEL: spill_i16_alu_two_vals:
 ; GCN-TRUE16:       ; %bb.0: ; %entry
 ; GCN-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-TRUE16-NEXT:    scratch_load_d16_b16 v0, off, s32 glc dlc
+; GCN-TRUE16-NEXT:    scratch_load_u16 v0, off, s32 glc dlc
 ; GCN-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-TRUE16-NEXT:    v_add_nc_u16 v0.l, 0x7b, v0.l
 ; GCN-TRUE16-NEXT:    scratch_store_b16 off, v0, s32 offset:6 ; 2-byte Folded Spill
 ; GCN-TRUE16-NEXT:    ;;#ASMSTART
 ; GCN-TRUE16-NEXT:    ;;#ASMEND
-; GCN-TRUE16-NEXT:    scratch_load_d16_b16 v0, off, s32 offset:4 glc dlc
+; GCN-TRUE16-NEXT:    scratch_load_u16 v0, off, s32 offset:4 glc dlc
 ; GCN-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-TRUE16-NEXT:    scratch_load_d16_hi_b16 v0, off, s32 offset:6 ; 2-byte Folded Reload
 ; GCN-TRUE16-NEXT:    v_add_nc_u16 v0.l, 0x7b, v0.l
@@ -113,33 +113,19 @@ entry:
 ; Tests after this do not actually test 16 bit spills because there is no use of VGPR_16. They could demonstrate 16 bit spills if we update the instructions to use VGPR_16 instead of VGPR_32
 
 define void @spill_i16() {
-; GCN-TRUE16-LABEL: spill_i16:
-; GCN-TRUE16:       ; %bb.0: ; %entry
-; GCN-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-TRUE16-NEXT:    scratch_load_d16_b16 v0, off, s32 glc dlc
-; GCN-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GCN-TRUE16-NEXT:    scratch_store_b16 off, v0, s32 offset:2 ; 2-byte Folded Spill
-; GCN-TRUE16-NEXT:    ;;#ASMSTART
-; GCN-TRUE16-NEXT:    ;;#ASMEND
-; GCN-TRUE16-NEXT:    scratch_load_d16_b16 v0, off, s32 offset:2 ; 2-byte Folded Reload
-; GCN-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GCN-TRUE16-NEXT:    scratch_store_b16 off, v0, s32 dlc
-; GCN-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
-; GCN-TRUE16-NEXT:    s_setpc_b64 s[30:31]
-;
-; GCN-FAKE16-LABEL: spill_i16:
-; GCN-FAKE16:       ; %bb.0: ; %entry
-; GCN-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-FAKE16-NEXT:    scratch_load_u16 v0, off, s32 glc dlc
-; GCN-FAKE16-NEXT:    s_waitcnt vmcnt(0)
-; GCN-FAKE16-NEXT:    scratch_store_b32 off, v0, s32 offset:4 ; 4-byte Folded Spill
-; GCN-FAKE16-NEXT:    ;;#ASMSTART
-; GCN-FAKE16-NEXT:    ;;#ASMEND
-; GCN-FAKE16-NEXT:    scratch_load_b32 v0, off, s32 offset:4 ; 4-byte Folded Reload
-; GCN-FAKE16-NEXT:    s_waitcnt vmcnt(0)
-; GCN-FAKE16-NEXT:    scratch_store_b16 off, v0, s32 dlc
-; GCN-FAKE16-NEXT:    s_waitcnt_vscnt null, 0x0
-; GCN-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+; GCN-LABEL: spill_i16:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    scratch_load_u16 v0, off, s32 glc dlc
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    scratch_store_b32 off, v0, s32 offset:4 ; 4-byte Folded Spill
+; GCN-NEXT:    ;;#ASMSTART
+; GCN-NEXT:    ;;#ASMEND
+; GCN-NEXT:    scratch_load_b32 v0, off, s32 offset:4 ; 4-byte Folded Reload
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    scratch_store_b16 off, v0, s32 dlc
+; GCN-NEXT:    s_waitcnt_vscnt null, 0x0
+; GCN-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %alloca = alloca i16, i32 1, align 4, addrspace(5)
 
@@ -156,33 +142,19 @@ entry:
 }
 
 define void @spill_half() {
-; GCN-TRUE16-LABEL: spill_half:
-; GCN-TRUE16:       ; %bb.0: ; %entry
-; GCN-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-TRUE16-NEXT:    scratch_load_d16_b16 v0, off, s32 glc dlc
-; GCN-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GCN-TRUE16-NEXT:    scratch_store_b16 off, v0, s32 offset:2 ; 2-byte Folded Spill
-; GCN-TRUE16-NEXT:    ;;#ASMSTART
-; GCN-TRUE16-NEXT:    ;;#ASMEND
-; GCN-TRUE16-NEXT:    scratch_load_d16_b16 v0, off, s32 offset:2 ; 2-byte Folded Reload
-; GCN-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GCN-TRUE16-NEXT:    scratch_store_b16 off, v0, s32 dlc
-; GCN-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
-; GCN-TRUE16-NEXT:    s_setpc_b64 s[30:31]
-;
-; GCN-FAKE16-LABEL: spill_half:
-; GCN-FAKE16:       ; %bb.0: ; %entry
-; GCN-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-FAKE16-NEXT:    scratch_load_u16 v0, off, s32 glc dlc
-; GCN-FAKE16-NEXT:    s_waitcnt vmcnt(0)
-; GCN-FAKE16-NEXT:    scratch_store_b32 off, v0, s32 offset:4 ; 4-byte Folded Spill
-; GCN-FAKE16-NEXT:    ;;#ASMSTART
-; GCN-FAKE16-NEXT:    ;;#ASMEND
-; GCN-FAKE16-NEXT:    scratch_load_b32 v0, off, s32 offset:4 ; 4-byte Folded Reload
-; GCN-FAKE16-NEXT:    s_waitcnt vmcnt(0)
-; GCN-FAKE16-NEXT:    scratch_store_b16 off, v0, s32 dlc
-; GCN-FAKE16-NEXT:    s_waitcnt_vscnt null, 0x0
-; GCN-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+; GCN-LABEL: spill_half:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    scratch_load_u16 v0, off, s32 glc dlc
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    scratch_store_b32 off, v0, s32 offset:4 ; 4-byte Folded Spill
+; GCN-NEXT:    ;;#ASMSTART
+; GCN-NEXT:    ;;#ASMEND
+; GCN-NEXT:    scratch_load_b32 v0, off, s32 offset:4 ; 4-byte Folded Reload
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    scratch_store_b16 off, v0, s32 dlc
+; GCN-NEXT:    s_waitcnt_vscnt null, 0x0
+; GCN-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %alloca = alloca half, i32 1, align 4, addrspace(5)
 
@@ -199,33 +171,19 @@ entry:
 }
 
 define void @spill_i16_from_v2i16() {
-; GCN-TRUE16-LABEL: spill_i16_from_v2i16:
-; GCN-TRUE16:       ; %bb.0: ; %entry
-; GCN-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-TRUE16-NEXT:    scratch_load_d16_b16 v0, off, s32 offset:2 glc dlc
-; GCN-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GCN-TRUE16-NEXT:    scratch_store_b16 off, v0, s32 offset:8 ; 2-byte Folded Spill
-; GCN-TRUE16-NEXT:    ;;#ASMSTART
-; GCN-TRUE16-NEXT:    ;;#ASMEND
-; GCN-TRUE16-NEXT:    scratch_load_d16_b16 v0, off, s32 offset:8 ; 2-byte Folded Reload
-; GCN-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GCN-TRUE16-NEXT:    scratch_store_b16 off, v0, s32 offset:2 dlc
-; GCN-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
-; GCN-TRUE16-NEXT:    s_setpc_b64 s[30:31]
-;
-; GCN-FAKE16-LABEL: spill_i16_from_v2i16:
-; GCN-FAKE16:       ; %bb.0: ; %entry
-; GCN-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-FAKE16-NEXT:    scratch_load_u16 v0, off, s32 offset:2 glc dlc
-; GCN-FAKE16-NEXT:    s_waitcnt vmcnt(0)
-; GCN-FAKE16-NEXT:    scratch_store_b32 off, v0, s32 offset:8 ; 4-byte Folded Spill
-; GCN-FAKE16-NEXT:    ;;#ASMSTART
-; GCN-FAKE16-NEXT:    ;;#ASMEND
-; GCN-FAKE16-NEXT:    scratch_load_b32 v0, off, s32 offset:8 ; 4-byte Folded Reload
-; GCN-FAKE16-NEXT:    s_waitcnt vmcnt(0)
-; GCN-FAKE16-NEXT:    scratch_store_b16 off, v0, s32 offset:2 dlc
-; GCN-FAKE16-NEXT:    s_waitcnt_vscnt null, 0x0
-; GCN-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+; GCN-LABEL: spill_i16_from_v2i16:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    scratch_load_u16 v0, off, s32 offset:2 glc dlc
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    scratch_store_b32 off, v0, s32 offset:8 ; 4-byte Folded Spill
+; GCN-NEXT:    ;;#ASMSTART
+; GCN-NEXT:    ;;#ASMEND
+; GCN-NEXT:    scratch_load_b32 v0, off, s32 offset:8 ; 4-byte Folded Reload
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    scratch_store_b16 off, v0, s32 offset:2 dlc
+; GCN-NEXT:    s_waitcnt_vscnt null, 0x0
+; GCN-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %alloca = alloca <2 x i16>, i32 2, align 1, addrspace(5)
 
@@ -245,19 +203,19 @@ define void @spill_2xi16_from_v2i16() {
 ; GCN-TRUE16-LABEL: spill_2xi16_from_v2i16:
 ; GCN-TRUE16:       ; %bb.0: ; %entry
 ; GCN-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-TRUE16-NEXT:    scratch_load_d16_b16 v0, off, s32 offset:2 glc dlc
+; GCN-TRUE16-NEXT:    scratch_load_u16 v0, off, s32 offset:2 glc dlc
 ; GCN-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GCN-TRUE16-NEXT:    scratch_store_b16 off, v0, s32 offset:8 ; 2-byte Folded Spill
-; GCN-TRUE16-NEXT:    scratch_load_d16_b16 v0, off, s32 glc dlc
+; GCN-TRUE16-NEXT:    scratch_store_b32 off, v0, s32 offset:12 ; 4-byte Folded Spill
+; GCN-TRUE16-NEXT:    scratch_load_u16 v0, off, s32 glc dlc
 ; GCN-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GCN-TRUE16-NEXT:    scratch_store_b16 off, v0, s32 offset:10 ; 2-byte Folded Spill
+; GCN-TRUE16-NEXT:    scratch_store_b32 off, v0, s32 offset:8 ; 4-byte Folded Spill
 ; GCN-TRUE16-NEXT:    ;;#ASMSTART
 ; GCN-TRUE16-NEXT:    ;;#ASMEND
-; GCN-TRUE16-NEXT:    scratch_load_d16_b16 v0, off, s32 offset:8 ; 2-byte Folded Reload
+; GCN-TRUE16-NEXT:    scratch_load_b32 v0, off, s32 offset:12 ; 4-byte Folded Reload
 ; GCN-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-TRUE16-NEXT:    scratch_store_b16 off, v0, s32 offset:2 dlc
 ; GCN-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
-; GCN-TRUE16-NEXT:    scratch_load_d16_b16 v0, off, s32 offset:10 ; 2-byte Folded Reload
+; GCN-TRUE16-NEXT:    scratch_load_b32 v0, off, s32 offset:8 ; 4-byte Folded Reload
 ; GCN-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-TRUE16-NEXT:    scratch_store_b16 off, v0, s32 dlc
 ; GCN-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -306,19 +264,17 @@ define void @spill_2xi16_from_v2i16_one_free_reg() {
 ; GCN-TRUE16-LABEL: spill_2xi16_from_v2i16_one_free_reg:
 ; GCN-TRUE16:       ; %bb.0: ; %entry
 ; GCN-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-TRUE16-NEXT:    scratch_load_d16_b16 v0, off, s32 offset:2 glc dlc
+; GCN-TRUE16-NEXT:    scratch_load_u16 v7, off, s32 offset:2 glc dlc
 ; GCN-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GCN-TRUE16-NEXT:    scratch_store_b16 off, v0, s32 offset:8 ; 2-byte Folded Spill
-; GCN-TRUE16-NEXT:    scratch_load_d16_b16 v0, off, s32 glc dlc
+; GCN-TRUE16-NEXT:    scratch_load_u16 v0, off, s32 glc dlc
 ; GCN-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GCN-TRUE16-NEXT:    scratch_store_b16 off, v0, s32 offset:10 ; 2-byte Folded Spill
+; GCN-TRUE16-NEXT:    scratch_store_b32 off, v0, s32 offset:8 ; 4-byte Folded Spill
 ; GCN-TRUE16-NEXT:    ;;#ASMSTART
 ; GCN-TRUE16-NEXT:    ;;#ASMEND
-; GCN-TRUE16-NEXT:    scratch_load_d16_b16 v0, off, s32 offset:8 ; 2-byte Folded Reload
-; GCN-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GCN-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v7.l
 ; GCN-TRUE16-NEXT:    scratch_store_b16 off, v0, s32 offset:2 dlc
 ; GCN-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
-; GCN-TRUE16-NEXT:    scratch_load_d16_b16 v0, off, s32 offset:10 ; 2-byte Folded Reload
+; GCN-TRUE16-NEXT:    scratch_load_b32 v0, off, s32 offset:8 ; 4-byte Folded Reload
 ; GCN-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-TRUE16-NEXT:    scratch_store_b16 off, v0, s32 dlc
 ; GCN-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
diff --git a/llvm/test/CodeGen/AMDGPU/strict_fpext.ll b/llvm/test/CodeGen/AMDGPU/strict_fpext.ll
index 40aac82888de2..aed966e5728dd 100644
--- a/llvm/test/CodeGen/AMDGPU/strict_fpext.ll
+++ b/llvm/test/CodeGen/AMDGPU/strict_fpext.ll
@@ -506,7 +506,7 @@ define float @v_constrained_fpext_f16_to_f32_noabi(ptr addrspace(1) %ptr) #0 {
 ; GFX11-TRUE16-LABEL: v_constrained_fpext_f16_to_f32_noabi:
 ; GFX11-TRUE16:       ; %bb.0:
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT:    global_load_d16_b16 v0, v[0:1], off
+; GFX11-TRUE16-NEXT:    global_load_u16 v0, v[0:1], off
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e32 v0, v0.l
 ; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/uaddo.ll b/llvm/test/CodeGen/AMDGPU/uaddo.ll
index e1574dcd45462..93ffa6613b363 100644
--- a/llvm/test/CodeGen/AMDGPU/uaddo.ll
+++ b/llvm/test/CodeGen/AMDGPU/uaddo.ll
@@ -739,7 +739,7 @@ define amdgpu_kernel void @v_uaddo_i16(ptr addrspace(1) %out, ptr addrspace(1) %
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    global_load_d16_b16 v1, v0, s[4:5]
+; GFX11-NEXT:    global_load_u16 v1, v0, s[4:5]
 ; GFX11-NEXT:    global_load_u16 v2, v0, s[6:7]
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_add_nc_u32_e32 v2, v1, v2
diff --git a/llvm/test/CodeGen/AMDGPU/umed3.ll b/llvm/test/CodeGen/AMDGPU/umed3.ll
index 9d8a45ada87aa..9b33b25b33846 100644
--- a/llvm/test/CodeGen/AMDGPU/umed3.ll
+++ b/llvm/test/CodeGen/AMDGPU/umed3.ll
@@ -2,7 +2,7 @@
 ; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global,-real-true16 < %s | FileCheck -check-prefix=GFX11-FAKE16 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global,+real-true16 < %s | FileCheck -check-prefix=GFX11-TRUE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global,+real-true16,-d16-hw-bug < %s | FileCheck -check-prefix=GFX11-TRUE16 %s
 
 declare i32 @llvm.amdgcn.workitem.id.x() #0
 
diff --git a/llvm/test/CodeGen/AMDGPU/usubo.ll b/llvm/test/CodeGen/AMDGPU/usubo.ll
index 0289dab4588a2..ba441a0029b51 100644
--- a/llvm/test/CodeGen/AMDGPU/usubo.ll
+++ b/llvm/test/CodeGen/AMDGPU/usubo.ll
@@ -738,7 +738,7 @@ define amdgpu_kernel void @v_usubo_i16(ptr addrspace(1) %out, ptr addrspace(1) %
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    global_load_d16_b16 v1, v0, s[4:5]
+; GFX11-NEXT:    global_load_u16 v1, v0, s[4:5]
 ; GFX11-NEXT:    global_load_u16 v2, v0, s[6:7]
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_sub_nc_u32_e32 v2, v1, v2
diff --git a/llvm/test/CodeGen/AMDGPU/v_cndmask.ll b/llvm/test/CodeGen/AMDGPU/v_cndmask.ll
index b314cf2e1d9cc..8c1bc1e0be808 100644
--- a/llvm/test/CodeGen/AMDGPU/v_cndmask.ll
+++ b/llvm/test/CodeGen/AMDGPU/v_cndmask.ll
@@ -2232,11 +2232,10 @@ define amdgpu_kernel void @v_cndmask_abs_neg_f16(ptr addrspace(1) %out, i32 %c,
 ; GFX11-TRUE16:       ; %bb.0:
 ; GFX11-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x34
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
-; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v1, 0
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT:    global_load_d16_b16 v0, v0, s[0:1]
+; GFX11-TRUE16-NEXT:    global_load_u16 v1, v0, s[0:1]
 ; GFX11-TRUE16-NEXT:    s_clause 0x1
 ; GFX11-TRUE16-NEXT:    s_load_b32 s2, s[4:5], 0x2c
 ; GFX11-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
@@ -2244,10 +2243,11 @@ define amdgpu_kernel void @v_cndmask_abs_neg_f16(ptr addrspace(1) %out, i32 %c,
 ; GFX11-TRUE16-NEXT:    s_cmp_lg_u32 s2, 0
 ; GFX11-TRUE16-NEXT:    s_cselect_b64 s[2:3], -1, 0
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0x7fff, v0.l
-; GFX11-TRUE16-NEXT:    v_xor_b16 v0.l, 0x8000, v0.l
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v0.l, v0.h, s[2:3]
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0x7fff, v1.l
+; GFX11-TRUE16-NEXT:    v_xor_b16 v0.h, 0x8000, v1.l
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v1, 0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v0.h, v0.l, s[2:3]
 ; GFX11-TRUE16-NEXT:    global_store_b16 v1, v0, s[0:1]
 ; GFX11-TRUE16-NEXT:    s_endpgm
 ;
diff --git a/llvm/test/CodeGen/AMDGPU/v_pack.ll b/llvm/test/CodeGen/AMDGPU/v_pack.ll
index d8044139aceb3..bad7ea716ca48 100644
--- a/llvm/test/CodeGen/AMDGPU/v_pack.ll
+++ b/llvm/test/CodeGen/AMDGPU/v_pack.ll
@@ -4,7 +4,8 @@
 ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global,-real-true16 < %s | FileCheck -enable-var-scope -check-prefixes=GFX11-GCN-FAKE16 %s
 ; RUN: llc -global-isel -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global,-real-true16 < %s | FileCheck -enable-var-scope -check-prefixes=GFX11-GISEL-FAKE16 %s
 ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global,+real-true16 < %s | FileCheck -enable-var-scope -check-prefixes=GFX11-GCN-REAL16 %s
-; RUN: llc -global-isel -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global,+real-true16 < %s | FileCheck -enable-var-scope -check-prefixes=GFX11-GISEL-REAL16 %s
+; FIXME-TRUE16 enable gisel
+; XUN: llc -global-isel -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global,+real-true16 < %s | FileCheck -enable-var-scope -check-prefixes=GFX11-GISEL-REAL16 %s
 
 declare i32 @llvm.amdgcn.workitem.id.x() #1
 
@@ -88,21 +89,20 @@ define amdgpu_kernel void @v_pack_b32_v2f16(ptr addrspace(1) %in0, ptr addrspace
 ; GFX11-GCN-REAL16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX11-GCN-REAL16-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
 ; GFX11-GCN-REAL16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-GCN-REAL16-NEXT:    v_lshlrev_b32_e32 v1, 1, v0
+; GFX11-GCN-REAL16-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX11-GCN-REAL16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-GCN-REAL16-NEXT:    global_load_d16_b16 v0, v1, s[0:1] glc dlc
+; GFX11-GCN-REAL16-NEXT:    global_load_u16 v1, v0, s[0:1] glc dlc
 ; GFX11-GCN-REAL16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-GCN-REAL16-NEXT:    global_load_d16_hi_b16 v0, v1, s[2:3] glc dlc
+; GFX11-GCN-REAL16-NEXT:    global_load_u16 v2, v0, s[2:3] glc dlc
 ; GFX11-GCN-REAL16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-GCN-REAL16-NEXT:    v_add_f16_e32 v0.l, 2.0, v0.l
-; GFX11-GCN-REAL16-NEXT:    v_add_f16_e32 v0.h, 2.0, v0.h
+; GFX11-GCN-REAL16-NEXT:    v_add_f16_e32 v0.l, 2.0, v1.l
+; GFX11-GCN-REAL16-NEXT:    v_add_f16_e32 v0.h, 2.0, v2.l
 ; GFX11-GCN-REAL16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-GCN-REAL16-NEXT:    v_pack_b32_f16 v0, v0.l, v0.h
 ; GFX11-GCN-REAL16-NEXT:    ;;#ASMSTART
 ; GFX11-GCN-REAL16-NEXT:    ; use v0
 ; GFX11-GCN-REAL16-NEXT:    ;;#ASMEND
 ; GFX11-GCN-REAL16-NEXT:    s_endpgm
-;
 ; GFX11-GISEL-REAL16-LABEL: v_pack_b32_v2f16:
 ; GFX11-GISEL-REAL16:       ; %bb.0:
 ; GFX11-GISEL-REAL16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
@@ -217,21 +217,20 @@ define amdgpu_kernel void @v_pack_b32_v2f16_sub(ptr addrspace(1) %in0, ptr addrs
 ; GFX11-GCN-REAL16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX11-GCN-REAL16-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
 ; GFX11-GCN-REAL16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-GCN-REAL16-NEXT:    v_lshlrev_b32_e32 v1, 1, v0
+; GFX11-GCN-REAL16-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX11-GCN-REAL16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-GCN-REAL16-NEXT:    global_load_d16_b16 v0, v1, s[0:1] glc dlc
+; GFX11-GCN-REAL16-NEXT:    global_load_u16 v1, v0, s[0:1] glc dlc
 ; GFX11-GCN-REAL16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-GCN-REAL16-NEXT:    global_load_d16_hi_b16 v0, v1, s[2:3] glc dlc
+; GFX11-GCN-REAL16-NEXT:    global_load_u16 v2, v0, s[2:3] glc dlc
 ; GFX11-GCN-REAL16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-GCN-REAL16-NEXT:    v_subrev_f16_e32 v0.l, 2.0, v0.l
-; GFX11-GCN-REAL16-NEXT:    v_add_f16_e32 v0.h, 2.0, v0.h
+; GFX11-GCN-REAL16-NEXT:    v_subrev_f16_e32 v0.l, 2.0, v1.l
+; GFX11-GCN-REAL16-NEXT:    v_add_f16_e32 v0.h, 2.0, v2.l
 ; GFX11-GCN-REAL16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-GCN-REAL16-NEXT:    v_pack_b32_f16 v0, v0.l, v0.h
 ; GFX11-GCN-REAL16-NEXT:    ;;#ASMSTART
 ; GFX11-GCN-REAL16-NEXT:    ; use v0
 ; GFX11-GCN-REAL16-NEXT:    ;;#ASMEND
 ; GFX11-GCN-REAL16-NEXT:    s_endpgm
-;
 ; GFX11-GISEL-REAL16-LABEL: v_pack_b32_v2f16_sub:
 ; GFX11-GISEL-REAL16:       ; %bb.0:
 ; GFX11-GISEL-REAL16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
@@ -357,7 +356,6 @@ define amdgpu_kernel void @fptrunc(
 ; GFX11-GCN-REAL16-NEXT:    v_pack_b32_f16 v0, v0.h, v0.l
 ; GFX11-GCN-REAL16-NEXT:    buffer_store_b32 v0, off, s[4:7], 0
 ; GFX11-GCN-REAL16-NEXT:    s_endpgm
-;
 ; GFX11-GISEL-REAL16-LABEL: fptrunc:
 ; GFX11-GISEL-REAL16:       ; %bb.0:
 ; GFX11-GISEL-REAL16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
@@ -460,21 +458,20 @@ define amdgpu_kernel void @v_pack_b32.fabs(ptr addrspace(1) %in0, ptr addrspace(
 ; GFX11-GCN-REAL16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX11-GCN-REAL16-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
 ; GFX11-GCN-REAL16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-GCN-REAL16-NEXT:    v_lshlrev_b32_e32 v1, 1, v0
+; GFX11-GCN-REAL16-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX11-GCN-REAL16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-GCN-REAL16-NEXT:    global_load_d16_b16 v0, v1, s[0:1] glc dlc
+; GFX11-GCN-REAL16-NEXT:    global_load_u16 v1, v0, s[0:1] glc dlc
 ; GFX11-GCN-REAL16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-GCN-REAL16-NEXT:    global_load_d16_hi_b16 v0, v1, s[2:3] glc dlc
+; GFX11-GCN-REAL16-NEXT:    global_load_u16 v2, v0, s[2:3] glc dlc
 ; GFX11-GCN-REAL16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-GCN-REAL16-NEXT:    v_add_f16_e32 v0.l, 2.0, v0.l
-; GFX11-GCN-REAL16-NEXT:    v_add_f16_e32 v0.h, 2.0, v0.h
+; GFX11-GCN-REAL16-NEXT:    v_add_f16_e32 v0.l, 2.0, v1.l
+; GFX11-GCN-REAL16-NEXT:    v_add_f16_e32 v0.h, 2.0, v2.l
 ; GFX11-GCN-REAL16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-GCN-REAL16-NEXT:    v_pack_b32_f16 v0, |v0.l|, |v0.h|
 ; GFX11-GCN-REAL16-NEXT:    ;;#ASMSTART
 ; GFX11-GCN-REAL16-NEXT:    ; use v0
 ; GFX11-GCN-REAL16-NEXT:    ;;#ASMEND
 ; GFX11-GCN-REAL16-NEXT:    s_endpgm
-;
 ; GFX11-GISEL-REAL16-LABEL: v_pack_b32.fabs:
 ; GFX11-GISEL-REAL16:       ; %bb.0:
 ; GFX11-GISEL-REAL16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
@@ -591,21 +588,20 @@ define amdgpu_kernel void @v_pack_b32.fneg(ptr addrspace(1) %in0, ptr addrspace(
 ; GFX11-GCN-REAL16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX11-GCN-REAL16-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
 ; GFX11-GCN-REAL16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-GCN-REAL16-NEXT:    v_lshlrev_b32_e32 v1, 1, v0
+; GFX11-GCN-REAL16-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX11-GCN-REAL16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-GCN-REAL16-NEXT:    global_load_d16_b16 v0, v1, s[0:1] glc dlc
+; GFX11-GCN-REAL16-NEXT:    global_load_u16 v1, v0, s[0:1] glc dlc
 ; GFX11-GCN-REAL16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-GCN-REAL16-NEXT:    global_load_d16_hi_b16 v0, v1, s[2:3] glc dlc
+; GFX11-GCN-REAL16-NEXT:    global_load_u16 v2, v0, s[2:3] glc dlc
 ; GFX11-GCN-REAL16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-GCN-REAL16-NEXT:    v_add_f16_e32 v0.l, 2.0, v0.l
-; GFX11-GCN-REAL16-NEXT:    v_add_f16_e32 v0.h, 2.0, v0.h
+; GFX11-GCN-REAL16-NEXT:    v_add_f16_e32 v0.l, 2.0, v1.l
+; GFX11-GCN-REAL16-NEXT:    v_add_f16_e32 v0.h, 2.0, v2.l
 ; GFX11-GCN-REAL16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-GCN-REAL16-NEXT:    v_pack_b32_f16 v0, -v0.l, -v0.h
 ; GFX11-GCN-REAL16-NEXT:    ;;#ASMSTART
 ; GFX11-GCN-REAL16-NEXT:    ; use v0
 ; GFX11-GCN-REAL16-NEXT:    ;;#ASMEND
 ; GFX11-GCN-REAL16-NEXT:    s_endpgm
-;
 ; GFX11-GISEL-REAL16-LABEL: v_pack_b32.fneg:
 ; GFX11-GISEL-REAL16:       ; %bb.0:
 ; GFX11-GISEL-REAL16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
diff --git a/llvm/test/CodeGen/AMDGPU/vector_rebroadcast.ll b/llvm/test/CodeGen/AMDGPU/vector_rebroadcast.ll
index 587f5d05d358b..7ddf7cac05a8a 100644
--- a/llvm/test/CodeGen/AMDGPU/vector_rebroadcast.ll
+++ b/llvm/test/CodeGen/AMDGPU/vector_rebroadcast.ll
@@ -26,7 +26,7 @@ define <2 x i8> @shuffle_v2i8_rebroadcast(ptr addrspace(1) %arg0) {
 ; GFX11-TRUE16-LABEL: shuffle_v2i8_rebroadcast:
 ; GFX11-TRUE16:       ; %bb.0: ; %entry
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT:    global_load_d16_b16 v0, v[0:1], off
+; GFX11-TRUE16-NEXT:    global_load_u16 v0, v[0:1], off
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b16 v1.l, 8, v0.l
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
diff --git a/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll b/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll
index 11d724eda547e..babd6edc1cc44 100644
--- a/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll
+++ b/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll
@@ -3017,23 +3017,14 @@ define void @shuffle_v4i8_concat(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1,
 ; GFX10-NEXT:    global_store_dword v[4:5], v0, off
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-TRUE16-LABEL: shuffle_v4i8_concat:
-; GFX11-TRUE16:       ; %bb.0:
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT:    global_load_d16_b16 v0, v[0:1], off
-; GFX11-TRUE16-NEXT:    global_load_d16_hi_b16 v0, v[2:3], off
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    global_store_b32 v[4:5], v0, off
-; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-FAKE16-LABEL: shuffle_v4i8_concat:
-; GFX11-FAKE16:       ; %bb.0:
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-FAKE16-NEXT:    global_load_u16 v0, v[0:1], off
-; GFX11-FAKE16-NEXT:    global_load_d16_hi_b16 v0, v[2:3], off
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT:    global_store_b32 v[4:5], v0, off
-; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-LABEL: shuffle_v4i8_concat:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    global_load_u16 v0, v[0:1], off
+; GFX11-NEXT:    global_load_d16_hi_b16 v0, v[2:3], off
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    global_store_b32 v[4:5], v0, off
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %val0 = load <2 x i8>, ptr addrspace(1) %arg0
   %val1 = load <2 x i8>, ptr addrspace(1) %arg1
   %shuffle = shufflevector <2 x i8> %val0, <2 x i8> %val1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
diff --git a/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll b/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll
index 04a5cac116d78..0e662850478be 100644
--- a/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll
+++ b/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll
@@ -398,7 +398,7 @@ define amdgpu_kernel void @no_widen_i16_constant_divergent_load(ptr addrspace(4)
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_1)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT:    global_load_d16_b16 v0, v0, s[0:1]
+; GFX11-TRUE16-NEXT:    global_load_u16 v0, v0, s[0:1]
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, 0x3e7, v0.l
 ; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v0.l, 4



More information about the llvm-commits mailing list