[llvm] add d16 predicate (PR #156574)
Brox Chen via llvm-commits
llvm-commits at lists.llvm.org
Tue Sep 2 20:01:44 PDT 2025
https://github.com/broxigarchen created https://github.com/llvm/llvm-project/pull/156574
None
>From 70b0ffad87bc4fc5cff3b08d211acc7a9bdf0d1d Mon Sep 17 00:00:00 2001
From: guochen2 <guochen2 at amd.com>
Date: Tue, 2 Sep 2025 23:00:10 -0400
Subject: [PATCH] add d16 predicate
---
llvm/lib/Target/AMDGPU/AMDGPU.td | 13 +
.../Target/AMDGPU/AMDGPUPredicateControl.td | 8 +-
llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp | 4 +
llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h | 3 +
llvm/lib/Target/AMDGPU/FLATInstructions.td | 164 +-
.../CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll | 27542 ++++++----------
.../CodeGen/AMDGPU/amdgcn.bitcast.256bit.ll | 56 +-
.../CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll | 586 +-
.../CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll | 10778 +++---
llvm/test/CodeGen/AMDGPU/bf16.ll | 217 +-
.../branch-relaxation-inst-size-gfx11.ll | 10 +-
.../CodeGen/AMDGPU/call-argument-types.ll | 2 +-
llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll | 193 +-
llvm/test/CodeGen/AMDGPU/clamp-modifier.ll | 24 +-
llvm/test/CodeGen/AMDGPU/clamp.ll | 12 +-
llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll | 309 +-
llvm/test/CodeGen/AMDGPU/fcanonicalize.ll | 8 +-
llvm/test/CodeGen/AMDGPU/fdiv.f16.ll | 82 +-
.../test/CodeGen/AMDGPU/flat-address-space.ll | 2 +-
llvm/test/CodeGen/AMDGPU/flat-scratch.ll | 16 +-
llvm/test/CodeGen/AMDGPU/flat_atomics.ll | 413 +-
llvm/test/CodeGen/AMDGPU/fma.f16.gfx11plus.ll | 2 +-
llvm/test/CodeGen/AMDGPU/fmaximum.ll | 8 +-
llvm/test/CodeGen/AMDGPU/fmed3.ll | 14 +-
llvm/test/CodeGen/AMDGPU/fminimum.ll | 8 +-
llvm/test/CodeGen/AMDGPU/fmuladd.f16.ll | 588 +-
llvm/test/CodeGen/AMDGPU/fneg-fabs.bf16.ll | 2 +-
llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll | 2 +-
llvm/test/CodeGen/AMDGPU/fneg.bf16.ll | 69 +-
llvm/test/CodeGen/AMDGPU/fneg.f16.ll | 4 +-
.../CodeGen/AMDGPU/frame-index-elimination.ll | 2 +-
llvm/test/CodeGen/AMDGPU/freeze.ll | 146 +-
llvm/test/CodeGen/AMDGPU/frem.ll | 224 +-
llvm/test/CodeGen/AMDGPU/function-args.ll | 655 +-
.../AMDGPU/gfx-callable-argument-types.ll | 644 +-
.../AMDGPU/gfx11-user-sgpr-init16-bug.ll | 8 +-
.../AMDGPU/global-extload-gfx11plus.ll | 140 +-
llvm/test/CodeGen/AMDGPU/global-saddr-load.ll | 167 +-
llvm/test/CodeGen/AMDGPU/global_atomics.ll | 98 +-
llvm/test/CodeGen/AMDGPU/half.ll | 16 +-
llvm/test/CodeGen/AMDGPU/icmp.i16.ll | 300 +-
llvm/test/CodeGen/AMDGPU/idot4s.ll | 29 +-
llvm/test/CodeGen/AMDGPU/idot4u.ll | 173 +-
.../isel-amdgpu-cs-chain-preserve-cc.ll | 22 +-
.../AMDGPU/llvm.amdgcn.fdot2.bf16.bf16.ll | 12 +-
.../AMDGPU/llvm.amdgcn.fdot2.f16.f16.ll | 14 +-
llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll | 4 +-
llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll | 4 +-
llvm/test/CodeGen/AMDGPU/load-constant-i16.ll | 244 +-
llvm/test/CodeGen/AMDGPU/load-constant-i8.ll | 344 +-
llvm/test/CodeGen/AMDGPU/mad.u16.ll | 14 +-
llvm/test/CodeGen/AMDGPU/min.ll | 73 +-
...-to-valu-pseudo-scalar-trans-f16-true16.ll | 50 +-
llvm/test/CodeGen/AMDGPU/offset-split-flat.ll | 4074 +--
.../CodeGen/AMDGPU/offset-split-global.ll | 4032 +--
.../AMDGPU/promote-constOffset-to-imm.ll | 2 +-
llvm/test/CodeGen/AMDGPU/rotl.ll | 12 +-
llvm/test/CodeGen/AMDGPU/rotr.ll | 12 +-
.../CodeGen/AMDGPU/shrink-add-sub-constant.ll | 26 +-
llvm/test/CodeGen/AMDGPU/smed3.ll | 2 +-
llvm/test/CodeGen/AMDGPU/spillv16.ll | 150 +-
llvm/test/CodeGen/AMDGPU/strict_fpext.ll | 2 +-
llvm/test/CodeGen/AMDGPU/sub.ll | 14 +-
llvm/test/CodeGen/AMDGPU/uaddo.ll | 2 +-
llvm/test/CodeGen/AMDGPU/umed3.ll | 2 +-
llvm/test/CodeGen/AMDGPU/usubo.ll | 2 +-
llvm/test/CodeGen/AMDGPU/v_cndmask.ll | 28 +-
llvm/test/CodeGen/AMDGPU/v_pack.ll | 42 +-
.../test/CodeGen/AMDGPU/vector_rebroadcast.ll | 2 +-
.../CodeGen/AMDGPU/vector_shuffle.packed.ll | 25 +-
llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll | 2 +-
71 files changed, 20206 insertions(+), 32747 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index 1038797374de3..08251762c5fb3 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -583,6 +583,12 @@ def FeatureRealTrue16Insts : SubtargetFeature<"real-true16",
"Use true 16-bit registers"
>;
+def FeatureRealTrueD16Insts : SubtargetFeature<"real-true-d16",
+ "EnableRealTrueD16Insts",
+ "true",
+ "Use D16 instructions with true 16-bit registers"
+>;
+
def FeatureBF16TransInsts : SubtargetFeature<"bf16-trans-insts",
"HasBF16TransInsts",
"true",
@@ -2557,6 +2563,13 @@ def UseFakeTrue16Insts : True16PredicateClass<"Subtarget->hasTrue16BitInsts() &&
// FIXME When we default to RealTrue16 instead of Fake, change the line as follows.
// AssemblerPredicate<(all_of FeatureTrue16BitInsts, (not FeatureRealTrue16Insts))>;
+// Use D16 Insts in true16 mode
+def UseRealTrueD16Insts : TrueD16PredicateClass<"Subtarget->useRealTrueD16Insts()">,
+ AssemblerPredicate<(all_of FeatureTrue16BitInsts, FeatureRealTrue16Insts, FeatureRealTrueD16Insts)>;
+def NotUseRealTrueD16Insts : TrueD16PredicateClass<"Subtarget->useRealTrue16Insts() && "
+ "!Subtarget->useRealTrueD16Insts()">,
+ AssemblerPredicate<(all_of FeatureTrue16BitInsts, FeatureRealTrue16Insts)>;
+
def HasBF16TransInsts : Predicate<"Subtarget->hasBF16TransInsts()">,
AssemblerPredicate<(all_of FeatureBF16TransInsts)>;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPredicateControl.td b/llvm/lib/Target/AMDGPU/AMDGPUPredicateControl.td
index 7c990aa6b2eb6..43479afeb4c3b 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPredicateControl.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPredicateControl.td
@@ -16,15 +16,19 @@ def FalsePredicate : Predicate<"false">;
class True16PredicateClass<string cond> : Predicate<cond>;
def NoTrue16Predicate : True16PredicateClass<"">;
+class TrueD16PredicateClass<string cond> : Predicate<cond>;
+def NoTrueD16Predicate : TrueD16PredicateClass<"">;
+
class PredicateControl {
Predicate SubtargetPredicate = TruePredicate;
Predicate AssemblerPredicate = TruePredicate;
Predicate WaveSizePredicate = TruePredicate;
True16PredicateClass True16Predicate = NoTrue16Predicate;
+ TrueD16PredicateClass TrueD16Predicate = NoTrueD16Predicate;
list<Predicate> OtherPredicates = [];
list<Predicate> Predicates =
!foldl(OtherPredicates, [SubtargetPredicate, AssemblerPredicate,
- WaveSizePredicate, True16Predicate],
+ WaveSizePredicate, True16Predicate, TrueD16Predicate],
preds, p,
- preds # !listremove([p], [TruePredicate, NoTrue16Predicate] # preds));
+ preds # !listremove([p], [TruePredicate, NoTrue16Predicate, NoTrueD16Predicate] # preds));
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
index 73acb1ddbd2a7..0e3524d7856b5 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
@@ -38,6 +38,10 @@ bool AMDGPUSubtarget::useRealTrue16Insts() const {
return hasTrue16BitInsts() && EnableRealTrue16Insts;
}
+bool AMDGPUSubtarget::useRealTrueD16Insts() const {
+ return hasTrue16BitInsts() && useRealTrue16Insts() && EnableRealTrueD16Insts;
+}
+
// Returns the maximum per-workgroup LDS allocation size (in bytes) that still
// allows the given function to achieve an occupancy of NWaves waves per
// SIMD / EU, taking into account only the function's *maximum* workgroup size.
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
index 57b757c990e1a..1f5e4cbc9142e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
@@ -59,6 +59,7 @@ class AMDGPUSubtarget {
bool HasCvtPkF16F32Inst = false;
bool HasF32ToF16BF16ConversionSRInsts = false;
bool EnableRealTrue16Insts = false;
+ bool EnableRealTrueD16Insts = false;
bool HasBF16TransInsts = false;
bool HasBF16ConversionInsts = false;
bool HasBF16PackedInsts = false;
@@ -224,6 +225,8 @@ class AMDGPUSubtarget {
// supported and the support for fake True16 instructions is removed.
bool useRealTrue16Insts() const;
+ bool useRealTrueD16Insts() const;
+
bool hasBF16TransInsts() const { return HasBF16TransInsts; }
bool hasBF16ConversionInsts() const {
diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td
index 80f0d504ea30c..a1c77df80f024 100644
--- a/llvm/lib/Target/AMDGPU/FLATInstructions.td
+++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td
@@ -1319,6 +1319,11 @@ class FlatLoadPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCN
(inst $vaddr, $offset)
>;
+class FlatLoadPat_t16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
+ (vt (node (FlatOffset i64:$vaddr, i32:$offset))),
+ (EXTRACT_SUBREG (inst $vaddr, $offset), lo16)
+>;
+
class FlatLoadPat_CPOL <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
(vt (node (FlatOffset i64:$vaddr, i32:$offset), (i32 timm:$cpol))),
(inst $vaddr, $offset, $cpol)
@@ -1389,11 +1394,21 @@ class FlatLoadSignedPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt>
(inst $vaddr, $offset)
>;
+class FlatLoadSignedPat_t16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
+ (vt (node (GlobalOffset (i64 VReg_64:$vaddr), i32:$offset))),
+ (EXTRACT_SUBREG (inst $vaddr, $offset), lo16)
+>;
+
class FlatLoadSaddrPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
(vt (node (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset, CPol:$cpol))),
(inst $saddr, $voffset, $offset, $cpol)
>;
+class FlatLoadSaddrPat_t16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
+ (vt (node (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset, CPol:$cpol))),
+ (EXTRACT_SUBREG (inst $saddr, $voffset, $offset, $cpol), lo16)
+>;
+
class FlatLoadSignedPat_CPOL <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
(vt (node (GlobalOffset (i64 VReg_64:$vaddr), i32:$offset), (i32 timm:$cpol))),
(inst $vaddr, $offset, $cpol)
@@ -1532,6 +1547,11 @@ class ScratchLoadSignedPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType
(inst $vaddr, $offset)
>;
+class ScratchLoadSignedPat_t16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
+ (vt (node (ScratchOffset (i32 VGPR_32:$vaddr), i32:$offset))),
+ (EXTRACT_SUBREG (inst $vaddr, $offset), lo16)
+>;
+
class ScratchLoadSignedPat_D16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
(node (ScratchOffset (i32 VGPR_32:$vaddr), i32:$offset), vt:$in),
(inst $vaddr, $offset, 0, $in)
@@ -1552,6 +1572,11 @@ class ScratchLoadSaddrPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType v
(inst $saddr, $offset)
>;
+class ScratchLoadSaddrPat_t16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
+ (vt (node (ScratchSAddr (i32 SGPR_32:$saddr), i32:$offset))),
+ (EXTRACT_SUBREG (inst $saddr, $offset), lo16)
+>;
+
class ScratchLoadSaddrPat_D16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
(vt (node (ScratchSAddr (i32 SGPR_32:$saddr), i32:$offset), vt:$in)),
(inst $saddr, $offset, 0, $in)
@@ -1573,6 +1598,11 @@ class ScratchLoadSVaddrPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType
(inst $vaddr, $saddr, $offset, $cpol)
>;
+class ScratchLoadSVaddrPat_t16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
+ (vt (node (ScratchSVAddr (i32 VGPR_32:$vaddr), (i32 SGPR_32:$saddr), i32:$offset, CPol:$cpol))),
+ (EXTRACT_SUBREG (inst $vaddr, $saddr, $offset, $cpol), lo16)
+>;
+
class ScratchStoreSVaddrPat <FLAT_Pseudo inst, SDPatternOperator node,
ValueType vt> : GCNPat <
(node vt:$data, (ScratchSVAddr (i32 VGPR_32:$vaddr), (i32 SGPR_32:$saddr), i32:$offset, CPol:$cpol)),
@@ -1619,6 +1649,16 @@ multiclass GlobalFLATLoadPats<FLAT_Pseudo inst, SDPatternOperator node, ValueTyp
}
}
+multiclass GlobalFLATLoadPats_t16<FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> {
+ def : FlatLoadSignedPat_t16 <inst, node, vt> {
+ let AddedComplexity = 10;
+ }
+
+ def : FlatLoadSaddrPat_t16<!cast<FLAT_Pseudo>(!cast<string>(inst)#"_SADDR"), node, vt> {
+ let AddedComplexity = 11;
+ }
+}
+
multiclass GlobalFLATLoadPats_CPOL<FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> {
def : FlatLoadSignedPat_CPOL<inst, node, vt> {
let AddedComplexity = 10;
@@ -1737,6 +1777,21 @@ multiclass ScratchFLATLoadPats<FLAT_Pseudo inst, SDPatternOperator node, ValueTy
}
}
+multiclass ScratchFLATLoadPats_t16<FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> {
+ def : ScratchLoadSignedPat_t16 <inst, node, vt> {
+ let AddedComplexity = 25;
+ }
+
+ def : ScratchLoadSaddrPat_t16<!cast<FLAT_Pseudo>(!cast<string>(inst)#"_SADDR"), node, vt> {
+ let AddedComplexity = 26;
+ }
+
+ def : ScratchLoadSVaddrPat_t16<!cast<FLAT_Pseudo>(!cast<string>(inst)#"_SVS"), node, vt> {
+ let SubtargetPredicate = HasFlatScratchSVSMode;
+ let AddedComplexity = 27;
+ }
+}
+
multiclass ScratchFLATStorePats<FLAT_Pseudo inst, SDPatternOperator node,
ValueType vt> {
def : ScratchStoreSignedPat <inst, node, vt> {
@@ -1808,6 +1863,15 @@ multiclass FlatLoadPats<FLAT_Pseudo inst, SDPatternOperator node, ValueType vt>
}
}
+multiclass FlatLoadPats_t16<FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> {
+ def : FlatLoadPat_t16 <inst, node, vt>;
+
+ def : FlatLoadSaddrPat_t16<!cast<FLAT_Pseudo>(!cast<string>(inst)#"_SADDR"), node, vt> {
+ let AddedComplexity = 9;
+ let SubtargetPredicate = HasFlatGVSMode;
+ }
+}
+
multiclass FlatLoadPats_D16<FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> {
def : FlatLoadPat_D16 <inst, node, vt>;
@@ -1835,15 +1899,6 @@ multiclass FlatStorePats<FLAT_Pseudo inst, SDPatternOperator node, ValueType vt>
}
}
-multiclass FlatStorePats_t16<FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> {
- def : FlatStorePat <!cast<FLAT_Pseudo>(!cast<string>(inst)#"_t16"), node, vt>;
-
- def : FlatStoreSaddrPat<!cast<FLAT_Pseudo>(!cast<string>(inst)#"_SADDR_t16"), node, vt> {
- let AddedComplexity = 9;
- let SubtargetPredicate = HasFlatGVSMode;
- }
-}
-
let OtherPredicates = [HasFlatAddressSpace] in {
defm : FlatLoadPats <FLAT_LOAD_UBYTE, atomic_load_aext_8_flat, i32>;
@@ -1878,16 +1933,28 @@ let True16Predicate = p in {
}
let OtherPredicates = [D16PreservesUnusedBits, HasFlatAddressSpace], True16Predicate = UseRealTrue16Insts in {
- defm : FlatLoadPats_D16_t16<FLAT_LOAD_UBYTE_D16_t16, extloadi8_flat, i16>;
- defm : FlatLoadPats_D16_t16<FLAT_LOAD_UBYTE_D16_t16, zextloadi8_flat, i16>;
- defm : FlatLoadPats_D16_t16<FLAT_LOAD_SBYTE_D16_t16, sextloadi8_flat, i16>;
- defm : FlatLoadPats_D16_t16<FLAT_LOAD_SHORT_D16_t16, load_flat, i16>;
- defm : FlatLoadPats_D16_t16<FLAT_LOAD_UBYTE_D16_t16, atomic_load_aext_8_flat, i16>;
- defm : FlatLoadPats_D16_t16<FLAT_LOAD_UBYTE_D16_t16, atomic_load_zext_8_flat, i16>;
- defm : FlatLoadPats_D16_t16<FLAT_LOAD_SHORT_D16_t16, atomic_load_nonext_16_flat, i16>;
- defm : FlatLoadPats_D16_t16<FLAT_LOAD_SBYTE_D16_t16, atomic_load_sext_8_flat, i16>;
- defm : FlatStorePats_t16 <FLAT_STORE_BYTE, truncstorei8_flat, i16>;
- defm : FlatStorePats_t16 <FLAT_STORE_SHORT, store_flat, i16>;
+ let TrueD16Predicate = UseRealTrueD16Insts in {
+ defm : FlatLoadPats_D16_t16<FLAT_LOAD_UBYTE_D16_t16, extloadi8_flat, i16>;
+ defm : FlatLoadPats_D16_t16<FLAT_LOAD_UBYTE_D16_t16, zextloadi8_flat, i16>;
+ defm : FlatLoadPats_D16_t16<FLAT_LOAD_SBYTE_D16_t16, sextloadi8_flat, i16>;
+ defm : FlatLoadPats_D16_t16<FLAT_LOAD_SHORT_D16_t16, load_flat, i16>;
+ defm : FlatLoadPats_D16_t16<FLAT_LOAD_UBYTE_D16_t16, atomic_load_aext_8_flat, i16>;
+ defm : FlatLoadPats_D16_t16<FLAT_LOAD_UBYTE_D16_t16, atomic_load_zext_8_flat, i16>;
+ defm : FlatLoadPats_D16_t16<FLAT_LOAD_SHORT_D16_t16, atomic_load_nonext_16_flat, i16>;
+ defm : FlatLoadPats_D16_t16<FLAT_LOAD_SBYTE_D16_t16, atomic_load_sext_8_flat, i16>;
+ }
+ let TrueD16Predicate = NotUseRealTrueD16Insts in {
+ defm : FlatLoadPats_t16 <FLAT_LOAD_UBYTE, extloadi8_flat, i16>;
+ defm : FlatLoadPats_t16 <FLAT_LOAD_UBYTE, zextloadi8_flat, i16>;
+ defm : FlatLoadPats_t16 <FLAT_LOAD_SBYTE, sextloadi8_flat, i16>;
+ defm : FlatLoadPats_t16 <FLAT_LOAD_USHORT, load_flat, i16>;
+ defm : FlatLoadPats_t16 <FLAT_LOAD_UBYTE, atomic_load_aext_8_flat, i16>;
+ defm : FlatLoadPats_t16 <FLAT_LOAD_UBYTE, atomic_load_zext_8_flat, i16>;
+ defm : FlatLoadPats_t16 <FLAT_LOAD_USHORT, atomic_load_nonext_16_flat, i16>;
+ defm : FlatLoadPats_t16 <FLAT_LOAD_SBYTE, atomic_load_sext_8_flat, i16>;
+ }
+ defm : FlatStorePats <FLAT_STORE_BYTE, truncstorei8_flat, i16>;
+ defm : FlatStorePats <FLAT_STORE_SHORT, store_flat, i16>;
def : FlatStorePat <FLAT_STORE_BYTE_t16, atomic_store_8_flat, i16>;
def : FlatStorePat <FLAT_STORE_SHORT_t16, atomic_store_16_flat, i16>;
} // End let OtherPredicates = [D16PreservesUnusedBits, HasFlatAddressSpace], True16Predicate = UseRealTrue16Insts
@@ -2027,19 +2094,32 @@ defm : GlobalFLATLoadPats <GLOBAL_LOAD_USHORT, atomic_load_zext_16_global, i16>;
}
let OtherPredicates = [HasFlatGlobalInsts, D16PreservesUnusedBits], True16Predicate = UseRealTrue16Insts in {
-defm : GlobalFLATLoadPats_D16_t16<"GLOBAL_LOAD_UBYTE_D16", extloadi8_global, i16>;
-defm : GlobalFLATLoadPats_D16_t16<"GLOBAL_LOAD_UBYTE_D16", zextloadi8_global, i16>;
-defm : GlobalFLATLoadPats_D16_t16<"GLOBAL_LOAD_SBYTE_D16", sextloadi8_global, i16>;
-defm : GlobalFLATLoadPats_D16_t16<"GLOBAL_LOAD_SHORT_D16", load_global, i16>;
-defm : GlobalFLATLoadPats_D16_t16<"GLOBAL_LOAD_UBYTE_D16", atomic_load_aext_8_global, i16>;
-defm : GlobalFLATLoadPats_D16_t16<"GLOBAL_LOAD_UBYTE_D16", atomic_load_zext_8_global, i16>;
-defm : GlobalFLATLoadPats_D16_t16<"GLOBAL_LOAD_SBYTE_D16", atomic_load_sext_8_global, i16>;
-defm : GlobalFLATLoadPats_D16_t16<"GLOBAL_LOAD_SHORT_D16", atomic_load_nonext_16_global, i16>;
-defm : GlobalFLATLoadPats_D16_t16<"GLOBAL_LOAD_SHORT_D16", atomic_load_zext_16_global, i16>;
-defm : GlobalFLATStorePats_D16_t16<"GLOBAL_STORE_BYTE", truncstorei8_global, i16>;
-defm : GlobalFLATStorePats_D16_t16<"GLOBAL_STORE_SHORT", store_global, i16>;
-defm : GlobalFLATStorePats_D16_t16<"GLOBAL_STORE_BYTE", atomic_store_8_global, i16>;
-defm : GlobalFLATStorePats_D16_t16<"GLOBAL_STORE_SHORT", atomic_store_16_global, i16>;
+ let TrueD16Predicate = UseRealTrueD16Insts in {
+ defm : GlobalFLATLoadPats_D16_t16<"GLOBAL_LOAD_UBYTE_D16", extloadi8_global, i16>;
+ defm : GlobalFLATLoadPats_D16_t16<"GLOBAL_LOAD_UBYTE_D16", zextloadi8_global, i16>;
+ defm : GlobalFLATLoadPats_D16_t16<"GLOBAL_LOAD_SBYTE_D16", sextloadi8_global, i16>;
+ defm : GlobalFLATLoadPats_D16_t16<"GLOBAL_LOAD_SHORT_D16", load_global, i16>;
+ defm : GlobalFLATLoadPats_D16_t16<"GLOBAL_LOAD_UBYTE_D16", atomic_load_aext_8_global, i16>;
+ defm : GlobalFLATLoadPats_D16_t16<"GLOBAL_LOAD_UBYTE_D16", atomic_load_zext_8_global, i16>;
+ defm : GlobalFLATLoadPats_D16_t16<"GLOBAL_LOAD_SBYTE_D16", atomic_load_sext_8_global, i16>;
+ defm : GlobalFLATLoadPats_D16_t16<"GLOBAL_LOAD_SHORT_D16", atomic_load_nonext_16_global, i16>;
+ defm : GlobalFLATLoadPats_D16_t16<"GLOBAL_LOAD_SHORT_D16", atomic_load_zext_16_global, i16>;
+ }
+ let TrueD16Predicate = NotUseRealTrueD16Insts in {
+ defm : GlobalFLATLoadPats_t16 <GLOBAL_LOAD_UBYTE, extloadi8_global, i16>;
+ defm : GlobalFLATLoadPats_t16 <GLOBAL_LOAD_UBYTE, zextloadi8_global, i16>;
+ defm : GlobalFLATLoadPats_t16 <GLOBAL_LOAD_SBYTE, sextloadi8_global, i16>;
+ defm : GlobalFLATLoadPats_t16 <GLOBAL_LOAD_USHORT, load_global, i16>;
+ defm : GlobalFLATLoadPats_t16 <GLOBAL_LOAD_UBYTE, atomic_load_aext_8_global, i16>;
+ defm : GlobalFLATLoadPats_t16 <GLOBAL_LOAD_UBYTE, atomic_load_zext_8_global, i16>;
+ defm : GlobalFLATLoadPats_t16 <GLOBAL_LOAD_SBYTE, atomic_load_sext_8_global, i16>;
+ defm : GlobalFLATLoadPats_t16 <GLOBAL_LOAD_USHORT, atomic_load_nonext_16_global, i16>;
+ defm : GlobalFLATLoadPats_t16 <GLOBAL_LOAD_USHORT, atomic_load_zext_16_global, i16>;
+ }
+ defm : GlobalFLATStorePats_D16_t16<"GLOBAL_STORE_BYTE", truncstorei8_global, i16>;
+ defm : GlobalFLATStorePats_D16_t16<"GLOBAL_STORE_SHORT", store_global, i16>;
+ defm : GlobalFLATStorePats_D16_t16<"GLOBAL_STORE_BYTE", atomic_store_8_global, i16>;
+ defm : GlobalFLATStorePats_D16_t16<"GLOBAL_STORE_SHORT", atomic_store_16_global, i16>;
} // end OtherPredicates = [HasFlatGlobalInsts, D16PreservesUnusedBits], True16Predicate = UseRealTrue16Insts
foreach vt = Reg32Types.types in {
@@ -2264,12 +2344,20 @@ defm : ScratchFLATStorePats <SCRATCH_STORE_BYTE, truncstorei8_private, i16>;
}
let True16Predicate = UseRealTrue16Insts in {
-defm : ScratchFLATLoadPats_D16_t16<"SCRATCH_LOAD_UBYTE_D16", extloadi8_private, i16>;
-defm : ScratchFLATLoadPats_D16_t16<"SCRATCH_LOAD_UBYTE_D16", zextloadi8_private, i16>;
-defm : ScratchFLATLoadPats_D16_t16<"SCRATCH_LOAD_SBYTE_D16", sextloadi8_private, i16>;
-defm : ScratchFLATLoadPats_D16_t16<"SCRATCH_LOAD_SHORT_D16", load_private, i16>;
-defm : ScratchFLATStorePats_t16 <"SCRATCH_STORE_SHORT", store_private, i16>;
-defm : ScratchFLATStorePats_t16 <"SCRATCH_STORE_BYTE", truncstorei8_private, i16>;
+ let TrueD16Predicate = UseRealTrueD16Insts in {
+ defm : ScratchFLATLoadPats_D16_t16<"SCRATCH_LOAD_UBYTE_D16", extloadi8_private, i16>;
+ defm : ScratchFLATLoadPats_D16_t16<"SCRATCH_LOAD_UBYTE_D16", zextloadi8_private, i16>;
+ defm : ScratchFLATLoadPats_D16_t16<"SCRATCH_LOAD_SBYTE_D16", sextloadi8_private, i16>;
+ defm : ScratchFLATLoadPats_D16_t16<"SCRATCH_LOAD_SHORT_D16", load_private, i16>;
+ }
+ let TrueD16Predicate = NotUseRealTrueD16Insts in {
+ defm : ScratchFLATLoadPats_t16 <SCRATCH_LOAD_UBYTE, extloadi8_private, i16>;
+ defm : ScratchFLATLoadPats_t16 <SCRATCH_LOAD_UBYTE, zextloadi8_private, i16>;
+ defm : ScratchFLATLoadPats_t16 <SCRATCH_LOAD_SBYTE, sextloadi8_private, i16>;
+ defm : ScratchFLATLoadPats_t16 <SCRATCH_LOAD_USHORT, load_private, i16>;
+ }
+ defm : ScratchFLATStorePats_t16 <"SCRATCH_STORE_SHORT", store_private, i16>;
+ defm : ScratchFLATStorePats_t16 <"SCRATCH_STORE_BYTE", truncstorei8_private, i16>;
} // End True16Predicate = UseRealTrue16Insts
foreach vt = Reg32Types.types in {
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll
index d03d6a8940b2f..1dc53cec8df85 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll
@@ -15369,876 +15369,913 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) {
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: s_clause 0x1f
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:384
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:380
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:376
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:372
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v50, off, s32 offset:368
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:364
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v51, off, s32 offset:360
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v33, off, s32 offset:356
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v51, off, s32 offset:352
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:348
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v52, off, s32 offset:344
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v34, off, s32 offset:340
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v52, off, s32 offset:336
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:332
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v53, off, s32 offset:328
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v35, off, s32 offset:324
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v53, off, s32 offset:320
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v36, off, s32 offset:316
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v54, off, s32 offset:312
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v36, off, s32 offset:308
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v54, off, s32 offset:304
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v37, off, s32 offset:300
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v55, off, s32 offset:296
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v37, off, s32 offset:292
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v64, off, s32 offset:288
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:284
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v64, off, s32 offset:280
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v38, off, s32 offset:276
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v66, off, s32 offset:272
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v39, off, s32 offset:268
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v66, off, s32 offset:264
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v39, off, s32 offset:260
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:536
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:532
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:528
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v43, s32 offset:524
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v44, s32 offset:520
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v45, s32 offset:516
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v46, s32 offset:512
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v47, s32 offset:508
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v56, s32 offset:504
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v57, s32 offset:500
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v58, s32 offset:496
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v59, s32 offset:492
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v60, s32 offset:488
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v61, s32 offset:484
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v62, s32 offset:480
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v63, s32 offset:476
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v72, s32 offset:472
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v73, s32 offset:468
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v74, s32 offset:464
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v75, s32 offset:460
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v76, s32 offset:456
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v77, s32 offset:452
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v78, s32 offset:448
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v79, s32 offset:444
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v88, s32 offset:440
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v89, s32 offset:436
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v90, s32 offset:432
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v91, s32 offset:428
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v92, s32 offset:424
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:420
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:416
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:412
+; GFX11-TRUE16-NEXT: s_clause 0x4
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:408
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:404
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:400
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v107, s32 offset:396
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v108, s32 offset:392
; GFX11-TRUE16-NEXT: s_clause 0x1f
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v67, off, s32 offset:256
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v48, off, s32 offset:252
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v68, off, s32 offset:248
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v48, off, s32 offset:244
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v70, off, s32 offset:240
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v49, off, s32 offset:236
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v70, off, s32 offset:232
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v49, off, s32 offset:228
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v71, off, s32 offset:224
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v50, off, s32 offset:220
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v71, off, s32 offset:216
-; GFX11-TRUE16-NEXT: scratch_load_b32 v114, off, s32 offset:388
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v81, off, s32
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v82, off, s32 offset:8
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v83, off, s32 offset:16
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v83, off, s32 offset:24
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v85, off, s32 offset:32
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v85, off, s32 offset:40
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v87, off, s32 offset:48
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v87, off, s32 offset:56
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v97, off, s32 offset:64
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v97, off, s32 offset:72
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v98, off, s32 offset:80
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v99, off, s32 offset:88
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v100, off, s32 offset:96
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v101, off, s32 offset:104
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v102, off, s32 offset:112
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v160, off, s32 offset:120
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v160, off, s32 offset:128
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v161, off, s32 offset:136
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v161, off, s32 offset:144
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v162, off, s32 offset:152
+; GFX11-TRUE16-NEXT: scratch_load_u16 v31, off, s32 offset:384
+; GFX11-TRUE16-NEXT: scratch_load_u16 v112, off, s32 offset:380
+; GFX11-TRUE16-NEXT: scratch_load_u16 v32, off, s32 offset:376
+; GFX11-TRUE16-NEXT: scratch_load_u16 v113, off, s32 offset:372
+; GFX11-TRUE16-NEXT: scratch_load_u16 v56, off, s32 offset:368
+; GFX11-TRUE16-NEXT: scratch_load_u16 v114, off, s32 offset:364
+; GFX11-TRUE16-NEXT: scratch_load_u16 v33, off, s32 offset:360
+; GFX11-TRUE16-NEXT: scratch_load_u16 v115, off, s32 offset:356
+; GFX11-TRUE16-NEXT: scratch_load_u16 v57, off, s32 offset:352
+; GFX11-TRUE16-NEXT: scratch_load_u16 v116, off, s32 offset:348
+; GFX11-TRUE16-NEXT: scratch_load_u16 v34, off, s32 offset:344
+; GFX11-TRUE16-NEXT: scratch_load_u16 v117, off, s32 offset:340
+; GFX11-TRUE16-NEXT: scratch_load_u16 v58, off, s32 offset:336
+; GFX11-TRUE16-NEXT: scratch_load_u16 v118, off, s32 offset:332
+; GFX11-TRUE16-NEXT: scratch_load_u16 v35, off, s32 offset:328
+; GFX11-TRUE16-NEXT: scratch_load_u16 v119, off, s32 offset:324
+; GFX11-TRUE16-NEXT: scratch_load_u16 v59, off, s32 offset:320
+; GFX11-TRUE16-NEXT: scratch_load_u16 v128, off, s32 offset:316
+; GFX11-TRUE16-NEXT: scratch_load_u16 v36, off, s32 offset:312
+; GFX11-TRUE16-NEXT: scratch_load_u16 v129, off, s32 offset:308
+; GFX11-TRUE16-NEXT: scratch_load_u16 v60, off, s32 offset:304
+; GFX11-TRUE16-NEXT: scratch_load_u16 v130, off, s32 offset:300
+; GFX11-TRUE16-NEXT: scratch_load_u16 v37, off, s32 offset:296
+; GFX11-TRUE16-NEXT: scratch_load_u16 v131, off, s32 offset:292
+; GFX11-TRUE16-NEXT: scratch_load_u16 v61, off, s32 offset:288
+; GFX11-TRUE16-NEXT: scratch_load_u16 v132, off, s32 offset:284
+; GFX11-TRUE16-NEXT: scratch_load_u16 v38, off, s32 offset:280
+; GFX11-TRUE16-NEXT: scratch_load_u16 v133, off, s32 offset:276
+; GFX11-TRUE16-NEXT: scratch_load_u16 v62, off, s32 offset:272
+; GFX11-TRUE16-NEXT: scratch_load_u16 v134, off, s32 offset:268
+; GFX11-TRUE16-NEXT: scratch_load_u16 v39, off, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_load_u16 v135, off, s32 offset:260
; GFX11-TRUE16-NEXT: s_clause 0x1f
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v162, off, s32 offset:160
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v163, off, s32 offset:168
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v163, off, s32 offset:176
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v164, off, s32 offset:184
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v164, off, s32 offset:192
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v165, off, s32 offset:200
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v165, off, s32 offset:208
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v55, off, s32 offset:212
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v65, off, s32 offset:204
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v65, off, s32 offset:196
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v67, off, s32 offset:188
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v68, off, s32 offset:180
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v69, off, s32 offset:172
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v69, off, s32 offset:164
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v80, off, s32 offset:156
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v80, off, s32 offset:148
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v81, off, s32 offset:140
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v82, off, s32 offset:132
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v84, off, s32 offset:124
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v84, off, s32 offset:116
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v86, off, s32 offset:108
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v86, off, s32 offset:100
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v96, off, s32 offset:92
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v96, off, s32 offset:84
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v98, off, s32 offset:76
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v99, off, s32 offset:68
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v100, off, s32 offset:60
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v101, off, s32 offset:52
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v103, off, s32 offset:44
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v103, off, s32 offset:36
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v112, off, s32 offset:28
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v113, off, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_load_u16 v63, off, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_load_u16 v144, off, s32 offset:252
+; GFX11-TRUE16-NEXT: scratch_load_u16 v48, off, s32 offset:248
+; GFX11-TRUE16-NEXT: scratch_load_u16 v145, off, s32 offset:244
+; GFX11-TRUE16-NEXT: scratch_load_u16 v72, off, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_load_u16 v146, off, s32 offset:236
+; GFX11-TRUE16-NEXT: scratch_load_u16 v49, off, s32 offset:232
+; GFX11-TRUE16-NEXT: scratch_load_u16 v147, off, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_load_u16 v73, off, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_load_u16 v148, off, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_load_u16 v50, off, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_load_b32 v51, off, s32 offset:388
+; GFX11-TRUE16-NEXT: scratch_load_u16 v52, off, s32
+; GFX11-TRUE16-NEXT: scratch_load_u16 v53, off, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_load_u16 v54, off, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_load_u16 v55, off, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_load_u16 v64, off, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_load_u16 v65, off, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_load_u16 v66, off, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_load_u16 v67, off, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_load_u16 v74, off, s32 offset:64
+; GFX11-TRUE16-NEXT: scratch_load_u16 v75, off, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_load_u16 v76, off, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_load_u16 v77, off, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_load_u16 v78, off, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_load_u16 v79, off, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_load_u16 v88, off, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_load_u16 v89, off, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_load_u16 v90, off, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_load_u16 v91, off, s32 offset:136
+; GFX11-TRUE16-NEXT: scratch_load_u16 v92, off, s32 offset:144
+; GFX11-TRUE16-NEXT: scratch_load_u16 v93, off, s32 offset:152
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_load_u16 v94, off, s32 offset:160
+; GFX11-TRUE16-NEXT: scratch_load_u16 v95, off, s32 offset:168
+; GFX11-TRUE16-NEXT: scratch_load_u16 v104, off, s32 offset:176
+; GFX11-TRUE16-NEXT: scratch_load_u16 v105, off, s32 offset:184
+; GFX11-TRUE16-NEXT: scratch_load_u16 v106, off, s32 offset:192
+; GFX11-TRUE16-NEXT: scratch_load_u16 v107, off, s32 offset:200
+; GFX11-TRUE16-NEXT: scratch_load_u16 v108, off, s32 offset:208
+; GFX11-TRUE16-NEXT: scratch_load_u16 v149, off, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_load_u16 v150, off, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_load_u16 v151, off, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_load_u16 v160, off, s32 offset:188
+; GFX11-TRUE16-NEXT: scratch_load_u16 v161, off, s32 offset:180
+; GFX11-TRUE16-NEXT: scratch_load_u16 v162, off, s32 offset:172
+; GFX11-TRUE16-NEXT: scratch_load_u16 v163, off, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_load_u16 v164, off, s32 offset:156
+; GFX11-TRUE16-NEXT: scratch_load_u16 v165, off, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_load_u16 v166, off, s32 offset:140
+; GFX11-TRUE16-NEXT: scratch_load_u16 v167, off, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_load_u16 v176, off, s32 offset:124
+; GFX11-TRUE16-NEXT: scratch_load_u16 v177, off, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_load_u16 v178, off, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_load_u16 v179, off, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_load_u16 v180, off, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_load_u16 v181, off, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_load_u16 v182, off, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_load_u16 v183, off, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_load_u16 v40, off, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_load_u16 v41, off, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_load_u16 v42, off, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_load_u16 v43, off, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_load_u16 v44, off, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_load_u16 v45, off, s32 offset:20
; GFX11-TRUE16-NEXT: s_clause 0x1
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v115, off, s32 offset:12
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v115, off, s32 offset:4
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v117.l, v30.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v118.h, v28.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v119.l, v26.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v119.h, v24.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v130.l, v22.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v130.h, v20.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v131.l, v18.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v132.h, v16.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v134.l, v14.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v134.h, v12.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v144.l, v10.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v145.h, v8.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v146.h, v6.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v146.l, v4.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v149.l, v2.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v149.h, v0.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v151.l, 8, v1.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v151.h, 8, v3.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v150.l, 8, v5.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v150.h, 8, v7.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v148.l, 8, v9.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v148.h, 8, v11.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v147.l, 8, v13.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v147.h, 8, v15.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v144.h, 8, v17.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v145.l, 8, v19.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v135.l, 8, v21.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v135.h, 8, v23.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v133.l, 8, v25.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v133.h, 8, v27.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v131.h, 8, v29.l
+; GFX11-TRUE16-NEXT: scratch_load_u16 v46, off, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_load_u16 v47, off, s32 offset:4
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v69.l, v30.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v70.h, v28.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v71.l, v26.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v71.h, v24.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v82.l, v22.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v82.h, v20.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v83.l, v18.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v84.h, v16.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v86.l, v14.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v86.h, v12.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v96.l, v10.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v97.h, v8.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v98.h, v6.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v98.l, v4.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v101.l, v2.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v101.h, v0.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v103.l, 8, v1.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v103.h, 8, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v102.l, 8, v5.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v102.h, 8, v7.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v100.l, 8, v9.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v100.h, 8, v11.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v99.l, 8, v13.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v99.h, 8, v15.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v96.h, 8, v17.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v97.l, 8, v19.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v87.l, 8, v21.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v87.h, 8, v23.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v85.l, 8, v25.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v85.h, 8, v27.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v83.h, 8, v29.l
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(62)
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v32.h, 8, v31.l
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v32.l, 8, v32.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v56.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.l, 8, v33.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v57.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.l, 8, v34.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v35.h, 8, v58.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v35.l, 8, v35.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v36.h, 8, v59.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v36.l, 8, v36.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v37.h, 8, v60.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v37.l, 8, v37.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v38.h, 8, v61.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v38.l, 8, v38.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v39.h, 8, v62.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v39.l, 8, v39.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v48.h, 8, v63.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v48.l, 8, v48.l
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(59)
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v49.h, 8, v72.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v49.l, 8, v49.l
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(55)
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.h, 8, v73.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.l, 8, v50.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(54)
-; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v114
+; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v51
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(53)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v132.l, 8, v81.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v84.l, 8, v52.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(52)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.l, 8, v82.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v81.l, 8, v53.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(51)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.h, 8, v83.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v81.h, 8, v54.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(50)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v128.l, 8, v83.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v80.l, 8, v55.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(49)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v128.h, 8, v85.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v80.h, 8, v64.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(48)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v117.h, 8, v85.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v69.h, 8, v65.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(47)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v118.l, 8, v87.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v70.l, 8, v66.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(46)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v116.l, 8, v87.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v68.l, 8, v67.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(45)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v116.h, 8, v97.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v68.h, 8, v74.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(44)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v114.l, 8, v97.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v67.l, 8, v75.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(43)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v114.h, 8, v98.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v67.h, 8, v76.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(42)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v112.h, 8, v99.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v66.l, 8, v77.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(41)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v113.l, 8, v100.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v66.h, 8, v78.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(40)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v102.l, 8, v101.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v65.l, 8, v79.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(39)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v102.h, 8, v102.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v65.h, 8, v88.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(38)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v100.h, 8, v160.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v64.l, 8, v89.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(37)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v101.l, 8, v160.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v64.h, 8, v90.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(36)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v98.h, 8, v161.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.l, 8, v91.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(35)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v99.l, 8, v161.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.h, 8, v92.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(34)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v97.l, 8, v162.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.l, 8, v93.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(33)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v97.h, 8, v162.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.h, 8, v94.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(32)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v87.l, 8, v163.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.l, 8, v95.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(31)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v87.h, 8, v163.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.h, 8, v104.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(30)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v85.l, 8, v164.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.l, 8, v105.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(29)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v85.h, 8, v164.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.h, 8, v106.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(28)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v83.l, 8, v165.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.l, 8, v107.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(27)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v83.h, 8, v165.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v81.h, 8, v71.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v82.l, 8, v71.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v71.l, 8, v70.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v71.h, 8, v70.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v70.l, 8, v68.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v70.h, 8, v67.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v67.h, 8, v66.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v68.l, 8, v66.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v66.l, 8, v64.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v66.h, 8, v64.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v64.l, 8, v55.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v64.h, 8, v54.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.h, 8, v54.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.l, 8, v53.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.h, 8, v53.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.l, 8, v52.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.h, 8, v52.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.l, 8, v51.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.h, 8, v51.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.l, 8, v50.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.h, 8, v31.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.l, 8, v31.l
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.h, 8, v108.l
; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
-; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB14_3
-; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow
-; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
-; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB14_4
-; GFX11-TRUE16-NEXT: .LBB14_2: ; %end
-; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
-; GFX11-TRUE16-NEXT: .LBB14_3: ; %cmp.false
-; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v149.h
-; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v149.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v149.h, 0
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v146.h
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v146.l
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v0.l, v151.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v151.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v0.l, v149.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v1.l, v150.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v3.l, v149.h
-; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v145.h
-; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v144.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v149, v0
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v1.h, v150.l
-; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v132.h
-; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v131.l
-; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v148.h
-; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v130.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v149, v3
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v2.l, v148.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v2.l, v149.h
-; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v134.h
-; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v134.l
-; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v145.l
-; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v130.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v149, v2
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v3.l, v147.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v147.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v3.l, v149.h
-; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v135.h
-; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v119.h
-; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v119.l
-; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v118.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v149, v3
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v4.l, v144.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v4.l, v149.h
-; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v133.h
-; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v117.l
+; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB14_2
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v101.h
+; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v101.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v101.h, 0
+; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v98.h
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v98.l
+; GFX11-TRUE16-NEXT: v_or_b16 v101.l, v0.l, v103.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v103.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v101.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v1.l, v102.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v101.h
+; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v97.h
+; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v96.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v101, v0
+; GFX11-TRUE16-NEXT: v_or_b16 v101.l, v1.h, v102.l
+; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v84.h
+; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v83.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v100.h
+; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v82.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v101, v3
+; GFX11-TRUE16-NEXT: v_or_b16 v101.l, v2.l, v100.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v101.h
+; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v86.h
+; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v86.l
+; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v97.l
+; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v82.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v101, v2
+; GFX11-TRUE16-NEXT: v_or_b16 v101.l, v3.l, v99.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v99.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v101.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v87.h
+; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v71.h
+; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v71.l
+; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v70.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v101, v3
+; GFX11-TRUE16-NEXT: v_or_b16 v101.l, v4.l, v96.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v101.h
+; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v85.h
+; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v69.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v115.h
-; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v115.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v149, v4
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v5.l, v135.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v5.l, v149.h
-; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v132.l
-; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v129.h
-; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v113.h
-; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v112.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v149, v5
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v6.l, v133.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v6.l, v149.h
-; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v103.h
-; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v128.h
-; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v103.l
-; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v101.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v149, v6
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v7.l, v131.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v7.l, v149.h
-; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v118.l
-; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v100.l
-; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v99.h
-; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v98.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v149, v7
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v8.l, v129.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v8.l, v149.h
-; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v11.h, v116.h
-; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v114.h
-; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v96.h
-; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v96.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v149, v8
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v9.l, v128.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v9.l, v149.h
-; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v86.h
-; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v13.h, v113.l
-; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v86.l
-; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v84.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v149, v9
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v10.l, v117.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v10.l, v149.h
-; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v102.h
-; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v84.l
-; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v82.h
-; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v81.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v149, v10
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v11.l, v116.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v11.l, v149.h
-; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v15.h, v101.l
-; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v16.h, v99.l
-; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v80.h
-; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v80.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v149, v11
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v12.l, v114.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v12.l, v149.h
-; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v69.h
-; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v17.h, v97.h
-; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v69.l
-; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v68.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v149, v12
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v13.l, v112.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v13.l, v149.h
-; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v18.h, v87.h
-; GFX11-TRUE16-NEXT: v_and_b16 v19.h, 0xff, v67.l
-; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v65.h
-; GFX11-TRUE16-NEXT: v_and_b16 v20.h, 0xff, v65.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v149, v13
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v14.l, v102.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v14.l, v149.h
-; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v19.h, v85.h
-; GFX11-TRUE16-NEXT: v_or_b16 v20.h, v20.h, v83.h
-; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v55.h
-; GFX11-TRUE16-NEXT: v_and_b16 v21.h, 0xff, v50.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v149, v14
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v15.l, v100.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v15.l, v149.h
-; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v49.h
-; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v21.h, v82.l
-; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v49.l
-; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v48.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v149, v15
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v16.l, v98.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v16.l, v149.h
-; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v22.h, v71.h
-; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v48.l
-; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v39.h
-; GFX11-TRUE16-NEXT: v_and_b16 v24.h, 0xff, v39.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v149, v16
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v17.l, v97.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v17.l, v149.h
-; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v23.h, v70.h
-; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v24.h, v68.l
-; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v38.h
-; GFX11-TRUE16-NEXT: v_and_b16 v25.h, 0xff, v38.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v149, v17
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v18.l, v87.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v18.l, v149.h
-; GFX11-TRUE16-NEXT: v_and_b16 v26.l, 0xff, v37.h
-; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v25.h, v66.h
-; GFX11-TRUE16-NEXT: v_and_b16 v26.h, 0xff, v37.l
-; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v36.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v149, v18
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v19.l, v85.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v19.l, v149.h
-; GFX11-TRUE16-NEXT: v_or_b16 v26.h, v26.h, v64.h
-; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v36.l
-; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v35.h
-; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v35.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v149, v19
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v20.l, v83.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v20.l, v149.h
-; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v27.h, v55.l
-; GFX11-TRUE16-NEXT: v_or_b16 v28.h, v28.h, v54.l
-; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v34.h
-; GFX11-TRUE16-NEXT: v_and_b16 v29.h, 0xff, v34.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v149, v20
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v21.l, v81.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v21.l, v149.h
-; GFX11-TRUE16-NEXT: v_and_b16 v30.l, 0xff, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v29.h, v29.h, v53.l
-; GFX11-TRUE16-NEXT: v_and_b16 v30.h, 0xff, v33.l
-; GFX11-TRUE16-NEXT: v_and_b16 v31.l, 0xff, v32.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v149, v21
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v22.l, v71.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v22.l, v149.h
-; GFX11-TRUE16-NEXT: v_or_b16 v30.h, v30.h, v52.l
-; GFX11-TRUE16-NEXT: v_and_b16 v31.h, 0xff, v32.l
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr146_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr146_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr145_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr134_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr134_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr132_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr131_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr119_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr119_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr117_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr103_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr103_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99_hi16
+; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v47.l
+; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v46.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v101, v4
+; GFX11-TRUE16-NEXT: v_or_b16 v101.l, v5.l, v87.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v101.h
+; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v84.l
+; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v81.h
+; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v45.l
+; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v44.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v101, v5
+; GFX11-TRUE16-NEXT: v_or_b16 v101.l, v6.l, v85.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v101.h
+; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v43.l
+; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v80.h
+; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v42.l
+; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v41.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v101, v6
+; GFX11-TRUE16-NEXT: v_or_b16 v101.l, v7.l, v83.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v101.h
+; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v70.l
+; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v40.l
+; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v183.l
+; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v182.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v101, v7
+; GFX11-TRUE16-NEXT: v_or_b16 v101.l, v8.l, v81.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v101.h
+; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v11.h, v68.h
+; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v67.h
+; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v181.l
+; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v180.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v101, v8
+; GFX11-TRUE16-NEXT: v_or_b16 v101.l, v9.l, v80.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v101.h
+; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v179.l
+; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v13.h, v66.h
+; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v178.l
+; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v177.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v101, v9
+; GFX11-TRUE16-NEXT: v_or_b16 v101.l, v10.l, v69.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v101.h
+; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v65.h
+; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v176.l
+; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v167.l
+; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v166.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v101, v10
+; GFX11-TRUE16-NEXT: v_or_b16 v101.l, v11.l, v68.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v101.h
+; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v15.h, v64.h
+; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v16.h, v55.h
+; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v165.l
+; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v164.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v101, v11
+; GFX11-TRUE16-NEXT: v_or_b16 v101.l, v12.l, v67.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v101.h
+; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v163.l
+; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v17.h, v54.h
+; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v162.l
+; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v161.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v101, v12
+; GFX11-TRUE16-NEXT: v_or_b16 v101.l, v13.l, v66.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v101.h
+; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v18.h, v53.h
+; GFX11-TRUE16-NEXT: v_and_b16 v19.h, 0xff, v160.l
+; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v151.l
+; GFX11-TRUE16-NEXT: v_and_b16 v20.h, 0xff, v150.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v101, v13
+; GFX11-TRUE16-NEXT: v_or_b16 v101.l, v14.l, v65.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v101.h
+; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v19.h, v52.h
+; GFX11-TRUE16-NEXT: v_or_b16 v20.h, v20.h, v51.h
+; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v149.l
+; GFX11-TRUE16-NEXT: v_and_b16 v21.h, 0xff, v148.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v101, v14
+; GFX11-TRUE16-NEXT: v_or_b16 v101.l, v15.l, v64.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v101.h
+; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v147.l
+; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v21.h, v50.h
+; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v146.l
+; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v145.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v101, v15
+; GFX11-TRUE16-NEXT: v_or_b16 v101.l, v16.l, v55.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v101.h
+; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v22.h, v49.h
+; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v144.l
+; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v135.l
+; GFX11-TRUE16-NEXT: v_and_b16 v24.h, 0xff, v134.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v101, v16
+; GFX11-TRUE16-NEXT: v_or_b16 v101.l, v17.l, v54.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v101.h
+; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v23.h, v48.h
+; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v24.h, v39.h
+; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v133.l
+; GFX11-TRUE16-NEXT: v_and_b16 v25.h, 0xff, v132.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v101, v17
+; GFX11-TRUE16-NEXT: v_or_b16 v101.l, v18.l, v53.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v101.h
+; GFX11-TRUE16-NEXT: v_and_b16 v26.l, 0xff, v131.l
+; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v25.h, v38.h
+; GFX11-TRUE16-NEXT: v_and_b16 v26.h, 0xff, v130.l
+; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v129.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v101, v18
+; GFX11-TRUE16-NEXT: v_or_b16 v101.l, v19.l, v52.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v101.h
+; GFX11-TRUE16-NEXT: v_or_b16 v26.h, v26.h, v37.h
+; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v128.l
+; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v119.l
+; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v118.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v101, v19
+; GFX11-TRUE16-NEXT: v_or_b16 v101.l, v20.l, v51.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v101.h
+; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v27.h, v36.h
+; GFX11-TRUE16-NEXT: v_or_b16 v28.h, v28.h, v35.h
+; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v117.l
+; GFX11-TRUE16-NEXT: v_and_b16 v29.h, 0xff, v116.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v101, v20
+; GFX11-TRUE16-NEXT: v_or_b16 v101.l, v21.l, v50.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v101.h
+; GFX11-TRUE16-NEXT: v_and_b16 v30.l, 0xff, v115.l
+; GFX11-TRUE16-NEXT: v_or_b16 v29.h, v29.h, v34.h
+; GFX11-TRUE16-NEXT: v_and_b16 v30.h, 0xff, v114.l
+; GFX11-TRUE16-NEXT: v_and_b16 v31.l, 0xff, v113.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v101, v21
+; GFX11-TRUE16-NEXT: v_or_b16 v101.l, v22.l, v49.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v101.h
+; GFX11-TRUE16-NEXT: v_or_b16 v30.h, v30.h, v33.h
+; GFX11-TRUE16-NEXT: v_and_b16 v31.h, 0xff, v112.l
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr150_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr150_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr147_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr147_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr145_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr135_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr135_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr131_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr132_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr117_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr114_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr114_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr114
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr117
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr119
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr131
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr132
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr134
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr135
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr145
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr146
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr147
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr150
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr160
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr161
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr162
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr163
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr164
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr165
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr166
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr167
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr176
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr177
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr178
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr179
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr180
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr181
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr182
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr183
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr40
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr41
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr42
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr43
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr44
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr45
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr46
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr47
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr103_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr103_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v149, v22
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v23.l, v70.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v23.l, v149.h
-; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v31.h, v51.l
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_hi16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v101, v22
+; GFX11-TRUE16-NEXT: v_or_b16 v101.l, v23.l, v48.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v101.h
+; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v31.h, v32.h
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_hi16
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v149, v23
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v24.l, v67.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v24.l, v149.h
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_hi16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v149, v24
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v25.l, v66.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v25.l, v149.h
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_lo16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v101, v23
+; GFX11-TRUE16-NEXT: v_or_b16 v101.l, v24.l, v39.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v101.h
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v101, v24
+; GFX11-TRUE16-NEXT: v_or_b16 v101.l, v25.l, v38.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v101.h
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v149, v25
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v26.l, v64.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v26.l, v149.h
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_lo16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v149, v26
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v27.l, v54.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v27.l, v149.h
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_hi16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v101, v25
+; GFX11-TRUE16-NEXT: v_or_b16 v101.l, v26.l, v37.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v101.h
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v101, v26
+; GFX11-TRUE16-NEXT: v_or_b16 v101.l, v27.l, v36.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v101.h
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v149, v27
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v28.l, v53.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v28.l, v149.h
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_hi16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v149, v28
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v29.l, v52.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v29.l, v149.h
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_hi16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v101, v27
+; GFX11-TRUE16-NEXT: v_or_b16 v101.l, v28.l, v35.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v101.h
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v101, v28
+; GFX11-TRUE16-NEXT: v_or_b16 v101.l, v29.l, v34.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v101.h
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v149, v29
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v30.l, v51.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v30.l, v149.h
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_hi16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v149, v30
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v31.l, v50.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v31.l, v149.h
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_hi16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v101, v29
+; GFX11-TRUE16-NEXT: v_or_b16 v101.l, v30.l, v33.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v101.h
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v101, v30
+; GFX11-TRUE16-NEXT: v_or_b16 v101.l, v31.l, v32.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, v101.h
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v149, v31
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_lo16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v101, v31
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_lo16
+; GFX11-TRUE16-NEXT: .LBB14_2: ; %Flow
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
-; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB14_2
-; GFX11-TRUE16-NEXT: .LBB14_4: ; %cmp.true
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v149.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v149.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v146.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v146.l, 3
+; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB14_4
+; GFX11-TRUE16-NEXT: ; %bb.3: ; %cmp.true
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v101.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v101.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v98.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v98.l, 3
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.h, 0
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v151.l, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v151.h, v0.h
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v150.h, v1.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v150.l, v1.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v145.h, 3
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v103.l, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v103.h, v0.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v102.h, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v102.l, v1.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v97.h, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v0.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v0.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v144.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v96.l, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, 0x300, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v134.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v86.h, 3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v31, v3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v1.h
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v134.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v86.l, 3
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v148.l, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v100.l, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v148.h, v1.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v100.h, v1.h
; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v31, v4
; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v3.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v31.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v3.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v147.l, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v147.h, v2.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v132.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v131.l, 3
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v99.l, v2.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v99.h, v2.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v84.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v83.l, 3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v31, v5
; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v3.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v3.h
; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v4.l
; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v4.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v130.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v130.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v82.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v82.l, 3
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v144.h, v3.l
-; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v145.l, v3.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v96.h, v3.l
+; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v97.l, v3.h
; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l
; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h
; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v31, v6
; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v5.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x300, v5.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v135.l, v4.l
-; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v135.h, v4.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v119.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v119.l, 3
+; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v87.l, v4.l
+; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v87.h, v4.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v71.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v71.l, 3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v31, v7
; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v5.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, 0x300, v5.h
; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v6.l
; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v6.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v118.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v117.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v70.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v69.l, 3
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v133.l, v5.l
-; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v133.h, v5.h
+; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v85.l, v5.l
+; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v85.h, v5.h
; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l
; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h
; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v31, v8
; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v7.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, 0x300, v7.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v131.h, v6.l
-; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v132.l, v6.h
+; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v83.h, v6.l
+; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v84.l, v6.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v115.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v115.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v47.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v46.l, 3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v31, v9
; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v7.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, 0x300, v7.h
; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v8.l
; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v8.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v113.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v112.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v45.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v44.l, 3
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v129.l, v7.l
-; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v129.h, v7.h
+; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v81.l, v7.l
+; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v81.h, v7.h
; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l
; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h
; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v31, v10
; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v9.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v9.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v128.l, v8.l
-; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v128.h, v8.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, v103.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, v103.l, 3
+; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v80.l, v8.l
+; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v80.h, v8.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, v43.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, v42.l, 3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v31, v11
; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v9.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, 0x300, v9.h
; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v10.l
; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v10.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, v101.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, v100.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, v41.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, v40.l, 3
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v117.h, v9.l
-; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v118.l, v9.h
+; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v69.h, v9.l
+; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v70.l, v9.h
; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l
; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v10.h
; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v31, v12
; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v11.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, 0x300, v11.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v116.l, v10.l
-; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v116.h, v10.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v99.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v98.l, 3
+; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v68.l, v10.l
+; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v68.h, v10.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v183.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v182.l, 3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v31, v13
; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v11.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, 0x300, v11.h
; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v12.l
; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v12.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v96.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v96.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v181.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v180.l, 3
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v114.l, v11.l
-; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v114.h, v11.h
+; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v67.l, v11.l
+; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v67.h, v11.h
; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l
; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h
; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v31, v14
; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v13.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.h, 0x300, v13.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v112.h, v12.l
-; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v113.l, v12.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v86.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, v86.l, 3
+; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v66.l, v12.l
+; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v66.h, v12.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v179.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, v178.l, 3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v31, v15
; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v13.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.h, 0x300, v13.h
; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v14.l
; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v14.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v84.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, v84.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v177.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, v176.l, 3
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v102.l, v13.l
-; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v102.h, v13.h
+; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v65.l, v13.l
+; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v65.h, v13.h
; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l
; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v14.h
; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v31, v16
; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v15.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.h, 0x300, v15.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v100.h, v14.l
-; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v101.l, v14.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.l, v82.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.h, v81.l, 3
+; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v64.l, v14.l
+; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v64.h, v14.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.l, v167.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.h, v166.l, 3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v31, v17
; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v15.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, 0x300, v15.h
; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v16.l
; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v16.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.l, v80.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.h, v80.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.l, v165.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.h, v164.l, 3
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v98.h, v15.l
-; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v99.l, v15.h
+; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v55.l, v15.l
+; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v55.h, v15.h
; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v16.l
; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v16.h
; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v31, v18
; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v17.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.h, 0x300, v17.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v97.l, v16.l
-; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v97.h, v16.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, v69.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, v69.l, 3
+; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v54.l, v16.l
+; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v54.h, v16.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, v163.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, v162.l, 3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v31, v19
; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v17.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.h, 0x300, v17.h
; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v18.l
; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v18.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, v68.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, v67.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, v161.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, v160.l, 3
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v87.l, v17.l
-; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v87.h, v17.h
+; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v53.l, v17.l
+; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v53.h, v17.h
; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v18.l
; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v18.h
; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v31, v20
; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v19.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, 0x300, v19.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v85.l, v18.l
-; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v85.h, v18.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.l, v65.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.h, v65.l, 3
+; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v52.l, v18.l
+; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v52.h, v18.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.l, v151.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.h, v150.l, 3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v31, v21
; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v19.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.h, 0x300, v19.h
; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v20.l
; GFX11-TRUE16-NEXT: v_and_b16 v19.h, 0xff, v20.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.l, v55.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.h, v50.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.l, v149.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.h, v148.l, 3
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v83.l, v19.l
-; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v83.h, v19.h
+; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v51.l, v19.l
+; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v51.h, v19.h
; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v20.l
; GFX11-TRUE16-NEXT: v_and_b16 v20.h, 0xff, v20.h
; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v31, v22
; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v21.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.h, 0x300, v21.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v81.h, v20.l
-; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v82.l, v20.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, v49.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.h, v49.l, 3
+; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v50.l, v20.l
+; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v50.h, v20.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, v147.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.h, v146.l, 3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v31, v23
; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v21.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.h, 0x300, v21.h
; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v22.l
; GFX11-TRUE16-NEXT: v_and_b16 v21.h, 0xff, v22.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, v48.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.h, v48.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, v145.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.h, v144.l, 3
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v71.l, v21.l
-; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v71.h, v21.h
+; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v49.l, v21.l
+; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v49.h, v21.h
; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v22.l
; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v22.h
; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v31, v24
; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v23.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.h, 0x300, v23.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v70.l, v22.l
-; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v70.h, v22.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, v39.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.h, v39.l, 3
+; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v48.l, v22.l
+; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v48.h, v22.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, v135.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.h, v134.l, 3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v31, v25
; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v23.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.h, 0x300, v23.h
; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v24.l
; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v24.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, v38.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.h, v38.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, v133.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.h, v132.l, 3
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v67.h, v23.l
-; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v68.l, v23.h
+; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v39.l, v23.l
+; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v39.h, v23.h
; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v24.l
; GFX11-TRUE16-NEXT: v_and_b16 v24.h, 0xff, v24.h
; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v31, v26
; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v25.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, 0x300, v25.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v66.l, v24.l
-; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v66.h, v24.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.l, v37.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.h, v37.l, 3
+; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v38.l, v24.l
+; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v38.h, v24.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.l, v131.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.h, v130.l, 3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v31, v27
; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v25.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.h, 0x300, v25.h
; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v26.l
; GFX11-TRUE16-NEXT: v_and_b16 v25.h, 0xff, v26.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.l, v36.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.h, v36.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.l, v129.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.h, v128.l, 3
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v64.l, v25.l
-; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v64.h, v25.h
+; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v37.l, v25.l
+; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v37.h, v25.h
; GFX11-TRUE16-NEXT: v_and_b16 v26.l, 0xff, v26.l
; GFX11-TRUE16-NEXT: v_and_b16 v26.h, 0xff, v26.h
; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v31, v28
; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v27.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.h, 0x300, v27.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v54.h, v26.l
-; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v55.l, v26.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, v35.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.h, v35.l, 3
+; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v36.l, v26.l
+; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v36.h, v26.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, v119.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.h, v118.l, 3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v31, v29
; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v27.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.h, 0x300, v27.h
; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v28.l
; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v28.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, v34.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.h, v34.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, v117.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.h, v116.l, 3
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v53.h, v27.l
-; GFX11-TRUE16-NEXT: v_or_b16 v29.h, v54.l, v27.h
+; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v35.l, v27.l
+; GFX11-TRUE16-NEXT: v_or_b16 v29.h, v35.h, v27.h
; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v28.l
; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v28.h
; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v31, v30
; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v29.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v34.h, 0x300, v29.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v52.h, v28.l
-; GFX11-TRUE16-NEXT: v_or_b16 v29.h, v53.l, v28.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.l, v33.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.h, v33.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v31, v34
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v35.h, 0x300, v29.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v31.h
+; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v34.l, v28.l
+; GFX11-TRUE16-NEXT: v_or_b16 v29.h, v34.h, v28.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.l, v115.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.h, v114.l, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v31, v35
; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v29.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v33.h, 0x300, v29.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v34.h, 0x300, v29.h
; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v30.l
; GFX11-TRUE16-NEXT: v_and_b16 v29.h, 0xff, v30.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.l, v32.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.h, v32.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v32.l, v51.h, v29.l
-; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v52.l, v29.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.l, v113.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.h, v112.l, 3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v31.h
+; GFX11-TRUE16-NEXT: v_or_b16 v33.l, v33.l, v29.l
+; GFX11-TRUE16-NEXT: v_or_b16 v33.h, v33.h, v29.h
; GFX11-TRUE16-NEXT: v_and_b16 v30.l, 0xff, v30.l
; GFX11-TRUE16-NEXT: v_and_b16 v30.h, 0xff, v30.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v31, v33
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v32.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v33.h, 0x300, v32.h
-; GFX11-TRUE16-NEXT: v_or_b16 v32.l, v50.h, v30.l
-; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v51.l, v30.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v31, v34
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v33.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v33.h, 0x300, v33.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v31.h
+; GFX11-TRUE16-NEXT: v_or_b16 v32.l, v32.l, v30.l
+; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v32.h, v30.h
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v31, v33
; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v32.l
@@ -16246,7 +16283,48 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_add_nc_u16 v32.h, 0x300, v32.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v31.h
; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v31, v32
+; GFX11-TRUE16-NEXT: .LBB14_4: ; %end
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_load_b32 v108, off, s32 offset:392
+; GFX11-TRUE16-NEXT: scratch_load_b32 v107, off, s32 offset:396
+; GFX11-TRUE16-NEXT: scratch_load_b32 v106, off, s32 offset:400
+; GFX11-TRUE16-NEXT: scratch_load_b32 v105, off, s32 offset:404
+; GFX11-TRUE16-NEXT: scratch_load_b32 v104, off, s32 offset:408
+; GFX11-TRUE16-NEXT: scratch_load_b32 v95, off, s32 offset:412
+; GFX11-TRUE16-NEXT: scratch_load_b32 v94, off, s32 offset:416
+; GFX11-TRUE16-NEXT: scratch_load_b32 v93, off, s32 offset:420
+; GFX11-TRUE16-NEXT: scratch_load_b32 v92, off, s32 offset:424
+; GFX11-TRUE16-NEXT: scratch_load_b32 v91, off, s32 offset:428
+; GFX11-TRUE16-NEXT: scratch_load_b32 v90, off, s32 offset:432
+; GFX11-TRUE16-NEXT: scratch_load_b32 v89, off, s32 offset:436
+; GFX11-TRUE16-NEXT: scratch_load_b32 v88, off, s32 offset:440
+; GFX11-TRUE16-NEXT: scratch_load_b32 v79, off, s32 offset:444
+; GFX11-TRUE16-NEXT: scratch_load_b32 v78, off, s32 offset:448
+; GFX11-TRUE16-NEXT: scratch_load_b32 v77, off, s32 offset:452
+; GFX11-TRUE16-NEXT: scratch_load_b32 v76, off, s32 offset:456
+; GFX11-TRUE16-NEXT: scratch_load_b32 v75, off, s32 offset:460
+; GFX11-TRUE16-NEXT: scratch_load_b32 v74, off, s32 offset:464
+; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:468
+; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:472
+; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:476
+; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:480
+; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:484
+; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:488
+; GFX11-TRUE16-NEXT: scratch_load_b32 v59, off, s32 offset:492
+; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:496
+; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:500
+; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:504
+; GFX11-TRUE16-NEXT: scratch_load_b32 v47, off, s32 offset:508
+; GFX11-TRUE16-NEXT: scratch_load_b32 v46, off, s32 offset:512
+; GFX11-TRUE16-NEXT: scratch_load_b32 v45, off, s32 offset:516
+; GFX11-TRUE16-NEXT: s_clause 0x4
+; GFX11-TRUE16-NEXT: scratch_load_b32 v44, off, s32 offset:520
+; GFX11-TRUE16-NEXT: scratch_load_b32 v43, off, s32 offset:524
+; GFX11-TRUE16-NEXT: scratch_load_b32 v42, off, s32 offset:528
+; GFX11-TRUE16-NEXT: scratch_load_b32 v41, off, s32 offset:532
+; GFX11-TRUE16-NEXT: scratch_load_b32 v40, off, s32 offset:536
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: bitcast_v128i8_to_v32i32:
@@ -20512,1887 +20590,946 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-TRUE16-LABEL: bitcast_v128i8_to_v32i32_scalar:
-; GFX11-TRUE16: ; %bb.0:
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: s_clause 0x1f
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:476
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:472
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:468
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v43, s32 offset:464
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v44, s32 offset:460
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v45, s32 offset:456
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v46, s32 offset:452
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v47, s32 offset:448
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v56, s32 offset:444
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v57, s32 offset:440
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v58, s32 offset:436
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v59, s32 offset:432
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v60, s32 offset:428
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v61, s32 offset:424
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v62, s32 offset:420
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v63, s32 offset:416
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v72, s32 offset:412
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v73, s32 offset:408
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v74, s32 offset:404
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v75, s32 offset:400
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v76, s32 offset:396
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v77, s32 offset:392
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v78, s32 offset:388
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v79, s32 offset:384
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v88, s32 offset:380
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v89, s32 offset:376
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v90, s32 offset:372
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v91, s32 offset:368
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v92, s32 offset:364
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:360
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:356
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:352
-; GFX11-TRUE16-NEXT: s_clause 0x7
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:348
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:344
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:340
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v107, s32 offset:336
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v108, s32 offset:332
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v109, s32 offset:328
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v110, s32 offset:324
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v111, s32 offset:320
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v32, v30 :: v_dual_mov_b32 v33, v28
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v34, v26 :: v_dual_mov_b32 v35, v24
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v36, v22 :: v_dual_mov_b32 v37, v20
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v38, v18 :: v_dual_mov_b32 v39, v16
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v48, v14 :: v_dual_mov_b32 v49, v12
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v50, v10 :: v_dual_mov_b32 v51, v8
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v52, v6 :: v_dual_mov_b32 v53, v4
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v54, v2 :: v_dual_mov_b32 v55, v0
-; GFX11-TRUE16-NEXT: s_clause 0x1f
-; GFX11-TRUE16-NEXT: scratch_load_b32 v0, off, s32 offset:316
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v2, off, s32
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v4, off, s32 offset:8
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v6, off, s32 offset:16
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v8, off, s32 offset:24
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v10, off, s32 offset:32
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v12, off, s32 offset:40
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v14, off, s32 offset:48
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v16, off, s32 offset:56
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v18, off, s32 offset:64
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v20, off, s32 offset:72
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v22, off, s32 offset:80
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v24, off, s32 offset:88
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v26, off, s32 offset:96
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v28, off, s32 offset:104
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v30, off, s32 offset:112
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:120
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v87, off, s32 offset:128
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v96, off, s32 offset:136
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v97, off, s32 offset:144
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v98, off, s32 offset:152
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v99, off, s32 offset:160
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v113, off, s32 offset:168
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v114, off, s32 offset:176
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v115, off, s32 offset:184
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v116, off, s32 offset:192
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v117, off, s32 offset:200
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v128, off, s32 offset:208
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v129, off, s32 offset:216
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v130, off, s32 offset:224
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v131, off, s32 offset:232
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v94, off, s32 offset:240
-; GFX11-TRUE16-NEXT: s_clause 0x1f
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v95, off, s32 offset:248
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v104, off, s32 offset:256
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v105, off, s32 offset:264
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v106, off, s32 offset:272
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v107, off, s32 offset:280
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v108, off, s32 offset:288
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v109, off, s32 offset:296
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v110, off, s32 offset:304
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v111, off, s32 offset:312
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v64, off, s32 offset:308
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v65, off, s32 offset:300
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v66, off, s32 offset:292
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v67, off, s32 offset:284
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v68, off, s32 offset:276
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v69, off, s32 offset:268
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v70, off, s32 offset:260
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v71, off, s32 offset:252
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v80, off, s32 offset:244
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v81, off, s32 offset:236
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v82, off, s32 offset:228
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v83, off, s32 offset:220
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v84, off, s32 offset:212
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v85, off, s32 offset:204
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v86, off, s32 offset:196
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v100, off, s32 offset:188
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v101, off, s32 offset:180
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v102, off, s32 offset:172
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v103, off, s32 offset:164
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v112, off, s32 offset:156
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v118, off, s32 offset:148
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v145, off, s32 offset:140
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v146, off, s32 offset:132
-; GFX11-TRUE16-NEXT: s_clause 0xf
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v147, off, s32 offset:124
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v148, off, s32 offset:116
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v162, off, s32 offset:108
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v163, off, s32 offset:100
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v164, off, s32 offset:92
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v165, off, s32 offset:84
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v166, off, s32 offset:76
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v180, off, s32 offset:68
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v181, off, s32 offset:60
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v182, off, s32 offset:52
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v183, off, s32 offset:44
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v45, off, s32 offset:36
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v46, off, s32 offset:28
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v47, off, s32 offset:20
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v56, off, s32 offset:12
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v62, off, s32 offset:4
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v89, 8, v1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v90, 8, v3
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v91, 8, v5
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v92, 8, v7
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v93, 8, v9
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v76, 8, v11
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v77, 8, v13
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v78, 8, v15
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v79, 8, v17
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v88, 8, v19
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v63, 8, v21
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v72, 8, v23
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v73, 8, v25
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v74, 8, v27
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v75, 8, v29
-; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(62)
-; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v57, 8, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v58, 8, v4
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v59, 8, v6
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v60, 8, v8
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v61, 8, v10
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v40, 8, v12
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v41, 8, v14
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v42, 8, v16
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v43, 8, v18
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v44, 8, v20
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v167, 8, v22
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v176, 8, v24
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v177, 8, v26
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v178, 8, v28
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v179, 8, v30
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v149, 8, v31
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v150, 8, v87
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(61)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v151, 8, v96
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(60)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v160, 8, v97
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(59)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v161, 8, v98
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(58)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v132, 8, v99
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(57)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v133, 8, v113
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(56)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v134, 8, v114
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(55)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v135, 8, v115
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(54)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v144, 8, v116
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(53)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v119, 8, v117
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(52)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v128, 8, v128
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(51)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v129, 8, v129
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(50)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v130, 8, v130
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(49)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v131, 8, v131
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(48)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v113, 8, v94
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(47)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v114, 8, v95
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(46)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v115, 8, v104
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(45)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v116, 8, v105
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(44)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v117, 8, v106
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(43)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v87, 8, v107
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(42)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v96, 8, v108
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(41)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v97, 8, v109
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(40)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v98, 8, v110
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(39)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v99, 8, v111
-; GFX11-TRUE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo
-; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB15_4
-; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v54
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v53
-; GFX11-TRUE16-NEXT: s_and_b32 s5, s28, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s29, 8
-; GFX11-TRUE16-NEXT: s_and_b32 s7, s2, 0xff
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v90
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v91
-; GFX11-TRUE16-NEXT: s_or_b32 s5, s5, s6
-; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s1, 8
-; GFX11-TRUE16-NEXT: s_and_b32 s5, s5, 0xffff
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s3, 8
-; GFX11-TRUE16-NEXT: s_and_b32 s9, s18, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s19, 8
-; GFX11-TRUE16-NEXT: s_and_b32 s11, s26, 0xff
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v50
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v49
-; GFX11-TRUE16-NEXT: s_lshl_b32 s12, s27, 8
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v76
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v77
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v48
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v39
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v78
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v79
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v38
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v37
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v88
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v63
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v36
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v35
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v72
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v73
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v34
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v33
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v74
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v75
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v32
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v62
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v57
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v58
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v56
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v47
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v59
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v60
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v46
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v45
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v61
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v40
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v183
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v182
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v41
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v42
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v181
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v180
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v43
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v44
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v166
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v165
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v167
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v176
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v164
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v163
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v177
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v178
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v162
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v148
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v179
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v149
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v147
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v146
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v150
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v151
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v145
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v118
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v160
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v161
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v112
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v103
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v132
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v133
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v102
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v101
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v134
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v135
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v100
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v86
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v144
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v119
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v85
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v84
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v128
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v129
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v83
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v82
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v130
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v131
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v81
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v80
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v113
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v114
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v71
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v70
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v115
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v116
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v69
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v68
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v117
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v87
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v67
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v66
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v96
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v97
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v65
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v64
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v98
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v99
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v55
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v89
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, s5, v0
-; GFX11-TRUE16-NEXT: s_and_b32 s5, s0, 0xff
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-TRUE16-NEXT: s_or_b32 s5, s5, s6
-; GFX11-TRUE16-NEXT: s_or_b32 s6, s7, s8
-; GFX11-TRUE16-NEXT: s_and_b32 s7, s16, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s17, 8
-; GFX11-TRUE16-NEXT: s_and_b32 s5, s5, 0xffff
-; GFX11-TRUE16-NEXT: s_or_b32 s7, s7, s8
-; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s10
-; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s6, 16
-; GFX11-TRUE16-NEXT: s_and_b32 s7, s7, 0xffff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s8, 16
-; GFX11-TRUE16-NEXT: s_or_b32 s5, s5, s6
-; GFX11-TRUE16-NEXT: s_or_b32 s6, s7, s8
-; GFX11-TRUE16-NEXT: s_and_b32 s7, s20, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s21, 8
-; GFX11-TRUE16-NEXT: s_and_b32 s9, s22, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s23, 8
-; GFX11-TRUE16-NEXT: s_or_b32 s7, s7, s8
-; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s10
-; GFX11-TRUE16-NEXT: s_and_b32 s9, s24, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s25, 8
-; GFX11-TRUE16-NEXT: s_and_b32 s7, s7, 0xffff
-; GFX11-TRUE16-NEXT: s_or_b32 s9, s9, s10
-; GFX11-TRUE16-NEXT: s_or_b32 s10, s11, s12
-; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s8, 16
-; GFX11-TRUE16-NEXT: s_and_b32 s9, s9, 0xffff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s10, 16
-; GFX11-TRUE16-NEXT: s_or_b32 s7, s7, s8
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v51
-; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s10
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v52
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, s6
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v93
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v92
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v2, v3
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s7 :: v_dual_mov_b32 v3, s8
-; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
-; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB15_3
-; GFX11-TRUE16-NEXT: .LBB15_2: ; %cmp.true
-; GFX11-TRUE16-NEXT: s_add_i32 s0, s0, 3
-; GFX11-TRUE16-NEXT: s_add_i32 s2, s2, 3
-; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 8
-; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8
-; GFX11-TRUE16-NEXT: s_or_b32 s0, s1, s0
-; GFX11-TRUE16-NEXT: s_or_b32 s1, s3, s2
-; GFX11-TRUE16-NEXT: s_addk_i32 s0, 0x300
-; GFX11-TRUE16-NEXT: s_addk_i32 s1, 0x300
-; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xffff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 16
-; GFX11-TRUE16-NEXT: s_add_i32 s16, s16, 3
-; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s1
-; GFX11-TRUE16-NEXT: s_and_b32 s1, s16, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s17, 8
-; GFX11-TRUE16-NEXT: s_add_i32 s18, s18, 3
-; GFX11-TRUE16-NEXT: s_or_b32 s1, s2, s1
-; GFX11-TRUE16-NEXT: s_and_b32 s2, s18, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s19, 8
-; GFX11-TRUE16-NEXT: s_addk_i32 s1, 0x300
-; GFX11-TRUE16-NEXT: s_or_b32 s2, s3, s2
-; GFX11-TRUE16-NEXT: s_add_i32 s20, s20, 3
-; GFX11-TRUE16-NEXT: s_addk_i32 s2, 0x300
-; GFX11-TRUE16-NEXT: s_and_b32 s1, s1, 0xffff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s2, 16
-; GFX11-TRUE16-NEXT: s_and_b32 s3, s20, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s21, 8
-; GFX11-TRUE16-NEXT: s_add_i32 s22, s22, 3
-; GFX11-TRUE16-NEXT: s_or_b32 s1, s1, s2
-; GFX11-TRUE16-NEXT: s_or_b32 s2, s4, s3
-; GFX11-TRUE16-NEXT: s_and_b32 s3, s22, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s23, 8
-; GFX11-TRUE16-NEXT: s_add_i32 s24, s24, 3
-; GFX11-TRUE16-NEXT: s_or_b32 s3, s4, s3
-; GFX11-TRUE16-NEXT: s_and_b32 s4, s24, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s25, 8
-; GFX11-TRUE16-NEXT: s_addk_i32 s2, 0x300
-; GFX11-TRUE16-NEXT: s_addk_i32 s3, 0x300
-; GFX11-TRUE16-NEXT: s_or_b32 s4, s5, s4
-; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xffff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 16
-; GFX11-TRUE16-NEXT: s_addk_i32 s4, 0x300
-; GFX11-TRUE16-NEXT: s_add_i32 s26, s26, 3
-; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3
-; GFX11-TRUE16-NEXT: s_and_b32 s3, s4, 0xffff
-; GFX11-TRUE16-NEXT: s_and_b32 s4, s26, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s27, 8
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v55
-; GFX11-TRUE16-NEXT: s_or_b32 s4, s5, s4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v54
-; GFX11-TRUE16-NEXT: s_addk_i32 s4, 0x300
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v52
-; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s4, 16
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v51
-; GFX11-TRUE16-NEXT: s_or_b32 s3, s3, s4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v53
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v2
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v38
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 3, v34
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 3, v33
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v89, v0
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v90, v1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v91, v2
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v92, v3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v93, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v7
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v11
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xff, v12
-; GFX11-TRUE16-NEXT: s_add_i32 s28, s28, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x300, v4
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v88, v7
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v74, v11
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v75, v12
-; GFX11-TRUE16-NEXT: s_and_b32 s5, s28, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s29, 8
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX11-TRUE16-NEXT: s_or_b32 s5, s6, s5
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 0x300, v7
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 0x300, v11
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 0x300, v12
-; GFX11-TRUE16-NEXT: s_addk_i32 s5, 0x300
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v1, v2
-; GFX11-TRUE16-NEXT: s_and_b32 s5, s5, 0xffff
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v3, v6
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, s5, v0
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v50
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v49
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v48
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v39
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v7
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v37
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v36
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 3, v35
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v11
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v12
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v7
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v8
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xff, v10
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v11, v12
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(3)
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 3, v46
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 3, v181
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v17, 3, v180
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v76, v0
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v77, v1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v78, v2
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v79, v3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v63, v7
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v72, v8
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v73, v10
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xff, v12
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xff, v16
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xff, v17
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 0x300, v7
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 0x300, v8
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 0x300, v10
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v61, v12
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v43, v16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v44, v17
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v7
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v8
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 0x300, v12
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 0x300, v16
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v17, 0x300, v17
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v0, v1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v2, v3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v9, v13
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v14, v10
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v32
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v62
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v56
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v47
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v12
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 3, v45
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v13, 3, v183
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v15, 3, v182
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v16
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v17
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xff, v12
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xff, v13
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xff, v15
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v16, v17
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v17, 3, v162
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v21, 3, v145
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v22, 3, v118
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v57, v0
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v58, v1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v59, v2
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v60, v3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v40, v12
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v41, v13
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v42, v15
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xff, v17
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xff, v21
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xff, v22
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 0x300, v12
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v13, 0x300, v13
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v15, 0x300, v15
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v179, v17
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v160, v21
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v161, v22
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v12
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v13
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 16, v15
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v17, 0x300, v17
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v21, 0x300, v21
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v22, 0x300, v22
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v0, v1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v2, v3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v14, v18
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v19, v15
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v166
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v165
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v164
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v163
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v17
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v17, 3, v148
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v18, 3, v147
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v20, 3, v146
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v21
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v22
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xff, v17
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xff, v18
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xff, v20
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v21, v22
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v22, 3, v100
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v26, 3, v83
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v27, 3, v82
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v167, v0
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v176, v1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v177, v2
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v178, v3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v149, v17
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v150, v18
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v151, v20
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xff, v22
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xff, v26
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xff, v27
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v17, 0x300, v17
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v18, 0x300, v18
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v20, 0x300, v20
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v144, v22
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v130, v26
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v131, v27
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v17
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v18
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 16, v20
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v22, 0x300, v22
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v26, 0x300, v26
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v27, 0x300, v27
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v0, v1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v2, v3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v19, v23
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v24, v20
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v112
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v103
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v102
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v101
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v22
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v22, 3, v86
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v23, 3, v85
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v25, 3, v84
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v26
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v27, 16, v27
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xff, v22
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xff, v23
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xff, v25
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v26, v27
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v27, 3, v69
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v132, v0
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v133, v1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v134, v2
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v135, v3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v119, v22
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v128, v23
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v129, v25
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xff, v27
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v22, 0x300, v22
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v23, 0x300, v23
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v25, 0x300, v25
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v117, v27
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 16, v22
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v23
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 16, v25
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v27, 0x300, v27
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v0, v1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v2, v3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v24, v28
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v29, v25
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v81
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v80
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v71
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v70
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v27
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v27, 3, v68
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v28, 3, v67
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v30, 3, v66
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v31, 3, v65
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v32, 3, v64
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xff, v27
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v28, 0xff, v28
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v30, 0xff, v30
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xff, v31
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xff, v32
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v113, v0
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v114, v1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v115, v2
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v116, v3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v87, v27
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v96, v28
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v97, v30
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v98, v31
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v32, v99, v32
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v27, 0x300, v27
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v28, 0x300, v28
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v30, 0x300, v30
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v31, 0x300, v31
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v32, 0x300, v32
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v33, 16, v27
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v28
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v30, 16, v30
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xffff, v31
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v32, 16, v32
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v0, v1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v2, v3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v29, v33
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v34, v30
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v31, v32
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX11-TRUE16-NEXT: .LBB15_3: ; %end
-; GFX11-TRUE16-NEXT: s_clause 0x1f
-; GFX11-TRUE16-NEXT: scratch_load_b32 v111, off, s32 offset:320
-; GFX11-TRUE16-NEXT: scratch_load_b32 v110, off, s32 offset:324
-; GFX11-TRUE16-NEXT: scratch_load_b32 v109, off, s32 offset:328
-; GFX11-TRUE16-NEXT: scratch_load_b32 v108, off, s32 offset:332
-; GFX11-TRUE16-NEXT: scratch_load_b32 v107, off, s32 offset:336
-; GFX11-TRUE16-NEXT: scratch_load_b32 v106, off, s32 offset:340
-; GFX11-TRUE16-NEXT: scratch_load_b32 v105, off, s32 offset:344
-; GFX11-TRUE16-NEXT: scratch_load_b32 v104, off, s32 offset:348
-; GFX11-TRUE16-NEXT: scratch_load_b32 v95, off, s32 offset:352
-; GFX11-TRUE16-NEXT: scratch_load_b32 v94, off, s32 offset:356
-; GFX11-TRUE16-NEXT: scratch_load_b32 v93, off, s32 offset:360
-; GFX11-TRUE16-NEXT: scratch_load_b32 v92, off, s32 offset:364
-; GFX11-TRUE16-NEXT: scratch_load_b32 v91, off, s32 offset:368
-; GFX11-TRUE16-NEXT: scratch_load_b32 v90, off, s32 offset:372
-; GFX11-TRUE16-NEXT: scratch_load_b32 v89, off, s32 offset:376
-; GFX11-TRUE16-NEXT: scratch_load_b32 v88, off, s32 offset:380
-; GFX11-TRUE16-NEXT: scratch_load_b32 v79, off, s32 offset:384
-; GFX11-TRUE16-NEXT: scratch_load_b32 v78, off, s32 offset:388
-; GFX11-TRUE16-NEXT: scratch_load_b32 v77, off, s32 offset:392
-; GFX11-TRUE16-NEXT: scratch_load_b32 v76, off, s32 offset:396
-; GFX11-TRUE16-NEXT: scratch_load_b32 v75, off, s32 offset:400
-; GFX11-TRUE16-NEXT: scratch_load_b32 v74, off, s32 offset:404
-; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:408
-; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:412
-; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:416
-; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:420
-; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:424
-; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:428
-; GFX11-TRUE16-NEXT: scratch_load_b32 v59, off, s32 offset:432
-; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:436
-; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:440
-; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:444
-; GFX11-TRUE16-NEXT: s_clause 0x7
-; GFX11-TRUE16-NEXT: scratch_load_b32 v47, off, s32 offset:448
-; GFX11-TRUE16-NEXT: scratch_load_b32 v46, off, s32 offset:452
-; GFX11-TRUE16-NEXT: scratch_load_b32 v45, off, s32 offset:456
-; GFX11-TRUE16-NEXT: scratch_load_b32 v44, off, s32 offset:460
-; GFX11-TRUE16-NEXT: scratch_load_b32 v43, off, s32 offset:464
-; GFX11-TRUE16-NEXT: scratch_load_b32 v42, off, s32 offset:468
-; GFX11-TRUE16-NEXT: scratch_load_b32 v41, off, s32 offset:472
-; GFX11-TRUE16-NEXT: scratch_load_b32 v40, off, s32 offset:476
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
-; GFX11-TRUE16-NEXT: .LBB15_4:
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
-; GFX11-TRUE16-NEXT: s_branch .LBB15_2
-;
-; GFX11-FAKE16-LABEL: bitcast_v128i8_to_v32i32_scalar:
-; GFX11-FAKE16: ; %bb.0:
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-FAKE16-NEXT: s_clause 0x1f
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:476
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:472
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:468
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v43, s32 offset:464
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v44, s32 offset:460
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v45, s32 offset:456
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v46, s32 offset:452
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v47, s32 offset:448
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v56, s32 offset:444
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v57, s32 offset:440
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v58, s32 offset:436
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v59, s32 offset:432
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v60, s32 offset:428
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v61, s32 offset:424
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v62, s32 offset:420
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v63, s32 offset:416
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v72, s32 offset:412
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v73, s32 offset:408
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v74, s32 offset:404
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v75, s32 offset:400
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v76, s32 offset:396
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v77, s32 offset:392
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v78, s32 offset:388
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v79, s32 offset:384
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v88, s32 offset:380
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v89, s32 offset:376
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v90, s32 offset:372
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v91, s32 offset:368
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v92, s32 offset:364
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v93, s32 offset:360
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v94, s32 offset:356
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v95, s32 offset:352
-; GFX11-FAKE16-NEXT: s_clause 0x7
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v104, s32 offset:348
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v105, s32 offset:344
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v106, s32 offset:340
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v107, s32 offset:336
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v108, s32 offset:332
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v109, s32 offset:328
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v110, s32 offset:324
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v111, s32 offset:320
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v32, v30 :: v_dual_mov_b32 v33, v28
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v34, v26 :: v_dual_mov_b32 v35, v24
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v36, v22 :: v_dual_mov_b32 v37, v20
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v38, v18 :: v_dual_mov_b32 v39, v16
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v48, v14 :: v_dual_mov_b32 v49, v12
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v50, v10 :: v_dual_mov_b32 v51, v8
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v52, v6 :: v_dual_mov_b32 v53, v4
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v54, v2 :: v_dual_mov_b32 v55, v0
-; GFX11-FAKE16-NEXT: s_clause 0x1f
-; GFX11-FAKE16-NEXT: scratch_load_b32 v0, off, s32 offset:316
-; GFX11-FAKE16-NEXT: scratch_load_u16 v2, off, s32
-; GFX11-FAKE16-NEXT: scratch_load_u16 v4, off, s32 offset:8
-; GFX11-FAKE16-NEXT: scratch_load_u16 v6, off, s32 offset:16
-; GFX11-FAKE16-NEXT: scratch_load_u16 v8, off, s32 offset:24
-; GFX11-FAKE16-NEXT: scratch_load_u16 v10, off, s32 offset:32
-; GFX11-FAKE16-NEXT: scratch_load_u16 v12, off, s32 offset:40
-; GFX11-FAKE16-NEXT: scratch_load_u16 v14, off, s32 offset:48
-; GFX11-FAKE16-NEXT: scratch_load_u16 v16, off, s32 offset:56
-; GFX11-FAKE16-NEXT: scratch_load_u16 v18, off, s32 offset:64
-; GFX11-FAKE16-NEXT: scratch_load_u16 v20, off, s32 offset:72
-; GFX11-FAKE16-NEXT: scratch_load_u16 v22, off, s32 offset:80
-; GFX11-FAKE16-NEXT: scratch_load_u16 v24, off, s32 offset:88
-; GFX11-FAKE16-NEXT: scratch_load_u16 v26, off, s32 offset:96
-; GFX11-FAKE16-NEXT: scratch_load_u16 v28, off, s32 offset:104
-; GFX11-FAKE16-NEXT: scratch_load_u16 v30, off, s32 offset:112
-; GFX11-FAKE16-NEXT: scratch_load_u16 v31, off, s32 offset:120
-; GFX11-FAKE16-NEXT: scratch_load_u16 v87, off, s32 offset:128
-; GFX11-FAKE16-NEXT: scratch_load_u16 v96, off, s32 offset:136
-; GFX11-FAKE16-NEXT: scratch_load_u16 v97, off, s32 offset:144
-; GFX11-FAKE16-NEXT: scratch_load_u16 v98, off, s32 offset:152
-; GFX11-FAKE16-NEXT: scratch_load_u16 v99, off, s32 offset:160
-; GFX11-FAKE16-NEXT: scratch_load_u16 v113, off, s32 offset:168
-; GFX11-FAKE16-NEXT: scratch_load_u16 v114, off, s32 offset:176
-; GFX11-FAKE16-NEXT: scratch_load_u16 v115, off, s32 offset:184
-; GFX11-FAKE16-NEXT: scratch_load_u16 v116, off, s32 offset:192
-; GFX11-FAKE16-NEXT: scratch_load_u16 v117, off, s32 offset:200
-; GFX11-FAKE16-NEXT: scratch_load_u16 v128, off, s32 offset:208
-; GFX11-FAKE16-NEXT: scratch_load_u16 v129, off, s32 offset:216
-; GFX11-FAKE16-NEXT: scratch_load_u16 v130, off, s32 offset:224
-; GFX11-FAKE16-NEXT: scratch_load_u16 v131, off, s32 offset:232
-; GFX11-FAKE16-NEXT: scratch_load_u16 v94, off, s32 offset:240
-; GFX11-FAKE16-NEXT: s_clause 0x1f
-; GFX11-FAKE16-NEXT: scratch_load_u16 v95, off, s32 offset:248
-; GFX11-FAKE16-NEXT: scratch_load_u16 v104, off, s32 offset:256
-; GFX11-FAKE16-NEXT: scratch_load_u16 v105, off, s32 offset:264
-; GFX11-FAKE16-NEXT: scratch_load_u16 v106, off, s32 offset:272
-; GFX11-FAKE16-NEXT: scratch_load_u16 v107, off, s32 offset:280
-; GFX11-FAKE16-NEXT: scratch_load_u16 v108, off, s32 offset:288
-; GFX11-FAKE16-NEXT: scratch_load_u16 v109, off, s32 offset:296
-; GFX11-FAKE16-NEXT: scratch_load_u16 v110, off, s32 offset:304
-; GFX11-FAKE16-NEXT: scratch_load_u16 v111, off, s32 offset:312
-; GFX11-FAKE16-NEXT: scratch_load_u16 v64, off, s32 offset:308
-; GFX11-FAKE16-NEXT: scratch_load_u16 v65, off, s32 offset:300
-; GFX11-FAKE16-NEXT: scratch_load_u16 v66, off, s32 offset:292
-; GFX11-FAKE16-NEXT: scratch_load_u16 v67, off, s32 offset:284
-; GFX11-FAKE16-NEXT: scratch_load_u16 v68, off, s32 offset:276
-; GFX11-FAKE16-NEXT: scratch_load_u16 v69, off, s32 offset:268
-; GFX11-FAKE16-NEXT: scratch_load_u16 v70, off, s32 offset:260
-; GFX11-FAKE16-NEXT: scratch_load_u16 v71, off, s32 offset:252
-; GFX11-FAKE16-NEXT: scratch_load_u16 v80, off, s32 offset:244
-; GFX11-FAKE16-NEXT: scratch_load_u16 v81, off, s32 offset:236
-; GFX11-FAKE16-NEXT: scratch_load_u16 v82, off, s32 offset:228
-; GFX11-FAKE16-NEXT: scratch_load_u16 v83, off, s32 offset:220
-; GFX11-FAKE16-NEXT: scratch_load_u16 v84, off, s32 offset:212
-; GFX11-FAKE16-NEXT: scratch_load_u16 v85, off, s32 offset:204
-; GFX11-FAKE16-NEXT: scratch_load_u16 v86, off, s32 offset:196
-; GFX11-FAKE16-NEXT: scratch_load_u16 v100, off, s32 offset:188
-; GFX11-FAKE16-NEXT: scratch_load_u16 v101, off, s32 offset:180
-; GFX11-FAKE16-NEXT: scratch_load_u16 v102, off, s32 offset:172
-; GFX11-FAKE16-NEXT: scratch_load_u16 v103, off, s32 offset:164
-; GFX11-FAKE16-NEXT: scratch_load_u16 v112, off, s32 offset:156
-; GFX11-FAKE16-NEXT: scratch_load_u16 v118, off, s32 offset:148
-; GFX11-FAKE16-NEXT: scratch_load_u16 v145, off, s32 offset:140
-; GFX11-FAKE16-NEXT: scratch_load_u16 v146, off, s32 offset:132
-; GFX11-FAKE16-NEXT: s_clause 0xf
-; GFX11-FAKE16-NEXT: scratch_load_u16 v147, off, s32 offset:124
-; GFX11-FAKE16-NEXT: scratch_load_u16 v148, off, s32 offset:116
-; GFX11-FAKE16-NEXT: scratch_load_u16 v162, off, s32 offset:108
-; GFX11-FAKE16-NEXT: scratch_load_u16 v163, off, s32 offset:100
-; GFX11-FAKE16-NEXT: scratch_load_u16 v164, off, s32 offset:92
-; GFX11-FAKE16-NEXT: scratch_load_u16 v165, off, s32 offset:84
-; GFX11-FAKE16-NEXT: scratch_load_u16 v166, off, s32 offset:76
-; GFX11-FAKE16-NEXT: scratch_load_u16 v180, off, s32 offset:68
-; GFX11-FAKE16-NEXT: scratch_load_u16 v181, off, s32 offset:60
-; GFX11-FAKE16-NEXT: scratch_load_u16 v182, off, s32 offset:52
-; GFX11-FAKE16-NEXT: scratch_load_u16 v183, off, s32 offset:44
-; GFX11-FAKE16-NEXT: scratch_load_u16 v45, off, s32 offset:36
-; GFX11-FAKE16-NEXT: scratch_load_u16 v46, off, s32 offset:28
-; GFX11-FAKE16-NEXT: scratch_load_u16 v47, off, s32 offset:20
-; GFX11-FAKE16-NEXT: scratch_load_u16 v56, off, s32 offset:12
-; GFX11-FAKE16-NEXT: scratch_load_u16 v62, off, s32 offset:4
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v89, 8, v1
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v90, 8, v3
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v91, 8, v5
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v92, 8, v7
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v93, 8, v9
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v76, 8, v11
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v77, 8, v13
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v78, 8, v15
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v79, 8, v17
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v88, 8, v19
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v63, 8, v21
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v72, 8, v23
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v73, 8, v25
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v74, 8, v27
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v75, 8, v29
-; GFX11-FAKE16-NEXT: s_mov_b32 s4, 0
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(62)
-; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v57, 8, v2
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v58, 8, v4
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v59, 8, v6
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v60, 8, v8
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v61, 8, v10
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v40, 8, v12
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v41, 8, v14
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v42, 8, v16
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v43, 8, v18
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v44, 8, v20
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v167, 8, v22
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v176, 8, v24
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v177, 8, v26
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v178, 8, v28
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v179, 8, v30
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v149, 8, v31
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v150, 8, v87
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(61)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v151, 8, v96
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(60)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v160, 8, v97
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(59)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v161, 8, v98
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(58)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v132, 8, v99
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(57)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v133, 8, v113
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(56)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v134, 8, v114
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(55)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v135, 8, v115
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(54)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v144, 8, v116
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(53)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v119, 8, v117
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(52)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v128, 8, v128
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(51)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v129, 8, v129
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(50)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v130, 8, v130
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(49)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v131, 8, v131
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(48)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v113, 8, v94
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(47)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v114, 8, v95
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(46)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v115, 8, v104
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(45)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v116, 8, v105
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(44)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v117, 8, v106
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(43)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v87, 8, v107
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(42)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v96, 8, v108
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(41)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v97, 8, v109
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(40)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v98, 8, v110
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(39)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v99, 8, v111
-; GFX11-FAKE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo
-; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB15_4
-; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v54
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v53
-; GFX11-FAKE16-NEXT: s_and_b32 s5, s28, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s29, 8
-; GFX11-FAKE16-NEXT: s_and_b32 s7, s2, 0xff
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v90
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v91
-; GFX11-FAKE16-NEXT: s_or_b32 s5, s5, s6
-; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s1, 8
-; GFX11-FAKE16-NEXT: s_and_b32 s5, s5, 0xffff
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s3, 8
-; GFX11-FAKE16-NEXT: s_and_b32 s9, s18, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s19, 8
-; GFX11-FAKE16-NEXT: s_and_b32 s11, s26, 0xff
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v0, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v50
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v49
-; GFX11-FAKE16-NEXT: s_lshl_b32 s12, s27, 8
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v76
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v77
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v0, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v48
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v39
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v78
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v79
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v0, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v38
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v37
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v88
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v63
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v0, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v36
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v35
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v72
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v73
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v0, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v34
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v33
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v74
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v75
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v0, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v32
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v62
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v57
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v58
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v0, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v56
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v47
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v59
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v60
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v0, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v46
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v45
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v61
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v40
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v0, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v183
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v182
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v41
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v42
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v0, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v181
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v180
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v43
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v44
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v0, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v166
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v165
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v167
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v176
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v0, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v164
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v163
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v177
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v178
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v0, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v162
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v148
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v179
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v149
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v0, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v147
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v146
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v150
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v151
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v0, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v145
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v118
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v160
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v161
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v0, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v112
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v103
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v132
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v133
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v0, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v102
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v101
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v134
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v135
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v0, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v100
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v86
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v144
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v119
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v0, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v85
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v84
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v128
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v129
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v0, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v83
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v82
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v130
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v131
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v0, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v81
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v80
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v113
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v114
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v0, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v71
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v70
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v115
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v116
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v0, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v69
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v68
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v117
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v87
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, v0, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v67
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v66
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v96
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v97
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v30, v0, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v65
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v64
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v98
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v99
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v0, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v55
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v89
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, s5, v0
-; GFX11-FAKE16-NEXT: s_and_b32 s5, s0, 0xff
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-FAKE16-NEXT: s_or_b32 s5, s5, s6
-; GFX11-FAKE16-NEXT: s_or_b32 s6, s7, s8
-; GFX11-FAKE16-NEXT: s_and_b32 s7, s16, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s17, 8
-; GFX11-FAKE16-NEXT: s_and_b32 s5, s5, 0xffff
-; GFX11-FAKE16-NEXT: s_or_b32 s7, s7, s8
-; GFX11-FAKE16-NEXT: s_or_b32 s8, s9, s10
-; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s6, 16
-; GFX11-FAKE16-NEXT: s_and_b32 s7, s7, 0xffff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s8, 16
-; GFX11-FAKE16-NEXT: s_or_b32 s5, s5, s6
-; GFX11-FAKE16-NEXT: s_or_b32 s6, s7, s8
-; GFX11-FAKE16-NEXT: s_and_b32 s7, s20, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s21, 8
-; GFX11-FAKE16-NEXT: s_and_b32 s9, s22, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s23, 8
-; GFX11-FAKE16-NEXT: s_or_b32 s7, s7, s8
-; GFX11-FAKE16-NEXT: s_or_b32 s8, s9, s10
-; GFX11-FAKE16-NEXT: s_and_b32 s9, s24, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s25, 8
-; GFX11-FAKE16-NEXT: s_and_b32 s7, s7, 0xffff
-; GFX11-FAKE16-NEXT: s_or_b32 s9, s9, s10
-; GFX11-FAKE16-NEXT: s_or_b32 s10, s11, s12
-; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s8, 16
-; GFX11-FAKE16-NEXT: s_and_b32 s9, s9, 0xffff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s10, 16
-; GFX11-FAKE16-NEXT: s_or_b32 s7, s7, s8
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v51
-; GFX11-FAKE16-NEXT: s_or_b32 s8, s9, s10
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v52
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, s6
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v93
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v92
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v2, v3
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s7 :: v_dual_mov_b32 v3, s8
-; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
-; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB15_3
-; GFX11-FAKE16-NEXT: .LBB15_2: ; %cmp.true
-; GFX11-FAKE16-NEXT: s_add_i32 s0, s0, 3
-; GFX11-FAKE16-NEXT: s_add_i32 s2, s2, 3
-; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 8
-; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 8
-; GFX11-FAKE16-NEXT: s_or_b32 s0, s1, s0
-; GFX11-FAKE16-NEXT: s_or_b32 s1, s3, s2
-; GFX11-FAKE16-NEXT: s_addk_i32 s0, 0x300
-; GFX11-FAKE16-NEXT: s_addk_i32 s1, 0x300
-; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xffff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 16
-; GFX11-FAKE16-NEXT: s_add_i32 s16, s16, 3
-; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s1
-; GFX11-FAKE16-NEXT: s_and_b32 s1, s16, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s17, 8
-; GFX11-FAKE16-NEXT: s_add_i32 s18, s18, 3
-; GFX11-FAKE16-NEXT: s_or_b32 s1, s2, s1
-; GFX11-FAKE16-NEXT: s_and_b32 s2, s18, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s19, 8
-; GFX11-FAKE16-NEXT: s_addk_i32 s1, 0x300
-; GFX11-FAKE16-NEXT: s_or_b32 s2, s3, s2
-; GFX11-FAKE16-NEXT: s_add_i32 s20, s20, 3
-; GFX11-FAKE16-NEXT: s_addk_i32 s2, 0x300
-; GFX11-FAKE16-NEXT: s_and_b32 s1, s1, 0xffff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s2, 16
-; GFX11-FAKE16-NEXT: s_and_b32 s3, s20, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s21, 8
-; GFX11-FAKE16-NEXT: s_add_i32 s22, s22, 3
-; GFX11-FAKE16-NEXT: s_or_b32 s1, s1, s2
-; GFX11-FAKE16-NEXT: s_or_b32 s2, s4, s3
-; GFX11-FAKE16-NEXT: s_and_b32 s3, s22, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s23, 8
-; GFX11-FAKE16-NEXT: s_add_i32 s24, s24, 3
-; GFX11-FAKE16-NEXT: s_or_b32 s3, s4, s3
-; GFX11-FAKE16-NEXT: s_and_b32 s4, s24, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s25, 8
-; GFX11-FAKE16-NEXT: s_addk_i32 s2, 0x300
-; GFX11-FAKE16-NEXT: s_addk_i32 s3, 0x300
-; GFX11-FAKE16-NEXT: s_or_b32 s4, s5, s4
-; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xffff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 16
-; GFX11-FAKE16-NEXT: s_addk_i32 s4, 0x300
-; GFX11-FAKE16-NEXT: s_add_i32 s26, s26, 3
-; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3
-; GFX11-FAKE16-NEXT: s_and_b32 s3, s4, 0xffff
-; GFX11-FAKE16-NEXT: s_and_b32 s4, s26, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s27, 8
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v55
-; GFX11-FAKE16-NEXT: s_or_b32 s4, s5, s4
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v54
-; GFX11-FAKE16-NEXT: s_addk_i32 s4, 0x300
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v52
-; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s4, 16
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v51
-; GFX11-FAKE16-NEXT: s_or_b32 s3, s3, s4
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v53
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 3, v38
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, 3, v34
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, 3, v33
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v89, v0
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v90, v1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v91, v2
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v92, v3
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v93, v4
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v7
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v11
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v12
-; GFX11-FAKE16-NEXT: s_add_i32 s28, s28, 3
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x300, v4
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v88, v7
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v74, v11
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v75, v12
-; GFX11-FAKE16-NEXT: s_and_b32 s5, s28, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s29, 8
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX11-FAKE16-NEXT: s_or_b32 s5, s6, s5
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v4
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 0x300, v7
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, 0x300, v11
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, 0x300, v12
-; GFX11-FAKE16-NEXT: s_addk_i32 s5, 0x300
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v1, v2
-; GFX11-FAKE16-NEXT: s_and_b32 s5, s5, 0xffff
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v3, v6
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, s5, v0
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v50
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v49
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v48
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v39
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff, v7
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 3, v37
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 3, v36
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, 3, v35
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff, v11
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v12, 16, v12
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v7
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v8
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v10
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v11, v12
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(3)
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, 3, v46
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v16, 3, v181
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 3, v180
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v76, v0
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v77, v1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v78, v2
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v79, v3
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v63, v7
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v72, v8
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v73, v10
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v12
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v16
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v17
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 0x300, v7
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 0x300, v8
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, 0x300, v10
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v61, v12
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v43, v16
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v44, v17
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v13, 16, v7
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xffff, v8
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, 0x300, v12
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v16, 0x300, v16
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 0x300, v17
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v0, v1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v2, v3
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v9, v13
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v14, v10
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v32
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v62
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v56
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v47
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xffff, v12
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, 3, v45
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v13, 3, v183
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v15, 3, v182
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xffff, v16
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v17, 16, v17
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v12
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v13
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v15
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v16, v17
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 3, v162
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v21, 3, v145
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v22, 3, v118
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v57, v0
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v58, v1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v59, v2
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v60, v3
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v40, v12
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v41, v13
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v42, v15
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v17
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xff, v21
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v22
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, 0x300, v12
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v13, 0x300, v13
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v15, 0x300, v15
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v179, v17
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v160, v21
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v161, v22
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v18, 16, v12
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v13
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v15, 16, v15
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 0x300, v17
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v21, 0x300, v21
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v22, 0x300, v22
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v0, v1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v2, v3
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v14, v18
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v19, v15
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v166
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v165
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v164
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v163
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v17
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 3, v148
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v18, 3, v147
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v20, 3, v146
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff, v21
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v22, 16, v22
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v17
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xff, v18
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v20
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v21, v22
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v22, 3, v100
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v26, 3, v83
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v27, 3, v82
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v167, v0
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v176, v1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v177, v2
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v178, v3
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v149, v17
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v150, v18
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v151, v20
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v22
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xff, v26
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xff, v27
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 0x300, v17
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v18, 0x300, v18
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v20, 0x300, v20
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v144, v22
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v130, v26
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v131, v27
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v23, 16, v17
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xffff, v18
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v20, 16, v20
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v22, 0x300, v22
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v26, 0x300, v26
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v27, 0x300, v27
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v0, v1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v2, v3
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v19, v23
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v24, v20
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v112
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v103
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v102
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v101
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xffff, v22
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v22, 3, v86
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v23, 3, v85
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v25, 3, v84
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xffff, v26
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v27, 16, v27
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v22
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xff, v23
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v25
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v26, v27
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v27, 3, v69
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v132, v0
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v133, v1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v134, v2
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v135, v3
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v119, v22
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v128, v23
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v129, v25
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xff, v27
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v22, 0x300, v22
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v23, 0x300, v23
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v25, 0x300, v25
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v117, v27
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v28, 16, v22
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xffff, v23
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v25, 16, v25
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v27, 0x300, v27
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v0, v1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v2, v3
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v24, v28
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v29, v25
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v81
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v80
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v71
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v70
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xffff, v27
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v27, 3, v68
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v28, 3, v67
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v30, 3, v66
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v31, 3, v65
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v32, 3, v64
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xff, v27
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xff, v28
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xff, v30
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xff, v31
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xff, v32
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v113, v0
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v114, v1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v115, v2
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v116, v3
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v87, v27
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v96, v28
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v30, v97, v30
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v98, v31
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v32, v99, v32
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v27, 0x300, v27
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v28, 0x300, v28
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v30, 0x300, v30
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v31, 0x300, v31
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v32, 0x300, v32
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v33, 16, v27
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xffff, v28
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v30, 16, v30
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xffff, v31
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v32, 16, v32
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v0, v1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v2, v3
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, v29, v33
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v30, v34, v30
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v31, v32
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX11-FAKE16-NEXT: .LBB15_3: ; %end
-; GFX11-FAKE16-NEXT: s_clause 0x1f
-; GFX11-FAKE16-NEXT: scratch_load_b32 v111, off, s32 offset:320
-; GFX11-FAKE16-NEXT: scratch_load_b32 v110, off, s32 offset:324
-; GFX11-FAKE16-NEXT: scratch_load_b32 v109, off, s32 offset:328
-; GFX11-FAKE16-NEXT: scratch_load_b32 v108, off, s32 offset:332
-; GFX11-FAKE16-NEXT: scratch_load_b32 v107, off, s32 offset:336
-; GFX11-FAKE16-NEXT: scratch_load_b32 v106, off, s32 offset:340
-; GFX11-FAKE16-NEXT: scratch_load_b32 v105, off, s32 offset:344
-; GFX11-FAKE16-NEXT: scratch_load_b32 v104, off, s32 offset:348
-; GFX11-FAKE16-NEXT: scratch_load_b32 v95, off, s32 offset:352
-; GFX11-FAKE16-NEXT: scratch_load_b32 v94, off, s32 offset:356
-; GFX11-FAKE16-NEXT: scratch_load_b32 v93, off, s32 offset:360
-; GFX11-FAKE16-NEXT: scratch_load_b32 v92, off, s32 offset:364
-; GFX11-FAKE16-NEXT: scratch_load_b32 v91, off, s32 offset:368
-; GFX11-FAKE16-NEXT: scratch_load_b32 v90, off, s32 offset:372
-; GFX11-FAKE16-NEXT: scratch_load_b32 v89, off, s32 offset:376
-; GFX11-FAKE16-NEXT: scratch_load_b32 v88, off, s32 offset:380
-; GFX11-FAKE16-NEXT: scratch_load_b32 v79, off, s32 offset:384
-; GFX11-FAKE16-NEXT: scratch_load_b32 v78, off, s32 offset:388
-; GFX11-FAKE16-NEXT: scratch_load_b32 v77, off, s32 offset:392
-; GFX11-FAKE16-NEXT: scratch_load_b32 v76, off, s32 offset:396
-; GFX11-FAKE16-NEXT: scratch_load_b32 v75, off, s32 offset:400
-; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:404
-; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:408
-; GFX11-FAKE16-NEXT: scratch_load_b32 v72, off, s32 offset:412
-; GFX11-FAKE16-NEXT: scratch_load_b32 v63, off, s32 offset:416
-; GFX11-FAKE16-NEXT: scratch_load_b32 v62, off, s32 offset:420
-; GFX11-FAKE16-NEXT: scratch_load_b32 v61, off, s32 offset:424
-; GFX11-FAKE16-NEXT: scratch_load_b32 v60, off, s32 offset:428
-; GFX11-FAKE16-NEXT: scratch_load_b32 v59, off, s32 offset:432
-; GFX11-FAKE16-NEXT: scratch_load_b32 v58, off, s32 offset:436
-; GFX11-FAKE16-NEXT: scratch_load_b32 v57, off, s32 offset:440
-; GFX11-FAKE16-NEXT: scratch_load_b32 v56, off, s32 offset:444
-; GFX11-FAKE16-NEXT: s_clause 0x7
-; GFX11-FAKE16-NEXT: scratch_load_b32 v47, off, s32 offset:448
-; GFX11-FAKE16-NEXT: scratch_load_b32 v46, off, s32 offset:452
-; GFX11-FAKE16-NEXT: scratch_load_b32 v45, off, s32 offset:456
-; GFX11-FAKE16-NEXT: scratch_load_b32 v44, off, s32 offset:460
-; GFX11-FAKE16-NEXT: scratch_load_b32 v43, off, s32 offset:464
-; GFX11-FAKE16-NEXT: scratch_load_b32 v42, off, s32 offset:468
-; GFX11-FAKE16-NEXT: scratch_load_b32 v41, off, s32 offset:472
-; GFX11-FAKE16-NEXT: scratch_load_b32 v40, off, s32 offset:476
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
-; GFX11-FAKE16-NEXT: .LBB15_4:
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
-; GFX11-FAKE16-NEXT: s_branch .LBB15_2
+; GFX11-LABEL: bitcast_v128i8_to_v32i32_scalar:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_clause 0x1f
+; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:476
+; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:472
+; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:468
+; GFX11-NEXT: scratch_store_b32 off, v43, s32 offset:464
+; GFX11-NEXT: scratch_store_b32 off, v44, s32 offset:460
+; GFX11-NEXT: scratch_store_b32 off, v45, s32 offset:456
+; GFX11-NEXT: scratch_store_b32 off, v46, s32 offset:452
+; GFX11-NEXT: scratch_store_b32 off, v47, s32 offset:448
+; GFX11-NEXT: scratch_store_b32 off, v56, s32 offset:444
+; GFX11-NEXT: scratch_store_b32 off, v57, s32 offset:440
+; GFX11-NEXT: scratch_store_b32 off, v58, s32 offset:436
+; GFX11-NEXT: scratch_store_b32 off, v59, s32 offset:432
+; GFX11-NEXT: scratch_store_b32 off, v60, s32 offset:428
+; GFX11-NEXT: scratch_store_b32 off, v61, s32 offset:424
+; GFX11-NEXT: scratch_store_b32 off, v62, s32 offset:420
+; GFX11-NEXT: scratch_store_b32 off, v63, s32 offset:416
+; GFX11-NEXT: scratch_store_b32 off, v72, s32 offset:412
+; GFX11-NEXT: scratch_store_b32 off, v73, s32 offset:408
+; GFX11-NEXT: scratch_store_b32 off, v74, s32 offset:404
+; GFX11-NEXT: scratch_store_b32 off, v75, s32 offset:400
+; GFX11-NEXT: scratch_store_b32 off, v76, s32 offset:396
+; GFX11-NEXT: scratch_store_b32 off, v77, s32 offset:392
+; GFX11-NEXT: scratch_store_b32 off, v78, s32 offset:388
+; GFX11-NEXT: scratch_store_b32 off, v79, s32 offset:384
+; GFX11-NEXT: scratch_store_b32 off, v88, s32 offset:380
+; GFX11-NEXT: scratch_store_b32 off, v89, s32 offset:376
+; GFX11-NEXT: scratch_store_b32 off, v90, s32 offset:372
+; GFX11-NEXT: scratch_store_b32 off, v91, s32 offset:368
+; GFX11-NEXT: scratch_store_b32 off, v92, s32 offset:364
+; GFX11-NEXT: scratch_store_b32 off, v93, s32 offset:360
+; GFX11-NEXT: scratch_store_b32 off, v94, s32 offset:356
+; GFX11-NEXT: scratch_store_b32 off, v95, s32 offset:352
+; GFX11-NEXT: s_clause 0x7
+; GFX11-NEXT: scratch_store_b32 off, v104, s32 offset:348
+; GFX11-NEXT: scratch_store_b32 off, v105, s32 offset:344
+; GFX11-NEXT: scratch_store_b32 off, v106, s32 offset:340
+; GFX11-NEXT: scratch_store_b32 off, v107, s32 offset:336
+; GFX11-NEXT: scratch_store_b32 off, v108, s32 offset:332
+; GFX11-NEXT: scratch_store_b32 off, v109, s32 offset:328
+; GFX11-NEXT: scratch_store_b32 off, v110, s32 offset:324
+; GFX11-NEXT: scratch_store_b32 off, v111, s32 offset:320
+; GFX11-NEXT: v_dual_mov_b32 v32, v30 :: v_dual_mov_b32 v33, v28
+; GFX11-NEXT: v_dual_mov_b32 v34, v26 :: v_dual_mov_b32 v35, v24
+; GFX11-NEXT: v_dual_mov_b32 v36, v22 :: v_dual_mov_b32 v37, v20
+; GFX11-NEXT: v_dual_mov_b32 v38, v18 :: v_dual_mov_b32 v39, v16
+; GFX11-NEXT: v_dual_mov_b32 v48, v14 :: v_dual_mov_b32 v49, v12
+; GFX11-NEXT: v_dual_mov_b32 v50, v10 :: v_dual_mov_b32 v51, v8
+; GFX11-NEXT: v_dual_mov_b32 v52, v6 :: v_dual_mov_b32 v53, v4
+; GFX11-NEXT: v_dual_mov_b32 v54, v2 :: v_dual_mov_b32 v55, v0
+; GFX11-NEXT: s_clause 0x1f
+; GFX11-NEXT: scratch_load_b32 v0, off, s32 offset:316
+; GFX11-NEXT: scratch_load_u16 v2, off, s32
+; GFX11-NEXT: scratch_load_u16 v4, off, s32 offset:8
+; GFX11-NEXT: scratch_load_u16 v6, off, s32 offset:16
+; GFX11-NEXT: scratch_load_u16 v8, off, s32 offset:24
+; GFX11-NEXT: scratch_load_u16 v10, off, s32 offset:32
+; GFX11-NEXT: scratch_load_u16 v12, off, s32 offset:40
+; GFX11-NEXT: scratch_load_u16 v14, off, s32 offset:48
+; GFX11-NEXT: scratch_load_u16 v16, off, s32 offset:56
+; GFX11-NEXT: scratch_load_u16 v18, off, s32 offset:64
+; GFX11-NEXT: scratch_load_u16 v20, off, s32 offset:72
+; GFX11-NEXT: scratch_load_u16 v22, off, s32 offset:80
+; GFX11-NEXT: scratch_load_u16 v24, off, s32 offset:88
+; GFX11-NEXT: scratch_load_u16 v26, off, s32 offset:96
+; GFX11-NEXT: scratch_load_u16 v28, off, s32 offset:104
+; GFX11-NEXT: scratch_load_u16 v30, off, s32 offset:112
+; GFX11-NEXT: scratch_load_u16 v31, off, s32 offset:120
+; GFX11-NEXT: scratch_load_u16 v87, off, s32 offset:128
+; GFX11-NEXT: scratch_load_u16 v96, off, s32 offset:136
+; GFX11-NEXT: scratch_load_u16 v97, off, s32 offset:144
+; GFX11-NEXT: scratch_load_u16 v98, off, s32 offset:152
+; GFX11-NEXT: scratch_load_u16 v99, off, s32 offset:160
+; GFX11-NEXT: scratch_load_u16 v113, off, s32 offset:168
+; GFX11-NEXT: scratch_load_u16 v114, off, s32 offset:176
+; GFX11-NEXT: scratch_load_u16 v115, off, s32 offset:184
+; GFX11-NEXT: scratch_load_u16 v116, off, s32 offset:192
+; GFX11-NEXT: scratch_load_u16 v117, off, s32 offset:200
+; GFX11-NEXT: scratch_load_u16 v128, off, s32 offset:208
+; GFX11-NEXT: scratch_load_u16 v129, off, s32 offset:216
+; GFX11-NEXT: scratch_load_u16 v130, off, s32 offset:224
+; GFX11-NEXT: scratch_load_u16 v131, off, s32 offset:232
+; GFX11-NEXT: scratch_load_u16 v94, off, s32 offset:240
+; GFX11-NEXT: s_clause 0x1f
+; GFX11-NEXT: scratch_load_u16 v95, off, s32 offset:248
+; GFX11-NEXT: scratch_load_u16 v104, off, s32 offset:256
+; GFX11-NEXT: scratch_load_u16 v105, off, s32 offset:264
+; GFX11-NEXT: scratch_load_u16 v106, off, s32 offset:272
+; GFX11-NEXT: scratch_load_u16 v107, off, s32 offset:280
+; GFX11-NEXT: scratch_load_u16 v108, off, s32 offset:288
+; GFX11-NEXT: scratch_load_u16 v109, off, s32 offset:296
+; GFX11-NEXT: scratch_load_u16 v110, off, s32 offset:304
+; GFX11-NEXT: scratch_load_u16 v111, off, s32 offset:312
+; GFX11-NEXT: scratch_load_u16 v64, off, s32 offset:308
+; GFX11-NEXT: scratch_load_u16 v65, off, s32 offset:300
+; GFX11-NEXT: scratch_load_u16 v66, off, s32 offset:292
+; GFX11-NEXT: scratch_load_u16 v67, off, s32 offset:284
+; GFX11-NEXT: scratch_load_u16 v68, off, s32 offset:276
+; GFX11-NEXT: scratch_load_u16 v69, off, s32 offset:268
+; GFX11-NEXT: scratch_load_u16 v70, off, s32 offset:260
+; GFX11-NEXT: scratch_load_u16 v71, off, s32 offset:252
+; GFX11-NEXT: scratch_load_u16 v80, off, s32 offset:244
+; GFX11-NEXT: scratch_load_u16 v81, off, s32 offset:236
+; GFX11-NEXT: scratch_load_u16 v82, off, s32 offset:228
+; GFX11-NEXT: scratch_load_u16 v83, off, s32 offset:220
+; GFX11-NEXT: scratch_load_u16 v84, off, s32 offset:212
+; GFX11-NEXT: scratch_load_u16 v85, off, s32 offset:204
+; GFX11-NEXT: scratch_load_u16 v86, off, s32 offset:196
+; GFX11-NEXT: scratch_load_u16 v100, off, s32 offset:188
+; GFX11-NEXT: scratch_load_u16 v101, off, s32 offset:180
+; GFX11-NEXT: scratch_load_u16 v102, off, s32 offset:172
+; GFX11-NEXT: scratch_load_u16 v103, off, s32 offset:164
+; GFX11-NEXT: scratch_load_u16 v112, off, s32 offset:156
+; GFX11-NEXT: scratch_load_u16 v118, off, s32 offset:148
+; GFX11-NEXT: scratch_load_u16 v145, off, s32 offset:140
+; GFX11-NEXT: scratch_load_u16 v146, off, s32 offset:132
+; GFX11-NEXT: s_clause 0xf
+; GFX11-NEXT: scratch_load_u16 v147, off, s32 offset:124
+; GFX11-NEXT: scratch_load_u16 v148, off, s32 offset:116
+; GFX11-NEXT: scratch_load_u16 v162, off, s32 offset:108
+; GFX11-NEXT: scratch_load_u16 v163, off, s32 offset:100
+; GFX11-NEXT: scratch_load_u16 v164, off, s32 offset:92
+; GFX11-NEXT: scratch_load_u16 v165, off, s32 offset:84
+; GFX11-NEXT: scratch_load_u16 v166, off, s32 offset:76
+; GFX11-NEXT: scratch_load_u16 v180, off, s32 offset:68
+; GFX11-NEXT: scratch_load_u16 v181, off, s32 offset:60
+; GFX11-NEXT: scratch_load_u16 v182, off, s32 offset:52
+; GFX11-NEXT: scratch_load_u16 v183, off, s32 offset:44
+; GFX11-NEXT: scratch_load_u16 v45, off, s32 offset:36
+; GFX11-NEXT: scratch_load_u16 v46, off, s32 offset:28
+; GFX11-NEXT: scratch_load_u16 v47, off, s32 offset:20
+; GFX11-NEXT: scratch_load_u16 v56, off, s32 offset:12
+; GFX11-NEXT: scratch_load_u16 v62, off, s32 offset:4
+; GFX11-NEXT: v_lshlrev_b32_e32 v89, 8, v1
+; GFX11-NEXT: v_lshlrev_b32_e32 v90, 8, v3
+; GFX11-NEXT: v_lshlrev_b32_e32 v91, 8, v5
+; GFX11-NEXT: v_lshlrev_b32_e32 v92, 8, v7
+; GFX11-NEXT: v_lshlrev_b32_e32 v93, 8, v9
+; GFX11-NEXT: v_lshlrev_b32_e32 v76, 8, v11
+; GFX11-NEXT: v_lshlrev_b32_e32 v77, 8, v13
+; GFX11-NEXT: v_lshlrev_b32_e32 v78, 8, v15
+; GFX11-NEXT: v_lshlrev_b32_e32 v79, 8, v17
+; GFX11-NEXT: v_lshlrev_b32_e32 v88, 8, v19
+; GFX11-NEXT: v_lshlrev_b32_e32 v63, 8, v21
+; GFX11-NEXT: v_lshlrev_b32_e32 v72, 8, v23
+; GFX11-NEXT: v_lshlrev_b32_e32 v73, 8, v25
+; GFX11-NEXT: v_lshlrev_b32_e32 v74, 8, v27
+; GFX11-NEXT: v_lshlrev_b32_e32 v75, 8, v29
+; GFX11-NEXT: s_mov_b32 s4, 0
+; GFX11-NEXT: s_waitcnt vmcnt(62)
+; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v57, 8, v2
+; GFX11-NEXT: v_lshlrev_b32_e32 v58, 8, v4
+; GFX11-NEXT: v_lshlrev_b32_e32 v59, 8, v6
+; GFX11-NEXT: v_lshlrev_b32_e32 v60, 8, v8
+; GFX11-NEXT: v_lshlrev_b32_e32 v61, 8, v10
+; GFX11-NEXT: v_lshlrev_b32_e32 v40, 8, v12
+; GFX11-NEXT: v_lshlrev_b32_e32 v41, 8, v14
+; GFX11-NEXT: v_lshlrev_b32_e32 v42, 8, v16
+; GFX11-NEXT: v_lshlrev_b32_e32 v43, 8, v18
+; GFX11-NEXT: v_lshlrev_b32_e32 v44, 8, v20
+; GFX11-NEXT: v_lshlrev_b32_e32 v167, 8, v22
+; GFX11-NEXT: v_lshlrev_b32_e32 v176, 8, v24
+; GFX11-NEXT: v_lshlrev_b32_e32 v177, 8, v26
+; GFX11-NEXT: v_lshlrev_b32_e32 v178, 8, v28
+; GFX11-NEXT: v_lshlrev_b32_e32 v179, 8, v30
+; GFX11-NEXT: v_lshlrev_b32_e32 v149, 8, v31
+; GFX11-NEXT: v_lshlrev_b32_e32 v150, 8, v87
+; GFX11-NEXT: s_waitcnt vmcnt(61)
+; GFX11-NEXT: v_lshlrev_b32_e32 v151, 8, v96
+; GFX11-NEXT: s_waitcnt vmcnt(60)
+; GFX11-NEXT: v_lshlrev_b32_e32 v160, 8, v97
+; GFX11-NEXT: s_waitcnt vmcnt(59)
+; GFX11-NEXT: v_lshlrev_b32_e32 v161, 8, v98
+; GFX11-NEXT: s_waitcnt vmcnt(58)
+; GFX11-NEXT: v_lshlrev_b32_e32 v132, 8, v99
+; GFX11-NEXT: s_waitcnt vmcnt(57)
+; GFX11-NEXT: v_lshlrev_b32_e32 v133, 8, v113
+; GFX11-NEXT: s_waitcnt vmcnt(56)
+; GFX11-NEXT: v_lshlrev_b32_e32 v134, 8, v114
+; GFX11-NEXT: s_waitcnt vmcnt(55)
+; GFX11-NEXT: v_lshlrev_b32_e32 v135, 8, v115
+; GFX11-NEXT: s_waitcnt vmcnt(54)
+; GFX11-NEXT: v_lshlrev_b32_e32 v144, 8, v116
+; GFX11-NEXT: s_waitcnt vmcnt(53)
+; GFX11-NEXT: v_lshlrev_b32_e32 v119, 8, v117
+; GFX11-NEXT: s_waitcnt vmcnt(52)
+; GFX11-NEXT: v_lshlrev_b32_e32 v128, 8, v128
+; GFX11-NEXT: s_waitcnt vmcnt(51)
+; GFX11-NEXT: v_lshlrev_b32_e32 v129, 8, v129
+; GFX11-NEXT: s_waitcnt vmcnt(50)
+; GFX11-NEXT: v_lshlrev_b32_e32 v130, 8, v130
+; GFX11-NEXT: s_waitcnt vmcnt(49)
+; GFX11-NEXT: v_lshlrev_b32_e32 v131, 8, v131
+; GFX11-NEXT: s_waitcnt vmcnt(48)
+; GFX11-NEXT: v_lshlrev_b32_e32 v113, 8, v94
+; GFX11-NEXT: s_waitcnt vmcnt(47)
+; GFX11-NEXT: v_lshlrev_b32_e32 v114, 8, v95
+; GFX11-NEXT: s_waitcnt vmcnt(46)
+; GFX11-NEXT: v_lshlrev_b32_e32 v115, 8, v104
+; GFX11-NEXT: s_waitcnt vmcnt(45)
+; GFX11-NEXT: v_lshlrev_b32_e32 v116, 8, v105
+; GFX11-NEXT: s_waitcnt vmcnt(44)
+; GFX11-NEXT: v_lshlrev_b32_e32 v117, 8, v106
+; GFX11-NEXT: s_waitcnt vmcnt(43)
+; GFX11-NEXT: v_lshlrev_b32_e32 v87, 8, v107
+; GFX11-NEXT: s_waitcnt vmcnt(42)
+; GFX11-NEXT: v_lshlrev_b32_e32 v96, 8, v108
+; GFX11-NEXT: s_waitcnt vmcnt(41)
+; GFX11-NEXT: v_lshlrev_b32_e32 v97, 8, v109
+; GFX11-NEXT: s_waitcnt vmcnt(40)
+; GFX11-NEXT: v_lshlrev_b32_e32 v98, 8, v110
+; GFX11-NEXT: s_waitcnt vmcnt(39)
+; GFX11-NEXT: v_lshlrev_b32_e32 v99, 8, v111
+; GFX11-NEXT: s_and_b32 s5, vcc_lo, exec_lo
+; GFX11-NEXT: s_cbranch_scc0 .LBB15_4
+; GFX11-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v54
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v53
+; GFX11-NEXT: s_and_b32 s5, s28, 0xff
+; GFX11-NEXT: s_lshl_b32 s6, s29, 8
+; GFX11-NEXT: s_and_b32 s7, s2, 0xff
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v90
+; GFX11-NEXT: v_or_b32_e32 v1, v1, v91
+; GFX11-NEXT: s_or_b32 s5, s5, s6
+; GFX11-NEXT: s_lshl_b32 s6, s1, 8
+; GFX11-NEXT: s_and_b32 s5, s5, 0xffff
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: s_lshl_b32 s8, s3, 8
+; GFX11-NEXT: s_and_b32 s9, s18, 0xff
+; GFX11-NEXT: s_lshl_b32 s10, s19, 8
+; GFX11-NEXT: s_and_b32 s11, s26, 0xff
+; GFX11-NEXT: v_or_b32_e32 v5, v0, v1
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v50
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v49
+; GFX11-NEXT: s_lshl_b32 s12, s27, 8
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v76
+; GFX11-NEXT: v_or_b32_e32 v1, v1, v77
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_or_b32_e32 v7, v0, v1
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v48
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v39
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v78
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_or_b32_e32 v1, v1, v79
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: v_or_b32_e32 v8, v0, v1
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v38
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v37
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v88
+; GFX11-NEXT: v_or_b32_e32 v1, v1, v63
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_or_b32_e32 v9, v0, v1
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v36
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v35
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v72
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_or_b32_e32 v1, v1, v73
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: v_or_b32_e32 v10, v0, v1
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v34
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v33
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v74
+; GFX11-NEXT: v_or_b32_e32 v1, v1, v75
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_or_b32_e32 v11, v0, v1
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v32
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v62
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v57
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_or_b32_e32 v1, v1, v58
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: v_or_b32_e32 v12, v0, v1
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v56
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v47
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v59
+; GFX11-NEXT: v_or_b32_e32 v1, v1, v60
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_or_b32_e32 v13, v0, v1
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v46
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v45
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v61
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_or_b32_e32 v1, v1, v40
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: v_or_b32_e32 v14, v0, v1
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v183
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v182
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v41
+; GFX11-NEXT: v_or_b32_e32 v1, v1, v42
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_or_b32_e32 v15, v0, v1
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v181
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v180
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v43
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_or_b32_e32 v1, v1, v44
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: v_or_b32_e32 v16, v0, v1
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v166
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v165
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v167
+; GFX11-NEXT: v_or_b32_e32 v1, v1, v176
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_or_b32_e32 v17, v0, v1
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v164
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v163
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v177
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_or_b32_e32 v1, v1, v178
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: v_or_b32_e32 v18, v0, v1
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v162
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v148
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v179
+; GFX11-NEXT: v_or_b32_e32 v1, v1, v149
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_or_b32_e32 v19, v0, v1
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v147
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v146
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v150
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_or_b32_e32 v1, v1, v151
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: v_or_b32_e32 v20, v0, v1
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v145
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v118
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v160
+; GFX11-NEXT: v_or_b32_e32 v1, v1, v161
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_or_b32_e32 v21, v0, v1
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v112
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v103
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v132
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_or_b32_e32 v1, v1, v133
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: v_or_b32_e32 v22, v0, v1
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v102
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v101
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v134
+; GFX11-NEXT: v_or_b32_e32 v1, v1, v135
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_or_b32_e32 v23, v0, v1
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v100
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v86
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v144
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_or_b32_e32 v1, v1, v119
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: v_or_b32_e32 v24, v0, v1
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v85
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v84
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v128
+; GFX11-NEXT: v_or_b32_e32 v1, v1, v129
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_or_b32_e32 v25, v0, v1
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v83
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v82
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v130
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_or_b32_e32 v1, v1, v131
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: v_or_b32_e32 v26, v0, v1
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v81
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v80
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v113
+; GFX11-NEXT: v_or_b32_e32 v1, v1, v114
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_or_b32_e32 v27, v0, v1
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v71
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v70
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v115
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_or_b32_e32 v1, v1, v116
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: v_or_b32_e32 v28, v0, v1
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v69
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v68
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v117
+; GFX11-NEXT: v_or_b32_e32 v1, v1, v87
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_or_b32_e32 v29, v0, v1
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v67
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v66
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v96
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_or_b32_e32 v1, v1, v97
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: v_or_b32_e32 v30, v0, v1
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v65
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v64
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v98
+; GFX11-NEXT: v_or_b32_e32 v1, v1, v99
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_or_b32_e32 v31, v0, v1
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v55
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v89
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-NEXT: v_or_b32_e32 v4, s5, v0
+; GFX11-NEXT: s_and_b32 s5, s0, 0xff
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_or_b32 s5, s5, s6
+; GFX11-NEXT: s_or_b32 s6, s7, s8
+; GFX11-NEXT: s_and_b32 s7, s16, 0xff
+; GFX11-NEXT: s_lshl_b32 s8, s17, 8
+; GFX11-NEXT: s_and_b32 s5, s5, 0xffff
+; GFX11-NEXT: s_or_b32 s7, s7, s8
+; GFX11-NEXT: s_or_b32 s8, s9, s10
+; GFX11-NEXT: s_lshl_b32 s6, s6, 16
+; GFX11-NEXT: s_and_b32 s7, s7, 0xffff
+; GFX11-NEXT: s_lshl_b32 s8, s8, 16
+; GFX11-NEXT: s_or_b32 s5, s5, s6
+; GFX11-NEXT: s_or_b32 s6, s7, s8
+; GFX11-NEXT: s_and_b32 s7, s20, 0xff
+; GFX11-NEXT: s_lshl_b32 s8, s21, 8
+; GFX11-NEXT: s_and_b32 s9, s22, 0xff
+; GFX11-NEXT: s_lshl_b32 s10, s23, 8
+; GFX11-NEXT: s_or_b32 s7, s7, s8
+; GFX11-NEXT: s_or_b32 s8, s9, s10
+; GFX11-NEXT: s_and_b32 s9, s24, 0xff
+; GFX11-NEXT: s_lshl_b32 s10, s25, 8
+; GFX11-NEXT: s_and_b32 s7, s7, 0xffff
+; GFX11-NEXT: s_or_b32 s9, s9, s10
+; GFX11-NEXT: s_or_b32 s10, s11, s12
+; GFX11-NEXT: s_lshl_b32 s8, s8, 16
+; GFX11-NEXT: s_and_b32 s9, s9, 0xffff
+; GFX11-NEXT: s_lshl_b32 s10, s10, 16
+; GFX11-NEXT: s_or_b32 s7, s7, s8
+; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v51
+; GFX11-NEXT: s_or_b32 s8, s9, s10
+; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v52
+; GFX11-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, s6
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_or_b32_e32 v3, v3, v93
+; GFX11-NEXT: v_or_b32_e32 v2, v2, v92
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_or_b32_e32 v6, v2, v3
+; GFX11-NEXT: v_dual_mov_b32 v2, s7 :: v_dual_mov_b32 v3, s8
+; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
+; GFX11-NEXT: s_cbranch_vccnz .LBB15_3
+; GFX11-NEXT: .LBB15_2: ; %cmp.true
+; GFX11-NEXT: s_add_i32 s0, s0, 3
+; GFX11-NEXT: s_add_i32 s2, s2, 3
+; GFX11-NEXT: s_and_b32 s0, s0, 0xff
+; GFX11-NEXT: s_lshl_b32 s1, s1, 8
+; GFX11-NEXT: s_and_b32 s2, s2, 0xff
+; GFX11-NEXT: s_lshl_b32 s3, s3, 8
+; GFX11-NEXT: s_or_b32 s0, s1, s0
+; GFX11-NEXT: s_or_b32 s1, s3, s2
+; GFX11-NEXT: s_addk_i32 s0, 0x300
+; GFX11-NEXT: s_addk_i32 s1, 0x300
+; GFX11-NEXT: s_and_b32 s0, s0, 0xffff
+; GFX11-NEXT: s_lshl_b32 s1, s1, 16
+; GFX11-NEXT: s_add_i32 s16, s16, 3
+; GFX11-NEXT: s_or_b32 s0, s0, s1
+; GFX11-NEXT: s_and_b32 s1, s16, 0xff
+; GFX11-NEXT: s_lshl_b32 s2, s17, 8
+; GFX11-NEXT: s_add_i32 s18, s18, 3
+; GFX11-NEXT: s_or_b32 s1, s2, s1
+; GFX11-NEXT: s_and_b32 s2, s18, 0xff
+; GFX11-NEXT: s_lshl_b32 s3, s19, 8
+; GFX11-NEXT: s_addk_i32 s1, 0x300
+; GFX11-NEXT: s_or_b32 s2, s3, s2
+; GFX11-NEXT: s_add_i32 s20, s20, 3
+; GFX11-NEXT: s_addk_i32 s2, 0x300
+; GFX11-NEXT: s_and_b32 s1, s1, 0xffff
+; GFX11-NEXT: s_lshl_b32 s2, s2, 16
+; GFX11-NEXT: s_and_b32 s3, s20, 0xff
+; GFX11-NEXT: s_lshl_b32 s4, s21, 8
+; GFX11-NEXT: s_add_i32 s22, s22, 3
+; GFX11-NEXT: s_or_b32 s1, s1, s2
+; GFX11-NEXT: s_or_b32 s2, s4, s3
+; GFX11-NEXT: s_and_b32 s3, s22, 0xff
+; GFX11-NEXT: s_lshl_b32 s4, s23, 8
+; GFX11-NEXT: s_add_i32 s24, s24, 3
+; GFX11-NEXT: s_or_b32 s3, s4, s3
+; GFX11-NEXT: s_and_b32 s4, s24, 0xff
+; GFX11-NEXT: s_lshl_b32 s5, s25, 8
+; GFX11-NEXT: s_addk_i32 s2, 0x300
+; GFX11-NEXT: s_addk_i32 s3, 0x300
+; GFX11-NEXT: s_or_b32 s4, s5, s4
+; GFX11-NEXT: s_and_b32 s2, s2, 0xffff
+; GFX11-NEXT: s_lshl_b32 s3, s3, 16
+; GFX11-NEXT: s_addk_i32 s4, 0x300
+; GFX11-NEXT: s_add_i32 s26, s26, 3
+; GFX11-NEXT: s_or_b32 s2, s2, s3
+; GFX11-NEXT: s_and_b32 s3, s4, 0xffff
+; GFX11-NEXT: s_and_b32 s4, s26, 0xff
+; GFX11-NEXT: s_lshl_b32 s5, s27, 8
+; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v55
+; GFX11-NEXT: s_or_b32 s4, s5, s4
+; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v54
+; GFX11-NEXT: s_addk_i32 s4, 0x300
+; GFX11-NEXT: v_add_nc_u32_e32 v3, 3, v52
+; GFX11-NEXT: s_lshl_b32 s4, s4, 16
+; GFX11-NEXT: v_add_nc_u32_e32 v4, 3, v51
+; GFX11-NEXT: s_or_b32 s3, s3, s4
+; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v53
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1
+; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v3
+; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v4
+; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v2
+; GFX11-NEXT: v_add_nc_u32_e32 v7, 3, v38
+; GFX11-NEXT: v_add_nc_u32_e32 v11, 3, v34
+; GFX11-NEXT: v_add_nc_u32_e32 v12, 3, v33
+; GFX11-NEXT: v_or_b32_e32 v0, v89, v0
+; GFX11-NEXT: v_or_b32_e32 v1, v90, v1
+; GFX11-NEXT: v_or_b32_e32 v2, v91, v2
+; GFX11-NEXT: v_or_b32_e32 v3, v92, v3
+; GFX11-NEXT: v_or_b32_e32 v4, v93, v4
+; GFX11-NEXT: v_and_b32_e32 v7, 0xff, v7
+; GFX11-NEXT: v_and_b32_e32 v11, 0xff, v11
+; GFX11-NEXT: v_and_b32_e32 v12, 0xff, v12
+; GFX11-NEXT: s_add_i32 s28, s28, 3
+; GFX11-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
+; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
+; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
+; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x300, v3
+; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x300, v4
+; GFX11-NEXT: v_or_b32_e32 v7, v88, v7
+; GFX11-NEXT: v_or_b32_e32 v11, v74, v11
+; GFX11-NEXT: v_or_b32_e32 v12, v75, v12
+; GFX11-NEXT: s_and_b32 s5, s28, 0xff
+; GFX11-NEXT: s_lshl_b32 s6, s29, 8
+; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-NEXT: s_or_b32 s5, s6, s5
+; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v4
+; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x300, v7
+; GFX11-NEXT: v_add_nc_u32_e32 v11, 0x300, v11
+; GFX11-NEXT: v_add_nc_u32_e32 v12, 0x300, v12
+; GFX11-NEXT: s_addk_i32 s5, 0x300
+; GFX11-NEXT: v_or_b32_e32 v5, v1, v2
+; GFX11-NEXT: s_and_b32 s5, s5, 0xffff
+; GFX11-NEXT: v_or_b32_e32 v6, v3, v6
+; GFX11-NEXT: v_or_b32_e32 v4, s5, v0
+; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v50
+; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v49
+; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v48
+; GFX11-NEXT: v_add_nc_u32_e32 v3, 3, v39
+; GFX11-NEXT: v_and_b32_e32 v9, 0xffff, v7
+; GFX11-NEXT: v_add_nc_u32_e32 v7, 3, v37
+; GFX11-NEXT: v_add_nc_u32_e32 v8, 3, v36
+; GFX11-NEXT: v_add_nc_u32_e32 v10, 3, v35
+; GFX11-NEXT: v_and_b32_e32 v11, 0xffff, v11
+; GFX11-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1
+; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v2
+; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v3
+; GFX11-NEXT: v_and_b32_e32 v7, 0xff, v7
+; GFX11-NEXT: v_and_b32_e32 v8, 0xff, v8
+; GFX11-NEXT: v_and_b32_e32 v10, 0xff, v10
+; GFX11-NEXT: v_or_b32_e32 v11, v11, v12
+; GFX11-NEXT: s_waitcnt vmcnt(3)
+; GFX11-NEXT: v_add_nc_u32_e32 v12, 3, v46
+; GFX11-NEXT: v_add_nc_u32_e32 v16, 3, v181
+; GFX11-NEXT: v_add_nc_u32_e32 v17, 3, v180
+; GFX11-NEXT: v_or_b32_e32 v0, v76, v0
+; GFX11-NEXT: v_or_b32_e32 v1, v77, v1
+; GFX11-NEXT: v_or_b32_e32 v2, v78, v2
+; GFX11-NEXT: v_or_b32_e32 v3, v79, v3
+; GFX11-NEXT: v_or_b32_e32 v7, v63, v7
+; GFX11-NEXT: v_or_b32_e32 v8, v72, v8
+; GFX11-NEXT: v_or_b32_e32 v10, v73, v10
+; GFX11-NEXT: v_and_b32_e32 v12, 0xff, v12
+; GFX11-NEXT: v_and_b32_e32 v16, 0xff, v16
+; GFX11-NEXT: v_and_b32_e32 v17, 0xff, v17
+; GFX11-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
+; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
+; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
+; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x300, v3
+; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x300, v7
+; GFX11-NEXT: v_add_nc_u32_e32 v8, 0x300, v8
+; GFX11-NEXT: v_add_nc_u32_e32 v10, 0x300, v10
+; GFX11-NEXT: v_or_b32_e32 v12, v61, v12
+; GFX11-NEXT: v_or_b32_e32 v16, v43, v16
+; GFX11-NEXT: v_or_b32_e32 v17, v44, v17
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-NEXT: v_lshlrev_b32_e32 v13, 16, v7
+; GFX11-NEXT: v_and_b32_e32 v14, 0xffff, v8
+; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; GFX11-NEXT: v_add_nc_u32_e32 v12, 0x300, v12
+; GFX11-NEXT: v_add_nc_u32_e32 v16, 0x300, v16
+; GFX11-NEXT: v_add_nc_u32_e32 v17, 0x300, v17
+; GFX11-NEXT: v_or_b32_e32 v7, v0, v1
+; GFX11-NEXT: v_or_b32_e32 v8, v2, v3
+; GFX11-NEXT: v_or_b32_e32 v9, v9, v13
+; GFX11-NEXT: v_or_b32_e32 v10, v14, v10
+; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v32
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v62
+; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v56
+; GFX11-NEXT: v_add_nc_u32_e32 v3, 3, v47
+; GFX11-NEXT: v_and_b32_e32 v14, 0xffff, v12
+; GFX11-NEXT: v_add_nc_u32_e32 v12, 3, v45
+; GFX11-NEXT: v_add_nc_u32_e32 v13, 3, v183
+; GFX11-NEXT: v_add_nc_u32_e32 v15, 3, v182
+; GFX11-NEXT: v_and_b32_e32 v16, 0xffff, v16
+; GFX11-NEXT: v_lshlrev_b32_e32 v17, 16, v17
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1
+; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v2
+; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v3
+; GFX11-NEXT: v_and_b32_e32 v12, 0xff, v12
+; GFX11-NEXT: v_and_b32_e32 v13, 0xff, v13
+; GFX11-NEXT: v_and_b32_e32 v15, 0xff, v15
+; GFX11-NEXT: v_or_b32_e32 v16, v16, v17
+; GFX11-NEXT: v_add_nc_u32_e32 v17, 3, v162
+; GFX11-NEXT: v_add_nc_u32_e32 v21, 3, v145
+; GFX11-NEXT: v_add_nc_u32_e32 v22, 3, v118
+; GFX11-NEXT: v_or_b32_e32 v0, v57, v0
+; GFX11-NEXT: v_or_b32_e32 v1, v58, v1
+; GFX11-NEXT: v_or_b32_e32 v2, v59, v2
+; GFX11-NEXT: v_or_b32_e32 v3, v60, v3
+; GFX11-NEXT: v_or_b32_e32 v12, v40, v12
+; GFX11-NEXT: v_or_b32_e32 v13, v41, v13
+; GFX11-NEXT: v_or_b32_e32 v15, v42, v15
+; GFX11-NEXT: v_and_b32_e32 v17, 0xff, v17
+; GFX11-NEXT: v_and_b32_e32 v21, 0xff, v21
+; GFX11-NEXT: v_and_b32_e32 v22, 0xff, v22
+; GFX11-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
+; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
+; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
+; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x300, v3
+; GFX11-NEXT: v_add_nc_u32_e32 v12, 0x300, v12
+; GFX11-NEXT: v_add_nc_u32_e32 v13, 0x300, v13
+; GFX11-NEXT: v_add_nc_u32_e32 v15, 0x300, v15
+; GFX11-NEXT: v_or_b32_e32 v17, v179, v17
+; GFX11-NEXT: v_or_b32_e32 v21, v160, v21
+; GFX11-NEXT: v_or_b32_e32 v22, v161, v22
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v12
+; GFX11-NEXT: v_and_b32_e32 v19, 0xffff, v13
+; GFX11-NEXT: v_lshlrev_b32_e32 v15, 16, v15
+; GFX11-NEXT: v_add_nc_u32_e32 v17, 0x300, v17
+; GFX11-NEXT: v_add_nc_u32_e32 v21, 0x300, v21
+; GFX11-NEXT: v_add_nc_u32_e32 v22, 0x300, v22
+; GFX11-NEXT: v_or_b32_e32 v12, v0, v1
+; GFX11-NEXT: v_or_b32_e32 v13, v2, v3
+; GFX11-NEXT: v_or_b32_e32 v14, v14, v18
+; GFX11-NEXT: v_or_b32_e32 v15, v19, v15
+; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v166
+; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v165
+; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v164
+; GFX11-NEXT: v_add_nc_u32_e32 v3, 3, v163
+; GFX11-NEXT: v_and_b32_e32 v19, 0xffff, v17
+; GFX11-NEXT: v_add_nc_u32_e32 v17, 3, v148
+; GFX11-NEXT: v_add_nc_u32_e32 v18, 3, v147
+; GFX11-NEXT: v_add_nc_u32_e32 v20, 3, v146
+; GFX11-NEXT: v_and_b32_e32 v21, 0xffff, v21
+; GFX11-NEXT: v_lshlrev_b32_e32 v22, 16, v22
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1
+; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v2
+; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v3
+; GFX11-NEXT: v_and_b32_e32 v17, 0xff, v17
+; GFX11-NEXT: v_and_b32_e32 v18, 0xff, v18
+; GFX11-NEXT: v_and_b32_e32 v20, 0xff, v20
+; GFX11-NEXT: v_or_b32_e32 v21, v21, v22
+; GFX11-NEXT: v_add_nc_u32_e32 v22, 3, v100
+; GFX11-NEXT: v_add_nc_u32_e32 v26, 3, v83
+; GFX11-NEXT: v_add_nc_u32_e32 v27, 3, v82
+; GFX11-NEXT: v_or_b32_e32 v0, v167, v0
+; GFX11-NEXT: v_or_b32_e32 v1, v176, v1
+; GFX11-NEXT: v_or_b32_e32 v2, v177, v2
+; GFX11-NEXT: v_or_b32_e32 v3, v178, v3
+; GFX11-NEXT: v_or_b32_e32 v17, v149, v17
+; GFX11-NEXT: v_or_b32_e32 v18, v150, v18
+; GFX11-NEXT: v_or_b32_e32 v20, v151, v20
+; GFX11-NEXT: v_and_b32_e32 v22, 0xff, v22
+; GFX11-NEXT: v_and_b32_e32 v26, 0xff, v26
+; GFX11-NEXT: v_and_b32_e32 v27, 0xff, v27
+; GFX11-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
+; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
+; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
+; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x300, v3
+; GFX11-NEXT: v_add_nc_u32_e32 v17, 0x300, v17
+; GFX11-NEXT: v_add_nc_u32_e32 v18, 0x300, v18
+; GFX11-NEXT: v_add_nc_u32_e32 v20, 0x300, v20
+; GFX11-NEXT: v_or_b32_e32 v22, v144, v22
+; GFX11-NEXT: v_or_b32_e32 v26, v130, v26
+; GFX11-NEXT: v_or_b32_e32 v27, v131, v27
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-NEXT: v_lshlrev_b32_e32 v23, 16, v17
+; GFX11-NEXT: v_and_b32_e32 v24, 0xffff, v18
+; GFX11-NEXT: v_lshlrev_b32_e32 v20, 16, v20
+; GFX11-NEXT: v_add_nc_u32_e32 v22, 0x300, v22
+; GFX11-NEXT: v_add_nc_u32_e32 v26, 0x300, v26
+; GFX11-NEXT: v_add_nc_u32_e32 v27, 0x300, v27
+; GFX11-NEXT: v_or_b32_e32 v17, v0, v1
+; GFX11-NEXT: v_or_b32_e32 v18, v2, v3
+; GFX11-NEXT: v_or_b32_e32 v19, v19, v23
+; GFX11-NEXT: v_or_b32_e32 v20, v24, v20
+; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v112
+; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v103
+; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v102
+; GFX11-NEXT: v_add_nc_u32_e32 v3, 3, v101
+; GFX11-NEXT: v_and_b32_e32 v24, 0xffff, v22
+; GFX11-NEXT: v_add_nc_u32_e32 v22, 3, v86
+; GFX11-NEXT: v_add_nc_u32_e32 v23, 3, v85
+; GFX11-NEXT: v_add_nc_u32_e32 v25, 3, v84
+; GFX11-NEXT: v_and_b32_e32 v26, 0xffff, v26
+; GFX11-NEXT: v_lshlrev_b32_e32 v27, 16, v27
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1
+; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v2
+; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v3
+; GFX11-NEXT: v_and_b32_e32 v22, 0xff, v22
+; GFX11-NEXT: v_and_b32_e32 v23, 0xff, v23
+; GFX11-NEXT: v_and_b32_e32 v25, 0xff, v25
+; GFX11-NEXT: v_or_b32_e32 v26, v26, v27
+; GFX11-NEXT: v_add_nc_u32_e32 v27, 3, v69
+; GFX11-NEXT: v_or_b32_e32 v0, v132, v0
+; GFX11-NEXT: v_or_b32_e32 v1, v133, v1
+; GFX11-NEXT: v_or_b32_e32 v2, v134, v2
+; GFX11-NEXT: v_or_b32_e32 v3, v135, v3
+; GFX11-NEXT: v_or_b32_e32 v22, v119, v22
+; GFX11-NEXT: v_or_b32_e32 v23, v128, v23
+; GFX11-NEXT: v_or_b32_e32 v25, v129, v25
+; GFX11-NEXT: v_and_b32_e32 v27, 0xff, v27
+; GFX11-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
+; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
+; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
+; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x300, v3
+; GFX11-NEXT: v_add_nc_u32_e32 v22, 0x300, v22
+; GFX11-NEXT: v_add_nc_u32_e32 v23, 0x300, v23
+; GFX11-NEXT: v_add_nc_u32_e32 v25, 0x300, v25
+; GFX11-NEXT: v_or_b32_e32 v27, v117, v27
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-NEXT: v_lshlrev_b32_e32 v28, 16, v22
+; GFX11-NEXT: v_and_b32_e32 v29, 0xffff, v23
+; GFX11-NEXT: v_lshlrev_b32_e32 v25, 16, v25
+; GFX11-NEXT: v_add_nc_u32_e32 v27, 0x300, v27
+; GFX11-NEXT: v_or_b32_e32 v22, v0, v1
+; GFX11-NEXT: v_or_b32_e32 v23, v2, v3
+; GFX11-NEXT: v_or_b32_e32 v24, v24, v28
+; GFX11-NEXT: v_or_b32_e32 v25, v29, v25
+; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v81
+; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v80
+; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v71
+; GFX11-NEXT: v_add_nc_u32_e32 v3, 3, v70
+; GFX11-NEXT: v_and_b32_e32 v29, 0xffff, v27
+; GFX11-NEXT: v_add_nc_u32_e32 v27, 3, v68
+; GFX11-NEXT: v_add_nc_u32_e32 v28, 3, v67
+; GFX11-NEXT: v_add_nc_u32_e32 v30, 3, v66
+; GFX11-NEXT: v_add_nc_u32_e32 v31, 3, v65
+; GFX11-NEXT: v_add_nc_u32_e32 v32, 3, v64
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1
+; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v2
+; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v3
+; GFX11-NEXT: v_and_b32_e32 v27, 0xff, v27
+; GFX11-NEXT: v_and_b32_e32 v28, 0xff, v28
+; GFX11-NEXT: v_and_b32_e32 v30, 0xff, v30
+; GFX11-NEXT: v_and_b32_e32 v31, 0xff, v31
+; GFX11-NEXT: v_and_b32_e32 v32, 0xff, v32
+; GFX11-NEXT: v_or_b32_e32 v0, v113, v0
+; GFX11-NEXT: v_or_b32_e32 v1, v114, v1
+; GFX11-NEXT: v_or_b32_e32 v2, v115, v2
+; GFX11-NEXT: v_or_b32_e32 v3, v116, v3
+; GFX11-NEXT: v_or_b32_e32 v27, v87, v27
+; GFX11-NEXT: v_or_b32_e32 v28, v96, v28
+; GFX11-NEXT: v_or_b32_e32 v30, v97, v30
+; GFX11-NEXT: v_or_b32_e32 v31, v98, v31
+; GFX11-NEXT: v_or_b32_e32 v32, v99, v32
+; GFX11-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
+; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
+; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
+; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x300, v3
+; GFX11-NEXT: v_add_nc_u32_e32 v27, 0x300, v27
+; GFX11-NEXT: v_add_nc_u32_e32 v28, 0x300, v28
+; GFX11-NEXT: v_add_nc_u32_e32 v30, 0x300, v30
+; GFX11-NEXT: v_add_nc_u32_e32 v31, 0x300, v31
+; GFX11-NEXT: v_add_nc_u32_e32 v32, 0x300, v32
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-NEXT: v_lshlrev_b32_e32 v33, 16, v27
+; GFX11-NEXT: v_and_b32_e32 v34, 0xffff, v28
+; GFX11-NEXT: v_lshlrev_b32_e32 v30, 16, v30
+; GFX11-NEXT: v_and_b32_e32 v31, 0xffff, v31
+; GFX11-NEXT: v_lshlrev_b32_e32 v32, 16, v32
+; GFX11-NEXT: v_or_b32_e32 v27, v0, v1
+; GFX11-NEXT: v_or_b32_e32 v28, v2, v3
+; GFX11-NEXT: v_or_b32_e32 v29, v29, v33
+; GFX11-NEXT: v_or_b32_e32 v30, v34, v30
+; GFX11-NEXT: v_or_b32_e32 v31, v31, v32
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-NEXT: .LBB15_3: ; %end
+; GFX11-NEXT: s_clause 0x1f
+; GFX11-NEXT: scratch_load_b32 v111, off, s32 offset:320
+; GFX11-NEXT: scratch_load_b32 v110, off, s32 offset:324
+; GFX11-NEXT: scratch_load_b32 v109, off, s32 offset:328
+; GFX11-NEXT: scratch_load_b32 v108, off, s32 offset:332
+; GFX11-NEXT: scratch_load_b32 v107, off, s32 offset:336
+; GFX11-NEXT: scratch_load_b32 v106, off, s32 offset:340
+; GFX11-NEXT: scratch_load_b32 v105, off, s32 offset:344
+; GFX11-NEXT: scratch_load_b32 v104, off, s32 offset:348
+; GFX11-NEXT: scratch_load_b32 v95, off, s32 offset:352
+; GFX11-NEXT: scratch_load_b32 v94, off, s32 offset:356
+; GFX11-NEXT: scratch_load_b32 v93, off, s32 offset:360
+; GFX11-NEXT: scratch_load_b32 v92, off, s32 offset:364
+; GFX11-NEXT: scratch_load_b32 v91, off, s32 offset:368
+; GFX11-NEXT: scratch_load_b32 v90, off, s32 offset:372
+; GFX11-NEXT: scratch_load_b32 v89, off, s32 offset:376
+; GFX11-NEXT: scratch_load_b32 v88, off, s32 offset:380
+; GFX11-NEXT: scratch_load_b32 v79, off, s32 offset:384
+; GFX11-NEXT: scratch_load_b32 v78, off, s32 offset:388
+; GFX11-NEXT: scratch_load_b32 v77, off, s32 offset:392
+; GFX11-NEXT: scratch_load_b32 v76, off, s32 offset:396
+; GFX11-NEXT: scratch_load_b32 v75, off, s32 offset:400
+; GFX11-NEXT: scratch_load_b32 v74, off, s32 offset:404
+; GFX11-NEXT: scratch_load_b32 v73, off, s32 offset:408
+; GFX11-NEXT: scratch_load_b32 v72, off, s32 offset:412
+; GFX11-NEXT: scratch_load_b32 v63, off, s32 offset:416
+; GFX11-NEXT: scratch_load_b32 v62, off, s32 offset:420
+; GFX11-NEXT: scratch_load_b32 v61, off, s32 offset:424
+; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:428
+; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:432
+; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:436
+; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:440
+; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:444
+; GFX11-NEXT: s_clause 0x7
+; GFX11-NEXT: scratch_load_b32 v47, off, s32 offset:448
+; GFX11-NEXT: scratch_load_b32 v46, off, s32 offset:452
+; GFX11-NEXT: scratch_load_b32 v45, off, s32 offset:456
+; GFX11-NEXT: scratch_load_b32 v44, off, s32 offset:460
+; GFX11-NEXT: scratch_load_b32 v43, off, s32 offset:464
+; GFX11-NEXT: scratch_load_b32 v42, off, s32 offset:468
+; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:472
+; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:476
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-NEXT: .LBB15_4:
+; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+; GFX11-NEXT: s_branch .LBB15_2
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -52038,876 +51175,913 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) {
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: s_clause 0x1f
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:384
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:380
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:376
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:372
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v50, off, s32 offset:368
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:364
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v51, off, s32 offset:360
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v33, off, s32 offset:356
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v51, off, s32 offset:352
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:348
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v52, off, s32 offset:344
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v34, off, s32 offset:340
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v52, off, s32 offset:336
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:332
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v53, off, s32 offset:328
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v35, off, s32 offset:324
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v53, off, s32 offset:320
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v36, off, s32 offset:316
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v54, off, s32 offset:312
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v36, off, s32 offset:308
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v54, off, s32 offset:304
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v37, off, s32 offset:300
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v55, off, s32 offset:296
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v37, off, s32 offset:292
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v64, off, s32 offset:288
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:284
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v64, off, s32 offset:280
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v38, off, s32 offset:276
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v66, off, s32 offset:272
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v39, off, s32 offset:268
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v66, off, s32 offset:264
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v39, off, s32 offset:260
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:536
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:532
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:528
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v43, s32 offset:524
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v44, s32 offset:520
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v45, s32 offset:516
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v46, s32 offset:512
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v47, s32 offset:508
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v56, s32 offset:504
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v57, s32 offset:500
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v58, s32 offset:496
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v59, s32 offset:492
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v60, s32 offset:488
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v61, s32 offset:484
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v62, s32 offset:480
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v63, s32 offset:476
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v72, s32 offset:472
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v73, s32 offset:468
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v74, s32 offset:464
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v75, s32 offset:460
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v76, s32 offset:456
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v77, s32 offset:452
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v78, s32 offset:448
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v79, s32 offset:444
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v88, s32 offset:440
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v89, s32 offset:436
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v90, s32 offset:432
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v91, s32 offset:428
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v92, s32 offset:424
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:420
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:416
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:412
+; GFX11-TRUE16-NEXT: s_clause 0x4
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:408
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:404
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:400
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v107, s32 offset:396
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v108, s32 offset:392
; GFX11-TRUE16-NEXT: s_clause 0x1f
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v67, off, s32 offset:256
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v48, off, s32 offset:252
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v68, off, s32 offset:248
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v48, off, s32 offset:244
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v70, off, s32 offset:240
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v49, off, s32 offset:236
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v70, off, s32 offset:232
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v49, off, s32 offset:228
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v71, off, s32 offset:224
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v50, off, s32 offset:220
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v71, off, s32 offset:216
-; GFX11-TRUE16-NEXT: scratch_load_b32 v114, off, s32 offset:388
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v81, off, s32
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v82, off, s32 offset:8
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v83, off, s32 offset:16
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v83, off, s32 offset:24
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v85, off, s32 offset:32
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v85, off, s32 offset:40
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v87, off, s32 offset:48
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v87, off, s32 offset:56
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v97, off, s32 offset:64
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v97, off, s32 offset:72
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v98, off, s32 offset:80
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v99, off, s32 offset:88
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v100, off, s32 offset:96
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v101, off, s32 offset:104
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v102, off, s32 offset:112
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v160, off, s32 offset:120
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v160, off, s32 offset:128
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v161, off, s32 offset:136
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v161, off, s32 offset:144
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v162, off, s32 offset:152
+; GFX11-TRUE16-NEXT: scratch_load_u16 v31, off, s32 offset:384
+; GFX11-TRUE16-NEXT: scratch_load_u16 v112, off, s32 offset:380
+; GFX11-TRUE16-NEXT: scratch_load_u16 v32, off, s32 offset:376
+; GFX11-TRUE16-NEXT: scratch_load_u16 v113, off, s32 offset:372
+; GFX11-TRUE16-NEXT: scratch_load_u16 v56, off, s32 offset:368
+; GFX11-TRUE16-NEXT: scratch_load_u16 v114, off, s32 offset:364
+; GFX11-TRUE16-NEXT: scratch_load_u16 v33, off, s32 offset:360
+; GFX11-TRUE16-NEXT: scratch_load_u16 v115, off, s32 offset:356
+; GFX11-TRUE16-NEXT: scratch_load_u16 v57, off, s32 offset:352
+; GFX11-TRUE16-NEXT: scratch_load_u16 v116, off, s32 offset:348
+; GFX11-TRUE16-NEXT: scratch_load_u16 v34, off, s32 offset:344
+; GFX11-TRUE16-NEXT: scratch_load_u16 v117, off, s32 offset:340
+; GFX11-TRUE16-NEXT: scratch_load_u16 v58, off, s32 offset:336
+; GFX11-TRUE16-NEXT: scratch_load_u16 v118, off, s32 offset:332
+; GFX11-TRUE16-NEXT: scratch_load_u16 v35, off, s32 offset:328
+; GFX11-TRUE16-NEXT: scratch_load_u16 v119, off, s32 offset:324
+; GFX11-TRUE16-NEXT: scratch_load_u16 v59, off, s32 offset:320
+; GFX11-TRUE16-NEXT: scratch_load_u16 v128, off, s32 offset:316
+; GFX11-TRUE16-NEXT: scratch_load_u16 v36, off, s32 offset:312
+; GFX11-TRUE16-NEXT: scratch_load_u16 v129, off, s32 offset:308
+; GFX11-TRUE16-NEXT: scratch_load_u16 v60, off, s32 offset:304
+; GFX11-TRUE16-NEXT: scratch_load_u16 v130, off, s32 offset:300
+; GFX11-TRUE16-NEXT: scratch_load_u16 v37, off, s32 offset:296
+; GFX11-TRUE16-NEXT: scratch_load_u16 v131, off, s32 offset:292
+; GFX11-TRUE16-NEXT: scratch_load_u16 v61, off, s32 offset:288
+; GFX11-TRUE16-NEXT: scratch_load_u16 v132, off, s32 offset:284
+; GFX11-TRUE16-NEXT: scratch_load_u16 v38, off, s32 offset:280
+; GFX11-TRUE16-NEXT: scratch_load_u16 v133, off, s32 offset:276
+; GFX11-TRUE16-NEXT: scratch_load_u16 v62, off, s32 offset:272
+; GFX11-TRUE16-NEXT: scratch_load_u16 v134, off, s32 offset:268
+; GFX11-TRUE16-NEXT: scratch_load_u16 v39, off, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_load_u16 v135, off, s32 offset:260
; GFX11-TRUE16-NEXT: s_clause 0x1f
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v162, off, s32 offset:160
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v163, off, s32 offset:168
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v163, off, s32 offset:176
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v164, off, s32 offset:184
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v164, off, s32 offset:192
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v165, off, s32 offset:200
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v165, off, s32 offset:208
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v55, off, s32 offset:212
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v65, off, s32 offset:204
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v65, off, s32 offset:196
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v67, off, s32 offset:188
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v68, off, s32 offset:180
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v69, off, s32 offset:172
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v69, off, s32 offset:164
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v80, off, s32 offset:156
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v80, off, s32 offset:148
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v81, off, s32 offset:140
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v82, off, s32 offset:132
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v84, off, s32 offset:124
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v84, off, s32 offset:116
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v86, off, s32 offset:108
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v86, off, s32 offset:100
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v96, off, s32 offset:92
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v96, off, s32 offset:84
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v98, off, s32 offset:76
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v99, off, s32 offset:68
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v100, off, s32 offset:60
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v101, off, s32 offset:52
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v103, off, s32 offset:44
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v103, off, s32 offset:36
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v112, off, s32 offset:28
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v113, off, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_load_u16 v63, off, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_load_u16 v144, off, s32 offset:252
+; GFX11-TRUE16-NEXT: scratch_load_u16 v48, off, s32 offset:248
+; GFX11-TRUE16-NEXT: scratch_load_u16 v145, off, s32 offset:244
+; GFX11-TRUE16-NEXT: scratch_load_u16 v72, off, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_load_u16 v146, off, s32 offset:236
+; GFX11-TRUE16-NEXT: scratch_load_u16 v49, off, s32 offset:232
+; GFX11-TRUE16-NEXT: scratch_load_u16 v147, off, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_load_u16 v73, off, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_load_u16 v148, off, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_load_u16 v50, off, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_load_b32 v51, off, s32 offset:388
+; GFX11-TRUE16-NEXT: scratch_load_u16 v52, off, s32
+; GFX11-TRUE16-NEXT: scratch_load_u16 v53, off, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_load_u16 v54, off, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_load_u16 v55, off, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_load_u16 v64, off, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_load_u16 v65, off, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_load_u16 v66, off, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_load_u16 v67, off, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_load_u16 v74, off, s32 offset:64
+; GFX11-TRUE16-NEXT: scratch_load_u16 v75, off, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_load_u16 v76, off, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_load_u16 v77, off, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_load_u16 v78, off, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_load_u16 v79, off, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_load_u16 v88, off, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_load_u16 v89, off, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_load_u16 v90, off, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_load_u16 v91, off, s32 offset:136
+; GFX11-TRUE16-NEXT: scratch_load_u16 v92, off, s32 offset:144
+; GFX11-TRUE16-NEXT: scratch_load_u16 v93, off, s32 offset:152
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_load_u16 v94, off, s32 offset:160
+; GFX11-TRUE16-NEXT: scratch_load_u16 v95, off, s32 offset:168
+; GFX11-TRUE16-NEXT: scratch_load_u16 v104, off, s32 offset:176
+; GFX11-TRUE16-NEXT: scratch_load_u16 v105, off, s32 offset:184
+; GFX11-TRUE16-NEXT: scratch_load_u16 v106, off, s32 offset:192
+; GFX11-TRUE16-NEXT: scratch_load_u16 v107, off, s32 offset:200
+; GFX11-TRUE16-NEXT: scratch_load_u16 v108, off, s32 offset:208
+; GFX11-TRUE16-NEXT: scratch_load_u16 v149, off, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_load_u16 v150, off, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_load_u16 v151, off, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_load_u16 v160, off, s32 offset:188
+; GFX11-TRUE16-NEXT: scratch_load_u16 v161, off, s32 offset:180
+; GFX11-TRUE16-NEXT: scratch_load_u16 v162, off, s32 offset:172
+; GFX11-TRUE16-NEXT: scratch_load_u16 v163, off, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_load_u16 v164, off, s32 offset:156
+; GFX11-TRUE16-NEXT: scratch_load_u16 v165, off, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_load_u16 v166, off, s32 offset:140
+; GFX11-TRUE16-NEXT: scratch_load_u16 v167, off, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_load_u16 v176, off, s32 offset:124
+; GFX11-TRUE16-NEXT: scratch_load_u16 v177, off, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_load_u16 v178, off, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_load_u16 v179, off, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_load_u16 v180, off, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_load_u16 v181, off, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_load_u16 v182, off, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_load_u16 v183, off, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_load_u16 v40, off, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_load_u16 v41, off, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_load_u16 v42, off, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_load_u16 v43, off, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_load_u16 v44, off, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_load_u16 v45, off, s32 offset:20
; GFX11-TRUE16-NEXT: s_clause 0x1
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v115, off, s32 offset:12
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v115, off, s32 offset:4
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v117.l, v30.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v118.h, v28.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v119.l, v26.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v119.h, v24.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v130.l, v22.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v130.h, v20.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v131.l, v18.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v132.h, v16.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v134.l, v14.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v134.h, v12.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v144.l, v10.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v145.h, v8.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v146.h, v6.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v146.l, v4.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v149.l, v2.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v149.h, v0.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v151.l, 8, v1.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v151.h, 8, v3.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v150.l, 8, v5.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v150.h, 8, v7.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v148.l, 8, v9.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v148.h, 8, v11.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v147.l, 8, v13.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v147.h, 8, v15.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v144.h, 8, v17.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v145.l, 8, v19.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v135.l, 8, v21.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v135.h, 8, v23.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v133.l, 8, v25.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v133.h, 8, v27.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v131.h, 8, v29.l
+; GFX11-TRUE16-NEXT: scratch_load_u16 v46, off, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_load_u16 v47, off, s32 offset:4
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v69.l, v30.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v70.h, v28.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v71.l, v26.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v71.h, v24.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v82.l, v22.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v82.h, v20.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v83.l, v18.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v84.h, v16.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v86.l, v14.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v86.h, v12.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v96.l, v10.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v97.h, v8.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v98.h, v6.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v98.l, v4.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v101.l, v2.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v101.h, v0.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v103.l, 8, v1.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v103.h, 8, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v102.l, 8, v5.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v102.h, 8, v7.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v100.l, 8, v9.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v100.h, 8, v11.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v99.l, 8, v13.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v99.h, 8, v15.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v96.h, 8, v17.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v97.l, 8, v19.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v87.l, 8, v21.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v87.h, 8, v23.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v85.l, 8, v25.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v85.h, 8, v27.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v83.h, 8, v29.l
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(62)
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v32.h, 8, v31.l
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v32.l, 8, v32.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v56.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.l, 8, v33.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v57.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.l, 8, v34.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v35.h, 8, v58.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v35.l, 8, v35.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v36.h, 8, v59.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v36.l, 8, v36.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v37.h, 8, v60.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v37.l, 8, v37.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v38.h, 8, v61.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v38.l, 8, v38.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v39.h, 8, v62.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v39.l, 8, v39.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v48.h, 8, v63.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v48.l, 8, v48.l
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(59)
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v49.h, 8, v72.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v49.l, 8, v49.l
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(55)
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.h, 8, v73.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.l, 8, v50.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(54)
-; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v114
+; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v51
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(53)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v132.l, 8, v81.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v84.l, 8, v52.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(52)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.l, 8, v82.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v81.l, 8, v53.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(51)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.h, 8, v83.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v81.h, 8, v54.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(50)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v128.l, 8, v83.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v80.l, 8, v55.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(49)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v128.h, 8, v85.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v80.h, 8, v64.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(48)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v117.h, 8, v85.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v69.h, 8, v65.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(47)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v118.l, 8, v87.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v70.l, 8, v66.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(46)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v116.l, 8, v87.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v68.l, 8, v67.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(45)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v116.h, 8, v97.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v68.h, 8, v74.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(44)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v114.l, 8, v97.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v67.l, 8, v75.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(43)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v114.h, 8, v98.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v67.h, 8, v76.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(42)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v112.h, 8, v99.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v66.l, 8, v77.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(41)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v113.l, 8, v100.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v66.h, 8, v78.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(40)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v102.l, 8, v101.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v65.l, 8, v79.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(39)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v102.h, 8, v102.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v65.h, 8, v88.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(38)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v100.h, 8, v160.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v64.l, 8, v89.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(37)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v101.l, 8, v160.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v64.h, 8, v90.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(36)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v98.h, 8, v161.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.l, 8, v91.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(35)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v99.l, 8, v161.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.h, 8, v92.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(34)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v97.l, 8, v162.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.l, 8, v93.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(33)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v97.h, 8, v162.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.h, 8, v94.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(32)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v87.l, 8, v163.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.l, 8, v95.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(31)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v87.h, 8, v163.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.h, 8, v104.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(30)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v85.l, 8, v164.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.l, 8, v105.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(29)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v85.h, 8, v164.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.h, 8, v106.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(28)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v83.l, 8, v165.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.l, 8, v107.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(27)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v83.h, 8, v165.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v81.h, 8, v71.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v82.l, 8, v71.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v71.l, 8, v70.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v71.h, 8, v70.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v70.l, 8, v68.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v70.h, 8, v67.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v67.h, 8, v66.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v68.l, 8, v66.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v66.l, 8, v64.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v66.h, 8, v64.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v64.l, 8, v55.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v64.h, 8, v54.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.h, 8, v54.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.l, 8, v53.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.h, 8, v53.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.l, 8, v52.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.h, 8, v52.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.l, 8, v51.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.h, 8, v51.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.l, 8, v50.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.h, 8, v31.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.l, 8, v31.l
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.h, 8, v108.l
; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
-; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB38_3
-; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow
-; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
-; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB38_4
-; GFX11-TRUE16-NEXT: .LBB38_2: ; %end
-; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
-; GFX11-TRUE16-NEXT: .LBB38_3: ; %cmp.false
-; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v149.h
-; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v149.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v149.h, 0
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v146.h
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v146.l
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v0.l, v151.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v151.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v0.l, v149.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v1.l, v150.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v3.l, v149.h
-; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v145.h
-; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v144.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v149, v0
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v1.h, v150.l
-; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v132.h
-; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v131.l
-; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v148.h
-; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v130.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v149, v3
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v2.l, v148.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v2.l, v149.h
-; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v134.h
-; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v134.l
-; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v145.l
-; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v130.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v149, v2
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v3.l, v147.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v147.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v3.l, v149.h
-; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v135.h
-; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v119.h
-; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v119.l
-; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v118.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v149, v3
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v4.l, v144.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v4.l, v149.h
-; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v133.h
-; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v117.l
+; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB38_2
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v101.h
+; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v101.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v101.h, 0
+; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v98.h
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v98.l
+; GFX11-TRUE16-NEXT: v_or_b16 v101.l, v0.l, v103.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v103.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v101.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v1.l, v102.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v101.h
+; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v97.h
+; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v96.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v101, v0
+; GFX11-TRUE16-NEXT: v_or_b16 v101.l, v1.h, v102.l
+; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v84.h
+; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v83.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v100.h
+; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v82.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v101, v3
+; GFX11-TRUE16-NEXT: v_or_b16 v101.l, v2.l, v100.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v101.h
+; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v86.h
+; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v86.l
+; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v97.l
+; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v82.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v101, v2
+; GFX11-TRUE16-NEXT: v_or_b16 v101.l, v3.l, v99.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v99.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v101.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v87.h
+; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v71.h
+; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v71.l
+; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v70.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v101, v3
+; GFX11-TRUE16-NEXT: v_or_b16 v101.l, v4.l, v96.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v101.h
+; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v85.h
+; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v69.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v115.h
-; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v115.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v149, v4
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v5.l, v135.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v5.l, v149.h
-; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v132.l
-; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v129.h
-; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v113.h
-; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v112.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v149, v5
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v6.l, v133.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v6.l, v149.h
-; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v103.h
-; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v128.h
-; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v103.l
-; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v101.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v149, v6
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v7.l, v131.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v7.l, v149.h
-; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v118.l
-; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v100.l
-; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v99.h
-; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v98.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v149, v7
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v8.l, v129.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v8.l, v149.h
-; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v11.h, v116.h
-; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v114.h
-; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v96.h
-; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v96.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v149, v8
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v9.l, v128.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v9.l, v149.h
-; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v86.h
-; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v13.h, v113.l
-; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v86.l
-; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v84.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v149, v9
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v10.l, v117.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v10.l, v149.h
-; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v102.h
-; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v84.l
-; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v82.h
-; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v81.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v149, v10
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v11.l, v116.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v11.l, v149.h
-; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v15.h, v101.l
-; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v16.h, v99.l
-; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v80.h
-; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v80.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v149, v11
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v12.l, v114.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v12.l, v149.h
-; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v69.h
-; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v17.h, v97.h
-; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v69.l
-; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v68.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v149, v12
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v13.l, v112.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v13.l, v149.h
-; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v18.h, v87.h
-; GFX11-TRUE16-NEXT: v_and_b16 v19.h, 0xff, v67.l
-; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v65.h
-; GFX11-TRUE16-NEXT: v_and_b16 v20.h, 0xff, v65.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v149, v13
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v14.l, v102.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v14.l, v149.h
-; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v19.h, v85.h
-; GFX11-TRUE16-NEXT: v_or_b16 v20.h, v20.h, v83.h
-; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v55.h
-; GFX11-TRUE16-NEXT: v_and_b16 v21.h, 0xff, v50.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v149, v14
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v15.l, v100.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v15.l, v149.h
-; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v49.h
-; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v21.h, v82.l
-; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v49.l
-; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v48.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v149, v15
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v16.l, v98.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v16.l, v149.h
-; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v22.h, v71.h
-; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v48.l
-; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v39.h
-; GFX11-TRUE16-NEXT: v_and_b16 v24.h, 0xff, v39.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v149, v16
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v17.l, v97.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v17.l, v149.h
-; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v23.h, v70.h
-; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v24.h, v68.l
-; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v38.h
-; GFX11-TRUE16-NEXT: v_and_b16 v25.h, 0xff, v38.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v149, v17
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v18.l, v87.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v18.l, v149.h
-; GFX11-TRUE16-NEXT: v_and_b16 v26.l, 0xff, v37.h
-; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v25.h, v66.h
-; GFX11-TRUE16-NEXT: v_and_b16 v26.h, 0xff, v37.l
-; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v36.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v149, v18
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v19.l, v85.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v19.l, v149.h
-; GFX11-TRUE16-NEXT: v_or_b16 v26.h, v26.h, v64.h
-; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v36.l
-; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v35.h
-; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v35.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v149, v19
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v20.l, v83.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v20.l, v149.h
-; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v27.h, v55.l
-; GFX11-TRUE16-NEXT: v_or_b16 v28.h, v28.h, v54.l
-; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v34.h
-; GFX11-TRUE16-NEXT: v_and_b16 v29.h, 0xff, v34.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v149, v20
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v21.l, v81.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v21.l, v149.h
-; GFX11-TRUE16-NEXT: v_and_b16 v30.l, 0xff, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v29.h, v29.h, v53.l
-; GFX11-TRUE16-NEXT: v_and_b16 v30.h, 0xff, v33.l
-; GFX11-TRUE16-NEXT: v_and_b16 v31.l, 0xff, v32.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v149, v21
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v22.l, v71.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v22.l, v149.h
-; GFX11-TRUE16-NEXT: v_or_b16 v30.h, v30.h, v52.l
-; GFX11-TRUE16-NEXT: v_and_b16 v31.h, 0xff, v32.l
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr146_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr146_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr145_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr134_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr134_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr132_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr131_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr119_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr119_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr117_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr103_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr103_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99_hi16
+; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v47.l
+; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v46.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v101, v4
+; GFX11-TRUE16-NEXT: v_or_b16 v101.l, v5.l, v87.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v101.h
+; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v84.l
+; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v81.h
+; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v45.l
+; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v44.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v101, v5
+; GFX11-TRUE16-NEXT: v_or_b16 v101.l, v6.l, v85.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v101.h
+; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v43.l
+; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v80.h
+; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v42.l
+; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v41.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v101, v6
+; GFX11-TRUE16-NEXT: v_or_b16 v101.l, v7.l, v83.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v101.h
+; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v70.l
+; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v40.l
+; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v183.l
+; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v182.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v101, v7
+; GFX11-TRUE16-NEXT: v_or_b16 v101.l, v8.l, v81.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v101.h
+; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v11.h, v68.h
+; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v67.h
+; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v181.l
+; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v180.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v101, v8
+; GFX11-TRUE16-NEXT: v_or_b16 v101.l, v9.l, v80.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v101.h
+; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v179.l
+; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v13.h, v66.h
+; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v178.l
+; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v177.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v101, v9
+; GFX11-TRUE16-NEXT: v_or_b16 v101.l, v10.l, v69.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v101.h
+; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v65.h
+; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v176.l
+; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v167.l
+; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v166.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v101, v10
+; GFX11-TRUE16-NEXT: v_or_b16 v101.l, v11.l, v68.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v101.h
+; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v15.h, v64.h
+; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v16.h, v55.h
+; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v165.l
+; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v164.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v101, v11
+; GFX11-TRUE16-NEXT: v_or_b16 v101.l, v12.l, v67.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v101.h
+; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v163.l
+; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v17.h, v54.h
+; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v162.l
+; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v161.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v101, v12
+; GFX11-TRUE16-NEXT: v_or_b16 v101.l, v13.l, v66.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v101.h
+; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v18.h, v53.h
+; GFX11-TRUE16-NEXT: v_and_b16 v19.h, 0xff, v160.l
+; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v151.l
+; GFX11-TRUE16-NEXT: v_and_b16 v20.h, 0xff, v150.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v101, v13
+; GFX11-TRUE16-NEXT: v_or_b16 v101.l, v14.l, v65.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v101.h
+; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v19.h, v52.h
+; GFX11-TRUE16-NEXT: v_or_b16 v20.h, v20.h, v51.h
+; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v149.l
+; GFX11-TRUE16-NEXT: v_and_b16 v21.h, 0xff, v148.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v101, v14
+; GFX11-TRUE16-NEXT: v_or_b16 v101.l, v15.l, v64.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v101.h
+; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v147.l
+; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v21.h, v50.h
+; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v146.l
+; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v145.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v101, v15
+; GFX11-TRUE16-NEXT: v_or_b16 v101.l, v16.l, v55.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v101.h
+; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v22.h, v49.h
+; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v144.l
+; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v135.l
+; GFX11-TRUE16-NEXT: v_and_b16 v24.h, 0xff, v134.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v101, v16
+; GFX11-TRUE16-NEXT: v_or_b16 v101.l, v17.l, v54.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v101.h
+; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v23.h, v48.h
+; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v24.h, v39.h
+; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v133.l
+; GFX11-TRUE16-NEXT: v_and_b16 v25.h, 0xff, v132.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v101, v17
+; GFX11-TRUE16-NEXT: v_or_b16 v101.l, v18.l, v53.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v101.h
+; GFX11-TRUE16-NEXT: v_and_b16 v26.l, 0xff, v131.l
+; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v25.h, v38.h
+; GFX11-TRUE16-NEXT: v_and_b16 v26.h, 0xff, v130.l
+; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v129.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v101, v18
+; GFX11-TRUE16-NEXT: v_or_b16 v101.l, v19.l, v52.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v101.h
+; GFX11-TRUE16-NEXT: v_or_b16 v26.h, v26.h, v37.h
+; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v128.l
+; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v119.l
+; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v118.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v101, v19
+; GFX11-TRUE16-NEXT: v_or_b16 v101.l, v20.l, v51.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v101.h
+; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v27.h, v36.h
+; GFX11-TRUE16-NEXT: v_or_b16 v28.h, v28.h, v35.h
+; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v117.l
+; GFX11-TRUE16-NEXT: v_and_b16 v29.h, 0xff, v116.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v101, v20
+; GFX11-TRUE16-NEXT: v_or_b16 v101.l, v21.l, v50.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v101.h
+; GFX11-TRUE16-NEXT: v_and_b16 v30.l, 0xff, v115.l
+; GFX11-TRUE16-NEXT: v_or_b16 v29.h, v29.h, v34.h
+; GFX11-TRUE16-NEXT: v_and_b16 v30.h, 0xff, v114.l
+; GFX11-TRUE16-NEXT: v_and_b16 v31.l, 0xff, v113.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v101, v21
+; GFX11-TRUE16-NEXT: v_or_b16 v101.l, v22.l, v49.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v101.h
+; GFX11-TRUE16-NEXT: v_or_b16 v30.h, v30.h, v33.h
+; GFX11-TRUE16-NEXT: v_and_b16 v31.h, 0xff, v112.l
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr150_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr150_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr147_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr147_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr145_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr135_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr135_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr131_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr132_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr117_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr114_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr114_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr114
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr117
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr119
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr131
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr132
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr134
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr135
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr145
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr146
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr147
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr150
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr160
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr161
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr162
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr163
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr164
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr165
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr166
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr167
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr176
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr177
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr178
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr179
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr180
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr181
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr182
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr183
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr40
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr41
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr42
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr43
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr44
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr45
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr46
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr47
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr103_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr103_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v149, v22
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v23.l, v70.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v23.l, v149.h
-; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v31.h, v51.l
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_hi16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v101, v22
+; GFX11-TRUE16-NEXT: v_or_b16 v101.l, v23.l, v48.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v101.h
+; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v31.h, v32.h
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_hi16
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v149, v23
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v24.l, v67.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v24.l, v149.h
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_hi16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v149, v24
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v25.l, v66.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v25.l, v149.h
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_lo16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v101, v23
+; GFX11-TRUE16-NEXT: v_or_b16 v101.l, v24.l, v39.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v101.h
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v101, v24
+; GFX11-TRUE16-NEXT: v_or_b16 v101.l, v25.l, v38.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v101.h
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v149, v25
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v26.l, v64.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v26.l, v149.h
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_lo16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v149, v26
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v27.l, v54.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v27.l, v149.h
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_hi16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v101, v25
+; GFX11-TRUE16-NEXT: v_or_b16 v101.l, v26.l, v37.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v101.h
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v101, v26
+; GFX11-TRUE16-NEXT: v_or_b16 v101.l, v27.l, v36.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v101.h
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v149, v27
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v28.l, v53.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v28.l, v149.h
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_hi16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v149, v28
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v29.l, v52.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v29.l, v149.h
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_hi16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v101, v27
+; GFX11-TRUE16-NEXT: v_or_b16 v101.l, v28.l, v35.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v101.h
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v101, v28
+; GFX11-TRUE16-NEXT: v_or_b16 v101.l, v29.l, v34.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v101.h
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v149, v29
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v30.l, v51.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v30.l, v149.h
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_hi16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v149, v30
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v31.l, v50.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v31.l, v149.h
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_hi16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v101, v29
+; GFX11-TRUE16-NEXT: v_or_b16 v101.l, v30.l, v33.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v101.h
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v101, v30
+; GFX11-TRUE16-NEXT: v_or_b16 v101.l, v31.l, v32.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, v101.h
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v149, v31
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_lo16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v101, v31
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_lo16
+; GFX11-TRUE16-NEXT: .LBB38_2: ; %Flow
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
-; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB38_2
-; GFX11-TRUE16-NEXT: .LBB38_4: ; %cmp.true
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v149.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v149.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v146.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v146.l, 3
+; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB38_4
+; GFX11-TRUE16-NEXT: ; %bb.3: ; %cmp.true
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v101.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v101.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v98.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v98.l, 3
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.h, 0
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v151.l, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v151.h, v0.h
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v150.h, v1.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v150.l, v1.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v145.h, 3
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v103.l, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v103.h, v0.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v102.h, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v102.l, v1.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v97.h, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v0.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v0.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v144.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v96.l, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, 0x300, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v134.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v86.h, 3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v31, v3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v1.h
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v134.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v86.l, 3
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v148.l, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v100.l, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v148.h, v1.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v100.h, v1.h
; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v31, v4
; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v3.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v31.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v3.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v147.l, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v147.h, v2.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v132.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v131.l, 3
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v99.l, v2.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v99.h, v2.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v84.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v83.l, 3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v31, v5
; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v3.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v3.h
; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v4.l
; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v4.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v130.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v130.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v82.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v82.l, 3
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v144.h, v3.l
-; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v145.l, v3.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v96.h, v3.l
+; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v97.l, v3.h
; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l
; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h
; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v31, v6
; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v5.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x300, v5.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v135.l, v4.l
-; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v135.h, v4.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v119.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v119.l, 3
+; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v87.l, v4.l
+; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v87.h, v4.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v71.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v71.l, 3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v31, v7
; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v5.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, 0x300, v5.h
; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v6.l
; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v6.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v118.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v117.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v70.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v69.l, 3
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v133.l, v5.l
-; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v133.h, v5.h
+; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v85.l, v5.l
+; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v85.h, v5.h
; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l
; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h
; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v31, v8
; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v7.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, 0x300, v7.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v131.h, v6.l
-; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v132.l, v6.h
+; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v83.h, v6.l
+; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v84.l, v6.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v115.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v115.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v47.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v46.l, 3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v31, v9
; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v7.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, 0x300, v7.h
; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v8.l
; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v8.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v113.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v112.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v45.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v44.l, 3
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v129.l, v7.l
-; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v129.h, v7.h
+; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v81.l, v7.l
+; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v81.h, v7.h
; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l
; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h
; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v31, v10
; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v9.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v9.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v128.l, v8.l
-; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v128.h, v8.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, v103.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, v103.l, 3
+; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v80.l, v8.l
+; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v80.h, v8.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, v43.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, v42.l, 3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v31, v11
; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v9.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, 0x300, v9.h
; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v10.l
; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v10.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, v101.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, v100.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, v41.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, v40.l, 3
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v117.h, v9.l
-; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v118.l, v9.h
+; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v69.h, v9.l
+; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v70.l, v9.h
; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l
; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v10.h
; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v31, v12
; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v11.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, 0x300, v11.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v116.l, v10.l
-; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v116.h, v10.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v99.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v98.l, 3
+; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v68.l, v10.l
+; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v68.h, v10.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v183.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v182.l, 3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v31, v13
; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v11.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, 0x300, v11.h
; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v12.l
; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v12.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v96.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v96.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v181.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v180.l, 3
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v114.l, v11.l
-; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v114.h, v11.h
+; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v67.l, v11.l
+; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v67.h, v11.h
; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l
; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h
; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v31, v14
; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v13.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.h, 0x300, v13.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v112.h, v12.l
-; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v113.l, v12.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v86.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, v86.l, 3
+; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v66.l, v12.l
+; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v66.h, v12.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v179.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, v178.l, 3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v31, v15
; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v13.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.h, 0x300, v13.h
; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v14.l
; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v14.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v84.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, v84.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v177.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, v176.l, 3
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v102.l, v13.l
-; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v102.h, v13.h
+; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v65.l, v13.l
+; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v65.h, v13.h
; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l
; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v14.h
; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v31, v16
; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v15.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.h, 0x300, v15.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v100.h, v14.l
-; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v101.l, v14.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.l, v82.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.h, v81.l, 3
+; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v64.l, v14.l
+; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v64.h, v14.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.l, v167.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.h, v166.l, 3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v31, v17
; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v15.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, 0x300, v15.h
; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v16.l
; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v16.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.l, v80.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.h, v80.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.l, v165.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.h, v164.l, 3
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v98.h, v15.l
-; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v99.l, v15.h
+; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v55.l, v15.l
+; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v55.h, v15.h
; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v16.l
; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v16.h
; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v31, v18
; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v17.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.h, 0x300, v17.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v97.l, v16.l
-; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v97.h, v16.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, v69.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, v69.l, 3
+; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v54.l, v16.l
+; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v54.h, v16.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, v163.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, v162.l, 3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v31, v19
; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v17.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.h, 0x300, v17.h
; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v18.l
; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v18.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, v68.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, v67.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, v161.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, v160.l, 3
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v87.l, v17.l
-; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v87.h, v17.h
+; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v53.l, v17.l
+; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v53.h, v17.h
; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v18.l
; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v18.h
; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v31, v20
; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v19.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, 0x300, v19.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v85.l, v18.l
-; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v85.h, v18.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.l, v65.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.h, v65.l, 3
+; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v52.l, v18.l
+; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v52.h, v18.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.l, v151.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.h, v150.l, 3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v31, v21
; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v19.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.h, 0x300, v19.h
; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v20.l
; GFX11-TRUE16-NEXT: v_and_b16 v19.h, 0xff, v20.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.l, v55.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.h, v50.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.l, v149.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.h, v148.l, 3
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v83.l, v19.l
-; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v83.h, v19.h
+; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v51.l, v19.l
+; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v51.h, v19.h
; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v20.l
; GFX11-TRUE16-NEXT: v_and_b16 v20.h, 0xff, v20.h
; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v31, v22
; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v21.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.h, 0x300, v21.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v81.h, v20.l
-; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v82.l, v20.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, v49.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.h, v49.l, 3
+; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v50.l, v20.l
+; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v50.h, v20.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, v147.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.h, v146.l, 3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v31, v23
; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v21.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.h, 0x300, v21.h
; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v22.l
; GFX11-TRUE16-NEXT: v_and_b16 v21.h, 0xff, v22.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, v48.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.h, v48.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, v145.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.h, v144.l, 3
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v71.l, v21.l
-; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v71.h, v21.h
+; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v49.l, v21.l
+; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v49.h, v21.h
; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v22.l
; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v22.h
; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v31, v24
; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v23.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.h, 0x300, v23.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v70.l, v22.l
-; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v70.h, v22.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, v39.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.h, v39.l, 3
+; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v48.l, v22.l
+; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v48.h, v22.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, v135.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.h, v134.l, 3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v31, v25
; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v23.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.h, 0x300, v23.h
; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v24.l
; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v24.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, v38.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.h, v38.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, v133.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.h, v132.l, 3
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v67.h, v23.l
-; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v68.l, v23.h
+; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v39.l, v23.l
+; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v39.h, v23.h
; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v24.l
; GFX11-TRUE16-NEXT: v_and_b16 v24.h, 0xff, v24.h
; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v31, v26
; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v25.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, 0x300, v25.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v66.l, v24.l
-; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v66.h, v24.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.l, v37.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.h, v37.l, 3
+; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v38.l, v24.l
+; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v38.h, v24.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.l, v131.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.h, v130.l, 3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v31, v27
; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v25.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.h, 0x300, v25.h
; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v26.l
; GFX11-TRUE16-NEXT: v_and_b16 v25.h, 0xff, v26.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.l, v36.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.h, v36.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.l, v129.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.h, v128.l, 3
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v64.l, v25.l
-; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v64.h, v25.h
+; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v37.l, v25.l
+; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v37.h, v25.h
; GFX11-TRUE16-NEXT: v_and_b16 v26.l, 0xff, v26.l
; GFX11-TRUE16-NEXT: v_and_b16 v26.h, 0xff, v26.h
; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v31, v28
; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v27.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.h, 0x300, v27.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v54.h, v26.l
-; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v55.l, v26.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, v35.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.h, v35.l, 3
+; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v36.l, v26.l
+; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v36.h, v26.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, v119.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.h, v118.l, 3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v31, v29
; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v27.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.h, 0x300, v27.h
; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v28.l
; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v28.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, v34.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.h, v34.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, v117.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.h, v116.l, 3
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v53.h, v27.l
-; GFX11-TRUE16-NEXT: v_or_b16 v29.h, v54.l, v27.h
+; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v35.l, v27.l
+; GFX11-TRUE16-NEXT: v_or_b16 v29.h, v35.h, v27.h
; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v28.l
; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v28.h
; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v31, v30
; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v29.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v34.h, 0x300, v29.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v52.h, v28.l
-; GFX11-TRUE16-NEXT: v_or_b16 v29.h, v53.l, v28.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.l, v33.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.h, v33.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v31, v34
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v35.h, 0x300, v29.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v31.h
+; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v34.l, v28.l
+; GFX11-TRUE16-NEXT: v_or_b16 v29.h, v34.h, v28.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.l, v115.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.h, v114.l, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v31, v35
; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v29.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v33.h, 0x300, v29.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v34.h, 0x300, v29.h
; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v30.l
; GFX11-TRUE16-NEXT: v_and_b16 v29.h, 0xff, v30.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.l, v32.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.h, v32.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v32.l, v51.h, v29.l
-; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v52.l, v29.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.l, v113.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.h, v112.l, 3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v31.h
+; GFX11-TRUE16-NEXT: v_or_b16 v33.l, v33.l, v29.l
+; GFX11-TRUE16-NEXT: v_or_b16 v33.h, v33.h, v29.h
; GFX11-TRUE16-NEXT: v_and_b16 v30.l, 0xff, v30.l
; GFX11-TRUE16-NEXT: v_and_b16 v30.h, 0xff, v30.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v31, v33
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v32.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v33.h, 0x300, v32.h
-; GFX11-TRUE16-NEXT: v_or_b16 v32.l, v50.h, v30.l
-; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v51.l, v30.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v31, v34
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v33.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v33.h, 0x300, v33.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v31.h
+; GFX11-TRUE16-NEXT: v_or_b16 v32.l, v32.l, v30.l
+; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v32.h, v30.h
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v31, v33
; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v32.l
@@ -52915,7 +52089,48 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_add_nc_u16 v32.h, 0x300, v32.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v31.h
; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v31, v32
+; GFX11-TRUE16-NEXT: .LBB38_4: ; %end
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_load_b32 v108, off, s32 offset:392
+; GFX11-TRUE16-NEXT: scratch_load_b32 v107, off, s32 offset:396
+; GFX11-TRUE16-NEXT: scratch_load_b32 v106, off, s32 offset:400
+; GFX11-TRUE16-NEXT: scratch_load_b32 v105, off, s32 offset:404
+; GFX11-TRUE16-NEXT: scratch_load_b32 v104, off, s32 offset:408
+; GFX11-TRUE16-NEXT: scratch_load_b32 v95, off, s32 offset:412
+; GFX11-TRUE16-NEXT: scratch_load_b32 v94, off, s32 offset:416
+; GFX11-TRUE16-NEXT: scratch_load_b32 v93, off, s32 offset:420
+; GFX11-TRUE16-NEXT: scratch_load_b32 v92, off, s32 offset:424
+; GFX11-TRUE16-NEXT: scratch_load_b32 v91, off, s32 offset:428
+; GFX11-TRUE16-NEXT: scratch_load_b32 v90, off, s32 offset:432
+; GFX11-TRUE16-NEXT: scratch_load_b32 v89, off, s32 offset:436
+; GFX11-TRUE16-NEXT: scratch_load_b32 v88, off, s32 offset:440
+; GFX11-TRUE16-NEXT: scratch_load_b32 v79, off, s32 offset:444
+; GFX11-TRUE16-NEXT: scratch_load_b32 v78, off, s32 offset:448
+; GFX11-TRUE16-NEXT: scratch_load_b32 v77, off, s32 offset:452
+; GFX11-TRUE16-NEXT: scratch_load_b32 v76, off, s32 offset:456
+; GFX11-TRUE16-NEXT: scratch_load_b32 v75, off, s32 offset:460
+; GFX11-TRUE16-NEXT: scratch_load_b32 v74, off, s32 offset:464
+; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:468
+; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:472
+; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:476
+; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:480
+; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:484
+; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:488
+; GFX11-TRUE16-NEXT: scratch_load_b32 v59, off, s32 offset:492
+; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:496
+; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:500
+; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:504
+; GFX11-TRUE16-NEXT: scratch_load_b32 v47, off, s32 offset:508
+; GFX11-TRUE16-NEXT: scratch_load_b32 v46, off, s32 offset:512
+; GFX11-TRUE16-NEXT: scratch_load_b32 v45, off, s32 offset:516
+; GFX11-TRUE16-NEXT: s_clause 0x4
+; GFX11-TRUE16-NEXT: scratch_load_b32 v44, off, s32 offset:520
+; GFX11-TRUE16-NEXT: scratch_load_b32 v43, off, s32 offset:524
+; GFX11-TRUE16-NEXT: scratch_load_b32 v42, off, s32 offset:528
+; GFX11-TRUE16-NEXT: scratch_load_b32 v41, off, s32 offset:532
+; GFX11-TRUE16-NEXT: scratch_load_b32 v40, off, s32 offset:536
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: bitcast_v128i8_to_v32f32:
@@ -57181,1887 +56396,946 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a,
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-TRUE16-LABEL: bitcast_v128i8_to_v32f32_scalar:
-; GFX11-TRUE16: ; %bb.0:
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: s_clause 0x1f
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:476
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:472
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:468
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v43, s32 offset:464
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v44, s32 offset:460
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v45, s32 offset:456
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v46, s32 offset:452
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v47, s32 offset:448
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v56, s32 offset:444
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v57, s32 offset:440
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v58, s32 offset:436
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v59, s32 offset:432
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v60, s32 offset:428
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v61, s32 offset:424
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v62, s32 offset:420
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v63, s32 offset:416
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v72, s32 offset:412
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v73, s32 offset:408
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v74, s32 offset:404
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v75, s32 offset:400
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v76, s32 offset:396
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v77, s32 offset:392
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v78, s32 offset:388
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v79, s32 offset:384
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v88, s32 offset:380
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v89, s32 offset:376
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v90, s32 offset:372
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v91, s32 offset:368
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v92, s32 offset:364
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:360
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:356
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:352
-; GFX11-TRUE16-NEXT: s_clause 0x7
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:348
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:344
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:340
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v107, s32 offset:336
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v108, s32 offset:332
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v109, s32 offset:328
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v110, s32 offset:324
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v111, s32 offset:320
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v32, v30 :: v_dual_mov_b32 v33, v28
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v34, v26 :: v_dual_mov_b32 v35, v24
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v36, v22 :: v_dual_mov_b32 v37, v20
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v38, v18 :: v_dual_mov_b32 v39, v16
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v48, v14 :: v_dual_mov_b32 v49, v12
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v50, v10 :: v_dual_mov_b32 v51, v8
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v52, v6 :: v_dual_mov_b32 v53, v4
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v54, v2 :: v_dual_mov_b32 v55, v0
-; GFX11-TRUE16-NEXT: s_clause 0x1f
-; GFX11-TRUE16-NEXT: scratch_load_b32 v0, off, s32 offset:316
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v2, off, s32
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v4, off, s32 offset:8
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v6, off, s32 offset:16
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v8, off, s32 offset:24
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v10, off, s32 offset:32
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v12, off, s32 offset:40
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v14, off, s32 offset:48
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v16, off, s32 offset:56
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v18, off, s32 offset:64
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v20, off, s32 offset:72
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v22, off, s32 offset:80
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v24, off, s32 offset:88
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v26, off, s32 offset:96
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v28, off, s32 offset:104
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v30, off, s32 offset:112
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:120
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v87, off, s32 offset:128
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v96, off, s32 offset:136
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v97, off, s32 offset:144
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v98, off, s32 offset:152
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v99, off, s32 offset:160
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v113, off, s32 offset:168
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v114, off, s32 offset:176
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v115, off, s32 offset:184
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v116, off, s32 offset:192
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v117, off, s32 offset:200
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v128, off, s32 offset:208
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v129, off, s32 offset:216
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v130, off, s32 offset:224
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v131, off, s32 offset:232
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v94, off, s32 offset:240
-; GFX11-TRUE16-NEXT: s_clause 0x1f
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v95, off, s32 offset:248
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v104, off, s32 offset:256
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v105, off, s32 offset:264
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v106, off, s32 offset:272
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v107, off, s32 offset:280
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v108, off, s32 offset:288
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v109, off, s32 offset:296
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v110, off, s32 offset:304
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v111, off, s32 offset:312
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v64, off, s32 offset:308
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v65, off, s32 offset:300
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v66, off, s32 offset:292
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v67, off, s32 offset:284
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v68, off, s32 offset:276
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v69, off, s32 offset:268
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v70, off, s32 offset:260
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v71, off, s32 offset:252
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v80, off, s32 offset:244
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v81, off, s32 offset:236
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v82, off, s32 offset:228
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v83, off, s32 offset:220
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v84, off, s32 offset:212
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v85, off, s32 offset:204
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v86, off, s32 offset:196
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v100, off, s32 offset:188
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v101, off, s32 offset:180
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v102, off, s32 offset:172
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v103, off, s32 offset:164
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v112, off, s32 offset:156
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v118, off, s32 offset:148
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v145, off, s32 offset:140
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v146, off, s32 offset:132
-; GFX11-TRUE16-NEXT: s_clause 0xf
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v147, off, s32 offset:124
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v148, off, s32 offset:116
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v162, off, s32 offset:108
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v163, off, s32 offset:100
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v164, off, s32 offset:92
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v165, off, s32 offset:84
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v166, off, s32 offset:76
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v180, off, s32 offset:68
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v181, off, s32 offset:60
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v182, off, s32 offset:52
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v183, off, s32 offset:44
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v45, off, s32 offset:36
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v46, off, s32 offset:28
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v47, off, s32 offset:20
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v56, off, s32 offset:12
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v62, off, s32 offset:4
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v89, 8, v1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v90, 8, v3
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v91, 8, v5
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v92, 8, v7
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v93, 8, v9
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v76, 8, v11
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v77, 8, v13
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v78, 8, v15
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v79, 8, v17
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v88, 8, v19
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v63, 8, v21
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v72, 8, v23
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v73, 8, v25
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v74, 8, v27
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v75, 8, v29
-; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(62)
-; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v57, 8, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v58, 8, v4
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v59, 8, v6
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v60, 8, v8
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v61, 8, v10
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v40, 8, v12
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v41, 8, v14
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v42, 8, v16
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v43, 8, v18
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v44, 8, v20
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v167, 8, v22
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v176, 8, v24
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v177, 8, v26
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v178, 8, v28
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v179, 8, v30
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v149, 8, v31
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v150, 8, v87
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(61)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v151, 8, v96
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(60)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v160, 8, v97
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(59)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v161, 8, v98
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(58)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v132, 8, v99
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(57)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v133, 8, v113
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(56)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v134, 8, v114
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(55)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v135, 8, v115
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(54)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v144, 8, v116
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(53)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v119, 8, v117
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(52)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v128, 8, v128
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(51)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v129, 8, v129
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(50)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v130, 8, v130
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(49)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v131, 8, v131
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(48)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v113, 8, v94
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(47)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v114, 8, v95
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(46)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v115, 8, v104
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(45)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v116, 8, v105
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(44)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v117, 8, v106
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(43)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v87, 8, v107
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(42)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v96, 8, v108
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(41)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v97, 8, v109
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(40)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v98, 8, v110
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(39)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v99, 8, v111
-; GFX11-TRUE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo
-; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB39_4
-; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v54
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v53
-; GFX11-TRUE16-NEXT: s_and_b32 s5, s28, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s29, 8
-; GFX11-TRUE16-NEXT: s_and_b32 s7, s2, 0xff
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v90
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v91
-; GFX11-TRUE16-NEXT: s_or_b32 s5, s5, s6
-; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s1, 8
-; GFX11-TRUE16-NEXT: s_and_b32 s5, s5, 0xffff
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s3, 8
-; GFX11-TRUE16-NEXT: s_and_b32 s9, s18, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s19, 8
-; GFX11-TRUE16-NEXT: s_and_b32 s11, s26, 0xff
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v50
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v49
-; GFX11-TRUE16-NEXT: s_lshl_b32 s12, s27, 8
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v76
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v77
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v48
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v39
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v78
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v79
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v38
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v37
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v88
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v63
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v36
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v35
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v72
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v73
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v34
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v33
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v74
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v75
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v32
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v62
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v57
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v58
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v56
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v47
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v59
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v60
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v46
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v45
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v61
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v40
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v183
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v182
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v41
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v42
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v181
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v180
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v43
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v44
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v166
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v165
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v167
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v176
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v164
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v163
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v177
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v178
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v162
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v148
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v179
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v149
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v147
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v146
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v150
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v151
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v145
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v118
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v160
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v161
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v112
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v103
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v132
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v133
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v102
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v101
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v134
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v135
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v100
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v86
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v144
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v119
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v85
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v84
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v128
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v129
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v83
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v82
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v130
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v131
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v81
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v80
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v113
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v114
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v71
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v70
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v115
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v116
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v69
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v68
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v117
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v87
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v67
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v66
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v96
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v97
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v65
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v64
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v98
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v99
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v55
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v89
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, s5, v0
-; GFX11-TRUE16-NEXT: s_and_b32 s5, s0, 0xff
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-TRUE16-NEXT: s_or_b32 s5, s5, s6
-; GFX11-TRUE16-NEXT: s_or_b32 s6, s7, s8
-; GFX11-TRUE16-NEXT: s_and_b32 s7, s16, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s17, 8
-; GFX11-TRUE16-NEXT: s_and_b32 s5, s5, 0xffff
-; GFX11-TRUE16-NEXT: s_or_b32 s7, s7, s8
-; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s10
-; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s6, 16
-; GFX11-TRUE16-NEXT: s_and_b32 s7, s7, 0xffff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s8, 16
-; GFX11-TRUE16-NEXT: s_or_b32 s5, s5, s6
-; GFX11-TRUE16-NEXT: s_or_b32 s6, s7, s8
-; GFX11-TRUE16-NEXT: s_and_b32 s7, s20, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s21, 8
-; GFX11-TRUE16-NEXT: s_and_b32 s9, s22, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s23, 8
-; GFX11-TRUE16-NEXT: s_or_b32 s7, s7, s8
-; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s10
-; GFX11-TRUE16-NEXT: s_and_b32 s9, s24, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s25, 8
-; GFX11-TRUE16-NEXT: s_and_b32 s7, s7, 0xffff
-; GFX11-TRUE16-NEXT: s_or_b32 s9, s9, s10
-; GFX11-TRUE16-NEXT: s_or_b32 s10, s11, s12
-; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s8, 16
-; GFX11-TRUE16-NEXT: s_and_b32 s9, s9, 0xffff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s10, 16
-; GFX11-TRUE16-NEXT: s_or_b32 s7, s7, s8
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v51
-; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s10
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v52
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, s6
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v93
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v92
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v2, v3
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s7 :: v_dual_mov_b32 v3, s8
-; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
-; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB39_3
-; GFX11-TRUE16-NEXT: .LBB39_2: ; %cmp.true
-; GFX11-TRUE16-NEXT: s_add_i32 s0, s0, 3
-; GFX11-TRUE16-NEXT: s_add_i32 s2, s2, 3
-; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 8
-; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8
-; GFX11-TRUE16-NEXT: s_or_b32 s0, s1, s0
-; GFX11-TRUE16-NEXT: s_or_b32 s1, s3, s2
-; GFX11-TRUE16-NEXT: s_addk_i32 s0, 0x300
-; GFX11-TRUE16-NEXT: s_addk_i32 s1, 0x300
-; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xffff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 16
-; GFX11-TRUE16-NEXT: s_add_i32 s16, s16, 3
-; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s1
-; GFX11-TRUE16-NEXT: s_and_b32 s1, s16, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s17, 8
-; GFX11-TRUE16-NEXT: s_add_i32 s18, s18, 3
-; GFX11-TRUE16-NEXT: s_or_b32 s1, s2, s1
-; GFX11-TRUE16-NEXT: s_and_b32 s2, s18, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s19, 8
-; GFX11-TRUE16-NEXT: s_addk_i32 s1, 0x300
-; GFX11-TRUE16-NEXT: s_or_b32 s2, s3, s2
-; GFX11-TRUE16-NEXT: s_add_i32 s20, s20, 3
-; GFX11-TRUE16-NEXT: s_addk_i32 s2, 0x300
-; GFX11-TRUE16-NEXT: s_and_b32 s1, s1, 0xffff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s2, 16
-; GFX11-TRUE16-NEXT: s_and_b32 s3, s20, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s21, 8
-; GFX11-TRUE16-NEXT: s_add_i32 s22, s22, 3
-; GFX11-TRUE16-NEXT: s_or_b32 s1, s1, s2
-; GFX11-TRUE16-NEXT: s_or_b32 s2, s4, s3
-; GFX11-TRUE16-NEXT: s_and_b32 s3, s22, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s23, 8
-; GFX11-TRUE16-NEXT: s_add_i32 s24, s24, 3
-; GFX11-TRUE16-NEXT: s_or_b32 s3, s4, s3
-; GFX11-TRUE16-NEXT: s_and_b32 s4, s24, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s25, 8
-; GFX11-TRUE16-NEXT: s_addk_i32 s2, 0x300
-; GFX11-TRUE16-NEXT: s_addk_i32 s3, 0x300
-; GFX11-TRUE16-NEXT: s_or_b32 s4, s5, s4
-; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xffff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 16
-; GFX11-TRUE16-NEXT: s_addk_i32 s4, 0x300
-; GFX11-TRUE16-NEXT: s_add_i32 s26, s26, 3
-; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3
-; GFX11-TRUE16-NEXT: s_and_b32 s3, s4, 0xffff
-; GFX11-TRUE16-NEXT: s_and_b32 s4, s26, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s27, 8
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v55
-; GFX11-TRUE16-NEXT: s_or_b32 s4, s5, s4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v54
-; GFX11-TRUE16-NEXT: s_addk_i32 s4, 0x300
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v52
-; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s4, 16
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v51
-; GFX11-TRUE16-NEXT: s_or_b32 s3, s3, s4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v53
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v2
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v38
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 3, v34
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 3, v33
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v89, v0
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v90, v1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v91, v2
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v92, v3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v93, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v7
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v11
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xff, v12
-; GFX11-TRUE16-NEXT: s_add_i32 s28, s28, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x300, v4
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v88, v7
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v74, v11
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v75, v12
-; GFX11-TRUE16-NEXT: s_and_b32 s5, s28, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s29, 8
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX11-TRUE16-NEXT: s_or_b32 s5, s6, s5
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 0x300, v7
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 0x300, v11
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 0x300, v12
-; GFX11-TRUE16-NEXT: s_addk_i32 s5, 0x300
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v1, v2
-; GFX11-TRUE16-NEXT: s_and_b32 s5, s5, 0xffff
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v3, v6
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, s5, v0
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v50
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v49
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v48
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v39
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v7
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v37
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v36
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 3, v35
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v11
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v12
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v7
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v8
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xff, v10
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v11, v12
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(3)
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 3, v46
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 3, v181
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v17, 3, v180
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v76, v0
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v77, v1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v78, v2
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v79, v3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v63, v7
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v72, v8
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v73, v10
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xff, v12
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xff, v16
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xff, v17
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 0x300, v7
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 0x300, v8
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 0x300, v10
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v61, v12
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v43, v16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v44, v17
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v7
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v8
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 0x300, v12
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 0x300, v16
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v17, 0x300, v17
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v0, v1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v2, v3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v9, v13
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v14, v10
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v32
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v62
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v56
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v47
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v12
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 3, v45
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v13, 3, v183
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v15, 3, v182
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v16
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v17
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xff, v12
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xff, v13
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xff, v15
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v16, v17
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v17, 3, v162
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v21, 3, v145
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v22, 3, v118
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v57, v0
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v58, v1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v59, v2
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v60, v3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v40, v12
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v41, v13
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v42, v15
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xff, v17
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xff, v21
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xff, v22
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 0x300, v12
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v13, 0x300, v13
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v15, 0x300, v15
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v179, v17
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v160, v21
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v161, v22
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v12
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v13
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 16, v15
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v17, 0x300, v17
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v21, 0x300, v21
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v22, 0x300, v22
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v0, v1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v2, v3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v14, v18
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v19, v15
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v166
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v165
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v164
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v163
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v17
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v17, 3, v148
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v18, 3, v147
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v20, 3, v146
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v21
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v22
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xff, v17
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xff, v18
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xff, v20
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v21, v22
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v22, 3, v100
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v26, 3, v83
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v27, 3, v82
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v167, v0
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v176, v1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v177, v2
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v178, v3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v149, v17
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v150, v18
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v151, v20
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xff, v22
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xff, v26
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xff, v27
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v17, 0x300, v17
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v18, 0x300, v18
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v20, 0x300, v20
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v144, v22
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v130, v26
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v131, v27
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v17
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v18
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 16, v20
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v22, 0x300, v22
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v26, 0x300, v26
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v27, 0x300, v27
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v0, v1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v2, v3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v19, v23
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v24, v20
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v112
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v103
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v102
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v101
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v22
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v22, 3, v86
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v23, 3, v85
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v25, 3, v84
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v26
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v27, 16, v27
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xff, v22
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xff, v23
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xff, v25
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v26, v27
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v27, 3, v69
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v132, v0
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v133, v1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v134, v2
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v135, v3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v119, v22
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v128, v23
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v129, v25
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xff, v27
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v22, 0x300, v22
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v23, 0x300, v23
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v25, 0x300, v25
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v117, v27
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 16, v22
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v23
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 16, v25
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v27, 0x300, v27
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v0, v1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v2, v3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v24, v28
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v29, v25
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v81
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v80
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v71
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v70
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v27
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v27, 3, v68
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v28, 3, v67
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v30, 3, v66
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v31, 3, v65
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v32, 3, v64
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xff, v27
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v28, 0xff, v28
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v30, 0xff, v30
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xff, v31
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xff, v32
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v113, v0
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v114, v1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v115, v2
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v116, v3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v87, v27
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v96, v28
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v97, v30
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v98, v31
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v32, v99, v32
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v27, 0x300, v27
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v28, 0x300, v28
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v30, 0x300, v30
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v31, 0x300, v31
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v32, 0x300, v32
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v33, 16, v27
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v28
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v30, 16, v30
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xffff, v31
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v32, 16, v32
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v0, v1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v2, v3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v29, v33
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v34, v30
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v31, v32
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX11-TRUE16-NEXT: .LBB39_3: ; %end
-; GFX11-TRUE16-NEXT: s_clause 0x1f
-; GFX11-TRUE16-NEXT: scratch_load_b32 v111, off, s32 offset:320
-; GFX11-TRUE16-NEXT: scratch_load_b32 v110, off, s32 offset:324
-; GFX11-TRUE16-NEXT: scratch_load_b32 v109, off, s32 offset:328
-; GFX11-TRUE16-NEXT: scratch_load_b32 v108, off, s32 offset:332
-; GFX11-TRUE16-NEXT: scratch_load_b32 v107, off, s32 offset:336
-; GFX11-TRUE16-NEXT: scratch_load_b32 v106, off, s32 offset:340
-; GFX11-TRUE16-NEXT: scratch_load_b32 v105, off, s32 offset:344
-; GFX11-TRUE16-NEXT: scratch_load_b32 v104, off, s32 offset:348
-; GFX11-TRUE16-NEXT: scratch_load_b32 v95, off, s32 offset:352
-; GFX11-TRUE16-NEXT: scratch_load_b32 v94, off, s32 offset:356
-; GFX11-TRUE16-NEXT: scratch_load_b32 v93, off, s32 offset:360
-; GFX11-TRUE16-NEXT: scratch_load_b32 v92, off, s32 offset:364
-; GFX11-TRUE16-NEXT: scratch_load_b32 v91, off, s32 offset:368
-; GFX11-TRUE16-NEXT: scratch_load_b32 v90, off, s32 offset:372
-; GFX11-TRUE16-NEXT: scratch_load_b32 v89, off, s32 offset:376
-; GFX11-TRUE16-NEXT: scratch_load_b32 v88, off, s32 offset:380
-; GFX11-TRUE16-NEXT: scratch_load_b32 v79, off, s32 offset:384
-; GFX11-TRUE16-NEXT: scratch_load_b32 v78, off, s32 offset:388
-; GFX11-TRUE16-NEXT: scratch_load_b32 v77, off, s32 offset:392
-; GFX11-TRUE16-NEXT: scratch_load_b32 v76, off, s32 offset:396
-; GFX11-TRUE16-NEXT: scratch_load_b32 v75, off, s32 offset:400
-; GFX11-TRUE16-NEXT: scratch_load_b32 v74, off, s32 offset:404
-; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:408
-; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:412
-; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:416
-; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:420
-; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:424
-; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:428
-; GFX11-TRUE16-NEXT: scratch_load_b32 v59, off, s32 offset:432
-; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:436
-; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:440
-; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:444
-; GFX11-TRUE16-NEXT: s_clause 0x7
-; GFX11-TRUE16-NEXT: scratch_load_b32 v47, off, s32 offset:448
-; GFX11-TRUE16-NEXT: scratch_load_b32 v46, off, s32 offset:452
-; GFX11-TRUE16-NEXT: scratch_load_b32 v45, off, s32 offset:456
-; GFX11-TRUE16-NEXT: scratch_load_b32 v44, off, s32 offset:460
-; GFX11-TRUE16-NEXT: scratch_load_b32 v43, off, s32 offset:464
-; GFX11-TRUE16-NEXT: scratch_load_b32 v42, off, s32 offset:468
-; GFX11-TRUE16-NEXT: scratch_load_b32 v41, off, s32 offset:472
-; GFX11-TRUE16-NEXT: scratch_load_b32 v40, off, s32 offset:476
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
-; GFX11-TRUE16-NEXT: .LBB39_4:
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
-; GFX11-TRUE16-NEXT: s_branch .LBB39_2
-;
-; GFX11-FAKE16-LABEL: bitcast_v128i8_to_v32f32_scalar:
-; GFX11-FAKE16: ; %bb.0:
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-FAKE16-NEXT: s_clause 0x1f
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:476
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:472
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:468
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v43, s32 offset:464
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v44, s32 offset:460
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v45, s32 offset:456
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v46, s32 offset:452
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v47, s32 offset:448
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v56, s32 offset:444
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v57, s32 offset:440
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v58, s32 offset:436
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v59, s32 offset:432
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v60, s32 offset:428
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v61, s32 offset:424
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v62, s32 offset:420
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v63, s32 offset:416
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v72, s32 offset:412
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v73, s32 offset:408
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v74, s32 offset:404
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v75, s32 offset:400
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v76, s32 offset:396
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v77, s32 offset:392
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v78, s32 offset:388
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v79, s32 offset:384
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v88, s32 offset:380
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v89, s32 offset:376
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v90, s32 offset:372
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v91, s32 offset:368
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v92, s32 offset:364
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v93, s32 offset:360
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v94, s32 offset:356
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v95, s32 offset:352
-; GFX11-FAKE16-NEXT: s_clause 0x7
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v104, s32 offset:348
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v105, s32 offset:344
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v106, s32 offset:340
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v107, s32 offset:336
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v108, s32 offset:332
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v109, s32 offset:328
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v110, s32 offset:324
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v111, s32 offset:320
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v32, v30 :: v_dual_mov_b32 v33, v28
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v34, v26 :: v_dual_mov_b32 v35, v24
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v36, v22 :: v_dual_mov_b32 v37, v20
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v38, v18 :: v_dual_mov_b32 v39, v16
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v48, v14 :: v_dual_mov_b32 v49, v12
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v50, v10 :: v_dual_mov_b32 v51, v8
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v52, v6 :: v_dual_mov_b32 v53, v4
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v54, v2 :: v_dual_mov_b32 v55, v0
-; GFX11-FAKE16-NEXT: s_clause 0x1f
-; GFX11-FAKE16-NEXT: scratch_load_b32 v0, off, s32 offset:316
-; GFX11-FAKE16-NEXT: scratch_load_u16 v2, off, s32
-; GFX11-FAKE16-NEXT: scratch_load_u16 v4, off, s32 offset:8
-; GFX11-FAKE16-NEXT: scratch_load_u16 v6, off, s32 offset:16
-; GFX11-FAKE16-NEXT: scratch_load_u16 v8, off, s32 offset:24
-; GFX11-FAKE16-NEXT: scratch_load_u16 v10, off, s32 offset:32
-; GFX11-FAKE16-NEXT: scratch_load_u16 v12, off, s32 offset:40
-; GFX11-FAKE16-NEXT: scratch_load_u16 v14, off, s32 offset:48
-; GFX11-FAKE16-NEXT: scratch_load_u16 v16, off, s32 offset:56
-; GFX11-FAKE16-NEXT: scratch_load_u16 v18, off, s32 offset:64
-; GFX11-FAKE16-NEXT: scratch_load_u16 v20, off, s32 offset:72
-; GFX11-FAKE16-NEXT: scratch_load_u16 v22, off, s32 offset:80
-; GFX11-FAKE16-NEXT: scratch_load_u16 v24, off, s32 offset:88
-; GFX11-FAKE16-NEXT: scratch_load_u16 v26, off, s32 offset:96
-; GFX11-FAKE16-NEXT: scratch_load_u16 v28, off, s32 offset:104
-; GFX11-FAKE16-NEXT: scratch_load_u16 v30, off, s32 offset:112
-; GFX11-FAKE16-NEXT: scratch_load_u16 v31, off, s32 offset:120
-; GFX11-FAKE16-NEXT: scratch_load_u16 v87, off, s32 offset:128
-; GFX11-FAKE16-NEXT: scratch_load_u16 v96, off, s32 offset:136
-; GFX11-FAKE16-NEXT: scratch_load_u16 v97, off, s32 offset:144
-; GFX11-FAKE16-NEXT: scratch_load_u16 v98, off, s32 offset:152
-; GFX11-FAKE16-NEXT: scratch_load_u16 v99, off, s32 offset:160
-; GFX11-FAKE16-NEXT: scratch_load_u16 v113, off, s32 offset:168
-; GFX11-FAKE16-NEXT: scratch_load_u16 v114, off, s32 offset:176
-; GFX11-FAKE16-NEXT: scratch_load_u16 v115, off, s32 offset:184
-; GFX11-FAKE16-NEXT: scratch_load_u16 v116, off, s32 offset:192
-; GFX11-FAKE16-NEXT: scratch_load_u16 v117, off, s32 offset:200
-; GFX11-FAKE16-NEXT: scratch_load_u16 v128, off, s32 offset:208
-; GFX11-FAKE16-NEXT: scratch_load_u16 v129, off, s32 offset:216
-; GFX11-FAKE16-NEXT: scratch_load_u16 v130, off, s32 offset:224
-; GFX11-FAKE16-NEXT: scratch_load_u16 v131, off, s32 offset:232
-; GFX11-FAKE16-NEXT: scratch_load_u16 v94, off, s32 offset:240
-; GFX11-FAKE16-NEXT: s_clause 0x1f
-; GFX11-FAKE16-NEXT: scratch_load_u16 v95, off, s32 offset:248
-; GFX11-FAKE16-NEXT: scratch_load_u16 v104, off, s32 offset:256
-; GFX11-FAKE16-NEXT: scratch_load_u16 v105, off, s32 offset:264
-; GFX11-FAKE16-NEXT: scratch_load_u16 v106, off, s32 offset:272
-; GFX11-FAKE16-NEXT: scratch_load_u16 v107, off, s32 offset:280
-; GFX11-FAKE16-NEXT: scratch_load_u16 v108, off, s32 offset:288
-; GFX11-FAKE16-NEXT: scratch_load_u16 v109, off, s32 offset:296
-; GFX11-FAKE16-NEXT: scratch_load_u16 v110, off, s32 offset:304
-; GFX11-FAKE16-NEXT: scratch_load_u16 v111, off, s32 offset:312
-; GFX11-FAKE16-NEXT: scratch_load_u16 v64, off, s32 offset:308
-; GFX11-FAKE16-NEXT: scratch_load_u16 v65, off, s32 offset:300
-; GFX11-FAKE16-NEXT: scratch_load_u16 v66, off, s32 offset:292
-; GFX11-FAKE16-NEXT: scratch_load_u16 v67, off, s32 offset:284
-; GFX11-FAKE16-NEXT: scratch_load_u16 v68, off, s32 offset:276
-; GFX11-FAKE16-NEXT: scratch_load_u16 v69, off, s32 offset:268
-; GFX11-FAKE16-NEXT: scratch_load_u16 v70, off, s32 offset:260
-; GFX11-FAKE16-NEXT: scratch_load_u16 v71, off, s32 offset:252
-; GFX11-FAKE16-NEXT: scratch_load_u16 v80, off, s32 offset:244
-; GFX11-FAKE16-NEXT: scratch_load_u16 v81, off, s32 offset:236
-; GFX11-FAKE16-NEXT: scratch_load_u16 v82, off, s32 offset:228
-; GFX11-FAKE16-NEXT: scratch_load_u16 v83, off, s32 offset:220
-; GFX11-FAKE16-NEXT: scratch_load_u16 v84, off, s32 offset:212
-; GFX11-FAKE16-NEXT: scratch_load_u16 v85, off, s32 offset:204
-; GFX11-FAKE16-NEXT: scratch_load_u16 v86, off, s32 offset:196
-; GFX11-FAKE16-NEXT: scratch_load_u16 v100, off, s32 offset:188
-; GFX11-FAKE16-NEXT: scratch_load_u16 v101, off, s32 offset:180
-; GFX11-FAKE16-NEXT: scratch_load_u16 v102, off, s32 offset:172
-; GFX11-FAKE16-NEXT: scratch_load_u16 v103, off, s32 offset:164
-; GFX11-FAKE16-NEXT: scratch_load_u16 v112, off, s32 offset:156
-; GFX11-FAKE16-NEXT: scratch_load_u16 v118, off, s32 offset:148
-; GFX11-FAKE16-NEXT: scratch_load_u16 v145, off, s32 offset:140
-; GFX11-FAKE16-NEXT: scratch_load_u16 v146, off, s32 offset:132
-; GFX11-FAKE16-NEXT: s_clause 0xf
-; GFX11-FAKE16-NEXT: scratch_load_u16 v147, off, s32 offset:124
-; GFX11-FAKE16-NEXT: scratch_load_u16 v148, off, s32 offset:116
-; GFX11-FAKE16-NEXT: scratch_load_u16 v162, off, s32 offset:108
-; GFX11-FAKE16-NEXT: scratch_load_u16 v163, off, s32 offset:100
-; GFX11-FAKE16-NEXT: scratch_load_u16 v164, off, s32 offset:92
-; GFX11-FAKE16-NEXT: scratch_load_u16 v165, off, s32 offset:84
-; GFX11-FAKE16-NEXT: scratch_load_u16 v166, off, s32 offset:76
-; GFX11-FAKE16-NEXT: scratch_load_u16 v180, off, s32 offset:68
-; GFX11-FAKE16-NEXT: scratch_load_u16 v181, off, s32 offset:60
-; GFX11-FAKE16-NEXT: scratch_load_u16 v182, off, s32 offset:52
-; GFX11-FAKE16-NEXT: scratch_load_u16 v183, off, s32 offset:44
-; GFX11-FAKE16-NEXT: scratch_load_u16 v45, off, s32 offset:36
-; GFX11-FAKE16-NEXT: scratch_load_u16 v46, off, s32 offset:28
-; GFX11-FAKE16-NEXT: scratch_load_u16 v47, off, s32 offset:20
-; GFX11-FAKE16-NEXT: scratch_load_u16 v56, off, s32 offset:12
-; GFX11-FAKE16-NEXT: scratch_load_u16 v62, off, s32 offset:4
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v89, 8, v1
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v90, 8, v3
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v91, 8, v5
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v92, 8, v7
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v93, 8, v9
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v76, 8, v11
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v77, 8, v13
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v78, 8, v15
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v79, 8, v17
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v88, 8, v19
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v63, 8, v21
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v72, 8, v23
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v73, 8, v25
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v74, 8, v27
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v75, 8, v29
-; GFX11-FAKE16-NEXT: s_mov_b32 s4, 0
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(62)
-; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v57, 8, v2
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v58, 8, v4
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v59, 8, v6
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v60, 8, v8
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v61, 8, v10
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v40, 8, v12
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v41, 8, v14
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v42, 8, v16
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v43, 8, v18
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v44, 8, v20
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v167, 8, v22
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v176, 8, v24
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v177, 8, v26
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v178, 8, v28
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v179, 8, v30
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v149, 8, v31
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v150, 8, v87
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(61)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v151, 8, v96
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(60)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v160, 8, v97
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(59)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v161, 8, v98
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(58)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v132, 8, v99
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(57)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v133, 8, v113
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(56)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v134, 8, v114
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(55)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v135, 8, v115
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(54)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v144, 8, v116
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(53)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v119, 8, v117
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(52)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v128, 8, v128
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(51)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v129, 8, v129
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(50)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v130, 8, v130
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(49)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v131, 8, v131
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(48)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v113, 8, v94
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(47)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v114, 8, v95
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(46)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v115, 8, v104
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(45)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v116, 8, v105
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(44)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v117, 8, v106
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(43)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v87, 8, v107
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(42)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v96, 8, v108
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(41)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v97, 8, v109
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(40)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v98, 8, v110
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(39)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v99, 8, v111
-; GFX11-FAKE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo
-; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB39_4
-; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v54
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v53
-; GFX11-FAKE16-NEXT: s_and_b32 s5, s28, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s29, 8
-; GFX11-FAKE16-NEXT: s_and_b32 s7, s2, 0xff
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v90
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v91
-; GFX11-FAKE16-NEXT: s_or_b32 s5, s5, s6
-; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s1, 8
-; GFX11-FAKE16-NEXT: s_and_b32 s5, s5, 0xffff
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s3, 8
-; GFX11-FAKE16-NEXT: s_and_b32 s9, s18, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s19, 8
-; GFX11-FAKE16-NEXT: s_and_b32 s11, s26, 0xff
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v0, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v50
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v49
-; GFX11-FAKE16-NEXT: s_lshl_b32 s12, s27, 8
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v76
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v77
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v0, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v48
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v39
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v78
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v79
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v0, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v38
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v37
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v88
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v63
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v0, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v36
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v35
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v72
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v73
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v0, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v34
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v33
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v74
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v75
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v0, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v32
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v62
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v57
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v58
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v0, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v56
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v47
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v59
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v60
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v0, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v46
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v45
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v61
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v40
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v0, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v183
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v182
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v41
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v42
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v0, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v181
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v180
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v43
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v44
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v0, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v166
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v165
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v167
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v176
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v0, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v164
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v163
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v177
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v178
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v0, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v162
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v148
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v179
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v149
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v0, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v147
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v146
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v150
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v151
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v0, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v145
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v118
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v160
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v161
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v0, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v112
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v103
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v132
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v133
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v0, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v102
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v101
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v134
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v135
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v0, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v100
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v86
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v144
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v119
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v0, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v85
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v84
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v128
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v129
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v0, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v83
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v82
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v130
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v131
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v0, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v81
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v80
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v113
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v114
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v0, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v71
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v70
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v115
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v116
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v0, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v69
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v68
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v117
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v87
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, v0, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v67
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v66
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v96
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v97
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v30, v0, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v65
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v64
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v98
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v99
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v0, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v55
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v89
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, s5, v0
-; GFX11-FAKE16-NEXT: s_and_b32 s5, s0, 0xff
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-FAKE16-NEXT: s_or_b32 s5, s5, s6
-; GFX11-FAKE16-NEXT: s_or_b32 s6, s7, s8
-; GFX11-FAKE16-NEXT: s_and_b32 s7, s16, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s17, 8
-; GFX11-FAKE16-NEXT: s_and_b32 s5, s5, 0xffff
-; GFX11-FAKE16-NEXT: s_or_b32 s7, s7, s8
-; GFX11-FAKE16-NEXT: s_or_b32 s8, s9, s10
-; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s6, 16
-; GFX11-FAKE16-NEXT: s_and_b32 s7, s7, 0xffff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s8, 16
-; GFX11-FAKE16-NEXT: s_or_b32 s5, s5, s6
-; GFX11-FAKE16-NEXT: s_or_b32 s6, s7, s8
-; GFX11-FAKE16-NEXT: s_and_b32 s7, s20, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s21, 8
-; GFX11-FAKE16-NEXT: s_and_b32 s9, s22, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s23, 8
-; GFX11-FAKE16-NEXT: s_or_b32 s7, s7, s8
-; GFX11-FAKE16-NEXT: s_or_b32 s8, s9, s10
-; GFX11-FAKE16-NEXT: s_and_b32 s9, s24, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s25, 8
-; GFX11-FAKE16-NEXT: s_and_b32 s7, s7, 0xffff
-; GFX11-FAKE16-NEXT: s_or_b32 s9, s9, s10
-; GFX11-FAKE16-NEXT: s_or_b32 s10, s11, s12
-; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s8, 16
-; GFX11-FAKE16-NEXT: s_and_b32 s9, s9, 0xffff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s10, 16
-; GFX11-FAKE16-NEXT: s_or_b32 s7, s7, s8
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v51
-; GFX11-FAKE16-NEXT: s_or_b32 s8, s9, s10
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v52
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, s6
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v93
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v92
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v2, v3
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s7 :: v_dual_mov_b32 v3, s8
-; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
-; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB39_3
-; GFX11-FAKE16-NEXT: .LBB39_2: ; %cmp.true
-; GFX11-FAKE16-NEXT: s_add_i32 s0, s0, 3
-; GFX11-FAKE16-NEXT: s_add_i32 s2, s2, 3
-; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 8
-; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 8
-; GFX11-FAKE16-NEXT: s_or_b32 s0, s1, s0
-; GFX11-FAKE16-NEXT: s_or_b32 s1, s3, s2
-; GFX11-FAKE16-NEXT: s_addk_i32 s0, 0x300
-; GFX11-FAKE16-NEXT: s_addk_i32 s1, 0x300
-; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xffff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 16
-; GFX11-FAKE16-NEXT: s_add_i32 s16, s16, 3
-; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s1
-; GFX11-FAKE16-NEXT: s_and_b32 s1, s16, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s17, 8
-; GFX11-FAKE16-NEXT: s_add_i32 s18, s18, 3
-; GFX11-FAKE16-NEXT: s_or_b32 s1, s2, s1
-; GFX11-FAKE16-NEXT: s_and_b32 s2, s18, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s19, 8
-; GFX11-FAKE16-NEXT: s_addk_i32 s1, 0x300
-; GFX11-FAKE16-NEXT: s_or_b32 s2, s3, s2
-; GFX11-FAKE16-NEXT: s_add_i32 s20, s20, 3
-; GFX11-FAKE16-NEXT: s_addk_i32 s2, 0x300
-; GFX11-FAKE16-NEXT: s_and_b32 s1, s1, 0xffff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s2, 16
-; GFX11-FAKE16-NEXT: s_and_b32 s3, s20, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s21, 8
-; GFX11-FAKE16-NEXT: s_add_i32 s22, s22, 3
-; GFX11-FAKE16-NEXT: s_or_b32 s1, s1, s2
-; GFX11-FAKE16-NEXT: s_or_b32 s2, s4, s3
-; GFX11-FAKE16-NEXT: s_and_b32 s3, s22, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s23, 8
-; GFX11-FAKE16-NEXT: s_add_i32 s24, s24, 3
-; GFX11-FAKE16-NEXT: s_or_b32 s3, s4, s3
-; GFX11-FAKE16-NEXT: s_and_b32 s4, s24, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s25, 8
-; GFX11-FAKE16-NEXT: s_addk_i32 s2, 0x300
-; GFX11-FAKE16-NEXT: s_addk_i32 s3, 0x300
-; GFX11-FAKE16-NEXT: s_or_b32 s4, s5, s4
-; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xffff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 16
-; GFX11-FAKE16-NEXT: s_addk_i32 s4, 0x300
-; GFX11-FAKE16-NEXT: s_add_i32 s26, s26, 3
-; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3
-; GFX11-FAKE16-NEXT: s_and_b32 s3, s4, 0xffff
-; GFX11-FAKE16-NEXT: s_and_b32 s4, s26, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s27, 8
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v55
-; GFX11-FAKE16-NEXT: s_or_b32 s4, s5, s4
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v54
-; GFX11-FAKE16-NEXT: s_addk_i32 s4, 0x300
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v52
-; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s4, 16
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v51
-; GFX11-FAKE16-NEXT: s_or_b32 s3, s3, s4
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v53
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 3, v38
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, 3, v34
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, 3, v33
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v89, v0
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v90, v1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v91, v2
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v92, v3
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v93, v4
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v7
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v11
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v12
-; GFX11-FAKE16-NEXT: s_add_i32 s28, s28, 3
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x300, v4
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v88, v7
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v74, v11
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v75, v12
-; GFX11-FAKE16-NEXT: s_and_b32 s5, s28, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s29, 8
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX11-FAKE16-NEXT: s_or_b32 s5, s6, s5
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v4
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 0x300, v7
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, 0x300, v11
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, 0x300, v12
-; GFX11-FAKE16-NEXT: s_addk_i32 s5, 0x300
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v1, v2
-; GFX11-FAKE16-NEXT: s_and_b32 s5, s5, 0xffff
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v3, v6
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, s5, v0
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v50
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v49
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v48
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v39
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff, v7
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 3, v37
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 3, v36
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, 3, v35
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff, v11
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v12, 16, v12
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v7
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v8
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v10
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v11, v12
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(3)
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, 3, v46
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v16, 3, v181
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 3, v180
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v76, v0
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v77, v1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v78, v2
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v79, v3
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v63, v7
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v72, v8
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v73, v10
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v12
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v16
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v17
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 0x300, v7
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 0x300, v8
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, 0x300, v10
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v61, v12
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v43, v16
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v44, v17
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v13, 16, v7
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xffff, v8
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, 0x300, v12
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v16, 0x300, v16
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 0x300, v17
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v0, v1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v2, v3
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v9, v13
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v14, v10
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v32
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v62
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v56
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v47
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xffff, v12
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, 3, v45
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v13, 3, v183
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v15, 3, v182
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xffff, v16
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v17, 16, v17
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v12
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v13
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v15
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v16, v17
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 3, v162
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v21, 3, v145
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v22, 3, v118
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v57, v0
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v58, v1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v59, v2
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v60, v3
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v40, v12
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v41, v13
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v42, v15
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v17
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xff, v21
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v22
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, 0x300, v12
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v13, 0x300, v13
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v15, 0x300, v15
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v179, v17
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v160, v21
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v161, v22
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v18, 16, v12
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v13
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v15, 16, v15
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 0x300, v17
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v21, 0x300, v21
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v22, 0x300, v22
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v0, v1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v2, v3
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v14, v18
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v19, v15
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v166
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v165
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v164
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v163
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v17
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 3, v148
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v18, 3, v147
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v20, 3, v146
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff, v21
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v22, 16, v22
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v17
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xff, v18
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v20
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v21, v22
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v22, 3, v100
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v26, 3, v83
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v27, 3, v82
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v167, v0
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v176, v1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v177, v2
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v178, v3
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v149, v17
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v150, v18
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v151, v20
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v22
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xff, v26
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xff, v27
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 0x300, v17
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v18, 0x300, v18
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v20, 0x300, v20
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v144, v22
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v130, v26
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v131, v27
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v23, 16, v17
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xffff, v18
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v20, 16, v20
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v22, 0x300, v22
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v26, 0x300, v26
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v27, 0x300, v27
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v0, v1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v2, v3
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v19, v23
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v24, v20
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v112
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v103
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v102
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v101
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xffff, v22
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v22, 3, v86
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v23, 3, v85
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v25, 3, v84
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xffff, v26
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v27, 16, v27
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v22
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xff, v23
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v25
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v26, v27
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v27, 3, v69
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v132, v0
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v133, v1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v134, v2
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v135, v3
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v119, v22
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v128, v23
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v129, v25
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xff, v27
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v22, 0x300, v22
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v23, 0x300, v23
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v25, 0x300, v25
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v117, v27
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v28, 16, v22
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xffff, v23
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v25, 16, v25
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v27, 0x300, v27
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v0, v1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v2, v3
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v24, v28
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v29, v25
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v81
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v80
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v71
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v70
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xffff, v27
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v27, 3, v68
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v28, 3, v67
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v30, 3, v66
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v31, 3, v65
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v32, 3, v64
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xff, v27
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xff, v28
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xff, v30
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xff, v31
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xff, v32
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v113, v0
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v114, v1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v115, v2
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v116, v3
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v87, v27
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v96, v28
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v30, v97, v30
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v98, v31
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v32, v99, v32
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v27, 0x300, v27
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v28, 0x300, v28
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v30, 0x300, v30
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v31, 0x300, v31
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v32, 0x300, v32
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v33, 16, v27
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xffff, v28
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v30, 16, v30
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xffff, v31
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v32, 16, v32
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v0, v1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v2, v3
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, v29, v33
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v30, v34, v30
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v31, v32
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX11-FAKE16-NEXT: .LBB39_3: ; %end
-; GFX11-FAKE16-NEXT: s_clause 0x1f
-; GFX11-FAKE16-NEXT: scratch_load_b32 v111, off, s32 offset:320
-; GFX11-FAKE16-NEXT: scratch_load_b32 v110, off, s32 offset:324
-; GFX11-FAKE16-NEXT: scratch_load_b32 v109, off, s32 offset:328
-; GFX11-FAKE16-NEXT: scratch_load_b32 v108, off, s32 offset:332
-; GFX11-FAKE16-NEXT: scratch_load_b32 v107, off, s32 offset:336
-; GFX11-FAKE16-NEXT: scratch_load_b32 v106, off, s32 offset:340
-; GFX11-FAKE16-NEXT: scratch_load_b32 v105, off, s32 offset:344
-; GFX11-FAKE16-NEXT: scratch_load_b32 v104, off, s32 offset:348
-; GFX11-FAKE16-NEXT: scratch_load_b32 v95, off, s32 offset:352
-; GFX11-FAKE16-NEXT: scratch_load_b32 v94, off, s32 offset:356
-; GFX11-FAKE16-NEXT: scratch_load_b32 v93, off, s32 offset:360
-; GFX11-FAKE16-NEXT: scratch_load_b32 v92, off, s32 offset:364
-; GFX11-FAKE16-NEXT: scratch_load_b32 v91, off, s32 offset:368
-; GFX11-FAKE16-NEXT: scratch_load_b32 v90, off, s32 offset:372
-; GFX11-FAKE16-NEXT: scratch_load_b32 v89, off, s32 offset:376
-; GFX11-FAKE16-NEXT: scratch_load_b32 v88, off, s32 offset:380
-; GFX11-FAKE16-NEXT: scratch_load_b32 v79, off, s32 offset:384
-; GFX11-FAKE16-NEXT: scratch_load_b32 v78, off, s32 offset:388
-; GFX11-FAKE16-NEXT: scratch_load_b32 v77, off, s32 offset:392
-; GFX11-FAKE16-NEXT: scratch_load_b32 v76, off, s32 offset:396
-; GFX11-FAKE16-NEXT: scratch_load_b32 v75, off, s32 offset:400
-; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:404
-; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:408
-; GFX11-FAKE16-NEXT: scratch_load_b32 v72, off, s32 offset:412
-; GFX11-FAKE16-NEXT: scratch_load_b32 v63, off, s32 offset:416
-; GFX11-FAKE16-NEXT: scratch_load_b32 v62, off, s32 offset:420
-; GFX11-FAKE16-NEXT: scratch_load_b32 v61, off, s32 offset:424
-; GFX11-FAKE16-NEXT: scratch_load_b32 v60, off, s32 offset:428
-; GFX11-FAKE16-NEXT: scratch_load_b32 v59, off, s32 offset:432
-; GFX11-FAKE16-NEXT: scratch_load_b32 v58, off, s32 offset:436
-; GFX11-FAKE16-NEXT: scratch_load_b32 v57, off, s32 offset:440
-; GFX11-FAKE16-NEXT: scratch_load_b32 v56, off, s32 offset:444
-; GFX11-FAKE16-NEXT: s_clause 0x7
-; GFX11-FAKE16-NEXT: scratch_load_b32 v47, off, s32 offset:448
-; GFX11-FAKE16-NEXT: scratch_load_b32 v46, off, s32 offset:452
-; GFX11-FAKE16-NEXT: scratch_load_b32 v45, off, s32 offset:456
-; GFX11-FAKE16-NEXT: scratch_load_b32 v44, off, s32 offset:460
-; GFX11-FAKE16-NEXT: scratch_load_b32 v43, off, s32 offset:464
-; GFX11-FAKE16-NEXT: scratch_load_b32 v42, off, s32 offset:468
-; GFX11-FAKE16-NEXT: scratch_load_b32 v41, off, s32 offset:472
-; GFX11-FAKE16-NEXT: scratch_load_b32 v40, off, s32 offset:476
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
-; GFX11-FAKE16-NEXT: .LBB39_4:
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
-; GFX11-FAKE16-NEXT: s_branch .LBB39_2
+; GFX11-LABEL: bitcast_v128i8_to_v32f32_scalar:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_clause 0x1f
+; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:476
+; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:472
+; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:468
+; GFX11-NEXT: scratch_store_b32 off, v43, s32 offset:464
+; GFX11-NEXT: scratch_store_b32 off, v44, s32 offset:460
+; GFX11-NEXT: scratch_store_b32 off, v45, s32 offset:456
+; GFX11-NEXT: scratch_store_b32 off, v46, s32 offset:452
+; GFX11-NEXT: scratch_store_b32 off, v47, s32 offset:448
+; GFX11-NEXT: scratch_store_b32 off, v56, s32 offset:444
+; GFX11-NEXT: scratch_store_b32 off, v57, s32 offset:440
+; GFX11-NEXT: scratch_store_b32 off, v58, s32 offset:436
+; GFX11-NEXT: scratch_store_b32 off, v59, s32 offset:432
+; GFX11-NEXT: scratch_store_b32 off, v60, s32 offset:428
+; GFX11-NEXT: scratch_store_b32 off, v61, s32 offset:424
+; GFX11-NEXT: scratch_store_b32 off, v62, s32 offset:420
+; GFX11-NEXT: scratch_store_b32 off, v63, s32 offset:416
+; GFX11-NEXT: scratch_store_b32 off, v72, s32 offset:412
+; GFX11-NEXT: scratch_store_b32 off, v73, s32 offset:408
+; GFX11-NEXT: scratch_store_b32 off, v74, s32 offset:404
+; GFX11-NEXT: scratch_store_b32 off, v75, s32 offset:400
+; GFX11-NEXT: scratch_store_b32 off, v76, s32 offset:396
+; GFX11-NEXT: scratch_store_b32 off, v77, s32 offset:392
+; GFX11-NEXT: scratch_store_b32 off, v78, s32 offset:388
+; GFX11-NEXT: scratch_store_b32 off, v79, s32 offset:384
+; GFX11-NEXT: scratch_store_b32 off, v88, s32 offset:380
+; GFX11-NEXT: scratch_store_b32 off, v89, s32 offset:376
+; GFX11-NEXT: scratch_store_b32 off, v90, s32 offset:372
+; GFX11-NEXT: scratch_store_b32 off, v91, s32 offset:368
+; GFX11-NEXT: scratch_store_b32 off, v92, s32 offset:364
+; GFX11-NEXT: scratch_store_b32 off, v93, s32 offset:360
+; GFX11-NEXT: scratch_store_b32 off, v94, s32 offset:356
+; GFX11-NEXT: scratch_store_b32 off, v95, s32 offset:352
+; GFX11-NEXT: s_clause 0x7
+; GFX11-NEXT: scratch_store_b32 off, v104, s32 offset:348
+; GFX11-NEXT: scratch_store_b32 off, v105, s32 offset:344
+; GFX11-NEXT: scratch_store_b32 off, v106, s32 offset:340
+; GFX11-NEXT: scratch_store_b32 off, v107, s32 offset:336
+; GFX11-NEXT: scratch_store_b32 off, v108, s32 offset:332
+; GFX11-NEXT: scratch_store_b32 off, v109, s32 offset:328
+; GFX11-NEXT: scratch_store_b32 off, v110, s32 offset:324
+; GFX11-NEXT: scratch_store_b32 off, v111, s32 offset:320
+; GFX11-NEXT: v_dual_mov_b32 v32, v30 :: v_dual_mov_b32 v33, v28
+; GFX11-NEXT: v_dual_mov_b32 v34, v26 :: v_dual_mov_b32 v35, v24
+; GFX11-NEXT: v_dual_mov_b32 v36, v22 :: v_dual_mov_b32 v37, v20
+; GFX11-NEXT: v_dual_mov_b32 v38, v18 :: v_dual_mov_b32 v39, v16
+; GFX11-NEXT: v_dual_mov_b32 v48, v14 :: v_dual_mov_b32 v49, v12
+; GFX11-NEXT: v_dual_mov_b32 v50, v10 :: v_dual_mov_b32 v51, v8
+; GFX11-NEXT: v_dual_mov_b32 v52, v6 :: v_dual_mov_b32 v53, v4
+; GFX11-NEXT: v_dual_mov_b32 v54, v2 :: v_dual_mov_b32 v55, v0
+; GFX11-NEXT: s_clause 0x1f
+; GFX11-NEXT: scratch_load_b32 v0, off, s32 offset:316
+; GFX11-NEXT: scratch_load_u16 v2, off, s32
+; GFX11-NEXT: scratch_load_u16 v4, off, s32 offset:8
+; GFX11-NEXT: scratch_load_u16 v6, off, s32 offset:16
+; GFX11-NEXT: scratch_load_u16 v8, off, s32 offset:24
+; GFX11-NEXT: scratch_load_u16 v10, off, s32 offset:32
+; GFX11-NEXT: scratch_load_u16 v12, off, s32 offset:40
+; GFX11-NEXT: scratch_load_u16 v14, off, s32 offset:48
+; GFX11-NEXT: scratch_load_u16 v16, off, s32 offset:56
+; GFX11-NEXT: scratch_load_u16 v18, off, s32 offset:64
+; GFX11-NEXT: scratch_load_u16 v20, off, s32 offset:72
+; GFX11-NEXT: scratch_load_u16 v22, off, s32 offset:80
+; GFX11-NEXT: scratch_load_u16 v24, off, s32 offset:88
+; GFX11-NEXT: scratch_load_u16 v26, off, s32 offset:96
+; GFX11-NEXT: scratch_load_u16 v28, off, s32 offset:104
+; GFX11-NEXT: scratch_load_u16 v30, off, s32 offset:112
+; GFX11-NEXT: scratch_load_u16 v31, off, s32 offset:120
+; GFX11-NEXT: scratch_load_u16 v87, off, s32 offset:128
+; GFX11-NEXT: scratch_load_u16 v96, off, s32 offset:136
+; GFX11-NEXT: scratch_load_u16 v97, off, s32 offset:144
+; GFX11-NEXT: scratch_load_u16 v98, off, s32 offset:152
+; GFX11-NEXT: scratch_load_u16 v99, off, s32 offset:160
+; GFX11-NEXT: scratch_load_u16 v113, off, s32 offset:168
+; GFX11-NEXT: scratch_load_u16 v114, off, s32 offset:176
+; GFX11-NEXT: scratch_load_u16 v115, off, s32 offset:184
+; GFX11-NEXT: scratch_load_u16 v116, off, s32 offset:192
+; GFX11-NEXT: scratch_load_u16 v117, off, s32 offset:200
+; GFX11-NEXT: scratch_load_u16 v128, off, s32 offset:208
+; GFX11-NEXT: scratch_load_u16 v129, off, s32 offset:216
+; GFX11-NEXT: scratch_load_u16 v130, off, s32 offset:224
+; GFX11-NEXT: scratch_load_u16 v131, off, s32 offset:232
+; GFX11-NEXT: scratch_load_u16 v94, off, s32 offset:240
+; GFX11-NEXT: s_clause 0x1f
+; GFX11-NEXT: scratch_load_u16 v95, off, s32 offset:248
+; GFX11-NEXT: scratch_load_u16 v104, off, s32 offset:256
+; GFX11-NEXT: scratch_load_u16 v105, off, s32 offset:264
+; GFX11-NEXT: scratch_load_u16 v106, off, s32 offset:272
+; GFX11-NEXT: scratch_load_u16 v107, off, s32 offset:280
+; GFX11-NEXT: scratch_load_u16 v108, off, s32 offset:288
+; GFX11-NEXT: scratch_load_u16 v109, off, s32 offset:296
+; GFX11-NEXT: scratch_load_u16 v110, off, s32 offset:304
+; GFX11-NEXT: scratch_load_u16 v111, off, s32 offset:312
+; GFX11-NEXT: scratch_load_u16 v64, off, s32 offset:308
+; GFX11-NEXT: scratch_load_u16 v65, off, s32 offset:300
+; GFX11-NEXT: scratch_load_u16 v66, off, s32 offset:292
+; GFX11-NEXT: scratch_load_u16 v67, off, s32 offset:284
+; GFX11-NEXT: scratch_load_u16 v68, off, s32 offset:276
+; GFX11-NEXT: scratch_load_u16 v69, off, s32 offset:268
+; GFX11-NEXT: scratch_load_u16 v70, off, s32 offset:260
+; GFX11-NEXT: scratch_load_u16 v71, off, s32 offset:252
+; GFX11-NEXT: scratch_load_u16 v80, off, s32 offset:244
+; GFX11-NEXT: scratch_load_u16 v81, off, s32 offset:236
+; GFX11-NEXT: scratch_load_u16 v82, off, s32 offset:228
+; GFX11-NEXT: scratch_load_u16 v83, off, s32 offset:220
+; GFX11-NEXT: scratch_load_u16 v84, off, s32 offset:212
+; GFX11-NEXT: scratch_load_u16 v85, off, s32 offset:204
+; GFX11-NEXT: scratch_load_u16 v86, off, s32 offset:196
+; GFX11-NEXT: scratch_load_u16 v100, off, s32 offset:188
+; GFX11-NEXT: scratch_load_u16 v101, off, s32 offset:180
+; GFX11-NEXT: scratch_load_u16 v102, off, s32 offset:172
+; GFX11-NEXT: scratch_load_u16 v103, off, s32 offset:164
+; GFX11-NEXT: scratch_load_u16 v112, off, s32 offset:156
+; GFX11-NEXT: scratch_load_u16 v118, off, s32 offset:148
+; GFX11-NEXT: scratch_load_u16 v145, off, s32 offset:140
+; GFX11-NEXT: scratch_load_u16 v146, off, s32 offset:132
+; GFX11-NEXT: s_clause 0xf
+; GFX11-NEXT: scratch_load_u16 v147, off, s32 offset:124
+; GFX11-NEXT: scratch_load_u16 v148, off, s32 offset:116
+; GFX11-NEXT: scratch_load_u16 v162, off, s32 offset:108
+; GFX11-NEXT: scratch_load_u16 v163, off, s32 offset:100
+; GFX11-NEXT: scratch_load_u16 v164, off, s32 offset:92
+; GFX11-NEXT: scratch_load_u16 v165, off, s32 offset:84
+; GFX11-NEXT: scratch_load_u16 v166, off, s32 offset:76
+; GFX11-NEXT: scratch_load_u16 v180, off, s32 offset:68
+; GFX11-NEXT: scratch_load_u16 v181, off, s32 offset:60
+; GFX11-NEXT: scratch_load_u16 v182, off, s32 offset:52
+; GFX11-NEXT: scratch_load_u16 v183, off, s32 offset:44
+; GFX11-NEXT: scratch_load_u16 v45, off, s32 offset:36
+; GFX11-NEXT: scratch_load_u16 v46, off, s32 offset:28
+; GFX11-NEXT: scratch_load_u16 v47, off, s32 offset:20
+; GFX11-NEXT: scratch_load_u16 v56, off, s32 offset:12
+; GFX11-NEXT: scratch_load_u16 v62, off, s32 offset:4
+; GFX11-NEXT: v_lshlrev_b32_e32 v89, 8, v1
+; GFX11-NEXT: v_lshlrev_b32_e32 v90, 8, v3
+; GFX11-NEXT: v_lshlrev_b32_e32 v91, 8, v5
+; GFX11-NEXT: v_lshlrev_b32_e32 v92, 8, v7
+; GFX11-NEXT: v_lshlrev_b32_e32 v93, 8, v9
+; GFX11-NEXT: v_lshlrev_b32_e32 v76, 8, v11
+; GFX11-NEXT: v_lshlrev_b32_e32 v77, 8, v13
+; GFX11-NEXT: v_lshlrev_b32_e32 v78, 8, v15
+; GFX11-NEXT: v_lshlrev_b32_e32 v79, 8, v17
+; GFX11-NEXT: v_lshlrev_b32_e32 v88, 8, v19
+; GFX11-NEXT: v_lshlrev_b32_e32 v63, 8, v21
+; GFX11-NEXT: v_lshlrev_b32_e32 v72, 8, v23
+; GFX11-NEXT: v_lshlrev_b32_e32 v73, 8, v25
+; GFX11-NEXT: v_lshlrev_b32_e32 v74, 8, v27
+; GFX11-NEXT: v_lshlrev_b32_e32 v75, 8, v29
+; GFX11-NEXT: s_mov_b32 s4, 0
+; GFX11-NEXT: s_waitcnt vmcnt(62)
+; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v57, 8, v2
+; GFX11-NEXT: v_lshlrev_b32_e32 v58, 8, v4
+; GFX11-NEXT: v_lshlrev_b32_e32 v59, 8, v6
+; GFX11-NEXT: v_lshlrev_b32_e32 v60, 8, v8
+; GFX11-NEXT: v_lshlrev_b32_e32 v61, 8, v10
+; GFX11-NEXT: v_lshlrev_b32_e32 v40, 8, v12
+; GFX11-NEXT: v_lshlrev_b32_e32 v41, 8, v14
+; GFX11-NEXT: v_lshlrev_b32_e32 v42, 8, v16
+; GFX11-NEXT: v_lshlrev_b32_e32 v43, 8, v18
+; GFX11-NEXT: v_lshlrev_b32_e32 v44, 8, v20
+; GFX11-NEXT: v_lshlrev_b32_e32 v167, 8, v22
+; GFX11-NEXT: v_lshlrev_b32_e32 v176, 8, v24
+; GFX11-NEXT: v_lshlrev_b32_e32 v177, 8, v26
+; GFX11-NEXT: v_lshlrev_b32_e32 v178, 8, v28
+; GFX11-NEXT: v_lshlrev_b32_e32 v179, 8, v30
+; GFX11-NEXT: v_lshlrev_b32_e32 v149, 8, v31
+; GFX11-NEXT: v_lshlrev_b32_e32 v150, 8, v87
+; GFX11-NEXT: s_waitcnt vmcnt(61)
+; GFX11-NEXT: v_lshlrev_b32_e32 v151, 8, v96
+; GFX11-NEXT: s_waitcnt vmcnt(60)
+; GFX11-NEXT: v_lshlrev_b32_e32 v160, 8, v97
+; GFX11-NEXT: s_waitcnt vmcnt(59)
+; GFX11-NEXT: v_lshlrev_b32_e32 v161, 8, v98
+; GFX11-NEXT: s_waitcnt vmcnt(58)
+; GFX11-NEXT: v_lshlrev_b32_e32 v132, 8, v99
+; GFX11-NEXT: s_waitcnt vmcnt(57)
+; GFX11-NEXT: v_lshlrev_b32_e32 v133, 8, v113
+; GFX11-NEXT: s_waitcnt vmcnt(56)
+; GFX11-NEXT: v_lshlrev_b32_e32 v134, 8, v114
+; GFX11-NEXT: s_waitcnt vmcnt(55)
+; GFX11-NEXT: v_lshlrev_b32_e32 v135, 8, v115
+; GFX11-NEXT: s_waitcnt vmcnt(54)
+; GFX11-NEXT: v_lshlrev_b32_e32 v144, 8, v116
+; GFX11-NEXT: s_waitcnt vmcnt(53)
+; GFX11-NEXT: v_lshlrev_b32_e32 v119, 8, v117
+; GFX11-NEXT: s_waitcnt vmcnt(52)
+; GFX11-NEXT: v_lshlrev_b32_e32 v128, 8, v128
+; GFX11-NEXT: s_waitcnt vmcnt(51)
+; GFX11-NEXT: v_lshlrev_b32_e32 v129, 8, v129
+; GFX11-NEXT: s_waitcnt vmcnt(50)
+; GFX11-NEXT: v_lshlrev_b32_e32 v130, 8, v130
+; GFX11-NEXT: s_waitcnt vmcnt(49)
+; GFX11-NEXT: v_lshlrev_b32_e32 v131, 8, v131
+; GFX11-NEXT: s_waitcnt vmcnt(48)
+; GFX11-NEXT: v_lshlrev_b32_e32 v113, 8, v94
+; GFX11-NEXT: s_waitcnt vmcnt(47)
+; GFX11-NEXT: v_lshlrev_b32_e32 v114, 8, v95
+; GFX11-NEXT: s_waitcnt vmcnt(46)
+; GFX11-NEXT: v_lshlrev_b32_e32 v115, 8, v104
+; GFX11-NEXT: s_waitcnt vmcnt(45)
+; GFX11-NEXT: v_lshlrev_b32_e32 v116, 8, v105
+; GFX11-NEXT: s_waitcnt vmcnt(44)
+; GFX11-NEXT: v_lshlrev_b32_e32 v117, 8, v106
+; GFX11-NEXT: s_waitcnt vmcnt(43)
+; GFX11-NEXT: v_lshlrev_b32_e32 v87, 8, v107
+; GFX11-NEXT: s_waitcnt vmcnt(42)
+; GFX11-NEXT: v_lshlrev_b32_e32 v96, 8, v108
+; GFX11-NEXT: s_waitcnt vmcnt(41)
+; GFX11-NEXT: v_lshlrev_b32_e32 v97, 8, v109
+; GFX11-NEXT: s_waitcnt vmcnt(40)
+; GFX11-NEXT: v_lshlrev_b32_e32 v98, 8, v110
+; GFX11-NEXT: s_waitcnt vmcnt(39)
+; GFX11-NEXT: v_lshlrev_b32_e32 v99, 8, v111
+; GFX11-NEXT: s_and_b32 s5, vcc_lo, exec_lo
+; GFX11-NEXT: s_cbranch_scc0 .LBB39_4
+; GFX11-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v54
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v53
+; GFX11-NEXT: s_and_b32 s5, s28, 0xff
+; GFX11-NEXT: s_lshl_b32 s6, s29, 8
+; GFX11-NEXT: s_and_b32 s7, s2, 0xff
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v90
+; GFX11-NEXT: v_or_b32_e32 v1, v1, v91
+; GFX11-NEXT: s_or_b32 s5, s5, s6
+; GFX11-NEXT: s_lshl_b32 s6, s1, 8
+; GFX11-NEXT: s_and_b32 s5, s5, 0xffff
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: s_lshl_b32 s8, s3, 8
+; GFX11-NEXT: s_and_b32 s9, s18, 0xff
+; GFX11-NEXT: s_lshl_b32 s10, s19, 8
+; GFX11-NEXT: s_and_b32 s11, s26, 0xff
+; GFX11-NEXT: v_or_b32_e32 v5, v0, v1
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v50
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v49
+; GFX11-NEXT: s_lshl_b32 s12, s27, 8
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v76
+; GFX11-NEXT: v_or_b32_e32 v1, v1, v77
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_or_b32_e32 v7, v0, v1
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v48
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v39
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v78
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_or_b32_e32 v1, v1, v79
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: v_or_b32_e32 v8, v0, v1
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v38
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v37
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v88
+; GFX11-NEXT: v_or_b32_e32 v1, v1, v63
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_or_b32_e32 v9, v0, v1
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v36
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v35
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v72
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_or_b32_e32 v1, v1, v73
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: v_or_b32_e32 v10, v0, v1
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v34
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v33
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v74
+; GFX11-NEXT: v_or_b32_e32 v1, v1, v75
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_or_b32_e32 v11, v0, v1
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v32
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v62
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v57
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_or_b32_e32 v1, v1, v58
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: v_or_b32_e32 v12, v0, v1
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v56
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v47
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v59
+; GFX11-NEXT: v_or_b32_e32 v1, v1, v60
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_or_b32_e32 v13, v0, v1
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v46
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v45
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v61
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_or_b32_e32 v1, v1, v40
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: v_or_b32_e32 v14, v0, v1
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v183
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v182
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v41
+; GFX11-NEXT: v_or_b32_e32 v1, v1, v42
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_or_b32_e32 v15, v0, v1
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v181
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v180
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v43
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_or_b32_e32 v1, v1, v44
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: v_or_b32_e32 v16, v0, v1
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v166
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v165
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v167
+; GFX11-NEXT: v_or_b32_e32 v1, v1, v176
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_or_b32_e32 v17, v0, v1
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v164
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v163
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v177
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_or_b32_e32 v1, v1, v178
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: v_or_b32_e32 v18, v0, v1
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v162
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v148
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v179
+; GFX11-NEXT: v_or_b32_e32 v1, v1, v149
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_or_b32_e32 v19, v0, v1
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v147
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v146
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v150
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_or_b32_e32 v1, v1, v151
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: v_or_b32_e32 v20, v0, v1
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v145
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v118
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v160
+; GFX11-NEXT: v_or_b32_e32 v1, v1, v161
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_or_b32_e32 v21, v0, v1
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v112
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v103
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v132
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_or_b32_e32 v1, v1, v133
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: v_or_b32_e32 v22, v0, v1
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v102
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v101
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v134
+; GFX11-NEXT: v_or_b32_e32 v1, v1, v135
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_or_b32_e32 v23, v0, v1
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v100
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v86
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v144
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_or_b32_e32 v1, v1, v119
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: v_or_b32_e32 v24, v0, v1
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v85
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v84
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v128
+; GFX11-NEXT: v_or_b32_e32 v1, v1, v129
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_or_b32_e32 v25, v0, v1
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v83
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v82
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v130
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_or_b32_e32 v1, v1, v131
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: v_or_b32_e32 v26, v0, v1
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v81
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v80
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v113
+; GFX11-NEXT: v_or_b32_e32 v1, v1, v114
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_or_b32_e32 v27, v0, v1
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v71
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v70
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v115
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_or_b32_e32 v1, v1, v116
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: v_or_b32_e32 v28, v0, v1
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v69
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v68
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v117
+; GFX11-NEXT: v_or_b32_e32 v1, v1, v87
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_or_b32_e32 v29, v0, v1
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v67
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v66
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v96
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_or_b32_e32 v1, v1, v97
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: v_or_b32_e32 v30, v0, v1
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v65
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v64
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v98
+; GFX11-NEXT: v_or_b32_e32 v1, v1, v99
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_or_b32_e32 v31, v0, v1
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v55
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v89
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-NEXT: v_or_b32_e32 v4, s5, v0
+; GFX11-NEXT: s_and_b32 s5, s0, 0xff
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_or_b32 s5, s5, s6
+; GFX11-NEXT: s_or_b32 s6, s7, s8
+; GFX11-NEXT: s_and_b32 s7, s16, 0xff
+; GFX11-NEXT: s_lshl_b32 s8, s17, 8
+; GFX11-NEXT: s_and_b32 s5, s5, 0xffff
+; GFX11-NEXT: s_or_b32 s7, s7, s8
+; GFX11-NEXT: s_or_b32 s8, s9, s10
+; GFX11-NEXT: s_lshl_b32 s6, s6, 16
+; GFX11-NEXT: s_and_b32 s7, s7, 0xffff
+; GFX11-NEXT: s_lshl_b32 s8, s8, 16
+; GFX11-NEXT: s_or_b32 s5, s5, s6
+; GFX11-NEXT: s_or_b32 s6, s7, s8
+; GFX11-NEXT: s_and_b32 s7, s20, 0xff
+; GFX11-NEXT: s_lshl_b32 s8, s21, 8
+; GFX11-NEXT: s_and_b32 s9, s22, 0xff
+; GFX11-NEXT: s_lshl_b32 s10, s23, 8
+; GFX11-NEXT: s_or_b32 s7, s7, s8
+; GFX11-NEXT: s_or_b32 s8, s9, s10
+; GFX11-NEXT: s_and_b32 s9, s24, 0xff
+; GFX11-NEXT: s_lshl_b32 s10, s25, 8
+; GFX11-NEXT: s_and_b32 s7, s7, 0xffff
+; GFX11-NEXT: s_or_b32 s9, s9, s10
+; GFX11-NEXT: s_or_b32 s10, s11, s12
+; GFX11-NEXT: s_lshl_b32 s8, s8, 16
+; GFX11-NEXT: s_and_b32 s9, s9, 0xffff
+; GFX11-NEXT: s_lshl_b32 s10, s10, 16
+; GFX11-NEXT: s_or_b32 s7, s7, s8
+; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v51
+; GFX11-NEXT: s_or_b32 s8, s9, s10
+; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v52
+; GFX11-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, s6
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_or_b32_e32 v3, v3, v93
+; GFX11-NEXT: v_or_b32_e32 v2, v2, v92
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_or_b32_e32 v6, v2, v3
+; GFX11-NEXT: v_dual_mov_b32 v2, s7 :: v_dual_mov_b32 v3, s8
+; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
+; GFX11-NEXT: s_cbranch_vccnz .LBB39_3
+; GFX11-NEXT: .LBB39_2: ; %cmp.true
+; GFX11-NEXT: s_add_i32 s0, s0, 3
+; GFX11-NEXT: s_add_i32 s2, s2, 3
+; GFX11-NEXT: s_and_b32 s0, s0, 0xff
+; GFX11-NEXT: s_lshl_b32 s1, s1, 8
+; GFX11-NEXT: s_and_b32 s2, s2, 0xff
+; GFX11-NEXT: s_lshl_b32 s3, s3, 8
+; GFX11-NEXT: s_or_b32 s0, s1, s0
+; GFX11-NEXT: s_or_b32 s1, s3, s2
+; GFX11-NEXT: s_addk_i32 s0, 0x300
+; GFX11-NEXT: s_addk_i32 s1, 0x300
+; GFX11-NEXT: s_and_b32 s0, s0, 0xffff
+; GFX11-NEXT: s_lshl_b32 s1, s1, 16
+; GFX11-NEXT: s_add_i32 s16, s16, 3
+; GFX11-NEXT: s_or_b32 s0, s0, s1
+; GFX11-NEXT: s_and_b32 s1, s16, 0xff
+; GFX11-NEXT: s_lshl_b32 s2, s17, 8
+; GFX11-NEXT: s_add_i32 s18, s18, 3
+; GFX11-NEXT: s_or_b32 s1, s2, s1
+; GFX11-NEXT: s_and_b32 s2, s18, 0xff
+; GFX11-NEXT: s_lshl_b32 s3, s19, 8
+; GFX11-NEXT: s_addk_i32 s1, 0x300
+; GFX11-NEXT: s_or_b32 s2, s3, s2
+; GFX11-NEXT: s_add_i32 s20, s20, 3
+; GFX11-NEXT: s_addk_i32 s2, 0x300
+; GFX11-NEXT: s_and_b32 s1, s1, 0xffff
+; GFX11-NEXT: s_lshl_b32 s2, s2, 16
+; GFX11-NEXT: s_and_b32 s3, s20, 0xff
+; GFX11-NEXT: s_lshl_b32 s4, s21, 8
+; GFX11-NEXT: s_add_i32 s22, s22, 3
+; GFX11-NEXT: s_or_b32 s1, s1, s2
+; GFX11-NEXT: s_or_b32 s2, s4, s3
+; GFX11-NEXT: s_and_b32 s3, s22, 0xff
+; GFX11-NEXT: s_lshl_b32 s4, s23, 8
+; GFX11-NEXT: s_add_i32 s24, s24, 3
+; GFX11-NEXT: s_or_b32 s3, s4, s3
+; GFX11-NEXT: s_and_b32 s4, s24, 0xff
+; GFX11-NEXT: s_lshl_b32 s5, s25, 8
+; GFX11-NEXT: s_addk_i32 s2, 0x300
+; GFX11-NEXT: s_addk_i32 s3, 0x300
+; GFX11-NEXT: s_or_b32 s4, s5, s4
+; GFX11-NEXT: s_and_b32 s2, s2, 0xffff
+; GFX11-NEXT: s_lshl_b32 s3, s3, 16
+; GFX11-NEXT: s_addk_i32 s4, 0x300
+; GFX11-NEXT: s_add_i32 s26, s26, 3
+; GFX11-NEXT: s_or_b32 s2, s2, s3
+; GFX11-NEXT: s_and_b32 s3, s4, 0xffff
+; GFX11-NEXT: s_and_b32 s4, s26, 0xff
+; GFX11-NEXT: s_lshl_b32 s5, s27, 8
+; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v55
+; GFX11-NEXT: s_or_b32 s4, s5, s4
+; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v54
+; GFX11-NEXT: s_addk_i32 s4, 0x300
+; GFX11-NEXT: v_add_nc_u32_e32 v3, 3, v52
+; GFX11-NEXT: s_lshl_b32 s4, s4, 16
+; GFX11-NEXT: v_add_nc_u32_e32 v4, 3, v51
+; GFX11-NEXT: s_or_b32 s3, s3, s4
+; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v53
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1
+; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v3
+; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v4
+; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v2
+; GFX11-NEXT: v_add_nc_u32_e32 v7, 3, v38
+; GFX11-NEXT: v_add_nc_u32_e32 v11, 3, v34
+; GFX11-NEXT: v_add_nc_u32_e32 v12, 3, v33
+; GFX11-NEXT: v_or_b32_e32 v0, v89, v0
+; GFX11-NEXT: v_or_b32_e32 v1, v90, v1
+; GFX11-NEXT: v_or_b32_e32 v2, v91, v2
+; GFX11-NEXT: v_or_b32_e32 v3, v92, v3
+; GFX11-NEXT: v_or_b32_e32 v4, v93, v4
+; GFX11-NEXT: v_and_b32_e32 v7, 0xff, v7
+; GFX11-NEXT: v_and_b32_e32 v11, 0xff, v11
+; GFX11-NEXT: v_and_b32_e32 v12, 0xff, v12
+; GFX11-NEXT: s_add_i32 s28, s28, 3
+; GFX11-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
+; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
+; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
+; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x300, v3
+; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x300, v4
+; GFX11-NEXT: v_or_b32_e32 v7, v88, v7
+; GFX11-NEXT: v_or_b32_e32 v11, v74, v11
+; GFX11-NEXT: v_or_b32_e32 v12, v75, v12
+; GFX11-NEXT: s_and_b32 s5, s28, 0xff
+; GFX11-NEXT: s_lshl_b32 s6, s29, 8
+; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-NEXT: s_or_b32 s5, s6, s5
+; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v4
+; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x300, v7
+; GFX11-NEXT: v_add_nc_u32_e32 v11, 0x300, v11
+; GFX11-NEXT: v_add_nc_u32_e32 v12, 0x300, v12
+; GFX11-NEXT: s_addk_i32 s5, 0x300
+; GFX11-NEXT: v_or_b32_e32 v5, v1, v2
+; GFX11-NEXT: s_and_b32 s5, s5, 0xffff
+; GFX11-NEXT: v_or_b32_e32 v6, v3, v6
+; GFX11-NEXT: v_or_b32_e32 v4, s5, v0
+; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v50
+; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v49
+; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v48
+; GFX11-NEXT: v_add_nc_u32_e32 v3, 3, v39
+; GFX11-NEXT: v_and_b32_e32 v9, 0xffff, v7
+; GFX11-NEXT: v_add_nc_u32_e32 v7, 3, v37
+; GFX11-NEXT: v_add_nc_u32_e32 v8, 3, v36
+; GFX11-NEXT: v_add_nc_u32_e32 v10, 3, v35
+; GFX11-NEXT: v_and_b32_e32 v11, 0xffff, v11
+; GFX11-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1
+; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v2
+; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v3
+; GFX11-NEXT: v_and_b32_e32 v7, 0xff, v7
+; GFX11-NEXT: v_and_b32_e32 v8, 0xff, v8
+; GFX11-NEXT: v_and_b32_e32 v10, 0xff, v10
+; GFX11-NEXT: v_or_b32_e32 v11, v11, v12
+; GFX11-NEXT: s_waitcnt vmcnt(3)
+; GFX11-NEXT: v_add_nc_u32_e32 v12, 3, v46
+; GFX11-NEXT: v_add_nc_u32_e32 v16, 3, v181
+; GFX11-NEXT: v_add_nc_u32_e32 v17, 3, v180
+; GFX11-NEXT: v_or_b32_e32 v0, v76, v0
+; GFX11-NEXT: v_or_b32_e32 v1, v77, v1
+; GFX11-NEXT: v_or_b32_e32 v2, v78, v2
+; GFX11-NEXT: v_or_b32_e32 v3, v79, v3
+; GFX11-NEXT: v_or_b32_e32 v7, v63, v7
+; GFX11-NEXT: v_or_b32_e32 v8, v72, v8
+; GFX11-NEXT: v_or_b32_e32 v10, v73, v10
+; GFX11-NEXT: v_and_b32_e32 v12, 0xff, v12
+; GFX11-NEXT: v_and_b32_e32 v16, 0xff, v16
+; GFX11-NEXT: v_and_b32_e32 v17, 0xff, v17
+; GFX11-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
+; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
+; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
+; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x300, v3
+; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x300, v7
+; GFX11-NEXT: v_add_nc_u32_e32 v8, 0x300, v8
+; GFX11-NEXT: v_add_nc_u32_e32 v10, 0x300, v10
+; GFX11-NEXT: v_or_b32_e32 v12, v61, v12
+; GFX11-NEXT: v_or_b32_e32 v16, v43, v16
+; GFX11-NEXT: v_or_b32_e32 v17, v44, v17
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-NEXT: v_lshlrev_b32_e32 v13, 16, v7
+; GFX11-NEXT: v_and_b32_e32 v14, 0xffff, v8
+; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; GFX11-NEXT: v_add_nc_u32_e32 v12, 0x300, v12
+; GFX11-NEXT: v_add_nc_u32_e32 v16, 0x300, v16
+; GFX11-NEXT: v_add_nc_u32_e32 v17, 0x300, v17
+; GFX11-NEXT: v_or_b32_e32 v7, v0, v1
+; GFX11-NEXT: v_or_b32_e32 v8, v2, v3
+; GFX11-NEXT: v_or_b32_e32 v9, v9, v13
+; GFX11-NEXT: v_or_b32_e32 v10, v14, v10
+; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v32
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v62
+; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v56
+; GFX11-NEXT: v_add_nc_u32_e32 v3, 3, v47
+; GFX11-NEXT: v_and_b32_e32 v14, 0xffff, v12
+; GFX11-NEXT: v_add_nc_u32_e32 v12, 3, v45
+; GFX11-NEXT: v_add_nc_u32_e32 v13, 3, v183
+; GFX11-NEXT: v_add_nc_u32_e32 v15, 3, v182
+; GFX11-NEXT: v_and_b32_e32 v16, 0xffff, v16
+; GFX11-NEXT: v_lshlrev_b32_e32 v17, 16, v17
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1
+; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v2
+; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v3
+; GFX11-NEXT: v_and_b32_e32 v12, 0xff, v12
+; GFX11-NEXT: v_and_b32_e32 v13, 0xff, v13
+; GFX11-NEXT: v_and_b32_e32 v15, 0xff, v15
+; GFX11-NEXT: v_or_b32_e32 v16, v16, v17
+; GFX11-NEXT: v_add_nc_u32_e32 v17, 3, v162
+; GFX11-NEXT: v_add_nc_u32_e32 v21, 3, v145
+; GFX11-NEXT: v_add_nc_u32_e32 v22, 3, v118
+; GFX11-NEXT: v_or_b32_e32 v0, v57, v0
+; GFX11-NEXT: v_or_b32_e32 v1, v58, v1
+; GFX11-NEXT: v_or_b32_e32 v2, v59, v2
+; GFX11-NEXT: v_or_b32_e32 v3, v60, v3
+; GFX11-NEXT: v_or_b32_e32 v12, v40, v12
+; GFX11-NEXT: v_or_b32_e32 v13, v41, v13
+; GFX11-NEXT: v_or_b32_e32 v15, v42, v15
+; GFX11-NEXT: v_and_b32_e32 v17, 0xff, v17
+; GFX11-NEXT: v_and_b32_e32 v21, 0xff, v21
+; GFX11-NEXT: v_and_b32_e32 v22, 0xff, v22
+; GFX11-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
+; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
+; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
+; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x300, v3
+; GFX11-NEXT: v_add_nc_u32_e32 v12, 0x300, v12
+; GFX11-NEXT: v_add_nc_u32_e32 v13, 0x300, v13
+; GFX11-NEXT: v_add_nc_u32_e32 v15, 0x300, v15
+; GFX11-NEXT: v_or_b32_e32 v17, v179, v17
+; GFX11-NEXT: v_or_b32_e32 v21, v160, v21
+; GFX11-NEXT: v_or_b32_e32 v22, v161, v22
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v12
+; GFX11-NEXT: v_and_b32_e32 v19, 0xffff, v13
+; GFX11-NEXT: v_lshlrev_b32_e32 v15, 16, v15
+; GFX11-NEXT: v_add_nc_u32_e32 v17, 0x300, v17
+; GFX11-NEXT: v_add_nc_u32_e32 v21, 0x300, v21
+; GFX11-NEXT: v_add_nc_u32_e32 v22, 0x300, v22
+; GFX11-NEXT: v_or_b32_e32 v12, v0, v1
+; GFX11-NEXT: v_or_b32_e32 v13, v2, v3
+; GFX11-NEXT: v_or_b32_e32 v14, v14, v18
+; GFX11-NEXT: v_or_b32_e32 v15, v19, v15
+; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v166
+; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v165
+; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v164
+; GFX11-NEXT: v_add_nc_u32_e32 v3, 3, v163
+; GFX11-NEXT: v_and_b32_e32 v19, 0xffff, v17
+; GFX11-NEXT: v_add_nc_u32_e32 v17, 3, v148
+; GFX11-NEXT: v_add_nc_u32_e32 v18, 3, v147
+; GFX11-NEXT: v_add_nc_u32_e32 v20, 3, v146
+; GFX11-NEXT: v_and_b32_e32 v21, 0xffff, v21
+; GFX11-NEXT: v_lshlrev_b32_e32 v22, 16, v22
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1
+; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v2
+; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v3
+; GFX11-NEXT: v_and_b32_e32 v17, 0xff, v17
+; GFX11-NEXT: v_and_b32_e32 v18, 0xff, v18
+; GFX11-NEXT: v_and_b32_e32 v20, 0xff, v20
+; GFX11-NEXT: v_or_b32_e32 v21, v21, v22
+; GFX11-NEXT: v_add_nc_u32_e32 v22, 3, v100
+; GFX11-NEXT: v_add_nc_u32_e32 v26, 3, v83
+; GFX11-NEXT: v_add_nc_u32_e32 v27, 3, v82
+; GFX11-NEXT: v_or_b32_e32 v0, v167, v0
+; GFX11-NEXT: v_or_b32_e32 v1, v176, v1
+; GFX11-NEXT: v_or_b32_e32 v2, v177, v2
+; GFX11-NEXT: v_or_b32_e32 v3, v178, v3
+; GFX11-NEXT: v_or_b32_e32 v17, v149, v17
+; GFX11-NEXT: v_or_b32_e32 v18, v150, v18
+; GFX11-NEXT: v_or_b32_e32 v20, v151, v20
+; GFX11-NEXT: v_and_b32_e32 v22, 0xff, v22
+; GFX11-NEXT: v_and_b32_e32 v26, 0xff, v26
+; GFX11-NEXT: v_and_b32_e32 v27, 0xff, v27
+; GFX11-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
+; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
+; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
+; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x300, v3
+; GFX11-NEXT: v_add_nc_u32_e32 v17, 0x300, v17
+; GFX11-NEXT: v_add_nc_u32_e32 v18, 0x300, v18
+; GFX11-NEXT: v_add_nc_u32_e32 v20, 0x300, v20
+; GFX11-NEXT: v_or_b32_e32 v22, v144, v22
+; GFX11-NEXT: v_or_b32_e32 v26, v130, v26
+; GFX11-NEXT: v_or_b32_e32 v27, v131, v27
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-NEXT: v_lshlrev_b32_e32 v23, 16, v17
+; GFX11-NEXT: v_and_b32_e32 v24, 0xffff, v18
+; GFX11-NEXT: v_lshlrev_b32_e32 v20, 16, v20
+; GFX11-NEXT: v_add_nc_u32_e32 v22, 0x300, v22
+; GFX11-NEXT: v_add_nc_u32_e32 v26, 0x300, v26
+; GFX11-NEXT: v_add_nc_u32_e32 v27, 0x300, v27
+; GFX11-NEXT: v_or_b32_e32 v17, v0, v1
+; GFX11-NEXT: v_or_b32_e32 v18, v2, v3
+; GFX11-NEXT: v_or_b32_e32 v19, v19, v23
+; GFX11-NEXT: v_or_b32_e32 v20, v24, v20
+; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v112
+; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v103
+; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v102
+; GFX11-NEXT: v_add_nc_u32_e32 v3, 3, v101
+; GFX11-NEXT: v_and_b32_e32 v24, 0xffff, v22
+; GFX11-NEXT: v_add_nc_u32_e32 v22, 3, v86
+; GFX11-NEXT: v_add_nc_u32_e32 v23, 3, v85
+; GFX11-NEXT: v_add_nc_u32_e32 v25, 3, v84
+; GFX11-NEXT: v_and_b32_e32 v26, 0xffff, v26
+; GFX11-NEXT: v_lshlrev_b32_e32 v27, 16, v27
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1
+; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v2
+; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v3
+; GFX11-NEXT: v_and_b32_e32 v22, 0xff, v22
+; GFX11-NEXT: v_and_b32_e32 v23, 0xff, v23
+; GFX11-NEXT: v_and_b32_e32 v25, 0xff, v25
+; GFX11-NEXT: v_or_b32_e32 v26, v26, v27
+; GFX11-NEXT: v_add_nc_u32_e32 v27, 3, v69
+; GFX11-NEXT: v_or_b32_e32 v0, v132, v0
+; GFX11-NEXT: v_or_b32_e32 v1, v133, v1
+; GFX11-NEXT: v_or_b32_e32 v2, v134, v2
+; GFX11-NEXT: v_or_b32_e32 v3, v135, v3
+; GFX11-NEXT: v_or_b32_e32 v22, v119, v22
+; GFX11-NEXT: v_or_b32_e32 v23, v128, v23
+; GFX11-NEXT: v_or_b32_e32 v25, v129, v25
+; GFX11-NEXT: v_and_b32_e32 v27, 0xff, v27
+; GFX11-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
+; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
+; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
+; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x300, v3
+; GFX11-NEXT: v_add_nc_u32_e32 v22, 0x300, v22
+; GFX11-NEXT: v_add_nc_u32_e32 v23, 0x300, v23
+; GFX11-NEXT: v_add_nc_u32_e32 v25, 0x300, v25
+; GFX11-NEXT: v_or_b32_e32 v27, v117, v27
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-NEXT: v_lshlrev_b32_e32 v28, 16, v22
+; GFX11-NEXT: v_and_b32_e32 v29, 0xffff, v23
+; GFX11-NEXT: v_lshlrev_b32_e32 v25, 16, v25
+; GFX11-NEXT: v_add_nc_u32_e32 v27, 0x300, v27
+; GFX11-NEXT: v_or_b32_e32 v22, v0, v1
+; GFX11-NEXT: v_or_b32_e32 v23, v2, v3
+; GFX11-NEXT: v_or_b32_e32 v24, v24, v28
+; GFX11-NEXT: v_or_b32_e32 v25, v29, v25
+; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v81
+; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v80
+; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v71
+; GFX11-NEXT: v_add_nc_u32_e32 v3, 3, v70
+; GFX11-NEXT: v_and_b32_e32 v29, 0xffff, v27
+; GFX11-NEXT: v_add_nc_u32_e32 v27, 3, v68
+; GFX11-NEXT: v_add_nc_u32_e32 v28, 3, v67
+; GFX11-NEXT: v_add_nc_u32_e32 v30, 3, v66
+; GFX11-NEXT: v_add_nc_u32_e32 v31, 3, v65
+; GFX11-NEXT: v_add_nc_u32_e32 v32, 3, v64
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1
+; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v2
+; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v3
+; GFX11-NEXT: v_and_b32_e32 v27, 0xff, v27
+; GFX11-NEXT: v_and_b32_e32 v28, 0xff, v28
+; GFX11-NEXT: v_and_b32_e32 v30, 0xff, v30
+; GFX11-NEXT: v_and_b32_e32 v31, 0xff, v31
+; GFX11-NEXT: v_and_b32_e32 v32, 0xff, v32
+; GFX11-NEXT: v_or_b32_e32 v0, v113, v0
+; GFX11-NEXT: v_or_b32_e32 v1, v114, v1
+; GFX11-NEXT: v_or_b32_e32 v2, v115, v2
+; GFX11-NEXT: v_or_b32_e32 v3, v116, v3
+; GFX11-NEXT: v_or_b32_e32 v27, v87, v27
+; GFX11-NEXT: v_or_b32_e32 v28, v96, v28
+; GFX11-NEXT: v_or_b32_e32 v30, v97, v30
+; GFX11-NEXT: v_or_b32_e32 v31, v98, v31
+; GFX11-NEXT: v_or_b32_e32 v32, v99, v32
+; GFX11-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
+; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
+; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
+; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x300, v3
+; GFX11-NEXT: v_add_nc_u32_e32 v27, 0x300, v27
+; GFX11-NEXT: v_add_nc_u32_e32 v28, 0x300, v28
+; GFX11-NEXT: v_add_nc_u32_e32 v30, 0x300, v30
+; GFX11-NEXT: v_add_nc_u32_e32 v31, 0x300, v31
+; GFX11-NEXT: v_add_nc_u32_e32 v32, 0x300, v32
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-NEXT: v_lshlrev_b32_e32 v33, 16, v27
+; GFX11-NEXT: v_and_b32_e32 v34, 0xffff, v28
+; GFX11-NEXT: v_lshlrev_b32_e32 v30, 16, v30
+; GFX11-NEXT: v_and_b32_e32 v31, 0xffff, v31
+; GFX11-NEXT: v_lshlrev_b32_e32 v32, 16, v32
+; GFX11-NEXT: v_or_b32_e32 v27, v0, v1
+; GFX11-NEXT: v_or_b32_e32 v28, v2, v3
+; GFX11-NEXT: v_or_b32_e32 v29, v29, v33
+; GFX11-NEXT: v_or_b32_e32 v30, v34, v30
+; GFX11-NEXT: v_or_b32_e32 v31, v31, v32
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-NEXT: .LBB39_3: ; %end
+; GFX11-NEXT: s_clause 0x1f
+; GFX11-NEXT: scratch_load_b32 v111, off, s32 offset:320
+; GFX11-NEXT: scratch_load_b32 v110, off, s32 offset:324
+; GFX11-NEXT: scratch_load_b32 v109, off, s32 offset:328
+; GFX11-NEXT: scratch_load_b32 v108, off, s32 offset:332
+; GFX11-NEXT: scratch_load_b32 v107, off, s32 offset:336
+; GFX11-NEXT: scratch_load_b32 v106, off, s32 offset:340
+; GFX11-NEXT: scratch_load_b32 v105, off, s32 offset:344
+; GFX11-NEXT: scratch_load_b32 v104, off, s32 offset:348
+; GFX11-NEXT: scratch_load_b32 v95, off, s32 offset:352
+; GFX11-NEXT: scratch_load_b32 v94, off, s32 offset:356
+; GFX11-NEXT: scratch_load_b32 v93, off, s32 offset:360
+; GFX11-NEXT: scratch_load_b32 v92, off, s32 offset:364
+; GFX11-NEXT: scratch_load_b32 v91, off, s32 offset:368
+; GFX11-NEXT: scratch_load_b32 v90, off, s32 offset:372
+; GFX11-NEXT: scratch_load_b32 v89, off, s32 offset:376
+; GFX11-NEXT: scratch_load_b32 v88, off, s32 offset:380
+; GFX11-NEXT: scratch_load_b32 v79, off, s32 offset:384
+; GFX11-NEXT: scratch_load_b32 v78, off, s32 offset:388
+; GFX11-NEXT: scratch_load_b32 v77, off, s32 offset:392
+; GFX11-NEXT: scratch_load_b32 v76, off, s32 offset:396
+; GFX11-NEXT: scratch_load_b32 v75, off, s32 offset:400
+; GFX11-NEXT: scratch_load_b32 v74, off, s32 offset:404
+; GFX11-NEXT: scratch_load_b32 v73, off, s32 offset:408
+; GFX11-NEXT: scratch_load_b32 v72, off, s32 offset:412
+; GFX11-NEXT: scratch_load_b32 v63, off, s32 offset:416
+; GFX11-NEXT: scratch_load_b32 v62, off, s32 offset:420
+; GFX11-NEXT: scratch_load_b32 v61, off, s32 offset:424
+; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:428
+; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:432
+; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:436
+; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:440
+; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:444
+; GFX11-NEXT: s_clause 0x7
+; GFX11-NEXT: scratch_load_b32 v47, off, s32 offset:448
+; GFX11-NEXT: scratch_load_b32 v46, off, s32 offset:452
+; GFX11-NEXT: scratch_load_b32 v45, off, s32 offset:456
+; GFX11-NEXT: scratch_load_b32 v44, off, s32 offset:460
+; GFX11-NEXT: scratch_load_b32 v43, off, s32 offset:464
+; GFX11-NEXT: scratch_load_b32 v42, off, s32 offset:468
+; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:472
+; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:476
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-NEXT: .LBB39_4:
+; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+; GFX11-NEXT: s_branch .LBB39_2
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -86760,876 +85034,913 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) {
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: s_clause 0x1f
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:384
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:380
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:376
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:372
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v50, off, s32 offset:368
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:364
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v51, off, s32 offset:360
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v33, off, s32 offset:356
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v51, off, s32 offset:352
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:348
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v52, off, s32 offset:344
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v34, off, s32 offset:340
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v52, off, s32 offset:336
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:332
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v53, off, s32 offset:328
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v35, off, s32 offset:324
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v53, off, s32 offset:320
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v36, off, s32 offset:316
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v54, off, s32 offset:312
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v36, off, s32 offset:308
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v54, off, s32 offset:304
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v37, off, s32 offset:300
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v55, off, s32 offset:296
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v37, off, s32 offset:292
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v64, off, s32 offset:288
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:284
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v64, off, s32 offset:280
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v38, off, s32 offset:276
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v66, off, s32 offset:272
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v39, off, s32 offset:268
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v66, off, s32 offset:264
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v39, off, s32 offset:260
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:536
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:532
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:528
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v43, s32 offset:524
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v44, s32 offset:520
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v45, s32 offset:516
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v46, s32 offset:512
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v47, s32 offset:508
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v56, s32 offset:504
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v57, s32 offset:500
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v58, s32 offset:496
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v59, s32 offset:492
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v60, s32 offset:488
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v61, s32 offset:484
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v62, s32 offset:480
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v63, s32 offset:476
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v72, s32 offset:472
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v73, s32 offset:468
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v74, s32 offset:464
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v75, s32 offset:460
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v76, s32 offset:456
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v77, s32 offset:452
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v78, s32 offset:448
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v79, s32 offset:444
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v88, s32 offset:440
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v89, s32 offset:436
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v90, s32 offset:432
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v91, s32 offset:428
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v92, s32 offset:424
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:420
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:416
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:412
+; GFX11-TRUE16-NEXT: s_clause 0x4
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:408
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:404
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:400
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v107, s32 offset:396
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v108, s32 offset:392
; GFX11-TRUE16-NEXT: s_clause 0x1f
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v67, off, s32 offset:256
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v48, off, s32 offset:252
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v68, off, s32 offset:248
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v48, off, s32 offset:244
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v70, off, s32 offset:240
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v49, off, s32 offset:236
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v70, off, s32 offset:232
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v49, off, s32 offset:228
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v71, off, s32 offset:224
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v50, off, s32 offset:220
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v71, off, s32 offset:216
-; GFX11-TRUE16-NEXT: scratch_load_b32 v114, off, s32 offset:388
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v81, off, s32
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v82, off, s32 offset:8
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v83, off, s32 offset:16
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v83, off, s32 offset:24
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v85, off, s32 offset:32
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v85, off, s32 offset:40
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v87, off, s32 offset:48
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v87, off, s32 offset:56
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v97, off, s32 offset:64
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v97, off, s32 offset:72
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v98, off, s32 offset:80
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v99, off, s32 offset:88
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v100, off, s32 offset:96
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v101, off, s32 offset:104
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v102, off, s32 offset:112
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v160, off, s32 offset:120
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v160, off, s32 offset:128
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v161, off, s32 offset:136
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v161, off, s32 offset:144
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v162, off, s32 offset:152
+; GFX11-TRUE16-NEXT: scratch_load_u16 v31, off, s32 offset:384
+; GFX11-TRUE16-NEXT: scratch_load_u16 v112, off, s32 offset:380
+; GFX11-TRUE16-NEXT: scratch_load_u16 v32, off, s32 offset:376
+; GFX11-TRUE16-NEXT: scratch_load_u16 v113, off, s32 offset:372
+; GFX11-TRUE16-NEXT: scratch_load_u16 v56, off, s32 offset:368
+; GFX11-TRUE16-NEXT: scratch_load_u16 v114, off, s32 offset:364
+; GFX11-TRUE16-NEXT: scratch_load_u16 v33, off, s32 offset:360
+; GFX11-TRUE16-NEXT: scratch_load_u16 v115, off, s32 offset:356
+; GFX11-TRUE16-NEXT: scratch_load_u16 v57, off, s32 offset:352
+; GFX11-TRUE16-NEXT: scratch_load_u16 v116, off, s32 offset:348
+; GFX11-TRUE16-NEXT: scratch_load_u16 v34, off, s32 offset:344
+; GFX11-TRUE16-NEXT: scratch_load_u16 v117, off, s32 offset:340
+; GFX11-TRUE16-NEXT: scratch_load_u16 v58, off, s32 offset:336
+; GFX11-TRUE16-NEXT: scratch_load_u16 v118, off, s32 offset:332
+; GFX11-TRUE16-NEXT: scratch_load_u16 v35, off, s32 offset:328
+; GFX11-TRUE16-NEXT: scratch_load_u16 v119, off, s32 offset:324
+; GFX11-TRUE16-NEXT: scratch_load_u16 v59, off, s32 offset:320
+; GFX11-TRUE16-NEXT: scratch_load_u16 v128, off, s32 offset:316
+; GFX11-TRUE16-NEXT: scratch_load_u16 v36, off, s32 offset:312
+; GFX11-TRUE16-NEXT: scratch_load_u16 v129, off, s32 offset:308
+; GFX11-TRUE16-NEXT: scratch_load_u16 v60, off, s32 offset:304
+; GFX11-TRUE16-NEXT: scratch_load_u16 v130, off, s32 offset:300
+; GFX11-TRUE16-NEXT: scratch_load_u16 v37, off, s32 offset:296
+; GFX11-TRUE16-NEXT: scratch_load_u16 v131, off, s32 offset:292
+; GFX11-TRUE16-NEXT: scratch_load_u16 v61, off, s32 offset:288
+; GFX11-TRUE16-NEXT: scratch_load_u16 v132, off, s32 offset:284
+; GFX11-TRUE16-NEXT: scratch_load_u16 v38, off, s32 offset:280
+; GFX11-TRUE16-NEXT: scratch_load_u16 v133, off, s32 offset:276
+; GFX11-TRUE16-NEXT: scratch_load_u16 v62, off, s32 offset:272
+; GFX11-TRUE16-NEXT: scratch_load_u16 v134, off, s32 offset:268
+; GFX11-TRUE16-NEXT: scratch_load_u16 v39, off, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_load_u16 v135, off, s32 offset:260
; GFX11-TRUE16-NEXT: s_clause 0x1f
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v162, off, s32 offset:160
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v163, off, s32 offset:168
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v163, off, s32 offset:176
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v164, off, s32 offset:184
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v164, off, s32 offset:192
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v165, off, s32 offset:200
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v165, off, s32 offset:208
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v55, off, s32 offset:212
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v65, off, s32 offset:204
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v65, off, s32 offset:196
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v67, off, s32 offset:188
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v68, off, s32 offset:180
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v69, off, s32 offset:172
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v69, off, s32 offset:164
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v80, off, s32 offset:156
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v80, off, s32 offset:148
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v81, off, s32 offset:140
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v82, off, s32 offset:132
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v84, off, s32 offset:124
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v84, off, s32 offset:116
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v86, off, s32 offset:108
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v86, off, s32 offset:100
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v96, off, s32 offset:92
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v96, off, s32 offset:84
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v98, off, s32 offset:76
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v99, off, s32 offset:68
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v100, off, s32 offset:60
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v101, off, s32 offset:52
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v103, off, s32 offset:44
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v103, off, s32 offset:36
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v112, off, s32 offset:28
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v113, off, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_load_u16 v63, off, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_load_u16 v144, off, s32 offset:252
+; GFX11-TRUE16-NEXT: scratch_load_u16 v48, off, s32 offset:248
+; GFX11-TRUE16-NEXT: scratch_load_u16 v145, off, s32 offset:244
+; GFX11-TRUE16-NEXT: scratch_load_u16 v72, off, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_load_u16 v146, off, s32 offset:236
+; GFX11-TRUE16-NEXT: scratch_load_u16 v49, off, s32 offset:232
+; GFX11-TRUE16-NEXT: scratch_load_u16 v147, off, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_load_u16 v73, off, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_load_u16 v148, off, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_load_u16 v50, off, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_load_b32 v51, off, s32 offset:388
+; GFX11-TRUE16-NEXT: scratch_load_u16 v52, off, s32
+; GFX11-TRUE16-NEXT: scratch_load_u16 v53, off, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_load_u16 v54, off, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_load_u16 v55, off, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_load_u16 v64, off, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_load_u16 v65, off, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_load_u16 v66, off, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_load_u16 v67, off, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_load_u16 v74, off, s32 offset:64
+; GFX11-TRUE16-NEXT: scratch_load_u16 v75, off, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_load_u16 v76, off, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_load_u16 v77, off, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_load_u16 v78, off, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_load_u16 v79, off, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_load_u16 v88, off, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_load_u16 v89, off, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_load_u16 v90, off, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_load_u16 v91, off, s32 offset:136
+; GFX11-TRUE16-NEXT: scratch_load_u16 v92, off, s32 offset:144
+; GFX11-TRUE16-NEXT: scratch_load_u16 v93, off, s32 offset:152
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_load_u16 v94, off, s32 offset:160
+; GFX11-TRUE16-NEXT: scratch_load_u16 v95, off, s32 offset:168
+; GFX11-TRUE16-NEXT: scratch_load_u16 v104, off, s32 offset:176
+; GFX11-TRUE16-NEXT: scratch_load_u16 v105, off, s32 offset:184
+; GFX11-TRUE16-NEXT: scratch_load_u16 v106, off, s32 offset:192
+; GFX11-TRUE16-NEXT: scratch_load_u16 v107, off, s32 offset:200
+; GFX11-TRUE16-NEXT: scratch_load_u16 v108, off, s32 offset:208
+; GFX11-TRUE16-NEXT: scratch_load_u16 v149, off, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_load_u16 v150, off, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_load_u16 v151, off, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_load_u16 v160, off, s32 offset:188
+; GFX11-TRUE16-NEXT: scratch_load_u16 v161, off, s32 offset:180
+; GFX11-TRUE16-NEXT: scratch_load_u16 v162, off, s32 offset:172
+; GFX11-TRUE16-NEXT: scratch_load_u16 v163, off, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_load_u16 v164, off, s32 offset:156
+; GFX11-TRUE16-NEXT: scratch_load_u16 v165, off, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_load_u16 v166, off, s32 offset:140
+; GFX11-TRUE16-NEXT: scratch_load_u16 v167, off, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_load_u16 v176, off, s32 offset:124
+; GFX11-TRUE16-NEXT: scratch_load_u16 v177, off, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_load_u16 v178, off, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_load_u16 v179, off, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_load_u16 v180, off, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_load_u16 v181, off, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_load_u16 v182, off, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_load_u16 v183, off, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_load_u16 v40, off, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_load_u16 v41, off, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_load_u16 v42, off, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_load_u16 v43, off, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_load_u16 v44, off, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_load_u16 v45, off, s32 offset:20
; GFX11-TRUE16-NEXT: s_clause 0x1
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v115, off, s32 offset:12
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v115, off, s32 offset:4
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v117.l, v30.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v118.h, v28.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v119.l, v26.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v119.h, v24.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v130.l, v22.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v130.h, v20.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v131.l, v18.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v132.h, v16.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v134.l, v14.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v134.h, v12.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v144.l, v10.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v145.h, v8.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v146.h, v6.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v146.l, v4.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v149.l, v2.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v149.h, v0.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v151.l, 8, v1.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v151.h, 8, v3.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v150.l, 8, v5.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v150.h, 8, v7.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v148.l, 8, v9.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v148.h, 8, v11.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v147.l, 8, v13.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v147.h, 8, v15.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v144.h, 8, v17.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v145.l, 8, v19.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v135.l, 8, v21.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v135.h, 8, v23.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v133.l, 8, v25.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v133.h, 8, v27.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v131.h, 8, v29.l
+; GFX11-TRUE16-NEXT: scratch_load_u16 v46, off, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_load_u16 v47, off, s32 offset:4
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v69.l, v30.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v70.h, v28.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v71.l, v26.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v71.h, v24.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v82.l, v22.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v82.h, v20.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v83.l, v18.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v84.h, v16.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v86.l, v14.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v86.h, v12.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v96.l, v10.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v97.h, v8.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v98.h, v6.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v98.l, v4.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v101.l, v2.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v101.h, v0.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v103.l, 8, v1.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v103.h, 8, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v102.l, 8, v5.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v102.h, 8, v7.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v100.l, 8, v9.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v100.h, 8, v11.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v99.l, 8, v13.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v99.h, 8, v15.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v96.h, 8, v17.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v97.l, 8, v19.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v87.l, 8, v21.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v87.h, 8, v23.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v85.l, 8, v25.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v85.h, 8, v27.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v83.h, 8, v29.l
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(62)
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v32.h, 8, v31.l
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v32.l, 8, v32.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v56.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.l, 8, v33.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v57.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.l, 8, v34.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v35.h, 8, v58.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v35.l, 8, v35.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v36.h, 8, v59.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v36.l, 8, v36.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v37.h, 8, v60.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v37.l, 8, v37.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v38.h, 8, v61.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v38.l, 8, v38.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v39.h, 8, v62.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v39.l, 8, v39.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v48.h, 8, v63.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v48.l, 8, v48.l
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(59)
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v49.h, 8, v72.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v49.l, 8, v49.l
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(55)
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.h, 8, v73.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.l, 8, v50.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(54)
-; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v114
+; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v51
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(53)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v132.l, 8, v81.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v84.l, 8, v52.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(52)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.l, 8, v82.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v81.l, 8, v53.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(51)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.h, 8, v83.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v81.h, 8, v54.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(50)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v128.l, 8, v83.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v80.l, 8, v55.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(49)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v128.h, 8, v85.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v80.h, 8, v64.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(48)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v117.h, 8, v85.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v69.h, 8, v65.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(47)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v118.l, 8, v87.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v70.l, 8, v66.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(46)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v116.l, 8, v87.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v68.l, 8, v67.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(45)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v116.h, 8, v97.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v68.h, 8, v74.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(44)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v114.l, 8, v97.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v67.l, 8, v75.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(43)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v114.h, 8, v98.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v67.h, 8, v76.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(42)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v112.h, 8, v99.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v66.l, 8, v77.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(41)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v113.l, 8, v100.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v66.h, 8, v78.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(40)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v102.l, 8, v101.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v65.l, 8, v79.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(39)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v102.h, 8, v102.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v65.h, 8, v88.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(38)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v100.h, 8, v160.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v64.l, 8, v89.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(37)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v101.l, 8, v160.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v64.h, 8, v90.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(36)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v98.h, 8, v161.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.l, 8, v91.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(35)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v99.l, 8, v161.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.h, 8, v92.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(34)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v97.l, 8, v162.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.l, 8, v93.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(33)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v97.h, 8, v162.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.h, 8, v94.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(32)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v87.l, 8, v163.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.l, 8, v95.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(31)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v87.h, 8, v163.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.h, 8, v104.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(30)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v85.l, 8, v164.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.l, 8, v105.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(29)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v85.h, 8, v164.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.h, 8, v106.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(28)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v83.l, 8, v165.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.l, 8, v107.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(27)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v83.h, 8, v165.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v81.h, 8, v71.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v82.l, 8, v71.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v71.l, 8, v70.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v71.h, 8, v70.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v70.l, 8, v68.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v70.h, 8, v67.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v67.h, 8, v66.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v68.l, 8, v66.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v66.l, 8, v64.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v66.h, 8, v64.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v64.l, 8, v55.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v64.h, 8, v54.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.h, 8, v54.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.l, 8, v53.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.h, 8, v53.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.l, 8, v52.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.h, 8, v52.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.l, 8, v51.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.h, 8, v51.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.l, 8, v50.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.h, 8, v31.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.l, 8, v31.l
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.h, 8, v108.l
; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
-; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB58_3
-; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow
-; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
-; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB58_4
-; GFX11-TRUE16-NEXT: .LBB58_2: ; %end
-; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
-; GFX11-TRUE16-NEXT: .LBB58_3: ; %cmp.false
-; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v149.h
-; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v149.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v149.h, 0
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v146.h
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v146.l
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v0.l, v151.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v151.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v0.l, v149.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v1.l, v150.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v3.l, v149.h
-; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v145.h
-; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v144.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v149, v0
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v1.h, v150.l
-; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v132.h
-; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v131.l
-; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v148.h
-; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v130.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v149, v3
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v2.l, v148.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v2.l, v149.h
-; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v134.h
-; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v134.l
-; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v145.l
-; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v130.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v149, v2
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v3.l, v147.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v147.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v3.l, v149.h
-; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v135.h
-; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v119.h
-; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v119.l
-; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v118.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v149, v3
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v4.l, v144.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v4.l, v149.h
-; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v133.h
-; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v117.l
+; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB58_2
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v101.h
+; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v101.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v101.h, 0
+; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v98.h
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v98.l
+; GFX11-TRUE16-NEXT: v_or_b16 v101.l, v0.l, v103.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v103.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v101.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v1.l, v102.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v101.h
+; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v97.h
+; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v96.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v101, v0
+; GFX11-TRUE16-NEXT: v_or_b16 v101.l, v1.h, v102.l
+; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v84.h
+; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v83.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v100.h
+; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v82.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v101, v3
+; GFX11-TRUE16-NEXT: v_or_b16 v101.l, v2.l, v100.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v101.h
+; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v86.h
+; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v86.l
+; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v97.l
+; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v82.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v101, v2
+; GFX11-TRUE16-NEXT: v_or_b16 v101.l, v3.l, v99.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v99.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v101.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v87.h
+; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v71.h
+; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v71.l
+; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v70.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v101, v3
+; GFX11-TRUE16-NEXT: v_or_b16 v101.l, v4.l, v96.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v101.h
+; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v85.h
+; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v69.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v115.h
-; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v115.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v149, v4
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v5.l, v135.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v5.l, v149.h
-; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v132.l
-; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v129.h
-; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v113.h
-; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v112.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v149, v5
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v6.l, v133.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v6.l, v149.h
-; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v103.h
-; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v128.h
-; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v103.l
-; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v101.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v149, v6
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v7.l, v131.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v7.l, v149.h
-; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v118.l
-; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v100.l
-; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v99.h
-; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v98.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v149, v7
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v8.l, v129.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v8.l, v149.h
-; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v11.h, v116.h
-; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v114.h
-; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v96.h
-; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v96.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v149, v8
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v9.l, v128.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v9.l, v149.h
-; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v86.h
-; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v13.h, v113.l
-; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v86.l
-; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v84.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v149, v9
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v10.l, v117.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v10.l, v149.h
-; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v102.h
-; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v84.l
-; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v82.h
-; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v81.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v149, v10
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v11.l, v116.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v11.l, v149.h
-; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v15.h, v101.l
-; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v16.h, v99.l
-; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v80.h
-; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v80.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v149, v11
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v12.l, v114.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v12.l, v149.h
-; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v69.h
-; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v17.h, v97.h
-; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v69.l
-; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v68.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v149, v12
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v13.l, v112.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v13.l, v149.h
-; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v18.h, v87.h
-; GFX11-TRUE16-NEXT: v_and_b16 v19.h, 0xff, v67.l
-; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v65.h
-; GFX11-TRUE16-NEXT: v_and_b16 v20.h, 0xff, v65.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v149, v13
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v14.l, v102.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v14.l, v149.h
-; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v19.h, v85.h
-; GFX11-TRUE16-NEXT: v_or_b16 v20.h, v20.h, v83.h
-; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v55.h
-; GFX11-TRUE16-NEXT: v_and_b16 v21.h, 0xff, v50.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v149, v14
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v15.l, v100.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v15.l, v149.h
-; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v49.h
-; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v21.h, v82.l
-; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v49.l
-; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v48.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v149, v15
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v16.l, v98.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v16.l, v149.h
-; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v22.h, v71.h
-; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v48.l
-; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v39.h
-; GFX11-TRUE16-NEXT: v_and_b16 v24.h, 0xff, v39.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v149, v16
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v17.l, v97.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v17.l, v149.h
-; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v23.h, v70.h
-; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v24.h, v68.l
-; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v38.h
-; GFX11-TRUE16-NEXT: v_and_b16 v25.h, 0xff, v38.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v149, v17
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v18.l, v87.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v18.l, v149.h
-; GFX11-TRUE16-NEXT: v_and_b16 v26.l, 0xff, v37.h
-; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v25.h, v66.h
-; GFX11-TRUE16-NEXT: v_and_b16 v26.h, 0xff, v37.l
-; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v36.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v149, v18
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v19.l, v85.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v19.l, v149.h
-; GFX11-TRUE16-NEXT: v_or_b16 v26.h, v26.h, v64.h
-; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v36.l
-; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v35.h
-; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v35.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v149, v19
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v20.l, v83.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v20.l, v149.h
-; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v27.h, v55.l
-; GFX11-TRUE16-NEXT: v_or_b16 v28.h, v28.h, v54.l
-; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v34.h
-; GFX11-TRUE16-NEXT: v_and_b16 v29.h, 0xff, v34.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v149, v20
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v21.l, v81.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v21.l, v149.h
-; GFX11-TRUE16-NEXT: v_and_b16 v30.l, 0xff, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v29.h, v29.h, v53.l
-; GFX11-TRUE16-NEXT: v_and_b16 v30.h, 0xff, v33.l
-; GFX11-TRUE16-NEXT: v_and_b16 v31.l, 0xff, v32.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v149, v21
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v22.l, v71.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v22.l, v149.h
-; GFX11-TRUE16-NEXT: v_or_b16 v30.h, v30.h, v52.l
-; GFX11-TRUE16-NEXT: v_and_b16 v31.h, 0xff, v32.l
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr146_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr146_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr145_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr134_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr134_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr132_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr131_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr119_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr119_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr117_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr103_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr103_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99_hi16
+; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v47.l
+; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v46.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v101, v4
+; GFX11-TRUE16-NEXT: v_or_b16 v101.l, v5.l, v87.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v101.h
+; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v84.l
+; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v81.h
+; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v45.l
+; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v44.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v101, v5
+; GFX11-TRUE16-NEXT: v_or_b16 v101.l, v6.l, v85.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v101.h
+; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v43.l
+; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v80.h
+; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v42.l
+; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v41.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v101, v6
+; GFX11-TRUE16-NEXT: v_or_b16 v101.l, v7.l, v83.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v101.h
+; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v70.l
+; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v40.l
+; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v183.l
+; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v182.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v101, v7
+; GFX11-TRUE16-NEXT: v_or_b16 v101.l, v8.l, v81.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v101.h
+; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v11.h, v68.h
+; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v67.h
+; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v181.l
+; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v180.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v101, v8
+; GFX11-TRUE16-NEXT: v_or_b16 v101.l, v9.l, v80.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v101.h
+; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v179.l
+; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v13.h, v66.h
+; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v178.l
+; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v177.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v101, v9
+; GFX11-TRUE16-NEXT: v_or_b16 v101.l, v10.l, v69.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v101.h
+; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v65.h
+; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v176.l
+; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v167.l
+; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v166.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v101, v10
+; GFX11-TRUE16-NEXT: v_or_b16 v101.l, v11.l, v68.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v101.h
+; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v15.h, v64.h
+; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v16.h, v55.h
+; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v165.l
+; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v164.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v101, v11
+; GFX11-TRUE16-NEXT: v_or_b16 v101.l, v12.l, v67.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v101.h
+; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v163.l
+; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v17.h, v54.h
+; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v162.l
+; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v161.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v101, v12
+; GFX11-TRUE16-NEXT: v_or_b16 v101.l, v13.l, v66.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v101.h
+; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v18.h, v53.h
+; GFX11-TRUE16-NEXT: v_and_b16 v19.h, 0xff, v160.l
+; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v151.l
+; GFX11-TRUE16-NEXT: v_and_b16 v20.h, 0xff, v150.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v101, v13
+; GFX11-TRUE16-NEXT: v_or_b16 v101.l, v14.l, v65.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v101.h
+; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v19.h, v52.h
+; GFX11-TRUE16-NEXT: v_or_b16 v20.h, v20.h, v51.h
+; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v149.l
+; GFX11-TRUE16-NEXT: v_and_b16 v21.h, 0xff, v148.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v101, v14
+; GFX11-TRUE16-NEXT: v_or_b16 v101.l, v15.l, v64.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v101.h
+; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v147.l
+; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v21.h, v50.h
+; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v146.l
+; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v145.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v101, v15
+; GFX11-TRUE16-NEXT: v_or_b16 v101.l, v16.l, v55.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v101.h
+; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v22.h, v49.h
+; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v144.l
+; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v135.l
+; GFX11-TRUE16-NEXT: v_and_b16 v24.h, 0xff, v134.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v101, v16
+; GFX11-TRUE16-NEXT: v_or_b16 v101.l, v17.l, v54.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v101.h
+; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v23.h, v48.h
+; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v24.h, v39.h
+; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v133.l
+; GFX11-TRUE16-NEXT: v_and_b16 v25.h, 0xff, v132.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v101, v17
+; GFX11-TRUE16-NEXT: v_or_b16 v101.l, v18.l, v53.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v101.h
+; GFX11-TRUE16-NEXT: v_and_b16 v26.l, 0xff, v131.l
+; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v25.h, v38.h
+; GFX11-TRUE16-NEXT: v_and_b16 v26.h, 0xff, v130.l
+; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v129.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v101, v18
+; GFX11-TRUE16-NEXT: v_or_b16 v101.l, v19.l, v52.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v101.h
+; GFX11-TRUE16-NEXT: v_or_b16 v26.h, v26.h, v37.h
+; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v128.l
+; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v119.l
+; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v118.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v101, v19
+; GFX11-TRUE16-NEXT: v_or_b16 v101.l, v20.l, v51.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v101.h
+; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v27.h, v36.h
+; GFX11-TRUE16-NEXT: v_or_b16 v28.h, v28.h, v35.h
+; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v117.l
+; GFX11-TRUE16-NEXT: v_and_b16 v29.h, 0xff, v116.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v101, v20
+; GFX11-TRUE16-NEXT: v_or_b16 v101.l, v21.l, v50.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v101.h
+; GFX11-TRUE16-NEXT: v_and_b16 v30.l, 0xff, v115.l
+; GFX11-TRUE16-NEXT: v_or_b16 v29.h, v29.h, v34.h
+; GFX11-TRUE16-NEXT: v_and_b16 v30.h, 0xff, v114.l
+; GFX11-TRUE16-NEXT: v_and_b16 v31.l, 0xff, v113.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v101, v21
+; GFX11-TRUE16-NEXT: v_or_b16 v101.l, v22.l, v49.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v101.h
+; GFX11-TRUE16-NEXT: v_or_b16 v30.h, v30.h, v33.h
+; GFX11-TRUE16-NEXT: v_and_b16 v31.h, 0xff, v112.l
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr150_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr150_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr147_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr147_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr145_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr135_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr135_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr131_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr132_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr117_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr114_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr114_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr114
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr117
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr119
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr131
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr132
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr134
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr135
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr145
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr146
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr147
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr150
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr160
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr161
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr162
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr163
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr164
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr165
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr166
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr167
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr176
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr177
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr178
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr179
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr180
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr181
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr182
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr183
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr40
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr41
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr42
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr43
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr44
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr45
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr46
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr47
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr103_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr103_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v149, v22
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v23.l, v70.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v23.l, v149.h
-; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v31.h, v51.l
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_hi16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v101, v22
+; GFX11-TRUE16-NEXT: v_or_b16 v101.l, v23.l, v48.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v101.h
+; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v31.h, v32.h
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_hi16
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v149, v23
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v24.l, v67.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v24.l, v149.h
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_hi16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v149, v24
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v25.l, v66.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v25.l, v149.h
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_lo16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v101, v23
+; GFX11-TRUE16-NEXT: v_or_b16 v101.l, v24.l, v39.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v101.h
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v101, v24
+; GFX11-TRUE16-NEXT: v_or_b16 v101.l, v25.l, v38.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v101.h
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v149, v25
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v26.l, v64.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v26.l, v149.h
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_lo16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v149, v26
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v27.l, v54.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v27.l, v149.h
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_hi16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v101, v25
+; GFX11-TRUE16-NEXT: v_or_b16 v101.l, v26.l, v37.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v101.h
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v101, v26
+; GFX11-TRUE16-NEXT: v_or_b16 v101.l, v27.l, v36.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v101.h
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v149, v27
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v28.l, v53.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v28.l, v149.h
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_hi16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v149, v28
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v29.l, v52.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v29.l, v149.h
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_hi16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v101, v27
+; GFX11-TRUE16-NEXT: v_or_b16 v101.l, v28.l, v35.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v101.h
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v101, v28
+; GFX11-TRUE16-NEXT: v_or_b16 v101.l, v29.l, v34.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v101.h
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v149, v29
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v30.l, v51.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v30.l, v149.h
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_hi16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v149, v30
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v31.l, v50.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v31.l, v149.h
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_hi16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v101, v29
+; GFX11-TRUE16-NEXT: v_or_b16 v101.l, v30.l, v33.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v101.h
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v101, v30
+; GFX11-TRUE16-NEXT: v_or_b16 v101.l, v31.l, v32.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, v101.h
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v149, v31
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_lo16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v101, v31
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_lo16
+; GFX11-TRUE16-NEXT: .LBB58_2: ; %Flow
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
-; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB58_2
-; GFX11-TRUE16-NEXT: .LBB58_4: ; %cmp.true
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v149.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v149.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v146.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v146.l, 3
+; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB58_4
+; GFX11-TRUE16-NEXT: ; %bb.3: ; %cmp.true
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v101.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v101.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v98.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v98.l, 3
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.h, 0
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v151.l, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v151.h, v0.h
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v150.h, v1.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v150.l, v1.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v145.h, 3
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v103.l, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v103.h, v0.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v102.h, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v102.l, v1.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v97.h, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v0.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v0.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v144.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v96.l, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, 0x300, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v134.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v86.h, 3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v31, v3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v1.h
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v134.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v86.l, 3
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v148.l, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v100.l, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v148.h, v1.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v100.h, v1.h
; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v31, v4
; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v3.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v31.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v3.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v147.l, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v147.h, v2.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v132.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v131.l, 3
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v99.l, v2.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v99.h, v2.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v84.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v83.l, 3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v31, v5
; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v3.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v3.h
; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v4.l
; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v4.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v130.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v130.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v82.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v82.l, 3
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v144.h, v3.l
-; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v145.l, v3.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v96.h, v3.l
+; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v97.l, v3.h
; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l
; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h
; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v31, v6
; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v5.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x300, v5.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v135.l, v4.l
-; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v135.h, v4.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v119.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v119.l, 3
+; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v87.l, v4.l
+; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v87.h, v4.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v71.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v71.l, 3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v31, v7
; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v5.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, 0x300, v5.h
; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v6.l
; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v6.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v118.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v117.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v70.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v69.l, 3
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v133.l, v5.l
-; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v133.h, v5.h
+; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v85.l, v5.l
+; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v85.h, v5.h
; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l
; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h
; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v31, v8
; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v7.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, 0x300, v7.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v131.h, v6.l
-; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v132.l, v6.h
+; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v83.h, v6.l
+; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v84.l, v6.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v115.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v115.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v47.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v46.l, 3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v31, v9
; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v7.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, 0x300, v7.h
; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v8.l
; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v8.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v113.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v112.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v45.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v44.l, 3
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v129.l, v7.l
-; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v129.h, v7.h
+; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v81.l, v7.l
+; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v81.h, v7.h
; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l
; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h
; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v31, v10
; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v9.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v9.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v128.l, v8.l
-; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v128.h, v8.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, v103.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, v103.l, 3
+; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v80.l, v8.l
+; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v80.h, v8.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, v43.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, v42.l, 3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v31, v11
; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v9.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, 0x300, v9.h
; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v10.l
; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v10.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, v101.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, v100.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, v41.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, v40.l, 3
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v117.h, v9.l
-; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v118.l, v9.h
+; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v69.h, v9.l
+; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v70.l, v9.h
; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l
; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v10.h
; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v31, v12
; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v11.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, 0x300, v11.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v116.l, v10.l
-; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v116.h, v10.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v99.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v98.l, 3
+; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v68.l, v10.l
+; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v68.h, v10.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v183.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v182.l, 3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v31, v13
; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v11.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, 0x300, v11.h
; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v12.l
; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v12.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v96.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v96.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v181.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v180.l, 3
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v114.l, v11.l
-; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v114.h, v11.h
+; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v67.l, v11.l
+; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v67.h, v11.h
; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l
; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h
; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v31, v14
; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v13.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.h, 0x300, v13.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v112.h, v12.l
-; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v113.l, v12.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v86.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, v86.l, 3
+; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v66.l, v12.l
+; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v66.h, v12.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v179.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, v178.l, 3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v31, v15
; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v13.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.h, 0x300, v13.h
; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v14.l
; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v14.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v84.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, v84.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v177.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, v176.l, 3
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v102.l, v13.l
-; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v102.h, v13.h
+; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v65.l, v13.l
+; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v65.h, v13.h
; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l
; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v14.h
; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v31, v16
; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v15.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.h, 0x300, v15.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v100.h, v14.l
-; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v101.l, v14.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.l, v82.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.h, v81.l, 3
+; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v64.l, v14.l
+; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v64.h, v14.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.l, v167.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.h, v166.l, 3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v31, v17
; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v15.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, 0x300, v15.h
; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v16.l
; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v16.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.l, v80.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.h, v80.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.l, v165.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.h, v164.l, 3
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v98.h, v15.l
-; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v99.l, v15.h
+; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v55.l, v15.l
+; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v55.h, v15.h
; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v16.l
; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v16.h
; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v31, v18
; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v17.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.h, 0x300, v17.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v97.l, v16.l
-; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v97.h, v16.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, v69.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, v69.l, 3
+; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v54.l, v16.l
+; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v54.h, v16.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, v163.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, v162.l, 3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v31, v19
; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v17.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.h, 0x300, v17.h
; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v18.l
; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v18.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, v68.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, v67.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, v161.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, v160.l, 3
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v87.l, v17.l
-; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v87.h, v17.h
+; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v53.l, v17.l
+; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v53.h, v17.h
; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v18.l
; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v18.h
; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v31, v20
; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v19.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, 0x300, v19.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v85.l, v18.l
-; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v85.h, v18.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.l, v65.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.h, v65.l, 3
+; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v52.l, v18.l
+; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v52.h, v18.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.l, v151.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.h, v150.l, 3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v31, v21
; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v19.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.h, 0x300, v19.h
; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v20.l
; GFX11-TRUE16-NEXT: v_and_b16 v19.h, 0xff, v20.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.l, v55.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.h, v50.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.l, v149.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.h, v148.l, 3
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v83.l, v19.l
-; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v83.h, v19.h
+; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v51.l, v19.l
+; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v51.h, v19.h
; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v20.l
; GFX11-TRUE16-NEXT: v_and_b16 v20.h, 0xff, v20.h
; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v31, v22
; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v21.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.h, 0x300, v21.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v81.h, v20.l
-; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v82.l, v20.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, v49.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.h, v49.l, 3
+; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v50.l, v20.l
+; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v50.h, v20.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, v147.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.h, v146.l, 3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v31, v23
; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v21.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.h, 0x300, v21.h
; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v22.l
; GFX11-TRUE16-NEXT: v_and_b16 v21.h, 0xff, v22.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, v48.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.h, v48.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, v145.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.h, v144.l, 3
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v71.l, v21.l
-; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v71.h, v21.h
+; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v49.l, v21.l
+; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v49.h, v21.h
; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v22.l
; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v22.h
; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v31, v24
; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v23.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.h, 0x300, v23.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v70.l, v22.l
-; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v70.h, v22.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, v39.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.h, v39.l, 3
+; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v48.l, v22.l
+; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v48.h, v22.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, v135.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.h, v134.l, 3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v31, v25
; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v23.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.h, 0x300, v23.h
; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v24.l
; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v24.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, v38.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.h, v38.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, v133.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.h, v132.l, 3
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v67.h, v23.l
-; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v68.l, v23.h
+; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v39.l, v23.l
+; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v39.h, v23.h
; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v24.l
; GFX11-TRUE16-NEXT: v_and_b16 v24.h, 0xff, v24.h
; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v31, v26
; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v25.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, 0x300, v25.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v66.l, v24.l
-; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v66.h, v24.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.l, v37.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.h, v37.l, 3
+; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v38.l, v24.l
+; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v38.h, v24.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.l, v131.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.h, v130.l, 3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v31, v27
; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v25.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.h, 0x300, v25.h
; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v26.l
; GFX11-TRUE16-NEXT: v_and_b16 v25.h, 0xff, v26.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.l, v36.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.h, v36.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.l, v129.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.h, v128.l, 3
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v64.l, v25.l
-; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v64.h, v25.h
+; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v37.l, v25.l
+; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v37.h, v25.h
; GFX11-TRUE16-NEXT: v_and_b16 v26.l, 0xff, v26.l
; GFX11-TRUE16-NEXT: v_and_b16 v26.h, 0xff, v26.h
; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v31, v28
; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v27.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.h, 0x300, v27.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v54.h, v26.l
-; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v55.l, v26.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, v35.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.h, v35.l, 3
+; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v36.l, v26.l
+; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v36.h, v26.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, v119.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.h, v118.l, 3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v31, v29
; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v27.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.h, 0x300, v27.h
; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v28.l
; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v28.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, v34.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.h, v34.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, v117.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.h, v116.l, 3
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v53.h, v27.l
-; GFX11-TRUE16-NEXT: v_or_b16 v29.h, v54.l, v27.h
+; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v35.l, v27.l
+; GFX11-TRUE16-NEXT: v_or_b16 v29.h, v35.h, v27.h
; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v28.l
; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v28.h
; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v31, v30
; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v29.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v34.h, 0x300, v29.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v52.h, v28.l
-; GFX11-TRUE16-NEXT: v_or_b16 v29.h, v53.l, v28.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.l, v33.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.h, v33.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v31, v34
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v35.h, 0x300, v29.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v31.h
+; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v34.l, v28.l
+; GFX11-TRUE16-NEXT: v_or_b16 v29.h, v34.h, v28.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.l, v115.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.h, v114.l, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v31, v35
; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v29.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v33.h, 0x300, v29.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v34.h, 0x300, v29.h
; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v30.l
; GFX11-TRUE16-NEXT: v_and_b16 v29.h, 0xff, v30.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.l, v32.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.h, v32.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v32.l, v51.h, v29.l
-; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v52.l, v29.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.l, v113.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.h, v112.l, 3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v31.h
+; GFX11-TRUE16-NEXT: v_or_b16 v33.l, v33.l, v29.l
+; GFX11-TRUE16-NEXT: v_or_b16 v33.h, v33.h, v29.h
; GFX11-TRUE16-NEXT: v_and_b16 v30.l, 0xff, v30.l
; GFX11-TRUE16-NEXT: v_and_b16 v30.h, 0xff, v30.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v31, v33
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v32.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v33.h, 0x300, v32.h
-; GFX11-TRUE16-NEXT: v_or_b16 v32.l, v50.h, v30.l
-; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v51.l, v30.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v31, v34
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v33.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v33.h, 0x300, v33.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v31.h
+; GFX11-TRUE16-NEXT: v_or_b16 v32.l, v32.l, v30.l
+; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v32.h, v30.h
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v31, v33
; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v32.l
@@ -87637,7 +85948,48 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_add_nc_u16 v32.h, 0x300, v32.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v31.h
; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v31, v32
+; GFX11-TRUE16-NEXT: .LBB58_4: ; %end
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_load_b32 v108, off, s32 offset:392
+; GFX11-TRUE16-NEXT: scratch_load_b32 v107, off, s32 offset:396
+; GFX11-TRUE16-NEXT: scratch_load_b32 v106, off, s32 offset:400
+; GFX11-TRUE16-NEXT: scratch_load_b32 v105, off, s32 offset:404
+; GFX11-TRUE16-NEXT: scratch_load_b32 v104, off, s32 offset:408
+; GFX11-TRUE16-NEXT: scratch_load_b32 v95, off, s32 offset:412
+; GFX11-TRUE16-NEXT: scratch_load_b32 v94, off, s32 offset:416
+; GFX11-TRUE16-NEXT: scratch_load_b32 v93, off, s32 offset:420
+; GFX11-TRUE16-NEXT: scratch_load_b32 v92, off, s32 offset:424
+; GFX11-TRUE16-NEXT: scratch_load_b32 v91, off, s32 offset:428
+; GFX11-TRUE16-NEXT: scratch_load_b32 v90, off, s32 offset:432
+; GFX11-TRUE16-NEXT: scratch_load_b32 v89, off, s32 offset:436
+; GFX11-TRUE16-NEXT: scratch_load_b32 v88, off, s32 offset:440
+; GFX11-TRUE16-NEXT: scratch_load_b32 v79, off, s32 offset:444
+; GFX11-TRUE16-NEXT: scratch_load_b32 v78, off, s32 offset:448
+; GFX11-TRUE16-NEXT: scratch_load_b32 v77, off, s32 offset:452
+; GFX11-TRUE16-NEXT: scratch_load_b32 v76, off, s32 offset:456
+; GFX11-TRUE16-NEXT: scratch_load_b32 v75, off, s32 offset:460
+; GFX11-TRUE16-NEXT: scratch_load_b32 v74, off, s32 offset:464
+; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:468
+; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:472
+; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:476
+; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:480
+; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:484
+; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:488
+; GFX11-TRUE16-NEXT: scratch_load_b32 v59, off, s32 offset:492
+; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:496
+; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:500
+; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:504
+; GFX11-TRUE16-NEXT: scratch_load_b32 v47, off, s32 offset:508
+; GFX11-TRUE16-NEXT: scratch_load_b32 v46, off, s32 offset:512
+; GFX11-TRUE16-NEXT: scratch_load_b32 v45, off, s32 offset:516
+; GFX11-TRUE16-NEXT: s_clause 0x4
+; GFX11-TRUE16-NEXT: scratch_load_b32 v44, off, s32 offset:520
+; GFX11-TRUE16-NEXT: scratch_load_b32 v43, off, s32 offset:524
+; GFX11-TRUE16-NEXT: scratch_load_b32 v42, off, s32 offset:528
+; GFX11-TRUE16-NEXT: scratch_load_b32 v41, off, s32 offset:532
+; GFX11-TRUE16-NEXT: scratch_load_b32 v40, off, s32 offset:536
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: bitcast_v128i8_to_v16i64:
@@ -91903,1887 +90255,946 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-TRUE16-LABEL: bitcast_v128i8_to_v16i64_scalar:
-; GFX11-TRUE16: ; %bb.0:
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: s_clause 0x1f
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:476
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:472
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:468
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v43, s32 offset:464
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v44, s32 offset:460
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v45, s32 offset:456
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v46, s32 offset:452
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v47, s32 offset:448
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v56, s32 offset:444
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v57, s32 offset:440
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v58, s32 offset:436
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v59, s32 offset:432
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v60, s32 offset:428
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v61, s32 offset:424
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v62, s32 offset:420
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v63, s32 offset:416
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v72, s32 offset:412
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v73, s32 offset:408
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v74, s32 offset:404
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v75, s32 offset:400
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v76, s32 offset:396
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v77, s32 offset:392
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v78, s32 offset:388
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v79, s32 offset:384
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v88, s32 offset:380
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v89, s32 offset:376
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v90, s32 offset:372
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v91, s32 offset:368
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v92, s32 offset:364
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:360
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:356
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:352
-; GFX11-TRUE16-NEXT: s_clause 0x7
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:348
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:344
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:340
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v107, s32 offset:336
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v108, s32 offset:332
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v109, s32 offset:328
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v110, s32 offset:324
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v111, s32 offset:320
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v32, v30 :: v_dual_mov_b32 v33, v28
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v34, v26 :: v_dual_mov_b32 v35, v24
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v36, v22 :: v_dual_mov_b32 v37, v20
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v38, v18 :: v_dual_mov_b32 v39, v16
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v48, v14 :: v_dual_mov_b32 v49, v12
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v50, v10 :: v_dual_mov_b32 v51, v8
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v52, v6 :: v_dual_mov_b32 v53, v4
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v54, v2 :: v_dual_mov_b32 v55, v0
-; GFX11-TRUE16-NEXT: s_clause 0x1f
-; GFX11-TRUE16-NEXT: scratch_load_b32 v0, off, s32 offset:316
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v2, off, s32
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v4, off, s32 offset:8
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v6, off, s32 offset:16
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v8, off, s32 offset:24
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v10, off, s32 offset:32
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v12, off, s32 offset:40
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v14, off, s32 offset:48
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v16, off, s32 offset:56
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v18, off, s32 offset:64
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v20, off, s32 offset:72
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v22, off, s32 offset:80
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v24, off, s32 offset:88
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v26, off, s32 offset:96
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v28, off, s32 offset:104
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v30, off, s32 offset:112
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:120
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v87, off, s32 offset:128
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v96, off, s32 offset:136
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v97, off, s32 offset:144
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v98, off, s32 offset:152
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v99, off, s32 offset:160
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v113, off, s32 offset:168
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v114, off, s32 offset:176
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v115, off, s32 offset:184
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v116, off, s32 offset:192
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v117, off, s32 offset:200
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v128, off, s32 offset:208
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v129, off, s32 offset:216
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v130, off, s32 offset:224
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v131, off, s32 offset:232
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v94, off, s32 offset:240
-; GFX11-TRUE16-NEXT: s_clause 0x1f
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v95, off, s32 offset:248
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v104, off, s32 offset:256
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v105, off, s32 offset:264
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v106, off, s32 offset:272
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v107, off, s32 offset:280
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v108, off, s32 offset:288
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v109, off, s32 offset:296
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v110, off, s32 offset:304
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v111, off, s32 offset:312
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v64, off, s32 offset:308
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v65, off, s32 offset:300
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v66, off, s32 offset:292
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v67, off, s32 offset:284
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v68, off, s32 offset:276
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v69, off, s32 offset:268
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v70, off, s32 offset:260
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v71, off, s32 offset:252
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v80, off, s32 offset:244
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v81, off, s32 offset:236
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v82, off, s32 offset:228
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v83, off, s32 offset:220
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v84, off, s32 offset:212
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v85, off, s32 offset:204
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v86, off, s32 offset:196
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v100, off, s32 offset:188
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v101, off, s32 offset:180
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v102, off, s32 offset:172
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v103, off, s32 offset:164
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v112, off, s32 offset:156
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v118, off, s32 offset:148
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v145, off, s32 offset:140
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v146, off, s32 offset:132
-; GFX11-TRUE16-NEXT: s_clause 0xf
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v147, off, s32 offset:124
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v148, off, s32 offset:116
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v162, off, s32 offset:108
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v163, off, s32 offset:100
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v164, off, s32 offset:92
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v165, off, s32 offset:84
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v166, off, s32 offset:76
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v180, off, s32 offset:68
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v181, off, s32 offset:60
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v182, off, s32 offset:52
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v183, off, s32 offset:44
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v45, off, s32 offset:36
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v46, off, s32 offset:28
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v47, off, s32 offset:20
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v56, off, s32 offset:12
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v62, off, s32 offset:4
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v89, 8, v1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v90, 8, v3
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v91, 8, v5
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v92, 8, v7
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v93, 8, v9
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v76, 8, v11
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v77, 8, v13
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v78, 8, v15
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v79, 8, v17
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v88, 8, v19
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v63, 8, v21
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v72, 8, v23
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v73, 8, v25
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v74, 8, v27
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v75, 8, v29
-; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(62)
-; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v57, 8, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v58, 8, v4
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v59, 8, v6
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v60, 8, v8
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v61, 8, v10
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v40, 8, v12
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v41, 8, v14
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v42, 8, v16
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v43, 8, v18
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v44, 8, v20
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v167, 8, v22
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v176, 8, v24
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v177, 8, v26
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v178, 8, v28
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v179, 8, v30
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v149, 8, v31
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v150, 8, v87
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(61)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v151, 8, v96
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(60)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v160, 8, v97
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(59)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v161, 8, v98
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(58)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v132, 8, v99
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(57)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v133, 8, v113
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(56)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v134, 8, v114
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(55)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v135, 8, v115
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(54)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v144, 8, v116
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(53)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v119, 8, v117
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(52)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v128, 8, v128
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(51)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v129, 8, v129
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(50)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v130, 8, v130
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(49)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v131, 8, v131
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(48)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v113, 8, v94
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(47)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v114, 8, v95
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(46)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v115, 8, v104
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(45)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v116, 8, v105
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(44)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v117, 8, v106
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(43)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v87, 8, v107
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(42)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v96, 8, v108
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(41)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v97, 8, v109
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(40)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v98, 8, v110
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(39)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v99, 8, v111
-; GFX11-TRUE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo
-; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB59_4
-; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v54
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v53
-; GFX11-TRUE16-NEXT: s_and_b32 s5, s28, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s29, 8
-; GFX11-TRUE16-NEXT: s_and_b32 s7, s2, 0xff
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v90
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v91
-; GFX11-TRUE16-NEXT: s_or_b32 s5, s5, s6
-; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s1, 8
-; GFX11-TRUE16-NEXT: s_and_b32 s5, s5, 0xffff
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s3, 8
-; GFX11-TRUE16-NEXT: s_and_b32 s9, s18, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s19, 8
-; GFX11-TRUE16-NEXT: s_and_b32 s11, s26, 0xff
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v50
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v49
-; GFX11-TRUE16-NEXT: s_lshl_b32 s12, s27, 8
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v76
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v77
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v48
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v39
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v78
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v79
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v38
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v37
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v88
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v63
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v36
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v35
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v72
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v73
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v34
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v33
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v74
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v75
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v32
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v62
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v57
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v58
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v56
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v47
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v59
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v60
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v46
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v45
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v61
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v40
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v183
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v182
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v41
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v42
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v181
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v180
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v43
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v44
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v166
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v165
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v167
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v176
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v164
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v163
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v177
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v178
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v162
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v148
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v179
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v149
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v147
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v146
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v150
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v151
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v145
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v118
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v160
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v161
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v112
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v103
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v132
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v133
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v102
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v101
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v134
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v135
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v100
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v86
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v144
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v119
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v85
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v84
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v128
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v129
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v83
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v82
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v130
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v131
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v81
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v80
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v113
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v114
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v71
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v70
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v115
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v116
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v69
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v68
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v117
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v87
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v67
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v66
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v96
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v97
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v65
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v64
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v98
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v99
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v55
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v89
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, s5, v0
-; GFX11-TRUE16-NEXT: s_and_b32 s5, s0, 0xff
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-TRUE16-NEXT: s_or_b32 s5, s5, s6
-; GFX11-TRUE16-NEXT: s_or_b32 s6, s7, s8
-; GFX11-TRUE16-NEXT: s_and_b32 s7, s16, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s17, 8
-; GFX11-TRUE16-NEXT: s_and_b32 s5, s5, 0xffff
-; GFX11-TRUE16-NEXT: s_or_b32 s7, s7, s8
-; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s10
-; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s6, 16
-; GFX11-TRUE16-NEXT: s_and_b32 s7, s7, 0xffff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s8, 16
-; GFX11-TRUE16-NEXT: s_or_b32 s5, s5, s6
-; GFX11-TRUE16-NEXT: s_or_b32 s6, s7, s8
-; GFX11-TRUE16-NEXT: s_and_b32 s7, s20, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s21, 8
-; GFX11-TRUE16-NEXT: s_and_b32 s9, s22, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s23, 8
-; GFX11-TRUE16-NEXT: s_or_b32 s7, s7, s8
-; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s10
-; GFX11-TRUE16-NEXT: s_and_b32 s9, s24, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s25, 8
-; GFX11-TRUE16-NEXT: s_and_b32 s7, s7, 0xffff
-; GFX11-TRUE16-NEXT: s_or_b32 s9, s9, s10
-; GFX11-TRUE16-NEXT: s_or_b32 s10, s11, s12
-; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s8, 16
-; GFX11-TRUE16-NEXT: s_and_b32 s9, s9, 0xffff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s10, 16
-; GFX11-TRUE16-NEXT: s_or_b32 s7, s7, s8
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v51
-; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s10
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v52
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, s6
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v93
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v92
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v2, v3
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s7 :: v_dual_mov_b32 v3, s8
-; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
-; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB59_3
-; GFX11-TRUE16-NEXT: .LBB59_2: ; %cmp.true
-; GFX11-TRUE16-NEXT: s_add_i32 s0, s0, 3
-; GFX11-TRUE16-NEXT: s_add_i32 s2, s2, 3
-; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 8
-; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8
-; GFX11-TRUE16-NEXT: s_or_b32 s0, s1, s0
-; GFX11-TRUE16-NEXT: s_or_b32 s1, s3, s2
-; GFX11-TRUE16-NEXT: s_addk_i32 s0, 0x300
-; GFX11-TRUE16-NEXT: s_addk_i32 s1, 0x300
-; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xffff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 16
-; GFX11-TRUE16-NEXT: s_add_i32 s16, s16, 3
-; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s1
-; GFX11-TRUE16-NEXT: s_and_b32 s1, s16, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s17, 8
-; GFX11-TRUE16-NEXT: s_add_i32 s18, s18, 3
-; GFX11-TRUE16-NEXT: s_or_b32 s1, s2, s1
-; GFX11-TRUE16-NEXT: s_and_b32 s2, s18, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s19, 8
-; GFX11-TRUE16-NEXT: s_addk_i32 s1, 0x300
-; GFX11-TRUE16-NEXT: s_or_b32 s2, s3, s2
-; GFX11-TRUE16-NEXT: s_add_i32 s20, s20, 3
-; GFX11-TRUE16-NEXT: s_addk_i32 s2, 0x300
-; GFX11-TRUE16-NEXT: s_and_b32 s1, s1, 0xffff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s2, 16
-; GFX11-TRUE16-NEXT: s_and_b32 s3, s20, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s21, 8
-; GFX11-TRUE16-NEXT: s_add_i32 s22, s22, 3
-; GFX11-TRUE16-NEXT: s_or_b32 s1, s1, s2
-; GFX11-TRUE16-NEXT: s_or_b32 s2, s4, s3
-; GFX11-TRUE16-NEXT: s_and_b32 s3, s22, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s23, 8
-; GFX11-TRUE16-NEXT: s_add_i32 s24, s24, 3
-; GFX11-TRUE16-NEXT: s_or_b32 s3, s4, s3
-; GFX11-TRUE16-NEXT: s_and_b32 s4, s24, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s25, 8
-; GFX11-TRUE16-NEXT: s_addk_i32 s2, 0x300
-; GFX11-TRUE16-NEXT: s_addk_i32 s3, 0x300
-; GFX11-TRUE16-NEXT: s_or_b32 s4, s5, s4
-; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xffff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 16
-; GFX11-TRUE16-NEXT: s_addk_i32 s4, 0x300
-; GFX11-TRUE16-NEXT: s_add_i32 s26, s26, 3
-; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3
-; GFX11-TRUE16-NEXT: s_and_b32 s3, s4, 0xffff
-; GFX11-TRUE16-NEXT: s_and_b32 s4, s26, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s27, 8
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v55
-; GFX11-TRUE16-NEXT: s_or_b32 s4, s5, s4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v54
-; GFX11-TRUE16-NEXT: s_addk_i32 s4, 0x300
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v52
-; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s4, 16
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v51
-; GFX11-TRUE16-NEXT: s_or_b32 s3, s3, s4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v53
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v2
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v38
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 3, v34
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 3, v33
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v89, v0
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v90, v1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v91, v2
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v92, v3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v93, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v7
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v11
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xff, v12
-; GFX11-TRUE16-NEXT: s_add_i32 s28, s28, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x300, v4
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v88, v7
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v74, v11
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v75, v12
-; GFX11-TRUE16-NEXT: s_and_b32 s5, s28, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s29, 8
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX11-TRUE16-NEXT: s_or_b32 s5, s6, s5
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 0x300, v7
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 0x300, v11
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 0x300, v12
-; GFX11-TRUE16-NEXT: s_addk_i32 s5, 0x300
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v1, v2
-; GFX11-TRUE16-NEXT: s_and_b32 s5, s5, 0xffff
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v3, v6
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, s5, v0
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v50
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v49
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v48
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v39
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v7
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v37
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v36
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 3, v35
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v11
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v12
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v7
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v8
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xff, v10
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v11, v12
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(3)
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 3, v46
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 3, v181
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v17, 3, v180
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v76, v0
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v77, v1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v78, v2
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v79, v3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v63, v7
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v72, v8
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v73, v10
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xff, v12
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xff, v16
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xff, v17
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 0x300, v7
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 0x300, v8
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 0x300, v10
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v61, v12
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v43, v16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v44, v17
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v7
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v8
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 0x300, v12
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 0x300, v16
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v17, 0x300, v17
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v0, v1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v2, v3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v9, v13
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v14, v10
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v32
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v62
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v56
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v47
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v12
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 3, v45
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v13, 3, v183
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v15, 3, v182
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v16
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v17
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xff, v12
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xff, v13
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xff, v15
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v16, v17
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v17, 3, v162
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v21, 3, v145
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v22, 3, v118
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v57, v0
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v58, v1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v59, v2
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v60, v3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v40, v12
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v41, v13
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v42, v15
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xff, v17
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xff, v21
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xff, v22
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 0x300, v12
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v13, 0x300, v13
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v15, 0x300, v15
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v179, v17
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v160, v21
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v161, v22
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v12
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v13
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 16, v15
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v17, 0x300, v17
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v21, 0x300, v21
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v22, 0x300, v22
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v0, v1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v2, v3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v14, v18
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v19, v15
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v166
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v165
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v164
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v163
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v17
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v17, 3, v148
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v18, 3, v147
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v20, 3, v146
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v21
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v22
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xff, v17
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xff, v18
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xff, v20
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v21, v22
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v22, 3, v100
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v26, 3, v83
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v27, 3, v82
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v167, v0
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v176, v1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v177, v2
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v178, v3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v149, v17
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v150, v18
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v151, v20
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xff, v22
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xff, v26
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xff, v27
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v17, 0x300, v17
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v18, 0x300, v18
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v20, 0x300, v20
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v144, v22
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v130, v26
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v131, v27
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v17
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v18
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 16, v20
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v22, 0x300, v22
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v26, 0x300, v26
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v27, 0x300, v27
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v0, v1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v2, v3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v19, v23
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v24, v20
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v112
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v103
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v102
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v101
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v22
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v22, 3, v86
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v23, 3, v85
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v25, 3, v84
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v26
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v27, 16, v27
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xff, v22
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xff, v23
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xff, v25
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v26, v27
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v27, 3, v69
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v132, v0
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v133, v1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v134, v2
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v135, v3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v119, v22
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v128, v23
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v129, v25
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xff, v27
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v22, 0x300, v22
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v23, 0x300, v23
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v25, 0x300, v25
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v117, v27
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 16, v22
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v23
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 16, v25
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v27, 0x300, v27
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v0, v1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v2, v3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v24, v28
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v29, v25
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v81
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v80
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v71
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v70
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v27
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v27, 3, v68
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v28, 3, v67
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v30, 3, v66
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v31, 3, v65
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v32, 3, v64
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xff, v27
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v28, 0xff, v28
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v30, 0xff, v30
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xff, v31
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xff, v32
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v113, v0
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v114, v1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v115, v2
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v116, v3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v87, v27
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v96, v28
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v97, v30
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v98, v31
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v32, v99, v32
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v27, 0x300, v27
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v28, 0x300, v28
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v30, 0x300, v30
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v31, 0x300, v31
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v32, 0x300, v32
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v33, 16, v27
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v28
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v30, 16, v30
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xffff, v31
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v32, 16, v32
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v0, v1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v2, v3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v29, v33
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v34, v30
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v31, v32
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX11-TRUE16-NEXT: .LBB59_3: ; %end
-; GFX11-TRUE16-NEXT: s_clause 0x1f
-; GFX11-TRUE16-NEXT: scratch_load_b32 v111, off, s32 offset:320
-; GFX11-TRUE16-NEXT: scratch_load_b32 v110, off, s32 offset:324
-; GFX11-TRUE16-NEXT: scratch_load_b32 v109, off, s32 offset:328
-; GFX11-TRUE16-NEXT: scratch_load_b32 v108, off, s32 offset:332
-; GFX11-TRUE16-NEXT: scratch_load_b32 v107, off, s32 offset:336
-; GFX11-TRUE16-NEXT: scratch_load_b32 v106, off, s32 offset:340
-; GFX11-TRUE16-NEXT: scratch_load_b32 v105, off, s32 offset:344
-; GFX11-TRUE16-NEXT: scratch_load_b32 v104, off, s32 offset:348
-; GFX11-TRUE16-NEXT: scratch_load_b32 v95, off, s32 offset:352
-; GFX11-TRUE16-NEXT: scratch_load_b32 v94, off, s32 offset:356
-; GFX11-TRUE16-NEXT: scratch_load_b32 v93, off, s32 offset:360
-; GFX11-TRUE16-NEXT: scratch_load_b32 v92, off, s32 offset:364
-; GFX11-TRUE16-NEXT: scratch_load_b32 v91, off, s32 offset:368
-; GFX11-TRUE16-NEXT: scratch_load_b32 v90, off, s32 offset:372
-; GFX11-TRUE16-NEXT: scratch_load_b32 v89, off, s32 offset:376
-; GFX11-TRUE16-NEXT: scratch_load_b32 v88, off, s32 offset:380
-; GFX11-TRUE16-NEXT: scratch_load_b32 v79, off, s32 offset:384
-; GFX11-TRUE16-NEXT: scratch_load_b32 v78, off, s32 offset:388
-; GFX11-TRUE16-NEXT: scratch_load_b32 v77, off, s32 offset:392
-; GFX11-TRUE16-NEXT: scratch_load_b32 v76, off, s32 offset:396
-; GFX11-TRUE16-NEXT: scratch_load_b32 v75, off, s32 offset:400
-; GFX11-TRUE16-NEXT: scratch_load_b32 v74, off, s32 offset:404
-; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:408
-; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:412
-; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:416
-; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:420
-; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:424
-; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:428
-; GFX11-TRUE16-NEXT: scratch_load_b32 v59, off, s32 offset:432
-; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:436
-; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:440
-; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:444
-; GFX11-TRUE16-NEXT: s_clause 0x7
-; GFX11-TRUE16-NEXT: scratch_load_b32 v47, off, s32 offset:448
-; GFX11-TRUE16-NEXT: scratch_load_b32 v46, off, s32 offset:452
-; GFX11-TRUE16-NEXT: scratch_load_b32 v45, off, s32 offset:456
-; GFX11-TRUE16-NEXT: scratch_load_b32 v44, off, s32 offset:460
-; GFX11-TRUE16-NEXT: scratch_load_b32 v43, off, s32 offset:464
-; GFX11-TRUE16-NEXT: scratch_load_b32 v42, off, s32 offset:468
-; GFX11-TRUE16-NEXT: scratch_load_b32 v41, off, s32 offset:472
-; GFX11-TRUE16-NEXT: scratch_load_b32 v40, off, s32 offset:476
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
-; GFX11-TRUE16-NEXT: .LBB59_4:
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
-; GFX11-TRUE16-NEXT: s_branch .LBB59_2
-;
-; GFX11-FAKE16-LABEL: bitcast_v128i8_to_v16i64_scalar:
-; GFX11-FAKE16: ; %bb.0:
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-FAKE16-NEXT: s_clause 0x1f
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:476
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:472
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:468
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v43, s32 offset:464
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v44, s32 offset:460
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v45, s32 offset:456
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v46, s32 offset:452
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v47, s32 offset:448
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v56, s32 offset:444
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v57, s32 offset:440
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v58, s32 offset:436
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v59, s32 offset:432
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v60, s32 offset:428
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v61, s32 offset:424
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v62, s32 offset:420
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v63, s32 offset:416
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v72, s32 offset:412
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v73, s32 offset:408
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v74, s32 offset:404
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v75, s32 offset:400
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v76, s32 offset:396
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v77, s32 offset:392
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v78, s32 offset:388
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v79, s32 offset:384
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v88, s32 offset:380
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v89, s32 offset:376
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v90, s32 offset:372
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v91, s32 offset:368
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v92, s32 offset:364
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v93, s32 offset:360
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v94, s32 offset:356
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v95, s32 offset:352
-; GFX11-FAKE16-NEXT: s_clause 0x7
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v104, s32 offset:348
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v105, s32 offset:344
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v106, s32 offset:340
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v107, s32 offset:336
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v108, s32 offset:332
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v109, s32 offset:328
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v110, s32 offset:324
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v111, s32 offset:320
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v32, v30 :: v_dual_mov_b32 v33, v28
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v34, v26 :: v_dual_mov_b32 v35, v24
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v36, v22 :: v_dual_mov_b32 v37, v20
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v38, v18 :: v_dual_mov_b32 v39, v16
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v48, v14 :: v_dual_mov_b32 v49, v12
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v50, v10 :: v_dual_mov_b32 v51, v8
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v52, v6 :: v_dual_mov_b32 v53, v4
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v54, v2 :: v_dual_mov_b32 v55, v0
-; GFX11-FAKE16-NEXT: s_clause 0x1f
-; GFX11-FAKE16-NEXT: scratch_load_b32 v0, off, s32 offset:316
-; GFX11-FAKE16-NEXT: scratch_load_u16 v2, off, s32
-; GFX11-FAKE16-NEXT: scratch_load_u16 v4, off, s32 offset:8
-; GFX11-FAKE16-NEXT: scratch_load_u16 v6, off, s32 offset:16
-; GFX11-FAKE16-NEXT: scratch_load_u16 v8, off, s32 offset:24
-; GFX11-FAKE16-NEXT: scratch_load_u16 v10, off, s32 offset:32
-; GFX11-FAKE16-NEXT: scratch_load_u16 v12, off, s32 offset:40
-; GFX11-FAKE16-NEXT: scratch_load_u16 v14, off, s32 offset:48
-; GFX11-FAKE16-NEXT: scratch_load_u16 v16, off, s32 offset:56
-; GFX11-FAKE16-NEXT: scratch_load_u16 v18, off, s32 offset:64
-; GFX11-FAKE16-NEXT: scratch_load_u16 v20, off, s32 offset:72
-; GFX11-FAKE16-NEXT: scratch_load_u16 v22, off, s32 offset:80
-; GFX11-FAKE16-NEXT: scratch_load_u16 v24, off, s32 offset:88
-; GFX11-FAKE16-NEXT: scratch_load_u16 v26, off, s32 offset:96
-; GFX11-FAKE16-NEXT: scratch_load_u16 v28, off, s32 offset:104
-; GFX11-FAKE16-NEXT: scratch_load_u16 v30, off, s32 offset:112
-; GFX11-FAKE16-NEXT: scratch_load_u16 v31, off, s32 offset:120
-; GFX11-FAKE16-NEXT: scratch_load_u16 v87, off, s32 offset:128
-; GFX11-FAKE16-NEXT: scratch_load_u16 v96, off, s32 offset:136
-; GFX11-FAKE16-NEXT: scratch_load_u16 v97, off, s32 offset:144
-; GFX11-FAKE16-NEXT: scratch_load_u16 v98, off, s32 offset:152
-; GFX11-FAKE16-NEXT: scratch_load_u16 v99, off, s32 offset:160
-; GFX11-FAKE16-NEXT: scratch_load_u16 v113, off, s32 offset:168
-; GFX11-FAKE16-NEXT: scratch_load_u16 v114, off, s32 offset:176
-; GFX11-FAKE16-NEXT: scratch_load_u16 v115, off, s32 offset:184
-; GFX11-FAKE16-NEXT: scratch_load_u16 v116, off, s32 offset:192
-; GFX11-FAKE16-NEXT: scratch_load_u16 v117, off, s32 offset:200
-; GFX11-FAKE16-NEXT: scratch_load_u16 v128, off, s32 offset:208
-; GFX11-FAKE16-NEXT: scratch_load_u16 v129, off, s32 offset:216
-; GFX11-FAKE16-NEXT: scratch_load_u16 v130, off, s32 offset:224
-; GFX11-FAKE16-NEXT: scratch_load_u16 v131, off, s32 offset:232
-; GFX11-FAKE16-NEXT: scratch_load_u16 v94, off, s32 offset:240
-; GFX11-FAKE16-NEXT: s_clause 0x1f
-; GFX11-FAKE16-NEXT: scratch_load_u16 v95, off, s32 offset:248
-; GFX11-FAKE16-NEXT: scratch_load_u16 v104, off, s32 offset:256
-; GFX11-FAKE16-NEXT: scratch_load_u16 v105, off, s32 offset:264
-; GFX11-FAKE16-NEXT: scratch_load_u16 v106, off, s32 offset:272
-; GFX11-FAKE16-NEXT: scratch_load_u16 v107, off, s32 offset:280
-; GFX11-FAKE16-NEXT: scratch_load_u16 v108, off, s32 offset:288
-; GFX11-FAKE16-NEXT: scratch_load_u16 v109, off, s32 offset:296
-; GFX11-FAKE16-NEXT: scratch_load_u16 v110, off, s32 offset:304
-; GFX11-FAKE16-NEXT: scratch_load_u16 v111, off, s32 offset:312
-; GFX11-FAKE16-NEXT: scratch_load_u16 v64, off, s32 offset:308
-; GFX11-FAKE16-NEXT: scratch_load_u16 v65, off, s32 offset:300
-; GFX11-FAKE16-NEXT: scratch_load_u16 v66, off, s32 offset:292
-; GFX11-FAKE16-NEXT: scratch_load_u16 v67, off, s32 offset:284
-; GFX11-FAKE16-NEXT: scratch_load_u16 v68, off, s32 offset:276
-; GFX11-FAKE16-NEXT: scratch_load_u16 v69, off, s32 offset:268
-; GFX11-FAKE16-NEXT: scratch_load_u16 v70, off, s32 offset:260
-; GFX11-FAKE16-NEXT: scratch_load_u16 v71, off, s32 offset:252
-; GFX11-FAKE16-NEXT: scratch_load_u16 v80, off, s32 offset:244
-; GFX11-FAKE16-NEXT: scratch_load_u16 v81, off, s32 offset:236
-; GFX11-FAKE16-NEXT: scratch_load_u16 v82, off, s32 offset:228
-; GFX11-FAKE16-NEXT: scratch_load_u16 v83, off, s32 offset:220
-; GFX11-FAKE16-NEXT: scratch_load_u16 v84, off, s32 offset:212
-; GFX11-FAKE16-NEXT: scratch_load_u16 v85, off, s32 offset:204
-; GFX11-FAKE16-NEXT: scratch_load_u16 v86, off, s32 offset:196
-; GFX11-FAKE16-NEXT: scratch_load_u16 v100, off, s32 offset:188
-; GFX11-FAKE16-NEXT: scratch_load_u16 v101, off, s32 offset:180
-; GFX11-FAKE16-NEXT: scratch_load_u16 v102, off, s32 offset:172
-; GFX11-FAKE16-NEXT: scratch_load_u16 v103, off, s32 offset:164
-; GFX11-FAKE16-NEXT: scratch_load_u16 v112, off, s32 offset:156
-; GFX11-FAKE16-NEXT: scratch_load_u16 v118, off, s32 offset:148
-; GFX11-FAKE16-NEXT: scratch_load_u16 v145, off, s32 offset:140
-; GFX11-FAKE16-NEXT: scratch_load_u16 v146, off, s32 offset:132
-; GFX11-FAKE16-NEXT: s_clause 0xf
-; GFX11-FAKE16-NEXT: scratch_load_u16 v147, off, s32 offset:124
-; GFX11-FAKE16-NEXT: scratch_load_u16 v148, off, s32 offset:116
-; GFX11-FAKE16-NEXT: scratch_load_u16 v162, off, s32 offset:108
-; GFX11-FAKE16-NEXT: scratch_load_u16 v163, off, s32 offset:100
-; GFX11-FAKE16-NEXT: scratch_load_u16 v164, off, s32 offset:92
-; GFX11-FAKE16-NEXT: scratch_load_u16 v165, off, s32 offset:84
-; GFX11-FAKE16-NEXT: scratch_load_u16 v166, off, s32 offset:76
-; GFX11-FAKE16-NEXT: scratch_load_u16 v180, off, s32 offset:68
-; GFX11-FAKE16-NEXT: scratch_load_u16 v181, off, s32 offset:60
-; GFX11-FAKE16-NEXT: scratch_load_u16 v182, off, s32 offset:52
-; GFX11-FAKE16-NEXT: scratch_load_u16 v183, off, s32 offset:44
-; GFX11-FAKE16-NEXT: scratch_load_u16 v45, off, s32 offset:36
-; GFX11-FAKE16-NEXT: scratch_load_u16 v46, off, s32 offset:28
-; GFX11-FAKE16-NEXT: scratch_load_u16 v47, off, s32 offset:20
-; GFX11-FAKE16-NEXT: scratch_load_u16 v56, off, s32 offset:12
-; GFX11-FAKE16-NEXT: scratch_load_u16 v62, off, s32 offset:4
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v89, 8, v1
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v90, 8, v3
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v91, 8, v5
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v92, 8, v7
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v93, 8, v9
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v76, 8, v11
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v77, 8, v13
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v78, 8, v15
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v79, 8, v17
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v88, 8, v19
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v63, 8, v21
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v72, 8, v23
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v73, 8, v25
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v74, 8, v27
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v75, 8, v29
-; GFX11-FAKE16-NEXT: s_mov_b32 s4, 0
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(62)
-; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v57, 8, v2
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v58, 8, v4
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v59, 8, v6
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v60, 8, v8
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v61, 8, v10
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v40, 8, v12
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v41, 8, v14
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v42, 8, v16
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v43, 8, v18
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v44, 8, v20
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v167, 8, v22
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v176, 8, v24
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v177, 8, v26
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v178, 8, v28
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v179, 8, v30
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v149, 8, v31
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v150, 8, v87
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(61)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v151, 8, v96
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(60)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v160, 8, v97
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(59)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v161, 8, v98
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(58)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v132, 8, v99
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(57)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v133, 8, v113
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(56)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v134, 8, v114
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(55)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v135, 8, v115
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(54)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v144, 8, v116
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(53)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v119, 8, v117
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(52)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v128, 8, v128
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(51)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v129, 8, v129
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(50)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v130, 8, v130
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(49)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v131, 8, v131
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(48)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v113, 8, v94
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(47)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v114, 8, v95
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(46)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v115, 8, v104
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(45)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v116, 8, v105
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(44)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v117, 8, v106
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(43)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v87, 8, v107
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(42)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v96, 8, v108
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(41)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v97, 8, v109
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(40)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v98, 8, v110
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(39)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v99, 8, v111
-; GFX11-FAKE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo
-; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB59_4
-; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v54
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v53
-; GFX11-FAKE16-NEXT: s_and_b32 s5, s28, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s29, 8
-; GFX11-FAKE16-NEXT: s_and_b32 s7, s2, 0xff
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v90
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v91
-; GFX11-FAKE16-NEXT: s_or_b32 s5, s5, s6
-; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s1, 8
-; GFX11-FAKE16-NEXT: s_and_b32 s5, s5, 0xffff
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s3, 8
-; GFX11-FAKE16-NEXT: s_and_b32 s9, s18, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s19, 8
-; GFX11-FAKE16-NEXT: s_and_b32 s11, s26, 0xff
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v0, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v50
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v49
-; GFX11-FAKE16-NEXT: s_lshl_b32 s12, s27, 8
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v76
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v77
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v0, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v48
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v39
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v78
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v79
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v0, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v38
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v37
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v88
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v63
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v0, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v36
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v35
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v72
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v73
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v0, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v34
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v33
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v74
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v75
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v0, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v32
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v62
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v57
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v58
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v0, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v56
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v47
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v59
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v60
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v0, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v46
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v45
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v61
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v40
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v0, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v183
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v182
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v41
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v42
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v0, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v181
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v180
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v43
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v44
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v0, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v166
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v165
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v167
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v176
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v0, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v164
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v163
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v177
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v178
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v0, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v162
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v148
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v179
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v149
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v0, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v147
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v146
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v150
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v151
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v0, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v145
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v118
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v160
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v161
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v0, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v112
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v103
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v132
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v133
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v0, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v102
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v101
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v134
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v135
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v0, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v100
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v86
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v144
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v119
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v0, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v85
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v84
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v128
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v129
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v0, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v83
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v82
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v130
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v131
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v0, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v81
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v80
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v113
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v114
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v0, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v71
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v70
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v115
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v116
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v0, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v69
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v68
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v117
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v87
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, v0, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v67
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v66
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v96
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v97
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v30, v0, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v65
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v64
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v98
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v99
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v0, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v55
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v89
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, s5, v0
-; GFX11-FAKE16-NEXT: s_and_b32 s5, s0, 0xff
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-FAKE16-NEXT: s_or_b32 s5, s5, s6
-; GFX11-FAKE16-NEXT: s_or_b32 s6, s7, s8
-; GFX11-FAKE16-NEXT: s_and_b32 s7, s16, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s17, 8
-; GFX11-FAKE16-NEXT: s_and_b32 s5, s5, 0xffff
-; GFX11-FAKE16-NEXT: s_or_b32 s7, s7, s8
-; GFX11-FAKE16-NEXT: s_or_b32 s8, s9, s10
-; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s6, 16
-; GFX11-FAKE16-NEXT: s_and_b32 s7, s7, 0xffff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s8, 16
-; GFX11-FAKE16-NEXT: s_or_b32 s5, s5, s6
-; GFX11-FAKE16-NEXT: s_or_b32 s6, s7, s8
-; GFX11-FAKE16-NEXT: s_and_b32 s7, s20, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s21, 8
-; GFX11-FAKE16-NEXT: s_and_b32 s9, s22, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s23, 8
-; GFX11-FAKE16-NEXT: s_or_b32 s7, s7, s8
-; GFX11-FAKE16-NEXT: s_or_b32 s8, s9, s10
-; GFX11-FAKE16-NEXT: s_and_b32 s9, s24, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s25, 8
-; GFX11-FAKE16-NEXT: s_and_b32 s7, s7, 0xffff
-; GFX11-FAKE16-NEXT: s_or_b32 s9, s9, s10
-; GFX11-FAKE16-NEXT: s_or_b32 s10, s11, s12
-; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s8, 16
-; GFX11-FAKE16-NEXT: s_and_b32 s9, s9, 0xffff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s10, 16
-; GFX11-FAKE16-NEXT: s_or_b32 s7, s7, s8
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v51
-; GFX11-FAKE16-NEXT: s_or_b32 s8, s9, s10
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v52
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, s6
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v93
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v92
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v2, v3
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s7 :: v_dual_mov_b32 v3, s8
-; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
-; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB59_3
-; GFX11-FAKE16-NEXT: .LBB59_2: ; %cmp.true
-; GFX11-FAKE16-NEXT: s_add_i32 s0, s0, 3
-; GFX11-FAKE16-NEXT: s_add_i32 s2, s2, 3
-; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 8
-; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 8
-; GFX11-FAKE16-NEXT: s_or_b32 s0, s1, s0
-; GFX11-FAKE16-NEXT: s_or_b32 s1, s3, s2
-; GFX11-FAKE16-NEXT: s_addk_i32 s0, 0x300
-; GFX11-FAKE16-NEXT: s_addk_i32 s1, 0x300
-; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xffff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 16
-; GFX11-FAKE16-NEXT: s_add_i32 s16, s16, 3
-; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s1
-; GFX11-FAKE16-NEXT: s_and_b32 s1, s16, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s17, 8
-; GFX11-FAKE16-NEXT: s_add_i32 s18, s18, 3
-; GFX11-FAKE16-NEXT: s_or_b32 s1, s2, s1
-; GFX11-FAKE16-NEXT: s_and_b32 s2, s18, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s19, 8
-; GFX11-FAKE16-NEXT: s_addk_i32 s1, 0x300
-; GFX11-FAKE16-NEXT: s_or_b32 s2, s3, s2
-; GFX11-FAKE16-NEXT: s_add_i32 s20, s20, 3
-; GFX11-FAKE16-NEXT: s_addk_i32 s2, 0x300
-; GFX11-FAKE16-NEXT: s_and_b32 s1, s1, 0xffff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s2, 16
-; GFX11-FAKE16-NEXT: s_and_b32 s3, s20, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s21, 8
-; GFX11-FAKE16-NEXT: s_add_i32 s22, s22, 3
-; GFX11-FAKE16-NEXT: s_or_b32 s1, s1, s2
-; GFX11-FAKE16-NEXT: s_or_b32 s2, s4, s3
-; GFX11-FAKE16-NEXT: s_and_b32 s3, s22, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s23, 8
-; GFX11-FAKE16-NEXT: s_add_i32 s24, s24, 3
-; GFX11-FAKE16-NEXT: s_or_b32 s3, s4, s3
-; GFX11-FAKE16-NEXT: s_and_b32 s4, s24, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s25, 8
-; GFX11-FAKE16-NEXT: s_addk_i32 s2, 0x300
-; GFX11-FAKE16-NEXT: s_addk_i32 s3, 0x300
-; GFX11-FAKE16-NEXT: s_or_b32 s4, s5, s4
-; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xffff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 16
-; GFX11-FAKE16-NEXT: s_addk_i32 s4, 0x300
-; GFX11-FAKE16-NEXT: s_add_i32 s26, s26, 3
-; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3
-; GFX11-FAKE16-NEXT: s_and_b32 s3, s4, 0xffff
-; GFX11-FAKE16-NEXT: s_and_b32 s4, s26, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s27, 8
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v55
-; GFX11-FAKE16-NEXT: s_or_b32 s4, s5, s4
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v54
-; GFX11-FAKE16-NEXT: s_addk_i32 s4, 0x300
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v52
-; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s4, 16
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v51
-; GFX11-FAKE16-NEXT: s_or_b32 s3, s3, s4
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v53
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 3, v38
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, 3, v34
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, 3, v33
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v89, v0
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v90, v1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v91, v2
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v92, v3
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v93, v4
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v7
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v11
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v12
-; GFX11-FAKE16-NEXT: s_add_i32 s28, s28, 3
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x300, v4
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v88, v7
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v74, v11
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v75, v12
-; GFX11-FAKE16-NEXT: s_and_b32 s5, s28, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s29, 8
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX11-FAKE16-NEXT: s_or_b32 s5, s6, s5
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v4
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 0x300, v7
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, 0x300, v11
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, 0x300, v12
-; GFX11-FAKE16-NEXT: s_addk_i32 s5, 0x300
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v1, v2
-; GFX11-FAKE16-NEXT: s_and_b32 s5, s5, 0xffff
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v3, v6
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, s5, v0
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v50
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v49
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v48
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v39
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff, v7
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 3, v37
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 3, v36
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, 3, v35
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff, v11
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v12, 16, v12
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v7
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v8
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v10
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v11, v12
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(3)
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, 3, v46
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v16, 3, v181
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 3, v180
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v76, v0
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v77, v1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v78, v2
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v79, v3
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v63, v7
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v72, v8
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v73, v10
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v12
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v16
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v17
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 0x300, v7
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 0x300, v8
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, 0x300, v10
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v61, v12
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v43, v16
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v44, v17
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v13, 16, v7
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xffff, v8
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, 0x300, v12
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v16, 0x300, v16
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 0x300, v17
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v0, v1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v2, v3
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v9, v13
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v14, v10
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v32
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v62
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v56
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v47
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xffff, v12
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, 3, v45
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v13, 3, v183
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v15, 3, v182
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xffff, v16
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v17, 16, v17
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v12
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v13
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v15
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v16, v17
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 3, v162
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v21, 3, v145
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v22, 3, v118
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v57, v0
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v58, v1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v59, v2
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v60, v3
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v40, v12
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v41, v13
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v42, v15
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v17
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xff, v21
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v22
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, 0x300, v12
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v13, 0x300, v13
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v15, 0x300, v15
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v179, v17
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v160, v21
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v161, v22
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v18, 16, v12
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v13
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v15, 16, v15
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 0x300, v17
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v21, 0x300, v21
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v22, 0x300, v22
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v0, v1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v2, v3
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v14, v18
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v19, v15
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v166
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v165
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v164
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v163
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v17
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 3, v148
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v18, 3, v147
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v20, 3, v146
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff, v21
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v22, 16, v22
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v17
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xff, v18
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v20
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v21, v22
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v22, 3, v100
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v26, 3, v83
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v27, 3, v82
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v167, v0
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v176, v1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v177, v2
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v178, v3
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v149, v17
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v150, v18
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v151, v20
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v22
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xff, v26
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xff, v27
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 0x300, v17
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v18, 0x300, v18
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v20, 0x300, v20
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v144, v22
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v130, v26
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v131, v27
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v23, 16, v17
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xffff, v18
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v20, 16, v20
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v22, 0x300, v22
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v26, 0x300, v26
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v27, 0x300, v27
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v0, v1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v2, v3
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v19, v23
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v24, v20
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v112
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v103
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v102
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v101
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xffff, v22
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v22, 3, v86
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v23, 3, v85
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v25, 3, v84
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xffff, v26
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v27, 16, v27
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v22
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xff, v23
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v25
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v26, v27
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v27, 3, v69
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v132, v0
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v133, v1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v134, v2
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v135, v3
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v119, v22
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v128, v23
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v129, v25
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xff, v27
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v22, 0x300, v22
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v23, 0x300, v23
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v25, 0x300, v25
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v117, v27
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v28, 16, v22
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xffff, v23
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v25, 16, v25
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v27, 0x300, v27
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v0, v1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v2, v3
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v24, v28
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v29, v25
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v81
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v80
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v71
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v70
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xffff, v27
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v27, 3, v68
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v28, 3, v67
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v30, 3, v66
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v31, 3, v65
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v32, 3, v64
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xff, v27
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xff, v28
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xff, v30
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xff, v31
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xff, v32
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v113, v0
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v114, v1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v115, v2
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v116, v3
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v87, v27
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v96, v28
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v30, v97, v30
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v98, v31
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v32, v99, v32
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v27, 0x300, v27
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v28, 0x300, v28
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v30, 0x300, v30
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v31, 0x300, v31
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v32, 0x300, v32
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v33, 16, v27
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xffff, v28
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v30, 16, v30
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xffff, v31
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v32, 16, v32
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v0, v1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v2, v3
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, v29, v33
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v30, v34, v30
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v31, v32
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX11-FAKE16-NEXT: .LBB59_3: ; %end
-; GFX11-FAKE16-NEXT: s_clause 0x1f
-; GFX11-FAKE16-NEXT: scratch_load_b32 v111, off, s32 offset:320
-; GFX11-FAKE16-NEXT: scratch_load_b32 v110, off, s32 offset:324
-; GFX11-FAKE16-NEXT: scratch_load_b32 v109, off, s32 offset:328
-; GFX11-FAKE16-NEXT: scratch_load_b32 v108, off, s32 offset:332
-; GFX11-FAKE16-NEXT: scratch_load_b32 v107, off, s32 offset:336
-; GFX11-FAKE16-NEXT: scratch_load_b32 v106, off, s32 offset:340
-; GFX11-FAKE16-NEXT: scratch_load_b32 v105, off, s32 offset:344
-; GFX11-FAKE16-NEXT: scratch_load_b32 v104, off, s32 offset:348
-; GFX11-FAKE16-NEXT: scratch_load_b32 v95, off, s32 offset:352
-; GFX11-FAKE16-NEXT: scratch_load_b32 v94, off, s32 offset:356
-; GFX11-FAKE16-NEXT: scratch_load_b32 v93, off, s32 offset:360
-; GFX11-FAKE16-NEXT: scratch_load_b32 v92, off, s32 offset:364
-; GFX11-FAKE16-NEXT: scratch_load_b32 v91, off, s32 offset:368
-; GFX11-FAKE16-NEXT: scratch_load_b32 v90, off, s32 offset:372
-; GFX11-FAKE16-NEXT: scratch_load_b32 v89, off, s32 offset:376
-; GFX11-FAKE16-NEXT: scratch_load_b32 v88, off, s32 offset:380
-; GFX11-FAKE16-NEXT: scratch_load_b32 v79, off, s32 offset:384
-; GFX11-FAKE16-NEXT: scratch_load_b32 v78, off, s32 offset:388
-; GFX11-FAKE16-NEXT: scratch_load_b32 v77, off, s32 offset:392
-; GFX11-FAKE16-NEXT: scratch_load_b32 v76, off, s32 offset:396
-; GFX11-FAKE16-NEXT: scratch_load_b32 v75, off, s32 offset:400
-; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:404
-; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:408
-; GFX11-FAKE16-NEXT: scratch_load_b32 v72, off, s32 offset:412
-; GFX11-FAKE16-NEXT: scratch_load_b32 v63, off, s32 offset:416
-; GFX11-FAKE16-NEXT: scratch_load_b32 v62, off, s32 offset:420
-; GFX11-FAKE16-NEXT: scratch_load_b32 v61, off, s32 offset:424
-; GFX11-FAKE16-NEXT: scratch_load_b32 v60, off, s32 offset:428
-; GFX11-FAKE16-NEXT: scratch_load_b32 v59, off, s32 offset:432
-; GFX11-FAKE16-NEXT: scratch_load_b32 v58, off, s32 offset:436
-; GFX11-FAKE16-NEXT: scratch_load_b32 v57, off, s32 offset:440
-; GFX11-FAKE16-NEXT: scratch_load_b32 v56, off, s32 offset:444
-; GFX11-FAKE16-NEXT: s_clause 0x7
-; GFX11-FAKE16-NEXT: scratch_load_b32 v47, off, s32 offset:448
-; GFX11-FAKE16-NEXT: scratch_load_b32 v46, off, s32 offset:452
-; GFX11-FAKE16-NEXT: scratch_load_b32 v45, off, s32 offset:456
-; GFX11-FAKE16-NEXT: scratch_load_b32 v44, off, s32 offset:460
-; GFX11-FAKE16-NEXT: scratch_load_b32 v43, off, s32 offset:464
-; GFX11-FAKE16-NEXT: scratch_load_b32 v42, off, s32 offset:468
-; GFX11-FAKE16-NEXT: scratch_load_b32 v41, off, s32 offset:472
-; GFX11-FAKE16-NEXT: scratch_load_b32 v40, off, s32 offset:476
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
-; GFX11-FAKE16-NEXT: .LBB59_4:
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
-; GFX11-FAKE16-NEXT: s_branch .LBB59_2
+; GFX11-LABEL: bitcast_v128i8_to_v16i64_scalar:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_clause 0x1f
+; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:476
+; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:472
+; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:468
+; GFX11-NEXT: scratch_store_b32 off, v43, s32 offset:464
+; GFX11-NEXT: scratch_store_b32 off, v44, s32 offset:460
+; GFX11-NEXT: scratch_store_b32 off, v45, s32 offset:456
+; GFX11-NEXT: scratch_store_b32 off, v46, s32 offset:452
+; GFX11-NEXT: scratch_store_b32 off, v47, s32 offset:448
+; GFX11-NEXT: scratch_store_b32 off, v56, s32 offset:444
+; GFX11-NEXT: scratch_store_b32 off, v57, s32 offset:440
+; GFX11-NEXT: scratch_store_b32 off, v58, s32 offset:436
+; GFX11-NEXT: scratch_store_b32 off, v59, s32 offset:432
+; GFX11-NEXT: scratch_store_b32 off, v60, s32 offset:428
+; GFX11-NEXT: scratch_store_b32 off, v61, s32 offset:424
+; GFX11-NEXT: scratch_store_b32 off, v62, s32 offset:420
+; GFX11-NEXT: scratch_store_b32 off, v63, s32 offset:416
+; GFX11-NEXT: scratch_store_b32 off, v72, s32 offset:412
+; GFX11-NEXT: scratch_store_b32 off, v73, s32 offset:408
+; GFX11-NEXT: scratch_store_b32 off, v74, s32 offset:404
+; GFX11-NEXT: scratch_store_b32 off, v75, s32 offset:400
+; GFX11-NEXT: scratch_store_b32 off, v76, s32 offset:396
+; GFX11-NEXT: scratch_store_b32 off, v77, s32 offset:392
+; GFX11-NEXT: scratch_store_b32 off, v78, s32 offset:388
+; GFX11-NEXT: scratch_store_b32 off, v79, s32 offset:384
+; GFX11-NEXT: scratch_store_b32 off, v88, s32 offset:380
+; GFX11-NEXT: scratch_store_b32 off, v89, s32 offset:376
+; GFX11-NEXT: scratch_store_b32 off, v90, s32 offset:372
+; GFX11-NEXT: scratch_store_b32 off, v91, s32 offset:368
+; GFX11-NEXT: scratch_store_b32 off, v92, s32 offset:364
+; GFX11-NEXT: scratch_store_b32 off, v93, s32 offset:360
+; GFX11-NEXT: scratch_store_b32 off, v94, s32 offset:356
+; GFX11-NEXT: scratch_store_b32 off, v95, s32 offset:352
+; GFX11-NEXT: s_clause 0x7
+; GFX11-NEXT: scratch_store_b32 off, v104, s32 offset:348
+; GFX11-NEXT: scratch_store_b32 off, v105, s32 offset:344
+; GFX11-NEXT: scratch_store_b32 off, v106, s32 offset:340
+; GFX11-NEXT: scratch_store_b32 off, v107, s32 offset:336
+; GFX11-NEXT: scratch_store_b32 off, v108, s32 offset:332
+; GFX11-NEXT: scratch_store_b32 off, v109, s32 offset:328
+; GFX11-NEXT: scratch_store_b32 off, v110, s32 offset:324
+; GFX11-NEXT: scratch_store_b32 off, v111, s32 offset:320
+; GFX11-NEXT: v_dual_mov_b32 v32, v30 :: v_dual_mov_b32 v33, v28
+; GFX11-NEXT: v_dual_mov_b32 v34, v26 :: v_dual_mov_b32 v35, v24
+; GFX11-NEXT: v_dual_mov_b32 v36, v22 :: v_dual_mov_b32 v37, v20
+; GFX11-NEXT: v_dual_mov_b32 v38, v18 :: v_dual_mov_b32 v39, v16
+; GFX11-NEXT: v_dual_mov_b32 v48, v14 :: v_dual_mov_b32 v49, v12
+; GFX11-NEXT: v_dual_mov_b32 v50, v10 :: v_dual_mov_b32 v51, v8
+; GFX11-NEXT: v_dual_mov_b32 v52, v6 :: v_dual_mov_b32 v53, v4
+; GFX11-NEXT: v_dual_mov_b32 v54, v2 :: v_dual_mov_b32 v55, v0
+; GFX11-NEXT: s_clause 0x1f
+; GFX11-NEXT: scratch_load_b32 v0, off, s32 offset:316
+; GFX11-NEXT: scratch_load_u16 v2, off, s32
+; GFX11-NEXT: scratch_load_u16 v4, off, s32 offset:8
+; GFX11-NEXT: scratch_load_u16 v6, off, s32 offset:16
+; GFX11-NEXT: scratch_load_u16 v8, off, s32 offset:24
+; GFX11-NEXT: scratch_load_u16 v10, off, s32 offset:32
+; GFX11-NEXT: scratch_load_u16 v12, off, s32 offset:40
+; GFX11-NEXT: scratch_load_u16 v14, off, s32 offset:48
+; GFX11-NEXT: scratch_load_u16 v16, off, s32 offset:56
+; GFX11-NEXT: scratch_load_u16 v18, off, s32 offset:64
+; GFX11-NEXT: scratch_load_u16 v20, off, s32 offset:72
+; GFX11-NEXT: scratch_load_u16 v22, off, s32 offset:80
+; GFX11-NEXT: scratch_load_u16 v24, off, s32 offset:88
+; GFX11-NEXT: scratch_load_u16 v26, off, s32 offset:96
+; GFX11-NEXT: scratch_load_u16 v28, off, s32 offset:104
+; GFX11-NEXT: scratch_load_u16 v30, off, s32 offset:112
+; GFX11-NEXT: scratch_load_u16 v31, off, s32 offset:120
+; GFX11-NEXT: scratch_load_u16 v87, off, s32 offset:128
+; GFX11-NEXT: scratch_load_u16 v96, off, s32 offset:136
+; GFX11-NEXT: scratch_load_u16 v97, off, s32 offset:144
+; GFX11-NEXT: scratch_load_u16 v98, off, s32 offset:152
+; GFX11-NEXT: scratch_load_u16 v99, off, s32 offset:160
+; GFX11-NEXT: scratch_load_u16 v113, off, s32 offset:168
+; GFX11-NEXT: scratch_load_u16 v114, off, s32 offset:176
+; GFX11-NEXT: scratch_load_u16 v115, off, s32 offset:184
+; GFX11-NEXT: scratch_load_u16 v116, off, s32 offset:192
+; GFX11-NEXT: scratch_load_u16 v117, off, s32 offset:200
+; GFX11-NEXT: scratch_load_u16 v128, off, s32 offset:208
+; GFX11-NEXT: scratch_load_u16 v129, off, s32 offset:216
+; GFX11-NEXT: scratch_load_u16 v130, off, s32 offset:224
+; GFX11-NEXT: scratch_load_u16 v131, off, s32 offset:232
+; GFX11-NEXT: scratch_load_u16 v94, off, s32 offset:240
+; GFX11-NEXT: s_clause 0x1f
+; GFX11-NEXT: scratch_load_u16 v95, off, s32 offset:248
+; GFX11-NEXT: scratch_load_u16 v104, off, s32 offset:256
+; GFX11-NEXT: scratch_load_u16 v105, off, s32 offset:264
+; GFX11-NEXT: scratch_load_u16 v106, off, s32 offset:272
+; GFX11-NEXT: scratch_load_u16 v107, off, s32 offset:280
+; GFX11-NEXT: scratch_load_u16 v108, off, s32 offset:288
+; GFX11-NEXT: scratch_load_u16 v109, off, s32 offset:296
+; GFX11-NEXT: scratch_load_u16 v110, off, s32 offset:304
+; GFX11-NEXT: scratch_load_u16 v111, off, s32 offset:312
+; GFX11-NEXT: scratch_load_u16 v64, off, s32 offset:308
+; GFX11-NEXT: scratch_load_u16 v65, off, s32 offset:300
+; GFX11-NEXT: scratch_load_u16 v66, off, s32 offset:292
+; GFX11-NEXT: scratch_load_u16 v67, off, s32 offset:284
+; GFX11-NEXT: scratch_load_u16 v68, off, s32 offset:276
+; GFX11-NEXT: scratch_load_u16 v69, off, s32 offset:268
+; GFX11-NEXT: scratch_load_u16 v70, off, s32 offset:260
+; GFX11-NEXT: scratch_load_u16 v71, off, s32 offset:252
+; GFX11-NEXT: scratch_load_u16 v80, off, s32 offset:244
+; GFX11-NEXT: scratch_load_u16 v81, off, s32 offset:236
+; GFX11-NEXT: scratch_load_u16 v82, off, s32 offset:228
+; GFX11-NEXT: scratch_load_u16 v83, off, s32 offset:220
+; GFX11-NEXT: scratch_load_u16 v84, off, s32 offset:212
+; GFX11-NEXT: scratch_load_u16 v85, off, s32 offset:204
+; GFX11-NEXT: scratch_load_u16 v86, off, s32 offset:196
+; GFX11-NEXT: scratch_load_u16 v100, off, s32 offset:188
+; GFX11-NEXT: scratch_load_u16 v101, off, s32 offset:180
+; GFX11-NEXT: scratch_load_u16 v102, off, s32 offset:172
+; GFX11-NEXT: scratch_load_u16 v103, off, s32 offset:164
+; GFX11-NEXT: scratch_load_u16 v112, off, s32 offset:156
+; GFX11-NEXT: scratch_load_u16 v118, off, s32 offset:148
+; GFX11-NEXT: scratch_load_u16 v145, off, s32 offset:140
+; GFX11-NEXT: scratch_load_u16 v146, off, s32 offset:132
+; GFX11-NEXT: s_clause 0xf
+; GFX11-NEXT: scratch_load_u16 v147, off, s32 offset:124
+; GFX11-NEXT: scratch_load_u16 v148, off, s32 offset:116
+; GFX11-NEXT: scratch_load_u16 v162, off, s32 offset:108
+; GFX11-NEXT: scratch_load_u16 v163, off, s32 offset:100
+; GFX11-NEXT: scratch_load_u16 v164, off, s32 offset:92
+; GFX11-NEXT: scratch_load_u16 v165, off, s32 offset:84
+; GFX11-NEXT: scratch_load_u16 v166, off, s32 offset:76
+; GFX11-NEXT: scratch_load_u16 v180, off, s32 offset:68
+; GFX11-NEXT: scratch_load_u16 v181, off, s32 offset:60
+; GFX11-NEXT: scratch_load_u16 v182, off, s32 offset:52
+; GFX11-NEXT: scratch_load_u16 v183, off, s32 offset:44
+; GFX11-NEXT: scratch_load_u16 v45, off, s32 offset:36
+; GFX11-NEXT: scratch_load_u16 v46, off, s32 offset:28
+; GFX11-NEXT: scratch_load_u16 v47, off, s32 offset:20
+; GFX11-NEXT: scratch_load_u16 v56, off, s32 offset:12
+; GFX11-NEXT: scratch_load_u16 v62, off, s32 offset:4
+; GFX11-NEXT: v_lshlrev_b32_e32 v89, 8, v1
+; GFX11-NEXT: v_lshlrev_b32_e32 v90, 8, v3
+; GFX11-NEXT: v_lshlrev_b32_e32 v91, 8, v5
+; GFX11-NEXT: v_lshlrev_b32_e32 v92, 8, v7
+; GFX11-NEXT: v_lshlrev_b32_e32 v93, 8, v9
+; GFX11-NEXT: v_lshlrev_b32_e32 v76, 8, v11
+; GFX11-NEXT: v_lshlrev_b32_e32 v77, 8, v13
+; GFX11-NEXT: v_lshlrev_b32_e32 v78, 8, v15
+; GFX11-NEXT: v_lshlrev_b32_e32 v79, 8, v17
+; GFX11-NEXT: v_lshlrev_b32_e32 v88, 8, v19
+; GFX11-NEXT: v_lshlrev_b32_e32 v63, 8, v21
+; GFX11-NEXT: v_lshlrev_b32_e32 v72, 8, v23
+; GFX11-NEXT: v_lshlrev_b32_e32 v73, 8, v25
+; GFX11-NEXT: v_lshlrev_b32_e32 v74, 8, v27
+; GFX11-NEXT: v_lshlrev_b32_e32 v75, 8, v29
+; GFX11-NEXT: s_mov_b32 s4, 0
+; GFX11-NEXT: s_waitcnt vmcnt(62)
+; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v57, 8, v2
+; GFX11-NEXT: v_lshlrev_b32_e32 v58, 8, v4
+; GFX11-NEXT: v_lshlrev_b32_e32 v59, 8, v6
+; GFX11-NEXT: v_lshlrev_b32_e32 v60, 8, v8
+; GFX11-NEXT: v_lshlrev_b32_e32 v61, 8, v10
+; GFX11-NEXT: v_lshlrev_b32_e32 v40, 8, v12
+; GFX11-NEXT: v_lshlrev_b32_e32 v41, 8, v14
+; GFX11-NEXT: v_lshlrev_b32_e32 v42, 8, v16
+; GFX11-NEXT: v_lshlrev_b32_e32 v43, 8, v18
+; GFX11-NEXT: v_lshlrev_b32_e32 v44, 8, v20
+; GFX11-NEXT: v_lshlrev_b32_e32 v167, 8, v22
+; GFX11-NEXT: v_lshlrev_b32_e32 v176, 8, v24
+; GFX11-NEXT: v_lshlrev_b32_e32 v177, 8, v26
+; GFX11-NEXT: v_lshlrev_b32_e32 v178, 8, v28
+; GFX11-NEXT: v_lshlrev_b32_e32 v179, 8, v30
+; GFX11-NEXT: v_lshlrev_b32_e32 v149, 8, v31
+; GFX11-NEXT: v_lshlrev_b32_e32 v150, 8, v87
+; GFX11-NEXT: s_waitcnt vmcnt(61)
+; GFX11-NEXT: v_lshlrev_b32_e32 v151, 8, v96
+; GFX11-NEXT: s_waitcnt vmcnt(60)
+; GFX11-NEXT: v_lshlrev_b32_e32 v160, 8, v97
+; GFX11-NEXT: s_waitcnt vmcnt(59)
+; GFX11-NEXT: v_lshlrev_b32_e32 v161, 8, v98
+; GFX11-NEXT: s_waitcnt vmcnt(58)
+; GFX11-NEXT: v_lshlrev_b32_e32 v132, 8, v99
+; GFX11-NEXT: s_waitcnt vmcnt(57)
+; GFX11-NEXT: v_lshlrev_b32_e32 v133, 8, v113
+; GFX11-NEXT: s_waitcnt vmcnt(56)
+; GFX11-NEXT: v_lshlrev_b32_e32 v134, 8, v114
+; GFX11-NEXT: s_waitcnt vmcnt(55)
+; GFX11-NEXT: v_lshlrev_b32_e32 v135, 8, v115
+; GFX11-NEXT: s_waitcnt vmcnt(54)
+; GFX11-NEXT: v_lshlrev_b32_e32 v144, 8, v116
+; GFX11-NEXT: s_waitcnt vmcnt(53)
+; GFX11-NEXT: v_lshlrev_b32_e32 v119, 8, v117
+; GFX11-NEXT: s_waitcnt vmcnt(52)
+; GFX11-NEXT: v_lshlrev_b32_e32 v128, 8, v128
+; GFX11-NEXT: s_waitcnt vmcnt(51)
+; GFX11-NEXT: v_lshlrev_b32_e32 v129, 8, v129
+; GFX11-NEXT: s_waitcnt vmcnt(50)
+; GFX11-NEXT: v_lshlrev_b32_e32 v130, 8, v130
+; GFX11-NEXT: s_waitcnt vmcnt(49)
+; GFX11-NEXT: v_lshlrev_b32_e32 v131, 8, v131
+; GFX11-NEXT: s_waitcnt vmcnt(48)
+; GFX11-NEXT: v_lshlrev_b32_e32 v113, 8, v94
+; GFX11-NEXT: s_waitcnt vmcnt(47)
+; GFX11-NEXT: v_lshlrev_b32_e32 v114, 8, v95
+; GFX11-NEXT: s_waitcnt vmcnt(46)
+; GFX11-NEXT: v_lshlrev_b32_e32 v115, 8, v104
+; GFX11-NEXT: s_waitcnt vmcnt(45)
+; GFX11-NEXT: v_lshlrev_b32_e32 v116, 8, v105
+; GFX11-NEXT: s_waitcnt vmcnt(44)
+; GFX11-NEXT: v_lshlrev_b32_e32 v117, 8, v106
+; GFX11-NEXT: s_waitcnt vmcnt(43)
+; GFX11-NEXT: v_lshlrev_b32_e32 v87, 8, v107
+; GFX11-NEXT: s_waitcnt vmcnt(42)
+; GFX11-NEXT: v_lshlrev_b32_e32 v96, 8, v108
+; GFX11-NEXT: s_waitcnt vmcnt(41)
+; GFX11-NEXT: v_lshlrev_b32_e32 v97, 8, v109
+; GFX11-NEXT: s_waitcnt vmcnt(40)
+; GFX11-NEXT: v_lshlrev_b32_e32 v98, 8, v110
+; GFX11-NEXT: s_waitcnt vmcnt(39)
+; GFX11-NEXT: v_lshlrev_b32_e32 v99, 8, v111
+; GFX11-NEXT: s_and_b32 s5, vcc_lo, exec_lo
+; GFX11-NEXT: s_cbranch_scc0 .LBB59_4
+; GFX11-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v54
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v53
+; GFX11-NEXT: s_and_b32 s5, s28, 0xff
+; GFX11-NEXT: s_lshl_b32 s6, s29, 8
+; GFX11-NEXT: s_and_b32 s7, s2, 0xff
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v90
+; GFX11-NEXT: v_or_b32_e32 v1, v1, v91
+; GFX11-NEXT: s_or_b32 s5, s5, s6
+; GFX11-NEXT: s_lshl_b32 s6, s1, 8
+; GFX11-NEXT: s_and_b32 s5, s5, 0xffff
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: s_lshl_b32 s8, s3, 8
+; GFX11-NEXT: s_and_b32 s9, s18, 0xff
+; GFX11-NEXT: s_lshl_b32 s10, s19, 8
+; GFX11-NEXT: s_and_b32 s11, s26, 0xff
+; GFX11-NEXT: v_or_b32_e32 v5, v0, v1
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v50
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v49
+; GFX11-NEXT: s_lshl_b32 s12, s27, 8
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v76
+; GFX11-NEXT: v_or_b32_e32 v1, v1, v77
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_or_b32_e32 v7, v0, v1
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v48
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v39
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v78
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_or_b32_e32 v1, v1, v79
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: v_or_b32_e32 v8, v0, v1
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v38
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v37
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v88
+; GFX11-NEXT: v_or_b32_e32 v1, v1, v63
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_or_b32_e32 v9, v0, v1
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v36
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v35
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v72
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_or_b32_e32 v1, v1, v73
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: v_or_b32_e32 v10, v0, v1
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v34
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v33
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v74
+; GFX11-NEXT: v_or_b32_e32 v1, v1, v75
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_or_b32_e32 v11, v0, v1
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v32
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v62
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v57
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_or_b32_e32 v1, v1, v58
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: v_or_b32_e32 v12, v0, v1
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v56
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v47
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v59
+; GFX11-NEXT: v_or_b32_e32 v1, v1, v60
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_or_b32_e32 v13, v0, v1
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v46
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v45
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v61
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_or_b32_e32 v1, v1, v40
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: v_or_b32_e32 v14, v0, v1
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v183
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v182
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v41
+; GFX11-NEXT: v_or_b32_e32 v1, v1, v42
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_or_b32_e32 v15, v0, v1
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v181
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v180
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v43
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_or_b32_e32 v1, v1, v44
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: v_or_b32_e32 v16, v0, v1
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v166
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v165
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v167
+; GFX11-NEXT: v_or_b32_e32 v1, v1, v176
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_or_b32_e32 v17, v0, v1
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v164
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v163
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v177
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_or_b32_e32 v1, v1, v178
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: v_or_b32_e32 v18, v0, v1
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v162
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v148
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v179
+; GFX11-NEXT: v_or_b32_e32 v1, v1, v149
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_or_b32_e32 v19, v0, v1
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v147
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v146
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v150
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_or_b32_e32 v1, v1, v151
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: v_or_b32_e32 v20, v0, v1
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v145
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v118
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v160
+; GFX11-NEXT: v_or_b32_e32 v1, v1, v161
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_or_b32_e32 v21, v0, v1
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v112
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v103
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v132
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_or_b32_e32 v1, v1, v133
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: v_or_b32_e32 v22, v0, v1
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v102
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v101
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v134
+; GFX11-NEXT: v_or_b32_e32 v1, v1, v135
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_or_b32_e32 v23, v0, v1
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v100
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v86
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v144
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_or_b32_e32 v1, v1, v119
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: v_or_b32_e32 v24, v0, v1
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v85
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v84
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v128
+; GFX11-NEXT: v_or_b32_e32 v1, v1, v129
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_or_b32_e32 v25, v0, v1
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v83
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v82
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v130
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_or_b32_e32 v1, v1, v131
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: v_or_b32_e32 v26, v0, v1
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v81
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v80
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v113
+; GFX11-NEXT: v_or_b32_e32 v1, v1, v114
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_or_b32_e32 v27, v0, v1
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v71
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v70
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v115
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_or_b32_e32 v1, v1, v116
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: v_or_b32_e32 v28, v0, v1
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v69
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v68
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v117
+; GFX11-NEXT: v_or_b32_e32 v1, v1, v87
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_or_b32_e32 v29, v0, v1
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v67
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v66
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v96
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_or_b32_e32 v1, v1, v97
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: v_or_b32_e32 v30, v0, v1
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v65
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v64
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v98
+; GFX11-NEXT: v_or_b32_e32 v1, v1, v99
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_or_b32_e32 v31, v0, v1
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v55
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v89
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-NEXT: v_or_b32_e32 v4, s5, v0
+; GFX11-NEXT: s_and_b32 s5, s0, 0xff
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_or_b32 s5, s5, s6
+; GFX11-NEXT: s_or_b32 s6, s7, s8
+; GFX11-NEXT: s_and_b32 s7, s16, 0xff
+; GFX11-NEXT: s_lshl_b32 s8, s17, 8
+; GFX11-NEXT: s_and_b32 s5, s5, 0xffff
+; GFX11-NEXT: s_or_b32 s7, s7, s8
+; GFX11-NEXT: s_or_b32 s8, s9, s10
+; GFX11-NEXT: s_lshl_b32 s6, s6, 16
+; GFX11-NEXT: s_and_b32 s7, s7, 0xffff
+; GFX11-NEXT: s_lshl_b32 s8, s8, 16
+; GFX11-NEXT: s_or_b32 s5, s5, s6
+; GFX11-NEXT: s_or_b32 s6, s7, s8
+; GFX11-NEXT: s_and_b32 s7, s20, 0xff
+; GFX11-NEXT: s_lshl_b32 s8, s21, 8
+; GFX11-NEXT: s_and_b32 s9, s22, 0xff
+; GFX11-NEXT: s_lshl_b32 s10, s23, 8
+; GFX11-NEXT: s_or_b32 s7, s7, s8
+; GFX11-NEXT: s_or_b32 s8, s9, s10
+; GFX11-NEXT: s_and_b32 s9, s24, 0xff
+; GFX11-NEXT: s_lshl_b32 s10, s25, 8
+; GFX11-NEXT: s_and_b32 s7, s7, 0xffff
+; GFX11-NEXT: s_or_b32 s9, s9, s10
+; GFX11-NEXT: s_or_b32 s10, s11, s12
+; GFX11-NEXT: s_lshl_b32 s8, s8, 16
+; GFX11-NEXT: s_and_b32 s9, s9, 0xffff
+; GFX11-NEXT: s_lshl_b32 s10, s10, 16
+; GFX11-NEXT: s_or_b32 s7, s7, s8
+; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v51
+; GFX11-NEXT: s_or_b32 s8, s9, s10
+; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v52
+; GFX11-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, s6
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_or_b32_e32 v3, v3, v93
+; GFX11-NEXT: v_or_b32_e32 v2, v2, v92
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_or_b32_e32 v6, v2, v3
+; GFX11-NEXT: v_dual_mov_b32 v2, s7 :: v_dual_mov_b32 v3, s8
+; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
+; GFX11-NEXT: s_cbranch_vccnz .LBB59_3
+; GFX11-NEXT: .LBB59_2: ; %cmp.true
+; GFX11-NEXT: s_add_i32 s0, s0, 3
+; GFX11-NEXT: s_add_i32 s2, s2, 3
+; GFX11-NEXT: s_and_b32 s0, s0, 0xff
+; GFX11-NEXT: s_lshl_b32 s1, s1, 8
+; GFX11-NEXT: s_and_b32 s2, s2, 0xff
+; GFX11-NEXT: s_lshl_b32 s3, s3, 8
+; GFX11-NEXT: s_or_b32 s0, s1, s0
+; GFX11-NEXT: s_or_b32 s1, s3, s2
+; GFX11-NEXT: s_addk_i32 s0, 0x300
+; GFX11-NEXT: s_addk_i32 s1, 0x300
+; GFX11-NEXT: s_and_b32 s0, s0, 0xffff
+; GFX11-NEXT: s_lshl_b32 s1, s1, 16
+; GFX11-NEXT: s_add_i32 s16, s16, 3
+; GFX11-NEXT: s_or_b32 s0, s0, s1
+; GFX11-NEXT: s_and_b32 s1, s16, 0xff
+; GFX11-NEXT: s_lshl_b32 s2, s17, 8
+; GFX11-NEXT: s_add_i32 s18, s18, 3
+; GFX11-NEXT: s_or_b32 s1, s2, s1
+; GFX11-NEXT: s_and_b32 s2, s18, 0xff
+; GFX11-NEXT: s_lshl_b32 s3, s19, 8
+; GFX11-NEXT: s_addk_i32 s1, 0x300
+; GFX11-NEXT: s_or_b32 s2, s3, s2
+; GFX11-NEXT: s_add_i32 s20, s20, 3
+; GFX11-NEXT: s_addk_i32 s2, 0x300
+; GFX11-NEXT: s_and_b32 s1, s1, 0xffff
+; GFX11-NEXT: s_lshl_b32 s2, s2, 16
+; GFX11-NEXT: s_and_b32 s3, s20, 0xff
+; GFX11-NEXT: s_lshl_b32 s4, s21, 8
+; GFX11-NEXT: s_add_i32 s22, s22, 3
+; GFX11-NEXT: s_or_b32 s1, s1, s2
+; GFX11-NEXT: s_or_b32 s2, s4, s3
+; GFX11-NEXT: s_and_b32 s3, s22, 0xff
+; GFX11-NEXT: s_lshl_b32 s4, s23, 8
+; GFX11-NEXT: s_add_i32 s24, s24, 3
+; GFX11-NEXT: s_or_b32 s3, s4, s3
+; GFX11-NEXT: s_and_b32 s4, s24, 0xff
+; GFX11-NEXT: s_lshl_b32 s5, s25, 8
+; GFX11-NEXT: s_addk_i32 s2, 0x300
+; GFX11-NEXT: s_addk_i32 s3, 0x300
+; GFX11-NEXT: s_or_b32 s4, s5, s4
+; GFX11-NEXT: s_and_b32 s2, s2, 0xffff
+; GFX11-NEXT: s_lshl_b32 s3, s3, 16
+; GFX11-NEXT: s_addk_i32 s4, 0x300
+; GFX11-NEXT: s_add_i32 s26, s26, 3
+; GFX11-NEXT: s_or_b32 s2, s2, s3
+; GFX11-NEXT: s_and_b32 s3, s4, 0xffff
+; GFX11-NEXT: s_and_b32 s4, s26, 0xff
+; GFX11-NEXT: s_lshl_b32 s5, s27, 8
+; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v55
+; GFX11-NEXT: s_or_b32 s4, s5, s4
+; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v54
+; GFX11-NEXT: s_addk_i32 s4, 0x300
+; GFX11-NEXT: v_add_nc_u32_e32 v3, 3, v52
+; GFX11-NEXT: s_lshl_b32 s4, s4, 16
+; GFX11-NEXT: v_add_nc_u32_e32 v4, 3, v51
+; GFX11-NEXT: s_or_b32 s3, s3, s4
+; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v53
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1
+; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v3
+; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v4
+; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v2
+; GFX11-NEXT: v_add_nc_u32_e32 v7, 3, v38
+; GFX11-NEXT: v_add_nc_u32_e32 v11, 3, v34
+; GFX11-NEXT: v_add_nc_u32_e32 v12, 3, v33
+; GFX11-NEXT: v_or_b32_e32 v0, v89, v0
+; GFX11-NEXT: v_or_b32_e32 v1, v90, v1
+; GFX11-NEXT: v_or_b32_e32 v2, v91, v2
+; GFX11-NEXT: v_or_b32_e32 v3, v92, v3
+; GFX11-NEXT: v_or_b32_e32 v4, v93, v4
+; GFX11-NEXT: v_and_b32_e32 v7, 0xff, v7
+; GFX11-NEXT: v_and_b32_e32 v11, 0xff, v11
+; GFX11-NEXT: v_and_b32_e32 v12, 0xff, v12
+; GFX11-NEXT: s_add_i32 s28, s28, 3
+; GFX11-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
+; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
+; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
+; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x300, v3
+; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x300, v4
+; GFX11-NEXT: v_or_b32_e32 v7, v88, v7
+; GFX11-NEXT: v_or_b32_e32 v11, v74, v11
+; GFX11-NEXT: v_or_b32_e32 v12, v75, v12
+; GFX11-NEXT: s_and_b32 s5, s28, 0xff
+; GFX11-NEXT: s_lshl_b32 s6, s29, 8
+; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-NEXT: s_or_b32 s5, s6, s5
+; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v4
+; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x300, v7
+; GFX11-NEXT: v_add_nc_u32_e32 v11, 0x300, v11
+; GFX11-NEXT: v_add_nc_u32_e32 v12, 0x300, v12
+; GFX11-NEXT: s_addk_i32 s5, 0x300
+; GFX11-NEXT: v_or_b32_e32 v5, v1, v2
+; GFX11-NEXT: s_and_b32 s5, s5, 0xffff
+; GFX11-NEXT: v_or_b32_e32 v6, v3, v6
+; GFX11-NEXT: v_or_b32_e32 v4, s5, v0
+; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v50
+; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v49
+; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v48
+; GFX11-NEXT: v_add_nc_u32_e32 v3, 3, v39
+; GFX11-NEXT: v_and_b32_e32 v9, 0xffff, v7
+; GFX11-NEXT: v_add_nc_u32_e32 v7, 3, v37
+; GFX11-NEXT: v_add_nc_u32_e32 v8, 3, v36
+; GFX11-NEXT: v_add_nc_u32_e32 v10, 3, v35
+; GFX11-NEXT: v_and_b32_e32 v11, 0xffff, v11
+; GFX11-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1
+; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v2
+; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v3
+; GFX11-NEXT: v_and_b32_e32 v7, 0xff, v7
+; GFX11-NEXT: v_and_b32_e32 v8, 0xff, v8
+; GFX11-NEXT: v_and_b32_e32 v10, 0xff, v10
+; GFX11-NEXT: v_or_b32_e32 v11, v11, v12
+; GFX11-NEXT: s_waitcnt vmcnt(3)
+; GFX11-NEXT: v_add_nc_u32_e32 v12, 3, v46
+; GFX11-NEXT: v_add_nc_u32_e32 v16, 3, v181
+; GFX11-NEXT: v_add_nc_u32_e32 v17, 3, v180
+; GFX11-NEXT: v_or_b32_e32 v0, v76, v0
+; GFX11-NEXT: v_or_b32_e32 v1, v77, v1
+; GFX11-NEXT: v_or_b32_e32 v2, v78, v2
+; GFX11-NEXT: v_or_b32_e32 v3, v79, v3
+; GFX11-NEXT: v_or_b32_e32 v7, v63, v7
+; GFX11-NEXT: v_or_b32_e32 v8, v72, v8
+; GFX11-NEXT: v_or_b32_e32 v10, v73, v10
+; GFX11-NEXT: v_and_b32_e32 v12, 0xff, v12
+; GFX11-NEXT: v_and_b32_e32 v16, 0xff, v16
+; GFX11-NEXT: v_and_b32_e32 v17, 0xff, v17
+; GFX11-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
+; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
+; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
+; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x300, v3
+; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x300, v7
+; GFX11-NEXT: v_add_nc_u32_e32 v8, 0x300, v8
+; GFX11-NEXT: v_add_nc_u32_e32 v10, 0x300, v10
+; GFX11-NEXT: v_or_b32_e32 v12, v61, v12
+; GFX11-NEXT: v_or_b32_e32 v16, v43, v16
+; GFX11-NEXT: v_or_b32_e32 v17, v44, v17
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-NEXT: v_lshlrev_b32_e32 v13, 16, v7
+; GFX11-NEXT: v_and_b32_e32 v14, 0xffff, v8
+; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; GFX11-NEXT: v_add_nc_u32_e32 v12, 0x300, v12
+; GFX11-NEXT: v_add_nc_u32_e32 v16, 0x300, v16
+; GFX11-NEXT: v_add_nc_u32_e32 v17, 0x300, v17
+; GFX11-NEXT: v_or_b32_e32 v7, v0, v1
+; GFX11-NEXT: v_or_b32_e32 v8, v2, v3
+; GFX11-NEXT: v_or_b32_e32 v9, v9, v13
+; GFX11-NEXT: v_or_b32_e32 v10, v14, v10
+; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v32
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v62
+; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v56
+; GFX11-NEXT: v_add_nc_u32_e32 v3, 3, v47
+; GFX11-NEXT: v_and_b32_e32 v14, 0xffff, v12
+; GFX11-NEXT: v_add_nc_u32_e32 v12, 3, v45
+; GFX11-NEXT: v_add_nc_u32_e32 v13, 3, v183
+; GFX11-NEXT: v_add_nc_u32_e32 v15, 3, v182
+; GFX11-NEXT: v_and_b32_e32 v16, 0xffff, v16
+; GFX11-NEXT: v_lshlrev_b32_e32 v17, 16, v17
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1
+; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v2
+; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v3
+; GFX11-NEXT: v_and_b32_e32 v12, 0xff, v12
+; GFX11-NEXT: v_and_b32_e32 v13, 0xff, v13
+; GFX11-NEXT: v_and_b32_e32 v15, 0xff, v15
+; GFX11-NEXT: v_or_b32_e32 v16, v16, v17
+; GFX11-NEXT: v_add_nc_u32_e32 v17, 3, v162
+; GFX11-NEXT: v_add_nc_u32_e32 v21, 3, v145
+; GFX11-NEXT: v_add_nc_u32_e32 v22, 3, v118
+; GFX11-NEXT: v_or_b32_e32 v0, v57, v0
+; GFX11-NEXT: v_or_b32_e32 v1, v58, v1
+; GFX11-NEXT: v_or_b32_e32 v2, v59, v2
+; GFX11-NEXT: v_or_b32_e32 v3, v60, v3
+; GFX11-NEXT: v_or_b32_e32 v12, v40, v12
+; GFX11-NEXT: v_or_b32_e32 v13, v41, v13
+; GFX11-NEXT: v_or_b32_e32 v15, v42, v15
+; GFX11-NEXT: v_and_b32_e32 v17, 0xff, v17
+; GFX11-NEXT: v_and_b32_e32 v21, 0xff, v21
+; GFX11-NEXT: v_and_b32_e32 v22, 0xff, v22
+; GFX11-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
+; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
+; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
+; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x300, v3
+; GFX11-NEXT: v_add_nc_u32_e32 v12, 0x300, v12
+; GFX11-NEXT: v_add_nc_u32_e32 v13, 0x300, v13
+; GFX11-NEXT: v_add_nc_u32_e32 v15, 0x300, v15
+; GFX11-NEXT: v_or_b32_e32 v17, v179, v17
+; GFX11-NEXT: v_or_b32_e32 v21, v160, v21
+; GFX11-NEXT: v_or_b32_e32 v22, v161, v22
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v12
+; GFX11-NEXT: v_and_b32_e32 v19, 0xffff, v13
+; GFX11-NEXT: v_lshlrev_b32_e32 v15, 16, v15
+; GFX11-NEXT: v_add_nc_u32_e32 v17, 0x300, v17
+; GFX11-NEXT: v_add_nc_u32_e32 v21, 0x300, v21
+; GFX11-NEXT: v_add_nc_u32_e32 v22, 0x300, v22
+; GFX11-NEXT: v_or_b32_e32 v12, v0, v1
+; GFX11-NEXT: v_or_b32_e32 v13, v2, v3
+; GFX11-NEXT: v_or_b32_e32 v14, v14, v18
+; GFX11-NEXT: v_or_b32_e32 v15, v19, v15
+; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v166
+; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v165
+; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v164
+; GFX11-NEXT: v_add_nc_u32_e32 v3, 3, v163
+; GFX11-NEXT: v_and_b32_e32 v19, 0xffff, v17
+; GFX11-NEXT: v_add_nc_u32_e32 v17, 3, v148
+; GFX11-NEXT: v_add_nc_u32_e32 v18, 3, v147
+; GFX11-NEXT: v_add_nc_u32_e32 v20, 3, v146
+; GFX11-NEXT: v_and_b32_e32 v21, 0xffff, v21
+; GFX11-NEXT: v_lshlrev_b32_e32 v22, 16, v22
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1
+; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v2
+; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v3
+; GFX11-NEXT: v_and_b32_e32 v17, 0xff, v17
+; GFX11-NEXT: v_and_b32_e32 v18, 0xff, v18
+; GFX11-NEXT: v_and_b32_e32 v20, 0xff, v20
+; GFX11-NEXT: v_or_b32_e32 v21, v21, v22
+; GFX11-NEXT: v_add_nc_u32_e32 v22, 3, v100
+; GFX11-NEXT: v_add_nc_u32_e32 v26, 3, v83
+; GFX11-NEXT: v_add_nc_u32_e32 v27, 3, v82
+; GFX11-NEXT: v_or_b32_e32 v0, v167, v0
+; GFX11-NEXT: v_or_b32_e32 v1, v176, v1
+; GFX11-NEXT: v_or_b32_e32 v2, v177, v2
+; GFX11-NEXT: v_or_b32_e32 v3, v178, v3
+; GFX11-NEXT: v_or_b32_e32 v17, v149, v17
+; GFX11-NEXT: v_or_b32_e32 v18, v150, v18
+; GFX11-NEXT: v_or_b32_e32 v20, v151, v20
+; GFX11-NEXT: v_and_b32_e32 v22, 0xff, v22
+; GFX11-NEXT: v_and_b32_e32 v26, 0xff, v26
+; GFX11-NEXT: v_and_b32_e32 v27, 0xff, v27
+; GFX11-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
+; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
+; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
+; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x300, v3
+; GFX11-NEXT: v_add_nc_u32_e32 v17, 0x300, v17
+; GFX11-NEXT: v_add_nc_u32_e32 v18, 0x300, v18
+; GFX11-NEXT: v_add_nc_u32_e32 v20, 0x300, v20
+; GFX11-NEXT: v_or_b32_e32 v22, v144, v22
+; GFX11-NEXT: v_or_b32_e32 v26, v130, v26
+; GFX11-NEXT: v_or_b32_e32 v27, v131, v27
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-NEXT: v_lshlrev_b32_e32 v23, 16, v17
+; GFX11-NEXT: v_and_b32_e32 v24, 0xffff, v18
+; GFX11-NEXT: v_lshlrev_b32_e32 v20, 16, v20
+; GFX11-NEXT: v_add_nc_u32_e32 v22, 0x300, v22
+; GFX11-NEXT: v_add_nc_u32_e32 v26, 0x300, v26
+; GFX11-NEXT: v_add_nc_u32_e32 v27, 0x300, v27
+; GFX11-NEXT: v_or_b32_e32 v17, v0, v1
+; GFX11-NEXT: v_or_b32_e32 v18, v2, v3
+; GFX11-NEXT: v_or_b32_e32 v19, v19, v23
+; GFX11-NEXT: v_or_b32_e32 v20, v24, v20
+; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v112
+; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v103
+; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v102
+; GFX11-NEXT: v_add_nc_u32_e32 v3, 3, v101
+; GFX11-NEXT: v_and_b32_e32 v24, 0xffff, v22
+; GFX11-NEXT: v_add_nc_u32_e32 v22, 3, v86
+; GFX11-NEXT: v_add_nc_u32_e32 v23, 3, v85
+; GFX11-NEXT: v_add_nc_u32_e32 v25, 3, v84
+; GFX11-NEXT: v_and_b32_e32 v26, 0xffff, v26
+; GFX11-NEXT: v_lshlrev_b32_e32 v27, 16, v27
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1
+; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v2
+; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v3
+; GFX11-NEXT: v_and_b32_e32 v22, 0xff, v22
+; GFX11-NEXT: v_and_b32_e32 v23, 0xff, v23
+; GFX11-NEXT: v_and_b32_e32 v25, 0xff, v25
+; GFX11-NEXT: v_or_b32_e32 v26, v26, v27
+; GFX11-NEXT: v_add_nc_u32_e32 v27, 3, v69
+; GFX11-NEXT: v_or_b32_e32 v0, v132, v0
+; GFX11-NEXT: v_or_b32_e32 v1, v133, v1
+; GFX11-NEXT: v_or_b32_e32 v2, v134, v2
+; GFX11-NEXT: v_or_b32_e32 v3, v135, v3
+; GFX11-NEXT: v_or_b32_e32 v22, v119, v22
+; GFX11-NEXT: v_or_b32_e32 v23, v128, v23
+; GFX11-NEXT: v_or_b32_e32 v25, v129, v25
+; GFX11-NEXT: v_and_b32_e32 v27, 0xff, v27
+; GFX11-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
+; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
+; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
+; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x300, v3
+; GFX11-NEXT: v_add_nc_u32_e32 v22, 0x300, v22
+; GFX11-NEXT: v_add_nc_u32_e32 v23, 0x300, v23
+; GFX11-NEXT: v_add_nc_u32_e32 v25, 0x300, v25
+; GFX11-NEXT: v_or_b32_e32 v27, v117, v27
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-NEXT: v_lshlrev_b32_e32 v28, 16, v22
+; GFX11-NEXT: v_and_b32_e32 v29, 0xffff, v23
+; GFX11-NEXT: v_lshlrev_b32_e32 v25, 16, v25
+; GFX11-NEXT: v_add_nc_u32_e32 v27, 0x300, v27
+; GFX11-NEXT: v_or_b32_e32 v22, v0, v1
+; GFX11-NEXT: v_or_b32_e32 v23, v2, v3
+; GFX11-NEXT: v_or_b32_e32 v24, v24, v28
+; GFX11-NEXT: v_or_b32_e32 v25, v29, v25
+; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v81
+; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v80
+; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v71
+; GFX11-NEXT: v_add_nc_u32_e32 v3, 3, v70
+; GFX11-NEXT: v_and_b32_e32 v29, 0xffff, v27
+; GFX11-NEXT: v_add_nc_u32_e32 v27, 3, v68
+; GFX11-NEXT: v_add_nc_u32_e32 v28, 3, v67
+; GFX11-NEXT: v_add_nc_u32_e32 v30, 3, v66
+; GFX11-NEXT: v_add_nc_u32_e32 v31, 3, v65
+; GFX11-NEXT: v_add_nc_u32_e32 v32, 3, v64
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1
+; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v2
+; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v3
+; GFX11-NEXT: v_and_b32_e32 v27, 0xff, v27
+; GFX11-NEXT: v_and_b32_e32 v28, 0xff, v28
+; GFX11-NEXT: v_and_b32_e32 v30, 0xff, v30
+; GFX11-NEXT: v_and_b32_e32 v31, 0xff, v31
+; GFX11-NEXT: v_and_b32_e32 v32, 0xff, v32
+; GFX11-NEXT: v_or_b32_e32 v0, v113, v0
+; GFX11-NEXT: v_or_b32_e32 v1, v114, v1
+; GFX11-NEXT: v_or_b32_e32 v2, v115, v2
+; GFX11-NEXT: v_or_b32_e32 v3, v116, v3
+; GFX11-NEXT: v_or_b32_e32 v27, v87, v27
+; GFX11-NEXT: v_or_b32_e32 v28, v96, v28
+; GFX11-NEXT: v_or_b32_e32 v30, v97, v30
+; GFX11-NEXT: v_or_b32_e32 v31, v98, v31
+; GFX11-NEXT: v_or_b32_e32 v32, v99, v32
+; GFX11-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
+; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
+; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
+; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x300, v3
+; GFX11-NEXT: v_add_nc_u32_e32 v27, 0x300, v27
+; GFX11-NEXT: v_add_nc_u32_e32 v28, 0x300, v28
+; GFX11-NEXT: v_add_nc_u32_e32 v30, 0x300, v30
+; GFX11-NEXT: v_add_nc_u32_e32 v31, 0x300, v31
+; GFX11-NEXT: v_add_nc_u32_e32 v32, 0x300, v32
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-NEXT: v_lshlrev_b32_e32 v33, 16, v27
+; GFX11-NEXT: v_and_b32_e32 v34, 0xffff, v28
+; GFX11-NEXT: v_lshlrev_b32_e32 v30, 16, v30
+; GFX11-NEXT: v_and_b32_e32 v31, 0xffff, v31
+; GFX11-NEXT: v_lshlrev_b32_e32 v32, 16, v32
+; GFX11-NEXT: v_or_b32_e32 v27, v0, v1
+; GFX11-NEXT: v_or_b32_e32 v28, v2, v3
+; GFX11-NEXT: v_or_b32_e32 v29, v29, v33
+; GFX11-NEXT: v_or_b32_e32 v30, v34, v30
+; GFX11-NEXT: v_or_b32_e32 v31, v31, v32
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-NEXT: .LBB59_3: ; %end
+; GFX11-NEXT: s_clause 0x1f
+; GFX11-NEXT: scratch_load_b32 v111, off, s32 offset:320
+; GFX11-NEXT: scratch_load_b32 v110, off, s32 offset:324
+; GFX11-NEXT: scratch_load_b32 v109, off, s32 offset:328
+; GFX11-NEXT: scratch_load_b32 v108, off, s32 offset:332
+; GFX11-NEXT: scratch_load_b32 v107, off, s32 offset:336
+; GFX11-NEXT: scratch_load_b32 v106, off, s32 offset:340
+; GFX11-NEXT: scratch_load_b32 v105, off, s32 offset:344
+; GFX11-NEXT: scratch_load_b32 v104, off, s32 offset:348
+; GFX11-NEXT: scratch_load_b32 v95, off, s32 offset:352
+; GFX11-NEXT: scratch_load_b32 v94, off, s32 offset:356
+; GFX11-NEXT: scratch_load_b32 v93, off, s32 offset:360
+; GFX11-NEXT: scratch_load_b32 v92, off, s32 offset:364
+; GFX11-NEXT: scratch_load_b32 v91, off, s32 offset:368
+; GFX11-NEXT: scratch_load_b32 v90, off, s32 offset:372
+; GFX11-NEXT: scratch_load_b32 v89, off, s32 offset:376
+; GFX11-NEXT: scratch_load_b32 v88, off, s32 offset:380
+; GFX11-NEXT: scratch_load_b32 v79, off, s32 offset:384
+; GFX11-NEXT: scratch_load_b32 v78, off, s32 offset:388
+; GFX11-NEXT: scratch_load_b32 v77, off, s32 offset:392
+; GFX11-NEXT: scratch_load_b32 v76, off, s32 offset:396
+; GFX11-NEXT: scratch_load_b32 v75, off, s32 offset:400
+; GFX11-NEXT: scratch_load_b32 v74, off, s32 offset:404
+; GFX11-NEXT: scratch_load_b32 v73, off, s32 offset:408
+; GFX11-NEXT: scratch_load_b32 v72, off, s32 offset:412
+; GFX11-NEXT: scratch_load_b32 v63, off, s32 offset:416
+; GFX11-NEXT: scratch_load_b32 v62, off, s32 offset:420
+; GFX11-NEXT: scratch_load_b32 v61, off, s32 offset:424
+; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:428
+; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:432
+; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:436
+; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:440
+; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:444
+; GFX11-NEXT: s_clause 0x7
+; GFX11-NEXT: scratch_load_b32 v47, off, s32 offset:448
+; GFX11-NEXT: scratch_load_b32 v46, off, s32 offset:452
+; GFX11-NEXT: scratch_load_b32 v45, off, s32 offset:456
+; GFX11-NEXT: scratch_load_b32 v44, off, s32 offset:460
+; GFX11-NEXT: scratch_load_b32 v43, off, s32 offset:464
+; GFX11-NEXT: scratch_load_b32 v42, off, s32 offset:468
+; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:472
+; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:476
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-NEXT: .LBB59_4:
+; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+; GFX11-NEXT: s_branch .LBB59_2
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -121411,876 +118822,913 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) {
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: s_clause 0x1f
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:384
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:380
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:376
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:372
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v50, off, s32 offset:368
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:364
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v51, off, s32 offset:360
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v33, off, s32 offset:356
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v51, off, s32 offset:352
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:348
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v52, off, s32 offset:344
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v34, off, s32 offset:340
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v52, off, s32 offset:336
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:332
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v53, off, s32 offset:328
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v35, off, s32 offset:324
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v53, off, s32 offset:320
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v36, off, s32 offset:316
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v54, off, s32 offset:312
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v36, off, s32 offset:308
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v54, off, s32 offset:304
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v37, off, s32 offset:300
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v55, off, s32 offset:296
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v37, off, s32 offset:292
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v64, off, s32 offset:288
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:284
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v64, off, s32 offset:280
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v38, off, s32 offset:276
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v66, off, s32 offset:272
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v39, off, s32 offset:268
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v66, off, s32 offset:264
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v39, off, s32 offset:260
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:536
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:532
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:528
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v43, s32 offset:524
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v44, s32 offset:520
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v45, s32 offset:516
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v46, s32 offset:512
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v47, s32 offset:508
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v56, s32 offset:504
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v57, s32 offset:500
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v58, s32 offset:496
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v59, s32 offset:492
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v60, s32 offset:488
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v61, s32 offset:484
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v62, s32 offset:480
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v63, s32 offset:476
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v72, s32 offset:472
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v73, s32 offset:468
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v74, s32 offset:464
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v75, s32 offset:460
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v76, s32 offset:456
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v77, s32 offset:452
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v78, s32 offset:448
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v79, s32 offset:444
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v88, s32 offset:440
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v89, s32 offset:436
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v90, s32 offset:432
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v91, s32 offset:428
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v92, s32 offset:424
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:420
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:416
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:412
+; GFX11-TRUE16-NEXT: s_clause 0x4
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:408
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:404
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:400
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v107, s32 offset:396
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v108, s32 offset:392
; GFX11-TRUE16-NEXT: s_clause 0x1f
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v67, off, s32 offset:256
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v48, off, s32 offset:252
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v68, off, s32 offset:248
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v48, off, s32 offset:244
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v70, off, s32 offset:240
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v49, off, s32 offset:236
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v70, off, s32 offset:232
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v49, off, s32 offset:228
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v71, off, s32 offset:224
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v50, off, s32 offset:220
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v71, off, s32 offset:216
-; GFX11-TRUE16-NEXT: scratch_load_b32 v114, off, s32 offset:388
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v81, off, s32
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v82, off, s32 offset:8
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v83, off, s32 offset:16
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v83, off, s32 offset:24
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v85, off, s32 offset:32
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v85, off, s32 offset:40
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v87, off, s32 offset:48
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v87, off, s32 offset:56
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v97, off, s32 offset:64
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v97, off, s32 offset:72
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v98, off, s32 offset:80
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v99, off, s32 offset:88
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v100, off, s32 offset:96
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v101, off, s32 offset:104
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v102, off, s32 offset:112
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v160, off, s32 offset:120
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v160, off, s32 offset:128
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v161, off, s32 offset:136
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v161, off, s32 offset:144
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v162, off, s32 offset:152
+; GFX11-TRUE16-NEXT: scratch_load_u16 v31, off, s32 offset:384
+; GFX11-TRUE16-NEXT: scratch_load_u16 v112, off, s32 offset:380
+; GFX11-TRUE16-NEXT: scratch_load_u16 v32, off, s32 offset:376
+; GFX11-TRUE16-NEXT: scratch_load_u16 v113, off, s32 offset:372
+; GFX11-TRUE16-NEXT: scratch_load_u16 v56, off, s32 offset:368
+; GFX11-TRUE16-NEXT: scratch_load_u16 v114, off, s32 offset:364
+; GFX11-TRUE16-NEXT: scratch_load_u16 v33, off, s32 offset:360
+; GFX11-TRUE16-NEXT: scratch_load_u16 v115, off, s32 offset:356
+; GFX11-TRUE16-NEXT: scratch_load_u16 v57, off, s32 offset:352
+; GFX11-TRUE16-NEXT: scratch_load_u16 v116, off, s32 offset:348
+; GFX11-TRUE16-NEXT: scratch_load_u16 v34, off, s32 offset:344
+; GFX11-TRUE16-NEXT: scratch_load_u16 v117, off, s32 offset:340
+; GFX11-TRUE16-NEXT: scratch_load_u16 v58, off, s32 offset:336
+; GFX11-TRUE16-NEXT: scratch_load_u16 v118, off, s32 offset:332
+; GFX11-TRUE16-NEXT: scratch_load_u16 v35, off, s32 offset:328
+; GFX11-TRUE16-NEXT: scratch_load_u16 v119, off, s32 offset:324
+; GFX11-TRUE16-NEXT: scratch_load_u16 v59, off, s32 offset:320
+; GFX11-TRUE16-NEXT: scratch_load_u16 v128, off, s32 offset:316
+; GFX11-TRUE16-NEXT: scratch_load_u16 v36, off, s32 offset:312
+; GFX11-TRUE16-NEXT: scratch_load_u16 v129, off, s32 offset:308
+; GFX11-TRUE16-NEXT: scratch_load_u16 v60, off, s32 offset:304
+; GFX11-TRUE16-NEXT: scratch_load_u16 v130, off, s32 offset:300
+; GFX11-TRUE16-NEXT: scratch_load_u16 v37, off, s32 offset:296
+; GFX11-TRUE16-NEXT: scratch_load_u16 v131, off, s32 offset:292
+; GFX11-TRUE16-NEXT: scratch_load_u16 v61, off, s32 offset:288
+; GFX11-TRUE16-NEXT: scratch_load_u16 v132, off, s32 offset:284
+; GFX11-TRUE16-NEXT: scratch_load_u16 v38, off, s32 offset:280
+; GFX11-TRUE16-NEXT: scratch_load_u16 v133, off, s32 offset:276
+; GFX11-TRUE16-NEXT: scratch_load_u16 v62, off, s32 offset:272
+; GFX11-TRUE16-NEXT: scratch_load_u16 v134, off, s32 offset:268
+; GFX11-TRUE16-NEXT: scratch_load_u16 v39, off, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_load_u16 v135, off, s32 offset:260
; GFX11-TRUE16-NEXT: s_clause 0x1f
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v162, off, s32 offset:160
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v163, off, s32 offset:168
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v163, off, s32 offset:176
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v164, off, s32 offset:184
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v164, off, s32 offset:192
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v165, off, s32 offset:200
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v165, off, s32 offset:208
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v55, off, s32 offset:212
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v65, off, s32 offset:204
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v65, off, s32 offset:196
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v67, off, s32 offset:188
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v68, off, s32 offset:180
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v69, off, s32 offset:172
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v69, off, s32 offset:164
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v80, off, s32 offset:156
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v80, off, s32 offset:148
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v81, off, s32 offset:140
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v82, off, s32 offset:132
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v84, off, s32 offset:124
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v84, off, s32 offset:116
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v86, off, s32 offset:108
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v86, off, s32 offset:100
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v96, off, s32 offset:92
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v96, off, s32 offset:84
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v98, off, s32 offset:76
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v99, off, s32 offset:68
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v100, off, s32 offset:60
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v101, off, s32 offset:52
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v103, off, s32 offset:44
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v103, off, s32 offset:36
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v112, off, s32 offset:28
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v113, off, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_load_u16 v63, off, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_load_u16 v144, off, s32 offset:252
+; GFX11-TRUE16-NEXT: scratch_load_u16 v48, off, s32 offset:248
+; GFX11-TRUE16-NEXT: scratch_load_u16 v145, off, s32 offset:244
+; GFX11-TRUE16-NEXT: scratch_load_u16 v72, off, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_load_u16 v146, off, s32 offset:236
+; GFX11-TRUE16-NEXT: scratch_load_u16 v49, off, s32 offset:232
+; GFX11-TRUE16-NEXT: scratch_load_u16 v147, off, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_load_u16 v73, off, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_load_u16 v148, off, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_load_u16 v50, off, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_load_b32 v51, off, s32 offset:388
+; GFX11-TRUE16-NEXT: scratch_load_u16 v52, off, s32
+; GFX11-TRUE16-NEXT: scratch_load_u16 v53, off, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_load_u16 v54, off, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_load_u16 v55, off, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_load_u16 v64, off, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_load_u16 v65, off, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_load_u16 v66, off, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_load_u16 v67, off, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_load_u16 v74, off, s32 offset:64
+; GFX11-TRUE16-NEXT: scratch_load_u16 v75, off, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_load_u16 v76, off, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_load_u16 v77, off, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_load_u16 v78, off, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_load_u16 v79, off, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_load_u16 v88, off, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_load_u16 v89, off, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_load_u16 v90, off, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_load_u16 v91, off, s32 offset:136
+; GFX11-TRUE16-NEXT: scratch_load_u16 v92, off, s32 offset:144
+; GFX11-TRUE16-NEXT: scratch_load_u16 v93, off, s32 offset:152
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_load_u16 v94, off, s32 offset:160
+; GFX11-TRUE16-NEXT: scratch_load_u16 v95, off, s32 offset:168
+; GFX11-TRUE16-NEXT: scratch_load_u16 v104, off, s32 offset:176
+; GFX11-TRUE16-NEXT: scratch_load_u16 v105, off, s32 offset:184
+; GFX11-TRUE16-NEXT: scratch_load_u16 v106, off, s32 offset:192
+; GFX11-TRUE16-NEXT: scratch_load_u16 v107, off, s32 offset:200
+; GFX11-TRUE16-NEXT: scratch_load_u16 v108, off, s32 offset:208
+; GFX11-TRUE16-NEXT: scratch_load_u16 v149, off, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_load_u16 v150, off, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_load_u16 v151, off, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_load_u16 v160, off, s32 offset:188
+; GFX11-TRUE16-NEXT: scratch_load_u16 v161, off, s32 offset:180
+; GFX11-TRUE16-NEXT: scratch_load_u16 v162, off, s32 offset:172
+; GFX11-TRUE16-NEXT: scratch_load_u16 v163, off, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_load_u16 v164, off, s32 offset:156
+; GFX11-TRUE16-NEXT: scratch_load_u16 v165, off, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_load_u16 v166, off, s32 offset:140
+; GFX11-TRUE16-NEXT: scratch_load_u16 v167, off, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_load_u16 v176, off, s32 offset:124
+; GFX11-TRUE16-NEXT: scratch_load_u16 v177, off, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_load_u16 v178, off, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_load_u16 v179, off, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_load_u16 v180, off, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_load_u16 v181, off, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_load_u16 v182, off, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_load_u16 v183, off, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_load_u16 v40, off, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_load_u16 v41, off, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_load_u16 v42, off, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_load_u16 v43, off, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_load_u16 v44, off, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_load_u16 v45, off, s32 offset:20
; GFX11-TRUE16-NEXT: s_clause 0x1
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v115, off, s32 offset:12
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v115, off, s32 offset:4
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v117.l, v30.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v118.h, v28.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v119.l, v26.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v119.h, v24.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v130.l, v22.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v130.h, v20.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v131.l, v18.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v132.h, v16.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v134.l, v14.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v134.h, v12.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v144.l, v10.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v145.h, v8.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v146.h, v6.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v146.l, v4.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v149.l, v2.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v149.h, v0.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v151.l, 8, v1.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v151.h, 8, v3.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v150.l, 8, v5.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v150.h, 8, v7.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v148.l, 8, v9.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v148.h, 8, v11.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v147.l, 8, v13.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v147.h, 8, v15.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v144.h, 8, v17.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v145.l, 8, v19.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v135.l, 8, v21.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v135.h, 8, v23.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v133.l, 8, v25.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v133.h, 8, v27.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v131.h, 8, v29.l
+; GFX11-TRUE16-NEXT: scratch_load_u16 v46, off, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_load_u16 v47, off, s32 offset:4
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v69.l, v30.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v70.h, v28.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v71.l, v26.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v71.h, v24.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v82.l, v22.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v82.h, v20.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v83.l, v18.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v84.h, v16.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v86.l, v14.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v86.h, v12.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v96.l, v10.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v97.h, v8.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v98.h, v6.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v98.l, v4.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v101.l, v2.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v101.h, v0.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v103.l, 8, v1.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v103.h, 8, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v102.l, 8, v5.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v102.h, 8, v7.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v100.l, 8, v9.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v100.h, 8, v11.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v99.l, 8, v13.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v99.h, 8, v15.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v96.h, 8, v17.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v97.l, 8, v19.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v87.l, 8, v21.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v87.h, 8, v23.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v85.l, 8, v25.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v85.h, 8, v27.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v83.h, 8, v29.l
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(62)
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v32.h, 8, v31.l
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v32.l, 8, v32.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v56.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.l, 8, v33.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v57.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.l, 8, v34.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v35.h, 8, v58.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v35.l, 8, v35.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v36.h, 8, v59.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v36.l, 8, v36.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v37.h, 8, v60.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v37.l, 8, v37.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v38.h, 8, v61.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v38.l, 8, v38.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v39.h, 8, v62.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v39.l, 8, v39.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v48.h, 8, v63.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v48.l, 8, v48.l
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(59)
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v49.h, 8, v72.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v49.l, 8, v49.l
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(55)
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.h, 8, v73.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.l, 8, v50.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(54)
-; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v114
+; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v51
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(53)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v132.l, 8, v81.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v84.l, 8, v52.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(52)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.l, 8, v82.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v81.l, 8, v53.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(51)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.h, 8, v83.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v81.h, 8, v54.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(50)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v128.l, 8, v83.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v80.l, 8, v55.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(49)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v128.h, 8, v85.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v80.h, 8, v64.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(48)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v117.h, 8, v85.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v69.h, 8, v65.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(47)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v118.l, 8, v87.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v70.l, 8, v66.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(46)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v116.l, 8, v87.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v68.l, 8, v67.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(45)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v116.h, 8, v97.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v68.h, 8, v74.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(44)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v114.l, 8, v97.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v67.l, 8, v75.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(43)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v114.h, 8, v98.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v67.h, 8, v76.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(42)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v112.h, 8, v99.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v66.l, 8, v77.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(41)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v113.l, 8, v100.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v66.h, 8, v78.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(40)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v102.l, 8, v101.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v65.l, 8, v79.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(39)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v102.h, 8, v102.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v65.h, 8, v88.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(38)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v100.h, 8, v160.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v64.l, 8, v89.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(37)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v101.l, 8, v160.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v64.h, 8, v90.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(36)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v98.h, 8, v161.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.l, 8, v91.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(35)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v99.l, 8, v161.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.h, 8, v92.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(34)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v97.l, 8, v162.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.l, 8, v93.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(33)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v97.h, 8, v162.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.h, 8, v94.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(32)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v87.l, 8, v163.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.l, 8, v95.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(31)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v87.h, 8, v163.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.h, 8, v104.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(30)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v85.l, 8, v164.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.l, 8, v105.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(29)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v85.h, 8, v164.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.h, 8, v106.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(28)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v83.l, 8, v165.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.l, 8, v107.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(27)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v83.h, 8, v165.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v81.h, 8, v71.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v82.l, 8, v71.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v71.l, 8, v70.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v71.h, 8, v70.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v70.l, 8, v68.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v70.h, 8, v67.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v67.h, 8, v66.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v68.l, 8, v66.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v66.l, 8, v64.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v66.h, 8, v64.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v64.l, 8, v55.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v64.h, 8, v54.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.h, 8, v54.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.l, 8, v53.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.h, 8, v53.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.l, 8, v52.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.h, 8, v52.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.l, 8, v51.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.h, 8, v51.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.l, 8, v50.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.h, 8, v31.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.l, 8, v31.l
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.h, 8, v108.l
; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
-; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB74_3
-; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow
-; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
-; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB74_4
-; GFX11-TRUE16-NEXT: .LBB74_2: ; %end
-; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
-; GFX11-TRUE16-NEXT: .LBB74_3: ; %cmp.false
-; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v149.h
-; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v149.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v149.h, 0
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v146.h
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v146.l
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v0.l, v151.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v151.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v0.l, v149.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v1.l, v150.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v3.l, v149.h
-; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v145.h
-; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v144.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v149, v0
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v1.h, v150.l
-; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v132.h
-; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v131.l
-; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v148.h
-; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v130.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v149, v3
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v2.l, v148.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v2.l, v149.h
-; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v134.h
-; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v134.l
-; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v145.l
-; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v130.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v149, v2
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v3.l, v147.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v147.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v3.l, v149.h
-; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v135.h
-; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v119.h
-; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v119.l
-; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v118.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v149, v3
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v4.l, v144.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v4.l, v149.h
-; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v133.h
-; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v117.l
+; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB74_2
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v101.h
+; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v101.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v101.h, 0
+; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v98.h
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v98.l
+; GFX11-TRUE16-NEXT: v_or_b16 v101.l, v0.l, v103.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v103.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v101.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v1.l, v102.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v101.h
+; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v97.h
+; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v96.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v101, v0
+; GFX11-TRUE16-NEXT: v_or_b16 v101.l, v1.h, v102.l
+; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v84.h
+; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v83.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v100.h
+; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v82.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v101, v3
+; GFX11-TRUE16-NEXT: v_or_b16 v101.l, v2.l, v100.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v101.h
+; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v86.h
+; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v86.l
+; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v97.l
+; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v82.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v101, v2
+; GFX11-TRUE16-NEXT: v_or_b16 v101.l, v3.l, v99.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v99.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v101.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v87.h
+; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v71.h
+; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v71.l
+; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v70.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v101, v3
+; GFX11-TRUE16-NEXT: v_or_b16 v101.l, v4.l, v96.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v101.h
+; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v85.h
+; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v69.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v115.h
-; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v115.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v149, v4
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v5.l, v135.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v5.l, v149.h
-; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v132.l
-; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v129.h
-; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v113.h
-; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v112.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v149, v5
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v6.l, v133.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v6.l, v149.h
-; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v103.h
-; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v128.h
-; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v103.l
-; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v101.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v149, v6
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v7.l, v131.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v7.l, v149.h
-; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v118.l
-; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v100.l
-; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v99.h
-; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v98.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v149, v7
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v8.l, v129.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v8.l, v149.h
-; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v11.h, v116.h
-; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v114.h
-; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v96.h
-; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v96.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v149, v8
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v9.l, v128.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v9.l, v149.h
-; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v86.h
-; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v13.h, v113.l
-; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v86.l
-; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v84.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v149, v9
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v10.l, v117.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v10.l, v149.h
-; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v102.h
-; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v84.l
-; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v82.h
-; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v81.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v149, v10
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v11.l, v116.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v11.l, v149.h
-; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v15.h, v101.l
-; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v16.h, v99.l
-; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v80.h
-; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v80.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v149, v11
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v12.l, v114.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v12.l, v149.h
-; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v69.h
-; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v17.h, v97.h
-; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v69.l
-; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v68.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v149, v12
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v13.l, v112.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v13.l, v149.h
-; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v18.h, v87.h
-; GFX11-TRUE16-NEXT: v_and_b16 v19.h, 0xff, v67.l
-; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v65.h
-; GFX11-TRUE16-NEXT: v_and_b16 v20.h, 0xff, v65.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v149, v13
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v14.l, v102.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v14.l, v149.h
-; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v19.h, v85.h
-; GFX11-TRUE16-NEXT: v_or_b16 v20.h, v20.h, v83.h
-; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v55.h
-; GFX11-TRUE16-NEXT: v_and_b16 v21.h, 0xff, v50.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v149, v14
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v15.l, v100.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v15.l, v149.h
-; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v49.h
-; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v21.h, v82.l
-; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v49.l
-; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v48.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v149, v15
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v16.l, v98.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v16.l, v149.h
-; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v22.h, v71.h
-; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v48.l
-; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v39.h
-; GFX11-TRUE16-NEXT: v_and_b16 v24.h, 0xff, v39.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v149, v16
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v17.l, v97.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v17.l, v149.h
-; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v23.h, v70.h
-; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v24.h, v68.l
-; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v38.h
-; GFX11-TRUE16-NEXT: v_and_b16 v25.h, 0xff, v38.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v149, v17
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v18.l, v87.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v18.l, v149.h
-; GFX11-TRUE16-NEXT: v_and_b16 v26.l, 0xff, v37.h
-; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v25.h, v66.h
-; GFX11-TRUE16-NEXT: v_and_b16 v26.h, 0xff, v37.l
-; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v36.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v149, v18
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v19.l, v85.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v19.l, v149.h
-; GFX11-TRUE16-NEXT: v_or_b16 v26.h, v26.h, v64.h
-; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v36.l
-; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v35.h
-; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v35.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v149, v19
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v20.l, v83.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v20.l, v149.h
-; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v27.h, v55.l
-; GFX11-TRUE16-NEXT: v_or_b16 v28.h, v28.h, v54.l
-; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v34.h
-; GFX11-TRUE16-NEXT: v_and_b16 v29.h, 0xff, v34.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v149, v20
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v21.l, v81.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v21.l, v149.h
-; GFX11-TRUE16-NEXT: v_and_b16 v30.l, 0xff, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v29.h, v29.h, v53.l
-; GFX11-TRUE16-NEXT: v_and_b16 v30.h, 0xff, v33.l
-; GFX11-TRUE16-NEXT: v_and_b16 v31.l, 0xff, v32.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v149, v21
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v22.l, v71.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v22.l, v149.h
-; GFX11-TRUE16-NEXT: v_or_b16 v30.h, v30.h, v52.l
-; GFX11-TRUE16-NEXT: v_and_b16 v31.h, 0xff, v32.l
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr146_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr146_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr145_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr134_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr134_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr132_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr131_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr119_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr119_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr117_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr103_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr103_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99_hi16
+; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v47.l
+; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v46.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v101, v4
+; GFX11-TRUE16-NEXT: v_or_b16 v101.l, v5.l, v87.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v101.h
+; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v84.l
+; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v81.h
+; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v45.l
+; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v44.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v101, v5
+; GFX11-TRUE16-NEXT: v_or_b16 v101.l, v6.l, v85.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v101.h
+; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v43.l
+; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v80.h
+; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v42.l
+; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v41.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v101, v6
+; GFX11-TRUE16-NEXT: v_or_b16 v101.l, v7.l, v83.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v101.h
+; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v70.l
+; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v40.l
+; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v183.l
+; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v182.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v101, v7
+; GFX11-TRUE16-NEXT: v_or_b16 v101.l, v8.l, v81.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v101.h
+; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v11.h, v68.h
+; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v67.h
+; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v181.l
+; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v180.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v101, v8
+; GFX11-TRUE16-NEXT: v_or_b16 v101.l, v9.l, v80.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v101.h
+; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v179.l
+; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v13.h, v66.h
+; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v178.l
+; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v177.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v101, v9
+; GFX11-TRUE16-NEXT: v_or_b16 v101.l, v10.l, v69.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v101.h
+; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v65.h
+; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v176.l
+; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v167.l
+; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v166.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v101, v10
+; GFX11-TRUE16-NEXT: v_or_b16 v101.l, v11.l, v68.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v101.h
+; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v15.h, v64.h
+; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v16.h, v55.h
+; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v165.l
+; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v164.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v101, v11
+; GFX11-TRUE16-NEXT: v_or_b16 v101.l, v12.l, v67.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v101.h
+; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v163.l
+; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v17.h, v54.h
+; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v162.l
+; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v161.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v101, v12
+; GFX11-TRUE16-NEXT: v_or_b16 v101.l, v13.l, v66.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v101.h
+; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v18.h, v53.h
+; GFX11-TRUE16-NEXT: v_and_b16 v19.h, 0xff, v160.l
+; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v151.l
+; GFX11-TRUE16-NEXT: v_and_b16 v20.h, 0xff, v150.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v101, v13
+; GFX11-TRUE16-NEXT: v_or_b16 v101.l, v14.l, v65.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v101.h
+; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v19.h, v52.h
+; GFX11-TRUE16-NEXT: v_or_b16 v20.h, v20.h, v51.h
+; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v149.l
+; GFX11-TRUE16-NEXT: v_and_b16 v21.h, 0xff, v148.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v101, v14
+; GFX11-TRUE16-NEXT: v_or_b16 v101.l, v15.l, v64.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v101.h
+; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v147.l
+; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v21.h, v50.h
+; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v146.l
+; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v145.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v101, v15
+; GFX11-TRUE16-NEXT: v_or_b16 v101.l, v16.l, v55.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v101.h
+; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v22.h, v49.h
+; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v144.l
+; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v135.l
+; GFX11-TRUE16-NEXT: v_and_b16 v24.h, 0xff, v134.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v101, v16
+; GFX11-TRUE16-NEXT: v_or_b16 v101.l, v17.l, v54.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v101.h
+; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v23.h, v48.h
+; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v24.h, v39.h
+; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v133.l
+; GFX11-TRUE16-NEXT: v_and_b16 v25.h, 0xff, v132.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v101, v17
+; GFX11-TRUE16-NEXT: v_or_b16 v101.l, v18.l, v53.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v101.h
+; GFX11-TRUE16-NEXT: v_and_b16 v26.l, 0xff, v131.l
+; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v25.h, v38.h
+; GFX11-TRUE16-NEXT: v_and_b16 v26.h, 0xff, v130.l
+; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v129.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v101, v18
+; GFX11-TRUE16-NEXT: v_or_b16 v101.l, v19.l, v52.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v101.h
+; GFX11-TRUE16-NEXT: v_or_b16 v26.h, v26.h, v37.h
+; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v128.l
+; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v119.l
+; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v118.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v101, v19
+; GFX11-TRUE16-NEXT: v_or_b16 v101.l, v20.l, v51.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v101.h
+; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v27.h, v36.h
+; GFX11-TRUE16-NEXT: v_or_b16 v28.h, v28.h, v35.h
+; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v117.l
+; GFX11-TRUE16-NEXT: v_and_b16 v29.h, 0xff, v116.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v101, v20
+; GFX11-TRUE16-NEXT: v_or_b16 v101.l, v21.l, v50.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v101.h
+; GFX11-TRUE16-NEXT: v_and_b16 v30.l, 0xff, v115.l
+; GFX11-TRUE16-NEXT: v_or_b16 v29.h, v29.h, v34.h
+; GFX11-TRUE16-NEXT: v_and_b16 v30.h, 0xff, v114.l
+; GFX11-TRUE16-NEXT: v_and_b16 v31.l, 0xff, v113.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v101, v21
+; GFX11-TRUE16-NEXT: v_or_b16 v101.l, v22.l, v49.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v101.h
+; GFX11-TRUE16-NEXT: v_or_b16 v30.h, v30.h, v33.h
+; GFX11-TRUE16-NEXT: v_and_b16 v31.h, 0xff, v112.l
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr150_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr150_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr147_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr147_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr145_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr135_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr135_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr131_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr132_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr117_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr114_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr114_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr114
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr117
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr119
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr131
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr132
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr134
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr135
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr145
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr146
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr147
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr150
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr160
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr161
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr162
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr163
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr164
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr165
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr166
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr167
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr176
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr177
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr178
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr179
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr180
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr181
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr182
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr183
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr40
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr41
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr42
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr43
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr44
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr45
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr46
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr47
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr103_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr103_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v149, v22
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v23.l, v70.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v23.l, v149.h
-; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v31.h, v51.l
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_hi16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v101, v22
+; GFX11-TRUE16-NEXT: v_or_b16 v101.l, v23.l, v48.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v101.h
+; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v31.h, v32.h
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_hi16
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v149, v23
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v24.l, v67.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v24.l, v149.h
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_hi16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v149, v24
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v25.l, v66.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v25.l, v149.h
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_lo16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v101, v23
+; GFX11-TRUE16-NEXT: v_or_b16 v101.l, v24.l, v39.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v101.h
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v101, v24
+; GFX11-TRUE16-NEXT: v_or_b16 v101.l, v25.l, v38.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v101.h
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v149, v25
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v26.l, v64.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v26.l, v149.h
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_lo16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v149, v26
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v27.l, v54.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v27.l, v149.h
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_hi16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v101, v25
+; GFX11-TRUE16-NEXT: v_or_b16 v101.l, v26.l, v37.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v101.h
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v101, v26
+; GFX11-TRUE16-NEXT: v_or_b16 v101.l, v27.l, v36.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v101.h
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v149, v27
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v28.l, v53.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v28.l, v149.h
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_hi16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v149, v28
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v29.l, v52.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v29.l, v149.h
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_hi16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v101, v27
+; GFX11-TRUE16-NEXT: v_or_b16 v101.l, v28.l, v35.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v101.h
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v101, v28
+; GFX11-TRUE16-NEXT: v_or_b16 v101.l, v29.l, v34.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v101.h
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v149, v29
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v30.l, v51.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v30.l, v149.h
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_hi16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v149, v30
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v31.l, v50.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v31.l, v149.h
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_hi16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v101, v29
+; GFX11-TRUE16-NEXT: v_or_b16 v101.l, v30.l, v33.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v101.h
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v101, v30
+; GFX11-TRUE16-NEXT: v_or_b16 v101.l, v31.l, v32.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, v101.h
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v149, v31
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_lo16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v101, v31
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_lo16
+; GFX11-TRUE16-NEXT: .LBB74_2: ; %Flow
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
-; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB74_2
-; GFX11-TRUE16-NEXT: .LBB74_4: ; %cmp.true
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v149.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v149.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v146.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v146.l, 3
+; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB74_4
+; GFX11-TRUE16-NEXT: ; %bb.3: ; %cmp.true
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v101.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v101.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v98.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v98.l, 3
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.h, 0
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v151.l, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v151.h, v0.h
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v150.h, v1.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v150.l, v1.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v145.h, 3
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v103.l, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v103.h, v0.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v102.h, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v102.l, v1.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v97.h, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v0.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v0.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v144.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v96.l, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, 0x300, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v134.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v86.h, 3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v31, v3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v1.h
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v134.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v86.l, 3
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v148.l, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v100.l, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v148.h, v1.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v100.h, v1.h
; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v31, v4
; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v3.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v31.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v3.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v147.l, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v147.h, v2.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v132.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v131.l, 3
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v99.l, v2.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v99.h, v2.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v84.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v83.l, 3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v31, v5
; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v3.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v3.h
; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v4.l
; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v4.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v130.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v130.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v82.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v82.l, 3
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v144.h, v3.l
-; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v145.l, v3.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v96.h, v3.l
+; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v97.l, v3.h
; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l
; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h
; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v31, v6
; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v5.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x300, v5.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v135.l, v4.l
-; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v135.h, v4.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v119.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v119.l, 3
+; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v87.l, v4.l
+; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v87.h, v4.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v71.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v71.l, 3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v31, v7
; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v5.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, 0x300, v5.h
; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v6.l
; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v6.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v118.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v117.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v70.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v69.l, 3
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v133.l, v5.l
-; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v133.h, v5.h
+; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v85.l, v5.l
+; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v85.h, v5.h
; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l
; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h
; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v31, v8
; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v7.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, 0x300, v7.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v131.h, v6.l
-; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v132.l, v6.h
+; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v83.h, v6.l
+; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v84.l, v6.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v115.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v115.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v47.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v46.l, 3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v31, v9
; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v7.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, 0x300, v7.h
; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v8.l
; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v8.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v113.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v112.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v45.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v44.l, 3
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v129.l, v7.l
-; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v129.h, v7.h
+; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v81.l, v7.l
+; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v81.h, v7.h
; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l
; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h
; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v31, v10
; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v9.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v9.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v128.l, v8.l
-; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v128.h, v8.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, v103.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, v103.l, 3
+; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v80.l, v8.l
+; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v80.h, v8.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, v43.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, v42.l, 3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v31, v11
; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v9.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, 0x300, v9.h
; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v10.l
; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v10.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, v101.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, v100.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, v41.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, v40.l, 3
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v117.h, v9.l
-; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v118.l, v9.h
+; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v69.h, v9.l
+; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v70.l, v9.h
; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l
; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v10.h
; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v31, v12
; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v11.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, 0x300, v11.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v116.l, v10.l
-; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v116.h, v10.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v99.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v98.l, 3
+; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v68.l, v10.l
+; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v68.h, v10.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v183.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v182.l, 3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v31, v13
; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v11.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, 0x300, v11.h
; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v12.l
; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v12.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v96.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v96.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v181.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v180.l, 3
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v114.l, v11.l
-; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v114.h, v11.h
+; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v67.l, v11.l
+; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v67.h, v11.h
; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l
; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h
; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v31, v14
; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v13.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.h, 0x300, v13.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v112.h, v12.l
-; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v113.l, v12.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v86.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, v86.l, 3
+; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v66.l, v12.l
+; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v66.h, v12.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v179.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, v178.l, 3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v31, v15
; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v13.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.h, 0x300, v13.h
; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v14.l
; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v14.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v84.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, v84.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v177.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, v176.l, 3
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v102.l, v13.l
-; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v102.h, v13.h
+; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v65.l, v13.l
+; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v65.h, v13.h
; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l
; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v14.h
; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v31, v16
; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v15.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.h, 0x300, v15.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v100.h, v14.l
-; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v101.l, v14.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.l, v82.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.h, v81.l, 3
+; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v64.l, v14.l
+; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v64.h, v14.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.l, v167.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.h, v166.l, 3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v31, v17
; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v15.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, 0x300, v15.h
; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v16.l
; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v16.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.l, v80.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.h, v80.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.l, v165.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.h, v164.l, 3
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v98.h, v15.l
-; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v99.l, v15.h
+; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v55.l, v15.l
+; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v55.h, v15.h
; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v16.l
; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v16.h
; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v31, v18
; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v17.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.h, 0x300, v17.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v97.l, v16.l
-; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v97.h, v16.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, v69.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, v69.l, 3
+; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v54.l, v16.l
+; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v54.h, v16.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, v163.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, v162.l, 3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v31, v19
; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v17.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.h, 0x300, v17.h
; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v18.l
; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v18.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, v68.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, v67.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, v161.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, v160.l, 3
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v87.l, v17.l
-; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v87.h, v17.h
+; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v53.l, v17.l
+; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v53.h, v17.h
; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v18.l
; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v18.h
; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v31, v20
; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v19.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, 0x300, v19.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v85.l, v18.l
-; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v85.h, v18.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.l, v65.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.h, v65.l, 3
+; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v52.l, v18.l
+; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v52.h, v18.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.l, v151.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.h, v150.l, 3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v31, v21
; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v19.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.h, 0x300, v19.h
; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v20.l
; GFX11-TRUE16-NEXT: v_and_b16 v19.h, 0xff, v20.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.l, v55.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.h, v50.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.l, v149.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.h, v148.l, 3
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v83.l, v19.l
-; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v83.h, v19.h
+; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v51.l, v19.l
+; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v51.h, v19.h
; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v20.l
; GFX11-TRUE16-NEXT: v_and_b16 v20.h, 0xff, v20.h
; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v31, v22
; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v21.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.h, 0x300, v21.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v81.h, v20.l
-; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v82.l, v20.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, v49.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.h, v49.l, 3
+; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v50.l, v20.l
+; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v50.h, v20.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, v147.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.h, v146.l, 3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v31, v23
; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v21.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.h, 0x300, v21.h
; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v22.l
; GFX11-TRUE16-NEXT: v_and_b16 v21.h, 0xff, v22.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, v48.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.h, v48.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, v145.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.h, v144.l, 3
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v71.l, v21.l
-; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v71.h, v21.h
+; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v49.l, v21.l
+; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v49.h, v21.h
; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v22.l
; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v22.h
; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v31, v24
; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v23.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.h, 0x300, v23.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v70.l, v22.l
-; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v70.h, v22.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, v39.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.h, v39.l, 3
+; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v48.l, v22.l
+; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v48.h, v22.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, v135.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.h, v134.l, 3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v31, v25
; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v23.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.h, 0x300, v23.h
; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v24.l
; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v24.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, v38.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.h, v38.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, v133.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.h, v132.l, 3
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v67.h, v23.l
-; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v68.l, v23.h
+; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v39.l, v23.l
+; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v39.h, v23.h
; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v24.l
; GFX11-TRUE16-NEXT: v_and_b16 v24.h, 0xff, v24.h
; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v31, v26
; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v25.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, 0x300, v25.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v66.l, v24.l
-; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v66.h, v24.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.l, v37.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.h, v37.l, 3
+; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v38.l, v24.l
+; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v38.h, v24.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.l, v131.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.h, v130.l, 3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v31, v27
; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v25.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.h, 0x300, v25.h
; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v26.l
; GFX11-TRUE16-NEXT: v_and_b16 v25.h, 0xff, v26.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.l, v36.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.h, v36.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.l, v129.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.h, v128.l, 3
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v64.l, v25.l
-; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v64.h, v25.h
+; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v37.l, v25.l
+; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v37.h, v25.h
; GFX11-TRUE16-NEXT: v_and_b16 v26.l, 0xff, v26.l
; GFX11-TRUE16-NEXT: v_and_b16 v26.h, 0xff, v26.h
; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v31, v28
; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v27.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.h, 0x300, v27.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v54.h, v26.l
-; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v55.l, v26.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, v35.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.h, v35.l, 3
+; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v36.l, v26.l
+; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v36.h, v26.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, v119.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.h, v118.l, 3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v31, v29
; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v27.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.h, 0x300, v27.h
; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v28.l
; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v28.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, v34.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.h, v34.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, v117.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.h, v116.l, 3
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v53.h, v27.l
-; GFX11-TRUE16-NEXT: v_or_b16 v29.h, v54.l, v27.h
+; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v35.l, v27.l
+; GFX11-TRUE16-NEXT: v_or_b16 v29.h, v35.h, v27.h
; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v28.l
; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v28.h
; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v31, v30
; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v29.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v34.h, 0x300, v29.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v52.h, v28.l
-; GFX11-TRUE16-NEXT: v_or_b16 v29.h, v53.l, v28.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.l, v33.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.h, v33.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v31, v34
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v35.h, 0x300, v29.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v31.h
+; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v34.l, v28.l
+; GFX11-TRUE16-NEXT: v_or_b16 v29.h, v34.h, v28.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.l, v115.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.h, v114.l, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v31, v35
; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v29.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v33.h, 0x300, v29.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v34.h, 0x300, v29.h
; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v30.l
; GFX11-TRUE16-NEXT: v_and_b16 v29.h, 0xff, v30.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.l, v32.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.h, v32.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v32.l, v51.h, v29.l
-; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v52.l, v29.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.l, v113.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.h, v112.l, 3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v31.h
+; GFX11-TRUE16-NEXT: v_or_b16 v33.l, v33.l, v29.l
+; GFX11-TRUE16-NEXT: v_or_b16 v33.h, v33.h, v29.h
; GFX11-TRUE16-NEXT: v_and_b16 v30.l, 0xff, v30.l
; GFX11-TRUE16-NEXT: v_and_b16 v30.h, 0xff, v30.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v31, v33
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v32.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v33.h, 0x300, v32.h
-; GFX11-TRUE16-NEXT: v_or_b16 v32.l, v50.h, v30.l
-; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v51.l, v30.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v31, v34
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v33.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v33.h, 0x300, v33.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v31.h
+; GFX11-TRUE16-NEXT: v_or_b16 v32.l, v32.l, v30.l
+; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v32.h, v30.h
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v31, v33
; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v32.l
@@ -122288,7 +119736,48 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_add_nc_u16 v32.h, 0x300, v32.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v31.h
; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v31, v32
+; GFX11-TRUE16-NEXT: .LBB74_4: ; %end
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_load_b32 v108, off, s32 offset:392
+; GFX11-TRUE16-NEXT: scratch_load_b32 v107, off, s32 offset:396
+; GFX11-TRUE16-NEXT: scratch_load_b32 v106, off, s32 offset:400
+; GFX11-TRUE16-NEXT: scratch_load_b32 v105, off, s32 offset:404
+; GFX11-TRUE16-NEXT: scratch_load_b32 v104, off, s32 offset:408
+; GFX11-TRUE16-NEXT: scratch_load_b32 v95, off, s32 offset:412
+; GFX11-TRUE16-NEXT: scratch_load_b32 v94, off, s32 offset:416
+; GFX11-TRUE16-NEXT: scratch_load_b32 v93, off, s32 offset:420
+; GFX11-TRUE16-NEXT: scratch_load_b32 v92, off, s32 offset:424
+; GFX11-TRUE16-NEXT: scratch_load_b32 v91, off, s32 offset:428
+; GFX11-TRUE16-NEXT: scratch_load_b32 v90, off, s32 offset:432
+; GFX11-TRUE16-NEXT: scratch_load_b32 v89, off, s32 offset:436
+; GFX11-TRUE16-NEXT: scratch_load_b32 v88, off, s32 offset:440
+; GFX11-TRUE16-NEXT: scratch_load_b32 v79, off, s32 offset:444
+; GFX11-TRUE16-NEXT: scratch_load_b32 v78, off, s32 offset:448
+; GFX11-TRUE16-NEXT: scratch_load_b32 v77, off, s32 offset:452
+; GFX11-TRUE16-NEXT: scratch_load_b32 v76, off, s32 offset:456
+; GFX11-TRUE16-NEXT: scratch_load_b32 v75, off, s32 offset:460
+; GFX11-TRUE16-NEXT: scratch_load_b32 v74, off, s32 offset:464
+; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:468
+; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:472
+; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:476
+; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:480
+; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:484
+; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:488
+; GFX11-TRUE16-NEXT: scratch_load_b32 v59, off, s32 offset:492
+; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:496
+; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:500
+; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:504
+; GFX11-TRUE16-NEXT: scratch_load_b32 v47, off, s32 offset:508
+; GFX11-TRUE16-NEXT: scratch_load_b32 v46, off, s32 offset:512
+; GFX11-TRUE16-NEXT: scratch_load_b32 v45, off, s32 offset:516
+; GFX11-TRUE16-NEXT: s_clause 0x4
+; GFX11-TRUE16-NEXT: scratch_load_b32 v44, off, s32 offset:520
+; GFX11-TRUE16-NEXT: scratch_load_b32 v43, off, s32 offset:524
+; GFX11-TRUE16-NEXT: scratch_load_b32 v42, off, s32 offset:528
+; GFX11-TRUE16-NEXT: scratch_load_b32 v41, off, s32 offset:532
+; GFX11-TRUE16-NEXT: scratch_load_b32 v40, off, s32 offset:536
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: bitcast_v128i8_to_v16f64:
@@ -126554,1887 +124043,946 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a,
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-TRUE16-LABEL: bitcast_v128i8_to_v16f64_scalar:
-; GFX11-TRUE16: ; %bb.0:
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: s_clause 0x1f
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:476
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:472
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:468
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v43, s32 offset:464
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v44, s32 offset:460
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v45, s32 offset:456
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v46, s32 offset:452
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v47, s32 offset:448
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v56, s32 offset:444
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v57, s32 offset:440
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v58, s32 offset:436
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v59, s32 offset:432
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v60, s32 offset:428
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v61, s32 offset:424
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v62, s32 offset:420
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v63, s32 offset:416
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v72, s32 offset:412
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v73, s32 offset:408
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v74, s32 offset:404
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v75, s32 offset:400
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v76, s32 offset:396
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v77, s32 offset:392
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v78, s32 offset:388
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v79, s32 offset:384
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v88, s32 offset:380
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v89, s32 offset:376
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v90, s32 offset:372
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v91, s32 offset:368
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v92, s32 offset:364
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:360
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:356
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:352
-; GFX11-TRUE16-NEXT: s_clause 0x7
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:348
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:344
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:340
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v107, s32 offset:336
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v108, s32 offset:332
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v109, s32 offset:328
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v110, s32 offset:324
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v111, s32 offset:320
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v32, v30 :: v_dual_mov_b32 v33, v28
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v34, v26 :: v_dual_mov_b32 v35, v24
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v36, v22 :: v_dual_mov_b32 v37, v20
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v38, v18 :: v_dual_mov_b32 v39, v16
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v48, v14 :: v_dual_mov_b32 v49, v12
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v50, v10 :: v_dual_mov_b32 v51, v8
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v52, v6 :: v_dual_mov_b32 v53, v4
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v54, v2 :: v_dual_mov_b32 v55, v0
-; GFX11-TRUE16-NEXT: s_clause 0x1f
-; GFX11-TRUE16-NEXT: scratch_load_b32 v0, off, s32 offset:316
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v2, off, s32
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v4, off, s32 offset:8
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v6, off, s32 offset:16
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v8, off, s32 offset:24
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v10, off, s32 offset:32
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v12, off, s32 offset:40
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v14, off, s32 offset:48
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v16, off, s32 offset:56
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v18, off, s32 offset:64
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v20, off, s32 offset:72
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v22, off, s32 offset:80
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v24, off, s32 offset:88
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v26, off, s32 offset:96
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v28, off, s32 offset:104
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v30, off, s32 offset:112
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:120
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v87, off, s32 offset:128
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v96, off, s32 offset:136
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v97, off, s32 offset:144
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v98, off, s32 offset:152
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v99, off, s32 offset:160
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v113, off, s32 offset:168
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v114, off, s32 offset:176
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v115, off, s32 offset:184
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v116, off, s32 offset:192
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v117, off, s32 offset:200
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v128, off, s32 offset:208
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v129, off, s32 offset:216
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v130, off, s32 offset:224
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v131, off, s32 offset:232
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v94, off, s32 offset:240
-; GFX11-TRUE16-NEXT: s_clause 0x1f
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v95, off, s32 offset:248
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v104, off, s32 offset:256
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v105, off, s32 offset:264
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v106, off, s32 offset:272
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v107, off, s32 offset:280
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v108, off, s32 offset:288
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v109, off, s32 offset:296
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v110, off, s32 offset:304
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v111, off, s32 offset:312
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v64, off, s32 offset:308
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v65, off, s32 offset:300
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v66, off, s32 offset:292
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v67, off, s32 offset:284
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v68, off, s32 offset:276
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v69, off, s32 offset:268
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v70, off, s32 offset:260
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v71, off, s32 offset:252
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v80, off, s32 offset:244
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v81, off, s32 offset:236
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v82, off, s32 offset:228
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v83, off, s32 offset:220
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v84, off, s32 offset:212
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v85, off, s32 offset:204
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v86, off, s32 offset:196
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v100, off, s32 offset:188
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v101, off, s32 offset:180
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v102, off, s32 offset:172
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v103, off, s32 offset:164
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v112, off, s32 offset:156
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v118, off, s32 offset:148
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v145, off, s32 offset:140
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v146, off, s32 offset:132
-; GFX11-TRUE16-NEXT: s_clause 0xf
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v147, off, s32 offset:124
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v148, off, s32 offset:116
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v162, off, s32 offset:108
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v163, off, s32 offset:100
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v164, off, s32 offset:92
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v165, off, s32 offset:84
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v166, off, s32 offset:76
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v180, off, s32 offset:68
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v181, off, s32 offset:60
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v182, off, s32 offset:52
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v183, off, s32 offset:44
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v45, off, s32 offset:36
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v46, off, s32 offset:28
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v47, off, s32 offset:20
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v56, off, s32 offset:12
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v62, off, s32 offset:4
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v89, 8, v1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v90, 8, v3
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v91, 8, v5
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v92, 8, v7
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v93, 8, v9
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v76, 8, v11
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v77, 8, v13
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v78, 8, v15
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v79, 8, v17
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v88, 8, v19
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v63, 8, v21
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v72, 8, v23
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v73, 8, v25
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v74, 8, v27
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v75, 8, v29
-; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(62)
-; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v57, 8, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v58, 8, v4
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v59, 8, v6
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v60, 8, v8
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v61, 8, v10
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v40, 8, v12
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v41, 8, v14
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v42, 8, v16
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v43, 8, v18
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v44, 8, v20
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v167, 8, v22
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v176, 8, v24
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v177, 8, v26
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v178, 8, v28
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v179, 8, v30
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v149, 8, v31
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v150, 8, v87
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(61)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v151, 8, v96
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(60)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v160, 8, v97
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(59)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v161, 8, v98
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(58)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v132, 8, v99
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(57)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v133, 8, v113
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(56)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v134, 8, v114
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(55)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v135, 8, v115
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(54)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v144, 8, v116
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(53)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v119, 8, v117
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(52)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v128, 8, v128
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(51)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v129, 8, v129
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(50)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v130, 8, v130
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(49)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v131, 8, v131
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(48)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v113, 8, v94
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(47)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v114, 8, v95
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(46)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v115, 8, v104
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(45)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v116, 8, v105
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(44)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v117, 8, v106
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(43)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v87, 8, v107
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(42)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v96, 8, v108
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(41)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v97, 8, v109
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(40)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v98, 8, v110
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(39)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v99, 8, v111
-; GFX11-TRUE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo
-; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB75_4
-; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v54
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v53
-; GFX11-TRUE16-NEXT: s_and_b32 s5, s28, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s29, 8
-; GFX11-TRUE16-NEXT: s_and_b32 s7, s2, 0xff
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v90
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v91
-; GFX11-TRUE16-NEXT: s_or_b32 s5, s5, s6
-; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s1, 8
-; GFX11-TRUE16-NEXT: s_and_b32 s5, s5, 0xffff
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s3, 8
-; GFX11-TRUE16-NEXT: s_and_b32 s9, s18, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s19, 8
-; GFX11-TRUE16-NEXT: s_and_b32 s11, s26, 0xff
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v50
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v49
-; GFX11-TRUE16-NEXT: s_lshl_b32 s12, s27, 8
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v76
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v77
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v48
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v39
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v78
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v79
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v38
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v37
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v88
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v63
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v36
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v35
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v72
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v73
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v34
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v33
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v74
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v75
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v32
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v62
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v57
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v58
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v56
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v47
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v59
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v60
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v46
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v45
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v61
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v40
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v183
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v182
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v41
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v42
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v181
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v180
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v43
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v44
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v166
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v165
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v167
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v176
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v164
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v163
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v177
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v178
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v162
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v148
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v179
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v149
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v147
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v146
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v150
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v151
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v145
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v118
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v160
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v161
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v112
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v103
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v132
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v133
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v102
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v101
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v134
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v135
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v100
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v86
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v144
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v119
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v85
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v84
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v128
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v129
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v83
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v82
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v130
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v131
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v81
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v80
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v113
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v114
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v71
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v70
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v115
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v116
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v69
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v68
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v117
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v87
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v67
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v66
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v96
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v97
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v65
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v64
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v98
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v99
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v0, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v55
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v89
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, s5, v0
-; GFX11-TRUE16-NEXT: s_and_b32 s5, s0, 0xff
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-TRUE16-NEXT: s_or_b32 s5, s5, s6
-; GFX11-TRUE16-NEXT: s_or_b32 s6, s7, s8
-; GFX11-TRUE16-NEXT: s_and_b32 s7, s16, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s17, 8
-; GFX11-TRUE16-NEXT: s_and_b32 s5, s5, 0xffff
-; GFX11-TRUE16-NEXT: s_or_b32 s7, s7, s8
-; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s10
-; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s6, 16
-; GFX11-TRUE16-NEXT: s_and_b32 s7, s7, 0xffff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s8, 16
-; GFX11-TRUE16-NEXT: s_or_b32 s5, s5, s6
-; GFX11-TRUE16-NEXT: s_or_b32 s6, s7, s8
-; GFX11-TRUE16-NEXT: s_and_b32 s7, s20, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s21, 8
-; GFX11-TRUE16-NEXT: s_and_b32 s9, s22, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s23, 8
-; GFX11-TRUE16-NEXT: s_or_b32 s7, s7, s8
-; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s10
-; GFX11-TRUE16-NEXT: s_and_b32 s9, s24, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s25, 8
-; GFX11-TRUE16-NEXT: s_and_b32 s7, s7, 0xffff
-; GFX11-TRUE16-NEXT: s_or_b32 s9, s9, s10
-; GFX11-TRUE16-NEXT: s_or_b32 s10, s11, s12
-; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s8, 16
-; GFX11-TRUE16-NEXT: s_and_b32 s9, s9, 0xffff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s10, 16
-; GFX11-TRUE16-NEXT: s_or_b32 s7, s7, s8
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v51
-; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s10
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v52
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, s6
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v93
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v92
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v2, v3
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s7 :: v_dual_mov_b32 v3, s8
-; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
-; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB75_3
-; GFX11-TRUE16-NEXT: .LBB75_2: ; %cmp.true
-; GFX11-TRUE16-NEXT: s_add_i32 s0, s0, 3
-; GFX11-TRUE16-NEXT: s_add_i32 s2, s2, 3
-; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 8
-; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8
-; GFX11-TRUE16-NEXT: s_or_b32 s0, s1, s0
-; GFX11-TRUE16-NEXT: s_or_b32 s1, s3, s2
-; GFX11-TRUE16-NEXT: s_addk_i32 s0, 0x300
-; GFX11-TRUE16-NEXT: s_addk_i32 s1, 0x300
-; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xffff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 16
-; GFX11-TRUE16-NEXT: s_add_i32 s16, s16, 3
-; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s1
-; GFX11-TRUE16-NEXT: s_and_b32 s1, s16, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s17, 8
-; GFX11-TRUE16-NEXT: s_add_i32 s18, s18, 3
-; GFX11-TRUE16-NEXT: s_or_b32 s1, s2, s1
-; GFX11-TRUE16-NEXT: s_and_b32 s2, s18, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s19, 8
-; GFX11-TRUE16-NEXT: s_addk_i32 s1, 0x300
-; GFX11-TRUE16-NEXT: s_or_b32 s2, s3, s2
-; GFX11-TRUE16-NEXT: s_add_i32 s20, s20, 3
-; GFX11-TRUE16-NEXT: s_addk_i32 s2, 0x300
-; GFX11-TRUE16-NEXT: s_and_b32 s1, s1, 0xffff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s2, 16
-; GFX11-TRUE16-NEXT: s_and_b32 s3, s20, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s21, 8
-; GFX11-TRUE16-NEXT: s_add_i32 s22, s22, 3
-; GFX11-TRUE16-NEXT: s_or_b32 s1, s1, s2
-; GFX11-TRUE16-NEXT: s_or_b32 s2, s4, s3
-; GFX11-TRUE16-NEXT: s_and_b32 s3, s22, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s23, 8
-; GFX11-TRUE16-NEXT: s_add_i32 s24, s24, 3
-; GFX11-TRUE16-NEXT: s_or_b32 s3, s4, s3
-; GFX11-TRUE16-NEXT: s_and_b32 s4, s24, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s25, 8
-; GFX11-TRUE16-NEXT: s_addk_i32 s2, 0x300
-; GFX11-TRUE16-NEXT: s_addk_i32 s3, 0x300
-; GFX11-TRUE16-NEXT: s_or_b32 s4, s5, s4
-; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xffff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 16
-; GFX11-TRUE16-NEXT: s_addk_i32 s4, 0x300
-; GFX11-TRUE16-NEXT: s_add_i32 s26, s26, 3
-; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3
-; GFX11-TRUE16-NEXT: s_and_b32 s3, s4, 0xffff
-; GFX11-TRUE16-NEXT: s_and_b32 s4, s26, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s27, 8
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v55
-; GFX11-TRUE16-NEXT: s_or_b32 s4, s5, s4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v54
-; GFX11-TRUE16-NEXT: s_addk_i32 s4, 0x300
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v52
-; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s4, 16
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v51
-; GFX11-TRUE16-NEXT: s_or_b32 s3, s3, s4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v53
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v2
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v38
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 3, v34
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 3, v33
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v89, v0
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v90, v1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v91, v2
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v92, v3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v93, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v7
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v11
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xff, v12
-; GFX11-TRUE16-NEXT: s_add_i32 s28, s28, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x300, v4
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v88, v7
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v74, v11
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v75, v12
-; GFX11-TRUE16-NEXT: s_and_b32 s5, s28, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s29, 8
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX11-TRUE16-NEXT: s_or_b32 s5, s6, s5
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 0x300, v7
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 0x300, v11
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 0x300, v12
-; GFX11-TRUE16-NEXT: s_addk_i32 s5, 0x300
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v1, v2
-; GFX11-TRUE16-NEXT: s_and_b32 s5, s5, 0xffff
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v3, v6
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, s5, v0
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v50
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v49
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v48
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v39
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v7
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v37
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v36
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 3, v35
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v11
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v12
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v7
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v8
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xff, v10
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v11, v12
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(3)
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 3, v46
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 3, v181
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v17, 3, v180
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v76, v0
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v77, v1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v78, v2
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v79, v3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v63, v7
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v72, v8
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v73, v10
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xff, v12
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xff, v16
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xff, v17
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 0x300, v7
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 0x300, v8
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 0x300, v10
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v61, v12
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v43, v16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v44, v17
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v7
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v8
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 0x300, v12
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 0x300, v16
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v17, 0x300, v17
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v0, v1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v2, v3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v9, v13
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v14, v10
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v32
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v62
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v56
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v47
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v12
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 3, v45
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v13, 3, v183
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v15, 3, v182
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v16
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v17
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xff, v12
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xff, v13
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xff, v15
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v16, v17
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v17, 3, v162
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v21, 3, v145
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v22, 3, v118
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v57, v0
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v58, v1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v59, v2
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v60, v3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v40, v12
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v41, v13
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v42, v15
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xff, v17
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xff, v21
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xff, v22
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 0x300, v12
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v13, 0x300, v13
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v15, 0x300, v15
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v179, v17
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v160, v21
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v161, v22
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v12
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v13
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 16, v15
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v17, 0x300, v17
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v21, 0x300, v21
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v22, 0x300, v22
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v0, v1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v2, v3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v14, v18
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v19, v15
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v166
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v165
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v164
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v163
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v17
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v17, 3, v148
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v18, 3, v147
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v20, 3, v146
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v21
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v22
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xff, v17
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xff, v18
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xff, v20
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v21, v22
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v22, 3, v100
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v26, 3, v83
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v27, 3, v82
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v167, v0
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v176, v1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v177, v2
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v178, v3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v149, v17
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v150, v18
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v151, v20
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xff, v22
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xff, v26
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xff, v27
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v17, 0x300, v17
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v18, 0x300, v18
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v20, 0x300, v20
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v144, v22
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v130, v26
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v131, v27
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v17
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v18
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 16, v20
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v22, 0x300, v22
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v26, 0x300, v26
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v27, 0x300, v27
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v0, v1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v2, v3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v19, v23
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v24, v20
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v112
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v103
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v102
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v101
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v22
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v22, 3, v86
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v23, 3, v85
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v25, 3, v84
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v26
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v27, 16, v27
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xff, v22
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xff, v23
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xff, v25
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v26, v27
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v27, 3, v69
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v132, v0
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v133, v1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v134, v2
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v135, v3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v119, v22
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v128, v23
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v129, v25
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xff, v27
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v22, 0x300, v22
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v23, 0x300, v23
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v25, 0x300, v25
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v117, v27
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 16, v22
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v23
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 16, v25
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v27, 0x300, v27
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v0, v1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v2, v3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v24, v28
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v29, v25
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v81
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v80
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v71
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v70
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v27
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v27, 3, v68
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v28, 3, v67
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v30, 3, v66
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v31, 3, v65
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v32, 3, v64
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xff, v27
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v28, 0xff, v28
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v30, 0xff, v30
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xff, v31
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xff, v32
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v113, v0
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v114, v1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v115, v2
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v116, v3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v87, v27
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v96, v28
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v97, v30
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v98, v31
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v32, v99, v32
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v27, 0x300, v27
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v28, 0x300, v28
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v30, 0x300, v30
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v31, 0x300, v31
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v32, 0x300, v32
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v33, 16, v27
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v28
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v30, 16, v30
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xffff, v31
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v32, 16, v32
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v0, v1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v2, v3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v29, v33
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v34, v30
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v31, v32
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX11-TRUE16-NEXT: .LBB75_3: ; %end
-; GFX11-TRUE16-NEXT: s_clause 0x1f
-; GFX11-TRUE16-NEXT: scratch_load_b32 v111, off, s32 offset:320
-; GFX11-TRUE16-NEXT: scratch_load_b32 v110, off, s32 offset:324
-; GFX11-TRUE16-NEXT: scratch_load_b32 v109, off, s32 offset:328
-; GFX11-TRUE16-NEXT: scratch_load_b32 v108, off, s32 offset:332
-; GFX11-TRUE16-NEXT: scratch_load_b32 v107, off, s32 offset:336
-; GFX11-TRUE16-NEXT: scratch_load_b32 v106, off, s32 offset:340
-; GFX11-TRUE16-NEXT: scratch_load_b32 v105, off, s32 offset:344
-; GFX11-TRUE16-NEXT: scratch_load_b32 v104, off, s32 offset:348
-; GFX11-TRUE16-NEXT: scratch_load_b32 v95, off, s32 offset:352
-; GFX11-TRUE16-NEXT: scratch_load_b32 v94, off, s32 offset:356
-; GFX11-TRUE16-NEXT: scratch_load_b32 v93, off, s32 offset:360
-; GFX11-TRUE16-NEXT: scratch_load_b32 v92, off, s32 offset:364
-; GFX11-TRUE16-NEXT: scratch_load_b32 v91, off, s32 offset:368
-; GFX11-TRUE16-NEXT: scratch_load_b32 v90, off, s32 offset:372
-; GFX11-TRUE16-NEXT: scratch_load_b32 v89, off, s32 offset:376
-; GFX11-TRUE16-NEXT: scratch_load_b32 v88, off, s32 offset:380
-; GFX11-TRUE16-NEXT: scratch_load_b32 v79, off, s32 offset:384
-; GFX11-TRUE16-NEXT: scratch_load_b32 v78, off, s32 offset:388
-; GFX11-TRUE16-NEXT: scratch_load_b32 v77, off, s32 offset:392
-; GFX11-TRUE16-NEXT: scratch_load_b32 v76, off, s32 offset:396
-; GFX11-TRUE16-NEXT: scratch_load_b32 v75, off, s32 offset:400
-; GFX11-TRUE16-NEXT: scratch_load_b32 v74, off, s32 offset:404
-; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:408
-; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:412
-; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:416
-; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:420
-; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:424
-; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:428
-; GFX11-TRUE16-NEXT: scratch_load_b32 v59, off, s32 offset:432
-; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:436
-; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:440
-; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:444
-; GFX11-TRUE16-NEXT: s_clause 0x7
-; GFX11-TRUE16-NEXT: scratch_load_b32 v47, off, s32 offset:448
-; GFX11-TRUE16-NEXT: scratch_load_b32 v46, off, s32 offset:452
-; GFX11-TRUE16-NEXT: scratch_load_b32 v45, off, s32 offset:456
-; GFX11-TRUE16-NEXT: scratch_load_b32 v44, off, s32 offset:460
-; GFX11-TRUE16-NEXT: scratch_load_b32 v43, off, s32 offset:464
-; GFX11-TRUE16-NEXT: scratch_load_b32 v42, off, s32 offset:468
-; GFX11-TRUE16-NEXT: scratch_load_b32 v41, off, s32 offset:472
-; GFX11-TRUE16-NEXT: scratch_load_b32 v40, off, s32 offset:476
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
-; GFX11-TRUE16-NEXT: .LBB75_4:
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
-; GFX11-TRUE16-NEXT: s_branch .LBB75_2
-;
-; GFX11-FAKE16-LABEL: bitcast_v128i8_to_v16f64_scalar:
-; GFX11-FAKE16: ; %bb.0:
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-FAKE16-NEXT: s_clause 0x1f
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:476
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:472
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:468
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v43, s32 offset:464
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v44, s32 offset:460
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v45, s32 offset:456
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v46, s32 offset:452
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v47, s32 offset:448
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v56, s32 offset:444
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v57, s32 offset:440
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v58, s32 offset:436
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v59, s32 offset:432
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v60, s32 offset:428
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v61, s32 offset:424
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v62, s32 offset:420
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v63, s32 offset:416
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v72, s32 offset:412
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v73, s32 offset:408
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v74, s32 offset:404
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v75, s32 offset:400
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v76, s32 offset:396
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v77, s32 offset:392
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v78, s32 offset:388
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v79, s32 offset:384
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v88, s32 offset:380
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v89, s32 offset:376
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v90, s32 offset:372
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v91, s32 offset:368
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v92, s32 offset:364
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v93, s32 offset:360
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v94, s32 offset:356
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v95, s32 offset:352
-; GFX11-FAKE16-NEXT: s_clause 0x7
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v104, s32 offset:348
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v105, s32 offset:344
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v106, s32 offset:340
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v107, s32 offset:336
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v108, s32 offset:332
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v109, s32 offset:328
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v110, s32 offset:324
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v111, s32 offset:320
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v32, v30 :: v_dual_mov_b32 v33, v28
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v34, v26 :: v_dual_mov_b32 v35, v24
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v36, v22 :: v_dual_mov_b32 v37, v20
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v38, v18 :: v_dual_mov_b32 v39, v16
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v48, v14 :: v_dual_mov_b32 v49, v12
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v50, v10 :: v_dual_mov_b32 v51, v8
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v52, v6 :: v_dual_mov_b32 v53, v4
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v54, v2 :: v_dual_mov_b32 v55, v0
-; GFX11-FAKE16-NEXT: s_clause 0x1f
-; GFX11-FAKE16-NEXT: scratch_load_b32 v0, off, s32 offset:316
-; GFX11-FAKE16-NEXT: scratch_load_u16 v2, off, s32
-; GFX11-FAKE16-NEXT: scratch_load_u16 v4, off, s32 offset:8
-; GFX11-FAKE16-NEXT: scratch_load_u16 v6, off, s32 offset:16
-; GFX11-FAKE16-NEXT: scratch_load_u16 v8, off, s32 offset:24
-; GFX11-FAKE16-NEXT: scratch_load_u16 v10, off, s32 offset:32
-; GFX11-FAKE16-NEXT: scratch_load_u16 v12, off, s32 offset:40
-; GFX11-FAKE16-NEXT: scratch_load_u16 v14, off, s32 offset:48
-; GFX11-FAKE16-NEXT: scratch_load_u16 v16, off, s32 offset:56
-; GFX11-FAKE16-NEXT: scratch_load_u16 v18, off, s32 offset:64
-; GFX11-FAKE16-NEXT: scratch_load_u16 v20, off, s32 offset:72
-; GFX11-FAKE16-NEXT: scratch_load_u16 v22, off, s32 offset:80
-; GFX11-FAKE16-NEXT: scratch_load_u16 v24, off, s32 offset:88
-; GFX11-FAKE16-NEXT: scratch_load_u16 v26, off, s32 offset:96
-; GFX11-FAKE16-NEXT: scratch_load_u16 v28, off, s32 offset:104
-; GFX11-FAKE16-NEXT: scratch_load_u16 v30, off, s32 offset:112
-; GFX11-FAKE16-NEXT: scratch_load_u16 v31, off, s32 offset:120
-; GFX11-FAKE16-NEXT: scratch_load_u16 v87, off, s32 offset:128
-; GFX11-FAKE16-NEXT: scratch_load_u16 v96, off, s32 offset:136
-; GFX11-FAKE16-NEXT: scratch_load_u16 v97, off, s32 offset:144
-; GFX11-FAKE16-NEXT: scratch_load_u16 v98, off, s32 offset:152
-; GFX11-FAKE16-NEXT: scratch_load_u16 v99, off, s32 offset:160
-; GFX11-FAKE16-NEXT: scratch_load_u16 v113, off, s32 offset:168
-; GFX11-FAKE16-NEXT: scratch_load_u16 v114, off, s32 offset:176
-; GFX11-FAKE16-NEXT: scratch_load_u16 v115, off, s32 offset:184
-; GFX11-FAKE16-NEXT: scratch_load_u16 v116, off, s32 offset:192
-; GFX11-FAKE16-NEXT: scratch_load_u16 v117, off, s32 offset:200
-; GFX11-FAKE16-NEXT: scratch_load_u16 v128, off, s32 offset:208
-; GFX11-FAKE16-NEXT: scratch_load_u16 v129, off, s32 offset:216
-; GFX11-FAKE16-NEXT: scratch_load_u16 v130, off, s32 offset:224
-; GFX11-FAKE16-NEXT: scratch_load_u16 v131, off, s32 offset:232
-; GFX11-FAKE16-NEXT: scratch_load_u16 v94, off, s32 offset:240
-; GFX11-FAKE16-NEXT: s_clause 0x1f
-; GFX11-FAKE16-NEXT: scratch_load_u16 v95, off, s32 offset:248
-; GFX11-FAKE16-NEXT: scratch_load_u16 v104, off, s32 offset:256
-; GFX11-FAKE16-NEXT: scratch_load_u16 v105, off, s32 offset:264
-; GFX11-FAKE16-NEXT: scratch_load_u16 v106, off, s32 offset:272
-; GFX11-FAKE16-NEXT: scratch_load_u16 v107, off, s32 offset:280
-; GFX11-FAKE16-NEXT: scratch_load_u16 v108, off, s32 offset:288
-; GFX11-FAKE16-NEXT: scratch_load_u16 v109, off, s32 offset:296
-; GFX11-FAKE16-NEXT: scratch_load_u16 v110, off, s32 offset:304
-; GFX11-FAKE16-NEXT: scratch_load_u16 v111, off, s32 offset:312
-; GFX11-FAKE16-NEXT: scratch_load_u16 v64, off, s32 offset:308
-; GFX11-FAKE16-NEXT: scratch_load_u16 v65, off, s32 offset:300
-; GFX11-FAKE16-NEXT: scratch_load_u16 v66, off, s32 offset:292
-; GFX11-FAKE16-NEXT: scratch_load_u16 v67, off, s32 offset:284
-; GFX11-FAKE16-NEXT: scratch_load_u16 v68, off, s32 offset:276
-; GFX11-FAKE16-NEXT: scratch_load_u16 v69, off, s32 offset:268
-; GFX11-FAKE16-NEXT: scratch_load_u16 v70, off, s32 offset:260
-; GFX11-FAKE16-NEXT: scratch_load_u16 v71, off, s32 offset:252
-; GFX11-FAKE16-NEXT: scratch_load_u16 v80, off, s32 offset:244
-; GFX11-FAKE16-NEXT: scratch_load_u16 v81, off, s32 offset:236
-; GFX11-FAKE16-NEXT: scratch_load_u16 v82, off, s32 offset:228
-; GFX11-FAKE16-NEXT: scratch_load_u16 v83, off, s32 offset:220
-; GFX11-FAKE16-NEXT: scratch_load_u16 v84, off, s32 offset:212
-; GFX11-FAKE16-NEXT: scratch_load_u16 v85, off, s32 offset:204
-; GFX11-FAKE16-NEXT: scratch_load_u16 v86, off, s32 offset:196
-; GFX11-FAKE16-NEXT: scratch_load_u16 v100, off, s32 offset:188
-; GFX11-FAKE16-NEXT: scratch_load_u16 v101, off, s32 offset:180
-; GFX11-FAKE16-NEXT: scratch_load_u16 v102, off, s32 offset:172
-; GFX11-FAKE16-NEXT: scratch_load_u16 v103, off, s32 offset:164
-; GFX11-FAKE16-NEXT: scratch_load_u16 v112, off, s32 offset:156
-; GFX11-FAKE16-NEXT: scratch_load_u16 v118, off, s32 offset:148
-; GFX11-FAKE16-NEXT: scratch_load_u16 v145, off, s32 offset:140
-; GFX11-FAKE16-NEXT: scratch_load_u16 v146, off, s32 offset:132
-; GFX11-FAKE16-NEXT: s_clause 0xf
-; GFX11-FAKE16-NEXT: scratch_load_u16 v147, off, s32 offset:124
-; GFX11-FAKE16-NEXT: scratch_load_u16 v148, off, s32 offset:116
-; GFX11-FAKE16-NEXT: scratch_load_u16 v162, off, s32 offset:108
-; GFX11-FAKE16-NEXT: scratch_load_u16 v163, off, s32 offset:100
-; GFX11-FAKE16-NEXT: scratch_load_u16 v164, off, s32 offset:92
-; GFX11-FAKE16-NEXT: scratch_load_u16 v165, off, s32 offset:84
-; GFX11-FAKE16-NEXT: scratch_load_u16 v166, off, s32 offset:76
-; GFX11-FAKE16-NEXT: scratch_load_u16 v180, off, s32 offset:68
-; GFX11-FAKE16-NEXT: scratch_load_u16 v181, off, s32 offset:60
-; GFX11-FAKE16-NEXT: scratch_load_u16 v182, off, s32 offset:52
-; GFX11-FAKE16-NEXT: scratch_load_u16 v183, off, s32 offset:44
-; GFX11-FAKE16-NEXT: scratch_load_u16 v45, off, s32 offset:36
-; GFX11-FAKE16-NEXT: scratch_load_u16 v46, off, s32 offset:28
-; GFX11-FAKE16-NEXT: scratch_load_u16 v47, off, s32 offset:20
-; GFX11-FAKE16-NEXT: scratch_load_u16 v56, off, s32 offset:12
-; GFX11-FAKE16-NEXT: scratch_load_u16 v62, off, s32 offset:4
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v89, 8, v1
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v90, 8, v3
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v91, 8, v5
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v92, 8, v7
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v93, 8, v9
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v76, 8, v11
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v77, 8, v13
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v78, 8, v15
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v79, 8, v17
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v88, 8, v19
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v63, 8, v21
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v72, 8, v23
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v73, 8, v25
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v74, 8, v27
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v75, 8, v29
-; GFX11-FAKE16-NEXT: s_mov_b32 s4, 0
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(62)
-; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v57, 8, v2
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v58, 8, v4
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v59, 8, v6
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v60, 8, v8
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v61, 8, v10
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v40, 8, v12
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v41, 8, v14
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v42, 8, v16
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v43, 8, v18
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v44, 8, v20
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v167, 8, v22
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v176, 8, v24
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v177, 8, v26
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v178, 8, v28
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v179, 8, v30
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v149, 8, v31
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v150, 8, v87
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(61)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v151, 8, v96
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(60)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v160, 8, v97
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(59)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v161, 8, v98
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(58)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v132, 8, v99
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(57)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v133, 8, v113
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(56)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v134, 8, v114
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(55)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v135, 8, v115
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(54)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v144, 8, v116
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(53)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v119, 8, v117
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(52)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v128, 8, v128
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(51)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v129, 8, v129
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(50)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v130, 8, v130
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(49)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v131, 8, v131
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(48)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v113, 8, v94
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(47)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v114, 8, v95
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(46)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v115, 8, v104
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(45)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v116, 8, v105
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(44)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v117, 8, v106
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(43)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v87, 8, v107
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(42)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v96, 8, v108
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(41)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v97, 8, v109
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(40)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v98, 8, v110
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(39)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v99, 8, v111
-; GFX11-FAKE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo
-; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB75_4
-; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v54
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v53
-; GFX11-FAKE16-NEXT: s_and_b32 s5, s28, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s29, 8
-; GFX11-FAKE16-NEXT: s_and_b32 s7, s2, 0xff
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v90
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v91
-; GFX11-FAKE16-NEXT: s_or_b32 s5, s5, s6
-; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s1, 8
-; GFX11-FAKE16-NEXT: s_and_b32 s5, s5, 0xffff
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s3, 8
-; GFX11-FAKE16-NEXT: s_and_b32 s9, s18, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s19, 8
-; GFX11-FAKE16-NEXT: s_and_b32 s11, s26, 0xff
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v0, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v50
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v49
-; GFX11-FAKE16-NEXT: s_lshl_b32 s12, s27, 8
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v76
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v77
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v0, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v48
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v39
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v78
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v79
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v0, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v38
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v37
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v88
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v63
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v0, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v36
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v35
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v72
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v73
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v0, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v34
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v33
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v74
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v75
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v0, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v32
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v62
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v57
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v58
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v0, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v56
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v47
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v59
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v60
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v0, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v46
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v45
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v61
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v40
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v0, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v183
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v182
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v41
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v42
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v0, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v181
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v180
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v43
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v44
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v0, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v166
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v165
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v167
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v176
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v0, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v164
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v163
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v177
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v178
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v0, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v162
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v148
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v179
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v149
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v0, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v147
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v146
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v150
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v151
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v0, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v145
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v118
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v160
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v161
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v0, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v112
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v103
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v132
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v133
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v0, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v102
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v101
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v134
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v135
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v0, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v100
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v86
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v144
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v119
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v0, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v85
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v84
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v128
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v129
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v0, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v83
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v82
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v130
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v131
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v0, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v81
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v80
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v113
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v114
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v0, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v71
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v70
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v115
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v116
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v0, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v69
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v68
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v117
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v87
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, v0, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v67
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v66
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v96
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v97
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v30, v0, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v65
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v64
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v98
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v99
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v0, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v55
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v89
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, s5, v0
-; GFX11-FAKE16-NEXT: s_and_b32 s5, s0, 0xff
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-FAKE16-NEXT: s_or_b32 s5, s5, s6
-; GFX11-FAKE16-NEXT: s_or_b32 s6, s7, s8
-; GFX11-FAKE16-NEXT: s_and_b32 s7, s16, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s17, 8
-; GFX11-FAKE16-NEXT: s_and_b32 s5, s5, 0xffff
-; GFX11-FAKE16-NEXT: s_or_b32 s7, s7, s8
-; GFX11-FAKE16-NEXT: s_or_b32 s8, s9, s10
-; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s6, 16
-; GFX11-FAKE16-NEXT: s_and_b32 s7, s7, 0xffff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s8, 16
-; GFX11-FAKE16-NEXT: s_or_b32 s5, s5, s6
-; GFX11-FAKE16-NEXT: s_or_b32 s6, s7, s8
-; GFX11-FAKE16-NEXT: s_and_b32 s7, s20, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s21, 8
-; GFX11-FAKE16-NEXT: s_and_b32 s9, s22, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s23, 8
-; GFX11-FAKE16-NEXT: s_or_b32 s7, s7, s8
-; GFX11-FAKE16-NEXT: s_or_b32 s8, s9, s10
-; GFX11-FAKE16-NEXT: s_and_b32 s9, s24, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s25, 8
-; GFX11-FAKE16-NEXT: s_and_b32 s7, s7, 0xffff
-; GFX11-FAKE16-NEXT: s_or_b32 s9, s9, s10
-; GFX11-FAKE16-NEXT: s_or_b32 s10, s11, s12
-; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s8, 16
-; GFX11-FAKE16-NEXT: s_and_b32 s9, s9, 0xffff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s10, 16
-; GFX11-FAKE16-NEXT: s_or_b32 s7, s7, s8
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v51
-; GFX11-FAKE16-NEXT: s_or_b32 s8, s9, s10
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v52
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, s6
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v93
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v92
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v2, v3
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s7 :: v_dual_mov_b32 v3, s8
-; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
-; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB75_3
-; GFX11-FAKE16-NEXT: .LBB75_2: ; %cmp.true
-; GFX11-FAKE16-NEXT: s_add_i32 s0, s0, 3
-; GFX11-FAKE16-NEXT: s_add_i32 s2, s2, 3
-; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 8
-; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 8
-; GFX11-FAKE16-NEXT: s_or_b32 s0, s1, s0
-; GFX11-FAKE16-NEXT: s_or_b32 s1, s3, s2
-; GFX11-FAKE16-NEXT: s_addk_i32 s0, 0x300
-; GFX11-FAKE16-NEXT: s_addk_i32 s1, 0x300
-; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xffff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 16
-; GFX11-FAKE16-NEXT: s_add_i32 s16, s16, 3
-; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s1
-; GFX11-FAKE16-NEXT: s_and_b32 s1, s16, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s17, 8
-; GFX11-FAKE16-NEXT: s_add_i32 s18, s18, 3
-; GFX11-FAKE16-NEXT: s_or_b32 s1, s2, s1
-; GFX11-FAKE16-NEXT: s_and_b32 s2, s18, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s19, 8
-; GFX11-FAKE16-NEXT: s_addk_i32 s1, 0x300
-; GFX11-FAKE16-NEXT: s_or_b32 s2, s3, s2
-; GFX11-FAKE16-NEXT: s_add_i32 s20, s20, 3
-; GFX11-FAKE16-NEXT: s_addk_i32 s2, 0x300
-; GFX11-FAKE16-NEXT: s_and_b32 s1, s1, 0xffff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s2, 16
-; GFX11-FAKE16-NEXT: s_and_b32 s3, s20, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s21, 8
-; GFX11-FAKE16-NEXT: s_add_i32 s22, s22, 3
-; GFX11-FAKE16-NEXT: s_or_b32 s1, s1, s2
-; GFX11-FAKE16-NEXT: s_or_b32 s2, s4, s3
-; GFX11-FAKE16-NEXT: s_and_b32 s3, s22, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s23, 8
-; GFX11-FAKE16-NEXT: s_add_i32 s24, s24, 3
-; GFX11-FAKE16-NEXT: s_or_b32 s3, s4, s3
-; GFX11-FAKE16-NEXT: s_and_b32 s4, s24, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s25, 8
-; GFX11-FAKE16-NEXT: s_addk_i32 s2, 0x300
-; GFX11-FAKE16-NEXT: s_addk_i32 s3, 0x300
-; GFX11-FAKE16-NEXT: s_or_b32 s4, s5, s4
-; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xffff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 16
-; GFX11-FAKE16-NEXT: s_addk_i32 s4, 0x300
-; GFX11-FAKE16-NEXT: s_add_i32 s26, s26, 3
-; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3
-; GFX11-FAKE16-NEXT: s_and_b32 s3, s4, 0xffff
-; GFX11-FAKE16-NEXT: s_and_b32 s4, s26, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s27, 8
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v55
-; GFX11-FAKE16-NEXT: s_or_b32 s4, s5, s4
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v54
-; GFX11-FAKE16-NEXT: s_addk_i32 s4, 0x300
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v52
-; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s4, 16
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v51
-; GFX11-FAKE16-NEXT: s_or_b32 s3, s3, s4
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v53
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 3, v38
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, 3, v34
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, 3, v33
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v89, v0
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v90, v1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v91, v2
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v92, v3
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v93, v4
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v7
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v11
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v12
-; GFX11-FAKE16-NEXT: s_add_i32 s28, s28, 3
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x300, v4
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v88, v7
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v74, v11
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v75, v12
-; GFX11-FAKE16-NEXT: s_and_b32 s5, s28, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s29, 8
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX11-FAKE16-NEXT: s_or_b32 s5, s6, s5
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v4
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 0x300, v7
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, 0x300, v11
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, 0x300, v12
-; GFX11-FAKE16-NEXT: s_addk_i32 s5, 0x300
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v1, v2
-; GFX11-FAKE16-NEXT: s_and_b32 s5, s5, 0xffff
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v3, v6
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, s5, v0
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v50
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v49
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v48
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v39
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff, v7
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 3, v37
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 3, v36
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, 3, v35
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff, v11
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v12, 16, v12
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v7
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v8
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v10
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v11, v12
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(3)
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, 3, v46
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v16, 3, v181
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 3, v180
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v76, v0
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v77, v1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v78, v2
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v79, v3
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v63, v7
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v72, v8
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v73, v10
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v12
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v16
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v17
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 0x300, v7
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 0x300, v8
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, 0x300, v10
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v61, v12
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v43, v16
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v44, v17
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v13, 16, v7
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xffff, v8
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, 0x300, v12
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v16, 0x300, v16
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 0x300, v17
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v0, v1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v2, v3
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v9, v13
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v14, v10
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v32
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v62
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v56
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v47
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xffff, v12
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, 3, v45
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v13, 3, v183
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v15, 3, v182
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xffff, v16
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v17, 16, v17
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v12
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v13
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v15
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v16, v17
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 3, v162
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v21, 3, v145
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v22, 3, v118
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v57, v0
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v58, v1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v59, v2
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v60, v3
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v40, v12
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v41, v13
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v42, v15
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v17
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xff, v21
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v22
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, 0x300, v12
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v13, 0x300, v13
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v15, 0x300, v15
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v179, v17
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v160, v21
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v161, v22
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v18, 16, v12
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v13
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v15, 16, v15
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 0x300, v17
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v21, 0x300, v21
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v22, 0x300, v22
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v0, v1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v2, v3
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v14, v18
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v19, v15
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v166
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v165
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v164
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v163
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v17
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 3, v148
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v18, 3, v147
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v20, 3, v146
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff, v21
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v22, 16, v22
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xff, v17
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xff, v18
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xff, v20
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v21, v22
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v22, 3, v100
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v26, 3, v83
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v27, 3, v82
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v167, v0
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v176, v1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v177, v2
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v178, v3
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v149, v17
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v150, v18
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v151, v20
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v22
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xff, v26
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xff, v27
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 0x300, v17
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v18, 0x300, v18
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v20, 0x300, v20
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v144, v22
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v130, v26
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v131, v27
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v23, 16, v17
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xffff, v18
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v20, 16, v20
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v22, 0x300, v22
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v26, 0x300, v26
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v27, 0x300, v27
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v0, v1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v2, v3
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, v19, v23
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v24, v20
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v112
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v103
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v102
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v101
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xffff, v22
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v22, 3, v86
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v23, 3, v85
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v25, 3, v84
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v26, 0xffff, v26
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v27, 16, v27
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xff, v22
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xff, v23
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xff, v25
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, v26, v27
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v27, 3, v69
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v132, v0
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v133, v1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v134, v2
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v135, v3
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v119, v22
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v128, v23
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v129, v25
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xff, v27
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v22, 0x300, v22
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v23, 0x300, v23
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v25, 0x300, v25
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v117, v27
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v28, 16, v22
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xffff, v23
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v25, 16, v25
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v27, 0x300, v27
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v0, v1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v2, v3
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v24, v28
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, v29, v25
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v81
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v80
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v71
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v70
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xffff, v27
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v27, 3, v68
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v28, 3, v67
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v30, 3, v66
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v31, 3, v65
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v32, 3, v64
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xff, v27
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xff, v28
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xff, v30
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xff, v31
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xff, v32
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v113, v0
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v114, v1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v115, v2
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v116, v3
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v87, v27
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v96, v28
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v30, v97, v30
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v98, v31
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v32, v99, v32
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v27, 0x300, v27
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v28, 0x300, v28
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v30, 0x300, v30
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v31, 0x300, v31
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v32, 0x300, v32
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v33, 16, v27
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xffff, v28
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v30, 16, v30
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xffff, v31
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v32, 16, v32
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, v0, v1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, v2, v3
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, v29, v33
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v30, v34, v30
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v31, v32
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX11-FAKE16-NEXT: .LBB75_3: ; %end
-; GFX11-FAKE16-NEXT: s_clause 0x1f
-; GFX11-FAKE16-NEXT: scratch_load_b32 v111, off, s32 offset:320
-; GFX11-FAKE16-NEXT: scratch_load_b32 v110, off, s32 offset:324
-; GFX11-FAKE16-NEXT: scratch_load_b32 v109, off, s32 offset:328
-; GFX11-FAKE16-NEXT: scratch_load_b32 v108, off, s32 offset:332
-; GFX11-FAKE16-NEXT: scratch_load_b32 v107, off, s32 offset:336
-; GFX11-FAKE16-NEXT: scratch_load_b32 v106, off, s32 offset:340
-; GFX11-FAKE16-NEXT: scratch_load_b32 v105, off, s32 offset:344
-; GFX11-FAKE16-NEXT: scratch_load_b32 v104, off, s32 offset:348
-; GFX11-FAKE16-NEXT: scratch_load_b32 v95, off, s32 offset:352
-; GFX11-FAKE16-NEXT: scratch_load_b32 v94, off, s32 offset:356
-; GFX11-FAKE16-NEXT: scratch_load_b32 v93, off, s32 offset:360
-; GFX11-FAKE16-NEXT: scratch_load_b32 v92, off, s32 offset:364
-; GFX11-FAKE16-NEXT: scratch_load_b32 v91, off, s32 offset:368
-; GFX11-FAKE16-NEXT: scratch_load_b32 v90, off, s32 offset:372
-; GFX11-FAKE16-NEXT: scratch_load_b32 v89, off, s32 offset:376
-; GFX11-FAKE16-NEXT: scratch_load_b32 v88, off, s32 offset:380
-; GFX11-FAKE16-NEXT: scratch_load_b32 v79, off, s32 offset:384
-; GFX11-FAKE16-NEXT: scratch_load_b32 v78, off, s32 offset:388
-; GFX11-FAKE16-NEXT: scratch_load_b32 v77, off, s32 offset:392
-; GFX11-FAKE16-NEXT: scratch_load_b32 v76, off, s32 offset:396
-; GFX11-FAKE16-NEXT: scratch_load_b32 v75, off, s32 offset:400
-; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:404
-; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:408
-; GFX11-FAKE16-NEXT: scratch_load_b32 v72, off, s32 offset:412
-; GFX11-FAKE16-NEXT: scratch_load_b32 v63, off, s32 offset:416
-; GFX11-FAKE16-NEXT: scratch_load_b32 v62, off, s32 offset:420
-; GFX11-FAKE16-NEXT: scratch_load_b32 v61, off, s32 offset:424
-; GFX11-FAKE16-NEXT: scratch_load_b32 v60, off, s32 offset:428
-; GFX11-FAKE16-NEXT: scratch_load_b32 v59, off, s32 offset:432
-; GFX11-FAKE16-NEXT: scratch_load_b32 v58, off, s32 offset:436
-; GFX11-FAKE16-NEXT: scratch_load_b32 v57, off, s32 offset:440
-; GFX11-FAKE16-NEXT: scratch_load_b32 v56, off, s32 offset:444
-; GFX11-FAKE16-NEXT: s_clause 0x7
-; GFX11-FAKE16-NEXT: scratch_load_b32 v47, off, s32 offset:448
-; GFX11-FAKE16-NEXT: scratch_load_b32 v46, off, s32 offset:452
-; GFX11-FAKE16-NEXT: scratch_load_b32 v45, off, s32 offset:456
-; GFX11-FAKE16-NEXT: scratch_load_b32 v44, off, s32 offset:460
-; GFX11-FAKE16-NEXT: scratch_load_b32 v43, off, s32 offset:464
-; GFX11-FAKE16-NEXT: scratch_load_b32 v42, off, s32 offset:468
-; GFX11-FAKE16-NEXT: scratch_load_b32 v41, off, s32 offset:472
-; GFX11-FAKE16-NEXT: scratch_load_b32 v40, off, s32 offset:476
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
-; GFX11-FAKE16-NEXT: .LBB75_4:
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
-; GFX11-FAKE16-NEXT: s_branch .LBB75_2
+; GFX11-LABEL: bitcast_v128i8_to_v16f64_scalar:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_clause 0x1f
+; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:476
+; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:472
+; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:468
+; GFX11-NEXT: scratch_store_b32 off, v43, s32 offset:464
+; GFX11-NEXT: scratch_store_b32 off, v44, s32 offset:460
+; GFX11-NEXT: scratch_store_b32 off, v45, s32 offset:456
+; GFX11-NEXT: scratch_store_b32 off, v46, s32 offset:452
+; GFX11-NEXT: scratch_store_b32 off, v47, s32 offset:448
+; GFX11-NEXT: scratch_store_b32 off, v56, s32 offset:444
+; GFX11-NEXT: scratch_store_b32 off, v57, s32 offset:440
+; GFX11-NEXT: scratch_store_b32 off, v58, s32 offset:436
+; GFX11-NEXT: scratch_store_b32 off, v59, s32 offset:432
+; GFX11-NEXT: scratch_store_b32 off, v60, s32 offset:428
+; GFX11-NEXT: scratch_store_b32 off, v61, s32 offset:424
+; GFX11-NEXT: scratch_store_b32 off, v62, s32 offset:420
+; GFX11-NEXT: scratch_store_b32 off, v63, s32 offset:416
+; GFX11-NEXT: scratch_store_b32 off, v72, s32 offset:412
+; GFX11-NEXT: scratch_store_b32 off, v73, s32 offset:408
+; GFX11-NEXT: scratch_store_b32 off, v74, s32 offset:404
+; GFX11-NEXT: scratch_store_b32 off, v75, s32 offset:400
+; GFX11-NEXT: scratch_store_b32 off, v76, s32 offset:396
+; GFX11-NEXT: scratch_store_b32 off, v77, s32 offset:392
+; GFX11-NEXT: scratch_store_b32 off, v78, s32 offset:388
+; GFX11-NEXT: scratch_store_b32 off, v79, s32 offset:384
+; GFX11-NEXT: scratch_store_b32 off, v88, s32 offset:380
+; GFX11-NEXT: scratch_store_b32 off, v89, s32 offset:376
+; GFX11-NEXT: scratch_store_b32 off, v90, s32 offset:372
+; GFX11-NEXT: scratch_store_b32 off, v91, s32 offset:368
+; GFX11-NEXT: scratch_store_b32 off, v92, s32 offset:364
+; GFX11-NEXT: scratch_store_b32 off, v93, s32 offset:360
+; GFX11-NEXT: scratch_store_b32 off, v94, s32 offset:356
+; GFX11-NEXT: scratch_store_b32 off, v95, s32 offset:352
+; GFX11-NEXT: s_clause 0x7
+; GFX11-NEXT: scratch_store_b32 off, v104, s32 offset:348
+; GFX11-NEXT: scratch_store_b32 off, v105, s32 offset:344
+; GFX11-NEXT: scratch_store_b32 off, v106, s32 offset:340
+; GFX11-NEXT: scratch_store_b32 off, v107, s32 offset:336
+; GFX11-NEXT: scratch_store_b32 off, v108, s32 offset:332
+; GFX11-NEXT: scratch_store_b32 off, v109, s32 offset:328
+; GFX11-NEXT: scratch_store_b32 off, v110, s32 offset:324
+; GFX11-NEXT: scratch_store_b32 off, v111, s32 offset:320
+; GFX11-NEXT: v_dual_mov_b32 v32, v30 :: v_dual_mov_b32 v33, v28
+; GFX11-NEXT: v_dual_mov_b32 v34, v26 :: v_dual_mov_b32 v35, v24
+; GFX11-NEXT: v_dual_mov_b32 v36, v22 :: v_dual_mov_b32 v37, v20
+; GFX11-NEXT: v_dual_mov_b32 v38, v18 :: v_dual_mov_b32 v39, v16
+; GFX11-NEXT: v_dual_mov_b32 v48, v14 :: v_dual_mov_b32 v49, v12
+; GFX11-NEXT: v_dual_mov_b32 v50, v10 :: v_dual_mov_b32 v51, v8
+; GFX11-NEXT: v_dual_mov_b32 v52, v6 :: v_dual_mov_b32 v53, v4
+; GFX11-NEXT: v_dual_mov_b32 v54, v2 :: v_dual_mov_b32 v55, v0
+; GFX11-NEXT: s_clause 0x1f
+; GFX11-NEXT: scratch_load_b32 v0, off, s32 offset:316
+; GFX11-NEXT: scratch_load_u16 v2, off, s32
+; GFX11-NEXT: scratch_load_u16 v4, off, s32 offset:8
+; GFX11-NEXT: scratch_load_u16 v6, off, s32 offset:16
+; GFX11-NEXT: scratch_load_u16 v8, off, s32 offset:24
+; GFX11-NEXT: scratch_load_u16 v10, off, s32 offset:32
+; GFX11-NEXT: scratch_load_u16 v12, off, s32 offset:40
+; GFX11-NEXT: scratch_load_u16 v14, off, s32 offset:48
+; GFX11-NEXT: scratch_load_u16 v16, off, s32 offset:56
+; GFX11-NEXT: scratch_load_u16 v18, off, s32 offset:64
+; GFX11-NEXT: scratch_load_u16 v20, off, s32 offset:72
+; GFX11-NEXT: scratch_load_u16 v22, off, s32 offset:80
+; GFX11-NEXT: scratch_load_u16 v24, off, s32 offset:88
+; GFX11-NEXT: scratch_load_u16 v26, off, s32 offset:96
+; GFX11-NEXT: scratch_load_u16 v28, off, s32 offset:104
+; GFX11-NEXT: scratch_load_u16 v30, off, s32 offset:112
+; GFX11-NEXT: scratch_load_u16 v31, off, s32 offset:120
+; GFX11-NEXT: scratch_load_u16 v87, off, s32 offset:128
+; GFX11-NEXT: scratch_load_u16 v96, off, s32 offset:136
+; GFX11-NEXT: scratch_load_u16 v97, off, s32 offset:144
+; GFX11-NEXT: scratch_load_u16 v98, off, s32 offset:152
+; GFX11-NEXT: scratch_load_u16 v99, off, s32 offset:160
+; GFX11-NEXT: scratch_load_u16 v113, off, s32 offset:168
+; GFX11-NEXT: scratch_load_u16 v114, off, s32 offset:176
+; GFX11-NEXT: scratch_load_u16 v115, off, s32 offset:184
+; GFX11-NEXT: scratch_load_u16 v116, off, s32 offset:192
+; GFX11-NEXT: scratch_load_u16 v117, off, s32 offset:200
+; GFX11-NEXT: scratch_load_u16 v128, off, s32 offset:208
+; GFX11-NEXT: scratch_load_u16 v129, off, s32 offset:216
+; GFX11-NEXT: scratch_load_u16 v130, off, s32 offset:224
+; GFX11-NEXT: scratch_load_u16 v131, off, s32 offset:232
+; GFX11-NEXT: scratch_load_u16 v94, off, s32 offset:240
+; GFX11-NEXT: s_clause 0x1f
+; GFX11-NEXT: scratch_load_u16 v95, off, s32 offset:248
+; GFX11-NEXT: scratch_load_u16 v104, off, s32 offset:256
+; GFX11-NEXT: scratch_load_u16 v105, off, s32 offset:264
+; GFX11-NEXT: scratch_load_u16 v106, off, s32 offset:272
+; GFX11-NEXT: scratch_load_u16 v107, off, s32 offset:280
+; GFX11-NEXT: scratch_load_u16 v108, off, s32 offset:288
+; GFX11-NEXT: scratch_load_u16 v109, off, s32 offset:296
+; GFX11-NEXT: scratch_load_u16 v110, off, s32 offset:304
+; GFX11-NEXT: scratch_load_u16 v111, off, s32 offset:312
+; GFX11-NEXT: scratch_load_u16 v64, off, s32 offset:308
+; GFX11-NEXT: scratch_load_u16 v65, off, s32 offset:300
+; GFX11-NEXT: scratch_load_u16 v66, off, s32 offset:292
+; GFX11-NEXT: scratch_load_u16 v67, off, s32 offset:284
+; GFX11-NEXT: scratch_load_u16 v68, off, s32 offset:276
+; GFX11-NEXT: scratch_load_u16 v69, off, s32 offset:268
+; GFX11-NEXT: scratch_load_u16 v70, off, s32 offset:260
+; GFX11-NEXT: scratch_load_u16 v71, off, s32 offset:252
+; GFX11-NEXT: scratch_load_u16 v80, off, s32 offset:244
+; GFX11-NEXT: scratch_load_u16 v81, off, s32 offset:236
+; GFX11-NEXT: scratch_load_u16 v82, off, s32 offset:228
+; GFX11-NEXT: scratch_load_u16 v83, off, s32 offset:220
+; GFX11-NEXT: scratch_load_u16 v84, off, s32 offset:212
+; GFX11-NEXT: scratch_load_u16 v85, off, s32 offset:204
+; GFX11-NEXT: scratch_load_u16 v86, off, s32 offset:196
+; GFX11-NEXT: scratch_load_u16 v100, off, s32 offset:188
+; GFX11-NEXT: scratch_load_u16 v101, off, s32 offset:180
+; GFX11-NEXT: scratch_load_u16 v102, off, s32 offset:172
+; GFX11-NEXT: scratch_load_u16 v103, off, s32 offset:164
+; GFX11-NEXT: scratch_load_u16 v112, off, s32 offset:156
+; GFX11-NEXT: scratch_load_u16 v118, off, s32 offset:148
+; GFX11-NEXT: scratch_load_u16 v145, off, s32 offset:140
+; GFX11-NEXT: scratch_load_u16 v146, off, s32 offset:132
+; GFX11-NEXT: s_clause 0xf
+; GFX11-NEXT: scratch_load_u16 v147, off, s32 offset:124
+; GFX11-NEXT: scratch_load_u16 v148, off, s32 offset:116
+; GFX11-NEXT: scratch_load_u16 v162, off, s32 offset:108
+; GFX11-NEXT: scratch_load_u16 v163, off, s32 offset:100
+; GFX11-NEXT: scratch_load_u16 v164, off, s32 offset:92
+; GFX11-NEXT: scratch_load_u16 v165, off, s32 offset:84
+; GFX11-NEXT: scratch_load_u16 v166, off, s32 offset:76
+; GFX11-NEXT: scratch_load_u16 v180, off, s32 offset:68
+; GFX11-NEXT: scratch_load_u16 v181, off, s32 offset:60
+; GFX11-NEXT: scratch_load_u16 v182, off, s32 offset:52
+; GFX11-NEXT: scratch_load_u16 v183, off, s32 offset:44
+; GFX11-NEXT: scratch_load_u16 v45, off, s32 offset:36
+; GFX11-NEXT: scratch_load_u16 v46, off, s32 offset:28
+; GFX11-NEXT: scratch_load_u16 v47, off, s32 offset:20
+; GFX11-NEXT: scratch_load_u16 v56, off, s32 offset:12
+; GFX11-NEXT: scratch_load_u16 v62, off, s32 offset:4
+; GFX11-NEXT: v_lshlrev_b32_e32 v89, 8, v1
+; GFX11-NEXT: v_lshlrev_b32_e32 v90, 8, v3
+; GFX11-NEXT: v_lshlrev_b32_e32 v91, 8, v5
+; GFX11-NEXT: v_lshlrev_b32_e32 v92, 8, v7
+; GFX11-NEXT: v_lshlrev_b32_e32 v93, 8, v9
+; GFX11-NEXT: v_lshlrev_b32_e32 v76, 8, v11
+; GFX11-NEXT: v_lshlrev_b32_e32 v77, 8, v13
+; GFX11-NEXT: v_lshlrev_b32_e32 v78, 8, v15
+; GFX11-NEXT: v_lshlrev_b32_e32 v79, 8, v17
+; GFX11-NEXT: v_lshlrev_b32_e32 v88, 8, v19
+; GFX11-NEXT: v_lshlrev_b32_e32 v63, 8, v21
+; GFX11-NEXT: v_lshlrev_b32_e32 v72, 8, v23
+; GFX11-NEXT: v_lshlrev_b32_e32 v73, 8, v25
+; GFX11-NEXT: v_lshlrev_b32_e32 v74, 8, v27
+; GFX11-NEXT: v_lshlrev_b32_e32 v75, 8, v29
+; GFX11-NEXT: s_mov_b32 s4, 0
+; GFX11-NEXT: s_waitcnt vmcnt(62)
+; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v57, 8, v2
+; GFX11-NEXT: v_lshlrev_b32_e32 v58, 8, v4
+; GFX11-NEXT: v_lshlrev_b32_e32 v59, 8, v6
+; GFX11-NEXT: v_lshlrev_b32_e32 v60, 8, v8
+; GFX11-NEXT: v_lshlrev_b32_e32 v61, 8, v10
+; GFX11-NEXT: v_lshlrev_b32_e32 v40, 8, v12
+; GFX11-NEXT: v_lshlrev_b32_e32 v41, 8, v14
+; GFX11-NEXT: v_lshlrev_b32_e32 v42, 8, v16
+; GFX11-NEXT: v_lshlrev_b32_e32 v43, 8, v18
+; GFX11-NEXT: v_lshlrev_b32_e32 v44, 8, v20
+; GFX11-NEXT: v_lshlrev_b32_e32 v167, 8, v22
+; GFX11-NEXT: v_lshlrev_b32_e32 v176, 8, v24
+; GFX11-NEXT: v_lshlrev_b32_e32 v177, 8, v26
+; GFX11-NEXT: v_lshlrev_b32_e32 v178, 8, v28
+; GFX11-NEXT: v_lshlrev_b32_e32 v179, 8, v30
+; GFX11-NEXT: v_lshlrev_b32_e32 v149, 8, v31
+; GFX11-NEXT: v_lshlrev_b32_e32 v150, 8, v87
+; GFX11-NEXT: s_waitcnt vmcnt(61)
+; GFX11-NEXT: v_lshlrev_b32_e32 v151, 8, v96
+; GFX11-NEXT: s_waitcnt vmcnt(60)
+; GFX11-NEXT: v_lshlrev_b32_e32 v160, 8, v97
+; GFX11-NEXT: s_waitcnt vmcnt(59)
+; GFX11-NEXT: v_lshlrev_b32_e32 v161, 8, v98
+; GFX11-NEXT: s_waitcnt vmcnt(58)
+; GFX11-NEXT: v_lshlrev_b32_e32 v132, 8, v99
+; GFX11-NEXT: s_waitcnt vmcnt(57)
+; GFX11-NEXT: v_lshlrev_b32_e32 v133, 8, v113
+; GFX11-NEXT: s_waitcnt vmcnt(56)
+; GFX11-NEXT: v_lshlrev_b32_e32 v134, 8, v114
+; GFX11-NEXT: s_waitcnt vmcnt(55)
+; GFX11-NEXT: v_lshlrev_b32_e32 v135, 8, v115
+; GFX11-NEXT: s_waitcnt vmcnt(54)
+; GFX11-NEXT: v_lshlrev_b32_e32 v144, 8, v116
+; GFX11-NEXT: s_waitcnt vmcnt(53)
+; GFX11-NEXT: v_lshlrev_b32_e32 v119, 8, v117
+; GFX11-NEXT: s_waitcnt vmcnt(52)
+; GFX11-NEXT: v_lshlrev_b32_e32 v128, 8, v128
+; GFX11-NEXT: s_waitcnt vmcnt(51)
+; GFX11-NEXT: v_lshlrev_b32_e32 v129, 8, v129
+; GFX11-NEXT: s_waitcnt vmcnt(50)
+; GFX11-NEXT: v_lshlrev_b32_e32 v130, 8, v130
+; GFX11-NEXT: s_waitcnt vmcnt(49)
+; GFX11-NEXT: v_lshlrev_b32_e32 v131, 8, v131
+; GFX11-NEXT: s_waitcnt vmcnt(48)
+; GFX11-NEXT: v_lshlrev_b32_e32 v113, 8, v94
+; GFX11-NEXT: s_waitcnt vmcnt(47)
+; GFX11-NEXT: v_lshlrev_b32_e32 v114, 8, v95
+; GFX11-NEXT: s_waitcnt vmcnt(46)
+; GFX11-NEXT: v_lshlrev_b32_e32 v115, 8, v104
+; GFX11-NEXT: s_waitcnt vmcnt(45)
+; GFX11-NEXT: v_lshlrev_b32_e32 v116, 8, v105
+; GFX11-NEXT: s_waitcnt vmcnt(44)
+; GFX11-NEXT: v_lshlrev_b32_e32 v117, 8, v106
+; GFX11-NEXT: s_waitcnt vmcnt(43)
+; GFX11-NEXT: v_lshlrev_b32_e32 v87, 8, v107
+; GFX11-NEXT: s_waitcnt vmcnt(42)
+; GFX11-NEXT: v_lshlrev_b32_e32 v96, 8, v108
+; GFX11-NEXT: s_waitcnt vmcnt(41)
+; GFX11-NEXT: v_lshlrev_b32_e32 v97, 8, v109
+; GFX11-NEXT: s_waitcnt vmcnt(40)
+; GFX11-NEXT: v_lshlrev_b32_e32 v98, 8, v110
+; GFX11-NEXT: s_waitcnt vmcnt(39)
+; GFX11-NEXT: v_lshlrev_b32_e32 v99, 8, v111
+; GFX11-NEXT: s_and_b32 s5, vcc_lo, exec_lo
+; GFX11-NEXT: s_cbranch_scc0 .LBB75_4
+; GFX11-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v54
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v53
+; GFX11-NEXT: s_and_b32 s5, s28, 0xff
+; GFX11-NEXT: s_lshl_b32 s6, s29, 8
+; GFX11-NEXT: s_and_b32 s7, s2, 0xff
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v90
+; GFX11-NEXT: v_or_b32_e32 v1, v1, v91
+; GFX11-NEXT: s_or_b32 s5, s5, s6
+; GFX11-NEXT: s_lshl_b32 s6, s1, 8
+; GFX11-NEXT: s_and_b32 s5, s5, 0xffff
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: s_lshl_b32 s8, s3, 8
+; GFX11-NEXT: s_and_b32 s9, s18, 0xff
+; GFX11-NEXT: s_lshl_b32 s10, s19, 8
+; GFX11-NEXT: s_and_b32 s11, s26, 0xff
+; GFX11-NEXT: v_or_b32_e32 v5, v0, v1
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v50
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v49
+; GFX11-NEXT: s_lshl_b32 s12, s27, 8
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v76
+; GFX11-NEXT: v_or_b32_e32 v1, v1, v77
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_or_b32_e32 v7, v0, v1
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v48
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v39
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v78
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_or_b32_e32 v1, v1, v79
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: v_or_b32_e32 v8, v0, v1
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v38
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v37
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v88
+; GFX11-NEXT: v_or_b32_e32 v1, v1, v63
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_or_b32_e32 v9, v0, v1
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v36
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v35
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v72
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_or_b32_e32 v1, v1, v73
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: v_or_b32_e32 v10, v0, v1
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v34
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v33
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v74
+; GFX11-NEXT: v_or_b32_e32 v1, v1, v75
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_or_b32_e32 v11, v0, v1
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v32
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v62
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v57
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_or_b32_e32 v1, v1, v58
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: v_or_b32_e32 v12, v0, v1
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v56
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v47
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v59
+; GFX11-NEXT: v_or_b32_e32 v1, v1, v60
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_or_b32_e32 v13, v0, v1
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v46
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v45
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v61
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_or_b32_e32 v1, v1, v40
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: v_or_b32_e32 v14, v0, v1
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v183
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v182
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v41
+; GFX11-NEXT: v_or_b32_e32 v1, v1, v42
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_or_b32_e32 v15, v0, v1
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v181
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v180
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v43
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_or_b32_e32 v1, v1, v44
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: v_or_b32_e32 v16, v0, v1
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v166
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v165
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v167
+; GFX11-NEXT: v_or_b32_e32 v1, v1, v176
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_or_b32_e32 v17, v0, v1
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v164
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v163
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v177
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_or_b32_e32 v1, v1, v178
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: v_or_b32_e32 v18, v0, v1
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v162
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v148
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v179
+; GFX11-NEXT: v_or_b32_e32 v1, v1, v149
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_or_b32_e32 v19, v0, v1
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v147
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v146
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v150
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_or_b32_e32 v1, v1, v151
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: v_or_b32_e32 v20, v0, v1
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v145
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v118
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v160
+; GFX11-NEXT: v_or_b32_e32 v1, v1, v161
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_or_b32_e32 v21, v0, v1
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v112
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v103
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v132
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_or_b32_e32 v1, v1, v133
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: v_or_b32_e32 v22, v0, v1
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v102
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v101
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v134
+; GFX11-NEXT: v_or_b32_e32 v1, v1, v135
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_or_b32_e32 v23, v0, v1
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v100
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v86
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v144
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_or_b32_e32 v1, v1, v119
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: v_or_b32_e32 v24, v0, v1
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v85
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v84
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v128
+; GFX11-NEXT: v_or_b32_e32 v1, v1, v129
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_or_b32_e32 v25, v0, v1
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v83
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v82
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v130
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_or_b32_e32 v1, v1, v131
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: v_or_b32_e32 v26, v0, v1
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v81
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v80
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v113
+; GFX11-NEXT: v_or_b32_e32 v1, v1, v114
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_or_b32_e32 v27, v0, v1
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v71
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v70
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v115
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_or_b32_e32 v1, v1, v116
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: v_or_b32_e32 v28, v0, v1
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v69
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v68
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v117
+; GFX11-NEXT: v_or_b32_e32 v1, v1, v87
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_or_b32_e32 v29, v0, v1
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v67
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v66
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v96
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_or_b32_e32 v1, v1, v97
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: v_or_b32_e32 v30, v0, v1
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v65
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v64
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v98
+; GFX11-NEXT: v_or_b32_e32 v1, v1, v99
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_or_b32_e32 v31, v0, v1
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v55
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v89
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-NEXT: v_or_b32_e32 v4, s5, v0
+; GFX11-NEXT: s_and_b32 s5, s0, 0xff
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_or_b32 s5, s5, s6
+; GFX11-NEXT: s_or_b32 s6, s7, s8
+; GFX11-NEXT: s_and_b32 s7, s16, 0xff
+; GFX11-NEXT: s_lshl_b32 s8, s17, 8
+; GFX11-NEXT: s_and_b32 s5, s5, 0xffff
+; GFX11-NEXT: s_or_b32 s7, s7, s8
+; GFX11-NEXT: s_or_b32 s8, s9, s10
+; GFX11-NEXT: s_lshl_b32 s6, s6, 16
+; GFX11-NEXT: s_and_b32 s7, s7, 0xffff
+; GFX11-NEXT: s_lshl_b32 s8, s8, 16
+; GFX11-NEXT: s_or_b32 s5, s5, s6
+; GFX11-NEXT: s_or_b32 s6, s7, s8
+; GFX11-NEXT: s_and_b32 s7, s20, 0xff
+; GFX11-NEXT: s_lshl_b32 s8, s21, 8
+; GFX11-NEXT: s_and_b32 s9, s22, 0xff
+; GFX11-NEXT: s_lshl_b32 s10, s23, 8
+; GFX11-NEXT: s_or_b32 s7, s7, s8
+; GFX11-NEXT: s_or_b32 s8, s9, s10
+; GFX11-NEXT: s_and_b32 s9, s24, 0xff
+; GFX11-NEXT: s_lshl_b32 s10, s25, 8
+; GFX11-NEXT: s_and_b32 s7, s7, 0xffff
+; GFX11-NEXT: s_or_b32 s9, s9, s10
+; GFX11-NEXT: s_or_b32 s10, s11, s12
+; GFX11-NEXT: s_lshl_b32 s8, s8, 16
+; GFX11-NEXT: s_and_b32 s9, s9, 0xffff
+; GFX11-NEXT: s_lshl_b32 s10, s10, 16
+; GFX11-NEXT: s_or_b32 s7, s7, s8
+; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v51
+; GFX11-NEXT: s_or_b32 s8, s9, s10
+; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v52
+; GFX11-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, s6
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_or_b32_e32 v3, v3, v93
+; GFX11-NEXT: v_or_b32_e32 v2, v2, v92
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_or_b32_e32 v6, v2, v3
+; GFX11-NEXT: v_dual_mov_b32 v2, s7 :: v_dual_mov_b32 v3, s8
+; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
+; GFX11-NEXT: s_cbranch_vccnz .LBB75_3
+; GFX11-NEXT: .LBB75_2: ; %cmp.true
+; GFX11-NEXT: s_add_i32 s0, s0, 3
+; GFX11-NEXT: s_add_i32 s2, s2, 3
+; GFX11-NEXT: s_and_b32 s0, s0, 0xff
+; GFX11-NEXT: s_lshl_b32 s1, s1, 8
+; GFX11-NEXT: s_and_b32 s2, s2, 0xff
+; GFX11-NEXT: s_lshl_b32 s3, s3, 8
+; GFX11-NEXT: s_or_b32 s0, s1, s0
+; GFX11-NEXT: s_or_b32 s1, s3, s2
+; GFX11-NEXT: s_addk_i32 s0, 0x300
+; GFX11-NEXT: s_addk_i32 s1, 0x300
+; GFX11-NEXT: s_and_b32 s0, s0, 0xffff
+; GFX11-NEXT: s_lshl_b32 s1, s1, 16
+; GFX11-NEXT: s_add_i32 s16, s16, 3
+; GFX11-NEXT: s_or_b32 s0, s0, s1
+; GFX11-NEXT: s_and_b32 s1, s16, 0xff
+; GFX11-NEXT: s_lshl_b32 s2, s17, 8
+; GFX11-NEXT: s_add_i32 s18, s18, 3
+; GFX11-NEXT: s_or_b32 s1, s2, s1
+; GFX11-NEXT: s_and_b32 s2, s18, 0xff
+; GFX11-NEXT: s_lshl_b32 s3, s19, 8
+; GFX11-NEXT: s_addk_i32 s1, 0x300
+; GFX11-NEXT: s_or_b32 s2, s3, s2
+; GFX11-NEXT: s_add_i32 s20, s20, 3
+; GFX11-NEXT: s_addk_i32 s2, 0x300
+; GFX11-NEXT: s_and_b32 s1, s1, 0xffff
+; GFX11-NEXT: s_lshl_b32 s2, s2, 16
+; GFX11-NEXT: s_and_b32 s3, s20, 0xff
+; GFX11-NEXT: s_lshl_b32 s4, s21, 8
+; GFX11-NEXT: s_add_i32 s22, s22, 3
+; GFX11-NEXT: s_or_b32 s1, s1, s2
+; GFX11-NEXT: s_or_b32 s2, s4, s3
+; GFX11-NEXT: s_and_b32 s3, s22, 0xff
+; GFX11-NEXT: s_lshl_b32 s4, s23, 8
+; GFX11-NEXT: s_add_i32 s24, s24, 3
+; GFX11-NEXT: s_or_b32 s3, s4, s3
+; GFX11-NEXT: s_and_b32 s4, s24, 0xff
+; GFX11-NEXT: s_lshl_b32 s5, s25, 8
+; GFX11-NEXT: s_addk_i32 s2, 0x300
+; GFX11-NEXT: s_addk_i32 s3, 0x300
+; GFX11-NEXT: s_or_b32 s4, s5, s4
+; GFX11-NEXT: s_and_b32 s2, s2, 0xffff
+; GFX11-NEXT: s_lshl_b32 s3, s3, 16
+; GFX11-NEXT: s_addk_i32 s4, 0x300
+; GFX11-NEXT: s_add_i32 s26, s26, 3
+; GFX11-NEXT: s_or_b32 s2, s2, s3
+; GFX11-NEXT: s_and_b32 s3, s4, 0xffff
+; GFX11-NEXT: s_and_b32 s4, s26, 0xff
+; GFX11-NEXT: s_lshl_b32 s5, s27, 8
+; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v55
+; GFX11-NEXT: s_or_b32 s4, s5, s4
+; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v54
+; GFX11-NEXT: s_addk_i32 s4, 0x300
+; GFX11-NEXT: v_add_nc_u32_e32 v3, 3, v52
+; GFX11-NEXT: s_lshl_b32 s4, s4, 16
+; GFX11-NEXT: v_add_nc_u32_e32 v4, 3, v51
+; GFX11-NEXT: s_or_b32 s3, s3, s4
+; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v53
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1
+; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v3
+; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v4
+; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v2
+; GFX11-NEXT: v_add_nc_u32_e32 v7, 3, v38
+; GFX11-NEXT: v_add_nc_u32_e32 v11, 3, v34
+; GFX11-NEXT: v_add_nc_u32_e32 v12, 3, v33
+; GFX11-NEXT: v_or_b32_e32 v0, v89, v0
+; GFX11-NEXT: v_or_b32_e32 v1, v90, v1
+; GFX11-NEXT: v_or_b32_e32 v2, v91, v2
+; GFX11-NEXT: v_or_b32_e32 v3, v92, v3
+; GFX11-NEXT: v_or_b32_e32 v4, v93, v4
+; GFX11-NEXT: v_and_b32_e32 v7, 0xff, v7
+; GFX11-NEXT: v_and_b32_e32 v11, 0xff, v11
+; GFX11-NEXT: v_and_b32_e32 v12, 0xff, v12
+; GFX11-NEXT: s_add_i32 s28, s28, 3
+; GFX11-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
+; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
+; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
+; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x300, v3
+; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x300, v4
+; GFX11-NEXT: v_or_b32_e32 v7, v88, v7
+; GFX11-NEXT: v_or_b32_e32 v11, v74, v11
+; GFX11-NEXT: v_or_b32_e32 v12, v75, v12
+; GFX11-NEXT: s_and_b32 s5, s28, 0xff
+; GFX11-NEXT: s_lshl_b32 s6, s29, 8
+; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-NEXT: s_or_b32 s5, s6, s5
+; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v4
+; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x300, v7
+; GFX11-NEXT: v_add_nc_u32_e32 v11, 0x300, v11
+; GFX11-NEXT: v_add_nc_u32_e32 v12, 0x300, v12
+; GFX11-NEXT: s_addk_i32 s5, 0x300
+; GFX11-NEXT: v_or_b32_e32 v5, v1, v2
+; GFX11-NEXT: s_and_b32 s5, s5, 0xffff
+; GFX11-NEXT: v_or_b32_e32 v6, v3, v6
+; GFX11-NEXT: v_or_b32_e32 v4, s5, v0
+; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v50
+; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v49
+; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v48
+; GFX11-NEXT: v_add_nc_u32_e32 v3, 3, v39
+; GFX11-NEXT: v_and_b32_e32 v9, 0xffff, v7
+; GFX11-NEXT: v_add_nc_u32_e32 v7, 3, v37
+; GFX11-NEXT: v_add_nc_u32_e32 v8, 3, v36
+; GFX11-NEXT: v_add_nc_u32_e32 v10, 3, v35
+; GFX11-NEXT: v_and_b32_e32 v11, 0xffff, v11
+; GFX11-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1
+; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v2
+; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v3
+; GFX11-NEXT: v_and_b32_e32 v7, 0xff, v7
+; GFX11-NEXT: v_and_b32_e32 v8, 0xff, v8
+; GFX11-NEXT: v_and_b32_e32 v10, 0xff, v10
+; GFX11-NEXT: v_or_b32_e32 v11, v11, v12
+; GFX11-NEXT: s_waitcnt vmcnt(3)
+; GFX11-NEXT: v_add_nc_u32_e32 v12, 3, v46
+; GFX11-NEXT: v_add_nc_u32_e32 v16, 3, v181
+; GFX11-NEXT: v_add_nc_u32_e32 v17, 3, v180
+; GFX11-NEXT: v_or_b32_e32 v0, v76, v0
+; GFX11-NEXT: v_or_b32_e32 v1, v77, v1
+; GFX11-NEXT: v_or_b32_e32 v2, v78, v2
+; GFX11-NEXT: v_or_b32_e32 v3, v79, v3
+; GFX11-NEXT: v_or_b32_e32 v7, v63, v7
+; GFX11-NEXT: v_or_b32_e32 v8, v72, v8
+; GFX11-NEXT: v_or_b32_e32 v10, v73, v10
+; GFX11-NEXT: v_and_b32_e32 v12, 0xff, v12
+; GFX11-NEXT: v_and_b32_e32 v16, 0xff, v16
+; GFX11-NEXT: v_and_b32_e32 v17, 0xff, v17
+; GFX11-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
+; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
+; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
+; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x300, v3
+; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x300, v7
+; GFX11-NEXT: v_add_nc_u32_e32 v8, 0x300, v8
+; GFX11-NEXT: v_add_nc_u32_e32 v10, 0x300, v10
+; GFX11-NEXT: v_or_b32_e32 v12, v61, v12
+; GFX11-NEXT: v_or_b32_e32 v16, v43, v16
+; GFX11-NEXT: v_or_b32_e32 v17, v44, v17
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-NEXT: v_lshlrev_b32_e32 v13, 16, v7
+; GFX11-NEXT: v_and_b32_e32 v14, 0xffff, v8
+; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; GFX11-NEXT: v_add_nc_u32_e32 v12, 0x300, v12
+; GFX11-NEXT: v_add_nc_u32_e32 v16, 0x300, v16
+; GFX11-NEXT: v_add_nc_u32_e32 v17, 0x300, v17
+; GFX11-NEXT: v_or_b32_e32 v7, v0, v1
+; GFX11-NEXT: v_or_b32_e32 v8, v2, v3
+; GFX11-NEXT: v_or_b32_e32 v9, v9, v13
+; GFX11-NEXT: v_or_b32_e32 v10, v14, v10
+; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v32
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v62
+; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v56
+; GFX11-NEXT: v_add_nc_u32_e32 v3, 3, v47
+; GFX11-NEXT: v_and_b32_e32 v14, 0xffff, v12
+; GFX11-NEXT: v_add_nc_u32_e32 v12, 3, v45
+; GFX11-NEXT: v_add_nc_u32_e32 v13, 3, v183
+; GFX11-NEXT: v_add_nc_u32_e32 v15, 3, v182
+; GFX11-NEXT: v_and_b32_e32 v16, 0xffff, v16
+; GFX11-NEXT: v_lshlrev_b32_e32 v17, 16, v17
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1
+; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v2
+; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v3
+; GFX11-NEXT: v_and_b32_e32 v12, 0xff, v12
+; GFX11-NEXT: v_and_b32_e32 v13, 0xff, v13
+; GFX11-NEXT: v_and_b32_e32 v15, 0xff, v15
+; GFX11-NEXT: v_or_b32_e32 v16, v16, v17
+; GFX11-NEXT: v_add_nc_u32_e32 v17, 3, v162
+; GFX11-NEXT: v_add_nc_u32_e32 v21, 3, v145
+; GFX11-NEXT: v_add_nc_u32_e32 v22, 3, v118
+; GFX11-NEXT: v_or_b32_e32 v0, v57, v0
+; GFX11-NEXT: v_or_b32_e32 v1, v58, v1
+; GFX11-NEXT: v_or_b32_e32 v2, v59, v2
+; GFX11-NEXT: v_or_b32_e32 v3, v60, v3
+; GFX11-NEXT: v_or_b32_e32 v12, v40, v12
+; GFX11-NEXT: v_or_b32_e32 v13, v41, v13
+; GFX11-NEXT: v_or_b32_e32 v15, v42, v15
+; GFX11-NEXT: v_and_b32_e32 v17, 0xff, v17
+; GFX11-NEXT: v_and_b32_e32 v21, 0xff, v21
+; GFX11-NEXT: v_and_b32_e32 v22, 0xff, v22
+; GFX11-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
+; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
+; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
+; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x300, v3
+; GFX11-NEXT: v_add_nc_u32_e32 v12, 0x300, v12
+; GFX11-NEXT: v_add_nc_u32_e32 v13, 0x300, v13
+; GFX11-NEXT: v_add_nc_u32_e32 v15, 0x300, v15
+; GFX11-NEXT: v_or_b32_e32 v17, v179, v17
+; GFX11-NEXT: v_or_b32_e32 v21, v160, v21
+; GFX11-NEXT: v_or_b32_e32 v22, v161, v22
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v12
+; GFX11-NEXT: v_and_b32_e32 v19, 0xffff, v13
+; GFX11-NEXT: v_lshlrev_b32_e32 v15, 16, v15
+; GFX11-NEXT: v_add_nc_u32_e32 v17, 0x300, v17
+; GFX11-NEXT: v_add_nc_u32_e32 v21, 0x300, v21
+; GFX11-NEXT: v_add_nc_u32_e32 v22, 0x300, v22
+; GFX11-NEXT: v_or_b32_e32 v12, v0, v1
+; GFX11-NEXT: v_or_b32_e32 v13, v2, v3
+; GFX11-NEXT: v_or_b32_e32 v14, v14, v18
+; GFX11-NEXT: v_or_b32_e32 v15, v19, v15
+; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v166
+; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v165
+; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v164
+; GFX11-NEXT: v_add_nc_u32_e32 v3, 3, v163
+; GFX11-NEXT: v_and_b32_e32 v19, 0xffff, v17
+; GFX11-NEXT: v_add_nc_u32_e32 v17, 3, v148
+; GFX11-NEXT: v_add_nc_u32_e32 v18, 3, v147
+; GFX11-NEXT: v_add_nc_u32_e32 v20, 3, v146
+; GFX11-NEXT: v_and_b32_e32 v21, 0xffff, v21
+; GFX11-NEXT: v_lshlrev_b32_e32 v22, 16, v22
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1
+; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v2
+; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v3
+; GFX11-NEXT: v_and_b32_e32 v17, 0xff, v17
+; GFX11-NEXT: v_and_b32_e32 v18, 0xff, v18
+; GFX11-NEXT: v_and_b32_e32 v20, 0xff, v20
+; GFX11-NEXT: v_or_b32_e32 v21, v21, v22
+; GFX11-NEXT: v_add_nc_u32_e32 v22, 3, v100
+; GFX11-NEXT: v_add_nc_u32_e32 v26, 3, v83
+; GFX11-NEXT: v_add_nc_u32_e32 v27, 3, v82
+; GFX11-NEXT: v_or_b32_e32 v0, v167, v0
+; GFX11-NEXT: v_or_b32_e32 v1, v176, v1
+; GFX11-NEXT: v_or_b32_e32 v2, v177, v2
+; GFX11-NEXT: v_or_b32_e32 v3, v178, v3
+; GFX11-NEXT: v_or_b32_e32 v17, v149, v17
+; GFX11-NEXT: v_or_b32_e32 v18, v150, v18
+; GFX11-NEXT: v_or_b32_e32 v20, v151, v20
+; GFX11-NEXT: v_and_b32_e32 v22, 0xff, v22
+; GFX11-NEXT: v_and_b32_e32 v26, 0xff, v26
+; GFX11-NEXT: v_and_b32_e32 v27, 0xff, v27
+; GFX11-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
+; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
+; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
+; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x300, v3
+; GFX11-NEXT: v_add_nc_u32_e32 v17, 0x300, v17
+; GFX11-NEXT: v_add_nc_u32_e32 v18, 0x300, v18
+; GFX11-NEXT: v_add_nc_u32_e32 v20, 0x300, v20
+; GFX11-NEXT: v_or_b32_e32 v22, v144, v22
+; GFX11-NEXT: v_or_b32_e32 v26, v130, v26
+; GFX11-NEXT: v_or_b32_e32 v27, v131, v27
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-NEXT: v_lshlrev_b32_e32 v23, 16, v17
+; GFX11-NEXT: v_and_b32_e32 v24, 0xffff, v18
+; GFX11-NEXT: v_lshlrev_b32_e32 v20, 16, v20
+; GFX11-NEXT: v_add_nc_u32_e32 v22, 0x300, v22
+; GFX11-NEXT: v_add_nc_u32_e32 v26, 0x300, v26
+; GFX11-NEXT: v_add_nc_u32_e32 v27, 0x300, v27
+; GFX11-NEXT: v_or_b32_e32 v17, v0, v1
+; GFX11-NEXT: v_or_b32_e32 v18, v2, v3
+; GFX11-NEXT: v_or_b32_e32 v19, v19, v23
+; GFX11-NEXT: v_or_b32_e32 v20, v24, v20
+; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v112
+; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v103
+; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v102
+; GFX11-NEXT: v_add_nc_u32_e32 v3, 3, v101
+; GFX11-NEXT: v_and_b32_e32 v24, 0xffff, v22
+; GFX11-NEXT: v_add_nc_u32_e32 v22, 3, v86
+; GFX11-NEXT: v_add_nc_u32_e32 v23, 3, v85
+; GFX11-NEXT: v_add_nc_u32_e32 v25, 3, v84
+; GFX11-NEXT: v_and_b32_e32 v26, 0xffff, v26
+; GFX11-NEXT: v_lshlrev_b32_e32 v27, 16, v27
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1
+; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v2
+; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v3
+; GFX11-NEXT: v_and_b32_e32 v22, 0xff, v22
+; GFX11-NEXT: v_and_b32_e32 v23, 0xff, v23
+; GFX11-NEXT: v_and_b32_e32 v25, 0xff, v25
+; GFX11-NEXT: v_or_b32_e32 v26, v26, v27
+; GFX11-NEXT: v_add_nc_u32_e32 v27, 3, v69
+; GFX11-NEXT: v_or_b32_e32 v0, v132, v0
+; GFX11-NEXT: v_or_b32_e32 v1, v133, v1
+; GFX11-NEXT: v_or_b32_e32 v2, v134, v2
+; GFX11-NEXT: v_or_b32_e32 v3, v135, v3
+; GFX11-NEXT: v_or_b32_e32 v22, v119, v22
+; GFX11-NEXT: v_or_b32_e32 v23, v128, v23
+; GFX11-NEXT: v_or_b32_e32 v25, v129, v25
+; GFX11-NEXT: v_and_b32_e32 v27, 0xff, v27
+; GFX11-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
+; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
+; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
+; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x300, v3
+; GFX11-NEXT: v_add_nc_u32_e32 v22, 0x300, v22
+; GFX11-NEXT: v_add_nc_u32_e32 v23, 0x300, v23
+; GFX11-NEXT: v_add_nc_u32_e32 v25, 0x300, v25
+; GFX11-NEXT: v_or_b32_e32 v27, v117, v27
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-NEXT: v_lshlrev_b32_e32 v28, 16, v22
+; GFX11-NEXT: v_and_b32_e32 v29, 0xffff, v23
+; GFX11-NEXT: v_lshlrev_b32_e32 v25, 16, v25
+; GFX11-NEXT: v_add_nc_u32_e32 v27, 0x300, v27
+; GFX11-NEXT: v_or_b32_e32 v22, v0, v1
+; GFX11-NEXT: v_or_b32_e32 v23, v2, v3
+; GFX11-NEXT: v_or_b32_e32 v24, v24, v28
+; GFX11-NEXT: v_or_b32_e32 v25, v29, v25
+; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v81
+; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v80
+; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v71
+; GFX11-NEXT: v_add_nc_u32_e32 v3, 3, v70
+; GFX11-NEXT: v_and_b32_e32 v29, 0xffff, v27
+; GFX11-NEXT: v_add_nc_u32_e32 v27, 3, v68
+; GFX11-NEXT: v_add_nc_u32_e32 v28, 3, v67
+; GFX11-NEXT: v_add_nc_u32_e32 v30, 3, v66
+; GFX11-NEXT: v_add_nc_u32_e32 v31, 3, v65
+; GFX11-NEXT: v_add_nc_u32_e32 v32, 3, v64
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1
+; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v2
+; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v3
+; GFX11-NEXT: v_and_b32_e32 v27, 0xff, v27
+; GFX11-NEXT: v_and_b32_e32 v28, 0xff, v28
+; GFX11-NEXT: v_and_b32_e32 v30, 0xff, v30
+; GFX11-NEXT: v_and_b32_e32 v31, 0xff, v31
+; GFX11-NEXT: v_and_b32_e32 v32, 0xff, v32
+; GFX11-NEXT: v_or_b32_e32 v0, v113, v0
+; GFX11-NEXT: v_or_b32_e32 v1, v114, v1
+; GFX11-NEXT: v_or_b32_e32 v2, v115, v2
+; GFX11-NEXT: v_or_b32_e32 v3, v116, v3
+; GFX11-NEXT: v_or_b32_e32 v27, v87, v27
+; GFX11-NEXT: v_or_b32_e32 v28, v96, v28
+; GFX11-NEXT: v_or_b32_e32 v30, v97, v30
+; GFX11-NEXT: v_or_b32_e32 v31, v98, v31
+; GFX11-NEXT: v_or_b32_e32 v32, v99, v32
+; GFX11-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
+; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
+; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
+; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x300, v3
+; GFX11-NEXT: v_add_nc_u32_e32 v27, 0x300, v27
+; GFX11-NEXT: v_add_nc_u32_e32 v28, 0x300, v28
+; GFX11-NEXT: v_add_nc_u32_e32 v30, 0x300, v30
+; GFX11-NEXT: v_add_nc_u32_e32 v31, 0x300, v31
+; GFX11-NEXT: v_add_nc_u32_e32 v32, 0x300, v32
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-NEXT: v_lshlrev_b32_e32 v33, 16, v27
+; GFX11-NEXT: v_and_b32_e32 v34, 0xffff, v28
+; GFX11-NEXT: v_lshlrev_b32_e32 v30, 16, v30
+; GFX11-NEXT: v_and_b32_e32 v31, 0xffff, v31
+; GFX11-NEXT: v_lshlrev_b32_e32 v32, 16, v32
+; GFX11-NEXT: v_or_b32_e32 v27, v0, v1
+; GFX11-NEXT: v_or_b32_e32 v28, v2, v3
+; GFX11-NEXT: v_or_b32_e32 v29, v29, v33
+; GFX11-NEXT: v_or_b32_e32 v30, v34, v30
+; GFX11-NEXT: v_or_b32_e32 v31, v31, v32
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-NEXT: .LBB75_3: ; %end
+; GFX11-NEXT: s_clause 0x1f
+; GFX11-NEXT: scratch_load_b32 v111, off, s32 offset:320
+; GFX11-NEXT: scratch_load_b32 v110, off, s32 offset:324
+; GFX11-NEXT: scratch_load_b32 v109, off, s32 offset:328
+; GFX11-NEXT: scratch_load_b32 v108, off, s32 offset:332
+; GFX11-NEXT: scratch_load_b32 v107, off, s32 offset:336
+; GFX11-NEXT: scratch_load_b32 v106, off, s32 offset:340
+; GFX11-NEXT: scratch_load_b32 v105, off, s32 offset:344
+; GFX11-NEXT: scratch_load_b32 v104, off, s32 offset:348
+; GFX11-NEXT: scratch_load_b32 v95, off, s32 offset:352
+; GFX11-NEXT: scratch_load_b32 v94, off, s32 offset:356
+; GFX11-NEXT: scratch_load_b32 v93, off, s32 offset:360
+; GFX11-NEXT: scratch_load_b32 v92, off, s32 offset:364
+; GFX11-NEXT: scratch_load_b32 v91, off, s32 offset:368
+; GFX11-NEXT: scratch_load_b32 v90, off, s32 offset:372
+; GFX11-NEXT: scratch_load_b32 v89, off, s32 offset:376
+; GFX11-NEXT: scratch_load_b32 v88, off, s32 offset:380
+; GFX11-NEXT: scratch_load_b32 v79, off, s32 offset:384
+; GFX11-NEXT: scratch_load_b32 v78, off, s32 offset:388
+; GFX11-NEXT: scratch_load_b32 v77, off, s32 offset:392
+; GFX11-NEXT: scratch_load_b32 v76, off, s32 offset:396
+; GFX11-NEXT: scratch_load_b32 v75, off, s32 offset:400
+; GFX11-NEXT: scratch_load_b32 v74, off, s32 offset:404
+; GFX11-NEXT: scratch_load_b32 v73, off, s32 offset:408
+; GFX11-NEXT: scratch_load_b32 v72, off, s32 offset:412
+; GFX11-NEXT: scratch_load_b32 v63, off, s32 offset:416
+; GFX11-NEXT: scratch_load_b32 v62, off, s32 offset:420
+; GFX11-NEXT: scratch_load_b32 v61, off, s32 offset:424
+; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:428
+; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:432
+; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:436
+; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:440
+; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:444
+; GFX11-NEXT: s_clause 0x7
+; GFX11-NEXT: scratch_load_b32 v47, off, s32 offset:448
+; GFX11-NEXT: scratch_load_b32 v46, off, s32 offset:452
+; GFX11-NEXT: scratch_load_b32 v45, off, s32 offset:456
+; GFX11-NEXT: scratch_load_b32 v44, off, s32 offset:460
+; GFX11-NEXT: scratch_load_b32 v43, off, s32 offset:464
+; GFX11-NEXT: scratch_load_b32 v42, off, s32 offset:468
+; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:472
+; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:476
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-NEXT: .LBB75_4:
+; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+; GFX11-NEXT: s_branch .LBB75_2
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -147206,766 +143754,814 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) {
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: s_clause 0x1f
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:384
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v39, off, s32 offset:380
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:376
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v50, off, s32 offset:372
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v150, off, s32 offset:368
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v39, off, s32 offset:364
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v150, off, s32 offset:360
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v50, off, s32 offset:356
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v147, off, s32 offset:352
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v37, off, s32 offset:348
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v149, off, s32 offset:344
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v49, off, s32 offset:340
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v149, off, s32 offset:336
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v38, off, s32 offset:332
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v148, off, s32 offset:328
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v48, off, s32 offset:324
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v145, off, s32 offset:320
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:316
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v148, off, s32 offset:312
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v48, off, s32 offset:308
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v147, off, s32 offset:304
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v36, off, s32 offset:300
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v146, off, s32 offset:296
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:292
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v146, off, s32 offset:288
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v36, off, s32 offset:284
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v145, off, s32 offset:280
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v37, off, s32 offset:276
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v134, off, s32 offset:272
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:268
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v144, off, s32 offset:264
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v35, off, s32 offset:260
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:520
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:516
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:512
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v43, s32 offset:508
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v44, s32 offset:504
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v45, s32 offset:500
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v46, s32 offset:496
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v47, s32 offset:492
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v56, s32 offset:488
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v57, s32 offset:484
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v58, s32 offset:480
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v59, s32 offset:476
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v60, s32 offset:472
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v61, s32 offset:468
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v62, s32 offset:464
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v63, s32 offset:460
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v72, s32 offset:456
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v73, s32 offset:452
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v74, s32 offset:448
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v75, s32 offset:444
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v76, s32 offset:440
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v77, s32 offset:436
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v78, s32 offset:432
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v79, s32 offset:428
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v88, s32 offset:424
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v89, s32 offset:420
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v90, s32 offset:416
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v91, s32 offset:412
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v92, s32 offset:408
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:404
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:400
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:396
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:392 ; 4-byte Folded Spill
; GFX11-TRUE16-NEXT: s_clause 0x1f
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v144, off, s32 offset:256
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v33, off, s32 offset:252
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v135, off, s32 offset:248
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:244
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v132, off, s32 offset:240
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:236
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v135, off, s32 offset:232
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v34, off, s32 offset:228
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v134, off, s32 offset:224
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:220
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v133, off, s32 offset:216
-; GFX11-TRUE16-NEXT: scratch_load_b32 v160, off, s32 offset:388
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v101, off, s32
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v103, off, s32 offset:8
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v113, off, s32 offset:16
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v113, off, s32 offset:24
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v114, off, s32 offset:32
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v114, off, s32 offset:40
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v115, off, s32 offset:48
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v116, off, s32 offset:56
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v116, off, s32 offset:64
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v117, off, s32 offset:72
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v117, off, s32 offset:80
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v118, off, s32 offset:88
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v118, off, s32 offset:96
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v119, off, s32 offset:104
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v119, off, s32 offset:112
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v128, off, s32 offset:120
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v128, off, s32 offset:128
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v129, off, s32 offset:136
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v129, off, s32 offset:144
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v130, off, s32 offset:152
+; GFX11-TRUE16-NEXT: scratch_load_u16 v31, off, s32 offset:384
+; GFX11-TRUE16-NEXT: scratch_load_u16 v114, off, s32 offset:380
+; GFX11-TRUE16-NEXT: scratch_load_u16 v103, off, s32 offset:376
+; GFX11-TRUE16-NEXT: scratch_load_u16 v128, off, s32 offset:372
+; GFX11-TRUE16-NEXT: scratch_load_u16 v56, off, s32 offset:368
+; GFX11-TRUE16-NEXT: scratch_load_u16 v117, off, s32 offset:364
+; GFX11-TRUE16-NEXT: scratch_load_u16 v102, off, s32 offset:360
+; GFX11-TRUE16-NEXT: scratch_load_u16 v133, off, s32 offset:356
+; GFX11-TRUE16-NEXT: scratch_load_u16 v57, off, s32 offset:352
+; GFX11-TRUE16-NEXT: scratch_load_u16 v112, off, s32 offset:348
+; GFX11-TRUE16-NEXT: scratch_load_u16 v58, off, s32 offset:344
+; GFX11-TRUE16-NEXT: scratch_load_u16 v132, off, s32 offset:340
+; GFX11-TRUE16-NEXT: scratch_load_u16 v101, off, s32 offset:336
+; GFX11-TRUE16-NEXT: scratch_load_u16 v118, off, s32 offset:332
+; GFX11-TRUE16-NEXT: scratch_load_u16 v59, off, s32 offset:328
+; GFX11-TRUE16-NEXT: scratch_load_u16 v134, off, s32 offset:324
+; GFX11-TRUE16-NEXT: scratch_load_u16 v97, off, s32 offset:320
+; GFX11-TRUE16-NEXT: scratch_load_u16 v113, off, s32 offset:316
+; GFX11-TRUE16-NEXT: scratch_load_u16 v100, off, s32 offset:312
+; GFX11-TRUE16-NEXT: scratch_load_u16 v145, off, s32 offset:308
+; GFX11-TRUE16-NEXT: scratch_load_u16 v99, off, s32 offset:304
+; GFX11-TRUE16-NEXT: scratch_load_u16 v119, off, s32 offset:300
+; GFX11-TRUE16-NEXT: scratch_load_u16 v60, off, s32 offset:296
+; GFX11-TRUE16-NEXT: scratch_load_u16 v135, off, s32 offset:292
+; GFX11-TRUE16-NEXT: scratch_load_u16 v98, off, s32 offset:288
+; GFX11-TRUE16-NEXT: scratch_load_u16 v129, off, s32 offset:284
+; GFX11-TRUE16-NEXT: scratch_load_u16 v61, off, s32 offset:280
+; GFX11-TRUE16-NEXT: scratch_load_u16 v146, off, s32 offset:276
+; GFX11-TRUE16-NEXT: scratch_load_u16 v62, off, s32 offset:272
+; GFX11-TRUE16-NEXT: scratch_load_u16 v115, off, s32 offset:268
+; GFX11-TRUE16-NEXT: scratch_load_u16 v63, off, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_load_u16 v144, off, s32 offset:260
; GFX11-TRUE16-NEXT: s_clause 0x1f
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v130, off, s32 offset:160
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v131, off, s32 offset:168
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v131, off, s32 offset:176
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v132, off, s32 offset:184
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v133, off, s32 offset:192
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v151, off, s32 offset:200
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v151, off, s32 offset:208
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v100, off, s32 offset:212
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v98, off, s32 offset:204
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v100, off, s32 offset:196
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v85, off, s32 offset:188
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v97, off, s32 offset:180
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v86, off, s32 offset:172
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v96, off, s32 offset:164
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v71, off, s32 offset:156
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v87, off, s32 offset:148
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v80, off, s32 offset:140
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v84, off, s32 offset:132
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v86, off, s32 offset:124
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v96, off, s32 offset:116
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v80, off, s32 offset:108
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v85, off, s32 offset:100
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v82, off, s32 offset:92
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v84, off, s32 offset:84
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v69, off, s32 offset:76
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v83, off, s32 offset:68
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v69, off, s32 offset:60
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v71, off, s32 offset:52
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v68, off, s32 offset:44
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v70, off, s32 offset:36
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v66, off, s32 offset:28
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v68, off, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_load_u16 v96, off, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_load_u16 v130, off, s32 offset:252
+; GFX11-TRUE16-NEXT: scratch_load_u16 v72, off, s32 offset:248
+; GFX11-TRUE16-NEXT: scratch_load_u16 v147, off, s32 offset:244
+; GFX11-TRUE16-NEXT: scratch_load_u16 v84, off, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_load_u16 v116, off, s32 offset:236
+; GFX11-TRUE16-NEXT: scratch_load_u16 v87, off, s32 offset:232
+; GFX11-TRUE16-NEXT: scratch_load_u16 v148, off, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_load_u16 v86, off, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_load_u16 v131, off, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_load_u16 v73, off, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_load_b32 v65, off, s32 offset:388
+; GFX11-TRUE16-NEXT: scratch_load_u16 v53, off, s32
+; GFX11-TRUE16-NEXT: scratch_load_u16 v66, off, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_load_u16 v67, off, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_load_u16 v68, off, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_load_u16 v69, off, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_load_u16 v70, off, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_load_u16 v71, off, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_load_u16 v80, off, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_load_u16 v81, off, s32 offset:64
+; GFX11-TRUE16-NEXT: scratch_load_u16 v82, off, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_load_u16 v83, off, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_load_u16 v85, off, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_load_u16 v74, off, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_load_u16 v75, off, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_load_u16 v76, off, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_load_u16 v77, off, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_load_u16 v78, off, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_load_u16 v79, off, s32 offset:136
+; GFX11-TRUE16-NEXT: scratch_load_u16 v88, off, s32 offset:144
+; GFX11-TRUE16-NEXT: scratch_load_u16 v89, off, s32 offset:152
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_load_u16 v90, off, s32 offset:160
+; GFX11-TRUE16-NEXT: scratch_load_u16 v91, off, s32 offset:168
+; GFX11-TRUE16-NEXT: scratch_load_u16 v92, off, s32 offset:176
+; GFX11-TRUE16-NEXT: scratch_load_u16 v93, off, s32 offset:184
+; GFX11-TRUE16-NEXT: scratch_load_u16 v94, off, s32 offset:192
+; GFX11-TRUE16-NEXT: scratch_load_u16 v95, off, s32 offset:200
+; GFX11-TRUE16-NEXT: scratch_load_u16 v104, off, s32 offset:208
+; GFX11-TRUE16-NEXT: scratch_load_u16 v179, off, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_load_u16 v163, off, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_load_u16 v183, off, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_load_u16 v150, off, s32 offset:188
+; GFX11-TRUE16-NEXT: scratch_load_u16 v167, off, s32 offset:180
+; GFX11-TRUE16-NEXT: scratch_load_u16 v161, off, s32 offset:172
+; GFX11-TRUE16-NEXT: scratch_load_u16 v177, off, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_load_u16 v149, off, s32 offset:156
+; GFX11-TRUE16-NEXT: scratch_load_u16 v180, off, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_load_u16 v151, off, s32 offset:140
+; GFX11-TRUE16-NEXT: scratch_load_u16 v164, off, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_load_u16 v41, off, s32 offset:124
+; GFX11-TRUE16-NEXT: scratch_load_u16 v47, off, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_load_u16 v165, off, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_load_u16 v43, off, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_load_u16 v181, off, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_load_u16 v45, off, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_load_u16 v162, off, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_load_u16 v46, off, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_load_u16 v176, off, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_load_u16 v42, off, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_load_u16 v178, off, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_load_u16 v44, off, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_load_u16 v160, off, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_load_u16 v182, off, s32 offset:20
; GFX11-TRUE16-NEXT: s_clause 0x1
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v66, off, s32 offset:12
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v67, off, s32 offset:4
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.l, v30.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v67.l, v28.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.h, v26.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v65.h, v24.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.l, v22.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v65.l, v20.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.l, v18.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.h, v16.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.h, v14.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v12.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.h, v10.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.h, v8.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.l, v6.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.l, v4.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.h, v2.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.h, v0.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v81.l, 8, v1.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v81.h, 8, v3.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v82.h, 8, v5.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v83.l, 8, v7.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v97.h, 8, v9.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v70.l, 8, v11.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v98.h, 8, v13.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v99.l, 8, v15.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v99.h, 8, v17.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v87.l, 8, v19.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v101.h, 8, v21.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v102.l, 8, v23.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v102.h, 8, v25.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v103.l, 8, v27.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v112.l, 8, v29.l
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(62)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v150.h, 8, v150.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v150.l, 8, v150.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v147.h, 8, v147.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v149.h, 8, v149.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v149.l, 8, v149.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v148.h, 8, v148.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v145.l, 8, v145.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v148.l, 8, v148.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v147.l, 8, v147.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v146.h, 8, v146.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v146.l, 8, v146.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v145.h, 8, v145.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v134.h, 8, v134.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v144.h, 8, v144.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v144.l, 8, v144.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v135.h, 8, v135.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(61)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v132.l, 8, v132.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(59)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v135.l, 8, v135.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(57)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v134.l, 8, v134.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(55)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v133.h, 8, v133.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(54)
-; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v160
+; GFX11-TRUE16-NEXT: scratch_load_u16 v166, off, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_load_u16 v40, off, s32 offset:4
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.h, v30.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.h, v28.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, v26.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v24.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.h, v22.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.h, v20.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.h, v18.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v16.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v14.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v12.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v10.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v8.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.h, v6.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.h, v4.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v2.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v0.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v48.h, 8, v1.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v49.l, 8, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v49.h, 8, v5.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.l, 8, v7.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.l, 8, v9.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v48.l, 8, v11.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.h, 8, v13.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.l, 8, v15.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.h, 8, v17.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.h, 8, v19.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(53)
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.h, 8, v21.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.l, 8, v23.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.h, 8, v25.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.l, 8, v27.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v64.l, 8, v29.l
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v103.h, 8, v31.l
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v103.l, 8, v103.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v102.h, 8, v56.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v102.l, 8, v102.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v99.h, 8, v57.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v101.h, 8, v58.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v101.l, 8, v101.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(52)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v112.h, 8, v103.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(51)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v113.l, 8, v113.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(50)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v113.h, 8, v113.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(49)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v103.h, 8, v114.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(48)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v115.l, 8, v114.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(47)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v115.h, 8, v115.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(46)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v116.l, 8, v116.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(45)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v116.h, 8, v116.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(44)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v117.l, 8, v117.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(43)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v114.l, 8, v117.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v100.h, 8, v59.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v97.l, 8, v97.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v100.l, 8, v100.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v99.l, 8, v99.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v98.h, 8, v60.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v98.l, 8, v98.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v97.h, 8, v61.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v86.h, 8, v62.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v96.h, 8, v63.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v96.l, 8, v96.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v87.h, 8, v72.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v84.l, 8, v84.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v87.l, 8, v87.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v86.l, 8, v86.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(42)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v118.l, 8, v118.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v85.h, 8, v73.l
+; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v65
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.l, 8, v53.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v64.h, 8, v66.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v65.l, 8, v67.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v65.h, 8, v68.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.h, 8, v69.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v67.l, 8, v70.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v67.h, 8, v71.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v68.l, 8, v80.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v68.h, 8, v81.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v69.l, 8, v82.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v66.l, 8, v83.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v70.l, 8, v85.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(41)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v118.h, 8, v118.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v70.h, 8, v74.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(40)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v119.l, 8, v119.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v71.l, 8, v75.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(39)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v114.h, 8, v119.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v66.h, 8, v76.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(38)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v119.h, 8, v128.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v71.h, 8, v77.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(37)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v128.l, 8, v128.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v80.l, 8, v78.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(36)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v128.h, 8, v129.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v80.h, 8, v79.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(35)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.l, 8, v129.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v81.l, 8, v88.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(34)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v130.l, 8, v130.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v82.l, 8, v89.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(33)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v117.h, 8, v130.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v69.h, 8, v90.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(32)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v130.h, 8, v131.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v82.h, 8, v91.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(31)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v131.l, 8, v131.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v83.l, 8, v92.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(30)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v131.h, 8, v132.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v83.h, 8, v93.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(29)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.h, 8, v133.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v81.h, 8, v94.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(28)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v132.h, 8, v151.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v84.h, 8, v95.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(27)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v133.l, 8, v151.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v151.l, 8, v31.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v151.h, 8, v31.l
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v85.l, 8, v104.l
; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
-; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB88_3
-; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow
-; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
-; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB88_4
-; GFX11-TRUE16-NEXT: .LBB88_2: ; %end
-; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
-; GFX11-TRUE16-NEXT: .LBB88_3: ; %cmp.false
-; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v52.h
-; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v51.h
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v53.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v51.l
-; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v54.h
-; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v49.h
-; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v55.l
-; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v53.h
-; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v55.h
-; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v52.l
-; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v65.l
-; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v64.l
-; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v65.h
-; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v64.h
-; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v67.l
-; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v54.l
+; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB88_2
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v34.l
+; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v33.l
+; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v34.h
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v32.h
+; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v36.l
+; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v32.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v36.h
+; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v35.l
+; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v37.l
+; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v33.h
+; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v38.h
+; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v37.h
+; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v39.l
+; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v38.l
+; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v39.h
+; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v35.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v67.h
-; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v66.h
-; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v68.l
-; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v66.l
-; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v70.h
-; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v68.h
-; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v71.l
-; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v69.h
-; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v83.h
-; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v69.l
-; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v84.h
-; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v82.l
-; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v85.l
-; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v80.l
-; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v96.h
-; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v86.h
-; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v84.l
-; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v80.h
-; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v87.h
-; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v71.h
-; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v96.l
-; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v86.l
-; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v97.l
-; GFX11-TRUE16-NEXT: v_and_b16 v19.h, 0xff, v85.h
-; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v100.l
-; GFX11-TRUE16-NEXT: v_and_b16 v20.h, 0xff, v98.l
-; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v100.h
-; GFX11-TRUE16-NEXT: v_and_b16 v21.h, 0xff, v32.h
-; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v34.h
-; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v32.l
-; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v35.l
-; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v33.h
-; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v35.h
-; GFX11-TRUE16-NEXT: v_and_b16 v24.h, 0xff, v33.l
-; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v37.h
-; GFX11-TRUE16-NEXT: v_and_b16 v25.h, 0xff, v36.l
-; GFX11-TRUE16-NEXT: v_and_b16 v26.l, 0xff, v38.l
-; GFX11-TRUE16-NEXT: v_and_b16 v26.h, 0xff, v36.h
-; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v48.l
-; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v34.l
-; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v48.h
-; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v38.h
-; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v49.l
-; GFX11-TRUE16-NEXT: v_and_b16 v29.h, 0xff, v37.l
-; GFX11-TRUE16-NEXT: v_and_b16 v30.l, 0xff, v50.l
-; GFX11-TRUE16-NEXT: v_and_b16 v30.h, 0xff, v39.l
-; GFX11-TRUE16-NEXT: v_and_b16 v31.l, 0xff, v50.h
-; GFX11-TRUE16-NEXT: v_and_b16 v31.h, 0xff, v39.h
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v81.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v81.h
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v82.h
-; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v83.l
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v97.h
-; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v70.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v98.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v99.l
-; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v99.h
-; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v87.l
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v101.h
-; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v102.l
-; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v102.h
-; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v103.l
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v112.l
-; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v101.l
-; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v112.h
-; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v113.l
-; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v113.h
-; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v103.h
-; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v115.l
-; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v115.h
-; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v11.l, v116.l
-; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v11.h, v116.h
-; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v117.l
-; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v114.l
-; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v118.l
-; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v13.h, v118.h
-; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v14.l, v119.l
-; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v114.h
-; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v15.l, v119.h
-; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v15.h, v128.l
-; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v16.l, v128.h
-; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v16.h, v129.l
-; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v17.l, v130.l
-; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v17.h, v117.h
-; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v18.l, v130.h
-; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v18.h, v131.l
-; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v19.l, v131.h
-; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v19.h, v129.h
-; GFX11-TRUE16-NEXT: v_or_b16 v20.l, v20.l, v132.h
-; GFX11-TRUE16-NEXT: v_or_b16 v20.h, v20.h, v133.l
-; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v21.l, v133.h
-; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v21.h, v134.l
-; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v22.l, v135.l
-; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v22.h, v132.l
-; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v23.l, v135.h
-; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v23.h, v144.l
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v24.l, v144.h
-; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v24.h, v134.h
-; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v25.l, v145.h
-; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v25.h, v146.l
-; GFX11-TRUE16-NEXT: v_or_b16 v26.l, v26.l, v146.h
-; GFX11-TRUE16-NEXT: v_or_b16 v26.h, v26.h, v147.l
-; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v27.l, v148.l
-; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v27.h, v145.l
-; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v28.l, v148.h
-; GFX11-TRUE16-NEXT: v_or_b16 v28.h, v28.h, v149.l
-; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v29.l, v149.h
-; GFX11-TRUE16-NEXT: v_or_b16 v29.h, v29.h, v147.h
-; GFX11-TRUE16-NEXT: v_or_b16 v30.l, v30.l, v150.l
-; GFX11-TRUE16-NEXT: v_or_b16 v30.h, v30.h, v150.h
-; GFX11-TRUE16-NEXT: v_or_b16 v31.l, v31.l, v151.l
-; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v31.h, v151.h
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16
+; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v40.l
+; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v166.l
+; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v182.l
+; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v160.l
+; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v44.l
+; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v178.l
+; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v42.l
+; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v176.l
+; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v46.l
+; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v162.l
+; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v45.l
+; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v181.l
+; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v43.l
+; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v165.l
+; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v47.l
+; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v41.l
+; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v164.l
+; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v151.l
+; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v180.l
+; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v149.l
+; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v177.l
+; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v161.l
+; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v167.l
+; GFX11-TRUE16-NEXT: v_and_b16 v19.h, 0xff, v150.l
+; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v183.l
+; GFX11-TRUE16-NEXT: v_and_b16 v20.h, 0xff, v163.l
+; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v179.l
+; GFX11-TRUE16-NEXT: v_and_b16 v21.h, 0xff, v131.l
+; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v148.l
+; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v116.l
+; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v147.l
+; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v130.l
+; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v144.l
+; GFX11-TRUE16-NEXT: v_and_b16 v24.h, 0xff, v115.l
+; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v146.l
+; GFX11-TRUE16-NEXT: v_and_b16 v25.h, 0xff, v129.l
+; GFX11-TRUE16-NEXT: v_and_b16 v26.l, 0xff, v135.l
+; GFX11-TRUE16-NEXT: v_and_b16 v26.h, 0xff, v119.l
+; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v145.l
+; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v113.l
+; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v134.l
+; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v118.l
+; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v132.l
+; GFX11-TRUE16-NEXT: v_and_b16 v29.h, 0xff, v112.l
+; GFX11-TRUE16-NEXT: v_and_b16 v30.l, 0xff, v133.l
+; GFX11-TRUE16-NEXT: v_and_b16 v30.h, 0xff, v117.l
+; GFX11-TRUE16-NEXT: v_and_b16 v31.l, 0xff, v128.l
+; GFX11-TRUE16-NEXT: v_and_b16 v31.h, 0xff, v114.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v48.h
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v49.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v49.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v50.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v51.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v48.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v51.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v52.l
+; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v52.h
+; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v50.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v53.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v54.l
+; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v54.h
+; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v55.l
+; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v64.l
+; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v53.l
+; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v64.h
+; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v65.l
+; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v65.h
+; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v55.h
+; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v67.l
+; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v67.h
+; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v11.l, v68.l
+; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v11.h, v68.h
+; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v69.l
+; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v66.l
+; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v70.l
+; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v13.h, v70.h
+; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v14.l, v71.l
+; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v66.h
+; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v15.l, v71.h
+; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v15.h, v80.l
+; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v16.l, v80.h
+; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v16.h, v81.l
+; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v17.l, v82.l
+; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v17.h, v69.h
+; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v18.l, v82.h
+; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v18.h, v83.l
+; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v19.l, v83.h
+; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v19.h, v81.h
+; GFX11-TRUE16-NEXT: v_or_b16 v20.l, v20.l, v84.h
+; GFX11-TRUE16-NEXT: v_or_b16 v20.h, v20.h, v85.l
+; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v21.l, v85.h
+; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v21.h, v86.l
+; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v22.l, v87.l
+; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v22.h, v84.l
+; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v23.l, v87.h
+; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v23.h, v96.l
+; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v24.l, v96.h
+; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v24.h, v86.h
+; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v25.l, v97.h
+; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v25.h, v98.l
+; GFX11-TRUE16-NEXT: v_or_b16 v26.l, v26.l, v98.h
+; GFX11-TRUE16-NEXT: v_or_b16 v26.h, v26.h, v99.l
+; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v27.l, v100.l
+; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v27.h, v97.l
+; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v28.l, v100.h
+; GFX11-TRUE16-NEXT: v_or_b16 v28.h, v28.h, v101.l
+; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v29.l, v101.h
+; GFX11-TRUE16-NEXT: v_or_b16 v29.h, v29.h, v99.h
+; GFX11-TRUE16-NEXT: v_or_b16 v30.l, v30.l, v102.l
+; GFX11-TRUE16-NEXT: v_or_b16 v30.h, v30.h, v102.h
+; GFX11-TRUE16-NEXT: v_or_b16 v31.l, v31.l, v103.l
+; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v31.h, v103.h
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr114
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr117
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr132
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr134
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr145
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr119
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr135
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr146
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr147
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr131
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr179
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr163
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr183
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr150
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr167
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr161
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr177
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr180
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr164
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr41
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr47
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr165
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr43
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr181
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr45
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr162
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr46
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr176
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr42
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr178
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr44
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr160
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr182
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr166
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr40
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr103_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr103_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr117_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr114_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr119_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr114_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr119_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr117_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr131_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr131_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr132_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr134_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr135_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr132_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr135_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr134_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr145_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr146_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr146_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr147_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr145_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr147_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr150_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr150_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_hi16
+; GFX11-TRUE16-NEXT: .LBB88_2: ; %Flow
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
-; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB88_2
-; GFX11-TRUE16-NEXT: .LBB88_4: ; %cmp.true
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v50.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v39.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v50.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v39.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v48.l, 3
+; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB88_4
+; GFX11-TRUE16-NEXT: ; %bb.3: ; %cmp.true
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v128.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v114.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v133.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v117.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v145.l, 3
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v151.l, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v151.h, v0.h
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v150.l, v1.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v150.h, v1.h
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v148.l, v2.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v103.l, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v103.h, v0.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v102.l, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v102.h, v1.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v100.l, v2.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v0.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v0.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.l, 0x300, v1.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.h, 0x300, v1.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v49.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v37.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v48.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v38.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v132.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v112.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v134.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v118.l, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.l, 0x300, v2.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v36.l, 3
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v149.h, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v147.h, v0.h
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v148.h, v1.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v149.l, v1.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v129.l, 3
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v101.h, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v99.h, v0.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v100.h, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v101.l, v1.h
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.l, 0x300, v0.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.h, 0x300, v0.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, 0x300, v1.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.h, 0x300, v1.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v34.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v38.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v36.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v37.h, 3
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v146.l, v2.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v113.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v135.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v119.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v146.l, 3
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v98.l, v2.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.h, 0x300, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v145.l, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v146.h, v0.h
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v147.l, v1.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v145.h, v1.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v34.h, 3
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v97.l, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v98.h, v0.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v99.l, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v97.h, v1.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v148.l, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, 0x300, v0.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.l, 0x300, v0.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.h, 0x300, v1.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v1.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v35.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v33.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v35.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v33.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v144.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v115.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v147.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v130.l, 3
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v135.l, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v144.h, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v134.h, v0.h
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v135.h, v1.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v144.l, v1.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v87.l, v2.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v96.h, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v86.h, v0.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v87.h, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v96.l, v1.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, 0x300, v2.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, 0x300, v0.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.h, 0x300, v0.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.l, 0x300, v1.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.h, 0x300, v1.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v32.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v116.l, 3
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(26)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v100.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v32.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v179.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v131.l, 3
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(24)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v100.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v98.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v183.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v163.l, 3
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v132.l, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v133.h, v0.h
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v134.l, v1.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v132.h, v1.h
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v133.l, v2.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v84.l, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v85.h, v0.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v86.l, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v84.h, v1.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v85.l, v2.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.h, 0x300, v0.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v0.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, 0x300, v1.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.l, 0x300, v1.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(22)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v97.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v85.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v167.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v150.l, 3
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(20)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v96.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v86.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v177.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v161.l, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.h, 0x300, v2.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(18)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v87.h, 3
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v131.h, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v129.h, v0.h
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v130.h, v1.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v131.l, v1.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v180.l, 3
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v83.h, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v81.h, v0.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v82.h, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v83.l, v1.h
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.l, 0x300, v0.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.h, 0x300, v0.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, 0x300, v1.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, 0x300, v1.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v71.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v149.l, 3
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(16)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v84.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v80.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v164.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v151.l, 3
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(14)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v96.h, 3
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v130.l, v2.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v47.l, 3
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v82.l, v2.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.l, 0x300, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v117.h, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v128.h, v0.h
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v129.l, v1.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v119.h, v1.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v86.h, 3
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v69.h, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v80.h, v0.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v81.l, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v71.h, v1.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v41.l, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.h, 0x300, v0.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.l, 0x300, v0.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.h, 0x300, v1.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.l, 0x300, v1.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(12)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v85.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v80.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v43.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v165.l, 3
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(10)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v84.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v82.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v45.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v181.l, 3
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v128.l, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v119.l, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v114.h, v0.h
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v118.l, v1.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v118.h, v1.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v80.l, v2.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v71.l, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v66.h, v0.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v70.l, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v70.h, v1.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.h, 0x300, v2.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(8)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v83.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v46.l, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, 0x300, v0.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, 0x300, v0.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, 0x300, v1.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, 0x300, v1.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v69.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v162.l, 3
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(6)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v71.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v69.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v42.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v176.l, 3
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(4)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v70.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v44.l, 3
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v117.l, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v114.l, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v116.l, v0.h
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v116.h, v1.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v115.l, v1.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v69.l, v2.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v66.l, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v68.l, v0.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v68.h, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v67.l, v1.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, 0x300, v2.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v68.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v178.l, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, 0x300, v0.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, 0x300, v0.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v1.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v1.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(2)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v68.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v66.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v182.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v160.l, 3
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v67.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v66.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v40.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v166.l, 3
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v115.h, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v113.h, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v103.h, v0.h
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v112.h, v1.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v113.l, v1.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v67.h, v2.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v65.h, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v55.h, v0.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v64.h, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v65.l, v1.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, 0x300, v2.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v67.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v39.h, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v0.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, 0x300, v0.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, 0x300, v1.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, 0x300, v1.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v54.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v65.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v64.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v65.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v35.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v39.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v38.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v38.h, 3
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v112.l, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v101.l, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v102.h, v0.h
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v103.l, v1.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v101.h, v1.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v64.l, v2.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v53.l, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v54.h, v0.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v55.l, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v53.h, v1.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v2.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v64.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v37.h, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x300, v0.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v0.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v1.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, 0x300, v1.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v55.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v52.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v55.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v53.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v37.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v33.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v36.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v35.l, 3
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v102.l, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v99.h, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v87.l, v0.h
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v98.h, v1.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v99.l, v1.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v54.l, v2.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v52.h, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v50.h, v0.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v51.h, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v52.l, v1.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v2.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v54.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v36.l, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v0.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, 0x300, v0.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v1.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v1.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v49.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v53.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v51.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v52.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v51.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v32.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v34.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v32.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v34.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v33.l, 3
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v97.h, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v70.l, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v82.h, v0.h
-; GFX11-TRUE16-NEXT: v_or_b16 v32.l, v83.l, v1.l
-; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v81.l, v1.h
-; GFX11-TRUE16-NEXT: v_or_b16 v33.l, v81.h, v2.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v51.l, v2.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v48.l, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v49.h, v0.h
+; GFX11-TRUE16-NEXT: v_or_b16 v32.l, v50.l, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v48.h, v1.h
+; GFX11-TRUE16-NEXT: v_or_b16 v33.l, v49.l, v2.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, 0x300, v0.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v0.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, 0x300, v32.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v32.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, 0x300, v33.l
+; GFX11-TRUE16-NEXT: .LBB88_4: ; %end
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_load_b32 v104, off, s32 offset:392
+; GFX11-TRUE16-NEXT: scratch_load_b32 v95, off, s32 offset:396
+; GFX11-TRUE16-NEXT: scratch_load_b32 v94, off, s32 offset:400
+; GFX11-TRUE16-NEXT: scratch_load_b32 v93, off, s32 offset:404
+; GFX11-TRUE16-NEXT: scratch_load_b32 v92, off, s32 offset:408
+; GFX11-TRUE16-NEXT: scratch_load_b32 v91, off, s32 offset:412
+; GFX11-TRUE16-NEXT: scratch_load_b32 v90, off, s32 offset:416
+; GFX11-TRUE16-NEXT: scratch_load_b32 v89, off, s32 offset:420
+; GFX11-TRUE16-NEXT: scratch_load_b32 v88, off, s32 offset:424
+; GFX11-TRUE16-NEXT: scratch_load_b32 v79, off, s32 offset:428
+; GFX11-TRUE16-NEXT: scratch_load_b32 v78, off, s32 offset:432
+; GFX11-TRUE16-NEXT: scratch_load_b32 v77, off, s32 offset:436
+; GFX11-TRUE16-NEXT: scratch_load_b32 v76, off, s32 offset:440
+; GFX11-TRUE16-NEXT: scratch_load_b32 v75, off, s32 offset:444
+; GFX11-TRUE16-NEXT: scratch_load_b32 v74, off, s32 offset:448
+; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:452
+; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:456
+; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:460
+; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:464
+; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:468
+; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:472
+; GFX11-TRUE16-NEXT: scratch_load_b32 v59, off, s32 offset:476
+; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:480
+; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:484
+; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:488
+; GFX11-TRUE16-NEXT: scratch_load_b32 v47, off, s32 offset:492
+; GFX11-TRUE16-NEXT: scratch_load_b32 v46, off, s32 offset:496
+; GFX11-TRUE16-NEXT: scratch_load_b32 v45, off, s32 offset:500
+; GFX11-TRUE16-NEXT: scratch_load_b32 v44, off, s32 offset:504
+; GFX11-TRUE16-NEXT: scratch_load_b32 v43, off, s32 offset:508
+; GFX11-TRUE16-NEXT: scratch_load_b32 v42, off, s32 offset:512
+; GFX11-TRUE16-NEXT: scratch_load_b32 v41, off, s32 offset:516
+; GFX11-TRUE16-NEXT: scratch_load_b32 v40, off, s32 offset:520 ; 4-byte Folded Reload
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: bitcast_v128i8_to_v64bf16:
@@ -152736,1657 +149332,831 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-TRUE16-LABEL: bitcast_v128i8_to_v64bf16_scalar:
-; GFX11-TRUE16: ; %bb.0:
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: s_clause 0x1e
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:440
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:436
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:432
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v43, s32 offset:428
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v44, s32 offset:424
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v45, s32 offset:420
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v46, s32 offset:416
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v47, s32 offset:412
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v56, s32 offset:408
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v57, s32 offset:404
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v58, s32 offset:400
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v59, s32 offset:396
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v60, s32 offset:392
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v61, s32 offset:388
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v62, s32 offset:384
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v63, s32 offset:380
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v72, s32 offset:376
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v73, s32 offset:372
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v74, s32 offset:368
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v75, s32 offset:364
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v76, s32 offset:360
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v77, s32 offset:356
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v78, s32 offset:352
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v79, s32 offset:348
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v88, s32 offset:344
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v89, s32 offset:340
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v90, s32 offset:336
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v91, s32 offset:332
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v92, s32 offset:328
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:324
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:320
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v54, v30 :: v_dual_mov_b32 v51, v24
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v52, v28 :: v_dual_mov_b32 v55, v26
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v53, v22 :: v_dual_mov_b32 v48, v20
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v50, v18 :: v_dual_mov_b32 v39, v16
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v49, v14 :: v_dual_mov_b32 v34, v8
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v37, v12 :: v_dual_mov_b32 v38, v10
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v36, v6 :: v_dual_mov_b32 v33, v4
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v35, v2 :: v_dual_mov_b32 v32, v0
-; GFX11-TRUE16-NEXT: s_clause 0x1f
-; GFX11-TRUE16-NEXT: scratch_load_b32 v0, off, s32 offset:316
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v2, off, s32
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v4, off, s32 offset:8
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v6, off, s32 offset:16
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v8, off, s32 offset:24
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v10, off, s32 offset:32
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v12, off, s32 offset:40
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v14, off, s32 offset:48
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v16, off, s32 offset:56
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v18, off, s32 offset:64
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v20, off, s32 offset:72
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v22, off, s32 offset:80
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v24, off, s32 offset:88
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v26, off, s32 offset:96
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v28, off, s32 offset:104
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v30, off, s32 offset:112
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:120
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v41, off, s32 offset:128
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v44, off, s32 offset:136
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v45, off, s32 offset:144
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v56, off, s32 offset:152
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v59, off, s32 offset:160
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v60, off, s32 offset:168
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v61, off, s32 offset:176
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v62, off, s32 offset:184
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v63, off, s32 offset:192
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v72, off, s32 offset:200
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v73, off, s32 offset:208
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v74, off, s32 offset:216
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v75, off, s32 offset:224
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v76, off, s32 offset:232
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v77, off, s32 offset:240
-; GFX11-TRUE16-NEXT: s_clause 0x1f
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v78, off, s32 offset:248
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v79, off, s32 offset:256
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v88, off, s32 offset:264
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v89, off, s32 offset:272
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v90, off, s32 offset:280
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v91, off, s32 offset:288
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v92, off, s32 offset:296
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v93, off, s32 offset:304
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v94, off, s32 offset:312
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v57, off, s32 offset:308
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v58, off, s32 offset:300
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v46, off, s32 offset:292
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v47, off, s32 offset:284
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v40, off, s32 offset:276
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v43, off, s32 offset:268
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v182, off, s32 offset:260
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v183, off, s32 offset:252
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v179, off, s32 offset:244
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v181, off, s32 offset:236
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v164, off, s32 offset:228
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v176, off, s32 offset:220
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v160, off, s32 offset:212
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v163, off, s32 offset:204
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v146, off, s32 offset:196
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v150, off, s32 offset:188
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v135, off, s32 offset:180
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v145, off, s32 offset:172
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v119, off, s32 offset:164
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v131, off, s32 offset:156
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v115, off, s32 offset:148
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v178, off, s32 offset:140
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v162, off, s32 offset:132
-; GFX11-TRUE16-NEXT: s_clause 0xf
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v165, off, s32 offset:124
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v149, off, s32 offset:116
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v151, off, s32 offset:108
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v144, off, s32 offset:100
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v148, off, s32 offset:92
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v129, off, s32 offset:84
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v133, off, s32 offset:76
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v117, off, s32 offset:68
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v118, off, s32 offset:60
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v114, off, s32 offset:52
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v116, off, s32 offset:44
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v100, off, s32 offset:36
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v103, off, s32 offset:28
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v98, off, s32 offset:20
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v99, off, s32 offset:12
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v96, off, s32 offset:4
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v64, 8, v1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v66, 8, v3
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v65, 8, v5
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v68, 8, v7
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v67, 8, v9
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v70, 8, v11
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v69, 8, v13
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v71, 8, v15
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v80, 8, v17
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v82, 8, v19
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v81, 8, v21
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v84, 8, v23
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v83, 8, v25
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v86, 8, v27
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v85, 8, v29
-; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(62)
-; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v97, 8, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v87, 8, v4
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v101, 8, v6
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v102, 8, v8
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v113, 8, v10
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v112, 8, v12
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v130, 8, v14
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v128, 8, v16
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v134, 8, v18
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v132, 8, v20
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v161, 8, v22
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v147, 8, v24
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v166, 8, v26
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v167, 8, v28
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v180, 8, v30
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v177, 8, v31
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v42, 8, v41
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(61)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v41, 8, v44
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(60)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v45, 8, v45
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(59)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v44, 8, v56
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(58)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v59, 8, v59
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(57)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v56, 8, v60
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(56)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v60, 8, v61
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(55)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v61, 8, v62
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(54)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v63, 8, v63
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(53)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v62, 8, v72
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(52)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v73, 8, v73
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(51)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v72, 8, v74
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(50)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v75, 8, v75
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(49)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v74, 8, v76
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(48)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v77, 8, v77
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(47)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v76, 8, v78
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(46)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v78, 8, v79
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(45)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v79, 8, v88
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(44)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v89, 8, v89
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(43)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v88, 8, v90
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(42)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v91, 8, v91
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(41)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v90, 8, v92
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(40)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v92, 8, v93
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(39)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v93, 8, v94
-; GFX11-TRUE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo
-; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB89_4
-; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-TRUE16-NEXT: s_and_b32 s5, s28, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s29, 8
-; GFX11-TRUE16-NEXT: s_and_b32 s7, s2, 0xff
-; GFX11-TRUE16-NEXT: s_or_b32 s5, s5, s6
-; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s1, 8
-; GFX11-TRUE16-NEXT: v_and_b32_e64 v5, 0xffff, s5
-; GFX11-TRUE16-NEXT: s_and_b32 s5, s0, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s3, 8
-; GFX11-TRUE16-NEXT: s_or_b32 s5, s5, s6
-; GFX11-TRUE16-NEXT: s_or_b32 s6, s7, s8
-; GFX11-TRUE16-NEXT: s_and_b32 s7, s16, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s17, 8
-; GFX11-TRUE16-NEXT: s_and_b32 s9, s18, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s19, 8
-; GFX11-TRUE16-NEXT: s_or_b32 s7, s7, s8
-; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s10
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s5, s6
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s7, s8
-; GFX11-TRUE16-NEXT: s_and_b32 s7, s20, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s21, 8
-; GFX11-TRUE16-NEXT: s_and_b32 s9, s22, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s23, 8
-; GFX11-TRUE16-NEXT: s_or_b32 s7, s7, s8
-; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s10
-; GFX11-TRUE16-NEXT: s_and_b32 s9, s24, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s25, 8
-; GFX11-TRUE16-NEXT: s_and_b32 s11, s26, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s12, s27, 8
-; GFX11-TRUE16-NEXT: s_or_b32 s9, s9, s10
-; GFX11-TRUE16-NEXT: s_or_b32 s10, s11, s12
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s7, s8
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s9, s10
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v36
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v32
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v35
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v34
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v33
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v68
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v64
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v66
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v4, v67
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v65
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v4, v0, 16, v5
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v38
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v39
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v6, v6, 16, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v49
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v5, v3, 16, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v37
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v70
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v50
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v71
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v48
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v69
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v82
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v7, v80
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v8, v81
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v7, v1, 16, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v53
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v8, v9, 16, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v55
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v51
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v9, v10, 16, v3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v84
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v52
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xff, v54
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v86
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v83
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v96
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v85
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v10, v97
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v10, v1, 16, v0
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v11, v87
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v99
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v12
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v11, v3, 16, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v103
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v114
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v98
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v12, v0, 16, v12
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v100
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v113
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v101
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xff, v116
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v14, v128
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v112
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v117
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v102
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v13, v130
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xff, v133
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v14, v132
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v14, v0, 16, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v148
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xff, v118
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xff, v129
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v16, v161
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v13
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v13, v2, 16, v1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v166
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v144
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v15, v134
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v18, v147
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v16
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v167
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v15
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v15, v17, 16, v19
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v17, v18, 16, v22
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s7
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v1, 16, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v151
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v149
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v16, v20, 16, v21
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, s8
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v180
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v177
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v1, 16, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v165
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v162
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v42
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v41
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v1, 16, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v178
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v115
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v45
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v44
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v1, 16, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v131
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v119
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v59
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v56
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v1, 16, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v145
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v135
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v60
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v61
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v1, 16, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v150
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v146
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v63
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v62
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v1, 16, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v163
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v160
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v73
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v72
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v1, 16, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v176
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v164
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v75
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v74
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v1, 16, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v181
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v179
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v77
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v76
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v1, 16, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v183
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v182
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v78
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v79
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v28, v1, 16, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v43
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v40
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v89
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v88
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v29, v1, 16, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v47
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v46
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v91
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v90
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v30, v1, 16, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v58
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v57
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v92
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v93
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v31, v1, 16, v0
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, s6
-; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
-; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB89_3
-; GFX11-TRUE16-NEXT: .LBB89_2: ; %cmp.true
-; GFX11-TRUE16-NEXT: s_add_i32 s28, s28, 3
-; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s29, 8
-; GFX11-TRUE16-NEXT: s_and_b32 s4, s28, 0xff
-; GFX11-TRUE16-NEXT: s_add_i32 s24, s24, 3
-; GFX11-TRUE16-NEXT: s_or_b32 s4, s5, s4
-; GFX11-TRUE16-NEXT: s_and_b32 s5, s24, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s25, 8
-; GFX11-TRUE16-NEXT: s_add_i32 s26, s26, 3
-; GFX11-TRUE16-NEXT: s_or_b32 s5, s6, s5
-; GFX11-TRUE16-NEXT: s_and_b32 s6, s26, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s27, 8
-; GFX11-TRUE16-NEXT: s_add_i32 s20, s20, 3
-; GFX11-TRUE16-NEXT: s_or_b32 s6, s7, s6
-; GFX11-TRUE16-NEXT: s_and_b32 s7, s20, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s21, 8
-; GFX11-TRUE16-NEXT: s_add_i32 s22, s22, 3
-; GFX11-TRUE16-NEXT: s_or_b32 s7, s8, s7
-; GFX11-TRUE16-NEXT: s_and_b32 s8, s22, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s23, 8
-; GFX11-TRUE16-NEXT: s_add_i32 s16, s16, 3
-; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s8
-; GFX11-TRUE16-NEXT: s_and_b32 s9, s16, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s17, 8
-; GFX11-TRUE16-NEXT: s_add_i32 s18, s18, 3
-; GFX11-TRUE16-NEXT: s_add_i32 s0, s0, 3
-; GFX11-TRUE16-NEXT: s_add_i32 s2, s2, 3
-; GFX11-TRUE16-NEXT: s_or_b32 s9, s10, s9
-; GFX11-TRUE16-NEXT: s_and_b32 s10, s18, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s11, s19, 8
-; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 8
-; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8
-; GFX11-TRUE16-NEXT: s_or_b32 s10, s11, s10
-; GFX11-TRUE16-NEXT: s_or_b32 s0, s1, s0
-; GFX11-TRUE16-NEXT: s_or_b32 s1, s3, s2
-; GFX11-TRUE16-NEXT: s_addk_i32 s9, 0x300
-; GFX11-TRUE16-NEXT: s_addk_i32 s0, 0x300
-; GFX11-TRUE16-NEXT: s_addk_i32 s1, 0x300
-; GFX11-TRUE16-NEXT: s_addk_i32 s10, 0x300
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s1
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s9, s10
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(37)
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v58
-; GFX11-TRUE16-NEXT: s_addk_i32 s5, 0x300
-; GFX11-TRUE16-NEXT: s_addk_i32 s6, 0x300
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v57
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s5, s6
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(35)
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v47
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT: s_addk_i32 s7, 0x300
-; GFX11-TRUE16-NEXT: s_addk_i32 s8, 0x300
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v2
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v92, v0
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s7, s8
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v46
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v93, v1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v91, v2
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(33)
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v43
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v27, 0x300, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v40
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v31, 0x300, v1
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v28, 0x300, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v4
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v90, v3
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(31)
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v183
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v182
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v89, v1
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v30, 0x300, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v4
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(29)
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v181
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v88, v0
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v181, 0x300, v1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v78, v2
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v79, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v29, 0x300, v0
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v179
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v182, 0x300, v1
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v179, 0x300, v2
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v77, v3
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(27)
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v176
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v164
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(25)
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v163
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v163, 0x300, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v2
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v76, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v160
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v75, v1
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v160, 0x300, v0
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v74, v2
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v73, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v22, 0x300, v1
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(23)
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v150
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v26, 0x300, v0
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v23, 0x300, v2
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v72, v3
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v146
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(21)
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v145
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v135
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v25, 0x300, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v2
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v63, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v4
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(19)
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v131
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v62, v0
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v131, 0x300, v1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v60, v2
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v61, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v24, 0x300, v0
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v119
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v135, 0x300, v1
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v119, 0x300, v2
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v59, v3
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(17)
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v178
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v115
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(15)
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v165
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v115, 0x300, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v2
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v56, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v162
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v45, v1
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v145, 0x300, v0
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v44, v2
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v42, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v17, 0x300, v1
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(13)
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v151
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v21, 0x300, v0
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v18, 0x300, v2
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v41, v3
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v149
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(11)
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v148
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v144
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v20, 0x300, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v2
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v180, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v4
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(9)
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v133
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v177, v0
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v19, 0x300, v1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v166, v2
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v167, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v133, 0x300, v0
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v129
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v129, 0x300, v1
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v144, 0x300, v2
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v161, v3
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7)
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v118
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v117
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(5)
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v116
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v116, 0x300, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v2
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v147, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v114
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v99
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v114, 0x300, v0
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v132, v2
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v130, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v103
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v98
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v5
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v54
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v53
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v6
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v39
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v33, 3, v33
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 0x300, v0
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v113, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v2
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v128, v3
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v100
-; GFX11-TRUE16-NEXT: s_addk_i32 s4, 0x300
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 0x300, v4
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v101, v5
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v102, v6
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v7
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v96
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v134, v1
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 0x300, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v13, 0x300, v5
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v97, v6
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v55
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v7
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v52
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, 0x300, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v5
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v87, v6
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v7
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v8
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v51
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v86, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v15, 0x300, v5
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v85, v6
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v84, v7
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v8
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v51, 0x300, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v50
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v50, 0x300, v5
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v49
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v83, v7
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v48
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x300, v6
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 0x300, v5
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v7
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v82, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v8
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v9
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v38
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v81, v5
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v38, 0x300, v4
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v71, v7
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v80, v8
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v9
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 0x300, v5
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v37
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v37, 0x300, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v39, 0x300, v7
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v70, v8
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v36
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v5
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v34
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v34, 3, v35
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v35, 0x300, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v7
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v69, v5
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v8
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v34
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v112, v3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v68, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v34, 0x300, v5
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v67, v7
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v66, v8
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v33
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v33, 0x300, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v32
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v32, 0x300, v5
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 0x300, v7
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v65, v8
-; GFX11-TRUE16-NEXT: v_and_b32_e64 v8, 0xffff, s4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff, v6
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 0x300, v7
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v64, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v14
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v10, v10, 16, v36
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v5, v7, 16, v5
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v33
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x300, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v37
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff, v22
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v6, v32, 16, v7
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v4, v4, 16, v8
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v35
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v51
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v38
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v19
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v7, v34, 16, v8
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v8, v39, 16, v33
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v12
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v11
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v11, v50, 16, v32
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v9, v9, 16, v35
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v12, v15, 16, v14
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v13, v13, 16, v33
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v14, v3, 16, v34
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v16, v16, 16, v32
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v116
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v129
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v18
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v17
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v26, 16, v36
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v17, v114, 16, v32
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v144, 16, v33
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v20, 16, v34
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v21, 16, v35
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v115
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v135
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v131
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v23
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff, v27
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v145, 16, v32
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v119, 16, v33
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v24, 16, v34
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v25, 16, v35
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v163
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v182
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v181
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v28
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v15, v2, 16, v0
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v133, 16, v19
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v160, 16, v32
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v28, v179, 16, v33
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v29, v29, 16, v34
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v30, v30, 16, v35
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v31, v31, 16, v36
-; GFX11-TRUE16-NEXT: .LBB89_3: ; %end
-; GFX11-TRUE16-NEXT: s_clause 0x1e
-; GFX11-TRUE16-NEXT: scratch_load_b32 v94, off, s32 offset:320
-; GFX11-TRUE16-NEXT: scratch_load_b32 v93, off, s32 offset:324
-; GFX11-TRUE16-NEXT: scratch_load_b32 v92, off, s32 offset:328
-; GFX11-TRUE16-NEXT: scratch_load_b32 v91, off, s32 offset:332
-; GFX11-TRUE16-NEXT: scratch_load_b32 v90, off, s32 offset:336
-; GFX11-TRUE16-NEXT: scratch_load_b32 v89, off, s32 offset:340
-; GFX11-TRUE16-NEXT: scratch_load_b32 v88, off, s32 offset:344
-; GFX11-TRUE16-NEXT: scratch_load_b32 v79, off, s32 offset:348
-; GFX11-TRUE16-NEXT: scratch_load_b32 v78, off, s32 offset:352
-; GFX11-TRUE16-NEXT: scratch_load_b32 v77, off, s32 offset:356
-; GFX11-TRUE16-NEXT: scratch_load_b32 v76, off, s32 offset:360
-; GFX11-TRUE16-NEXT: scratch_load_b32 v75, off, s32 offset:364
-; GFX11-TRUE16-NEXT: scratch_load_b32 v74, off, s32 offset:368
-; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:372
-; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:376
-; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:380
-; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:384
-; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:388
-; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:392
-; GFX11-TRUE16-NEXT: scratch_load_b32 v59, off, s32 offset:396
-; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:400
-; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:404
-; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:408
-; GFX11-TRUE16-NEXT: scratch_load_b32 v47, off, s32 offset:412
-; GFX11-TRUE16-NEXT: scratch_load_b32 v46, off, s32 offset:416
-; GFX11-TRUE16-NEXT: scratch_load_b32 v45, off, s32 offset:420
-; GFX11-TRUE16-NEXT: scratch_load_b32 v44, off, s32 offset:424
-; GFX11-TRUE16-NEXT: scratch_load_b32 v43, off, s32 offset:428
-; GFX11-TRUE16-NEXT: scratch_load_b32 v42, off, s32 offset:432
-; GFX11-TRUE16-NEXT: scratch_load_b32 v41, off, s32 offset:436
-; GFX11-TRUE16-NEXT: scratch_load_b32 v40, off, s32 offset:440
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
-; GFX11-TRUE16-NEXT: .LBB89_4:
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
-; GFX11-TRUE16-NEXT: s_branch .LBB89_2
-;
-; GFX11-FAKE16-LABEL: bitcast_v128i8_to_v64bf16_scalar:
-; GFX11-FAKE16: ; %bb.0:
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-FAKE16-NEXT: s_clause 0x1e
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:440
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:436
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:432
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v43, s32 offset:428
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v44, s32 offset:424
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v45, s32 offset:420
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v46, s32 offset:416
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v47, s32 offset:412
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v56, s32 offset:408
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v57, s32 offset:404
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v58, s32 offset:400
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v59, s32 offset:396
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v60, s32 offset:392
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v61, s32 offset:388
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v62, s32 offset:384
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v63, s32 offset:380
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v72, s32 offset:376
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v73, s32 offset:372
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v74, s32 offset:368
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v75, s32 offset:364
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v76, s32 offset:360
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v77, s32 offset:356
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v78, s32 offset:352
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v79, s32 offset:348
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v88, s32 offset:344
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v89, s32 offset:340
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v90, s32 offset:336
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v91, s32 offset:332
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v92, s32 offset:328
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v93, s32 offset:324
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v94, s32 offset:320
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v54, v30 :: v_dual_mov_b32 v51, v24
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v52, v28 :: v_dual_mov_b32 v55, v26
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v53, v22 :: v_dual_mov_b32 v48, v20
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v50, v18 :: v_dual_mov_b32 v39, v16
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v49, v14 :: v_dual_mov_b32 v34, v8
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v37, v12 :: v_dual_mov_b32 v38, v10
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v36, v6 :: v_dual_mov_b32 v33, v4
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v35, v2 :: v_dual_mov_b32 v32, v0
-; GFX11-FAKE16-NEXT: s_clause 0x1f
-; GFX11-FAKE16-NEXT: scratch_load_b32 v0, off, s32 offset:316
-; GFX11-FAKE16-NEXT: scratch_load_u16 v2, off, s32
-; GFX11-FAKE16-NEXT: scratch_load_u16 v4, off, s32 offset:8
-; GFX11-FAKE16-NEXT: scratch_load_u16 v6, off, s32 offset:16
-; GFX11-FAKE16-NEXT: scratch_load_u16 v8, off, s32 offset:24
-; GFX11-FAKE16-NEXT: scratch_load_u16 v10, off, s32 offset:32
-; GFX11-FAKE16-NEXT: scratch_load_u16 v12, off, s32 offset:40
-; GFX11-FAKE16-NEXT: scratch_load_u16 v14, off, s32 offset:48
-; GFX11-FAKE16-NEXT: scratch_load_u16 v16, off, s32 offset:56
-; GFX11-FAKE16-NEXT: scratch_load_u16 v18, off, s32 offset:64
-; GFX11-FAKE16-NEXT: scratch_load_u16 v20, off, s32 offset:72
-; GFX11-FAKE16-NEXT: scratch_load_u16 v22, off, s32 offset:80
-; GFX11-FAKE16-NEXT: scratch_load_u16 v24, off, s32 offset:88
-; GFX11-FAKE16-NEXT: scratch_load_u16 v26, off, s32 offset:96
-; GFX11-FAKE16-NEXT: scratch_load_u16 v28, off, s32 offset:104
-; GFX11-FAKE16-NEXT: scratch_load_u16 v30, off, s32 offset:112
-; GFX11-FAKE16-NEXT: scratch_load_u16 v31, off, s32 offset:120
-; GFX11-FAKE16-NEXT: scratch_load_u16 v41, off, s32 offset:128
-; GFX11-FAKE16-NEXT: scratch_load_u16 v44, off, s32 offset:136
-; GFX11-FAKE16-NEXT: scratch_load_u16 v45, off, s32 offset:144
-; GFX11-FAKE16-NEXT: scratch_load_u16 v56, off, s32 offset:152
-; GFX11-FAKE16-NEXT: scratch_load_u16 v59, off, s32 offset:160
-; GFX11-FAKE16-NEXT: scratch_load_u16 v60, off, s32 offset:168
-; GFX11-FAKE16-NEXT: scratch_load_u16 v61, off, s32 offset:176
-; GFX11-FAKE16-NEXT: scratch_load_u16 v62, off, s32 offset:184
-; GFX11-FAKE16-NEXT: scratch_load_u16 v63, off, s32 offset:192
-; GFX11-FAKE16-NEXT: scratch_load_u16 v72, off, s32 offset:200
-; GFX11-FAKE16-NEXT: scratch_load_u16 v73, off, s32 offset:208
-; GFX11-FAKE16-NEXT: scratch_load_u16 v74, off, s32 offset:216
-; GFX11-FAKE16-NEXT: scratch_load_u16 v75, off, s32 offset:224
-; GFX11-FAKE16-NEXT: scratch_load_u16 v76, off, s32 offset:232
-; GFX11-FAKE16-NEXT: scratch_load_u16 v77, off, s32 offset:240
-; GFX11-FAKE16-NEXT: s_clause 0x1f
-; GFX11-FAKE16-NEXT: scratch_load_u16 v78, off, s32 offset:248
-; GFX11-FAKE16-NEXT: scratch_load_u16 v79, off, s32 offset:256
-; GFX11-FAKE16-NEXT: scratch_load_u16 v88, off, s32 offset:264
-; GFX11-FAKE16-NEXT: scratch_load_u16 v89, off, s32 offset:272
-; GFX11-FAKE16-NEXT: scratch_load_u16 v90, off, s32 offset:280
-; GFX11-FAKE16-NEXT: scratch_load_u16 v91, off, s32 offset:288
-; GFX11-FAKE16-NEXT: scratch_load_u16 v92, off, s32 offset:296
-; GFX11-FAKE16-NEXT: scratch_load_u16 v93, off, s32 offset:304
-; GFX11-FAKE16-NEXT: scratch_load_u16 v94, off, s32 offset:312
-; GFX11-FAKE16-NEXT: scratch_load_u16 v57, off, s32 offset:308
-; GFX11-FAKE16-NEXT: scratch_load_u16 v58, off, s32 offset:300
-; GFX11-FAKE16-NEXT: scratch_load_u16 v46, off, s32 offset:292
-; GFX11-FAKE16-NEXT: scratch_load_u16 v47, off, s32 offset:284
-; GFX11-FAKE16-NEXT: scratch_load_u16 v40, off, s32 offset:276
-; GFX11-FAKE16-NEXT: scratch_load_u16 v43, off, s32 offset:268
-; GFX11-FAKE16-NEXT: scratch_load_u16 v182, off, s32 offset:260
-; GFX11-FAKE16-NEXT: scratch_load_u16 v183, off, s32 offset:252
-; GFX11-FAKE16-NEXT: scratch_load_u16 v178, off, s32 offset:244
-; GFX11-FAKE16-NEXT: scratch_load_u16 v181, off, s32 offset:236
-; GFX11-FAKE16-NEXT: scratch_load_u16 v164, off, s32 offset:228
-; GFX11-FAKE16-NEXT: scratch_load_u16 v176, off, s32 offset:220
-; GFX11-FAKE16-NEXT: scratch_load_u16 v160, off, s32 offset:212
-; GFX11-FAKE16-NEXT: scratch_load_u16 v163, off, s32 offset:204
-; GFX11-FAKE16-NEXT: scratch_load_u16 v146, off, s32 offset:196
-; GFX11-FAKE16-NEXT: scratch_load_u16 v150, off, s32 offset:188
-; GFX11-FAKE16-NEXT: scratch_load_u16 v135, off, s32 offset:180
-; GFX11-FAKE16-NEXT: scratch_load_u16 v145, off, s32 offset:172
-; GFX11-FAKE16-NEXT: scratch_load_u16 v118, off, s32 offset:164
-; GFX11-FAKE16-NEXT: scratch_load_u16 v131, off, s32 offset:156
-; GFX11-FAKE16-NEXT: scratch_load_u16 v115, off, s32 offset:148
-; GFX11-FAKE16-NEXT: scratch_load_u16 v179, off, s32 offset:140
-; GFX11-FAKE16-NEXT: scratch_load_u16 v162, off, s32 offset:132
-; GFX11-FAKE16-NEXT: s_clause 0xf
-; GFX11-FAKE16-NEXT: scratch_load_u16 v165, off, s32 offset:124
-; GFX11-FAKE16-NEXT: scratch_load_u16 v149, off, s32 offset:116
-; GFX11-FAKE16-NEXT: scratch_load_u16 v151, off, s32 offset:108
-; GFX11-FAKE16-NEXT: scratch_load_u16 v144, off, s32 offset:100
-; GFX11-FAKE16-NEXT: scratch_load_u16 v148, off, s32 offset:92
-; GFX11-FAKE16-NEXT: scratch_load_u16 v129, off, s32 offset:84
-; GFX11-FAKE16-NEXT: scratch_load_u16 v133, off, s32 offset:76
-; GFX11-FAKE16-NEXT: scratch_load_u16 v117, off, s32 offset:68
-; GFX11-FAKE16-NEXT: scratch_load_u16 v119, off, s32 offset:60
-; GFX11-FAKE16-NEXT: scratch_load_u16 v114, off, s32 offset:52
-; GFX11-FAKE16-NEXT: scratch_load_u16 v116, off, s32 offset:44
-; GFX11-FAKE16-NEXT: scratch_load_u16 v100, off, s32 offset:36
-; GFX11-FAKE16-NEXT: scratch_load_u16 v103, off, s32 offset:28
-; GFX11-FAKE16-NEXT: scratch_load_u16 v98, off, s32 offset:20
-; GFX11-FAKE16-NEXT: scratch_load_u16 v99, off, s32 offset:12
-; GFX11-FAKE16-NEXT: scratch_load_u16 v96, off, s32 offset:4
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v64, 8, v1
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v66, 8, v3
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v65, 8, v5
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v68, 8, v7
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v67, 8, v9
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v70, 8, v11
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v69, 8, v13
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v71, 8, v15
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v80, 8, v17
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v82, 8, v19
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v81, 8, v21
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v84, 8, v23
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v83, 8, v25
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v86, 8, v27
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v85, 8, v29
-; GFX11-FAKE16-NEXT: s_mov_b32 s4, 0
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(62)
-; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v97, 8, v2
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v87, 8, v4
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v101, 8, v6
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v102, 8, v8
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v113, 8, v10
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v112, 8, v12
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v130, 8, v14
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v128, 8, v16
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v134, 8, v18
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v132, 8, v20
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v161, 8, v22
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v147, 8, v24
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v166, 8, v26
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v167, 8, v28
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v180, 8, v30
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v177, 8, v31
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v42, 8, v41
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(61)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v41, 8, v44
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(60)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v45, 8, v45
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(59)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v44, 8, v56
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(58)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v59, 8, v59
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(57)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v56, 8, v60
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(56)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v60, 8, v61
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(55)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v61, 8, v62
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(54)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v63, 8, v63
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(53)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v62, 8, v72
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(52)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v73, 8, v73
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(51)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v72, 8, v74
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(50)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v75, 8, v75
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(49)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v74, 8, v76
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(48)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v77, 8, v77
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(47)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v76, 8, v78
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(46)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v78, 8, v79
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(45)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v79, 8, v88
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(44)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v89, 8, v89
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(43)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v88, 8, v90
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(42)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v91, 8, v91
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(41)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v90, 8, v92
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(40)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v92, 8, v93
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(39)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v93, 8, v94
-; GFX11-FAKE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo
-; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB89_4
-; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-FAKE16-NEXT: s_and_b32 s5, s28, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s29, 8
-; GFX11-FAKE16-NEXT: s_and_b32 s7, s2, 0xff
-; GFX11-FAKE16-NEXT: s_or_b32 s5, s5, s6
-; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s1, 8
-; GFX11-FAKE16-NEXT: v_and_b32_e64 v5, 0xffff, s5
-; GFX11-FAKE16-NEXT: s_and_b32 s5, s0, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s3, 8
-; GFX11-FAKE16-NEXT: s_or_b32 s5, s5, s6
-; GFX11-FAKE16-NEXT: s_or_b32 s6, s7, s8
-; GFX11-FAKE16-NEXT: s_and_b32 s7, s16, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s17, 8
-; GFX11-FAKE16-NEXT: s_and_b32 s9, s18, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s19, 8
-; GFX11-FAKE16-NEXT: s_or_b32 s7, s7, s8
-; GFX11-FAKE16-NEXT: s_or_b32 s8, s9, s10
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s5, s6
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s7, s8
-; GFX11-FAKE16-NEXT: s_and_b32 s7, s20, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s21, 8
-; GFX11-FAKE16-NEXT: s_and_b32 s9, s22, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s23, 8
-; GFX11-FAKE16-NEXT: s_or_b32 s7, s7, s8
-; GFX11-FAKE16-NEXT: s_or_b32 s8, s9, s10
-; GFX11-FAKE16-NEXT: s_and_b32 s9, s24, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s25, 8
-; GFX11-FAKE16-NEXT: s_and_b32 s11, s26, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s12, s27, 8
-; GFX11-FAKE16-NEXT: s_or_b32 s9, s9, s10
-; GFX11-FAKE16-NEXT: s_or_b32 s10, s11, s12
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s7, s8
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s9, s10
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v36
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v32
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v35
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v34
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v33
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v68
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v64
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v66
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v4, v67
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v65
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v4, v0, 16, v5
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v38
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v39
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v6, v6, 16, v2
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v49
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v5, v3, 16, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v37
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v70
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v50
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v71
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v48
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v69
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v82
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v7, v80
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v8, v81
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v7, v1, 16, v0
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v53
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v8, v9, 16, v2
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v55
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v51
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v9, v10, 16, v3
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v84
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v52
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v54
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v86
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v83
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v96
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v85
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v10, v97
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v10, v1, 16, v0
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v11, v87
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v99
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff, v12
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v11, v3, 16, v2
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v103
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v114
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v98
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v12, v0, 16, v12
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v100
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v113
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v101
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v116
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v14, v128
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v112
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v117
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v102
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v13, v130
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v133
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v14, v132
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v14, v0, 16, v3
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v148
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v119
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xff, v129
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v16, v161
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v13
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v13, v2, 16, v1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v166
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v144
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v15, v134
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v18, v147
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xffff, v16
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v167
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff, v15
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v17, 16, v19
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v17, v18, 16, v22
-; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, s7
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v1, 16, v0
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v151
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v149
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v16, v20, 16, v21
-; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, s8
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v180
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v177
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v1, 16, v0
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v165
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v162
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v42
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v41
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v1, 16, v0
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v179
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v115
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v45
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v44
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v1, 16, v0
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v131
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v118
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v59
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v56
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v1, 16, v0
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v145
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v135
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v60
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v61
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v1, 16, v0
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v150
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v146
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v63
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v62
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v24, v1, 16, v0
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v163
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v160
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v73
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v72
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v1, 16, v0
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v176
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v164
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v75
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v74
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v26, v1, 16, v0
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v181
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v178
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v77
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v76
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v27, v1, 16, v0
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v183
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v182
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v78
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v79
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v28, v1, 16, v0
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v43
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v40
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v89
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v88
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v29, v1, 16, v0
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v47
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v46
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v91
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v90
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v30, v1, 16, v0
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v58
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v57
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v92
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v93
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v31, v1, 16, v0
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, s6
-; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
-; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB89_3
-; GFX11-FAKE16-NEXT: .LBB89_2: ; %cmp.true
-; GFX11-FAKE16-NEXT: s_add_i32 s28, s28, 3
-; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s29, 8
-; GFX11-FAKE16-NEXT: s_and_b32 s4, s28, 0xff
-; GFX11-FAKE16-NEXT: s_add_i32 s24, s24, 3
-; GFX11-FAKE16-NEXT: s_or_b32 s4, s5, s4
-; GFX11-FAKE16-NEXT: s_and_b32 s5, s24, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s25, 8
-; GFX11-FAKE16-NEXT: s_add_i32 s26, s26, 3
-; GFX11-FAKE16-NEXT: s_or_b32 s5, s6, s5
-; GFX11-FAKE16-NEXT: s_and_b32 s6, s26, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s27, 8
-; GFX11-FAKE16-NEXT: s_add_i32 s20, s20, 3
-; GFX11-FAKE16-NEXT: s_or_b32 s6, s7, s6
-; GFX11-FAKE16-NEXT: s_and_b32 s7, s20, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s21, 8
-; GFX11-FAKE16-NEXT: s_add_i32 s22, s22, 3
-; GFX11-FAKE16-NEXT: s_or_b32 s7, s8, s7
-; GFX11-FAKE16-NEXT: s_and_b32 s8, s22, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s9, s23, 8
-; GFX11-FAKE16-NEXT: s_add_i32 s16, s16, 3
-; GFX11-FAKE16-NEXT: s_or_b32 s8, s9, s8
-; GFX11-FAKE16-NEXT: s_and_b32 s9, s16, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s17, 8
-; GFX11-FAKE16-NEXT: s_add_i32 s18, s18, 3
-; GFX11-FAKE16-NEXT: s_add_i32 s0, s0, 3
-; GFX11-FAKE16-NEXT: s_add_i32 s2, s2, 3
-; GFX11-FAKE16-NEXT: s_or_b32 s9, s10, s9
-; GFX11-FAKE16-NEXT: s_and_b32 s10, s18, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s11, s19, 8
-; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 8
-; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 8
-; GFX11-FAKE16-NEXT: s_or_b32 s10, s11, s10
-; GFX11-FAKE16-NEXT: s_or_b32 s0, s1, s0
-; GFX11-FAKE16-NEXT: s_or_b32 s1, s3, s2
-; GFX11-FAKE16-NEXT: s_addk_i32 s9, 0x300
-; GFX11-FAKE16-NEXT: s_addk_i32 s0, 0x300
-; GFX11-FAKE16-NEXT: s_addk_i32 s1, 0x300
-; GFX11-FAKE16-NEXT: s_addk_i32 s10, 0x300
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s1
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s9, s10
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(37)
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v58
-; GFX11-FAKE16-NEXT: s_addk_i32 s5, 0x300
-; GFX11-FAKE16-NEXT: s_addk_i32 s6, 0x300
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v57
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s5, s6
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(35)
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v47
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-FAKE16-NEXT: s_addk_i32 s7, 0x300
-; GFX11-FAKE16-NEXT: s_addk_i32 s8, 0x300
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v92, v0
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s7, s8
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v46
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v93, v1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v91, v2
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(33)
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v43
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v27, 0x300, v0
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v40
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v31, 0x300, v1
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v28, 0x300, v2
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v4
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v90, v3
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(31)
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v183
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v182
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v89, v1
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v30, 0x300, v3
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v4
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(29)
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v181
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v88, v0
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v181, 0x300, v1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v78, v2
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v79, v3
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v4
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v29, 0x300, v0
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v178
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v182, 0x300, v1
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v178, 0x300, v2
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v77, v3
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(27)
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v176
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v164
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(25)
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v163
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v163, 0x300, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v2
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v76, v0
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v3
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v4
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v160
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v75, v1
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v160, 0x300, v0
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v74, v2
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v73, v3
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v4
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v22, 0x300, v1
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(23)
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v150
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v26, 0x300, v0
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v23, 0x300, v2
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v72, v3
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v146
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(21)
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v145
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v135
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v25, 0x300, v0
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v2
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v63, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v3
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v4
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(19)
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v131
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v62, v0
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v131, 0x300, v1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v60, v2
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v61, v3
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v4
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v24, 0x300, v0
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v118
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v135, 0x300, v1
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v118, 0x300, v2
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v59, v3
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(17)
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v179
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v115
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(15)
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v165
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v115, 0x300, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v2
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v56, v0
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v3
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v4
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v162
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v45, v1
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v145, 0x300, v0
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v44, v2
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v42, v3
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v4
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 0x300, v1
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(13)
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v151
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v21, 0x300, v0
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v18, 0x300, v2
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v41, v3
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v149
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(11)
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v148
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v144
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v20, 0x300, v0
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v2
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v180, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v3
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v4
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(9)
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v133
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v177, v0
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v19, 0x300, v1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v166, v2
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v167, v3
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v4
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v133, 0x300, v0
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v129
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v129, 0x300, v1
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v144, 0x300, v2
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v161, v3
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(7)
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v119
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v117
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(5)
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v116
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v116, 0x300, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v2
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v147, v0
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v3
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v4
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v114
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1)
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 3, v99
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v114, 0x300, v0
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v132, v2
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v130, v3
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v4
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v103
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 3, v98
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v5
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 3, v54
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 3, v53
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v6
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, 3, v39
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v33, 3, v33
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v16, 0x300, v0
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v113, v4
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v2
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v128, v3
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v100
-; GFX11-FAKE16-NEXT: s_addk_i32 s4, 0x300
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, 0x300, v4
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v101, v5
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v102, v6
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v7
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 3, v96
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v134, v1
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, 0x300, v4
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v13, 0x300, v5
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v97, v6
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 3, v55
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v7
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 3, v52
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v14, 0x300, v4
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v5
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v87, v6
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v7
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v8
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 3, v51
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v86, v4
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v15, 0x300, v5
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v85, v6
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v84, v7
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v8
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v51, 0x300, v4
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v50
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v50, 0x300, v5
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 3, v49
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v83, v7
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 3, v48
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 0x300, v6
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, 0x300, v5
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v7
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v82, v4
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v8
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v9
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, 3, v38
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v81, v5
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v38, 0x300, v4
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v71, v7
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v80, v8
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v9
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, 0x300, v5
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 3, v37
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v37, 0x300, v4
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v39, 0x300, v7
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v70, v8
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 3, v36
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v5
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 3, v34
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, 3, v35
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v35, 0x300, v4
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v7
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v69, v5
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v8
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v34
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v112, v3
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v68, v4
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, 0x300, v5
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v67, v7
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v66, v8
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v33
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v33, 0x300, v4
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v32
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v32, 0x300, v5
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 0x300, v7
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v65, v8
-; GFX11-FAKE16-NEXT: v_and_b32_e64 v8, 0xffff, s4
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xffff, v6
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 0x300, v7
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v64, v4
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xffff, v14
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v10, v10, 16, v36
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v5, v7, 16, v5
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v33
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x300, v4
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xffff, v37
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xffff, v22
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v6, v32, 16, v7
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v4, v4, 16, v8
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff, v35
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xffff, v51
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xffff, v38
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v19
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v7, v34, 16, v8
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v8, v39, 16, v33
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xffff, v12
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xffff, v11
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v11, v50, 16, v32
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xffff, v1
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v9, v9, 16, v35
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v12, v15, 16, v14
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v13, v13, 16, v33
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v14, v3, 16, v34
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v16, v16, 16, v32
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xffff, v116
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xffff, v129
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xffff, v18
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xffff, v17
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v26, v26, 16, v36
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v17, v114, 16, v32
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v144, 16, v33
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v20, 16, v34
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v21, 16, v35
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xffff, v115
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xffff, v135
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xffff, v131
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xffff, v23
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xffff, v27
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v145, 16, v32
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v118, 16, v33
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v24, v24, 16, v34
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v25, 16, v35
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xffff, v163
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xffff, v182
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xffff, v181
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xffff, v28
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v2, 16, v0
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v133, 16, v19
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v27, v160, 16, v32
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v28, v178, 16, v33
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v29, v29, 16, v34
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v30, v30, 16, v35
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v31, v31, 16, v36
-; GFX11-FAKE16-NEXT: .LBB89_3: ; %end
-; GFX11-FAKE16-NEXT: s_clause 0x1e
-; GFX11-FAKE16-NEXT: scratch_load_b32 v94, off, s32 offset:320
-; GFX11-FAKE16-NEXT: scratch_load_b32 v93, off, s32 offset:324
-; GFX11-FAKE16-NEXT: scratch_load_b32 v92, off, s32 offset:328
-; GFX11-FAKE16-NEXT: scratch_load_b32 v91, off, s32 offset:332
-; GFX11-FAKE16-NEXT: scratch_load_b32 v90, off, s32 offset:336
-; GFX11-FAKE16-NEXT: scratch_load_b32 v89, off, s32 offset:340
-; GFX11-FAKE16-NEXT: scratch_load_b32 v88, off, s32 offset:344
-; GFX11-FAKE16-NEXT: scratch_load_b32 v79, off, s32 offset:348
-; GFX11-FAKE16-NEXT: scratch_load_b32 v78, off, s32 offset:352
-; GFX11-FAKE16-NEXT: scratch_load_b32 v77, off, s32 offset:356
-; GFX11-FAKE16-NEXT: scratch_load_b32 v76, off, s32 offset:360
-; GFX11-FAKE16-NEXT: scratch_load_b32 v75, off, s32 offset:364
-; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:368
-; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:372
-; GFX11-FAKE16-NEXT: scratch_load_b32 v72, off, s32 offset:376
-; GFX11-FAKE16-NEXT: scratch_load_b32 v63, off, s32 offset:380
-; GFX11-FAKE16-NEXT: scratch_load_b32 v62, off, s32 offset:384
-; GFX11-FAKE16-NEXT: scratch_load_b32 v61, off, s32 offset:388
-; GFX11-FAKE16-NEXT: scratch_load_b32 v60, off, s32 offset:392
-; GFX11-FAKE16-NEXT: scratch_load_b32 v59, off, s32 offset:396
-; GFX11-FAKE16-NEXT: scratch_load_b32 v58, off, s32 offset:400
-; GFX11-FAKE16-NEXT: scratch_load_b32 v57, off, s32 offset:404
-; GFX11-FAKE16-NEXT: scratch_load_b32 v56, off, s32 offset:408
-; GFX11-FAKE16-NEXT: scratch_load_b32 v47, off, s32 offset:412
-; GFX11-FAKE16-NEXT: scratch_load_b32 v46, off, s32 offset:416
-; GFX11-FAKE16-NEXT: scratch_load_b32 v45, off, s32 offset:420
-; GFX11-FAKE16-NEXT: scratch_load_b32 v44, off, s32 offset:424
-; GFX11-FAKE16-NEXT: scratch_load_b32 v43, off, s32 offset:428
-; GFX11-FAKE16-NEXT: scratch_load_b32 v42, off, s32 offset:432
-; GFX11-FAKE16-NEXT: scratch_load_b32 v41, off, s32 offset:436
-; GFX11-FAKE16-NEXT: scratch_load_b32 v40, off, s32 offset:440
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
-; GFX11-FAKE16-NEXT: .LBB89_4:
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
-; GFX11-FAKE16-NEXT: s_branch .LBB89_2
+; GFX11-LABEL: bitcast_v128i8_to_v64bf16_scalar:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_clause 0x1e
+; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:440
+; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:436
+; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:432
+; GFX11-NEXT: scratch_store_b32 off, v43, s32 offset:428
+; GFX11-NEXT: scratch_store_b32 off, v44, s32 offset:424
+; GFX11-NEXT: scratch_store_b32 off, v45, s32 offset:420
+; GFX11-NEXT: scratch_store_b32 off, v46, s32 offset:416
+; GFX11-NEXT: scratch_store_b32 off, v47, s32 offset:412
+; GFX11-NEXT: scratch_store_b32 off, v56, s32 offset:408
+; GFX11-NEXT: scratch_store_b32 off, v57, s32 offset:404
+; GFX11-NEXT: scratch_store_b32 off, v58, s32 offset:400
+; GFX11-NEXT: scratch_store_b32 off, v59, s32 offset:396
+; GFX11-NEXT: scratch_store_b32 off, v60, s32 offset:392
+; GFX11-NEXT: scratch_store_b32 off, v61, s32 offset:388
+; GFX11-NEXT: scratch_store_b32 off, v62, s32 offset:384
+; GFX11-NEXT: scratch_store_b32 off, v63, s32 offset:380
+; GFX11-NEXT: scratch_store_b32 off, v72, s32 offset:376
+; GFX11-NEXT: scratch_store_b32 off, v73, s32 offset:372
+; GFX11-NEXT: scratch_store_b32 off, v74, s32 offset:368
+; GFX11-NEXT: scratch_store_b32 off, v75, s32 offset:364
+; GFX11-NEXT: scratch_store_b32 off, v76, s32 offset:360
+; GFX11-NEXT: scratch_store_b32 off, v77, s32 offset:356
+; GFX11-NEXT: scratch_store_b32 off, v78, s32 offset:352
+; GFX11-NEXT: scratch_store_b32 off, v79, s32 offset:348
+; GFX11-NEXT: scratch_store_b32 off, v88, s32 offset:344
+; GFX11-NEXT: scratch_store_b32 off, v89, s32 offset:340
+; GFX11-NEXT: scratch_store_b32 off, v90, s32 offset:336
+; GFX11-NEXT: scratch_store_b32 off, v91, s32 offset:332
+; GFX11-NEXT: scratch_store_b32 off, v92, s32 offset:328
+; GFX11-NEXT: scratch_store_b32 off, v93, s32 offset:324
+; GFX11-NEXT: scratch_store_b32 off, v94, s32 offset:320
+; GFX11-NEXT: v_dual_mov_b32 v54, v30 :: v_dual_mov_b32 v51, v24
+; GFX11-NEXT: v_dual_mov_b32 v52, v28 :: v_dual_mov_b32 v55, v26
+; GFX11-NEXT: v_dual_mov_b32 v53, v22 :: v_dual_mov_b32 v48, v20
+; GFX11-NEXT: v_dual_mov_b32 v50, v18 :: v_dual_mov_b32 v39, v16
+; GFX11-NEXT: v_dual_mov_b32 v49, v14 :: v_dual_mov_b32 v34, v8
+; GFX11-NEXT: v_dual_mov_b32 v37, v12 :: v_dual_mov_b32 v38, v10
+; GFX11-NEXT: v_dual_mov_b32 v36, v6 :: v_dual_mov_b32 v33, v4
+; GFX11-NEXT: v_dual_mov_b32 v35, v2 :: v_dual_mov_b32 v32, v0
+; GFX11-NEXT: s_clause 0x1f
+; GFX11-NEXT: scratch_load_b32 v0, off, s32 offset:316
+; GFX11-NEXT: scratch_load_u16 v2, off, s32
+; GFX11-NEXT: scratch_load_u16 v4, off, s32 offset:8
+; GFX11-NEXT: scratch_load_u16 v6, off, s32 offset:16
+; GFX11-NEXT: scratch_load_u16 v8, off, s32 offset:24
+; GFX11-NEXT: scratch_load_u16 v10, off, s32 offset:32
+; GFX11-NEXT: scratch_load_u16 v12, off, s32 offset:40
+; GFX11-NEXT: scratch_load_u16 v14, off, s32 offset:48
+; GFX11-NEXT: scratch_load_u16 v16, off, s32 offset:56
+; GFX11-NEXT: scratch_load_u16 v18, off, s32 offset:64
+; GFX11-NEXT: scratch_load_u16 v20, off, s32 offset:72
+; GFX11-NEXT: scratch_load_u16 v22, off, s32 offset:80
+; GFX11-NEXT: scratch_load_u16 v24, off, s32 offset:88
+; GFX11-NEXT: scratch_load_u16 v26, off, s32 offset:96
+; GFX11-NEXT: scratch_load_u16 v28, off, s32 offset:104
+; GFX11-NEXT: scratch_load_u16 v30, off, s32 offset:112
+; GFX11-NEXT: scratch_load_u16 v31, off, s32 offset:120
+; GFX11-NEXT: scratch_load_u16 v41, off, s32 offset:128
+; GFX11-NEXT: scratch_load_u16 v44, off, s32 offset:136
+; GFX11-NEXT: scratch_load_u16 v45, off, s32 offset:144
+; GFX11-NEXT: scratch_load_u16 v56, off, s32 offset:152
+; GFX11-NEXT: scratch_load_u16 v59, off, s32 offset:160
+; GFX11-NEXT: scratch_load_u16 v60, off, s32 offset:168
+; GFX11-NEXT: scratch_load_u16 v61, off, s32 offset:176
+; GFX11-NEXT: scratch_load_u16 v62, off, s32 offset:184
+; GFX11-NEXT: scratch_load_u16 v63, off, s32 offset:192
+; GFX11-NEXT: scratch_load_u16 v72, off, s32 offset:200
+; GFX11-NEXT: scratch_load_u16 v73, off, s32 offset:208
+; GFX11-NEXT: scratch_load_u16 v74, off, s32 offset:216
+; GFX11-NEXT: scratch_load_u16 v75, off, s32 offset:224
+; GFX11-NEXT: scratch_load_u16 v76, off, s32 offset:232
+; GFX11-NEXT: scratch_load_u16 v77, off, s32 offset:240
+; GFX11-NEXT: s_clause 0x1f
+; GFX11-NEXT: scratch_load_u16 v78, off, s32 offset:248
+; GFX11-NEXT: scratch_load_u16 v79, off, s32 offset:256
+; GFX11-NEXT: scratch_load_u16 v88, off, s32 offset:264
+; GFX11-NEXT: scratch_load_u16 v89, off, s32 offset:272
+; GFX11-NEXT: scratch_load_u16 v90, off, s32 offset:280
+; GFX11-NEXT: scratch_load_u16 v91, off, s32 offset:288
+; GFX11-NEXT: scratch_load_u16 v92, off, s32 offset:296
+; GFX11-NEXT: scratch_load_u16 v93, off, s32 offset:304
+; GFX11-NEXT: scratch_load_u16 v94, off, s32 offset:312
+; GFX11-NEXT: scratch_load_u16 v57, off, s32 offset:308
+; GFX11-NEXT: scratch_load_u16 v58, off, s32 offset:300
+; GFX11-NEXT: scratch_load_u16 v46, off, s32 offset:292
+; GFX11-NEXT: scratch_load_u16 v47, off, s32 offset:284
+; GFX11-NEXT: scratch_load_u16 v40, off, s32 offset:276
+; GFX11-NEXT: scratch_load_u16 v43, off, s32 offset:268
+; GFX11-NEXT: scratch_load_u16 v182, off, s32 offset:260
+; GFX11-NEXT: scratch_load_u16 v183, off, s32 offset:252
+; GFX11-NEXT: scratch_load_u16 v178, off, s32 offset:244
+; GFX11-NEXT: scratch_load_u16 v181, off, s32 offset:236
+; GFX11-NEXT: scratch_load_u16 v164, off, s32 offset:228
+; GFX11-NEXT: scratch_load_u16 v176, off, s32 offset:220
+; GFX11-NEXT: scratch_load_u16 v160, off, s32 offset:212
+; GFX11-NEXT: scratch_load_u16 v163, off, s32 offset:204
+; GFX11-NEXT: scratch_load_u16 v146, off, s32 offset:196
+; GFX11-NEXT: scratch_load_u16 v150, off, s32 offset:188
+; GFX11-NEXT: scratch_load_u16 v135, off, s32 offset:180
+; GFX11-NEXT: scratch_load_u16 v145, off, s32 offset:172
+; GFX11-NEXT: scratch_load_u16 v118, off, s32 offset:164
+; GFX11-NEXT: scratch_load_u16 v131, off, s32 offset:156
+; GFX11-NEXT: scratch_load_u16 v115, off, s32 offset:148
+; GFX11-NEXT: scratch_load_u16 v179, off, s32 offset:140
+; GFX11-NEXT: scratch_load_u16 v162, off, s32 offset:132
+; GFX11-NEXT: s_clause 0xf
+; GFX11-NEXT: scratch_load_u16 v165, off, s32 offset:124
+; GFX11-NEXT: scratch_load_u16 v149, off, s32 offset:116
+; GFX11-NEXT: scratch_load_u16 v151, off, s32 offset:108
+; GFX11-NEXT: scratch_load_u16 v144, off, s32 offset:100
+; GFX11-NEXT: scratch_load_u16 v148, off, s32 offset:92
+; GFX11-NEXT: scratch_load_u16 v129, off, s32 offset:84
+; GFX11-NEXT: scratch_load_u16 v133, off, s32 offset:76
+; GFX11-NEXT: scratch_load_u16 v117, off, s32 offset:68
+; GFX11-NEXT: scratch_load_u16 v119, off, s32 offset:60
+; GFX11-NEXT: scratch_load_u16 v114, off, s32 offset:52
+; GFX11-NEXT: scratch_load_u16 v116, off, s32 offset:44
+; GFX11-NEXT: scratch_load_u16 v100, off, s32 offset:36
+; GFX11-NEXT: scratch_load_u16 v103, off, s32 offset:28
+; GFX11-NEXT: scratch_load_u16 v98, off, s32 offset:20
+; GFX11-NEXT: scratch_load_u16 v99, off, s32 offset:12
+; GFX11-NEXT: scratch_load_u16 v96, off, s32 offset:4
+; GFX11-NEXT: v_lshlrev_b32_e32 v64, 8, v1
+; GFX11-NEXT: v_lshlrev_b32_e32 v66, 8, v3
+; GFX11-NEXT: v_lshlrev_b32_e32 v65, 8, v5
+; GFX11-NEXT: v_lshlrev_b32_e32 v68, 8, v7
+; GFX11-NEXT: v_lshlrev_b32_e32 v67, 8, v9
+; GFX11-NEXT: v_lshlrev_b32_e32 v70, 8, v11
+; GFX11-NEXT: v_lshlrev_b32_e32 v69, 8, v13
+; GFX11-NEXT: v_lshlrev_b32_e32 v71, 8, v15
+; GFX11-NEXT: v_lshlrev_b32_e32 v80, 8, v17
+; GFX11-NEXT: v_lshlrev_b32_e32 v82, 8, v19
+; GFX11-NEXT: v_lshlrev_b32_e32 v81, 8, v21
+; GFX11-NEXT: v_lshlrev_b32_e32 v84, 8, v23
+; GFX11-NEXT: v_lshlrev_b32_e32 v83, 8, v25
+; GFX11-NEXT: v_lshlrev_b32_e32 v86, 8, v27
+; GFX11-NEXT: v_lshlrev_b32_e32 v85, 8, v29
+; GFX11-NEXT: s_mov_b32 s4, 0
+; GFX11-NEXT: s_waitcnt vmcnt(62)
+; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v97, 8, v2
+; GFX11-NEXT: v_lshlrev_b32_e32 v87, 8, v4
+; GFX11-NEXT: v_lshlrev_b32_e32 v101, 8, v6
+; GFX11-NEXT: v_lshlrev_b32_e32 v102, 8, v8
+; GFX11-NEXT: v_lshlrev_b32_e32 v113, 8, v10
+; GFX11-NEXT: v_lshlrev_b32_e32 v112, 8, v12
+; GFX11-NEXT: v_lshlrev_b32_e32 v130, 8, v14
+; GFX11-NEXT: v_lshlrev_b32_e32 v128, 8, v16
+; GFX11-NEXT: v_lshlrev_b32_e32 v134, 8, v18
+; GFX11-NEXT: v_lshlrev_b32_e32 v132, 8, v20
+; GFX11-NEXT: v_lshlrev_b32_e32 v161, 8, v22
+; GFX11-NEXT: v_lshlrev_b32_e32 v147, 8, v24
+; GFX11-NEXT: v_lshlrev_b32_e32 v166, 8, v26
+; GFX11-NEXT: v_lshlrev_b32_e32 v167, 8, v28
+; GFX11-NEXT: v_lshlrev_b32_e32 v180, 8, v30
+; GFX11-NEXT: v_lshlrev_b32_e32 v177, 8, v31
+; GFX11-NEXT: v_lshlrev_b32_e32 v42, 8, v41
+; GFX11-NEXT: s_waitcnt vmcnt(61)
+; GFX11-NEXT: v_lshlrev_b32_e32 v41, 8, v44
+; GFX11-NEXT: s_waitcnt vmcnt(60)
+; GFX11-NEXT: v_lshlrev_b32_e32 v45, 8, v45
+; GFX11-NEXT: s_waitcnt vmcnt(59)
+; GFX11-NEXT: v_lshlrev_b32_e32 v44, 8, v56
+; GFX11-NEXT: s_waitcnt vmcnt(58)
+; GFX11-NEXT: v_lshlrev_b32_e32 v59, 8, v59
+; GFX11-NEXT: s_waitcnt vmcnt(57)
+; GFX11-NEXT: v_lshlrev_b32_e32 v56, 8, v60
+; GFX11-NEXT: s_waitcnt vmcnt(56)
+; GFX11-NEXT: v_lshlrev_b32_e32 v60, 8, v61
+; GFX11-NEXT: s_waitcnt vmcnt(55)
+; GFX11-NEXT: v_lshlrev_b32_e32 v61, 8, v62
+; GFX11-NEXT: s_waitcnt vmcnt(54)
+; GFX11-NEXT: v_lshlrev_b32_e32 v63, 8, v63
+; GFX11-NEXT: s_waitcnt vmcnt(53)
+; GFX11-NEXT: v_lshlrev_b32_e32 v62, 8, v72
+; GFX11-NEXT: s_waitcnt vmcnt(52)
+; GFX11-NEXT: v_lshlrev_b32_e32 v73, 8, v73
+; GFX11-NEXT: s_waitcnt vmcnt(51)
+; GFX11-NEXT: v_lshlrev_b32_e32 v72, 8, v74
+; GFX11-NEXT: s_waitcnt vmcnt(50)
+; GFX11-NEXT: v_lshlrev_b32_e32 v75, 8, v75
+; GFX11-NEXT: s_waitcnt vmcnt(49)
+; GFX11-NEXT: v_lshlrev_b32_e32 v74, 8, v76
+; GFX11-NEXT: s_waitcnt vmcnt(48)
+; GFX11-NEXT: v_lshlrev_b32_e32 v77, 8, v77
+; GFX11-NEXT: s_waitcnt vmcnt(47)
+; GFX11-NEXT: v_lshlrev_b32_e32 v76, 8, v78
+; GFX11-NEXT: s_waitcnt vmcnt(46)
+; GFX11-NEXT: v_lshlrev_b32_e32 v78, 8, v79
+; GFX11-NEXT: s_waitcnt vmcnt(45)
+; GFX11-NEXT: v_lshlrev_b32_e32 v79, 8, v88
+; GFX11-NEXT: s_waitcnt vmcnt(44)
+; GFX11-NEXT: v_lshlrev_b32_e32 v89, 8, v89
+; GFX11-NEXT: s_waitcnt vmcnt(43)
+; GFX11-NEXT: v_lshlrev_b32_e32 v88, 8, v90
+; GFX11-NEXT: s_waitcnt vmcnt(42)
+; GFX11-NEXT: v_lshlrev_b32_e32 v91, 8, v91
+; GFX11-NEXT: s_waitcnt vmcnt(41)
+; GFX11-NEXT: v_lshlrev_b32_e32 v90, 8, v92
+; GFX11-NEXT: s_waitcnt vmcnt(40)
+; GFX11-NEXT: v_lshlrev_b32_e32 v92, 8, v93
+; GFX11-NEXT: s_waitcnt vmcnt(39)
+; GFX11-NEXT: v_lshlrev_b32_e32 v93, 8, v94
+; GFX11-NEXT: s_and_b32 s5, vcc_lo, exec_lo
+; GFX11-NEXT: s_cbranch_scc0 .LBB89_4
+; GFX11-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-NEXT: s_and_b32 s5, s28, 0xff
+; GFX11-NEXT: s_lshl_b32 s6, s29, 8
+; GFX11-NEXT: s_and_b32 s7, s2, 0xff
+; GFX11-NEXT: s_or_b32 s5, s5, s6
+; GFX11-NEXT: s_lshl_b32 s6, s1, 8
+; GFX11-NEXT: v_and_b32_e64 v5, 0xffff, s5
+; GFX11-NEXT: s_and_b32 s5, s0, 0xff
+; GFX11-NEXT: s_lshl_b32 s8, s3, 8
+; GFX11-NEXT: s_or_b32 s5, s5, s6
+; GFX11-NEXT: s_or_b32 s6, s7, s8
+; GFX11-NEXT: s_and_b32 s7, s16, 0xff
+; GFX11-NEXT: s_lshl_b32 s8, s17, 8
+; GFX11-NEXT: s_and_b32 s9, s18, 0xff
+; GFX11-NEXT: s_lshl_b32 s10, s19, 8
+; GFX11-NEXT: s_or_b32 s7, s7, s8
+; GFX11-NEXT: s_or_b32 s8, s9, s10
+; GFX11-NEXT: s_pack_ll_b32_b16 s5, s5, s6
+; GFX11-NEXT: s_pack_ll_b32_b16 s6, s7, s8
+; GFX11-NEXT: s_and_b32 s7, s20, 0xff
+; GFX11-NEXT: s_lshl_b32 s8, s21, 8
+; GFX11-NEXT: s_and_b32 s9, s22, 0xff
+; GFX11-NEXT: s_lshl_b32 s10, s23, 8
+; GFX11-NEXT: s_or_b32 s7, s7, s8
+; GFX11-NEXT: s_or_b32 s8, s9, s10
+; GFX11-NEXT: s_and_b32 s9, s24, 0xff
+; GFX11-NEXT: s_lshl_b32 s10, s25, 8
+; GFX11-NEXT: s_and_b32 s11, s26, 0xff
+; GFX11-NEXT: s_lshl_b32 s12, s27, 8
+; GFX11-NEXT: s_or_b32 s9, s9, s10
+; GFX11-NEXT: s_or_b32 s10, s11, s12
+; GFX11-NEXT: s_pack_ll_b32_b16 s7, s7, s8
+; GFX11-NEXT: s_pack_ll_b32_b16 s8, s9, s10
+; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v36
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v32
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v35
+; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v34
+; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v33
+; GFX11-NEXT: v_or_b32_e32 v2, v2, v68
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v64
+; GFX11-NEXT: v_or_b32_e32 v1, v1, v66
+; GFX11-NEXT: v_or_b32_e32 v6, v4, v67
+; GFX11-NEXT: v_or_b32_e32 v3, v3, v65
+; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX11-NEXT: v_lshl_or_b32 v4, v0, 16, v5
+; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v38
+; GFX11-NEXT: v_and_b32_e32 v7, 0xff, v39
+; GFX11-NEXT: v_lshl_or_b32 v6, v6, 16, v2
+; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v49
+; GFX11-NEXT: v_lshl_or_b32 v5, v3, 16, v1
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v37
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v70
+; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v50
+; GFX11-NEXT: v_or_b32_e32 v2, v2, v71
+; GFX11-NEXT: v_and_b32_e32 v8, 0xff, v48
+; GFX11-NEXT: v_or_b32_e32 v1, v1, v69
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT: v_or_b32_e32 v3, v3, v82
+; GFX11-NEXT: v_or_b32_e32 v9, v7, v80
+; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX11-NEXT: v_or_b32_e32 v10, v8, v81
+; GFX11-NEXT: v_lshl_or_b32 v7, v1, 16, v0
+; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v53
+; GFX11-NEXT: v_lshl_or_b32 v8, v9, 16, v2
+; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v55
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v51
+; GFX11-NEXT: v_lshl_or_b32 v9, v10, 16, v3
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v84
+; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v52
+; GFX11-NEXT: v_and_b32_e32 v10, 0xff, v54
+; GFX11-NEXT: v_or_b32_e32 v2, v2, v86
+; GFX11-NEXT: v_or_b32_e32 v1, v1, v83
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_and_b32_e32 v11, 0xff, v96
+; GFX11-NEXT: v_or_b32_e32 v3, v3, v85
+; GFX11-NEXT: v_or_b32_e32 v12, v10, v97
+; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX11-NEXT: v_lshl_or_b32 v10, v1, 16, v0
+; GFX11-NEXT: v_or_b32_e32 v0, v11, v87
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v99
+; GFX11-NEXT: v_and_b32_e32 v12, 0xffff, v12
+; GFX11-NEXT: v_lshl_or_b32 v11, v3, 16, v2
+; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v103
+; GFX11-NEXT: v_and_b32_e32 v14, 0xff, v114
+; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v98
+; GFX11-NEXT: v_lshl_or_b32 v12, v0, 16, v12
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v100
+; GFX11-NEXT: v_or_b32_e32 v3, v3, v113
+; GFX11-NEXT: v_or_b32_e32 v1, v1, v101
+; GFX11-NEXT: v_and_b32_e32 v13, 0xff, v116
+; GFX11-NEXT: v_or_b32_e32 v17, v14, v128
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v112
+; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX11-NEXT: v_and_b32_e32 v14, 0xff, v117
+; GFX11-NEXT: v_or_b32_e32 v2, v2, v102
+; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX11-NEXT: v_or_b32_e32 v13, v13, v130
+; GFX11-NEXT: v_and_b32_e32 v16, 0xff, v133
+; GFX11-NEXT: v_or_b32_e32 v20, v14, v132
+; GFX11-NEXT: v_lshl_or_b32 v14, v0, 16, v3
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v148
+; GFX11-NEXT: v_and_b32_e32 v15, 0xff, v119
+; GFX11-NEXT: v_and_b32_e32 v18, 0xff, v129
+; GFX11-NEXT: v_or_b32_e32 v16, v16, v161
+; GFX11-NEXT: v_and_b32_e32 v19, 0xffff, v13
+; GFX11-NEXT: v_lshl_or_b32 v13, v2, 16, v1
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v166
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v144
+; GFX11-NEXT: v_or_b32_e32 v15, v15, v134
+; GFX11-NEXT: v_or_b32_e32 v18, v18, v147
+; GFX11-NEXT: v_and_b32_e32 v22, 0xffff, v16
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT: v_or_b32_e32 v1, v1, v167
+; GFX11-NEXT: v_and_b32_e32 v21, 0xffff, v15
+; GFX11-NEXT: v_lshl_or_b32 v15, v17, 16, v19
+; GFX11-NEXT: v_lshl_or_b32 v17, v18, 16, v22
+; GFX11-NEXT: v_mov_b32_e32 v2, s7
+; GFX11-NEXT: v_lshl_or_b32 v18, v1, 16, v0
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v151
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v149
+; GFX11-NEXT: v_lshl_or_b32 v16, v20, 16, v21
+; GFX11-NEXT: v_mov_b32_e32 v3, s8
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v180
+; GFX11-NEXT: v_or_b32_e32 v1, v1, v177
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT: v_lshl_or_b32 v19, v1, 16, v0
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v165
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v162
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v42
+; GFX11-NEXT: v_or_b32_e32 v1, v1, v41
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT: v_lshl_or_b32 v20, v1, 16, v0
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v179
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v115
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v45
+; GFX11-NEXT: v_or_b32_e32 v1, v1, v44
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT: v_lshl_or_b32 v21, v1, 16, v0
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v131
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v118
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v59
+; GFX11-NEXT: v_or_b32_e32 v1, v1, v56
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT: v_lshl_or_b32 v22, v1, 16, v0
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v145
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v135
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v60
+; GFX11-NEXT: v_or_b32_e32 v1, v1, v61
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT: v_lshl_or_b32 v23, v1, 16, v0
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v150
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v146
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v63
+; GFX11-NEXT: v_or_b32_e32 v1, v1, v62
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT: v_lshl_or_b32 v24, v1, 16, v0
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v163
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v160
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v73
+; GFX11-NEXT: v_or_b32_e32 v1, v1, v72
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT: v_lshl_or_b32 v25, v1, 16, v0
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v176
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v164
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v75
+; GFX11-NEXT: v_or_b32_e32 v1, v1, v74
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT: v_lshl_or_b32 v26, v1, 16, v0
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v181
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v178
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v77
+; GFX11-NEXT: v_or_b32_e32 v1, v1, v76
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT: v_lshl_or_b32 v27, v1, 16, v0
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v183
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v182
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v78
+; GFX11-NEXT: v_or_b32_e32 v1, v1, v79
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT: v_lshl_or_b32 v28, v1, 16, v0
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v43
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v40
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v89
+; GFX11-NEXT: v_or_b32_e32 v1, v1, v88
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT: v_lshl_or_b32 v29, v1, 16, v0
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v47
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v46
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v91
+; GFX11-NEXT: v_or_b32_e32 v1, v1, v90
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT: v_lshl_or_b32 v30, v1, 16, v0
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v58
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v57
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v92
+; GFX11-NEXT: v_or_b32_e32 v1, v1, v93
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT: v_lshl_or_b32 v31, v1, 16, v0
+; GFX11-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, s6
+; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
+; GFX11-NEXT: s_cbranch_vccnz .LBB89_3
+; GFX11-NEXT: .LBB89_2: ; %cmp.true
+; GFX11-NEXT: s_add_i32 s28, s28, 3
+; GFX11-NEXT: s_lshl_b32 s5, s29, 8
+; GFX11-NEXT: s_and_b32 s4, s28, 0xff
+; GFX11-NEXT: s_add_i32 s24, s24, 3
+; GFX11-NEXT: s_or_b32 s4, s5, s4
+; GFX11-NEXT: s_and_b32 s5, s24, 0xff
+; GFX11-NEXT: s_lshl_b32 s6, s25, 8
+; GFX11-NEXT: s_add_i32 s26, s26, 3
+; GFX11-NEXT: s_or_b32 s5, s6, s5
+; GFX11-NEXT: s_and_b32 s6, s26, 0xff
+; GFX11-NEXT: s_lshl_b32 s7, s27, 8
+; GFX11-NEXT: s_add_i32 s20, s20, 3
+; GFX11-NEXT: s_or_b32 s6, s7, s6
+; GFX11-NEXT: s_and_b32 s7, s20, 0xff
+; GFX11-NEXT: s_lshl_b32 s8, s21, 8
+; GFX11-NEXT: s_add_i32 s22, s22, 3
+; GFX11-NEXT: s_or_b32 s7, s8, s7
+; GFX11-NEXT: s_and_b32 s8, s22, 0xff
+; GFX11-NEXT: s_lshl_b32 s9, s23, 8
+; GFX11-NEXT: s_add_i32 s16, s16, 3
+; GFX11-NEXT: s_or_b32 s8, s9, s8
+; GFX11-NEXT: s_and_b32 s9, s16, 0xff
+; GFX11-NEXT: s_lshl_b32 s10, s17, 8
+; GFX11-NEXT: s_add_i32 s18, s18, 3
+; GFX11-NEXT: s_add_i32 s0, s0, 3
+; GFX11-NEXT: s_add_i32 s2, s2, 3
+; GFX11-NEXT: s_or_b32 s9, s10, s9
+; GFX11-NEXT: s_and_b32 s10, s18, 0xff
+; GFX11-NEXT: s_lshl_b32 s11, s19, 8
+; GFX11-NEXT: s_and_b32 s0, s0, 0xff
+; GFX11-NEXT: s_lshl_b32 s1, s1, 8
+; GFX11-NEXT: s_and_b32 s2, s2, 0xff
+; GFX11-NEXT: s_lshl_b32 s3, s3, 8
+; GFX11-NEXT: s_or_b32 s10, s11, s10
+; GFX11-NEXT: s_or_b32 s0, s1, s0
+; GFX11-NEXT: s_or_b32 s1, s3, s2
+; GFX11-NEXT: s_addk_i32 s9, 0x300
+; GFX11-NEXT: s_addk_i32 s0, 0x300
+; GFX11-NEXT: s_addk_i32 s1, 0x300
+; GFX11-NEXT: s_addk_i32 s10, 0x300
+; GFX11-NEXT: s_pack_ll_b32_b16 s0, s0, s1
+; GFX11-NEXT: s_pack_ll_b32_b16 s1, s9, s10
+; GFX11-NEXT: s_waitcnt vmcnt(37)
+; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v58
+; GFX11-NEXT: s_addk_i32 s5, 0x300
+; GFX11-NEXT: s_addk_i32 s6, 0x300
+; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v57
+; GFX11-NEXT: s_pack_ll_b32_b16 s3, s5, s6
+; GFX11-NEXT: s_waitcnt vmcnt(35)
+; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v47
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0
+; GFX11-NEXT: s_addk_i32 s7, 0x300
+; GFX11-NEXT: s_addk_i32 s8, 0x300
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1
+; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v2
+; GFX11-NEXT: v_or_b32_e32 v0, v92, v0
+; GFX11-NEXT: s_pack_ll_b32_b16 s2, s7, s8
+; GFX11-NEXT: v_add_nc_u32_e32 v3, 3, v46
+; GFX11-NEXT: v_or_b32_e32 v1, v93, v1
+; GFX11-NEXT: v_or_b32_e32 v2, v91, v2
+; GFX11-NEXT: s_waitcnt vmcnt(33)
+; GFX11-NEXT: v_add_nc_u32_e32 v4, 3, v43
+; GFX11-NEXT: v_add_nc_u32_e32 v27, 0x300, v0
+; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v3
+; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v40
+; GFX11-NEXT: v_add_nc_u32_e32 v31, 0x300, v1
+; GFX11-NEXT: v_add_nc_u32_e32 v28, 0x300, v2
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v4
+; GFX11-NEXT: v_or_b32_e32 v3, v90, v3
+; GFX11-NEXT: s_waitcnt vmcnt(31)
+; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v183
+; GFX11-NEXT: v_add_nc_u32_e32 v4, 3, v182
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0
+; GFX11-NEXT: v_or_b32_e32 v1, v89, v1
+; GFX11-NEXT: v_add_nc_u32_e32 v30, 0x300, v3
+; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v2
+; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v4
+; GFX11-NEXT: s_waitcnt vmcnt(29)
+; GFX11-NEXT: v_add_nc_u32_e32 v4, 3, v181
+; GFX11-NEXT: v_or_b32_e32 v0, v88, v0
+; GFX11-NEXT: v_add_nc_u32_e32 v181, 0x300, v1
+; GFX11-NEXT: v_or_b32_e32 v1, v78, v2
+; GFX11-NEXT: v_or_b32_e32 v2, v79, v3
+; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v4
+; GFX11-NEXT: v_add_nc_u32_e32 v29, 0x300, v0
+; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v178
+; GFX11-NEXT: v_add_nc_u32_e32 v182, 0x300, v1
+; GFX11-NEXT: v_add_nc_u32_e32 v178, 0x300, v2
+; GFX11-NEXT: v_or_b32_e32 v1, v77, v3
+; GFX11-NEXT: s_waitcnt vmcnt(27)
+; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v176
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0
+; GFX11-NEXT: v_add_nc_u32_e32 v3, 3, v164
+; GFX11-NEXT: s_waitcnt vmcnt(25)
+; GFX11-NEXT: v_add_nc_u32_e32 v4, 3, v163
+; GFX11-NEXT: v_add_nc_u32_e32 v163, 0x300, v1
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v2
+; GFX11-NEXT: v_or_b32_e32 v0, v76, v0
+; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v3
+; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v4
+; GFX11-NEXT: v_add_nc_u32_e32 v4, 3, v160
+; GFX11-NEXT: v_or_b32_e32 v1, v75, v1
+; GFX11-NEXT: v_add_nc_u32_e32 v160, 0x300, v0
+; GFX11-NEXT: v_or_b32_e32 v0, v74, v2
+; GFX11-NEXT: v_or_b32_e32 v2, v73, v3
+; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v4
+; GFX11-NEXT: v_add_nc_u32_e32 v22, 0x300, v1
+; GFX11-NEXT: s_waitcnt vmcnt(23)
+; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v150
+; GFX11-NEXT: v_add_nc_u32_e32 v26, 0x300, v0
+; GFX11-NEXT: v_add_nc_u32_e32 v23, 0x300, v2
+; GFX11-NEXT: v_or_b32_e32 v0, v72, v3
+; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v146
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1
+; GFX11-NEXT: s_waitcnt vmcnt(21)
+; GFX11-NEXT: v_add_nc_u32_e32 v3, 3, v145
+; GFX11-NEXT: v_add_nc_u32_e32 v4, 3, v135
+; GFX11-NEXT: v_add_nc_u32_e32 v25, 0x300, v0
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v2
+; GFX11-NEXT: v_or_b32_e32 v1, v63, v1
+; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v3
+; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v4
+; GFX11-NEXT: s_waitcnt vmcnt(19)
+; GFX11-NEXT: v_add_nc_u32_e32 v4, 3, v131
+; GFX11-NEXT: v_or_b32_e32 v0, v62, v0
+; GFX11-NEXT: v_add_nc_u32_e32 v131, 0x300, v1
+; GFX11-NEXT: v_or_b32_e32 v1, v60, v2
+; GFX11-NEXT: v_or_b32_e32 v2, v61, v3
+; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v4
+; GFX11-NEXT: v_add_nc_u32_e32 v24, 0x300, v0
+; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v118
+; GFX11-NEXT: v_add_nc_u32_e32 v135, 0x300, v1
+; GFX11-NEXT: v_add_nc_u32_e32 v118, 0x300, v2
+; GFX11-NEXT: v_or_b32_e32 v1, v59, v3
+; GFX11-NEXT: s_waitcnt vmcnt(17)
+; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v179
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0
+; GFX11-NEXT: v_add_nc_u32_e32 v3, 3, v115
+; GFX11-NEXT: s_waitcnt vmcnt(15)
+; GFX11-NEXT: v_add_nc_u32_e32 v4, 3, v165
+; GFX11-NEXT: v_add_nc_u32_e32 v115, 0x300, v1
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v2
+; GFX11-NEXT: v_or_b32_e32 v0, v56, v0
+; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v3
+; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v4
+; GFX11-NEXT: v_add_nc_u32_e32 v4, 3, v162
+; GFX11-NEXT: v_or_b32_e32 v1, v45, v1
+; GFX11-NEXT: v_add_nc_u32_e32 v145, 0x300, v0
+; GFX11-NEXT: v_or_b32_e32 v0, v44, v2
+; GFX11-NEXT: v_or_b32_e32 v2, v42, v3
+; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v4
+; GFX11-NEXT: v_add_nc_u32_e32 v17, 0x300, v1
+; GFX11-NEXT: s_waitcnt vmcnt(13)
+; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v151
+; GFX11-NEXT: v_add_nc_u32_e32 v21, 0x300, v0
+; GFX11-NEXT: v_add_nc_u32_e32 v18, 0x300, v2
+; GFX11-NEXT: v_or_b32_e32 v0, v41, v3
+; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v149
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1
+; GFX11-NEXT: s_waitcnt vmcnt(11)
+; GFX11-NEXT: v_add_nc_u32_e32 v3, 3, v148
+; GFX11-NEXT: v_add_nc_u32_e32 v4, 3, v144
+; GFX11-NEXT: v_add_nc_u32_e32 v20, 0x300, v0
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v2
+; GFX11-NEXT: v_or_b32_e32 v1, v180, v1
+; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v3
+; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v4
+; GFX11-NEXT: s_waitcnt vmcnt(9)
+; GFX11-NEXT: v_add_nc_u32_e32 v4, 3, v133
+; GFX11-NEXT: v_or_b32_e32 v0, v177, v0
+; GFX11-NEXT: v_add_nc_u32_e32 v19, 0x300, v1
+; GFX11-NEXT: v_or_b32_e32 v1, v166, v2
+; GFX11-NEXT: v_or_b32_e32 v2, v167, v3
+; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v4
+; GFX11-NEXT: v_add_nc_u32_e32 v133, 0x300, v0
+; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v129
+; GFX11-NEXT: v_add_nc_u32_e32 v129, 0x300, v1
+; GFX11-NEXT: v_add_nc_u32_e32 v144, 0x300, v2
+; GFX11-NEXT: v_or_b32_e32 v1, v161, v3
+; GFX11-NEXT: s_waitcnt vmcnt(7)
+; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v119
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0
+; GFX11-NEXT: v_add_nc_u32_e32 v3, 3, v117
+; GFX11-NEXT: s_waitcnt vmcnt(5)
+; GFX11-NEXT: v_add_nc_u32_e32 v4, 3, v116
+; GFX11-NEXT: v_add_nc_u32_e32 v116, 0x300, v1
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v2
+; GFX11-NEXT: v_or_b32_e32 v0, v147, v0
+; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v3
+; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v4
+; GFX11-NEXT: v_add_nc_u32_e32 v4, 3, v114
+; GFX11-NEXT: s_waitcnt vmcnt(1)
+; GFX11-NEXT: v_add_nc_u32_e32 v5, 3, v99
+; GFX11-NEXT: v_add_nc_u32_e32 v114, 0x300, v0
+; GFX11-NEXT: v_or_b32_e32 v0, v132, v2
+; GFX11-NEXT: v_or_b32_e32 v2, v130, v3
+; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v4
+; GFX11-NEXT: v_add_nc_u32_e32 v4, 3, v103
+; GFX11-NEXT: v_add_nc_u32_e32 v6, 3, v98
+; GFX11-NEXT: v_and_b32_e32 v5, 0xff, v5
+; GFX11-NEXT: v_add_nc_u32_e32 v7, 3, v54
+; GFX11-NEXT: v_add_nc_u32_e32 v8, 3, v53
+; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v4
+; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v6
+; GFX11-NEXT: v_add_nc_u32_e32 v9, 3, v39
+; GFX11-NEXT: v_add_nc_u32_e32 v33, 3, v33
+; GFX11-NEXT: v_add_nc_u32_e32 v16, 0x300, v0
+; GFX11-NEXT: v_or_b32_e32 v4, v113, v4
+; GFX11-NEXT: v_add_nc_u32_e32 v0, 0x300, v2
+; GFX11-NEXT: v_or_b32_e32 v2, v128, v3
+; GFX11-NEXT: v_add_nc_u32_e32 v3, 3, v100
+; GFX11-NEXT: s_addk_i32 s4, 0x300
+; GFX11-NEXT: v_add_nc_u32_e32 v11, 0x300, v4
+; GFX11-NEXT: v_or_b32_e32 v4, v101, v5
+; GFX11-NEXT: v_or_b32_e32 v5, v102, v6
+; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v7
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_add_nc_u32_e32 v7, 3, v96
+; GFX11-NEXT: v_or_b32_e32 v1, v134, v1
+; GFX11-NEXT: v_add_nc_u32_e32 v12, 0x300, v4
+; GFX11-NEXT: v_add_nc_u32_e32 v13, 0x300, v5
+; GFX11-NEXT: v_or_b32_e32 v4, v97, v6
+; GFX11-NEXT: v_add_nc_u32_e32 v5, 3, v55
+; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v7
+; GFX11-NEXT: v_add_nc_u32_e32 v7, 3, v52
+; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v3
+; GFX11-NEXT: v_add_nc_u32_e32 v14, 0x300, v4
+; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v5
+; GFX11-NEXT: v_or_b32_e32 v5, v87, v6
+; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v7
+; GFX11-NEXT: v_and_b32_e32 v7, 0xff, v8
+; GFX11-NEXT: v_add_nc_u32_e32 v8, 3, v51
+; GFX11-NEXT: v_or_b32_e32 v4, v86, v4
+; GFX11-NEXT: v_add_nc_u32_e32 v15, 0x300, v5
+; GFX11-NEXT: v_or_b32_e32 v5, v85, v6
+; GFX11-NEXT: v_or_b32_e32 v6, v84, v7
+; GFX11-NEXT: v_and_b32_e32 v7, 0xff, v8
+; GFX11-NEXT: v_add_nc_u32_e32 v51, 0x300, v4
+; GFX11-NEXT: v_add_nc_u32_e32 v4, 3, v50
+; GFX11-NEXT: v_add_nc_u32_e32 v50, 0x300, v5
+; GFX11-NEXT: v_add_nc_u32_e32 v8, 3, v49
+; GFX11-NEXT: v_or_b32_e32 v5, v83, v7
+; GFX11-NEXT: v_add_nc_u32_e32 v7, 3, v48
+; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v4
+; GFX11-NEXT: v_add_nc_u32_e32 v6, 0x300, v6
+; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
+; GFX11-NEXT: v_add_nc_u32_e32 v10, 0x300, v5
+; GFX11-NEXT: v_and_b32_e32 v5, 0xff, v7
+; GFX11-NEXT: v_or_b32_e32 v4, v82, v4
+; GFX11-NEXT: v_and_b32_e32 v7, 0xff, v8
+; GFX11-NEXT: v_and_b32_e32 v8, 0xff, v9
+; GFX11-NEXT: v_add_nc_u32_e32 v9, 3, v38
+; GFX11-NEXT: v_or_b32_e32 v5, v81, v5
+; GFX11-NEXT: v_add_nc_u32_e32 v38, 0x300, v4
+; GFX11-NEXT: v_or_b32_e32 v4, v71, v7
+; GFX11-NEXT: v_or_b32_e32 v7, v80, v8
+; GFX11-NEXT: v_and_b32_e32 v8, 0xff, v9
+; GFX11-NEXT: v_add_nc_u32_e32 v9, 0x300, v5
+; GFX11-NEXT: v_add_nc_u32_e32 v5, 3, v37
+; GFX11-NEXT: v_add_nc_u32_e32 v37, 0x300, v4
+; GFX11-NEXT: v_add_nc_u32_e32 v39, 0x300, v7
+; GFX11-NEXT: v_or_b32_e32 v4, v70, v8
+; GFX11-NEXT: v_add_nc_u32_e32 v7, 3, v36
+; GFX11-NEXT: v_and_b32_e32 v5, 0xff, v5
+; GFX11-NEXT: v_add_nc_u32_e32 v8, 3, v34
+; GFX11-NEXT: v_add_nc_u32_e32 v34, 3, v35
+; GFX11-NEXT: v_add_nc_u32_e32 v35, 0x300, v4
+; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v7
+; GFX11-NEXT: v_or_b32_e32 v5, v69, v5
+; GFX11-NEXT: v_and_b32_e32 v7, 0xff, v8
+; GFX11-NEXT: v_and_b32_e32 v8, 0xff, v34
+; GFX11-NEXT: v_or_b32_e32 v3, v112, v3
+; GFX11-NEXT: v_or_b32_e32 v4, v68, v4
+; GFX11-NEXT: v_add_nc_u32_e32 v34, 0x300, v5
+; GFX11-NEXT: v_or_b32_e32 v5, v67, v7
+; GFX11-NEXT: v_or_b32_e32 v7, v66, v8
+; GFX11-NEXT: v_and_b32_e32 v8, 0xff, v33
+; GFX11-NEXT: v_add_nc_u32_e32 v33, 0x300, v4
+; GFX11-NEXT: v_add_nc_u32_e32 v4, 3, v32
+; GFX11-NEXT: v_add_nc_u32_e32 v32, 0x300, v5
+; GFX11-NEXT: v_add_nc_u32_e32 v5, 0x300, v7
+; GFX11-NEXT: v_or_b32_e32 v7, v65, v8
+; GFX11-NEXT: v_and_b32_e64 v8, 0xffff, s4
+; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v4
+; GFX11-NEXT: v_and_b32_e32 v36, 0xffff, v6
+; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x300, v7
+; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x300, v3
+; GFX11-NEXT: v_or_b32_e32 v4, v64, v4
+; GFX11-NEXT: v_and_b32_e32 v14, 0xffff, v14
+; GFX11-NEXT: v_lshl_or_b32 v10, v10, 16, v36
+; GFX11-NEXT: v_lshl_or_b32 v5, v7, 16, v5
+; GFX11-NEXT: v_and_b32_e32 v7, 0xffff, v33
+; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x300, v4
+; GFX11-NEXT: v_and_b32_e32 v33, 0xffff, v37
+; GFX11-NEXT: v_and_b32_e32 v36, 0xffff, v22
+; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
+; GFX11-NEXT: v_lshl_or_b32 v6, v32, 16, v7
+; GFX11-NEXT: v_lshl_or_b32 v4, v4, 16, v8
+; GFX11-NEXT: v_and_b32_e32 v8, 0xffff, v35
+; GFX11-NEXT: v_and_b32_e32 v32, 0xffff, v51
+; GFX11-NEXT: v_and_b32_e32 v35, 0xffff, v38
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT: v_and_b32_e32 v19, 0xffff, v19
+; GFX11-NEXT: v_lshl_or_b32 v7, v34, 16, v8
+; GFX11-NEXT: v_lshl_or_b32 v8, v39, 16, v33
+; GFX11-NEXT: v_and_b32_e32 v33, 0xffff, v12
+; GFX11-NEXT: v_and_b32_e32 v34, 0xffff, v11
+; GFX11-NEXT: v_lshl_or_b32 v11, v50, 16, v32
+; GFX11-NEXT: v_and_b32_e32 v32, 0xffff, v1
+; GFX11-NEXT: v_lshl_or_b32 v9, v9, 16, v35
+; GFX11-NEXT: v_lshl_or_b32 v12, v15, 16, v14
+; GFX11-NEXT: v_lshl_or_b32 v13, v13, 16, v33
+; GFX11-NEXT: v_lshl_or_b32 v14, v3, 16, v34
+; GFX11-NEXT: v_lshl_or_b32 v16, v16, 16, v32
+; GFX11-NEXT: v_and_b32_e32 v32, 0xffff, v116
+; GFX11-NEXT: v_and_b32_e32 v33, 0xffff, v129
+; GFX11-NEXT: v_and_b32_e32 v34, 0xffff, v18
+; GFX11-NEXT: v_and_b32_e32 v35, 0xffff, v17
+; GFX11-NEXT: v_lshl_or_b32 v26, v26, 16, v36
+; GFX11-NEXT: v_lshl_or_b32 v17, v114, 16, v32
+; GFX11-NEXT: v_lshl_or_b32 v18, v144, 16, v33
+; GFX11-NEXT: v_lshl_or_b32 v20, v20, 16, v34
+; GFX11-NEXT: v_lshl_or_b32 v21, v21, 16, v35
+; GFX11-NEXT: v_and_b32_e32 v32, 0xffff, v115
+; GFX11-NEXT: v_and_b32_e32 v33, 0xffff, v135
+; GFX11-NEXT: v_and_b32_e32 v34, 0xffff, v131
+; GFX11-NEXT: v_and_b32_e32 v35, 0xffff, v23
+; GFX11-NEXT: v_and_b32_e32 v36, 0xffff, v27
+; GFX11-NEXT: v_lshl_or_b32 v22, v145, 16, v32
+; GFX11-NEXT: v_lshl_or_b32 v23, v118, 16, v33
+; GFX11-NEXT: v_lshl_or_b32 v24, v24, 16, v34
+; GFX11-NEXT: v_lshl_or_b32 v25, v25, 16, v35
+; GFX11-NEXT: v_and_b32_e32 v32, 0xffff, v163
+; GFX11-NEXT: v_and_b32_e32 v33, 0xffff, v182
+; GFX11-NEXT: v_and_b32_e32 v34, 0xffff, v181
+; GFX11-NEXT: v_and_b32_e32 v35, 0xffff, v28
+; GFX11-NEXT: v_lshl_or_b32 v15, v2, 16, v0
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-NEXT: v_lshl_or_b32 v19, v133, 16, v19
+; GFX11-NEXT: v_lshl_or_b32 v27, v160, 16, v32
+; GFX11-NEXT: v_lshl_or_b32 v28, v178, 16, v33
+; GFX11-NEXT: v_lshl_or_b32 v29, v29, 16, v34
+; GFX11-NEXT: v_lshl_or_b32 v30, v30, 16, v35
+; GFX11-NEXT: v_lshl_or_b32 v31, v31, 16, v36
+; GFX11-NEXT: .LBB89_3: ; %end
+; GFX11-NEXT: s_clause 0x1e
+; GFX11-NEXT: scratch_load_b32 v94, off, s32 offset:320
+; GFX11-NEXT: scratch_load_b32 v93, off, s32 offset:324
+; GFX11-NEXT: scratch_load_b32 v92, off, s32 offset:328
+; GFX11-NEXT: scratch_load_b32 v91, off, s32 offset:332
+; GFX11-NEXT: scratch_load_b32 v90, off, s32 offset:336
+; GFX11-NEXT: scratch_load_b32 v89, off, s32 offset:340
+; GFX11-NEXT: scratch_load_b32 v88, off, s32 offset:344
+; GFX11-NEXT: scratch_load_b32 v79, off, s32 offset:348
+; GFX11-NEXT: scratch_load_b32 v78, off, s32 offset:352
+; GFX11-NEXT: scratch_load_b32 v77, off, s32 offset:356
+; GFX11-NEXT: scratch_load_b32 v76, off, s32 offset:360
+; GFX11-NEXT: scratch_load_b32 v75, off, s32 offset:364
+; GFX11-NEXT: scratch_load_b32 v74, off, s32 offset:368
+; GFX11-NEXT: scratch_load_b32 v73, off, s32 offset:372
+; GFX11-NEXT: scratch_load_b32 v72, off, s32 offset:376
+; GFX11-NEXT: scratch_load_b32 v63, off, s32 offset:380
+; GFX11-NEXT: scratch_load_b32 v62, off, s32 offset:384
+; GFX11-NEXT: scratch_load_b32 v61, off, s32 offset:388
+; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:392
+; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:396
+; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:400
+; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:404
+; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:408
+; GFX11-NEXT: scratch_load_b32 v47, off, s32 offset:412
+; GFX11-NEXT: scratch_load_b32 v46, off, s32 offset:416
+; GFX11-NEXT: scratch_load_b32 v45, off, s32 offset:420
+; GFX11-NEXT: scratch_load_b32 v44, off, s32 offset:424
+; GFX11-NEXT: scratch_load_b32 v43, off, s32 offset:428
+; GFX11-NEXT: scratch_load_b32 v42, off, s32 offset:432
+; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:436
+; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:440
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-NEXT: .LBB89_4:
+; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+; GFX11-NEXT: ; implicit-def: $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+; GFX11-NEXT: s_branch .LBB89_2
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -173615,766 +169385,814 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: s_clause 0x1f
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:384
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v39, off, s32 offset:380
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:376
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v50, off, s32 offset:372
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v150, off, s32 offset:368
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v39, off, s32 offset:364
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v150, off, s32 offset:360
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v50, off, s32 offset:356
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v147, off, s32 offset:352
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v37, off, s32 offset:348
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v149, off, s32 offset:344
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v49, off, s32 offset:340
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v149, off, s32 offset:336
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v38, off, s32 offset:332
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v148, off, s32 offset:328
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v48, off, s32 offset:324
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v145, off, s32 offset:320
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:316
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v148, off, s32 offset:312
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v48, off, s32 offset:308
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v147, off, s32 offset:304
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v36, off, s32 offset:300
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v146, off, s32 offset:296
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:292
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v146, off, s32 offset:288
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v36, off, s32 offset:284
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v145, off, s32 offset:280
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v37, off, s32 offset:276
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v134, off, s32 offset:272
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:268
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v144, off, s32 offset:264
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v35, off, s32 offset:260
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:520
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:516
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:512
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v43, s32 offset:508
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v44, s32 offset:504
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v45, s32 offset:500
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v46, s32 offset:496
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v47, s32 offset:492
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v56, s32 offset:488
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v57, s32 offset:484
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v58, s32 offset:480
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v59, s32 offset:476
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v60, s32 offset:472
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v61, s32 offset:468
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v62, s32 offset:464
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v63, s32 offset:460
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v72, s32 offset:456
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v73, s32 offset:452
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v74, s32 offset:448
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v75, s32 offset:444
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v76, s32 offset:440
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v77, s32 offset:436
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v78, s32 offset:432
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v79, s32 offset:428
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v88, s32 offset:424
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v89, s32 offset:420
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v90, s32 offset:416
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v91, s32 offset:412
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v92, s32 offset:408
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:404
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:400
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:396
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:392 ; 4-byte Folded Spill
; GFX11-TRUE16-NEXT: s_clause 0x1f
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v144, off, s32 offset:256
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v33, off, s32 offset:252
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v135, off, s32 offset:248
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:244
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v132, off, s32 offset:240
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:236
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v135, off, s32 offset:232
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v34, off, s32 offset:228
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v134, off, s32 offset:224
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:220
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v133, off, s32 offset:216
-; GFX11-TRUE16-NEXT: scratch_load_b32 v160, off, s32 offset:388
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v101, off, s32
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v103, off, s32 offset:8
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v113, off, s32 offset:16
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v113, off, s32 offset:24
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v114, off, s32 offset:32
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v114, off, s32 offset:40
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v115, off, s32 offset:48
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v116, off, s32 offset:56
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v116, off, s32 offset:64
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v117, off, s32 offset:72
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v117, off, s32 offset:80
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v118, off, s32 offset:88
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v118, off, s32 offset:96
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v119, off, s32 offset:104
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v119, off, s32 offset:112
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v128, off, s32 offset:120
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v128, off, s32 offset:128
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v129, off, s32 offset:136
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v129, off, s32 offset:144
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v130, off, s32 offset:152
+; GFX11-TRUE16-NEXT: scratch_load_u16 v31, off, s32 offset:384
+; GFX11-TRUE16-NEXT: scratch_load_u16 v114, off, s32 offset:380
+; GFX11-TRUE16-NEXT: scratch_load_u16 v103, off, s32 offset:376
+; GFX11-TRUE16-NEXT: scratch_load_u16 v128, off, s32 offset:372
+; GFX11-TRUE16-NEXT: scratch_load_u16 v56, off, s32 offset:368
+; GFX11-TRUE16-NEXT: scratch_load_u16 v117, off, s32 offset:364
+; GFX11-TRUE16-NEXT: scratch_load_u16 v102, off, s32 offset:360
+; GFX11-TRUE16-NEXT: scratch_load_u16 v133, off, s32 offset:356
+; GFX11-TRUE16-NEXT: scratch_load_u16 v57, off, s32 offset:352
+; GFX11-TRUE16-NEXT: scratch_load_u16 v112, off, s32 offset:348
+; GFX11-TRUE16-NEXT: scratch_load_u16 v58, off, s32 offset:344
+; GFX11-TRUE16-NEXT: scratch_load_u16 v132, off, s32 offset:340
+; GFX11-TRUE16-NEXT: scratch_load_u16 v101, off, s32 offset:336
+; GFX11-TRUE16-NEXT: scratch_load_u16 v118, off, s32 offset:332
+; GFX11-TRUE16-NEXT: scratch_load_u16 v59, off, s32 offset:328
+; GFX11-TRUE16-NEXT: scratch_load_u16 v134, off, s32 offset:324
+; GFX11-TRUE16-NEXT: scratch_load_u16 v97, off, s32 offset:320
+; GFX11-TRUE16-NEXT: scratch_load_u16 v113, off, s32 offset:316
+; GFX11-TRUE16-NEXT: scratch_load_u16 v100, off, s32 offset:312
+; GFX11-TRUE16-NEXT: scratch_load_u16 v145, off, s32 offset:308
+; GFX11-TRUE16-NEXT: scratch_load_u16 v99, off, s32 offset:304
+; GFX11-TRUE16-NEXT: scratch_load_u16 v119, off, s32 offset:300
+; GFX11-TRUE16-NEXT: scratch_load_u16 v60, off, s32 offset:296
+; GFX11-TRUE16-NEXT: scratch_load_u16 v135, off, s32 offset:292
+; GFX11-TRUE16-NEXT: scratch_load_u16 v98, off, s32 offset:288
+; GFX11-TRUE16-NEXT: scratch_load_u16 v129, off, s32 offset:284
+; GFX11-TRUE16-NEXT: scratch_load_u16 v61, off, s32 offset:280
+; GFX11-TRUE16-NEXT: scratch_load_u16 v146, off, s32 offset:276
+; GFX11-TRUE16-NEXT: scratch_load_u16 v62, off, s32 offset:272
+; GFX11-TRUE16-NEXT: scratch_load_u16 v115, off, s32 offset:268
+; GFX11-TRUE16-NEXT: scratch_load_u16 v63, off, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_load_u16 v144, off, s32 offset:260
; GFX11-TRUE16-NEXT: s_clause 0x1f
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v130, off, s32 offset:160
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v131, off, s32 offset:168
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v131, off, s32 offset:176
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v132, off, s32 offset:184
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v133, off, s32 offset:192
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v151, off, s32 offset:200
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v151, off, s32 offset:208
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v100, off, s32 offset:212
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v98, off, s32 offset:204
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v100, off, s32 offset:196
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v85, off, s32 offset:188
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v97, off, s32 offset:180
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v86, off, s32 offset:172
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v96, off, s32 offset:164
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v71, off, s32 offset:156
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v87, off, s32 offset:148
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v80, off, s32 offset:140
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v84, off, s32 offset:132
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v86, off, s32 offset:124
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v96, off, s32 offset:116
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v80, off, s32 offset:108
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v85, off, s32 offset:100
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v82, off, s32 offset:92
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v84, off, s32 offset:84
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v69, off, s32 offset:76
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v83, off, s32 offset:68
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v69, off, s32 offset:60
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v71, off, s32 offset:52
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v68, off, s32 offset:44
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v70, off, s32 offset:36
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v66, off, s32 offset:28
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v68, off, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_load_u16 v96, off, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_load_u16 v130, off, s32 offset:252
+; GFX11-TRUE16-NEXT: scratch_load_u16 v72, off, s32 offset:248
+; GFX11-TRUE16-NEXT: scratch_load_u16 v147, off, s32 offset:244
+; GFX11-TRUE16-NEXT: scratch_load_u16 v84, off, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_load_u16 v116, off, s32 offset:236
+; GFX11-TRUE16-NEXT: scratch_load_u16 v87, off, s32 offset:232
+; GFX11-TRUE16-NEXT: scratch_load_u16 v148, off, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_load_u16 v86, off, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_load_u16 v131, off, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_load_u16 v73, off, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_load_b32 v65, off, s32 offset:388
+; GFX11-TRUE16-NEXT: scratch_load_u16 v53, off, s32
+; GFX11-TRUE16-NEXT: scratch_load_u16 v66, off, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_load_u16 v67, off, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_load_u16 v68, off, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_load_u16 v69, off, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_load_u16 v70, off, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_load_u16 v71, off, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_load_u16 v80, off, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_load_u16 v81, off, s32 offset:64
+; GFX11-TRUE16-NEXT: scratch_load_u16 v82, off, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_load_u16 v83, off, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_load_u16 v85, off, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_load_u16 v74, off, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_load_u16 v75, off, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_load_u16 v76, off, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_load_u16 v77, off, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_load_u16 v78, off, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_load_u16 v79, off, s32 offset:136
+; GFX11-TRUE16-NEXT: scratch_load_u16 v88, off, s32 offset:144
+; GFX11-TRUE16-NEXT: scratch_load_u16 v89, off, s32 offset:152
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_load_u16 v90, off, s32 offset:160
+; GFX11-TRUE16-NEXT: scratch_load_u16 v91, off, s32 offset:168
+; GFX11-TRUE16-NEXT: scratch_load_u16 v92, off, s32 offset:176
+; GFX11-TRUE16-NEXT: scratch_load_u16 v93, off, s32 offset:184
+; GFX11-TRUE16-NEXT: scratch_load_u16 v94, off, s32 offset:192
+; GFX11-TRUE16-NEXT: scratch_load_u16 v95, off, s32 offset:200
+; GFX11-TRUE16-NEXT: scratch_load_u16 v104, off, s32 offset:208
+; GFX11-TRUE16-NEXT: scratch_load_u16 v179, off, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_load_u16 v163, off, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_load_u16 v183, off, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_load_u16 v150, off, s32 offset:188
+; GFX11-TRUE16-NEXT: scratch_load_u16 v167, off, s32 offset:180
+; GFX11-TRUE16-NEXT: scratch_load_u16 v161, off, s32 offset:172
+; GFX11-TRUE16-NEXT: scratch_load_u16 v177, off, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_load_u16 v149, off, s32 offset:156
+; GFX11-TRUE16-NEXT: scratch_load_u16 v180, off, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_load_u16 v151, off, s32 offset:140
+; GFX11-TRUE16-NEXT: scratch_load_u16 v164, off, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_load_u16 v41, off, s32 offset:124
+; GFX11-TRUE16-NEXT: scratch_load_u16 v47, off, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_load_u16 v165, off, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_load_u16 v43, off, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_load_u16 v181, off, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_load_u16 v45, off, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_load_u16 v162, off, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_load_u16 v46, off, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_load_u16 v176, off, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_load_u16 v42, off, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_load_u16 v178, off, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_load_u16 v44, off, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_load_u16 v160, off, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_load_u16 v182, off, s32 offset:20
; GFX11-TRUE16-NEXT: s_clause 0x1
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v66, off, s32 offset:12
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v67, off, s32 offset:4
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.l, v30.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v67.l, v28.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.h, v26.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v65.h, v24.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.l, v22.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v65.l, v20.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.l, v18.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.h, v16.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.h, v14.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v12.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.h, v10.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.h, v8.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.l, v6.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.l, v4.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.h, v2.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.h, v0.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v81.l, 8, v1.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v81.h, 8, v3.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v82.h, 8, v5.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v83.l, 8, v7.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v97.h, 8, v9.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v70.l, 8, v11.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v98.h, 8, v13.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v99.l, 8, v15.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v99.h, 8, v17.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v87.l, 8, v19.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v101.h, 8, v21.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v102.l, 8, v23.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v102.h, 8, v25.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v103.l, 8, v27.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v112.l, 8, v29.l
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(62)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v150.h, 8, v150.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v150.l, 8, v150.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v147.h, 8, v147.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v149.h, 8, v149.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v149.l, 8, v149.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v148.h, 8, v148.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v145.l, 8, v145.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v148.l, 8, v148.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v147.l, 8, v147.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v146.h, 8, v146.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v146.l, 8, v146.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v145.h, 8, v145.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v134.h, 8, v134.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v144.h, 8, v144.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v144.l, 8, v144.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v135.h, 8, v135.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(61)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v132.l, 8, v132.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(59)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v135.l, 8, v135.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(57)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v134.l, 8, v134.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(55)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v133.h, 8, v133.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(54)
-; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v160
+; GFX11-TRUE16-NEXT: scratch_load_u16 v166, off, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_load_u16 v40, off, s32 offset:4
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.h, v30.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.h, v28.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, v26.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v24.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.h, v22.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.h, v20.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.h, v18.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v16.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v14.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v12.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v10.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v8.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.h, v6.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.h, v4.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v2.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v0.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v48.h, 8, v1.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v49.l, 8, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v49.h, 8, v5.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.l, 8, v7.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.l, 8, v9.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v48.l, 8, v11.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.h, 8, v13.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.l, 8, v15.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.h, 8, v17.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.h, 8, v19.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(53)
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.h, 8, v21.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.l, 8, v23.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.h, 8, v25.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.l, 8, v27.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v64.l, 8, v29.l
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v103.h, 8, v31.l
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v103.l, 8, v103.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v102.h, 8, v56.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v102.l, 8, v102.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v99.h, 8, v57.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v101.h, 8, v58.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v101.l, 8, v101.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(52)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v112.h, 8, v103.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(51)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v113.l, 8, v113.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(50)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v113.h, 8, v113.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(49)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v103.h, 8, v114.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(48)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v115.l, 8, v114.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(47)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v115.h, 8, v115.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(46)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v116.l, 8, v116.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(45)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v116.h, 8, v116.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(44)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v117.l, 8, v117.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(43)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v114.l, 8, v117.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v100.h, 8, v59.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v97.l, 8, v97.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v100.l, 8, v100.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v99.l, 8, v99.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v98.h, 8, v60.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v98.l, 8, v98.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v97.h, 8, v61.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v86.h, 8, v62.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v96.h, 8, v63.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v96.l, 8, v96.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v87.h, 8, v72.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v84.l, 8, v84.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v87.l, 8, v87.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v86.l, 8, v86.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(42)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v118.l, 8, v118.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v85.h, 8, v73.l
+; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v65
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.l, 8, v53.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v64.h, 8, v66.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v65.l, 8, v67.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v65.h, 8, v68.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.h, 8, v69.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v67.l, 8, v70.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v67.h, 8, v71.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v68.l, 8, v80.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v68.h, 8, v81.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v69.l, 8, v82.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v66.l, 8, v83.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v70.l, 8, v85.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(41)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v118.h, 8, v118.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v70.h, 8, v74.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(40)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v119.l, 8, v119.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v71.l, 8, v75.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(39)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v114.h, 8, v119.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v66.h, 8, v76.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(38)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v119.h, 8, v128.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v71.h, 8, v77.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(37)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v128.l, 8, v128.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v80.l, 8, v78.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(36)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v128.h, 8, v129.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v80.h, 8, v79.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(35)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.l, 8, v129.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v81.l, 8, v88.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(34)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v130.l, 8, v130.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v82.l, 8, v89.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(33)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v117.h, 8, v130.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v69.h, 8, v90.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(32)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v130.h, 8, v131.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v82.h, 8, v91.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(31)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v131.l, 8, v131.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v83.l, 8, v92.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(30)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v131.h, 8, v132.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v83.h, 8, v93.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(29)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.h, 8, v133.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v81.h, 8, v94.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(28)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v132.h, 8, v151.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v84.h, 8, v95.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(27)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v133.l, 8, v151.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v151.l, 8, v31.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v151.h, 8, v31.l
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v85.l, 8, v104.l
; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
-; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB92_3
-; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow
-; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
-; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB92_4
-; GFX11-TRUE16-NEXT: .LBB92_2: ; %end
-; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
-; GFX11-TRUE16-NEXT: .LBB92_3: ; %cmp.false
-; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v52.h
-; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v51.h
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v53.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v51.l
-; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v54.h
-; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v49.h
-; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v55.l
-; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v53.h
-; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v55.h
-; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v52.l
-; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v65.l
-; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v64.l
-; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v65.h
-; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v64.h
-; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v67.l
-; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v54.l
+; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB92_2
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v34.l
+; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v33.l
+; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v34.h
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v32.h
+; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v36.l
+; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v32.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v36.h
+; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v35.l
+; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v37.l
+; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v33.h
+; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v38.h
+; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v37.h
+; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v39.l
+; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v38.l
+; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v39.h
+; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v35.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v67.h
-; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v66.h
-; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v68.l
-; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v66.l
-; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v70.h
-; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v68.h
-; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v71.l
-; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v69.h
-; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v83.h
-; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v69.l
-; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v84.h
-; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v82.l
-; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v85.l
-; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v80.l
-; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v96.h
-; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v86.h
-; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v84.l
-; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v80.h
-; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v87.h
-; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v71.h
-; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v96.l
-; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v86.l
-; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v97.l
-; GFX11-TRUE16-NEXT: v_and_b16 v19.h, 0xff, v85.h
-; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v100.l
-; GFX11-TRUE16-NEXT: v_and_b16 v20.h, 0xff, v98.l
-; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v100.h
-; GFX11-TRUE16-NEXT: v_and_b16 v21.h, 0xff, v32.h
-; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v34.h
-; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v32.l
-; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v35.l
-; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v33.h
-; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v35.h
-; GFX11-TRUE16-NEXT: v_and_b16 v24.h, 0xff, v33.l
-; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v37.h
-; GFX11-TRUE16-NEXT: v_and_b16 v25.h, 0xff, v36.l
-; GFX11-TRUE16-NEXT: v_and_b16 v26.l, 0xff, v38.l
-; GFX11-TRUE16-NEXT: v_and_b16 v26.h, 0xff, v36.h
-; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v48.l
-; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v34.l
-; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v48.h
-; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v38.h
-; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v49.l
-; GFX11-TRUE16-NEXT: v_and_b16 v29.h, 0xff, v37.l
-; GFX11-TRUE16-NEXT: v_and_b16 v30.l, 0xff, v50.l
-; GFX11-TRUE16-NEXT: v_and_b16 v30.h, 0xff, v39.l
-; GFX11-TRUE16-NEXT: v_and_b16 v31.l, 0xff, v50.h
-; GFX11-TRUE16-NEXT: v_and_b16 v31.h, 0xff, v39.h
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v81.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v81.h
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v82.h
-; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v83.l
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v97.h
-; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v70.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v98.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v99.l
-; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v99.h
-; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v87.l
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v101.h
-; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v102.l
-; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v102.h
-; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v103.l
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v112.l
-; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v101.l
-; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v112.h
-; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v113.l
-; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v113.h
-; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v103.h
-; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v115.l
-; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v115.h
-; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v11.l, v116.l
-; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v11.h, v116.h
-; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v117.l
-; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v114.l
-; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v118.l
-; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v13.h, v118.h
-; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v14.l, v119.l
-; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v114.h
-; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v15.l, v119.h
-; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v15.h, v128.l
-; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v16.l, v128.h
-; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v16.h, v129.l
-; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v17.l, v130.l
-; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v17.h, v117.h
-; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v18.l, v130.h
-; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v18.h, v131.l
-; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v19.l, v131.h
-; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v19.h, v129.h
-; GFX11-TRUE16-NEXT: v_or_b16 v20.l, v20.l, v132.h
-; GFX11-TRUE16-NEXT: v_or_b16 v20.h, v20.h, v133.l
-; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v21.l, v133.h
-; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v21.h, v134.l
-; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v22.l, v135.l
-; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v22.h, v132.l
-; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v23.l, v135.h
-; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v23.h, v144.l
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v24.l, v144.h
-; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v24.h, v134.h
-; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v25.l, v145.h
-; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v25.h, v146.l
-; GFX11-TRUE16-NEXT: v_or_b16 v26.l, v26.l, v146.h
-; GFX11-TRUE16-NEXT: v_or_b16 v26.h, v26.h, v147.l
-; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v27.l, v148.l
-; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v27.h, v145.l
-; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v28.l, v148.h
-; GFX11-TRUE16-NEXT: v_or_b16 v28.h, v28.h, v149.l
-; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v29.l, v149.h
-; GFX11-TRUE16-NEXT: v_or_b16 v29.h, v29.h, v147.h
-; GFX11-TRUE16-NEXT: v_or_b16 v30.l, v30.l, v150.l
-; GFX11-TRUE16-NEXT: v_or_b16 v30.h, v30.h, v150.h
-; GFX11-TRUE16-NEXT: v_or_b16 v31.l, v31.l, v151.l
-; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v31.h, v151.h
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16
+; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v40.l
+; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v166.l
+; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v182.l
+; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v160.l
+; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v44.l
+; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v178.l
+; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v42.l
+; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v176.l
+; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v46.l
+; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v162.l
+; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v45.l
+; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v181.l
+; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v43.l
+; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v165.l
+; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v47.l
+; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v41.l
+; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v164.l
+; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v151.l
+; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v180.l
+; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v149.l
+; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v177.l
+; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v161.l
+; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v167.l
+; GFX11-TRUE16-NEXT: v_and_b16 v19.h, 0xff, v150.l
+; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v183.l
+; GFX11-TRUE16-NEXT: v_and_b16 v20.h, 0xff, v163.l
+; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v179.l
+; GFX11-TRUE16-NEXT: v_and_b16 v21.h, 0xff, v131.l
+; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v148.l
+; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v116.l
+; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v147.l
+; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v130.l
+; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v144.l
+; GFX11-TRUE16-NEXT: v_and_b16 v24.h, 0xff, v115.l
+; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v146.l
+; GFX11-TRUE16-NEXT: v_and_b16 v25.h, 0xff, v129.l
+; GFX11-TRUE16-NEXT: v_and_b16 v26.l, 0xff, v135.l
+; GFX11-TRUE16-NEXT: v_and_b16 v26.h, 0xff, v119.l
+; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v145.l
+; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v113.l
+; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v134.l
+; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v118.l
+; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v132.l
+; GFX11-TRUE16-NEXT: v_and_b16 v29.h, 0xff, v112.l
+; GFX11-TRUE16-NEXT: v_and_b16 v30.l, 0xff, v133.l
+; GFX11-TRUE16-NEXT: v_and_b16 v30.h, 0xff, v117.l
+; GFX11-TRUE16-NEXT: v_and_b16 v31.l, 0xff, v128.l
+; GFX11-TRUE16-NEXT: v_and_b16 v31.h, 0xff, v114.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v48.h
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v49.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v49.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v50.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v51.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v48.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v51.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v52.l
+; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v52.h
+; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v50.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v53.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v54.l
+; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v54.h
+; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v55.l
+; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v64.l
+; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v53.l
+; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v64.h
+; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v65.l
+; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v65.h
+; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v55.h
+; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v67.l
+; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v67.h
+; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v11.l, v68.l
+; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v11.h, v68.h
+; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v69.l
+; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v66.l
+; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v70.l
+; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v13.h, v70.h
+; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v14.l, v71.l
+; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v66.h
+; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v15.l, v71.h
+; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v15.h, v80.l
+; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v16.l, v80.h
+; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v16.h, v81.l
+; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v17.l, v82.l
+; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v17.h, v69.h
+; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v18.l, v82.h
+; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v18.h, v83.l
+; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v19.l, v83.h
+; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v19.h, v81.h
+; GFX11-TRUE16-NEXT: v_or_b16 v20.l, v20.l, v84.h
+; GFX11-TRUE16-NEXT: v_or_b16 v20.h, v20.h, v85.l
+; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v21.l, v85.h
+; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v21.h, v86.l
+; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v22.l, v87.l
+; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v22.h, v84.l
+; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v23.l, v87.h
+; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v23.h, v96.l
+; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v24.l, v96.h
+; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v24.h, v86.h
+; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v25.l, v97.h
+; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v25.h, v98.l
+; GFX11-TRUE16-NEXT: v_or_b16 v26.l, v26.l, v98.h
+; GFX11-TRUE16-NEXT: v_or_b16 v26.h, v26.h, v99.l
+; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v27.l, v100.l
+; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v27.h, v97.l
+; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v28.l, v100.h
+; GFX11-TRUE16-NEXT: v_or_b16 v28.h, v28.h, v101.l
+; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v29.l, v101.h
+; GFX11-TRUE16-NEXT: v_or_b16 v29.h, v29.h, v99.h
+; GFX11-TRUE16-NEXT: v_or_b16 v30.l, v30.l, v102.l
+; GFX11-TRUE16-NEXT: v_or_b16 v30.h, v30.h, v102.h
+; GFX11-TRUE16-NEXT: v_or_b16 v31.l, v31.l, v103.l
+; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v31.h, v103.h
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr114
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr117
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr132
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr134
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr145
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr119
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr135
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr146
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr147
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr131
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr179
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr163
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr183
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr150
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr167
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr161
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr177
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr180
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr164
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr41
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr47
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr165
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr43
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr181
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr45
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr162
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr46
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr176
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr42
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr178
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr44
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr160
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr182
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr166
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr40
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr103_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr103_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr117_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr114_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr119_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr114_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr119_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr117_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr131_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr131_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr132_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr134_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr135_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr132_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr135_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr134_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr145_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr146_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr146_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr147_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr145_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr147_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr150_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr150_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_hi16
+; GFX11-TRUE16-NEXT: .LBB92_2: ; %Flow
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
-; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB92_2
-; GFX11-TRUE16-NEXT: .LBB92_4: ; %cmp.true
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v50.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v39.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v50.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v39.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v48.l, 3
+; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB92_4
+; GFX11-TRUE16-NEXT: ; %bb.3: ; %cmp.true
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v128.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v114.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v133.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v117.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v145.l, 3
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v151.l, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v151.h, v0.h
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v150.l, v1.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v150.h, v1.h
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v148.l, v2.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v103.l, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v103.h, v0.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v102.l, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v102.h, v1.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v100.l, v2.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v0.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v0.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.l, 0x300, v1.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.h, 0x300, v1.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v49.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v37.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v48.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v38.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v132.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v112.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v134.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v118.l, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.l, 0x300, v2.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v36.l, 3
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v149.h, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v147.h, v0.h
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v148.h, v1.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v149.l, v1.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v129.l, 3
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v101.h, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v99.h, v0.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v100.h, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v101.l, v1.h
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.l, 0x300, v0.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.h, 0x300, v0.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, 0x300, v1.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.h, 0x300, v1.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v34.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v38.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v36.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v37.h, 3
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v146.l, v2.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v113.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v135.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v119.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v146.l, 3
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v98.l, v2.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.h, 0x300, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v145.l, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v146.h, v0.h
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v147.l, v1.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v145.h, v1.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v34.h, 3
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v97.l, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v98.h, v0.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v99.l, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v97.h, v1.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v148.l, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, 0x300, v0.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.l, 0x300, v0.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.h, 0x300, v1.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v1.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v35.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v33.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v35.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v33.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v144.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v115.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v147.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v130.l, 3
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v135.l, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v144.h, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v134.h, v0.h
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v135.h, v1.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v144.l, v1.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v87.l, v2.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v96.h, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v86.h, v0.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v87.h, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v96.l, v1.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, 0x300, v2.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, 0x300, v0.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.h, 0x300, v0.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.l, 0x300, v1.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.h, 0x300, v1.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v32.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v116.l, 3
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(26)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v100.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v32.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v179.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v131.l, 3
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(24)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v100.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v98.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v183.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v163.l, 3
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v132.l, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v133.h, v0.h
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v134.l, v1.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v132.h, v1.h
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v133.l, v2.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v84.l, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v85.h, v0.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v86.l, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v84.h, v1.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v85.l, v2.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.h, 0x300, v0.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v0.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, 0x300, v1.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.l, 0x300, v1.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(22)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v97.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v85.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v167.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v150.l, 3
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(20)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v96.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v86.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v177.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v161.l, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.h, 0x300, v2.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(18)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v87.h, 3
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v131.h, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v129.h, v0.h
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v130.h, v1.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v131.l, v1.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v180.l, 3
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v83.h, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v81.h, v0.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v82.h, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v83.l, v1.h
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.l, 0x300, v0.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.h, 0x300, v0.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, 0x300, v1.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, 0x300, v1.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v71.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v149.l, 3
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(16)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v84.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v80.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v164.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v151.l, 3
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(14)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v96.h, 3
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v130.l, v2.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v47.l, 3
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v82.l, v2.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.l, 0x300, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v117.h, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v128.h, v0.h
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v129.l, v1.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v119.h, v1.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v86.h, 3
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v69.h, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v80.h, v0.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v81.l, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v71.h, v1.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v41.l, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.h, 0x300, v0.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.l, 0x300, v0.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.h, 0x300, v1.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.l, 0x300, v1.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(12)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v85.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v80.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v43.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v165.l, 3
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(10)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v84.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v82.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v45.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v181.l, 3
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v128.l, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v119.l, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v114.h, v0.h
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v118.l, v1.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v118.h, v1.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v80.l, v2.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v71.l, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v66.h, v0.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v70.l, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v70.h, v1.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.h, 0x300, v2.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(8)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v83.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v46.l, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, 0x300, v0.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, 0x300, v0.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, 0x300, v1.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, 0x300, v1.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v69.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v162.l, 3
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(6)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v71.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v69.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v42.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v176.l, 3
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(4)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v70.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v44.l, 3
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v117.l, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v114.l, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v116.l, v0.h
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v116.h, v1.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v115.l, v1.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v69.l, v2.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v66.l, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v68.l, v0.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v68.h, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v67.l, v1.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, 0x300, v2.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v68.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v178.l, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, 0x300, v0.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, 0x300, v0.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v1.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v1.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(2)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v68.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v66.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v182.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v160.l, 3
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v67.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v66.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v40.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v166.l, 3
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v115.h, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v113.h, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v103.h, v0.h
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v112.h, v1.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v113.l, v1.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v67.h, v2.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v65.h, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v55.h, v0.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v64.h, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v65.l, v1.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, 0x300, v2.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v67.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v39.h, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v0.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, 0x300, v0.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, 0x300, v1.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, 0x300, v1.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v54.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v65.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v64.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v65.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v35.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v39.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v38.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v38.h, 3
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v112.l, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v101.l, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v102.h, v0.h
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v103.l, v1.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v101.h, v1.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v64.l, v2.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v53.l, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v54.h, v0.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v55.l, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v53.h, v1.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v2.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v64.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v37.h, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x300, v0.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v0.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v1.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, 0x300, v1.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v55.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v52.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v55.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v53.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v37.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v33.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v36.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v35.l, 3
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v102.l, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v99.h, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v87.l, v0.h
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v98.h, v1.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v99.l, v1.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v54.l, v2.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v52.h, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v50.h, v0.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v51.h, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v52.l, v1.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v2.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v54.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v36.l, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v0.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, 0x300, v0.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v1.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v1.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v49.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v53.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v51.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v52.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v51.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v32.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v34.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v32.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v34.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v33.l, 3
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v97.h, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v70.l, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v82.h, v0.h
-; GFX11-TRUE16-NEXT: v_or_b16 v32.l, v83.l, v1.l
-; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v81.l, v1.h
-; GFX11-TRUE16-NEXT: v_or_b16 v33.l, v81.h, v2.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v51.l, v2.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v48.l, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v49.h, v0.h
+; GFX11-TRUE16-NEXT: v_or_b16 v32.l, v50.l, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v48.h, v1.h
+; GFX11-TRUE16-NEXT: v_or_b16 v33.l, v49.l, v2.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, 0x300, v0.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v0.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, 0x300, v32.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v32.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, 0x300, v33.l
+; GFX11-TRUE16-NEXT: .LBB92_4: ; %end
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_load_b32 v104, off, s32 offset:392
+; GFX11-TRUE16-NEXT: scratch_load_b32 v95, off, s32 offset:396
+; GFX11-TRUE16-NEXT: scratch_load_b32 v94, off, s32 offset:400
+; GFX11-TRUE16-NEXT: scratch_load_b32 v93, off, s32 offset:404
+; GFX11-TRUE16-NEXT: scratch_load_b32 v92, off, s32 offset:408
+; GFX11-TRUE16-NEXT: scratch_load_b32 v91, off, s32 offset:412
+; GFX11-TRUE16-NEXT: scratch_load_b32 v90, off, s32 offset:416
+; GFX11-TRUE16-NEXT: scratch_load_b32 v89, off, s32 offset:420
+; GFX11-TRUE16-NEXT: scratch_load_b32 v88, off, s32 offset:424
+; GFX11-TRUE16-NEXT: scratch_load_b32 v79, off, s32 offset:428
+; GFX11-TRUE16-NEXT: scratch_load_b32 v78, off, s32 offset:432
+; GFX11-TRUE16-NEXT: scratch_load_b32 v77, off, s32 offset:436
+; GFX11-TRUE16-NEXT: scratch_load_b32 v76, off, s32 offset:440
+; GFX11-TRUE16-NEXT: scratch_load_b32 v75, off, s32 offset:444
+; GFX11-TRUE16-NEXT: scratch_load_b32 v74, off, s32 offset:448
+; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:452
+; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:456
+; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:460
+; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:464
+; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:468
+; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:472
+; GFX11-TRUE16-NEXT: scratch_load_b32 v59, off, s32 offset:476
+; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:480
+; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:484
+; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:488
+; GFX11-TRUE16-NEXT: scratch_load_b32 v47, off, s32 offset:492
+; GFX11-TRUE16-NEXT: scratch_load_b32 v46, off, s32 offset:496
+; GFX11-TRUE16-NEXT: scratch_load_b32 v45, off, s32 offset:500
+; GFX11-TRUE16-NEXT: scratch_load_b32 v44, off, s32 offset:504
+; GFX11-TRUE16-NEXT: scratch_load_b32 v43, off, s32 offset:508
+; GFX11-TRUE16-NEXT: scratch_load_b32 v42, off, s32 offset:512
+; GFX11-TRUE16-NEXT: scratch_load_b32 v41, off, s32 offset:516
+; GFX11-TRUE16-NEXT: scratch_load_b32 v40, off, s32 offset:520 ; 4-byte Folded Reload
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: bitcast_v128i8_to_v64f16:
@@ -179049,1657 +174867,831 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-TRUE16-LABEL: bitcast_v128i8_to_v64f16_scalar:
-; GFX11-TRUE16: ; %bb.0:
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: s_clause 0x1e
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:440
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:436
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:432
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v43, s32 offset:428
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v44, s32 offset:424
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v45, s32 offset:420
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v46, s32 offset:416
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v47, s32 offset:412
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v56, s32 offset:408
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v57, s32 offset:404
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v58, s32 offset:400
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v59, s32 offset:396
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v60, s32 offset:392
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v61, s32 offset:388
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v62, s32 offset:384
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v63, s32 offset:380
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v72, s32 offset:376
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v73, s32 offset:372
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v74, s32 offset:368
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v75, s32 offset:364
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v76, s32 offset:360
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v77, s32 offset:356
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v78, s32 offset:352
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v79, s32 offset:348
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v88, s32 offset:344
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v89, s32 offset:340
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v90, s32 offset:336
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v91, s32 offset:332
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v92, s32 offset:328
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:324
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:320
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v54, v30 :: v_dual_mov_b32 v51, v24
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v52, v28 :: v_dual_mov_b32 v55, v26
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v53, v22 :: v_dual_mov_b32 v48, v20
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v50, v18 :: v_dual_mov_b32 v39, v16
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v49, v14 :: v_dual_mov_b32 v34, v8
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v37, v12 :: v_dual_mov_b32 v38, v10
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v36, v6 :: v_dual_mov_b32 v33, v4
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v35, v2 :: v_dual_mov_b32 v32, v0
-; GFX11-TRUE16-NEXT: s_clause 0x1f
-; GFX11-TRUE16-NEXT: scratch_load_b32 v0, off, s32 offset:316
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v2, off, s32
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v4, off, s32 offset:8
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v6, off, s32 offset:16
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v8, off, s32 offset:24
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v10, off, s32 offset:32
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v12, off, s32 offset:40
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v14, off, s32 offset:48
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v16, off, s32 offset:56
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v18, off, s32 offset:64
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v20, off, s32 offset:72
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v22, off, s32 offset:80
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v24, off, s32 offset:88
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v26, off, s32 offset:96
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v28, off, s32 offset:104
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v30, off, s32 offset:112
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:120
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v41, off, s32 offset:128
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v44, off, s32 offset:136
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v45, off, s32 offset:144
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v56, off, s32 offset:152
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v59, off, s32 offset:160
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v60, off, s32 offset:168
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v61, off, s32 offset:176
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v62, off, s32 offset:184
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v63, off, s32 offset:192
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v72, off, s32 offset:200
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v73, off, s32 offset:208
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v74, off, s32 offset:216
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v75, off, s32 offset:224
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v76, off, s32 offset:232
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v77, off, s32 offset:240
-; GFX11-TRUE16-NEXT: s_clause 0x1f
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v78, off, s32 offset:248
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v79, off, s32 offset:256
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v88, off, s32 offset:264
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v89, off, s32 offset:272
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v90, off, s32 offset:280
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v91, off, s32 offset:288
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v92, off, s32 offset:296
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v93, off, s32 offset:304
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v94, off, s32 offset:312
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v57, off, s32 offset:308
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v58, off, s32 offset:300
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v46, off, s32 offset:292
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v47, off, s32 offset:284
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v40, off, s32 offset:276
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v43, off, s32 offset:268
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v182, off, s32 offset:260
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v183, off, s32 offset:252
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v179, off, s32 offset:244
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v181, off, s32 offset:236
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v164, off, s32 offset:228
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v176, off, s32 offset:220
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v160, off, s32 offset:212
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v163, off, s32 offset:204
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v146, off, s32 offset:196
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v150, off, s32 offset:188
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v135, off, s32 offset:180
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v145, off, s32 offset:172
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v119, off, s32 offset:164
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v131, off, s32 offset:156
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v115, off, s32 offset:148
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v178, off, s32 offset:140
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v162, off, s32 offset:132
-; GFX11-TRUE16-NEXT: s_clause 0xf
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v165, off, s32 offset:124
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v149, off, s32 offset:116
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v151, off, s32 offset:108
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v144, off, s32 offset:100
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v148, off, s32 offset:92
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v129, off, s32 offset:84
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v133, off, s32 offset:76
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v117, off, s32 offset:68
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v118, off, s32 offset:60
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v114, off, s32 offset:52
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v116, off, s32 offset:44
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v100, off, s32 offset:36
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v103, off, s32 offset:28
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v98, off, s32 offset:20
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v99, off, s32 offset:12
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v96, off, s32 offset:4
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v64, 8, v1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v66, 8, v3
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v65, 8, v5
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v68, 8, v7
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v67, 8, v9
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v70, 8, v11
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v69, 8, v13
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v71, 8, v15
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v80, 8, v17
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v82, 8, v19
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v81, 8, v21
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v84, 8, v23
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v83, 8, v25
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v86, 8, v27
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v85, 8, v29
-; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(62)
-; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v97, 8, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v87, 8, v4
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v101, 8, v6
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v102, 8, v8
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v113, 8, v10
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v112, 8, v12
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v130, 8, v14
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v128, 8, v16
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v134, 8, v18
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v132, 8, v20
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v161, 8, v22
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v147, 8, v24
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v166, 8, v26
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v167, 8, v28
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v180, 8, v30
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v177, 8, v31
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v42, 8, v41
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(61)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v41, 8, v44
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(60)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v45, 8, v45
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(59)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v44, 8, v56
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(58)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v59, 8, v59
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(57)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v56, 8, v60
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(56)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v60, 8, v61
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(55)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v61, 8, v62
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(54)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v63, 8, v63
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(53)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v62, 8, v72
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(52)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v73, 8, v73
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(51)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v72, 8, v74
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(50)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v75, 8, v75
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(49)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v74, 8, v76
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(48)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v77, 8, v77
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(47)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v76, 8, v78
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(46)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v78, 8, v79
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(45)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v79, 8, v88
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(44)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v89, 8, v89
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(43)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v88, 8, v90
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(42)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v91, 8, v91
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(41)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v90, 8, v92
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(40)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v92, 8, v93
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(39)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v93, 8, v94
-; GFX11-TRUE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo
-; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB93_4
-; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-TRUE16-NEXT: s_and_b32 s5, s28, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s29, 8
-; GFX11-TRUE16-NEXT: s_and_b32 s7, s2, 0xff
-; GFX11-TRUE16-NEXT: s_or_b32 s5, s5, s6
-; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s1, 8
-; GFX11-TRUE16-NEXT: v_and_b32_e64 v5, 0xffff, s5
-; GFX11-TRUE16-NEXT: s_and_b32 s5, s0, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s3, 8
-; GFX11-TRUE16-NEXT: s_or_b32 s5, s5, s6
-; GFX11-TRUE16-NEXT: s_or_b32 s6, s7, s8
-; GFX11-TRUE16-NEXT: s_and_b32 s7, s16, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s17, 8
-; GFX11-TRUE16-NEXT: s_and_b32 s9, s18, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s19, 8
-; GFX11-TRUE16-NEXT: s_or_b32 s7, s7, s8
-; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s10
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s5, s6
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s7, s8
-; GFX11-TRUE16-NEXT: s_and_b32 s7, s20, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s21, 8
-; GFX11-TRUE16-NEXT: s_and_b32 s9, s22, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s23, 8
-; GFX11-TRUE16-NEXT: s_or_b32 s7, s7, s8
-; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s10
-; GFX11-TRUE16-NEXT: s_and_b32 s9, s24, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s25, 8
-; GFX11-TRUE16-NEXT: s_and_b32 s11, s26, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s12, s27, 8
-; GFX11-TRUE16-NEXT: s_or_b32 s9, s9, s10
-; GFX11-TRUE16-NEXT: s_or_b32 s10, s11, s12
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s7, s8
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s9, s10
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v36
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v32
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v35
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v34
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v33
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v68
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v64
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v66
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v4, v67
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v65
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v4, v0, 16, v5
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v38
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v39
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v6, v6, 16, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v49
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v5, v3, 16, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v37
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v70
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v50
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v71
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v48
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v69
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v82
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v7, v80
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v8, v81
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v7, v1, 16, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v53
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v8, v9, 16, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v55
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v51
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v9, v10, 16, v3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v84
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v52
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xff, v54
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v86
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v83
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v96
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v85
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v10, v97
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v10, v1, 16, v0
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v11, v87
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v99
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v12
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v11, v3, 16, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v103
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v114
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v98
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v12, v0, 16, v12
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v100
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v113
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v101
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xff, v116
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v14, v128
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v112
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v117
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v102
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v13, v130
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xff, v133
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v14, v132
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v14, v0, 16, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v148
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xff, v118
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xff, v129
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v16, v161
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v13
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v13, v2, 16, v1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v166
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v144
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v15, v134
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v18, v147
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v16
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v167
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v15
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v15, v17, 16, v19
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v17, v18, 16, v22
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s7
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v1, 16, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v151
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v149
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v16, v20, 16, v21
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, s8
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v180
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v177
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v1, 16, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v165
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v162
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v42
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v41
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v1, 16, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v178
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v115
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v45
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v44
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v1, 16, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v131
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v119
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v59
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v56
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v1, 16, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v145
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v135
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v60
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v61
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v1, 16, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v150
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v146
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v63
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v62
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v1, 16, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v163
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v160
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v73
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v72
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v1, 16, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v176
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v164
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v75
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v74
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v1, 16, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v181
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v179
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v77
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v76
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v1, 16, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v183
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v182
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v78
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v79
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v28, v1, 16, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v43
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v40
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v89
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v88
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v29, v1, 16, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v47
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v46
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v91
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v90
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v30, v1, 16, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v58
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v57
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v92
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v93
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v31, v1, 16, v0
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, s6
-; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
-; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB93_3
-; GFX11-TRUE16-NEXT: .LBB93_2: ; %cmp.true
-; GFX11-TRUE16-NEXT: s_add_i32 s28, s28, 3
-; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s29, 8
-; GFX11-TRUE16-NEXT: s_and_b32 s4, s28, 0xff
-; GFX11-TRUE16-NEXT: s_add_i32 s24, s24, 3
-; GFX11-TRUE16-NEXT: s_or_b32 s4, s5, s4
-; GFX11-TRUE16-NEXT: s_and_b32 s5, s24, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s25, 8
-; GFX11-TRUE16-NEXT: s_add_i32 s26, s26, 3
-; GFX11-TRUE16-NEXT: s_or_b32 s5, s6, s5
-; GFX11-TRUE16-NEXT: s_and_b32 s6, s26, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s27, 8
-; GFX11-TRUE16-NEXT: s_add_i32 s20, s20, 3
-; GFX11-TRUE16-NEXT: s_or_b32 s6, s7, s6
-; GFX11-TRUE16-NEXT: s_and_b32 s7, s20, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s21, 8
-; GFX11-TRUE16-NEXT: s_add_i32 s22, s22, 3
-; GFX11-TRUE16-NEXT: s_or_b32 s7, s8, s7
-; GFX11-TRUE16-NEXT: s_and_b32 s8, s22, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s23, 8
-; GFX11-TRUE16-NEXT: s_add_i32 s16, s16, 3
-; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s8
-; GFX11-TRUE16-NEXT: s_and_b32 s9, s16, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s17, 8
-; GFX11-TRUE16-NEXT: s_add_i32 s18, s18, 3
-; GFX11-TRUE16-NEXT: s_add_i32 s0, s0, 3
-; GFX11-TRUE16-NEXT: s_add_i32 s2, s2, 3
-; GFX11-TRUE16-NEXT: s_or_b32 s9, s10, s9
-; GFX11-TRUE16-NEXT: s_and_b32 s10, s18, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s11, s19, 8
-; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 8
-; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8
-; GFX11-TRUE16-NEXT: s_or_b32 s10, s11, s10
-; GFX11-TRUE16-NEXT: s_or_b32 s0, s1, s0
-; GFX11-TRUE16-NEXT: s_or_b32 s1, s3, s2
-; GFX11-TRUE16-NEXT: s_addk_i32 s9, 0x300
-; GFX11-TRUE16-NEXT: s_addk_i32 s0, 0x300
-; GFX11-TRUE16-NEXT: s_addk_i32 s1, 0x300
-; GFX11-TRUE16-NEXT: s_addk_i32 s10, 0x300
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s1
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s9, s10
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(37)
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v58
-; GFX11-TRUE16-NEXT: s_addk_i32 s5, 0x300
-; GFX11-TRUE16-NEXT: s_addk_i32 s6, 0x300
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v57
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s5, s6
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(35)
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v47
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT: s_addk_i32 s7, 0x300
-; GFX11-TRUE16-NEXT: s_addk_i32 s8, 0x300
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v2
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v92, v0
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s7, s8
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v46
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v93, v1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v91, v2
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(33)
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v43
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v27, 0x300, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v40
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v31, 0x300, v1
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v28, 0x300, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v4
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v90, v3
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(31)
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v183
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v182
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v89, v1
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v30, 0x300, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v4
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(29)
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v181
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v88, v0
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v181, 0x300, v1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v78, v2
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v79, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v29, 0x300, v0
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v179
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v182, 0x300, v1
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v179, 0x300, v2
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v77, v3
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(27)
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v176
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v164
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(25)
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v163
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v163, 0x300, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v2
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v76, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v160
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v75, v1
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v160, 0x300, v0
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v74, v2
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v73, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v22, 0x300, v1
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(23)
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v150
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v26, 0x300, v0
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v23, 0x300, v2
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v72, v3
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v146
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(21)
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v145
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v135
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v25, 0x300, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v2
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v63, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v4
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(19)
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v131
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v62, v0
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v131, 0x300, v1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v60, v2
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v61, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v24, 0x300, v0
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v119
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v135, 0x300, v1
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v119, 0x300, v2
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v59, v3
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(17)
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v178
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v115
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(15)
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v165
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v115, 0x300, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v2
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v56, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v162
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v45, v1
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v145, 0x300, v0
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v44, v2
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v42, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v17, 0x300, v1
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(13)
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v151
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v21, 0x300, v0
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v18, 0x300, v2
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v41, v3
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v149
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(11)
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v148
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v144
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v20, 0x300, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v2
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v180, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v4
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(9)
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v133
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v177, v0
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v19, 0x300, v1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v166, v2
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v167, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v133, 0x300, v0
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v129
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v129, 0x300, v1
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v144, 0x300, v2
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v161, v3
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7)
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v118
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v117
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(5)
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v116
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v116, 0x300, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v2
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v147, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v114
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v99
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v114, 0x300, v0
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v132, v2
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v130, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v103
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v98
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v5
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v54
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v53
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v6
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v39
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v33, 3, v33
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 0x300, v0
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v113, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v2
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v128, v3
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v100
-; GFX11-TRUE16-NEXT: s_addk_i32 s4, 0x300
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 0x300, v4
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v101, v5
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v102, v6
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v7
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v96
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v134, v1
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 0x300, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v13, 0x300, v5
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v97, v6
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v55
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v7
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v52
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, 0x300, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v5
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v87, v6
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v7
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v8
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v51
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v86, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v15, 0x300, v5
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v85, v6
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v84, v7
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v8
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v51, 0x300, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v50
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v50, 0x300, v5
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v49
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v83, v7
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v48
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x300, v6
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 0x300, v5
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v7
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v82, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v8
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v9
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v38
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v81, v5
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v38, 0x300, v4
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v71, v7
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v80, v8
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v9
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 0x300, v5
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v37
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v37, 0x300, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v39, 0x300, v7
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v70, v8
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v36
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v5
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v34
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v34, 3, v35
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v35, 0x300, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v7
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v69, v5
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v8
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v34
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v112, v3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v68, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v34, 0x300, v5
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v67, v7
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v66, v8
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v33
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v33, 0x300, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v32
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v32, 0x300, v5
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 0x300, v7
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v65, v8
-; GFX11-TRUE16-NEXT: v_and_b32_e64 v8, 0xffff, s4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff, v6
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 0x300, v7
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v64, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v14
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v10, v10, 16, v36
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v5, v7, 16, v5
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v33
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x300, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v37
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff, v22
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v6, v32, 16, v7
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v4, v4, 16, v8
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v35
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v51
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v38
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v19
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v7, v34, 16, v8
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v8, v39, 16, v33
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v12
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v11
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v11, v50, 16, v32
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v9, v9, 16, v35
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v12, v15, 16, v14
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v13, v13, 16, v33
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v14, v3, 16, v34
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v16, v16, 16, v32
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v116
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v129
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v18
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v17
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v26, 16, v36
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v17, v114, 16, v32
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v144, 16, v33
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v20, 16, v34
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v21, 16, v35
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v115
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v135
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v131
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v23
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff, v27
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v145, 16, v32
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v119, 16, v33
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v24, 16, v34
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v25, 16, v35
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v163
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v182
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v181
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v28
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v15, v2, 16, v0
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v133, 16, v19
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v160, 16, v32
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v28, v179, 16, v33
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v29, v29, 16, v34
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v30, v30, 16, v35
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v31, v31, 16, v36
-; GFX11-TRUE16-NEXT: .LBB93_3: ; %end
-; GFX11-TRUE16-NEXT: s_clause 0x1e
-; GFX11-TRUE16-NEXT: scratch_load_b32 v94, off, s32 offset:320
-; GFX11-TRUE16-NEXT: scratch_load_b32 v93, off, s32 offset:324
-; GFX11-TRUE16-NEXT: scratch_load_b32 v92, off, s32 offset:328
-; GFX11-TRUE16-NEXT: scratch_load_b32 v91, off, s32 offset:332
-; GFX11-TRUE16-NEXT: scratch_load_b32 v90, off, s32 offset:336
-; GFX11-TRUE16-NEXT: scratch_load_b32 v89, off, s32 offset:340
-; GFX11-TRUE16-NEXT: scratch_load_b32 v88, off, s32 offset:344
-; GFX11-TRUE16-NEXT: scratch_load_b32 v79, off, s32 offset:348
-; GFX11-TRUE16-NEXT: scratch_load_b32 v78, off, s32 offset:352
-; GFX11-TRUE16-NEXT: scratch_load_b32 v77, off, s32 offset:356
-; GFX11-TRUE16-NEXT: scratch_load_b32 v76, off, s32 offset:360
-; GFX11-TRUE16-NEXT: scratch_load_b32 v75, off, s32 offset:364
-; GFX11-TRUE16-NEXT: scratch_load_b32 v74, off, s32 offset:368
-; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:372
-; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:376
-; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:380
-; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:384
-; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:388
-; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:392
-; GFX11-TRUE16-NEXT: scratch_load_b32 v59, off, s32 offset:396
-; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:400
-; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:404
-; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:408
-; GFX11-TRUE16-NEXT: scratch_load_b32 v47, off, s32 offset:412
-; GFX11-TRUE16-NEXT: scratch_load_b32 v46, off, s32 offset:416
-; GFX11-TRUE16-NEXT: scratch_load_b32 v45, off, s32 offset:420
-; GFX11-TRUE16-NEXT: scratch_load_b32 v44, off, s32 offset:424
-; GFX11-TRUE16-NEXT: scratch_load_b32 v43, off, s32 offset:428
-; GFX11-TRUE16-NEXT: scratch_load_b32 v42, off, s32 offset:432
-; GFX11-TRUE16-NEXT: scratch_load_b32 v41, off, s32 offset:436
-; GFX11-TRUE16-NEXT: scratch_load_b32 v40, off, s32 offset:440
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
-; GFX11-TRUE16-NEXT: .LBB93_4:
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
-; GFX11-TRUE16-NEXT: s_branch .LBB93_2
-;
-; GFX11-FAKE16-LABEL: bitcast_v128i8_to_v64f16_scalar:
-; GFX11-FAKE16: ; %bb.0:
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-FAKE16-NEXT: s_clause 0x1e
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:440
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:436
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:432
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v43, s32 offset:428
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v44, s32 offset:424
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v45, s32 offset:420
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v46, s32 offset:416
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v47, s32 offset:412
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v56, s32 offset:408
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v57, s32 offset:404
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v58, s32 offset:400
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v59, s32 offset:396
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v60, s32 offset:392
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v61, s32 offset:388
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v62, s32 offset:384
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v63, s32 offset:380
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v72, s32 offset:376
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v73, s32 offset:372
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v74, s32 offset:368
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v75, s32 offset:364
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v76, s32 offset:360
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v77, s32 offset:356
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v78, s32 offset:352
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v79, s32 offset:348
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v88, s32 offset:344
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v89, s32 offset:340
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v90, s32 offset:336
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v91, s32 offset:332
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v92, s32 offset:328
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v93, s32 offset:324
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v94, s32 offset:320
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v54, v30 :: v_dual_mov_b32 v51, v24
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v52, v28 :: v_dual_mov_b32 v55, v26
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v53, v22 :: v_dual_mov_b32 v48, v20
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v50, v18 :: v_dual_mov_b32 v39, v16
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v49, v14 :: v_dual_mov_b32 v34, v8
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v37, v12 :: v_dual_mov_b32 v38, v10
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v36, v6 :: v_dual_mov_b32 v33, v4
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v35, v2 :: v_dual_mov_b32 v32, v0
-; GFX11-FAKE16-NEXT: s_clause 0x1f
-; GFX11-FAKE16-NEXT: scratch_load_b32 v0, off, s32 offset:316
-; GFX11-FAKE16-NEXT: scratch_load_u16 v2, off, s32
-; GFX11-FAKE16-NEXT: scratch_load_u16 v4, off, s32 offset:8
-; GFX11-FAKE16-NEXT: scratch_load_u16 v6, off, s32 offset:16
-; GFX11-FAKE16-NEXT: scratch_load_u16 v8, off, s32 offset:24
-; GFX11-FAKE16-NEXT: scratch_load_u16 v10, off, s32 offset:32
-; GFX11-FAKE16-NEXT: scratch_load_u16 v12, off, s32 offset:40
-; GFX11-FAKE16-NEXT: scratch_load_u16 v14, off, s32 offset:48
-; GFX11-FAKE16-NEXT: scratch_load_u16 v16, off, s32 offset:56
-; GFX11-FAKE16-NEXT: scratch_load_u16 v18, off, s32 offset:64
-; GFX11-FAKE16-NEXT: scratch_load_u16 v20, off, s32 offset:72
-; GFX11-FAKE16-NEXT: scratch_load_u16 v22, off, s32 offset:80
-; GFX11-FAKE16-NEXT: scratch_load_u16 v24, off, s32 offset:88
-; GFX11-FAKE16-NEXT: scratch_load_u16 v26, off, s32 offset:96
-; GFX11-FAKE16-NEXT: scratch_load_u16 v28, off, s32 offset:104
-; GFX11-FAKE16-NEXT: scratch_load_u16 v30, off, s32 offset:112
-; GFX11-FAKE16-NEXT: scratch_load_u16 v31, off, s32 offset:120
-; GFX11-FAKE16-NEXT: scratch_load_u16 v41, off, s32 offset:128
-; GFX11-FAKE16-NEXT: scratch_load_u16 v44, off, s32 offset:136
-; GFX11-FAKE16-NEXT: scratch_load_u16 v45, off, s32 offset:144
-; GFX11-FAKE16-NEXT: scratch_load_u16 v56, off, s32 offset:152
-; GFX11-FAKE16-NEXT: scratch_load_u16 v59, off, s32 offset:160
-; GFX11-FAKE16-NEXT: scratch_load_u16 v60, off, s32 offset:168
-; GFX11-FAKE16-NEXT: scratch_load_u16 v61, off, s32 offset:176
-; GFX11-FAKE16-NEXT: scratch_load_u16 v62, off, s32 offset:184
-; GFX11-FAKE16-NEXT: scratch_load_u16 v63, off, s32 offset:192
-; GFX11-FAKE16-NEXT: scratch_load_u16 v72, off, s32 offset:200
-; GFX11-FAKE16-NEXT: scratch_load_u16 v73, off, s32 offset:208
-; GFX11-FAKE16-NEXT: scratch_load_u16 v74, off, s32 offset:216
-; GFX11-FAKE16-NEXT: scratch_load_u16 v75, off, s32 offset:224
-; GFX11-FAKE16-NEXT: scratch_load_u16 v76, off, s32 offset:232
-; GFX11-FAKE16-NEXT: scratch_load_u16 v77, off, s32 offset:240
-; GFX11-FAKE16-NEXT: s_clause 0x1f
-; GFX11-FAKE16-NEXT: scratch_load_u16 v78, off, s32 offset:248
-; GFX11-FAKE16-NEXT: scratch_load_u16 v79, off, s32 offset:256
-; GFX11-FAKE16-NEXT: scratch_load_u16 v88, off, s32 offset:264
-; GFX11-FAKE16-NEXT: scratch_load_u16 v89, off, s32 offset:272
-; GFX11-FAKE16-NEXT: scratch_load_u16 v90, off, s32 offset:280
-; GFX11-FAKE16-NEXT: scratch_load_u16 v91, off, s32 offset:288
-; GFX11-FAKE16-NEXT: scratch_load_u16 v92, off, s32 offset:296
-; GFX11-FAKE16-NEXT: scratch_load_u16 v93, off, s32 offset:304
-; GFX11-FAKE16-NEXT: scratch_load_u16 v94, off, s32 offset:312
-; GFX11-FAKE16-NEXT: scratch_load_u16 v57, off, s32 offset:308
-; GFX11-FAKE16-NEXT: scratch_load_u16 v58, off, s32 offset:300
-; GFX11-FAKE16-NEXT: scratch_load_u16 v46, off, s32 offset:292
-; GFX11-FAKE16-NEXT: scratch_load_u16 v47, off, s32 offset:284
-; GFX11-FAKE16-NEXT: scratch_load_u16 v40, off, s32 offset:276
-; GFX11-FAKE16-NEXT: scratch_load_u16 v43, off, s32 offset:268
-; GFX11-FAKE16-NEXT: scratch_load_u16 v182, off, s32 offset:260
-; GFX11-FAKE16-NEXT: scratch_load_u16 v183, off, s32 offset:252
-; GFX11-FAKE16-NEXT: scratch_load_u16 v178, off, s32 offset:244
-; GFX11-FAKE16-NEXT: scratch_load_u16 v181, off, s32 offset:236
-; GFX11-FAKE16-NEXT: scratch_load_u16 v164, off, s32 offset:228
-; GFX11-FAKE16-NEXT: scratch_load_u16 v176, off, s32 offset:220
-; GFX11-FAKE16-NEXT: scratch_load_u16 v160, off, s32 offset:212
-; GFX11-FAKE16-NEXT: scratch_load_u16 v163, off, s32 offset:204
-; GFX11-FAKE16-NEXT: scratch_load_u16 v146, off, s32 offset:196
-; GFX11-FAKE16-NEXT: scratch_load_u16 v150, off, s32 offset:188
-; GFX11-FAKE16-NEXT: scratch_load_u16 v135, off, s32 offset:180
-; GFX11-FAKE16-NEXT: scratch_load_u16 v145, off, s32 offset:172
-; GFX11-FAKE16-NEXT: scratch_load_u16 v118, off, s32 offset:164
-; GFX11-FAKE16-NEXT: scratch_load_u16 v131, off, s32 offset:156
-; GFX11-FAKE16-NEXT: scratch_load_u16 v115, off, s32 offset:148
-; GFX11-FAKE16-NEXT: scratch_load_u16 v179, off, s32 offset:140
-; GFX11-FAKE16-NEXT: scratch_load_u16 v162, off, s32 offset:132
-; GFX11-FAKE16-NEXT: s_clause 0xf
-; GFX11-FAKE16-NEXT: scratch_load_u16 v165, off, s32 offset:124
-; GFX11-FAKE16-NEXT: scratch_load_u16 v149, off, s32 offset:116
-; GFX11-FAKE16-NEXT: scratch_load_u16 v151, off, s32 offset:108
-; GFX11-FAKE16-NEXT: scratch_load_u16 v144, off, s32 offset:100
-; GFX11-FAKE16-NEXT: scratch_load_u16 v148, off, s32 offset:92
-; GFX11-FAKE16-NEXT: scratch_load_u16 v129, off, s32 offset:84
-; GFX11-FAKE16-NEXT: scratch_load_u16 v133, off, s32 offset:76
-; GFX11-FAKE16-NEXT: scratch_load_u16 v117, off, s32 offset:68
-; GFX11-FAKE16-NEXT: scratch_load_u16 v119, off, s32 offset:60
-; GFX11-FAKE16-NEXT: scratch_load_u16 v114, off, s32 offset:52
-; GFX11-FAKE16-NEXT: scratch_load_u16 v116, off, s32 offset:44
-; GFX11-FAKE16-NEXT: scratch_load_u16 v100, off, s32 offset:36
-; GFX11-FAKE16-NEXT: scratch_load_u16 v103, off, s32 offset:28
-; GFX11-FAKE16-NEXT: scratch_load_u16 v98, off, s32 offset:20
-; GFX11-FAKE16-NEXT: scratch_load_u16 v99, off, s32 offset:12
-; GFX11-FAKE16-NEXT: scratch_load_u16 v96, off, s32 offset:4
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v64, 8, v1
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v66, 8, v3
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v65, 8, v5
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v68, 8, v7
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v67, 8, v9
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v70, 8, v11
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v69, 8, v13
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v71, 8, v15
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v80, 8, v17
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v82, 8, v19
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v81, 8, v21
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v84, 8, v23
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v83, 8, v25
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v86, 8, v27
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v85, 8, v29
-; GFX11-FAKE16-NEXT: s_mov_b32 s4, 0
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(62)
-; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v97, 8, v2
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v87, 8, v4
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v101, 8, v6
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v102, 8, v8
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v113, 8, v10
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v112, 8, v12
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v130, 8, v14
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v128, 8, v16
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v134, 8, v18
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v132, 8, v20
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v161, 8, v22
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v147, 8, v24
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v166, 8, v26
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v167, 8, v28
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v180, 8, v30
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v177, 8, v31
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v42, 8, v41
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(61)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v41, 8, v44
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(60)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v45, 8, v45
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(59)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v44, 8, v56
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(58)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v59, 8, v59
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(57)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v56, 8, v60
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(56)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v60, 8, v61
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(55)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v61, 8, v62
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(54)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v63, 8, v63
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(53)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v62, 8, v72
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(52)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v73, 8, v73
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(51)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v72, 8, v74
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(50)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v75, 8, v75
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(49)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v74, 8, v76
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(48)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v77, 8, v77
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(47)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v76, 8, v78
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(46)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v78, 8, v79
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(45)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v79, 8, v88
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(44)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v89, 8, v89
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(43)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v88, 8, v90
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(42)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v91, 8, v91
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(41)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v90, 8, v92
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(40)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v92, 8, v93
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(39)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v93, 8, v94
-; GFX11-FAKE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo
-; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB93_4
-; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-FAKE16-NEXT: s_and_b32 s5, s28, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s29, 8
-; GFX11-FAKE16-NEXT: s_and_b32 s7, s2, 0xff
-; GFX11-FAKE16-NEXT: s_or_b32 s5, s5, s6
-; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s1, 8
-; GFX11-FAKE16-NEXT: v_and_b32_e64 v5, 0xffff, s5
-; GFX11-FAKE16-NEXT: s_and_b32 s5, s0, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s3, 8
-; GFX11-FAKE16-NEXT: s_or_b32 s5, s5, s6
-; GFX11-FAKE16-NEXT: s_or_b32 s6, s7, s8
-; GFX11-FAKE16-NEXT: s_and_b32 s7, s16, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s17, 8
-; GFX11-FAKE16-NEXT: s_and_b32 s9, s18, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s19, 8
-; GFX11-FAKE16-NEXT: s_or_b32 s7, s7, s8
-; GFX11-FAKE16-NEXT: s_or_b32 s8, s9, s10
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s5, s6
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s7, s8
-; GFX11-FAKE16-NEXT: s_and_b32 s7, s20, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s21, 8
-; GFX11-FAKE16-NEXT: s_and_b32 s9, s22, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s23, 8
-; GFX11-FAKE16-NEXT: s_or_b32 s7, s7, s8
-; GFX11-FAKE16-NEXT: s_or_b32 s8, s9, s10
-; GFX11-FAKE16-NEXT: s_and_b32 s9, s24, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s25, 8
-; GFX11-FAKE16-NEXT: s_and_b32 s11, s26, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s12, s27, 8
-; GFX11-FAKE16-NEXT: s_or_b32 s9, s9, s10
-; GFX11-FAKE16-NEXT: s_or_b32 s10, s11, s12
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s7, s8
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s9, s10
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v36
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v32
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v35
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v34
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v33
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v68
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v64
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v66
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v4, v67
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v65
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v4, v0, 16, v5
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v38
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v39
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v6, v6, 16, v2
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v49
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v5, v3, 16, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v37
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v70
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v50
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v71
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v48
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v69
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v82
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v7, v80
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v8, v81
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v7, v1, 16, v0
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v53
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v8, v9, 16, v2
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v55
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v51
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v9, v10, 16, v3
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v84
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v52
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v54
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v86
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v83
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v96
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v85
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v10, v97
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v10, v1, 16, v0
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v11, v87
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v99
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff, v12
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v11, v3, 16, v2
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v103
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v114
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v98
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v12, v0, 16, v12
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v100
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v113
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v101
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v116
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v14, v128
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v112
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v117
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v102
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v13, v130
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v133
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v14, v132
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v14, v0, 16, v3
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v148
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v119
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xff, v129
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v16, v161
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v13
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v13, v2, 16, v1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v166
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v144
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v15, v134
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v18, v147
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xffff, v16
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v167
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff, v15
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v17, 16, v19
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v17, v18, 16, v22
-; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, s7
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v1, 16, v0
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v151
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v149
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v16, v20, 16, v21
-; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, s8
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v180
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v177
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v1, 16, v0
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v165
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v162
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v42
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v41
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v1, 16, v0
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v179
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v115
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v45
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v44
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v1, 16, v0
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v131
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v118
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v59
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v56
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v1, 16, v0
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v145
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v135
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v60
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v61
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v1, 16, v0
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v150
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v146
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v63
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v62
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v24, v1, 16, v0
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v163
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v160
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v73
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v72
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v1, 16, v0
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v176
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v164
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v75
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v74
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v26, v1, 16, v0
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v181
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v178
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v77
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v76
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v27, v1, 16, v0
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v183
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v182
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v78
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v79
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v28, v1, 16, v0
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v43
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v40
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v89
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v88
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v29, v1, 16, v0
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v47
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v46
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v91
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v90
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v30, v1, 16, v0
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v58
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v57
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v92
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v93
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v31, v1, 16, v0
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, s6
-; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
-; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB93_3
-; GFX11-FAKE16-NEXT: .LBB93_2: ; %cmp.true
-; GFX11-FAKE16-NEXT: s_add_i32 s28, s28, 3
-; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s29, 8
-; GFX11-FAKE16-NEXT: s_and_b32 s4, s28, 0xff
-; GFX11-FAKE16-NEXT: s_add_i32 s24, s24, 3
-; GFX11-FAKE16-NEXT: s_or_b32 s4, s5, s4
-; GFX11-FAKE16-NEXT: s_and_b32 s5, s24, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s25, 8
-; GFX11-FAKE16-NEXT: s_add_i32 s26, s26, 3
-; GFX11-FAKE16-NEXT: s_or_b32 s5, s6, s5
-; GFX11-FAKE16-NEXT: s_and_b32 s6, s26, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s27, 8
-; GFX11-FAKE16-NEXT: s_add_i32 s20, s20, 3
-; GFX11-FAKE16-NEXT: s_or_b32 s6, s7, s6
-; GFX11-FAKE16-NEXT: s_and_b32 s7, s20, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s21, 8
-; GFX11-FAKE16-NEXT: s_add_i32 s22, s22, 3
-; GFX11-FAKE16-NEXT: s_or_b32 s7, s8, s7
-; GFX11-FAKE16-NEXT: s_and_b32 s8, s22, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s9, s23, 8
-; GFX11-FAKE16-NEXT: s_add_i32 s16, s16, 3
-; GFX11-FAKE16-NEXT: s_or_b32 s8, s9, s8
-; GFX11-FAKE16-NEXT: s_and_b32 s9, s16, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s17, 8
-; GFX11-FAKE16-NEXT: s_add_i32 s18, s18, 3
-; GFX11-FAKE16-NEXT: s_add_i32 s0, s0, 3
-; GFX11-FAKE16-NEXT: s_add_i32 s2, s2, 3
-; GFX11-FAKE16-NEXT: s_or_b32 s9, s10, s9
-; GFX11-FAKE16-NEXT: s_and_b32 s10, s18, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s11, s19, 8
-; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 8
-; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 8
-; GFX11-FAKE16-NEXT: s_or_b32 s10, s11, s10
-; GFX11-FAKE16-NEXT: s_or_b32 s0, s1, s0
-; GFX11-FAKE16-NEXT: s_or_b32 s1, s3, s2
-; GFX11-FAKE16-NEXT: s_addk_i32 s9, 0x300
-; GFX11-FAKE16-NEXT: s_addk_i32 s0, 0x300
-; GFX11-FAKE16-NEXT: s_addk_i32 s1, 0x300
-; GFX11-FAKE16-NEXT: s_addk_i32 s10, 0x300
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s1
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s9, s10
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(37)
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v58
-; GFX11-FAKE16-NEXT: s_addk_i32 s5, 0x300
-; GFX11-FAKE16-NEXT: s_addk_i32 s6, 0x300
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v57
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s5, s6
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(35)
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v47
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-FAKE16-NEXT: s_addk_i32 s7, 0x300
-; GFX11-FAKE16-NEXT: s_addk_i32 s8, 0x300
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v92, v0
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s7, s8
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v46
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v93, v1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v91, v2
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(33)
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v43
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v27, 0x300, v0
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v40
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v31, 0x300, v1
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v28, 0x300, v2
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v4
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v90, v3
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(31)
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v183
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v182
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v89, v1
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v30, 0x300, v3
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v4
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(29)
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v181
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v88, v0
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v181, 0x300, v1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v78, v2
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v79, v3
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v4
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v29, 0x300, v0
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v178
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v182, 0x300, v1
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v178, 0x300, v2
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v77, v3
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(27)
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v176
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v164
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(25)
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v163
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v163, 0x300, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v2
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v76, v0
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v3
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v4
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v160
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v75, v1
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v160, 0x300, v0
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v74, v2
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v73, v3
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v4
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v22, 0x300, v1
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(23)
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v150
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v26, 0x300, v0
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v23, 0x300, v2
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v72, v3
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v146
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(21)
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v145
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v135
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v25, 0x300, v0
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v2
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v63, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v3
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v4
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(19)
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v131
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v62, v0
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v131, 0x300, v1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v60, v2
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v61, v3
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v4
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v24, 0x300, v0
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v118
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v135, 0x300, v1
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v118, 0x300, v2
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v59, v3
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(17)
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v179
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v115
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(15)
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v165
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v115, 0x300, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v2
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v56, v0
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v3
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v4
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v162
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v45, v1
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v145, 0x300, v0
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v44, v2
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v42, v3
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v4
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 0x300, v1
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(13)
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v151
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v21, 0x300, v0
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v18, 0x300, v2
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v41, v3
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v149
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(11)
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v148
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v144
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v20, 0x300, v0
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v2
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v180, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v3
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v4
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(9)
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v133
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v177, v0
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v19, 0x300, v1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v166, v2
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v167, v3
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v4
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v133, 0x300, v0
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v129
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v129, 0x300, v1
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v144, 0x300, v2
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v161, v3
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(7)
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v119
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v117
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(5)
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v116
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v116, 0x300, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v2
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v147, v0
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v3
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v4
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v114
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1)
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 3, v99
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v114, 0x300, v0
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v132, v2
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v130, v3
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v4
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v103
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 3, v98
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v5
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 3, v54
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 3, v53
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v6
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, 3, v39
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v33, 3, v33
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v16, 0x300, v0
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v113, v4
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v2
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v128, v3
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v100
-; GFX11-FAKE16-NEXT: s_addk_i32 s4, 0x300
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, 0x300, v4
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v101, v5
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v102, v6
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v7
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 3, v96
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v134, v1
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, 0x300, v4
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v13, 0x300, v5
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v97, v6
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 3, v55
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v7
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 3, v52
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v14, 0x300, v4
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v5
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v87, v6
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v7
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v8
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 3, v51
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v86, v4
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v15, 0x300, v5
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v85, v6
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v84, v7
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v8
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v51, 0x300, v4
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v50
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v50, 0x300, v5
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 3, v49
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v83, v7
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 3, v48
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 0x300, v6
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, 0x300, v5
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v7
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v82, v4
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v8
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v9
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, 3, v38
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v81, v5
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v38, 0x300, v4
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v71, v7
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v80, v8
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v9
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, 0x300, v5
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 3, v37
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v37, 0x300, v4
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v39, 0x300, v7
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v70, v8
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 3, v36
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v5
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 3, v34
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, 3, v35
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v35, 0x300, v4
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v7
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v69, v5
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v8
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v34
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v112, v3
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v68, v4
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, 0x300, v5
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v67, v7
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v66, v8
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v33
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v33, 0x300, v4
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v32
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v32, 0x300, v5
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 0x300, v7
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v65, v8
-; GFX11-FAKE16-NEXT: v_and_b32_e64 v8, 0xffff, s4
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xffff, v6
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 0x300, v7
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v64, v4
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xffff, v14
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v10, v10, 16, v36
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v5, v7, 16, v5
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v33
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x300, v4
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xffff, v37
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xffff, v22
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v6, v32, 16, v7
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v4, v4, 16, v8
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff, v35
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xffff, v51
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xffff, v38
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v19
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v7, v34, 16, v8
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v8, v39, 16, v33
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xffff, v12
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xffff, v11
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v11, v50, 16, v32
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xffff, v1
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v9, v9, 16, v35
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v12, v15, 16, v14
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v13, v13, 16, v33
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v14, v3, 16, v34
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v16, v16, 16, v32
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xffff, v116
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xffff, v129
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xffff, v18
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xffff, v17
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v26, v26, 16, v36
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v17, v114, 16, v32
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v144, 16, v33
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v20, 16, v34
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v21, 16, v35
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xffff, v115
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xffff, v135
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xffff, v131
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xffff, v23
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xffff, v27
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v145, 16, v32
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v118, 16, v33
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v24, v24, 16, v34
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v25, 16, v35
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xffff, v163
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xffff, v182
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xffff, v181
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xffff, v28
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v2, 16, v0
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v133, 16, v19
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v27, v160, 16, v32
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v28, v178, 16, v33
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v29, v29, 16, v34
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v30, v30, 16, v35
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v31, v31, 16, v36
-; GFX11-FAKE16-NEXT: .LBB93_3: ; %end
-; GFX11-FAKE16-NEXT: s_clause 0x1e
-; GFX11-FAKE16-NEXT: scratch_load_b32 v94, off, s32 offset:320
-; GFX11-FAKE16-NEXT: scratch_load_b32 v93, off, s32 offset:324
-; GFX11-FAKE16-NEXT: scratch_load_b32 v92, off, s32 offset:328
-; GFX11-FAKE16-NEXT: scratch_load_b32 v91, off, s32 offset:332
-; GFX11-FAKE16-NEXT: scratch_load_b32 v90, off, s32 offset:336
-; GFX11-FAKE16-NEXT: scratch_load_b32 v89, off, s32 offset:340
-; GFX11-FAKE16-NEXT: scratch_load_b32 v88, off, s32 offset:344
-; GFX11-FAKE16-NEXT: scratch_load_b32 v79, off, s32 offset:348
-; GFX11-FAKE16-NEXT: scratch_load_b32 v78, off, s32 offset:352
-; GFX11-FAKE16-NEXT: scratch_load_b32 v77, off, s32 offset:356
-; GFX11-FAKE16-NEXT: scratch_load_b32 v76, off, s32 offset:360
-; GFX11-FAKE16-NEXT: scratch_load_b32 v75, off, s32 offset:364
-; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:368
-; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:372
-; GFX11-FAKE16-NEXT: scratch_load_b32 v72, off, s32 offset:376
-; GFX11-FAKE16-NEXT: scratch_load_b32 v63, off, s32 offset:380
-; GFX11-FAKE16-NEXT: scratch_load_b32 v62, off, s32 offset:384
-; GFX11-FAKE16-NEXT: scratch_load_b32 v61, off, s32 offset:388
-; GFX11-FAKE16-NEXT: scratch_load_b32 v60, off, s32 offset:392
-; GFX11-FAKE16-NEXT: scratch_load_b32 v59, off, s32 offset:396
-; GFX11-FAKE16-NEXT: scratch_load_b32 v58, off, s32 offset:400
-; GFX11-FAKE16-NEXT: scratch_load_b32 v57, off, s32 offset:404
-; GFX11-FAKE16-NEXT: scratch_load_b32 v56, off, s32 offset:408
-; GFX11-FAKE16-NEXT: scratch_load_b32 v47, off, s32 offset:412
-; GFX11-FAKE16-NEXT: scratch_load_b32 v46, off, s32 offset:416
-; GFX11-FAKE16-NEXT: scratch_load_b32 v45, off, s32 offset:420
-; GFX11-FAKE16-NEXT: scratch_load_b32 v44, off, s32 offset:424
-; GFX11-FAKE16-NEXT: scratch_load_b32 v43, off, s32 offset:428
-; GFX11-FAKE16-NEXT: scratch_load_b32 v42, off, s32 offset:432
-; GFX11-FAKE16-NEXT: scratch_load_b32 v41, off, s32 offset:436
-; GFX11-FAKE16-NEXT: scratch_load_b32 v40, off, s32 offset:440
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
-; GFX11-FAKE16-NEXT: .LBB93_4:
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
-; GFX11-FAKE16-NEXT: s_branch .LBB93_2
+; GFX11-LABEL: bitcast_v128i8_to_v64f16_scalar:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_clause 0x1e
+; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:440
+; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:436
+; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:432
+; GFX11-NEXT: scratch_store_b32 off, v43, s32 offset:428
+; GFX11-NEXT: scratch_store_b32 off, v44, s32 offset:424
+; GFX11-NEXT: scratch_store_b32 off, v45, s32 offset:420
+; GFX11-NEXT: scratch_store_b32 off, v46, s32 offset:416
+; GFX11-NEXT: scratch_store_b32 off, v47, s32 offset:412
+; GFX11-NEXT: scratch_store_b32 off, v56, s32 offset:408
+; GFX11-NEXT: scratch_store_b32 off, v57, s32 offset:404
+; GFX11-NEXT: scratch_store_b32 off, v58, s32 offset:400
+; GFX11-NEXT: scratch_store_b32 off, v59, s32 offset:396
+; GFX11-NEXT: scratch_store_b32 off, v60, s32 offset:392
+; GFX11-NEXT: scratch_store_b32 off, v61, s32 offset:388
+; GFX11-NEXT: scratch_store_b32 off, v62, s32 offset:384
+; GFX11-NEXT: scratch_store_b32 off, v63, s32 offset:380
+; GFX11-NEXT: scratch_store_b32 off, v72, s32 offset:376
+; GFX11-NEXT: scratch_store_b32 off, v73, s32 offset:372
+; GFX11-NEXT: scratch_store_b32 off, v74, s32 offset:368
+; GFX11-NEXT: scratch_store_b32 off, v75, s32 offset:364
+; GFX11-NEXT: scratch_store_b32 off, v76, s32 offset:360
+; GFX11-NEXT: scratch_store_b32 off, v77, s32 offset:356
+; GFX11-NEXT: scratch_store_b32 off, v78, s32 offset:352
+; GFX11-NEXT: scratch_store_b32 off, v79, s32 offset:348
+; GFX11-NEXT: scratch_store_b32 off, v88, s32 offset:344
+; GFX11-NEXT: scratch_store_b32 off, v89, s32 offset:340
+; GFX11-NEXT: scratch_store_b32 off, v90, s32 offset:336
+; GFX11-NEXT: scratch_store_b32 off, v91, s32 offset:332
+; GFX11-NEXT: scratch_store_b32 off, v92, s32 offset:328
+; GFX11-NEXT: scratch_store_b32 off, v93, s32 offset:324
+; GFX11-NEXT: scratch_store_b32 off, v94, s32 offset:320
+; GFX11-NEXT: v_dual_mov_b32 v54, v30 :: v_dual_mov_b32 v51, v24
+; GFX11-NEXT: v_dual_mov_b32 v52, v28 :: v_dual_mov_b32 v55, v26
+; GFX11-NEXT: v_dual_mov_b32 v53, v22 :: v_dual_mov_b32 v48, v20
+; GFX11-NEXT: v_dual_mov_b32 v50, v18 :: v_dual_mov_b32 v39, v16
+; GFX11-NEXT: v_dual_mov_b32 v49, v14 :: v_dual_mov_b32 v34, v8
+; GFX11-NEXT: v_dual_mov_b32 v37, v12 :: v_dual_mov_b32 v38, v10
+; GFX11-NEXT: v_dual_mov_b32 v36, v6 :: v_dual_mov_b32 v33, v4
+; GFX11-NEXT: v_dual_mov_b32 v35, v2 :: v_dual_mov_b32 v32, v0
+; GFX11-NEXT: s_clause 0x1f
+; GFX11-NEXT: scratch_load_b32 v0, off, s32 offset:316
+; GFX11-NEXT: scratch_load_u16 v2, off, s32
+; GFX11-NEXT: scratch_load_u16 v4, off, s32 offset:8
+; GFX11-NEXT: scratch_load_u16 v6, off, s32 offset:16
+; GFX11-NEXT: scratch_load_u16 v8, off, s32 offset:24
+; GFX11-NEXT: scratch_load_u16 v10, off, s32 offset:32
+; GFX11-NEXT: scratch_load_u16 v12, off, s32 offset:40
+; GFX11-NEXT: scratch_load_u16 v14, off, s32 offset:48
+; GFX11-NEXT: scratch_load_u16 v16, off, s32 offset:56
+; GFX11-NEXT: scratch_load_u16 v18, off, s32 offset:64
+; GFX11-NEXT: scratch_load_u16 v20, off, s32 offset:72
+; GFX11-NEXT: scratch_load_u16 v22, off, s32 offset:80
+; GFX11-NEXT: scratch_load_u16 v24, off, s32 offset:88
+; GFX11-NEXT: scratch_load_u16 v26, off, s32 offset:96
+; GFX11-NEXT: scratch_load_u16 v28, off, s32 offset:104
+; GFX11-NEXT: scratch_load_u16 v30, off, s32 offset:112
+; GFX11-NEXT: scratch_load_u16 v31, off, s32 offset:120
+; GFX11-NEXT: scratch_load_u16 v41, off, s32 offset:128
+; GFX11-NEXT: scratch_load_u16 v44, off, s32 offset:136
+; GFX11-NEXT: scratch_load_u16 v45, off, s32 offset:144
+; GFX11-NEXT: scratch_load_u16 v56, off, s32 offset:152
+; GFX11-NEXT: scratch_load_u16 v59, off, s32 offset:160
+; GFX11-NEXT: scratch_load_u16 v60, off, s32 offset:168
+; GFX11-NEXT: scratch_load_u16 v61, off, s32 offset:176
+; GFX11-NEXT: scratch_load_u16 v62, off, s32 offset:184
+; GFX11-NEXT: scratch_load_u16 v63, off, s32 offset:192
+; GFX11-NEXT: scratch_load_u16 v72, off, s32 offset:200
+; GFX11-NEXT: scratch_load_u16 v73, off, s32 offset:208
+; GFX11-NEXT: scratch_load_u16 v74, off, s32 offset:216
+; GFX11-NEXT: scratch_load_u16 v75, off, s32 offset:224
+; GFX11-NEXT: scratch_load_u16 v76, off, s32 offset:232
+; GFX11-NEXT: scratch_load_u16 v77, off, s32 offset:240
+; GFX11-NEXT: s_clause 0x1f
+; GFX11-NEXT: scratch_load_u16 v78, off, s32 offset:248
+; GFX11-NEXT: scratch_load_u16 v79, off, s32 offset:256
+; GFX11-NEXT: scratch_load_u16 v88, off, s32 offset:264
+; GFX11-NEXT: scratch_load_u16 v89, off, s32 offset:272
+; GFX11-NEXT: scratch_load_u16 v90, off, s32 offset:280
+; GFX11-NEXT: scratch_load_u16 v91, off, s32 offset:288
+; GFX11-NEXT: scratch_load_u16 v92, off, s32 offset:296
+; GFX11-NEXT: scratch_load_u16 v93, off, s32 offset:304
+; GFX11-NEXT: scratch_load_u16 v94, off, s32 offset:312
+; GFX11-NEXT: scratch_load_u16 v57, off, s32 offset:308
+; GFX11-NEXT: scratch_load_u16 v58, off, s32 offset:300
+; GFX11-NEXT: scratch_load_u16 v46, off, s32 offset:292
+; GFX11-NEXT: scratch_load_u16 v47, off, s32 offset:284
+; GFX11-NEXT: scratch_load_u16 v40, off, s32 offset:276
+; GFX11-NEXT: scratch_load_u16 v43, off, s32 offset:268
+; GFX11-NEXT: scratch_load_u16 v182, off, s32 offset:260
+; GFX11-NEXT: scratch_load_u16 v183, off, s32 offset:252
+; GFX11-NEXT: scratch_load_u16 v178, off, s32 offset:244
+; GFX11-NEXT: scratch_load_u16 v181, off, s32 offset:236
+; GFX11-NEXT: scratch_load_u16 v164, off, s32 offset:228
+; GFX11-NEXT: scratch_load_u16 v176, off, s32 offset:220
+; GFX11-NEXT: scratch_load_u16 v160, off, s32 offset:212
+; GFX11-NEXT: scratch_load_u16 v163, off, s32 offset:204
+; GFX11-NEXT: scratch_load_u16 v146, off, s32 offset:196
+; GFX11-NEXT: scratch_load_u16 v150, off, s32 offset:188
+; GFX11-NEXT: scratch_load_u16 v135, off, s32 offset:180
+; GFX11-NEXT: scratch_load_u16 v145, off, s32 offset:172
+; GFX11-NEXT: scratch_load_u16 v118, off, s32 offset:164
+; GFX11-NEXT: scratch_load_u16 v131, off, s32 offset:156
+; GFX11-NEXT: scratch_load_u16 v115, off, s32 offset:148
+; GFX11-NEXT: scratch_load_u16 v179, off, s32 offset:140
+; GFX11-NEXT: scratch_load_u16 v162, off, s32 offset:132
+; GFX11-NEXT: s_clause 0xf
+; GFX11-NEXT: scratch_load_u16 v165, off, s32 offset:124
+; GFX11-NEXT: scratch_load_u16 v149, off, s32 offset:116
+; GFX11-NEXT: scratch_load_u16 v151, off, s32 offset:108
+; GFX11-NEXT: scratch_load_u16 v144, off, s32 offset:100
+; GFX11-NEXT: scratch_load_u16 v148, off, s32 offset:92
+; GFX11-NEXT: scratch_load_u16 v129, off, s32 offset:84
+; GFX11-NEXT: scratch_load_u16 v133, off, s32 offset:76
+; GFX11-NEXT: scratch_load_u16 v117, off, s32 offset:68
+; GFX11-NEXT: scratch_load_u16 v119, off, s32 offset:60
+; GFX11-NEXT: scratch_load_u16 v114, off, s32 offset:52
+; GFX11-NEXT: scratch_load_u16 v116, off, s32 offset:44
+; GFX11-NEXT: scratch_load_u16 v100, off, s32 offset:36
+; GFX11-NEXT: scratch_load_u16 v103, off, s32 offset:28
+; GFX11-NEXT: scratch_load_u16 v98, off, s32 offset:20
+; GFX11-NEXT: scratch_load_u16 v99, off, s32 offset:12
+; GFX11-NEXT: scratch_load_u16 v96, off, s32 offset:4
+; GFX11-NEXT: v_lshlrev_b32_e32 v64, 8, v1
+; GFX11-NEXT: v_lshlrev_b32_e32 v66, 8, v3
+; GFX11-NEXT: v_lshlrev_b32_e32 v65, 8, v5
+; GFX11-NEXT: v_lshlrev_b32_e32 v68, 8, v7
+; GFX11-NEXT: v_lshlrev_b32_e32 v67, 8, v9
+; GFX11-NEXT: v_lshlrev_b32_e32 v70, 8, v11
+; GFX11-NEXT: v_lshlrev_b32_e32 v69, 8, v13
+; GFX11-NEXT: v_lshlrev_b32_e32 v71, 8, v15
+; GFX11-NEXT: v_lshlrev_b32_e32 v80, 8, v17
+; GFX11-NEXT: v_lshlrev_b32_e32 v82, 8, v19
+; GFX11-NEXT: v_lshlrev_b32_e32 v81, 8, v21
+; GFX11-NEXT: v_lshlrev_b32_e32 v84, 8, v23
+; GFX11-NEXT: v_lshlrev_b32_e32 v83, 8, v25
+; GFX11-NEXT: v_lshlrev_b32_e32 v86, 8, v27
+; GFX11-NEXT: v_lshlrev_b32_e32 v85, 8, v29
+; GFX11-NEXT: s_mov_b32 s4, 0
+; GFX11-NEXT: s_waitcnt vmcnt(62)
+; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v97, 8, v2
+; GFX11-NEXT: v_lshlrev_b32_e32 v87, 8, v4
+; GFX11-NEXT: v_lshlrev_b32_e32 v101, 8, v6
+; GFX11-NEXT: v_lshlrev_b32_e32 v102, 8, v8
+; GFX11-NEXT: v_lshlrev_b32_e32 v113, 8, v10
+; GFX11-NEXT: v_lshlrev_b32_e32 v112, 8, v12
+; GFX11-NEXT: v_lshlrev_b32_e32 v130, 8, v14
+; GFX11-NEXT: v_lshlrev_b32_e32 v128, 8, v16
+; GFX11-NEXT: v_lshlrev_b32_e32 v134, 8, v18
+; GFX11-NEXT: v_lshlrev_b32_e32 v132, 8, v20
+; GFX11-NEXT: v_lshlrev_b32_e32 v161, 8, v22
+; GFX11-NEXT: v_lshlrev_b32_e32 v147, 8, v24
+; GFX11-NEXT: v_lshlrev_b32_e32 v166, 8, v26
+; GFX11-NEXT: v_lshlrev_b32_e32 v167, 8, v28
+; GFX11-NEXT: v_lshlrev_b32_e32 v180, 8, v30
+; GFX11-NEXT: v_lshlrev_b32_e32 v177, 8, v31
+; GFX11-NEXT: v_lshlrev_b32_e32 v42, 8, v41
+; GFX11-NEXT: s_waitcnt vmcnt(61)
+; GFX11-NEXT: v_lshlrev_b32_e32 v41, 8, v44
+; GFX11-NEXT: s_waitcnt vmcnt(60)
+; GFX11-NEXT: v_lshlrev_b32_e32 v45, 8, v45
+; GFX11-NEXT: s_waitcnt vmcnt(59)
+; GFX11-NEXT: v_lshlrev_b32_e32 v44, 8, v56
+; GFX11-NEXT: s_waitcnt vmcnt(58)
+; GFX11-NEXT: v_lshlrev_b32_e32 v59, 8, v59
+; GFX11-NEXT: s_waitcnt vmcnt(57)
+; GFX11-NEXT: v_lshlrev_b32_e32 v56, 8, v60
+; GFX11-NEXT: s_waitcnt vmcnt(56)
+; GFX11-NEXT: v_lshlrev_b32_e32 v60, 8, v61
+; GFX11-NEXT: s_waitcnt vmcnt(55)
+; GFX11-NEXT: v_lshlrev_b32_e32 v61, 8, v62
+; GFX11-NEXT: s_waitcnt vmcnt(54)
+; GFX11-NEXT: v_lshlrev_b32_e32 v63, 8, v63
+; GFX11-NEXT: s_waitcnt vmcnt(53)
+; GFX11-NEXT: v_lshlrev_b32_e32 v62, 8, v72
+; GFX11-NEXT: s_waitcnt vmcnt(52)
+; GFX11-NEXT: v_lshlrev_b32_e32 v73, 8, v73
+; GFX11-NEXT: s_waitcnt vmcnt(51)
+; GFX11-NEXT: v_lshlrev_b32_e32 v72, 8, v74
+; GFX11-NEXT: s_waitcnt vmcnt(50)
+; GFX11-NEXT: v_lshlrev_b32_e32 v75, 8, v75
+; GFX11-NEXT: s_waitcnt vmcnt(49)
+; GFX11-NEXT: v_lshlrev_b32_e32 v74, 8, v76
+; GFX11-NEXT: s_waitcnt vmcnt(48)
+; GFX11-NEXT: v_lshlrev_b32_e32 v77, 8, v77
+; GFX11-NEXT: s_waitcnt vmcnt(47)
+; GFX11-NEXT: v_lshlrev_b32_e32 v76, 8, v78
+; GFX11-NEXT: s_waitcnt vmcnt(46)
+; GFX11-NEXT: v_lshlrev_b32_e32 v78, 8, v79
+; GFX11-NEXT: s_waitcnt vmcnt(45)
+; GFX11-NEXT: v_lshlrev_b32_e32 v79, 8, v88
+; GFX11-NEXT: s_waitcnt vmcnt(44)
+; GFX11-NEXT: v_lshlrev_b32_e32 v89, 8, v89
+; GFX11-NEXT: s_waitcnt vmcnt(43)
+; GFX11-NEXT: v_lshlrev_b32_e32 v88, 8, v90
+; GFX11-NEXT: s_waitcnt vmcnt(42)
+; GFX11-NEXT: v_lshlrev_b32_e32 v91, 8, v91
+; GFX11-NEXT: s_waitcnt vmcnt(41)
+; GFX11-NEXT: v_lshlrev_b32_e32 v90, 8, v92
+; GFX11-NEXT: s_waitcnt vmcnt(40)
+; GFX11-NEXT: v_lshlrev_b32_e32 v92, 8, v93
+; GFX11-NEXT: s_waitcnt vmcnt(39)
+; GFX11-NEXT: v_lshlrev_b32_e32 v93, 8, v94
+; GFX11-NEXT: s_and_b32 s5, vcc_lo, exec_lo
+; GFX11-NEXT: s_cbranch_scc0 .LBB93_4
+; GFX11-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-NEXT: s_and_b32 s5, s28, 0xff
+; GFX11-NEXT: s_lshl_b32 s6, s29, 8
+; GFX11-NEXT: s_and_b32 s7, s2, 0xff
+; GFX11-NEXT: s_or_b32 s5, s5, s6
+; GFX11-NEXT: s_lshl_b32 s6, s1, 8
+; GFX11-NEXT: v_and_b32_e64 v5, 0xffff, s5
+; GFX11-NEXT: s_and_b32 s5, s0, 0xff
+; GFX11-NEXT: s_lshl_b32 s8, s3, 8
+; GFX11-NEXT: s_or_b32 s5, s5, s6
+; GFX11-NEXT: s_or_b32 s6, s7, s8
+; GFX11-NEXT: s_and_b32 s7, s16, 0xff
+; GFX11-NEXT: s_lshl_b32 s8, s17, 8
+; GFX11-NEXT: s_and_b32 s9, s18, 0xff
+; GFX11-NEXT: s_lshl_b32 s10, s19, 8
+; GFX11-NEXT: s_or_b32 s7, s7, s8
+; GFX11-NEXT: s_or_b32 s8, s9, s10
+; GFX11-NEXT: s_pack_ll_b32_b16 s5, s5, s6
+; GFX11-NEXT: s_pack_ll_b32_b16 s6, s7, s8
+; GFX11-NEXT: s_and_b32 s7, s20, 0xff
+; GFX11-NEXT: s_lshl_b32 s8, s21, 8
+; GFX11-NEXT: s_and_b32 s9, s22, 0xff
+; GFX11-NEXT: s_lshl_b32 s10, s23, 8
+; GFX11-NEXT: s_or_b32 s7, s7, s8
+; GFX11-NEXT: s_or_b32 s8, s9, s10
+; GFX11-NEXT: s_and_b32 s9, s24, 0xff
+; GFX11-NEXT: s_lshl_b32 s10, s25, 8
+; GFX11-NEXT: s_and_b32 s11, s26, 0xff
+; GFX11-NEXT: s_lshl_b32 s12, s27, 8
+; GFX11-NEXT: s_or_b32 s9, s9, s10
+; GFX11-NEXT: s_or_b32 s10, s11, s12
+; GFX11-NEXT: s_pack_ll_b32_b16 s7, s7, s8
+; GFX11-NEXT: s_pack_ll_b32_b16 s8, s9, s10
+; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v36
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v32
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v35
+; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v34
+; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v33
+; GFX11-NEXT: v_or_b32_e32 v2, v2, v68
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v64
+; GFX11-NEXT: v_or_b32_e32 v1, v1, v66
+; GFX11-NEXT: v_or_b32_e32 v6, v4, v67
+; GFX11-NEXT: v_or_b32_e32 v3, v3, v65
+; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX11-NEXT: v_lshl_or_b32 v4, v0, 16, v5
+; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v38
+; GFX11-NEXT: v_and_b32_e32 v7, 0xff, v39
+; GFX11-NEXT: v_lshl_or_b32 v6, v6, 16, v2
+; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v49
+; GFX11-NEXT: v_lshl_or_b32 v5, v3, 16, v1
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v37
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v70
+; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v50
+; GFX11-NEXT: v_or_b32_e32 v2, v2, v71
+; GFX11-NEXT: v_and_b32_e32 v8, 0xff, v48
+; GFX11-NEXT: v_or_b32_e32 v1, v1, v69
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT: v_or_b32_e32 v3, v3, v82
+; GFX11-NEXT: v_or_b32_e32 v9, v7, v80
+; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX11-NEXT: v_or_b32_e32 v10, v8, v81
+; GFX11-NEXT: v_lshl_or_b32 v7, v1, 16, v0
+; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v53
+; GFX11-NEXT: v_lshl_or_b32 v8, v9, 16, v2
+; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v55
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v51
+; GFX11-NEXT: v_lshl_or_b32 v9, v10, 16, v3
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v84
+; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v52
+; GFX11-NEXT: v_and_b32_e32 v10, 0xff, v54
+; GFX11-NEXT: v_or_b32_e32 v2, v2, v86
+; GFX11-NEXT: v_or_b32_e32 v1, v1, v83
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_and_b32_e32 v11, 0xff, v96
+; GFX11-NEXT: v_or_b32_e32 v3, v3, v85
+; GFX11-NEXT: v_or_b32_e32 v12, v10, v97
+; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX11-NEXT: v_lshl_or_b32 v10, v1, 16, v0
+; GFX11-NEXT: v_or_b32_e32 v0, v11, v87
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v99
+; GFX11-NEXT: v_and_b32_e32 v12, 0xffff, v12
+; GFX11-NEXT: v_lshl_or_b32 v11, v3, 16, v2
+; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v103
+; GFX11-NEXT: v_and_b32_e32 v14, 0xff, v114
+; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v98
+; GFX11-NEXT: v_lshl_or_b32 v12, v0, 16, v12
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v100
+; GFX11-NEXT: v_or_b32_e32 v3, v3, v113
+; GFX11-NEXT: v_or_b32_e32 v1, v1, v101
+; GFX11-NEXT: v_and_b32_e32 v13, 0xff, v116
+; GFX11-NEXT: v_or_b32_e32 v17, v14, v128
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v112
+; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX11-NEXT: v_and_b32_e32 v14, 0xff, v117
+; GFX11-NEXT: v_or_b32_e32 v2, v2, v102
+; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX11-NEXT: v_or_b32_e32 v13, v13, v130
+; GFX11-NEXT: v_and_b32_e32 v16, 0xff, v133
+; GFX11-NEXT: v_or_b32_e32 v20, v14, v132
+; GFX11-NEXT: v_lshl_or_b32 v14, v0, 16, v3
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v148
+; GFX11-NEXT: v_and_b32_e32 v15, 0xff, v119
+; GFX11-NEXT: v_and_b32_e32 v18, 0xff, v129
+; GFX11-NEXT: v_or_b32_e32 v16, v16, v161
+; GFX11-NEXT: v_and_b32_e32 v19, 0xffff, v13
+; GFX11-NEXT: v_lshl_or_b32 v13, v2, 16, v1
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v166
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v144
+; GFX11-NEXT: v_or_b32_e32 v15, v15, v134
+; GFX11-NEXT: v_or_b32_e32 v18, v18, v147
+; GFX11-NEXT: v_and_b32_e32 v22, 0xffff, v16
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT: v_or_b32_e32 v1, v1, v167
+; GFX11-NEXT: v_and_b32_e32 v21, 0xffff, v15
+; GFX11-NEXT: v_lshl_or_b32 v15, v17, 16, v19
+; GFX11-NEXT: v_lshl_or_b32 v17, v18, 16, v22
+; GFX11-NEXT: v_mov_b32_e32 v2, s7
+; GFX11-NEXT: v_lshl_or_b32 v18, v1, 16, v0
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v151
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v149
+; GFX11-NEXT: v_lshl_or_b32 v16, v20, 16, v21
+; GFX11-NEXT: v_mov_b32_e32 v3, s8
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v180
+; GFX11-NEXT: v_or_b32_e32 v1, v1, v177
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT: v_lshl_or_b32 v19, v1, 16, v0
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v165
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v162
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v42
+; GFX11-NEXT: v_or_b32_e32 v1, v1, v41
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT: v_lshl_or_b32 v20, v1, 16, v0
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v179
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v115
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v45
+; GFX11-NEXT: v_or_b32_e32 v1, v1, v44
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT: v_lshl_or_b32 v21, v1, 16, v0
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v131
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v118
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v59
+; GFX11-NEXT: v_or_b32_e32 v1, v1, v56
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT: v_lshl_or_b32 v22, v1, 16, v0
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v145
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v135
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v60
+; GFX11-NEXT: v_or_b32_e32 v1, v1, v61
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT: v_lshl_or_b32 v23, v1, 16, v0
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v150
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v146
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v63
+; GFX11-NEXT: v_or_b32_e32 v1, v1, v62
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT: v_lshl_or_b32 v24, v1, 16, v0
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v163
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v160
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v73
+; GFX11-NEXT: v_or_b32_e32 v1, v1, v72
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT: v_lshl_or_b32 v25, v1, 16, v0
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v176
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v164
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v75
+; GFX11-NEXT: v_or_b32_e32 v1, v1, v74
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT: v_lshl_or_b32 v26, v1, 16, v0
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v181
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v178
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v77
+; GFX11-NEXT: v_or_b32_e32 v1, v1, v76
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT: v_lshl_or_b32 v27, v1, 16, v0
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v183
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v182
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v78
+; GFX11-NEXT: v_or_b32_e32 v1, v1, v79
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT: v_lshl_or_b32 v28, v1, 16, v0
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v43
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v40
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v89
+; GFX11-NEXT: v_or_b32_e32 v1, v1, v88
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT: v_lshl_or_b32 v29, v1, 16, v0
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v47
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v46
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v91
+; GFX11-NEXT: v_or_b32_e32 v1, v1, v90
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT: v_lshl_or_b32 v30, v1, 16, v0
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v58
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v57
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v92
+; GFX11-NEXT: v_or_b32_e32 v1, v1, v93
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT: v_lshl_or_b32 v31, v1, 16, v0
+; GFX11-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, s6
+; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
+; GFX11-NEXT: s_cbranch_vccnz .LBB93_3
+; GFX11-NEXT: .LBB93_2: ; %cmp.true
+; GFX11-NEXT: s_add_i32 s28, s28, 3
+; GFX11-NEXT: s_lshl_b32 s5, s29, 8
+; GFX11-NEXT: s_and_b32 s4, s28, 0xff
+; GFX11-NEXT: s_add_i32 s24, s24, 3
+; GFX11-NEXT: s_or_b32 s4, s5, s4
+; GFX11-NEXT: s_and_b32 s5, s24, 0xff
+; GFX11-NEXT: s_lshl_b32 s6, s25, 8
+; GFX11-NEXT: s_add_i32 s26, s26, 3
+; GFX11-NEXT: s_or_b32 s5, s6, s5
+; GFX11-NEXT: s_and_b32 s6, s26, 0xff
+; GFX11-NEXT: s_lshl_b32 s7, s27, 8
+; GFX11-NEXT: s_add_i32 s20, s20, 3
+; GFX11-NEXT: s_or_b32 s6, s7, s6
+; GFX11-NEXT: s_and_b32 s7, s20, 0xff
+; GFX11-NEXT: s_lshl_b32 s8, s21, 8
+; GFX11-NEXT: s_add_i32 s22, s22, 3
+; GFX11-NEXT: s_or_b32 s7, s8, s7
+; GFX11-NEXT: s_and_b32 s8, s22, 0xff
+; GFX11-NEXT: s_lshl_b32 s9, s23, 8
+; GFX11-NEXT: s_add_i32 s16, s16, 3
+; GFX11-NEXT: s_or_b32 s8, s9, s8
+; GFX11-NEXT: s_and_b32 s9, s16, 0xff
+; GFX11-NEXT: s_lshl_b32 s10, s17, 8
+; GFX11-NEXT: s_add_i32 s18, s18, 3
+; GFX11-NEXT: s_add_i32 s0, s0, 3
+; GFX11-NEXT: s_add_i32 s2, s2, 3
+; GFX11-NEXT: s_or_b32 s9, s10, s9
+; GFX11-NEXT: s_and_b32 s10, s18, 0xff
+; GFX11-NEXT: s_lshl_b32 s11, s19, 8
+; GFX11-NEXT: s_and_b32 s0, s0, 0xff
+; GFX11-NEXT: s_lshl_b32 s1, s1, 8
+; GFX11-NEXT: s_and_b32 s2, s2, 0xff
+; GFX11-NEXT: s_lshl_b32 s3, s3, 8
+; GFX11-NEXT: s_or_b32 s10, s11, s10
+; GFX11-NEXT: s_or_b32 s0, s1, s0
+; GFX11-NEXT: s_or_b32 s1, s3, s2
+; GFX11-NEXT: s_addk_i32 s9, 0x300
+; GFX11-NEXT: s_addk_i32 s0, 0x300
+; GFX11-NEXT: s_addk_i32 s1, 0x300
+; GFX11-NEXT: s_addk_i32 s10, 0x300
+; GFX11-NEXT: s_pack_ll_b32_b16 s0, s0, s1
+; GFX11-NEXT: s_pack_ll_b32_b16 s1, s9, s10
+; GFX11-NEXT: s_waitcnt vmcnt(37)
+; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v58
+; GFX11-NEXT: s_addk_i32 s5, 0x300
+; GFX11-NEXT: s_addk_i32 s6, 0x300
+; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v57
+; GFX11-NEXT: s_pack_ll_b32_b16 s3, s5, s6
+; GFX11-NEXT: s_waitcnt vmcnt(35)
+; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v47
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0
+; GFX11-NEXT: s_addk_i32 s7, 0x300
+; GFX11-NEXT: s_addk_i32 s8, 0x300
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1
+; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v2
+; GFX11-NEXT: v_or_b32_e32 v0, v92, v0
+; GFX11-NEXT: s_pack_ll_b32_b16 s2, s7, s8
+; GFX11-NEXT: v_add_nc_u32_e32 v3, 3, v46
+; GFX11-NEXT: v_or_b32_e32 v1, v93, v1
+; GFX11-NEXT: v_or_b32_e32 v2, v91, v2
+; GFX11-NEXT: s_waitcnt vmcnt(33)
+; GFX11-NEXT: v_add_nc_u32_e32 v4, 3, v43
+; GFX11-NEXT: v_add_nc_u32_e32 v27, 0x300, v0
+; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v3
+; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v40
+; GFX11-NEXT: v_add_nc_u32_e32 v31, 0x300, v1
+; GFX11-NEXT: v_add_nc_u32_e32 v28, 0x300, v2
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v4
+; GFX11-NEXT: v_or_b32_e32 v3, v90, v3
+; GFX11-NEXT: s_waitcnt vmcnt(31)
+; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v183
+; GFX11-NEXT: v_add_nc_u32_e32 v4, 3, v182
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0
+; GFX11-NEXT: v_or_b32_e32 v1, v89, v1
+; GFX11-NEXT: v_add_nc_u32_e32 v30, 0x300, v3
+; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v2
+; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v4
+; GFX11-NEXT: s_waitcnt vmcnt(29)
+; GFX11-NEXT: v_add_nc_u32_e32 v4, 3, v181
+; GFX11-NEXT: v_or_b32_e32 v0, v88, v0
+; GFX11-NEXT: v_add_nc_u32_e32 v181, 0x300, v1
+; GFX11-NEXT: v_or_b32_e32 v1, v78, v2
+; GFX11-NEXT: v_or_b32_e32 v2, v79, v3
+; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v4
+; GFX11-NEXT: v_add_nc_u32_e32 v29, 0x300, v0
+; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v178
+; GFX11-NEXT: v_add_nc_u32_e32 v182, 0x300, v1
+; GFX11-NEXT: v_add_nc_u32_e32 v178, 0x300, v2
+; GFX11-NEXT: v_or_b32_e32 v1, v77, v3
+; GFX11-NEXT: s_waitcnt vmcnt(27)
+; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v176
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0
+; GFX11-NEXT: v_add_nc_u32_e32 v3, 3, v164
+; GFX11-NEXT: s_waitcnt vmcnt(25)
+; GFX11-NEXT: v_add_nc_u32_e32 v4, 3, v163
+; GFX11-NEXT: v_add_nc_u32_e32 v163, 0x300, v1
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v2
+; GFX11-NEXT: v_or_b32_e32 v0, v76, v0
+; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v3
+; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v4
+; GFX11-NEXT: v_add_nc_u32_e32 v4, 3, v160
+; GFX11-NEXT: v_or_b32_e32 v1, v75, v1
+; GFX11-NEXT: v_add_nc_u32_e32 v160, 0x300, v0
+; GFX11-NEXT: v_or_b32_e32 v0, v74, v2
+; GFX11-NEXT: v_or_b32_e32 v2, v73, v3
+; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v4
+; GFX11-NEXT: v_add_nc_u32_e32 v22, 0x300, v1
+; GFX11-NEXT: s_waitcnt vmcnt(23)
+; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v150
+; GFX11-NEXT: v_add_nc_u32_e32 v26, 0x300, v0
+; GFX11-NEXT: v_add_nc_u32_e32 v23, 0x300, v2
+; GFX11-NEXT: v_or_b32_e32 v0, v72, v3
+; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v146
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1
+; GFX11-NEXT: s_waitcnt vmcnt(21)
+; GFX11-NEXT: v_add_nc_u32_e32 v3, 3, v145
+; GFX11-NEXT: v_add_nc_u32_e32 v4, 3, v135
+; GFX11-NEXT: v_add_nc_u32_e32 v25, 0x300, v0
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v2
+; GFX11-NEXT: v_or_b32_e32 v1, v63, v1
+; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v3
+; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v4
+; GFX11-NEXT: s_waitcnt vmcnt(19)
+; GFX11-NEXT: v_add_nc_u32_e32 v4, 3, v131
+; GFX11-NEXT: v_or_b32_e32 v0, v62, v0
+; GFX11-NEXT: v_add_nc_u32_e32 v131, 0x300, v1
+; GFX11-NEXT: v_or_b32_e32 v1, v60, v2
+; GFX11-NEXT: v_or_b32_e32 v2, v61, v3
+; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v4
+; GFX11-NEXT: v_add_nc_u32_e32 v24, 0x300, v0
+; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v118
+; GFX11-NEXT: v_add_nc_u32_e32 v135, 0x300, v1
+; GFX11-NEXT: v_add_nc_u32_e32 v118, 0x300, v2
+; GFX11-NEXT: v_or_b32_e32 v1, v59, v3
+; GFX11-NEXT: s_waitcnt vmcnt(17)
+; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v179
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0
+; GFX11-NEXT: v_add_nc_u32_e32 v3, 3, v115
+; GFX11-NEXT: s_waitcnt vmcnt(15)
+; GFX11-NEXT: v_add_nc_u32_e32 v4, 3, v165
+; GFX11-NEXT: v_add_nc_u32_e32 v115, 0x300, v1
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v2
+; GFX11-NEXT: v_or_b32_e32 v0, v56, v0
+; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v3
+; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v4
+; GFX11-NEXT: v_add_nc_u32_e32 v4, 3, v162
+; GFX11-NEXT: v_or_b32_e32 v1, v45, v1
+; GFX11-NEXT: v_add_nc_u32_e32 v145, 0x300, v0
+; GFX11-NEXT: v_or_b32_e32 v0, v44, v2
+; GFX11-NEXT: v_or_b32_e32 v2, v42, v3
+; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v4
+; GFX11-NEXT: v_add_nc_u32_e32 v17, 0x300, v1
+; GFX11-NEXT: s_waitcnt vmcnt(13)
+; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v151
+; GFX11-NEXT: v_add_nc_u32_e32 v21, 0x300, v0
+; GFX11-NEXT: v_add_nc_u32_e32 v18, 0x300, v2
+; GFX11-NEXT: v_or_b32_e32 v0, v41, v3
+; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v149
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1
+; GFX11-NEXT: s_waitcnt vmcnt(11)
+; GFX11-NEXT: v_add_nc_u32_e32 v3, 3, v148
+; GFX11-NEXT: v_add_nc_u32_e32 v4, 3, v144
+; GFX11-NEXT: v_add_nc_u32_e32 v20, 0x300, v0
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v2
+; GFX11-NEXT: v_or_b32_e32 v1, v180, v1
+; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v3
+; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v4
+; GFX11-NEXT: s_waitcnt vmcnt(9)
+; GFX11-NEXT: v_add_nc_u32_e32 v4, 3, v133
+; GFX11-NEXT: v_or_b32_e32 v0, v177, v0
+; GFX11-NEXT: v_add_nc_u32_e32 v19, 0x300, v1
+; GFX11-NEXT: v_or_b32_e32 v1, v166, v2
+; GFX11-NEXT: v_or_b32_e32 v2, v167, v3
+; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v4
+; GFX11-NEXT: v_add_nc_u32_e32 v133, 0x300, v0
+; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v129
+; GFX11-NEXT: v_add_nc_u32_e32 v129, 0x300, v1
+; GFX11-NEXT: v_add_nc_u32_e32 v144, 0x300, v2
+; GFX11-NEXT: v_or_b32_e32 v1, v161, v3
+; GFX11-NEXT: s_waitcnt vmcnt(7)
+; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v119
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0
+; GFX11-NEXT: v_add_nc_u32_e32 v3, 3, v117
+; GFX11-NEXT: s_waitcnt vmcnt(5)
+; GFX11-NEXT: v_add_nc_u32_e32 v4, 3, v116
+; GFX11-NEXT: v_add_nc_u32_e32 v116, 0x300, v1
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v2
+; GFX11-NEXT: v_or_b32_e32 v0, v147, v0
+; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v3
+; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v4
+; GFX11-NEXT: v_add_nc_u32_e32 v4, 3, v114
+; GFX11-NEXT: s_waitcnt vmcnt(1)
+; GFX11-NEXT: v_add_nc_u32_e32 v5, 3, v99
+; GFX11-NEXT: v_add_nc_u32_e32 v114, 0x300, v0
+; GFX11-NEXT: v_or_b32_e32 v0, v132, v2
+; GFX11-NEXT: v_or_b32_e32 v2, v130, v3
+; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v4
+; GFX11-NEXT: v_add_nc_u32_e32 v4, 3, v103
+; GFX11-NEXT: v_add_nc_u32_e32 v6, 3, v98
+; GFX11-NEXT: v_and_b32_e32 v5, 0xff, v5
+; GFX11-NEXT: v_add_nc_u32_e32 v7, 3, v54
+; GFX11-NEXT: v_add_nc_u32_e32 v8, 3, v53
+; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v4
+; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v6
+; GFX11-NEXT: v_add_nc_u32_e32 v9, 3, v39
+; GFX11-NEXT: v_add_nc_u32_e32 v33, 3, v33
+; GFX11-NEXT: v_add_nc_u32_e32 v16, 0x300, v0
+; GFX11-NEXT: v_or_b32_e32 v4, v113, v4
+; GFX11-NEXT: v_add_nc_u32_e32 v0, 0x300, v2
+; GFX11-NEXT: v_or_b32_e32 v2, v128, v3
+; GFX11-NEXT: v_add_nc_u32_e32 v3, 3, v100
+; GFX11-NEXT: s_addk_i32 s4, 0x300
+; GFX11-NEXT: v_add_nc_u32_e32 v11, 0x300, v4
+; GFX11-NEXT: v_or_b32_e32 v4, v101, v5
+; GFX11-NEXT: v_or_b32_e32 v5, v102, v6
+; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v7
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_add_nc_u32_e32 v7, 3, v96
+; GFX11-NEXT: v_or_b32_e32 v1, v134, v1
+; GFX11-NEXT: v_add_nc_u32_e32 v12, 0x300, v4
+; GFX11-NEXT: v_add_nc_u32_e32 v13, 0x300, v5
+; GFX11-NEXT: v_or_b32_e32 v4, v97, v6
+; GFX11-NEXT: v_add_nc_u32_e32 v5, 3, v55
+; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v7
+; GFX11-NEXT: v_add_nc_u32_e32 v7, 3, v52
+; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v3
+; GFX11-NEXT: v_add_nc_u32_e32 v14, 0x300, v4
+; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v5
+; GFX11-NEXT: v_or_b32_e32 v5, v87, v6
+; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v7
+; GFX11-NEXT: v_and_b32_e32 v7, 0xff, v8
+; GFX11-NEXT: v_add_nc_u32_e32 v8, 3, v51
+; GFX11-NEXT: v_or_b32_e32 v4, v86, v4
+; GFX11-NEXT: v_add_nc_u32_e32 v15, 0x300, v5
+; GFX11-NEXT: v_or_b32_e32 v5, v85, v6
+; GFX11-NEXT: v_or_b32_e32 v6, v84, v7
+; GFX11-NEXT: v_and_b32_e32 v7, 0xff, v8
+; GFX11-NEXT: v_add_nc_u32_e32 v51, 0x300, v4
+; GFX11-NEXT: v_add_nc_u32_e32 v4, 3, v50
+; GFX11-NEXT: v_add_nc_u32_e32 v50, 0x300, v5
+; GFX11-NEXT: v_add_nc_u32_e32 v8, 3, v49
+; GFX11-NEXT: v_or_b32_e32 v5, v83, v7
+; GFX11-NEXT: v_add_nc_u32_e32 v7, 3, v48
+; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v4
+; GFX11-NEXT: v_add_nc_u32_e32 v6, 0x300, v6
+; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
+; GFX11-NEXT: v_add_nc_u32_e32 v10, 0x300, v5
+; GFX11-NEXT: v_and_b32_e32 v5, 0xff, v7
+; GFX11-NEXT: v_or_b32_e32 v4, v82, v4
+; GFX11-NEXT: v_and_b32_e32 v7, 0xff, v8
+; GFX11-NEXT: v_and_b32_e32 v8, 0xff, v9
+; GFX11-NEXT: v_add_nc_u32_e32 v9, 3, v38
+; GFX11-NEXT: v_or_b32_e32 v5, v81, v5
+; GFX11-NEXT: v_add_nc_u32_e32 v38, 0x300, v4
+; GFX11-NEXT: v_or_b32_e32 v4, v71, v7
+; GFX11-NEXT: v_or_b32_e32 v7, v80, v8
+; GFX11-NEXT: v_and_b32_e32 v8, 0xff, v9
+; GFX11-NEXT: v_add_nc_u32_e32 v9, 0x300, v5
+; GFX11-NEXT: v_add_nc_u32_e32 v5, 3, v37
+; GFX11-NEXT: v_add_nc_u32_e32 v37, 0x300, v4
+; GFX11-NEXT: v_add_nc_u32_e32 v39, 0x300, v7
+; GFX11-NEXT: v_or_b32_e32 v4, v70, v8
+; GFX11-NEXT: v_add_nc_u32_e32 v7, 3, v36
+; GFX11-NEXT: v_and_b32_e32 v5, 0xff, v5
+; GFX11-NEXT: v_add_nc_u32_e32 v8, 3, v34
+; GFX11-NEXT: v_add_nc_u32_e32 v34, 3, v35
+; GFX11-NEXT: v_add_nc_u32_e32 v35, 0x300, v4
+; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v7
+; GFX11-NEXT: v_or_b32_e32 v5, v69, v5
+; GFX11-NEXT: v_and_b32_e32 v7, 0xff, v8
+; GFX11-NEXT: v_and_b32_e32 v8, 0xff, v34
+; GFX11-NEXT: v_or_b32_e32 v3, v112, v3
+; GFX11-NEXT: v_or_b32_e32 v4, v68, v4
+; GFX11-NEXT: v_add_nc_u32_e32 v34, 0x300, v5
+; GFX11-NEXT: v_or_b32_e32 v5, v67, v7
+; GFX11-NEXT: v_or_b32_e32 v7, v66, v8
+; GFX11-NEXT: v_and_b32_e32 v8, 0xff, v33
+; GFX11-NEXT: v_add_nc_u32_e32 v33, 0x300, v4
+; GFX11-NEXT: v_add_nc_u32_e32 v4, 3, v32
+; GFX11-NEXT: v_add_nc_u32_e32 v32, 0x300, v5
+; GFX11-NEXT: v_add_nc_u32_e32 v5, 0x300, v7
+; GFX11-NEXT: v_or_b32_e32 v7, v65, v8
+; GFX11-NEXT: v_and_b32_e64 v8, 0xffff, s4
+; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v4
+; GFX11-NEXT: v_and_b32_e32 v36, 0xffff, v6
+; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x300, v7
+; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x300, v3
+; GFX11-NEXT: v_or_b32_e32 v4, v64, v4
+; GFX11-NEXT: v_and_b32_e32 v14, 0xffff, v14
+; GFX11-NEXT: v_lshl_or_b32 v10, v10, 16, v36
+; GFX11-NEXT: v_lshl_or_b32 v5, v7, 16, v5
+; GFX11-NEXT: v_and_b32_e32 v7, 0xffff, v33
+; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x300, v4
+; GFX11-NEXT: v_and_b32_e32 v33, 0xffff, v37
+; GFX11-NEXT: v_and_b32_e32 v36, 0xffff, v22
+; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
+; GFX11-NEXT: v_lshl_or_b32 v6, v32, 16, v7
+; GFX11-NEXT: v_lshl_or_b32 v4, v4, 16, v8
+; GFX11-NEXT: v_and_b32_e32 v8, 0xffff, v35
+; GFX11-NEXT: v_and_b32_e32 v32, 0xffff, v51
+; GFX11-NEXT: v_and_b32_e32 v35, 0xffff, v38
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT: v_and_b32_e32 v19, 0xffff, v19
+; GFX11-NEXT: v_lshl_or_b32 v7, v34, 16, v8
+; GFX11-NEXT: v_lshl_or_b32 v8, v39, 16, v33
+; GFX11-NEXT: v_and_b32_e32 v33, 0xffff, v12
+; GFX11-NEXT: v_and_b32_e32 v34, 0xffff, v11
+; GFX11-NEXT: v_lshl_or_b32 v11, v50, 16, v32
+; GFX11-NEXT: v_and_b32_e32 v32, 0xffff, v1
+; GFX11-NEXT: v_lshl_or_b32 v9, v9, 16, v35
+; GFX11-NEXT: v_lshl_or_b32 v12, v15, 16, v14
+; GFX11-NEXT: v_lshl_or_b32 v13, v13, 16, v33
+; GFX11-NEXT: v_lshl_or_b32 v14, v3, 16, v34
+; GFX11-NEXT: v_lshl_or_b32 v16, v16, 16, v32
+; GFX11-NEXT: v_and_b32_e32 v32, 0xffff, v116
+; GFX11-NEXT: v_and_b32_e32 v33, 0xffff, v129
+; GFX11-NEXT: v_and_b32_e32 v34, 0xffff, v18
+; GFX11-NEXT: v_and_b32_e32 v35, 0xffff, v17
+; GFX11-NEXT: v_lshl_or_b32 v26, v26, 16, v36
+; GFX11-NEXT: v_lshl_or_b32 v17, v114, 16, v32
+; GFX11-NEXT: v_lshl_or_b32 v18, v144, 16, v33
+; GFX11-NEXT: v_lshl_or_b32 v20, v20, 16, v34
+; GFX11-NEXT: v_lshl_or_b32 v21, v21, 16, v35
+; GFX11-NEXT: v_and_b32_e32 v32, 0xffff, v115
+; GFX11-NEXT: v_and_b32_e32 v33, 0xffff, v135
+; GFX11-NEXT: v_and_b32_e32 v34, 0xffff, v131
+; GFX11-NEXT: v_and_b32_e32 v35, 0xffff, v23
+; GFX11-NEXT: v_and_b32_e32 v36, 0xffff, v27
+; GFX11-NEXT: v_lshl_or_b32 v22, v145, 16, v32
+; GFX11-NEXT: v_lshl_or_b32 v23, v118, 16, v33
+; GFX11-NEXT: v_lshl_or_b32 v24, v24, 16, v34
+; GFX11-NEXT: v_lshl_or_b32 v25, v25, 16, v35
+; GFX11-NEXT: v_and_b32_e32 v32, 0xffff, v163
+; GFX11-NEXT: v_and_b32_e32 v33, 0xffff, v182
+; GFX11-NEXT: v_and_b32_e32 v34, 0xffff, v181
+; GFX11-NEXT: v_and_b32_e32 v35, 0xffff, v28
+; GFX11-NEXT: v_lshl_or_b32 v15, v2, 16, v0
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-NEXT: v_lshl_or_b32 v19, v133, 16, v19
+; GFX11-NEXT: v_lshl_or_b32 v27, v160, 16, v32
+; GFX11-NEXT: v_lshl_or_b32 v28, v178, 16, v33
+; GFX11-NEXT: v_lshl_or_b32 v29, v29, 16, v34
+; GFX11-NEXT: v_lshl_or_b32 v30, v30, 16, v35
+; GFX11-NEXT: v_lshl_or_b32 v31, v31, 16, v36
+; GFX11-NEXT: .LBB93_3: ; %end
+; GFX11-NEXT: s_clause 0x1e
+; GFX11-NEXT: scratch_load_b32 v94, off, s32 offset:320
+; GFX11-NEXT: scratch_load_b32 v93, off, s32 offset:324
+; GFX11-NEXT: scratch_load_b32 v92, off, s32 offset:328
+; GFX11-NEXT: scratch_load_b32 v91, off, s32 offset:332
+; GFX11-NEXT: scratch_load_b32 v90, off, s32 offset:336
+; GFX11-NEXT: scratch_load_b32 v89, off, s32 offset:340
+; GFX11-NEXT: scratch_load_b32 v88, off, s32 offset:344
+; GFX11-NEXT: scratch_load_b32 v79, off, s32 offset:348
+; GFX11-NEXT: scratch_load_b32 v78, off, s32 offset:352
+; GFX11-NEXT: scratch_load_b32 v77, off, s32 offset:356
+; GFX11-NEXT: scratch_load_b32 v76, off, s32 offset:360
+; GFX11-NEXT: scratch_load_b32 v75, off, s32 offset:364
+; GFX11-NEXT: scratch_load_b32 v74, off, s32 offset:368
+; GFX11-NEXT: scratch_load_b32 v73, off, s32 offset:372
+; GFX11-NEXT: scratch_load_b32 v72, off, s32 offset:376
+; GFX11-NEXT: scratch_load_b32 v63, off, s32 offset:380
+; GFX11-NEXT: scratch_load_b32 v62, off, s32 offset:384
+; GFX11-NEXT: scratch_load_b32 v61, off, s32 offset:388
+; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:392
+; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:396
+; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:400
+; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:404
+; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:408
+; GFX11-NEXT: scratch_load_b32 v47, off, s32 offset:412
+; GFX11-NEXT: scratch_load_b32 v46, off, s32 offset:416
+; GFX11-NEXT: scratch_load_b32 v45, off, s32 offset:420
+; GFX11-NEXT: scratch_load_b32 v44, off, s32 offset:424
+; GFX11-NEXT: scratch_load_b32 v43, off, s32 offset:428
+; GFX11-NEXT: scratch_load_b32 v42, off, s32 offset:432
+; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:436
+; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:440
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-NEXT: .LBB93_4:
+; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+; GFX11-NEXT: ; implicit-def: $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+; GFX11-NEXT: s_branch .LBB93_2
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -196187,766 +191179,814 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) {
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: s_clause 0x1f
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:384
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v39, off, s32 offset:380
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:376
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v50, off, s32 offset:372
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v150, off, s32 offset:368
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v39, off, s32 offset:364
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v150, off, s32 offset:360
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v50, off, s32 offset:356
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v147, off, s32 offset:352
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v37, off, s32 offset:348
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v149, off, s32 offset:344
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v49, off, s32 offset:340
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v149, off, s32 offset:336
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v38, off, s32 offset:332
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v148, off, s32 offset:328
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v48, off, s32 offset:324
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v145, off, s32 offset:320
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:316
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v148, off, s32 offset:312
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v48, off, s32 offset:308
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v147, off, s32 offset:304
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v36, off, s32 offset:300
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v146, off, s32 offset:296
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:292
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v146, off, s32 offset:288
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v36, off, s32 offset:284
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v145, off, s32 offset:280
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v37, off, s32 offset:276
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v134, off, s32 offset:272
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:268
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v144, off, s32 offset:264
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v35, off, s32 offset:260
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:520
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:516
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:512
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v43, s32 offset:508
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v44, s32 offset:504
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v45, s32 offset:500
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v46, s32 offset:496
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v47, s32 offset:492
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v56, s32 offset:488
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v57, s32 offset:484
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v58, s32 offset:480
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v59, s32 offset:476
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v60, s32 offset:472
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v61, s32 offset:468
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v62, s32 offset:464
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v63, s32 offset:460
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v72, s32 offset:456
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v73, s32 offset:452
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v74, s32 offset:448
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v75, s32 offset:444
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v76, s32 offset:440
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v77, s32 offset:436
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v78, s32 offset:432
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v79, s32 offset:428
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v88, s32 offset:424
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v89, s32 offset:420
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v90, s32 offset:416
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v91, s32 offset:412
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v92, s32 offset:408
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:404
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:400
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:396
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:392 ; 4-byte Folded Spill
; GFX11-TRUE16-NEXT: s_clause 0x1f
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v144, off, s32 offset:256
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v33, off, s32 offset:252
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v135, off, s32 offset:248
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:244
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v132, off, s32 offset:240
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:236
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v135, off, s32 offset:232
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v34, off, s32 offset:228
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v134, off, s32 offset:224
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:220
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v133, off, s32 offset:216
-; GFX11-TRUE16-NEXT: scratch_load_b32 v160, off, s32 offset:388
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v101, off, s32
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v103, off, s32 offset:8
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v113, off, s32 offset:16
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v113, off, s32 offset:24
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v114, off, s32 offset:32
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v114, off, s32 offset:40
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v115, off, s32 offset:48
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v116, off, s32 offset:56
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v116, off, s32 offset:64
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v117, off, s32 offset:72
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v117, off, s32 offset:80
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v118, off, s32 offset:88
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v118, off, s32 offset:96
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v119, off, s32 offset:104
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v119, off, s32 offset:112
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v128, off, s32 offset:120
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v128, off, s32 offset:128
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v129, off, s32 offset:136
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v129, off, s32 offset:144
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v130, off, s32 offset:152
+; GFX11-TRUE16-NEXT: scratch_load_u16 v31, off, s32 offset:384
+; GFX11-TRUE16-NEXT: scratch_load_u16 v114, off, s32 offset:380
+; GFX11-TRUE16-NEXT: scratch_load_u16 v103, off, s32 offset:376
+; GFX11-TRUE16-NEXT: scratch_load_u16 v128, off, s32 offset:372
+; GFX11-TRUE16-NEXT: scratch_load_u16 v56, off, s32 offset:368
+; GFX11-TRUE16-NEXT: scratch_load_u16 v117, off, s32 offset:364
+; GFX11-TRUE16-NEXT: scratch_load_u16 v102, off, s32 offset:360
+; GFX11-TRUE16-NEXT: scratch_load_u16 v133, off, s32 offset:356
+; GFX11-TRUE16-NEXT: scratch_load_u16 v57, off, s32 offset:352
+; GFX11-TRUE16-NEXT: scratch_load_u16 v112, off, s32 offset:348
+; GFX11-TRUE16-NEXT: scratch_load_u16 v58, off, s32 offset:344
+; GFX11-TRUE16-NEXT: scratch_load_u16 v132, off, s32 offset:340
+; GFX11-TRUE16-NEXT: scratch_load_u16 v101, off, s32 offset:336
+; GFX11-TRUE16-NEXT: scratch_load_u16 v118, off, s32 offset:332
+; GFX11-TRUE16-NEXT: scratch_load_u16 v59, off, s32 offset:328
+; GFX11-TRUE16-NEXT: scratch_load_u16 v134, off, s32 offset:324
+; GFX11-TRUE16-NEXT: scratch_load_u16 v97, off, s32 offset:320
+; GFX11-TRUE16-NEXT: scratch_load_u16 v113, off, s32 offset:316
+; GFX11-TRUE16-NEXT: scratch_load_u16 v100, off, s32 offset:312
+; GFX11-TRUE16-NEXT: scratch_load_u16 v145, off, s32 offset:308
+; GFX11-TRUE16-NEXT: scratch_load_u16 v99, off, s32 offset:304
+; GFX11-TRUE16-NEXT: scratch_load_u16 v119, off, s32 offset:300
+; GFX11-TRUE16-NEXT: scratch_load_u16 v60, off, s32 offset:296
+; GFX11-TRUE16-NEXT: scratch_load_u16 v135, off, s32 offset:292
+; GFX11-TRUE16-NEXT: scratch_load_u16 v98, off, s32 offset:288
+; GFX11-TRUE16-NEXT: scratch_load_u16 v129, off, s32 offset:284
+; GFX11-TRUE16-NEXT: scratch_load_u16 v61, off, s32 offset:280
+; GFX11-TRUE16-NEXT: scratch_load_u16 v146, off, s32 offset:276
+; GFX11-TRUE16-NEXT: scratch_load_u16 v62, off, s32 offset:272
+; GFX11-TRUE16-NEXT: scratch_load_u16 v115, off, s32 offset:268
+; GFX11-TRUE16-NEXT: scratch_load_u16 v63, off, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_load_u16 v144, off, s32 offset:260
; GFX11-TRUE16-NEXT: s_clause 0x1f
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v130, off, s32 offset:160
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v131, off, s32 offset:168
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v131, off, s32 offset:176
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v132, off, s32 offset:184
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v133, off, s32 offset:192
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v151, off, s32 offset:200
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v151, off, s32 offset:208
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v100, off, s32 offset:212
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v98, off, s32 offset:204
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v100, off, s32 offset:196
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v85, off, s32 offset:188
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v97, off, s32 offset:180
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v86, off, s32 offset:172
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v96, off, s32 offset:164
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v71, off, s32 offset:156
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v87, off, s32 offset:148
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v80, off, s32 offset:140
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v84, off, s32 offset:132
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v86, off, s32 offset:124
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v96, off, s32 offset:116
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v80, off, s32 offset:108
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v85, off, s32 offset:100
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v82, off, s32 offset:92
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v84, off, s32 offset:84
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v69, off, s32 offset:76
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v83, off, s32 offset:68
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v69, off, s32 offset:60
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v71, off, s32 offset:52
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v68, off, s32 offset:44
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v70, off, s32 offset:36
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v66, off, s32 offset:28
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v68, off, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_load_u16 v96, off, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_load_u16 v130, off, s32 offset:252
+; GFX11-TRUE16-NEXT: scratch_load_u16 v72, off, s32 offset:248
+; GFX11-TRUE16-NEXT: scratch_load_u16 v147, off, s32 offset:244
+; GFX11-TRUE16-NEXT: scratch_load_u16 v84, off, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_load_u16 v116, off, s32 offset:236
+; GFX11-TRUE16-NEXT: scratch_load_u16 v87, off, s32 offset:232
+; GFX11-TRUE16-NEXT: scratch_load_u16 v148, off, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_load_u16 v86, off, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_load_u16 v131, off, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_load_u16 v73, off, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_load_b32 v65, off, s32 offset:388
+; GFX11-TRUE16-NEXT: scratch_load_u16 v53, off, s32
+; GFX11-TRUE16-NEXT: scratch_load_u16 v66, off, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_load_u16 v67, off, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_load_u16 v68, off, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_load_u16 v69, off, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_load_u16 v70, off, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_load_u16 v71, off, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_load_u16 v80, off, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_load_u16 v81, off, s32 offset:64
+; GFX11-TRUE16-NEXT: scratch_load_u16 v82, off, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_load_u16 v83, off, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_load_u16 v85, off, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_load_u16 v74, off, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_load_u16 v75, off, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_load_u16 v76, off, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_load_u16 v77, off, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_load_u16 v78, off, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_load_u16 v79, off, s32 offset:136
+; GFX11-TRUE16-NEXT: scratch_load_u16 v88, off, s32 offset:144
+; GFX11-TRUE16-NEXT: scratch_load_u16 v89, off, s32 offset:152
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_load_u16 v90, off, s32 offset:160
+; GFX11-TRUE16-NEXT: scratch_load_u16 v91, off, s32 offset:168
+; GFX11-TRUE16-NEXT: scratch_load_u16 v92, off, s32 offset:176
+; GFX11-TRUE16-NEXT: scratch_load_u16 v93, off, s32 offset:184
+; GFX11-TRUE16-NEXT: scratch_load_u16 v94, off, s32 offset:192
+; GFX11-TRUE16-NEXT: scratch_load_u16 v95, off, s32 offset:200
+; GFX11-TRUE16-NEXT: scratch_load_u16 v104, off, s32 offset:208
+; GFX11-TRUE16-NEXT: scratch_load_u16 v179, off, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_load_u16 v163, off, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_load_u16 v183, off, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_load_u16 v150, off, s32 offset:188
+; GFX11-TRUE16-NEXT: scratch_load_u16 v167, off, s32 offset:180
+; GFX11-TRUE16-NEXT: scratch_load_u16 v161, off, s32 offset:172
+; GFX11-TRUE16-NEXT: scratch_load_u16 v177, off, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_load_u16 v149, off, s32 offset:156
+; GFX11-TRUE16-NEXT: scratch_load_u16 v180, off, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_load_u16 v151, off, s32 offset:140
+; GFX11-TRUE16-NEXT: scratch_load_u16 v164, off, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_load_u16 v41, off, s32 offset:124
+; GFX11-TRUE16-NEXT: scratch_load_u16 v47, off, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_load_u16 v165, off, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_load_u16 v43, off, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_load_u16 v181, off, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_load_u16 v45, off, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_load_u16 v162, off, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_load_u16 v46, off, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_load_u16 v176, off, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_load_u16 v42, off, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_load_u16 v178, off, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_load_u16 v44, off, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_load_u16 v160, off, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_load_u16 v182, off, s32 offset:20
; GFX11-TRUE16-NEXT: s_clause 0x1
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v66, off, s32 offset:12
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v67, off, s32 offset:4
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.l, v30.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v67.l, v28.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.h, v26.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v65.h, v24.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.l, v22.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v65.l, v20.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.l, v18.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.h, v16.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.h, v14.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v12.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.h, v10.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.h, v8.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.l, v6.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.l, v4.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.h, v2.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.h, v0.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v81.l, 8, v1.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v81.h, 8, v3.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v82.h, 8, v5.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v83.l, 8, v7.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v97.h, 8, v9.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v70.l, 8, v11.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v98.h, 8, v13.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v99.l, 8, v15.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v99.h, 8, v17.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v87.l, 8, v19.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v101.h, 8, v21.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v102.l, 8, v23.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v102.h, 8, v25.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v103.l, 8, v27.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v112.l, 8, v29.l
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(62)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v150.h, 8, v150.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v150.l, 8, v150.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v147.h, 8, v147.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v149.h, 8, v149.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v149.l, 8, v149.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v148.h, 8, v148.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v145.l, 8, v145.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v148.l, 8, v148.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v147.l, 8, v147.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v146.h, 8, v146.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v146.l, 8, v146.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v145.h, 8, v145.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v134.h, 8, v134.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v144.h, 8, v144.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v144.l, 8, v144.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v135.h, 8, v135.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(61)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v132.l, 8, v132.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(59)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v135.l, 8, v135.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(57)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v134.l, 8, v134.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(55)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v133.h, 8, v133.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(54)
-; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v160
+; GFX11-TRUE16-NEXT: scratch_load_u16 v166, off, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_load_u16 v40, off, s32 offset:4
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.h, v30.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.h, v28.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, v26.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v24.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.h, v22.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.h, v20.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.h, v18.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v16.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v14.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v12.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v10.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v8.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.h, v6.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.h, v4.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v2.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v0.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v48.h, 8, v1.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v49.l, 8, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v49.h, 8, v5.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.l, 8, v7.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.l, 8, v9.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v48.l, 8, v11.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.h, 8, v13.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.l, 8, v15.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.h, 8, v17.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.h, 8, v19.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(53)
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.h, 8, v21.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.l, 8, v23.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.h, 8, v25.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.l, 8, v27.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v64.l, 8, v29.l
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v103.h, 8, v31.l
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v103.l, 8, v103.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v102.h, 8, v56.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v102.l, 8, v102.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v99.h, 8, v57.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v101.h, 8, v58.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v101.l, 8, v101.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(52)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v112.h, 8, v103.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(51)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v113.l, 8, v113.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(50)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v113.h, 8, v113.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(49)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v103.h, 8, v114.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(48)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v115.l, 8, v114.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(47)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v115.h, 8, v115.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(46)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v116.l, 8, v116.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(45)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v116.h, 8, v116.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(44)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v117.l, 8, v117.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(43)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v114.l, 8, v117.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v100.h, 8, v59.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v97.l, 8, v97.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v100.l, 8, v100.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v99.l, 8, v99.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v98.h, 8, v60.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v98.l, 8, v98.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v97.h, 8, v61.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v86.h, 8, v62.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v96.h, 8, v63.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v96.l, 8, v96.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v87.h, 8, v72.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v84.l, 8, v84.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v87.l, 8, v87.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v86.l, 8, v86.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(42)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v118.l, 8, v118.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v85.h, 8, v73.l
+; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v65
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.l, 8, v53.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v64.h, 8, v66.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v65.l, 8, v67.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v65.h, 8, v68.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.h, 8, v69.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v67.l, 8, v70.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v67.h, 8, v71.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v68.l, 8, v80.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v68.h, 8, v81.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v69.l, 8, v82.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v66.l, 8, v83.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v70.l, 8, v85.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(41)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v118.h, 8, v118.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v70.h, 8, v74.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(40)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v119.l, 8, v119.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v71.l, 8, v75.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(39)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v114.h, 8, v119.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v66.h, 8, v76.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(38)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v119.h, 8, v128.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v71.h, 8, v77.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(37)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v128.l, 8, v128.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v80.l, 8, v78.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(36)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v128.h, 8, v129.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v80.h, 8, v79.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(35)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.l, 8, v129.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v81.l, 8, v88.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(34)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v130.l, 8, v130.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v82.l, 8, v89.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(33)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v117.h, 8, v130.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v69.h, 8, v90.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(32)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v130.h, 8, v131.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v82.h, 8, v91.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(31)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v131.l, 8, v131.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v83.l, 8, v92.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(30)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v131.h, 8, v132.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v83.h, 8, v93.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(29)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.h, 8, v133.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v81.h, 8, v94.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(28)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v132.h, 8, v151.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v84.h, 8, v95.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(27)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v133.l, 8, v151.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v151.l, 8, v31.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v151.h, 8, v31.l
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v85.l, 8, v104.l
; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
-; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB96_3
-; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow
-; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
-; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB96_4
-; GFX11-TRUE16-NEXT: .LBB96_2: ; %end
-; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
-; GFX11-TRUE16-NEXT: .LBB96_3: ; %cmp.false
-; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v52.h
-; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v51.h
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v53.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v51.l
-; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v54.h
-; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v49.h
-; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v55.l
-; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v53.h
-; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v55.h
-; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v52.l
-; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v65.l
-; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v64.l
-; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v65.h
-; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v64.h
-; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v67.l
-; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v54.l
+; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB96_2
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v34.l
+; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v33.l
+; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v34.h
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v32.h
+; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v36.l
+; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v32.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v36.h
+; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v35.l
+; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v37.l
+; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v33.h
+; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v38.h
+; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v37.h
+; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v39.l
+; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v38.l
+; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v39.h
+; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v35.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v67.h
-; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v66.h
-; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v68.l
-; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v66.l
-; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v70.h
-; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v68.h
-; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v71.l
-; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v69.h
-; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v83.h
-; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v69.l
-; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v84.h
-; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v82.l
-; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v85.l
-; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v80.l
-; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v96.h
-; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v86.h
-; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v84.l
-; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v80.h
-; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v87.h
-; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v71.h
-; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v96.l
-; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v86.l
-; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v97.l
-; GFX11-TRUE16-NEXT: v_and_b16 v19.h, 0xff, v85.h
-; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v100.l
-; GFX11-TRUE16-NEXT: v_and_b16 v20.h, 0xff, v98.l
-; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v100.h
-; GFX11-TRUE16-NEXT: v_and_b16 v21.h, 0xff, v32.h
-; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v34.h
-; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v32.l
-; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v35.l
-; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v33.h
-; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v35.h
-; GFX11-TRUE16-NEXT: v_and_b16 v24.h, 0xff, v33.l
-; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v37.h
-; GFX11-TRUE16-NEXT: v_and_b16 v25.h, 0xff, v36.l
-; GFX11-TRUE16-NEXT: v_and_b16 v26.l, 0xff, v38.l
-; GFX11-TRUE16-NEXT: v_and_b16 v26.h, 0xff, v36.h
-; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v48.l
-; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v34.l
-; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v48.h
-; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v38.h
-; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v49.l
-; GFX11-TRUE16-NEXT: v_and_b16 v29.h, 0xff, v37.l
-; GFX11-TRUE16-NEXT: v_and_b16 v30.l, 0xff, v50.l
-; GFX11-TRUE16-NEXT: v_and_b16 v30.h, 0xff, v39.l
-; GFX11-TRUE16-NEXT: v_and_b16 v31.l, 0xff, v50.h
-; GFX11-TRUE16-NEXT: v_and_b16 v31.h, 0xff, v39.h
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v81.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v81.h
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v82.h
-; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v83.l
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v97.h
-; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v70.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v98.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v99.l
-; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v99.h
-; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v87.l
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v101.h
-; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v102.l
-; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v102.h
-; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v103.l
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v112.l
-; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v101.l
-; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v112.h
-; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v113.l
-; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v113.h
-; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v103.h
-; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v115.l
-; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v115.h
-; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v11.l, v116.l
-; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v11.h, v116.h
-; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v117.l
-; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v114.l
-; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v118.l
-; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v13.h, v118.h
-; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v14.l, v119.l
-; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v114.h
-; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v15.l, v119.h
-; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v15.h, v128.l
-; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v16.l, v128.h
-; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v16.h, v129.l
-; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v17.l, v130.l
-; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v17.h, v117.h
-; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v18.l, v130.h
-; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v18.h, v131.l
-; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v19.l, v131.h
-; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v19.h, v129.h
-; GFX11-TRUE16-NEXT: v_or_b16 v20.l, v20.l, v132.h
-; GFX11-TRUE16-NEXT: v_or_b16 v20.h, v20.h, v133.l
-; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v21.l, v133.h
-; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v21.h, v134.l
-; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v22.l, v135.l
-; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v22.h, v132.l
-; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v23.l, v135.h
-; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v23.h, v144.l
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v24.l, v144.h
-; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v24.h, v134.h
-; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v25.l, v145.h
-; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v25.h, v146.l
-; GFX11-TRUE16-NEXT: v_or_b16 v26.l, v26.l, v146.h
-; GFX11-TRUE16-NEXT: v_or_b16 v26.h, v26.h, v147.l
-; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v27.l, v148.l
-; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v27.h, v145.l
-; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v28.l, v148.h
-; GFX11-TRUE16-NEXT: v_or_b16 v28.h, v28.h, v149.l
-; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v29.l, v149.h
-; GFX11-TRUE16-NEXT: v_or_b16 v29.h, v29.h, v147.h
-; GFX11-TRUE16-NEXT: v_or_b16 v30.l, v30.l, v150.l
-; GFX11-TRUE16-NEXT: v_or_b16 v30.h, v30.h, v150.h
-; GFX11-TRUE16-NEXT: v_or_b16 v31.l, v31.l, v151.l
-; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v31.h, v151.h
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16
+; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v40.l
+; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v166.l
+; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v182.l
+; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v160.l
+; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v44.l
+; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v178.l
+; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v42.l
+; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v176.l
+; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v46.l
+; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v162.l
+; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v45.l
+; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v181.l
+; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v43.l
+; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v165.l
+; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v47.l
+; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v41.l
+; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v164.l
+; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v151.l
+; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v180.l
+; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v149.l
+; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v177.l
+; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v161.l
+; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v167.l
+; GFX11-TRUE16-NEXT: v_and_b16 v19.h, 0xff, v150.l
+; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v183.l
+; GFX11-TRUE16-NEXT: v_and_b16 v20.h, 0xff, v163.l
+; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v179.l
+; GFX11-TRUE16-NEXT: v_and_b16 v21.h, 0xff, v131.l
+; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v148.l
+; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v116.l
+; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v147.l
+; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v130.l
+; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v144.l
+; GFX11-TRUE16-NEXT: v_and_b16 v24.h, 0xff, v115.l
+; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v146.l
+; GFX11-TRUE16-NEXT: v_and_b16 v25.h, 0xff, v129.l
+; GFX11-TRUE16-NEXT: v_and_b16 v26.l, 0xff, v135.l
+; GFX11-TRUE16-NEXT: v_and_b16 v26.h, 0xff, v119.l
+; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v145.l
+; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v113.l
+; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v134.l
+; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v118.l
+; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v132.l
+; GFX11-TRUE16-NEXT: v_and_b16 v29.h, 0xff, v112.l
+; GFX11-TRUE16-NEXT: v_and_b16 v30.l, 0xff, v133.l
+; GFX11-TRUE16-NEXT: v_and_b16 v30.h, 0xff, v117.l
+; GFX11-TRUE16-NEXT: v_and_b16 v31.l, 0xff, v128.l
+; GFX11-TRUE16-NEXT: v_and_b16 v31.h, 0xff, v114.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v48.h
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v49.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v49.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v50.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v51.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v48.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v51.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v52.l
+; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v52.h
+; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v50.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v53.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v54.l
+; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v54.h
+; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v55.l
+; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v64.l
+; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v53.l
+; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v64.h
+; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v65.l
+; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v65.h
+; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v55.h
+; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v67.l
+; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v67.h
+; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v11.l, v68.l
+; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v11.h, v68.h
+; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v69.l
+; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v66.l
+; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v70.l
+; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v13.h, v70.h
+; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v14.l, v71.l
+; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v66.h
+; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v15.l, v71.h
+; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v15.h, v80.l
+; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v16.l, v80.h
+; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v16.h, v81.l
+; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v17.l, v82.l
+; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v17.h, v69.h
+; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v18.l, v82.h
+; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v18.h, v83.l
+; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v19.l, v83.h
+; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v19.h, v81.h
+; GFX11-TRUE16-NEXT: v_or_b16 v20.l, v20.l, v84.h
+; GFX11-TRUE16-NEXT: v_or_b16 v20.h, v20.h, v85.l
+; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v21.l, v85.h
+; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v21.h, v86.l
+; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v22.l, v87.l
+; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v22.h, v84.l
+; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v23.l, v87.h
+; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v23.h, v96.l
+; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v24.l, v96.h
+; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v24.h, v86.h
+; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v25.l, v97.h
+; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v25.h, v98.l
+; GFX11-TRUE16-NEXT: v_or_b16 v26.l, v26.l, v98.h
+; GFX11-TRUE16-NEXT: v_or_b16 v26.h, v26.h, v99.l
+; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v27.l, v100.l
+; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v27.h, v97.l
+; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v28.l, v100.h
+; GFX11-TRUE16-NEXT: v_or_b16 v28.h, v28.h, v101.l
+; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v29.l, v101.h
+; GFX11-TRUE16-NEXT: v_or_b16 v29.h, v29.h, v99.h
+; GFX11-TRUE16-NEXT: v_or_b16 v30.l, v30.l, v102.l
+; GFX11-TRUE16-NEXT: v_or_b16 v30.h, v30.h, v102.h
+; GFX11-TRUE16-NEXT: v_or_b16 v31.l, v31.l, v103.l
+; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v31.h, v103.h
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr114
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr117
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr132
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr134
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr145
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr119
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr135
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr146
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr147
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr131
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr179
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr163
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr183
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr150
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr167
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr161
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr177
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr180
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr164
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr41
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr47
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr165
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr43
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr181
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr45
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr162
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr46
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr176
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr42
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr178
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr44
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr160
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr182
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr166
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr40
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr103_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr103_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr117_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr114_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr119_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr114_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr119_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr117_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr131_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr131_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr132_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr134_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr135_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr132_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr135_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr134_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr145_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr146_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr146_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr147_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr145_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr147_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr150_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr150_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_hi16
+; GFX11-TRUE16-NEXT: .LBB96_2: ; %Flow
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
-; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB96_2
-; GFX11-TRUE16-NEXT: .LBB96_4: ; %cmp.true
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v50.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v39.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v50.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v39.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v48.l, 3
+; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB96_4
+; GFX11-TRUE16-NEXT: ; %bb.3: ; %cmp.true
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v128.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v114.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v133.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v117.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v145.l, 3
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v151.l, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v151.h, v0.h
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v150.l, v1.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v150.h, v1.h
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v148.l, v2.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v103.l, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v103.h, v0.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v102.l, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v102.h, v1.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v100.l, v2.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v0.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v0.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.l, 0x300, v1.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.h, 0x300, v1.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v49.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v37.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v48.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v38.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v132.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v112.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v134.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v118.l, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.l, 0x300, v2.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v36.l, 3
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v149.h, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v147.h, v0.h
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v148.h, v1.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v149.l, v1.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v129.l, 3
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v101.h, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v99.h, v0.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v100.h, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v101.l, v1.h
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.l, 0x300, v0.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.h, 0x300, v0.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, 0x300, v1.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.h, 0x300, v1.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v34.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v38.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v36.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v37.h, 3
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v146.l, v2.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v113.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v135.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v119.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v146.l, 3
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v98.l, v2.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.h, 0x300, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v145.l, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v146.h, v0.h
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v147.l, v1.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v145.h, v1.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v34.h, 3
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v97.l, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v98.h, v0.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v99.l, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v97.h, v1.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v148.l, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, 0x300, v0.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.l, 0x300, v0.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.h, 0x300, v1.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v1.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v35.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v33.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v35.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v33.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v144.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v115.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v147.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v130.l, 3
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v135.l, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v144.h, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v134.h, v0.h
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v135.h, v1.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v144.l, v1.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v87.l, v2.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v96.h, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v86.h, v0.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v87.h, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v96.l, v1.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, 0x300, v2.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, 0x300, v0.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.h, 0x300, v0.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.l, 0x300, v1.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.h, 0x300, v1.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v32.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v116.l, 3
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(26)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v100.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v32.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v179.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v131.l, 3
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(24)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v100.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v98.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v183.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v163.l, 3
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v132.l, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v133.h, v0.h
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v134.l, v1.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v132.h, v1.h
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v133.l, v2.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v84.l, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v85.h, v0.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v86.l, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v84.h, v1.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v85.l, v2.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.h, 0x300, v0.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v0.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, 0x300, v1.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.l, 0x300, v1.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(22)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v97.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v85.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v167.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v150.l, 3
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(20)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v96.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v86.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v177.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v161.l, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.h, 0x300, v2.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(18)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v87.h, 3
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v131.h, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v129.h, v0.h
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v130.h, v1.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v131.l, v1.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v180.l, 3
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v83.h, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v81.h, v0.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v82.h, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v83.l, v1.h
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.l, 0x300, v0.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.h, 0x300, v0.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, 0x300, v1.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, 0x300, v1.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v71.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v149.l, 3
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(16)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v84.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v80.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v164.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v151.l, 3
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(14)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v96.h, 3
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v130.l, v2.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v47.l, 3
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v82.l, v2.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.l, 0x300, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v117.h, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v128.h, v0.h
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v129.l, v1.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v119.h, v1.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v86.h, 3
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v69.h, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v80.h, v0.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v81.l, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v71.h, v1.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v41.l, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.h, 0x300, v0.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.l, 0x300, v0.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.h, 0x300, v1.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.l, 0x300, v1.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(12)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v85.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v80.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v43.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v165.l, 3
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(10)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v84.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v82.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v45.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v181.l, 3
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v128.l, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v119.l, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v114.h, v0.h
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v118.l, v1.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v118.h, v1.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v80.l, v2.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v71.l, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v66.h, v0.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v70.l, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v70.h, v1.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.h, 0x300, v2.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(8)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v83.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v46.l, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, 0x300, v0.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, 0x300, v0.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, 0x300, v1.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, 0x300, v1.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v69.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v162.l, 3
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(6)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v71.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v69.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v42.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v176.l, 3
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(4)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v70.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v44.l, 3
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v117.l, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v114.l, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v116.l, v0.h
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v116.h, v1.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v115.l, v1.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v69.l, v2.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v66.l, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v68.l, v0.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v68.h, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v67.l, v1.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, 0x300, v2.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v68.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v178.l, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, 0x300, v0.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, 0x300, v0.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v1.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v1.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(2)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v68.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v66.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v182.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v160.l, 3
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v67.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v66.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v40.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v166.l, 3
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v115.h, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v113.h, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v103.h, v0.h
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v112.h, v1.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v113.l, v1.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v67.h, v2.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v65.h, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v55.h, v0.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v64.h, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v65.l, v1.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, 0x300, v2.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v67.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v39.h, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v0.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, 0x300, v0.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, 0x300, v1.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, 0x300, v1.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v54.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v65.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v64.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v65.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v35.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v39.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v38.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v38.h, 3
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v112.l, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v101.l, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v102.h, v0.h
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v103.l, v1.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v101.h, v1.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v64.l, v2.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v53.l, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v54.h, v0.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v55.l, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v53.h, v1.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v2.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v64.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v37.h, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x300, v0.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v0.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v1.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, 0x300, v1.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v55.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v52.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v55.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v53.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v37.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v33.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v36.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v35.l, 3
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v102.l, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v99.h, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v87.l, v0.h
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v98.h, v1.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v99.l, v1.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v54.l, v2.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v52.h, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v50.h, v0.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v51.h, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v52.l, v1.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v2.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v54.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v36.l, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v0.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, 0x300, v0.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v1.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v1.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v49.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v53.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v51.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v52.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v51.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v32.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v34.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v32.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v34.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v33.l, 3
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v97.h, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v70.l, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v82.h, v0.h
-; GFX11-TRUE16-NEXT: v_or_b16 v32.l, v83.l, v1.l
-; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v81.l, v1.h
-; GFX11-TRUE16-NEXT: v_or_b16 v33.l, v81.h, v2.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v51.l, v2.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v48.l, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v49.h, v0.h
+; GFX11-TRUE16-NEXT: v_or_b16 v32.l, v50.l, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v48.h, v1.h
+; GFX11-TRUE16-NEXT: v_or_b16 v33.l, v49.l, v2.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, 0x300, v0.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v0.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, 0x300, v32.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v32.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, 0x300, v33.l
+; GFX11-TRUE16-NEXT: .LBB96_4: ; %end
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_clause 0x1f
+; GFX11-TRUE16-NEXT: scratch_load_b32 v104, off, s32 offset:392
+; GFX11-TRUE16-NEXT: scratch_load_b32 v95, off, s32 offset:396
+; GFX11-TRUE16-NEXT: scratch_load_b32 v94, off, s32 offset:400
+; GFX11-TRUE16-NEXT: scratch_load_b32 v93, off, s32 offset:404
+; GFX11-TRUE16-NEXT: scratch_load_b32 v92, off, s32 offset:408
+; GFX11-TRUE16-NEXT: scratch_load_b32 v91, off, s32 offset:412
+; GFX11-TRUE16-NEXT: scratch_load_b32 v90, off, s32 offset:416
+; GFX11-TRUE16-NEXT: scratch_load_b32 v89, off, s32 offset:420
+; GFX11-TRUE16-NEXT: scratch_load_b32 v88, off, s32 offset:424
+; GFX11-TRUE16-NEXT: scratch_load_b32 v79, off, s32 offset:428
+; GFX11-TRUE16-NEXT: scratch_load_b32 v78, off, s32 offset:432
+; GFX11-TRUE16-NEXT: scratch_load_b32 v77, off, s32 offset:436
+; GFX11-TRUE16-NEXT: scratch_load_b32 v76, off, s32 offset:440
+; GFX11-TRUE16-NEXT: scratch_load_b32 v75, off, s32 offset:444
+; GFX11-TRUE16-NEXT: scratch_load_b32 v74, off, s32 offset:448
+; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:452
+; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:456
+; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:460
+; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:464
+; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:468
+; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:472
+; GFX11-TRUE16-NEXT: scratch_load_b32 v59, off, s32 offset:476
+; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:480
+; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:484
+; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:488
+; GFX11-TRUE16-NEXT: scratch_load_b32 v47, off, s32 offset:492
+; GFX11-TRUE16-NEXT: scratch_load_b32 v46, off, s32 offset:496
+; GFX11-TRUE16-NEXT: scratch_load_b32 v45, off, s32 offset:500
+; GFX11-TRUE16-NEXT: scratch_load_b32 v44, off, s32 offset:504
+; GFX11-TRUE16-NEXT: scratch_load_b32 v43, off, s32 offset:508
+; GFX11-TRUE16-NEXT: scratch_load_b32 v42, off, s32 offset:512
+; GFX11-TRUE16-NEXT: scratch_load_b32 v41, off, s32 offset:516
+; GFX11-TRUE16-NEXT: scratch_load_b32 v40, off, s32 offset:520 ; 4-byte Folded Reload
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: bitcast_v128i8_to_v64i16:
@@ -201554,1657 +196594,831 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-TRUE16-LABEL: bitcast_v128i8_to_v64i16_scalar:
-; GFX11-TRUE16: ; %bb.0:
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: s_clause 0x1e
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:440
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:436
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:432
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v43, s32 offset:428
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v44, s32 offset:424
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v45, s32 offset:420
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v46, s32 offset:416
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v47, s32 offset:412
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v56, s32 offset:408
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v57, s32 offset:404
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v58, s32 offset:400
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v59, s32 offset:396
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v60, s32 offset:392
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v61, s32 offset:388
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v62, s32 offset:384
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v63, s32 offset:380
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v72, s32 offset:376
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v73, s32 offset:372
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v74, s32 offset:368
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v75, s32 offset:364
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v76, s32 offset:360
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v77, s32 offset:356
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v78, s32 offset:352
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v79, s32 offset:348
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v88, s32 offset:344
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v89, s32 offset:340
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v90, s32 offset:336
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v91, s32 offset:332
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v92, s32 offset:328
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:324
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:320
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v54, v30 :: v_dual_mov_b32 v51, v24
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v52, v28 :: v_dual_mov_b32 v55, v26
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v53, v22 :: v_dual_mov_b32 v48, v20
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v50, v18 :: v_dual_mov_b32 v39, v16
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v49, v14 :: v_dual_mov_b32 v34, v8
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v37, v12 :: v_dual_mov_b32 v38, v10
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v36, v6 :: v_dual_mov_b32 v33, v4
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v35, v2 :: v_dual_mov_b32 v32, v0
-; GFX11-TRUE16-NEXT: s_clause 0x1f
-; GFX11-TRUE16-NEXT: scratch_load_b32 v0, off, s32 offset:316
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v2, off, s32
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v4, off, s32 offset:8
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v6, off, s32 offset:16
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v8, off, s32 offset:24
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v10, off, s32 offset:32
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v12, off, s32 offset:40
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v14, off, s32 offset:48
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v16, off, s32 offset:56
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v18, off, s32 offset:64
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v20, off, s32 offset:72
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v22, off, s32 offset:80
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v24, off, s32 offset:88
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v26, off, s32 offset:96
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v28, off, s32 offset:104
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v30, off, s32 offset:112
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:120
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v41, off, s32 offset:128
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v44, off, s32 offset:136
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v45, off, s32 offset:144
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v56, off, s32 offset:152
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v59, off, s32 offset:160
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v60, off, s32 offset:168
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v61, off, s32 offset:176
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v62, off, s32 offset:184
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v63, off, s32 offset:192
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v72, off, s32 offset:200
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v73, off, s32 offset:208
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v74, off, s32 offset:216
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v75, off, s32 offset:224
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v76, off, s32 offset:232
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v77, off, s32 offset:240
-; GFX11-TRUE16-NEXT: s_clause 0x1f
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v78, off, s32 offset:248
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v79, off, s32 offset:256
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v88, off, s32 offset:264
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v89, off, s32 offset:272
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v90, off, s32 offset:280
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v91, off, s32 offset:288
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v92, off, s32 offset:296
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v93, off, s32 offset:304
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v94, off, s32 offset:312
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v57, off, s32 offset:308
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v58, off, s32 offset:300
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v46, off, s32 offset:292
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v47, off, s32 offset:284
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v40, off, s32 offset:276
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v43, off, s32 offset:268
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v182, off, s32 offset:260
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v183, off, s32 offset:252
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v179, off, s32 offset:244
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v181, off, s32 offset:236
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v164, off, s32 offset:228
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v176, off, s32 offset:220
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v160, off, s32 offset:212
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v163, off, s32 offset:204
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v146, off, s32 offset:196
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v150, off, s32 offset:188
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v135, off, s32 offset:180
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v145, off, s32 offset:172
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v119, off, s32 offset:164
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v131, off, s32 offset:156
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v115, off, s32 offset:148
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v178, off, s32 offset:140
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v162, off, s32 offset:132
-; GFX11-TRUE16-NEXT: s_clause 0xf
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v165, off, s32 offset:124
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v149, off, s32 offset:116
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v151, off, s32 offset:108
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v144, off, s32 offset:100
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v148, off, s32 offset:92
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v129, off, s32 offset:84
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v133, off, s32 offset:76
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v117, off, s32 offset:68
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v118, off, s32 offset:60
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v114, off, s32 offset:52
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v116, off, s32 offset:44
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v100, off, s32 offset:36
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v103, off, s32 offset:28
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v98, off, s32 offset:20
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v99, off, s32 offset:12
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v96, off, s32 offset:4
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v64, 8, v1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v66, 8, v3
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v65, 8, v5
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v68, 8, v7
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v67, 8, v9
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v70, 8, v11
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v69, 8, v13
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v71, 8, v15
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v80, 8, v17
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v82, 8, v19
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v81, 8, v21
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v84, 8, v23
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v83, 8, v25
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v86, 8, v27
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v85, 8, v29
-; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(62)
-; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v97, 8, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v87, 8, v4
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v101, 8, v6
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v102, 8, v8
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v113, 8, v10
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v112, 8, v12
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v130, 8, v14
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v128, 8, v16
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v134, 8, v18
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v132, 8, v20
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v161, 8, v22
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v147, 8, v24
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v166, 8, v26
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v167, 8, v28
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v180, 8, v30
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v177, 8, v31
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v42, 8, v41
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(61)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v41, 8, v44
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(60)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v45, 8, v45
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(59)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v44, 8, v56
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(58)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v59, 8, v59
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(57)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v56, 8, v60
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(56)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v60, 8, v61
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(55)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v61, 8, v62
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(54)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v63, 8, v63
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(53)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v62, 8, v72
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(52)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v73, 8, v73
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(51)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v72, 8, v74
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(50)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v75, 8, v75
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(49)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v74, 8, v76
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(48)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v77, 8, v77
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(47)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v76, 8, v78
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(46)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v78, 8, v79
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(45)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v79, 8, v88
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(44)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v89, 8, v89
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(43)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v88, 8, v90
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(42)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v91, 8, v91
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(41)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v90, 8, v92
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(40)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v92, 8, v93
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(39)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v93, 8, v94
-; GFX11-TRUE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo
-; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB97_4
-; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-TRUE16-NEXT: s_and_b32 s5, s28, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s29, 8
-; GFX11-TRUE16-NEXT: s_and_b32 s7, s2, 0xff
-; GFX11-TRUE16-NEXT: s_or_b32 s5, s5, s6
-; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s1, 8
-; GFX11-TRUE16-NEXT: v_and_b32_e64 v5, 0xffff, s5
-; GFX11-TRUE16-NEXT: s_and_b32 s5, s0, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s3, 8
-; GFX11-TRUE16-NEXT: s_or_b32 s5, s5, s6
-; GFX11-TRUE16-NEXT: s_or_b32 s6, s7, s8
-; GFX11-TRUE16-NEXT: s_and_b32 s7, s16, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s17, 8
-; GFX11-TRUE16-NEXT: s_and_b32 s9, s18, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s19, 8
-; GFX11-TRUE16-NEXT: s_or_b32 s7, s7, s8
-; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s10
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s5, s6
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s7, s8
-; GFX11-TRUE16-NEXT: s_and_b32 s7, s20, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s21, 8
-; GFX11-TRUE16-NEXT: s_and_b32 s9, s22, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s23, 8
-; GFX11-TRUE16-NEXT: s_or_b32 s7, s7, s8
-; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s10
-; GFX11-TRUE16-NEXT: s_and_b32 s9, s24, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s25, 8
-; GFX11-TRUE16-NEXT: s_and_b32 s11, s26, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s12, s27, 8
-; GFX11-TRUE16-NEXT: s_or_b32 s9, s9, s10
-; GFX11-TRUE16-NEXT: s_or_b32 s10, s11, s12
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s7, s8
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s9, s10
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v36
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v32
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v35
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v34
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v33
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v68
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v64
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v66
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v4, v67
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v65
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v4, v0, 16, v5
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v38
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v39
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v6, v6, 16, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v49
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v5, v3, 16, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v37
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v70
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v50
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v71
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v48
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v69
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v82
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v7, v80
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v8, v81
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v7, v1, 16, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v53
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v8, v9, 16, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v55
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v51
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v9, v10, 16, v3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v84
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v52
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xff, v54
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v86
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v83
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v96
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v85
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v10, v97
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v10, v1, 16, v0
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v11, v87
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v99
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v12
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v11, v3, 16, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v103
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v114
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v98
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v12, v0, 16, v12
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v100
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v113
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v101
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xff, v116
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v14, v128
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v112
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v117
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v102
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v13, v130
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xff, v133
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v14, v132
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v14, v0, 16, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v148
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xff, v118
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xff, v129
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v16, v161
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v13
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v13, v2, 16, v1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v166
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v144
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v15, v134
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v18, v147
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v16
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v167
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v15
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v15, v17, 16, v19
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v17, v18, 16, v22
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s7
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v1, 16, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v151
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v149
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v16, v20, 16, v21
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, s8
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v180
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v177
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v1, 16, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v165
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v162
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v42
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v41
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v1, 16, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v178
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v115
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v45
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v44
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v1, 16, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v131
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v119
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v59
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v56
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v1, 16, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v145
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v135
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v60
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v61
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v1, 16, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v150
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v146
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v63
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v62
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v1, 16, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v163
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v160
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v73
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v72
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v1, 16, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v176
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v164
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v75
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v74
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v1, 16, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v181
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v179
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v77
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v76
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v1, 16, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v183
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v182
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v78
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v79
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v28, v1, 16, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v43
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v40
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v89
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v88
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v29, v1, 16, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v47
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v46
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v91
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v90
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v30, v1, 16, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v58
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v57
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v92
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v93
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v31, v1, 16, v0
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, s6
-; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
-; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB97_3
-; GFX11-TRUE16-NEXT: .LBB97_2: ; %cmp.true
-; GFX11-TRUE16-NEXT: s_add_i32 s28, s28, 3
-; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s29, 8
-; GFX11-TRUE16-NEXT: s_and_b32 s4, s28, 0xff
-; GFX11-TRUE16-NEXT: s_add_i32 s24, s24, 3
-; GFX11-TRUE16-NEXT: s_or_b32 s4, s5, s4
-; GFX11-TRUE16-NEXT: s_and_b32 s5, s24, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s25, 8
-; GFX11-TRUE16-NEXT: s_add_i32 s26, s26, 3
-; GFX11-TRUE16-NEXT: s_or_b32 s5, s6, s5
-; GFX11-TRUE16-NEXT: s_and_b32 s6, s26, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s27, 8
-; GFX11-TRUE16-NEXT: s_add_i32 s20, s20, 3
-; GFX11-TRUE16-NEXT: s_or_b32 s6, s7, s6
-; GFX11-TRUE16-NEXT: s_and_b32 s7, s20, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s21, 8
-; GFX11-TRUE16-NEXT: s_add_i32 s22, s22, 3
-; GFX11-TRUE16-NEXT: s_or_b32 s7, s8, s7
-; GFX11-TRUE16-NEXT: s_and_b32 s8, s22, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s23, 8
-; GFX11-TRUE16-NEXT: s_add_i32 s16, s16, 3
-; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s8
-; GFX11-TRUE16-NEXT: s_and_b32 s9, s16, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s17, 8
-; GFX11-TRUE16-NEXT: s_add_i32 s18, s18, 3
-; GFX11-TRUE16-NEXT: s_add_i32 s0, s0, 3
-; GFX11-TRUE16-NEXT: s_add_i32 s2, s2, 3
-; GFX11-TRUE16-NEXT: s_or_b32 s9, s10, s9
-; GFX11-TRUE16-NEXT: s_and_b32 s10, s18, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s11, s19, 8
-; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 8
-; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8
-; GFX11-TRUE16-NEXT: s_or_b32 s10, s11, s10
-; GFX11-TRUE16-NEXT: s_or_b32 s0, s1, s0
-; GFX11-TRUE16-NEXT: s_or_b32 s1, s3, s2
-; GFX11-TRUE16-NEXT: s_addk_i32 s9, 0x300
-; GFX11-TRUE16-NEXT: s_addk_i32 s0, 0x300
-; GFX11-TRUE16-NEXT: s_addk_i32 s1, 0x300
-; GFX11-TRUE16-NEXT: s_addk_i32 s10, 0x300
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s1
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s9, s10
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(37)
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v58
-; GFX11-TRUE16-NEXT: s_addk_i32 s5, 0x300
-; GFX11-TRUE16-NEXT: s_addk_i32 s6, 0x300
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v57
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s5, s6
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(35)
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v47
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT: s_addk_i32 s7, 0x300
-; GFX11-TRUE16-NEXT: s_addk_i32 s8, 0x300
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v2
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v92, v0
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s7, s8
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v46
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v93, v1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v91, v2
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(33)
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v43
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v27, 0x300, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v40
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v31, 0x300, v1
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v28, 0x300, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v4
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v90, v3
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(31)
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v183
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v182
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v89, v1
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v30, 0x300, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v4
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(29)
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v181
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v88, v0
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v181, 0x300, v1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v78, v2
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v79, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v29, 0x300, v0
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v179
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v182, 0x300, v1
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v179, 0x300, v2
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v77, v3
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(27)
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v176
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v164
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(25)
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v163
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v163, 0x300, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v2
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v76, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v160
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v75, v1
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v160, 0x300, v0
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v74, v2
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v73, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v22, 0x300, v1
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(23)
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v150
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v26, 0x300, v0
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v23, 0x300, v2
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v72, v3
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v146
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(21)
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v145
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v135
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v25, 0x300, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v2
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v63, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v4
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(19)
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v131
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v62, v0
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v131, 0x300, v1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v60, v2
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v61, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v24, 0x300, v0
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v119
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v135, 0x300, v1
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v119, 0x300, v2
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v59, v3
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(17)
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v178
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v115
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(15)
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v165
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v115, 0x300, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v2
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v56, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v162
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v45, v1
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v145, 0x300, v0
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v44, v2
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v42, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v17, 0x300, v1
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(13)
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v151
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v21, 0x300, v0
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v18, 0x300, v2
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v41, v3
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v149
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(11)
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v148
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v144
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v20, 0x300, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v2
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v180, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v4
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(9)
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v133
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v177, v0
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v19, 0x300, v1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v166, v2
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v167, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v133, 0x300, v0
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v129
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v129, 0x300, v1
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v144, 0x300, v2
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v161, v3
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7)
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v118
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v117
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(5)
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v116
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v116, 0x300, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v2
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v147, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v114
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v99
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v114, 0x300, v0
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v132, v2
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v130, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v103
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v98
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v5
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v54
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v53
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v6
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v39
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v33, 3, v33
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 0x300, v0
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v113, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v2
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v128, v3
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v100
-; GFX11-TRUE16-NEXT: s_addk_i32 s4, 0x300
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 0x300, v4
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v101, v5
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v102, v6
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v7
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v96
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v134, v1
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 0x300, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v13, 0x300, v5
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v97, v6
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v55
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v7
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v52
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, 0x300, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v5
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v87, v6
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v7
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v8
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v51
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v86, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v15, 0x300, v5
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v85, v6
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v84, v7
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v8
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v51, 0x300, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v50
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v50, 0x300, v5
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v49
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v83, v7
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v48
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x300, v6
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 0x300, v5
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v7
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v82, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v8
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v9
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v38
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v81, v5
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v38, 0x300, v4
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v71, v7
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v80, v8
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v9
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 0x300, v5
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v37
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v37, 0x300, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v39, 0x300, v7
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v70, v8
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v36
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v5
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v34
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v34, 3, v35
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v35, 0x300, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v7
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v69, v5
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v8
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v34
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v112, v3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v68, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v34, 0x300, v5
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v67, v7
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v66, v8
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v33
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v33, 0x300, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v32
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v32, 0x300, v5
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 0x300, v7
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v65, v8
-; GFX11-TRUE16-NEXT: v_and_b32_e64 v8, 0xffff, s4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff, v6
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 0x300, v7
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v64, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v14
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v10, v10, 16, v36
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v5, v7, 16, v5
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v33
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x300, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v37
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff, v22
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v6, v32, 16, v7
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v4, v4, 16, v8
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v35
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v51
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v38
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v19
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v7, v34, 16, v8
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v8, v39, 16, v33
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v12
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v11
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v11, v50, 16, v32
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v9, v9, 16, v35
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v12, v15, 16, v14
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v13, v13, 16, v33
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v14, v3, 16, v34
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v16, v16, 16, v32
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v116
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v129
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v18
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v17
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v26, 16, v36
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v17, v114, 16, v32
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v144, 16, v33
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v20, 16, v34
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v21, 16, v35
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v115
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v135
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v131
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v23
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff, v27
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v145, 16, v32
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v119, 16, v33
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v24, 16, v34
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v25, 16, v35
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v163
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v182
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v181
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v28
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v15, v2, 16, v0
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v133, 16, v19
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v160, 16, v32
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v28, v179, 16, v33
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v29, v29, 16, v34
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v30, v30, 16, v35
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v31, v31, 16, v36
-; GFX11-TRUE16-NEXT: .LBB97_3: ; %end
-; GFX11-TRUE16-NEXT: s_clause 0x1e
-; GFX11-TRUE16-NEXT: scratch_load_b32 v94, off, s32 offset:320
-; GFX11-TRUE16-NEXT: scratch_load_b32 v93, off, s32 offset:324
-; GFX11-TRUE16-NEXT: scratch_load_b32 v92, off, s32 offset:328
-; GFX11-TRUE16-NEXT: scratch_load_b32 v91, off, s32 offset:332
-; GFX11-TRUE16-NEXT: scratch_load_b32 v90, off, s32 offset:336
-; GFX11-TRUE16-NEXT: scratch_load_b32 v89, off, s32 offset:340
-; GFX11-TRUE16-NEXT: scratch_load_b32 v88, off, s32 offset:344
-; GFX11-TRUE16-NEXT: scratch_load_b32 v79, off, s32 offset:348
-; GFX11-TRUE16-NEXT: scratch_load_b32 v78, off, s32 offset:352
-; GFX11-TRUE16-NEXT: scratch_load_b32 v77, off, s32 offset:356
-; GFX11-TRUE16-NEXT: scratch_load_b32 v76, off, s32 offset:360
-; GFX11-TRUE16-NEXT: scratch_load_b32 v75, off, s32 offset:364
-; GFX11-TRUE16-NEXT: scratch_load_b32 v74, off, s32 offset:368
-; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:372
-; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:376
-; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:380
-; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:384
-; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:388
-; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:392
-; GFX11-TRUE16-NEXT: scratch_load_b32 v59, off, s32 offset:396
-; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:400
-; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:404
-; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:408
-; GFX11-TRUE16-NEXT: scratch_load_b32 v47, off, s32 offset:412
-; GFX11-TRUE16-NEXT: scratch_load_b32 v46, off, s32 offset:416
-; GFX11-TRUE16-NEXT: scratch_load_b32 v45, off, s32 offset:420
-; GFX11-TRUE16-NEXT: scratch_load_b32 v44, off, s32 offset:424
-; GFX11-TRUE16-NEXT: scratch_load_b32 v43, off, s32 offset:428
-; GFX11-TRUE16-NEXT: scratch_load_b32 v42, off, s32 offset:432
-; GFX11-TRUE16-NEXT: scratch_load_b32 v41, off, s32 offset:436
-; GFX11-TRUE16-NEXT: scratch_load_b32 v40, off, s32 offset:440
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
-; GFX11-TRUE16-NEXT: .LBB97_4:
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
-; GFX11-TRUE16-NEXT: s_branch .LBB97_2
-;
-; GFX11-FAKE16-LABEL: bitcast_v128i8_to_v64i16_scalar:
-; GFX11-FAKE16: ; %bb.0:
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-FAKE16-NEXT: s_clause 0x1e
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:440
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:436
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:432
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v43, s32 offset:428
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v44, s32 offset:424
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v45, s32 offset:420
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v46, s32 offset:416
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v47, s32 offset:412
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v56, s32 offset:408
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v57, s32 offset:404
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v58, s32 offset:400
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v59, s32 offset:396
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v60, s32 offset:392
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v61, s32 offset:388
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v62, s32 offset:384
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v63, s32 offset:380
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v72, s32 offset:376
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v73, s32 offset:372
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v74, s32 offset:368
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v75, s32 offset:364
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v76, s32 offset:360
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v77, s32 offset:356
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v78, s32 offset:352
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v79, s32 offset:348
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v88, s32 offset:344
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v89, s32 offset:340
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v90, s32 offset:336
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v91, s32 offset:332
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v92, s32 offset:328
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v93, s32 offset:324
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v94, s32 offset:320
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v54, v30 :: v_dual_mov_b32 v51, v24
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v52, v28 :: v_dual_mov_b32 v55, v26
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v53, v22 :: v_dual_mov_b32 v48, v20
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v50, v18 :: v_dual_mov_b32 v39, v16
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v49, v14 :: v_dual_mov_b32 v34, v8
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v37, v12 :: v_dual_mov_b32 v38, v10
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v36, v6 :: v_dual_mov_b32 v33, v4
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v35, v2 :: v_dual_mov_b32 v32, v0
-; GFX11-FAKE16-NEXT: s_clause 0x1f
-; GFX11-FAKE16-NEXT: scratch_load_b32 v0, off, s32 offset:316
-; GFX11-FAKE16-NEXT: scratch_load_u16 v2, off, s32
-; GFX11-FAKE16-NEXT: scratch_load_u16 v4, off, s32 offset:8
-; GFX11-FAKE16-NEXT: scratch_load_u16 v6, off, s32 offset:16
-; GFX11-FAKE16-NEXT: scratch_load_u16 v8, off, s32 offset:24
-; GFX11-FAKE16-NEXT: scratch_load_u16 v10, off, s32 offset:32
-; GFX11-FAKE16-NEXT: scratch_load_u16 v12, off, s32 offset:40
-; GFX11-FAKE16-NEXT: scratch_load_u16 v14, off, s32 offset:48
-; GFX11-FAKE16-NEXT: scratch_load_u16 v16, off, s32 offset:56
-; GFX11-FAKE16-NEXT: scratch_load_u16 v18, off, s32 offset:64
-; GFX11-FAKE16-NEXT: scratch_load_u16 v20, off, s32 offset:72
-; GFX11-FAKE16-NEXT: scratch_load_u16 v22, off, s32 offset:80
-; GFX11-FAKE16-NEXT: scratch_load_u16 v24, off, s32 offset:88
-; GFX11-FAKE16-NEXT: scratch_load_u16 v26, off, s32 offset:96
-; GFX11-FAKE16-NEXT: scratch_load_u16 v28, off, s32 offset:104
-; GFX11-FAKE16-NEXT: scratch_load_u16 v30, off, s32 offset:112
-; GFX11-FAKE16-NEXT: scratch_load_u16 v31, off, s32 offset:120
-; GFX11-FAKE16-NEXT: scratch_load_u16 v41, off, s32 offset:128
-; GFX11-FAKE16-NEXT: scratch_load_u16 v44, off, s32 offset:136
-; GFX11-FAKE16-NEXT: scratch_load_u16 v45, off, s32 offset:144
-; GFX11-FAKE16-NEXT: scratch_load_u16 v56, off, s32 offset:152
-; GFX11-FAKE16-NEXT: scratch_load_u16 v59, off, s32 offset:160
-; GFX11-FAKE16-NEXT: scratch_load_u16 v60, off, s32 offset:168
-; GFX11-FAKE16-NEXT: scratch_load_u16 v61, off, s32 offset:176
-; GFX11-FAKE16-NEXT: scratch_load_u16 v62, off, s32 offset:184
-; GFX11-FAKE16-NEXT: scratch_load_u16 v63, off, s32 offset:192
-; GFX11-FAKE16-NEXT: scratch_load_u16 v72, off, s32 offset:200
-; GFX11-FAKE16-NEXT: scratch_load_u16 v73, off, s32 offset:208
-; GFX11-FAKE16-NEXT: scratch_load_u16 v74, off, s32 offset:216
-; GFX11-FAKE16-NEXT: scratch_load_u16 v75, off, s32 offset:224
-; GFX11-FAKE16-NEXT: scratch_load_u16 v76, off, s32 offset:232
-; GFX11-FAKE16-NEXT: scratch_load_u16 v77, off, s32 offset:240
-; GFX11-FAKE16-NEXT: s_clause 0x1f
-; GFX11-FAKE16-NEXT: scratch_load_u16 v78, off, s32 offset:248
-; GFX11-FAKE16-NEXT: scratch_load_u16 v79, off, s32 offset:256
-; GFX11-FAKE16-NEXT: scratch_load_u16 v88, off, s32 offset:264
-; GFX11-FAKE16-NEXT: scratch_load_u16 v89, off, s32 offset:272
-; GFX11-FAKE16-NEXT: scratch_load_u16 v90, off, s32 offset:280
-; GFX11-FAKE16-NEXT: scratch_load_u16 v91, off, s32 offset:288
-; GFX11-FAKE16-NEXT: scratch_load_u16 v92, off, s32 offset:296
-; GFX11-FAKE16-NEXT: scratch_load_u16 v93, off, s32 offset:304
-; GFX11-FAKE16-NEXT: scratch_load_u16 v94, off, s32 offset:312
-; GFX11-FAKE16-NEXT: scratch_load_u16 v57, off, s32 offset:308
-; GFX11-FAKE16-NEXT: scratch_load_u16 v58, off, s32 offset:300
-; GFX11-FAKE16-NEXT: scratch_load_u16 v46, off, s32 offset:292
-; GFX11-FAKE16-NEXT: scratch_load_u16 v47, off, s32 offset:284
-; GFX11-FAKE16-NEXT: scratch_load_u16 v40, off, s32 offset:276
-; GFX11-FAKE16-NEXT: scratch_load_u16 v43, off, s32 offset:268
-; GFX11-FAKE16-NEXT: scratch_load_u16 v182, off, s32 offset:260
-; GFX11-FAKE16-NEXT: scratch_load_u16 v183, off, s32 offset:252
-; GFX11-FAKE16-NEXT: scratch_load_u16 v178, off, s32 offset:244
-; GFX11-FAKE16-NEXT: scratch_load_u16 v181, off, s32 offset:236
-; GFX11-FAKE16-NEXT: scratch_load_u16 v164, off, s32 offset:228
-; GFX11-FAKE16-NEXT: scratch_load_u16 v176, off, s32 offset:220
-; GFX11-FAKE16-NEXT: scratch_load_u16 v160, off, s32 offset:212
-; GFX11-FAKE16-NEXT: scratch_load_u16 v163, off, s32 offset:204
-; GFX11-FAKE16-NEXT: scratch_load_u16 v146, off, s32 offset:196
-; GFX11-FAKE16-NEXT: scratch_load_u16 v150, off, s32 offset:188
-; GFX11-FAKE16-NEXT: scratch_load_u16 v135, off, s32 offset:180
-; GFX11-FAKE16-NEXT: scratch_load_u16 v145, off, s32 offset:172
-; GFX11-FAKE16-NEXT: scratch_load_u16 v118, off, s32 offset:164
-; GFX11-FAKE16-NEXT: scratch_load_u16 v131, off, s32 offset:156
-; GFX11-FAKE16-NEXT: scratch_load_u16 v115, off, s32 offset:148
-; GFX11-FAKE16-NEXT: scratch_load_u16 v179, off, s32 offset:140
-; GFX11-FAKE16-NEXT: scratch_load_u16 v162, off, s32 offset:132
-; GFX11-FAKE16-NEXT: s_clause 0xf
-; GFX11-FAKE16-NEXT: scratch_load_u16 v165, off, s32 offset:124
-; GFX11-FAKE16-NEXT: scratch_load_u16 v149, off, s32 offset:116
-; GFX11-FAKE16-NEXT: scratch_load_u16 v151, off, s32 offset:108
-; GFX11-FAKE16-NEXT: scratch_load_u16 v144, off, s32 offset:100
-; GFX11-FAKE16-NEXT: scratch_load_u16 v148, off, s32 offset:92
-; GFX11-FAKE16-NEXT: scratch_load_u16 v129, off, s32 offset:84
-; GFX11-FAKE16-NEXT: scratch_load_u16 v133, off, s32 offset:76
-; GFX11-FAKE16-NEXT: scratch_load_u16 v117, off, s32 offset:68
-; GFX11-FAKE16-NEXT: scratch_load_u16 v119, off, s32 offset:60
-; GFX11-FAKE16-NEXT: scratch_load_u16 v114, off, s32 offset:52
-; GFX11-FAKE16-NEXT: scratch_load_u16 v116, off, s32 offset:44
-; GFX11-FAKE16-NEXT: scratch_load_u16 v100, off, s32 offset:36
-; GFX11-FAKE16-NEXT: scratch_load_u16 v103, off, s32 offset:28
-; GFX11-FAKE16-NEXT: scratch_load_u16 v98, off, s32 offset:20
-; GFX11-FAKE16-NEXT: scratch_load_u16 v99, off, s32 offset:12
-; GFX11-FAKE16-NEXT: scratch_load_u16 v96, off, s32 offset:4
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v64, 8, v1
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v66, 8, v3
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v65, 8, v5
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v68, 8, v7
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v67, 8, v9
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v70, 8, v11
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v69, 8, v13
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v71, 8, v15
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v80, 8, v17
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v82, 8, v19
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v81, 8, v21
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v84, 8, v23
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v83, 8, v25
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v86, 8, v27
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v85, 8, v29
-; GFX11-FAKE16-NEXT: s_mov_b32 s4, 0
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(62)
-; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v97, 8, v2
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v87, 8, v4
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v101, 8, v6
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v102, 8, v8
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v113, 8, v10
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v112, 8, v12
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v130, 8, v14
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v128, 8, v16
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v134, 8, v18
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v132, 8, v20
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v161, 8, v22
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v147, 8, v24
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v166, 8, v26
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v167, 8, v28
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v180, 8, v30
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v177, 8, v31
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v42, 8, v41
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(61)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v41, 8, v44
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(60)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v45, 8, v45
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(59)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v44, 8, v56
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(58)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v59, 8, v59
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(57)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v56, 8, v60
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(56)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v60, 8, v61
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(55)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v61, 8, v62
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(54)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v63, 8, v63
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(53)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v62, 8, v72
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(52)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v73, 8, v73
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(51)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v72, 8, v74
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(50)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v75, 8, v75
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(49)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v74, 8, v76
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(48)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v77, 8, v77
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(47)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v76, 8, v78
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(46)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v78, 8, v79
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(45)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v79, 8, v88
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(44)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v89, 8, v89
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(43)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v88, 8, v90
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(42)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v91, 8, v91
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(41)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v90, 8, v92
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(40)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v92, 8, v93
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(39)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v93, 8, v94
-; GFX11-FAKE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo
-; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB97_4
-; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-FAKE16-NEXT: s_and_b32 s5, s28, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s29, 8
-; GFX11-FAKE16-NEXT: s_and_b32 s7, s2, 0xff
-; GFX11-FAKE16-NEXT: s_or_b32 s5, s5, s6
-; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s1, 8
-; GFX11-FAKE16-NEXT: v_and_b32_e64 v5, 0xffff, s5
-; GFX11-FAKE16-NEXT: s_and_b32 s5, s0, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s3, 8
-; GFX11-FAKE16-NEXT: s_or_b32 s5, s5, s6
-; GFX11-FAKE16-NEXT: s_or_b32 s6, s7, s8
-; GFX11-FAKE16-NEXT: s_and_b32 s7, s16, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s17, 8
-; GFX11-FAKE16-NEXT: s_and_b32 s9, s18, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s19, 8
-; GFX11-FAKE16-NEXT: s_or_b32 s7, s7, s8
-; GFX11-FAKE16-NEXT: s_or_b32 s8, s9, s10
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s5, s6
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s7, s8
-; GFX11-FAKE16-NEXT: s_and_b32 s7, s20, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s21, 8
-; GFX11-FAKE16-NEXT: s_and_b32 s9, s22, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s23, 8
-; GFX11-FAKE16-NEXT: s_or_b32 s7, s7, s8
-; GFX11-FAKE16-NEXT: s_or_b32 s8, s9, s10
-; GFX11-FAKE16-NEXT: s_and_b32 s9, s24, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s25, 8
-; GFX11-FAKE16-NEXT: s_and_b32 s11, s26, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s12, s27, 8
-; GFX11-FAKE16-NEXT: s_or_b32 s9, s9, s10
-; GFX11-FAKE16-NEXT: s_or_b32 s10, s11, s12
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s7, s8
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s9, s10
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v36
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v32
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v35
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v34
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v33
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v68
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v64
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v66
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v4, v67
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v65
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v4, v0, 16, v5
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v38
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v39
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v6, v6, 16, v2
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v49
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v5, v3, 16, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v37
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v70
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v50
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v71
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v48
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v69
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v82
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v7, v80
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v8, v81
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v7, v1, 16, v0
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v53
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v8, v9, 16, v2
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v55
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v51
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v9, v10, 16, v3
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v84
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v52
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v54
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v86
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v83
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v96
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v85
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v10, v97
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v10, v1, 16, v0
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v11, v87
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v99
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff, v12
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v11, v3, 16, v2
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v103
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v114
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v98
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v12, v0, 16, v12
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v100
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v113
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v101
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v116
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v14, v128
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v112
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v117
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v102
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v13, v130
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v133
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v14, v132
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v14, v0, 16, v3
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v148
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v119
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xff, v129
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v16, v161
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v13
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v13, v2, 16, v1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v166
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v144
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v15, v134
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, v18, v147
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xffff, v16
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v167
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff, v15
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v17, 16, v19
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v17, v18, 16, v22
-; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, s7
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v1, 16, v0
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v151
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v149
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v16, v20, 16, v21
-; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, s8
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v180
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v177
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v1, 16, v0
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v165
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v162
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v42
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v41
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v1, 16, v0
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v179
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v115
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v45
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v44
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v1, 16, v0
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v131
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v118
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v59
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v56
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v1, 16, v0
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v145
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v135
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v60
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v61
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v1, 16, v0
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v150
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v146
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v63
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v62
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v24, v1, 16, v0
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v163
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v160
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v73
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v72
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v1, 16, v0
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v176
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v164
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v75
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v74
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v26, v1, 16, v0
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v181
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v178
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v77
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v76
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v27, v1, 16, v0
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v183
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v182
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v78
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v79
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v28, v1, 16, v0
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v43
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v40
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v89
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v88
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v29, v1, 16, v0
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v47
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v46
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v91
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v90
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v30, v1, 16, v0
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v58
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v57
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v92
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v93
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v31, v1, 16, v0
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, s6
-; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
-; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB97_3
-; GFX11-FAKE16-NEXT: .LBB97_2: ; %cmp.true
-; GFX11-FAKE16-NEXT: s_add_i32 s28, s28, 3
-; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s29, 8
-; GFX11-FAKE16-NEXT: s_and_b32 s4, s28, 0xff
-; GFX11-FAKE16-NEXT: s_add_i32 s24, s24, 3
-; GFX11-FAKE16-NEXT: s_or_b32 s4, s5, s4
-; GFX11-FAKE16-NEXT: s_and_b32 s5, s24, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s25, 8
-; GFX11-FAKE16-NEXT: s_add_i32 s26, s26, 3
-; GFX11-FAKE16-NEXT: s_or_b32 s5, s6, s5
-; GFX11-FAKE16-NEXT: s_and_b32 s6, s26, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s27, 8
-; GFX11-FAKE16-NEXT: s_add_i32 s20, s20, 3
-; GFX11-FAKE16-NEXT: s_or_b32 s6, s7, s6
-; GFX11-FAKE16-NEXT: s_and_b32 s7, s20, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s21, 8
-; GFX11-FAKE16-NEXT: s_add_i32 s22, s22, 3
-; GFX11-FAKE16-NEXT: s_or_b32 s7, s8, s7
-; GFX11-FAKE16-NEXT: s_and_b32 s8, s22, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s9, s23, 8
-; GFX11-FAKE16-NEXT: s_add_i32 s16, s16, 3
-; GFX11-FAKE16-NEXT: s_or_b32 s8, s9, s8
-; GFX11-FAKE16-NEXT: s_and_b32 s9, s16, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s17, 8
-; GFX11-FAKE16-NEXT: s_add_i32 s18, s18, 3
-; GFX11-FAKE16-NEXT: s_add_i32 s0, s0, 3
-; GFX11-FAKE16-NEXT: s_add_i32 s2, s2, 3
-; GFX11-FAKE16-NEXT: s_or_b32 s9, s10, s9
-; GFX11-FAKE16-NEXT: s_and_b32 s10, s18, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s11, s19, 8
-; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 8
-; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 8
-; GFX11-FAKE16-NEXT: s_or_b32 s10, s11, s10
-; GFX11-FAKE16-NEXT: s_or_b32 s0, s1, s0
-; GFX11-FAKE16-NEXT: s_or_b32 s1, s3, s2
-; GFX11-FAKE16-NEXT: s_addk_i32 s9, 0x300
-; GFX11-FAKE16-NEXT: s_addk_i32 s0, 0x300
-; GFX11-FAKE16-NEXT: s_addk_i32 s1, 0x300
-; GFX11-FAKE16-NEXT: s_addk_i32 s10, 0x300
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s1
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s9, s10
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(37)
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v58
-; GFX11-FAKE16-NEXT: s_addk_i32 s5, 0x300
-; GFX11-FAKE16-NEXT: s_addk_i32 s6, 0x300
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v57
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s5, s6
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(35)
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v47
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-FAKE16-NEXT: s_addk_i32 s7, 0x300
-; GFX11-FAKE16-NEXT: s_addk_i32 s8, 0x300
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v92, v0
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s7, s8
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v46
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v93, v1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v91, v2
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(33)
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v43
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v27, 0x300, v0
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v40
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v31, 0x300, v1
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v28, 0x300, v2
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v4
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v90, v3
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(31)
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v183
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v182
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v89, v1
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v30, 0x300, v3
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v4
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(29)
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v181
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v88, v0
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v181, 0x300, v1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v78, v2
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v79, v3
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v4
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v29, 0x300, v0
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v178
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v182, 0x300, v1
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v178, 0x300, v2
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v77, v3
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(27)
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v176
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v164
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(25)
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v163
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v163, 0x300, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v2
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v76, v0
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v3
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v4
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v160
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v75, v1
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v160, 0x300, v0
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v74, v2
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v73, v3
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v4
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v22, 0x300, v1
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(23)
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v150
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v26, 0x300, v0
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v23, 0x300, v2
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v72, v3
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v146
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(21)
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v145
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v135
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v25, 0x300, v0
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v2
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v63, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v3
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v4
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(19)
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v131
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v62, v0
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v131, 0x300, v1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v60, v2
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v61, v3
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v4
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v24, 0x300, v0
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v118
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v135, 0x300, v1
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v118, 0x300, v2
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v59, v3
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(17)
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v179
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v115
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(15)
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v165
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v115, 0x300, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v2
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v56, v0
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v3
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v4
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v162
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v45, v1
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v145, 0x300, v0
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v44, v2
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v42, v3
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v4
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 0x300, v1
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(13)
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v151
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v21, 0x300, v0
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v18, 0x300, v2
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v41, v3
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v149
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(11)
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v148
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v144
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v20, 0x300, v0
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v2
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v180, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v3
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v4
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(9)
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v133
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v177, v0
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v19, 0x300, v1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v166, v2
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v167, v3
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v4
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v133, 0x300, v0
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v129
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v129, 0x300, v1
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v144, 0x300, v2
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v161, v3
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(7)
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v119
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v117
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(5)
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v116
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v116, 0x300, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v2
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v147, v0
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v3
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v4
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v114
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1)
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 3, v99
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v114, 0x300, v0
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v132, v2
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v130, v3
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v4
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v103
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 3, v98
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v5
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 3, v54
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 3, v53
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v6
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, 3, v39
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v33, 3, v33
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v16, 0x300, v0
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v113, v4
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v2
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v128, v3
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v100
-; GFX11-FAKE16-NEXT: s_addk_i32 s4, 0x300
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, 0x300, v4
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v101, v5
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v102, v6
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v7
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 3, v96
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v134, v1
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, 0x300, v4
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v13, 0x300, v5
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v97, v6
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 3, v55
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v7
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 3, v52
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v14, 0x300, v4
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v5
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v87, v6
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v7
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v8
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 3, v51
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v86, v4
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v15, 0x300, v5
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v85, v6
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v84, v7
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v8
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v51, 0x300, v4
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v50
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v50, 0x300, v5
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 3, v49
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v83, v7
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 3, v48
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 0x300, v6
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, 0x300, v5
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v7
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v82, v4
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v8
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v9
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, 3, v38
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v81, v5
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v38, 0x300, v4
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v71, v7
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v80, v8
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v9
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, 0x300, v5
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 3, v37
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v37, 0x300, v4
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v39, 0x300, v7
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v70, v8
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 3, v36
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v5
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 3, v34
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, 3, v35
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v35, 0x300, v4
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v7
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v69, v5
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v8
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v34
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v112, v3
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v68, v4
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v34, 0x300, v5
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v67, v7
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v66, v8
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v33
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v33, 0x300, v4
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v32
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v32, 0x300, v5
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 0x300, v7
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v65, v8
-; GFX11-FAKE16-NEXT: v_and_b32_e64 v8, 0xffff, s4
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xffff, v6
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 0x300, v7
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v64, v4
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xffff, v14
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v10, v10, 16, v36
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v5, v7, 16, v5
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v33
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x300, v4
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xffff, v37
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xffff, v22
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v6, v32, 16, v7
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v4, v4, 16, v8
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff, v35
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xffff, v51
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xffff, v38
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v19
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v7, v34, 16, v8
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v8, v39, 16, v33
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xffff, v12
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xffff, v11
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v11, v50, 16, v32
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xffff, v1
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v9, v9, 16, v35
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v12, v15, 16, v14
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v13, v13, 16, v33
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v14, v3, 16, v34
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v16, v16, 16, v32
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xffff, v116
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xffff, v129
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xffff, v18
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xffff, v17
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v26, v26, 16, v36
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v17, v114, 16, v32
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v144, 16, v33
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v20, 16, v34
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v21, v21, 16, v35
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xffff, v115
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xffff, v135
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xffff, v131
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xffff, v23
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xffff, v27
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v145, 16, v32
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v23, v118, 16, v33
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v24, v24, 16, v34
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v25, v25, 16, v35
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v32, 0xffff, v163
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v33, 0xffff, v182
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xffff, v181
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v35, 0xffff, v28
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v2, 16, v0
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v133, 16, v19
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v27, v160, 16, v32
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v28, v178, 16, v33
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v29, v29, 16, v34
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v30, v30, 16, v35
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v31, v31, 16, v36
-; GFX11-FAKE16-NEXT: .LBB97_3: ; %end
-; GFX11-FAKE16-NEXT: s_clause 0x1e
-; GFX11-FAKE16-NEXT: scratch_load_b32 v94, off, s32 offset:320
-; GFX11-FAKE16-NEXT: scratch_load_b32 v93, off, s32 offset:324
-; GFX11-FAKE16-NEXT: scratch_load_b32 v92, off, s32 offset:328
-; GFX11-FAKE16-NEXT: scratch_load_b32 v91, off, s32 offset:332
-; GFX11-FAKE16-NEXT: scratch_load_b32 v90, off, s32 offset:336
-; GFX11-FAKE16-NEXT: scratch_load_b32 v89, off, s32 offset:340
-; GFX11-FAKE16-NEXT: scratch_load_b32 v88, off, s32 offset:344
-; GFX11-FAKE16-NEXT: scratch_load_b32 v79, off, s32 offset:348
-; GFX11-FAKE16-NEXT: scratch_load_b32 v78, off, s32 offset:352
-; GFX11-FAKE16-NEXT: scratch_load_b32 v77, off, s32 offset:356
-; GFX11-FAKE16-NEXT: scratch_load_b32 v76, off, s32 offset:360
-; GFX11-FAKE16-NEXT: scratch_load_b32 v75, off, s32 offset:364
-; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:368
-; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:372
-; GFX11-FAKE16-NEXT: scratch_load_b32 v72, off, s32 offset:376
-; GFX11-FAKE16-NEXT: scratch_load_b32 v63, off, s32 offset:380
-; GFX11-FAKE16-NEXT: scratch_load_b32 v62, off, s32 offset:384
-; GFX11-FAKE16-NEXT: scratch_load_b32 v61, off, s32 offset:388
-; GFX11-FAKE16-NEXT: scratch_load_b32 v60, off, s32 offset:392
-; GFX11-FAKE16-NEXT: scratch_load_b32 v59, off, s32 offset:396
-; GFX11-FAKE16-NEXT: scratch_load_b32 v58, off, s32 offset:400
-; GFX11-FAKE16-NEXT: scratch_load_b32 v57, off, s32 offset:404
-; GFX11-FAKE16-NEXT: scratch_load_b32 v56, off, s32 offset:408
-; GFX11-FAKE16-NEXT: scratch_load_b32 v47, off, s32 offset:412
-; GFX11-FAKE16-NEXT: scratch_load_b32 v46, off, s32 offset:416
-; GFX11-FAKE16-NEXT: scratch_load_b32 v45, off, s32 offset:420
-; GFX11-FAKE16-NEXT: scratch_load_b32 v44, off, s32 offset:424
-; GFX11-FAKE16-NEXT: scratch_load_b32 v43, off, s32 offset:428
-; GFX11-FAKE16-NEXT: scratch_load_b32 v42, off, s32 offset:432
-; GFX11-FAKE16-NEXT: scratch_load_b32 v41, off, s32 offset:436
-; GFX11-FAKE16-NEXT: scratch_load_b32 v40, off, s32 offset:440
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
-; GFX11-FAKE16-NEXT: .LBB97_4:
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
-; GFX11-FAKE16-NEXT: s_branch .LBB97_2
+; GFX11-LABEL: bitcast_v128i8_to_v64i16_scalar:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_clause 0x1e
+; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:440
+; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:436
+; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:432
+; GFX11-NEXT: scratch_store_b32 off, v43, s32 offset:428
+; GFX11-NEXT: scratch_store_b32 off, v44, s32 offset:424
+; GFX11-NEXT: scratch_store_b32 off, v45, s32 offset:420
+; GFX11-NEXT: scratch_store_b32 off, v46, s32 offset:416
+; GFX11-NEXT: scratch_store_b32 off, v47, s32 offset:412
+; GFX11-NEXT: scratch_store_b32 off, v56, s32 offset:408
+; GFX11-NEXT: scratch_store_b32 off, v57, s32 offset:404
+; GFX11-NEXT: scratch_store_b32 off, v58, s32 offset:400
+; GFX11-NEXT: scratch_store_b32 off, v59, s32 offset:396
+; GFX11-NEXT: scratch_store_b32 off, v60, s32 offset:392
+; GFX11-NEXT: scratch_store_b32 off, v61, s32 offset:388
+; GFX11-NEXT: scratch_store_b32 off, v62, s32 offset:384
+; GFX11-NEXT: scratch_store_b32 off, v63, s32 offset:380
+; GFX11-NEXT: scratch_store_b32 off, v72, s32 offset:376
+; GFX11-NEXT: scratch_store_b32 off, v73, s32 offset:372
+; GFX11-NEXT: scratch_store_b32 off, v74, s32 offset:368
+; GFX11-NEXT: scratch_store_b32 off, v75, s32 offset:364
+; GFX11-NEXT: scratch_store_b32 off, v76, s32 offset:360
+; GFX11-NEXT: scratch_store_b32 off, v77, s32 offset:356
+; GFX11-NEXT: scratch_store_b32 off, v78, s32 offset:352
+; GFX11-NEXT: scratch_store_b32 off, v79, s32 offset:348
+; GFX11-NEXT: scratch_store_b32 off, v88, s32 offset:344
+; GFX11-NEXT: scratch_store_b32 off, v89, s32 offset:340
+; GFX11-NEXT: scratch_store_b32 off, v90, s32 offset:336
+; GFX11-NEXT: scratch_store_b32 off, v91, s32 offset:332
+; GFX11-NEXT: scratch_store_b32 off, v92, s32 offset:328
+; GFX11-NEXT: scratch_store_b32 off, v93, s32 offset:324
+; GFX11-NEXT: scratch_store_b32 off, v94, s32 offset:320
+; GFX11-NEXT: v_dual_mov_b32 v54, v30 :: v_dual_mov_b32 v51, v24
+; GFX11-NEXT: v_dual_mov_b32 v52, v28 :: v_dual_mov_b32 v55, v26
+; GFX11-NEXT: v_dual_mov_b32 v53, v22 :: v_dual_mov_b32 v48, v20
+; GFX11-NEXT: v_dual_mov_b32 v50, v18 :: v_dual_mov_b32 v39, v16
+; GFX11-NEXT: v_dual_mov_b32 v49, v14 :: v_dual_mov_b32 v34, v8
+; GFX11-NEXT: v_dual_mov_b32 v37, v12 :: v_dual_mov_b32 v38, v10
+; GFX11-NEXT: v_dual_mov_b32 v36, v6 :: v_dual_mov_b32 v33, v4
+; GFX11-NEXT: v_dual_mov_b32 v35, v2 :: v_dual_mov_b32 v32, v0
+; GFX11-NEXT: s_clause 0x1f
+; GFX11-NEXT: scratch_load_b32 v0, off, s32 offset:316
+; GFX11-NEXT: scratch_load_u16 v2, off, s32
+; GFX11-NEXT: scratch_load_u16 v4, off, s32 offset:8
+; GFX11-NEXT: scratch_load_u16 v6, off, s32 offset:16
+; GFX11-NEXT: scratch_load_u16 v8, off, s32 offset:24
+; GFX11-NEXT: scratch_load_u16 v10, off, s32 offset:32
+; GFX11-NEXT: scratch_load_u16 v12, off, s32 offset:40
+; GFX11-NEXT: scratch_load_u16 v14, off, s32 offset:48
+; GFX11-NEXT: scratch_load_u16 v16, off, s32 offset:56
+; GFX11-NEXT: scratch_load_u16 v18, off, s32 offset:64
+; GFX11-NEXT: scratch_load_u16 v20, off, s32 offset:72
+; GFX11-NEXT: scratch_load_u16 v22, off, s32 offset:80
+; GFX11-NEXT: scratch_load_u16 v24, off, s32 offset:88
+; GFX11-NEXT: scratch_load_u16 v26, off, s32 offset:96
+; GFX11-NEXT: scratch_load_u16 v28, off, s32 offset:104
+; GFX11-NEXT: scratch_load_u16 v30, off, s32 offset:112
+; GFX11-NEXT: scratch_load_u16 v31, off, s32 offset:120
+; GFX11-NEXT: scratch_load_u16 v41, off, s32 offset:128
+; GFX11-NEXT: scratch_load_u16 v44, off, s32 offset:136
+; GFX11-NEXT: scratch_load_u16 v45, off, s32 offset:144
+; GFX11-NEXT: scratch_load_u16 v56, off, s32 offset:152
+; GFX11-NEXT: scratch_load_u16 v59, off, s32 offset:160
+; GFX11-NEXT: scratch_load_u16 v60, off, s32 offset:168
+; GFX11-NEXT: scratch_load_u16 v61, off, s32 offset:176
+; GFX11-NEXT: scratch_load_u16 v62, off, s32 offset:184
+; GFX11-NEXT: scratch_load_u16 v63, off, s32 offset:192
+; GFX11-NEXT: scratch_load_u16 v72, off, s32 offset:200
+; GFX11-NEXT: scratch_load_u16 v73, off, s32 offset:208
+; GFX11-NEXT: scratch_load_u16 v74, off, s32 offset:216
+; GFX11-NEXT: scratch_load_u16 v75, off, s32 offset:224
+; GFX11-NEXT: scratch_load_u16 v76, off, s32 offset:232
+; GFX11-NEXT: scratch_load_u16 v77, off, s32 offset:240
+; GFX11-NEXT: s_clause 0x1f
+; GFX11-NEXT: scratch_load_u16 v78, off, s32 offset:248
+; GFX11-NEXT: scratch_load_u16 v79, off, s32 offset:256
+; GFX11-NEXT: scratch_load_u16 v88, off, s32 offset:264
+; GFX11-NEXT: scratch_load_u16 v89, off, s32 offset:272
+; GFX11-NEXT: scratch_load_u16 v90, off, s32 offset:280
+; GFX11-NEXT: scratch_load_u16 v91, off, s32 offset:288
+; GFX11-NEXT: scratch_load_u16 v92, off, s32 offset:296
+; GFX11-NEXT: scratch_load_u16 v93, off, s32 offset:304
+; GFX11-NEXT: scratch_load_u16 v94, off, s32 offset:312
+; GFX11-NEXT: scratch_load_u16 v57, off, s32 offset:308
+; GFX11-NEXT: scratch_load_u16 v58, off, s32 offset:300
+; GFX11-NEXT: scratch_load_u16 v46, off, s32 offset:292
+; GFX11-NEXT: scratch_load_u16 v47, off, s32 offset:284
+; GFX11-NEXT: scratch_load_u16 v40, off, s32 offset:276
+; GFX11-NEXT: scratch_load_u16 v43, off, s32 offset:268
+; GFX11-NEXT: scratch_load_u16 v182, off, s32 offset:260
+; GFX11-NEXT: scratch_load_u16 v183, off, s32 offset:252
+; GFX11-NEXT: scratch_load_u16 v178, off, s32 offset:244
+; GFX11-NEXT: scratch_load_u16 v181, off, s32 offset:236
+; GFX11-NEXT: scratch_load_u16 v164, off, s32 offset:228
+; GFX11-NEXT: scratch_load_u16 v176, off, s32 offset:220
+; GFX11-NEXT: scratch_load_u16 v160, off, s32 offset:212
+; GFX11-NEXT: scratch_load_u16 v163, off, s32 offset:204
+; GFX11-NEXT: scratch_load_u16 v146, off, s32 offset:196
+; GFX11-NEXT: scratch_load_u16 v150, off, s32 offset:188
+; GFX11-NEXT: scratch_load_u16 v135, off, s32 offset:180
+; GFX11-NEXT: scratch_load_u16 v145, off, s32 offset:172
+; GFX11-NEXT: scratch_load_u16 v118, off, s32 offset:164
+; GFX11-NEXT: scratch_load_u16 v131, off, s32 offset:156
+; GFX11-NEXT: scratch_load_u16 v115, off, s32 offset:148
+; GFX11-NEXT: scratch_load_u16 v179, off, s32 offset:140
+; GFX11-NEXT: scratch_load_u16 v162, off, s32 offset:132
+; GFX11-NEXT: s_clause 0xf
+; GFX11-NEXT: scratch_load_u16 v165, off, s32 offset:124
+; GFX11-NEXT: scratch_load_u16 v149, off, s32 offset:116
+; GFX11-NEXT: scratch_load_u16 v151, off, s32 offset:108
+; GFX11-NEXT: scratch_load_u16 v144, off, s32 offset:100
+; GFX11-NEXT: scratch_load_u16 v148, off, s32 offset:92
+; GFX11-NEXT: scratch_load_u16 v129, off, s32 offset:84
+; GFX11-NEXT: scratch_load_u16 v133, off, s32 offset:76
+; GFX11-NEXT: scratch_load_u16 v117, off, s32 offset:68
+; GFX11-NEXT: scratch_load_u16 v119, off, s32 offset:60
+; GFX11-NEXT: scratch_load_u16 v114, off, s32 offset:52
+; GFX11-NEXT: scratch_load_u16 v116, off, s32 offset:44
+; GFX11-NEXT: scratch_load_u16 v100, off, s32 offset:36
+; GFX11-NEXT: scratch_load_u16 v103, off, s32 offset:28
+; GFX11-NEXT: scratch_load_u16 v98, off, s32 offset:20
+; GFX11-NEXT: scratch_load_u16 v99, off, s32 offset:12
+; GFX11-NEXT: scratch_load_u16 v96, off, s32 offset:4
+; GFX11-NEXT: v_lshlrev_b32_e32 v64, 8, v1
+; GFX11-NEXT: v_lshlrev_b32_e32 v66, 8, v3
+; GFX11-NEXT: v_lshlrev_b32_e32 v65, 8, v5
+; GFX11-NEXT: v_lshlrev_b32_e32 v68, 8, v7
+; GFX11-NEXT: v_lshlrev_b32_e32 v67, 8, v9
+; GFX11-NEXT: v_lshlrev_b32_e32 v70, 8, v11
+; GFX11-NEXT: v_lshlrev_b32_e32 v69, 8, v13
+; GFX11-NEXT: v_lshlrev_b32_e32 v71, 8, v15
+; GFX11-NEXT: v_lshlrev_b32_e32 v80, 8, v17
+; GFX11-NEXT: v_lshlrev_b32_e32 v82, 8, v19
+; GFX11-NEXT: v_lshlrev_b32_e32 v81, 8, v21
+; GFX11-NEXT: v_lshlrev_b32_e32 v84, 8, v23
+; GFX11-NEXT: v_lshlrev_b32_e32 v83, 8, v25
+; GFX11-NEXT: v_lshlrev_b32_e32 v86, 8, v27
+; GFX11-NEXT: v_lshlrev_b32_e32 v85, 8, v29
+; GFX11-NEXT: s_mov_b32 s4, 0
+; GFX11-NEXT: s_waitcnt vmcnt(62)
+; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v97, 8, v2
+; GFX11-NEXT: v_lshlrev_b32_e32 v87, 8, v4
+; GFX11-NEXT: v_lshlrev_b32_e32 v101, 8, v6
+; GFX11-NEXT: v_lshlrev_b32_e32 v102, 8, v8
+; GFX11-NEXT: v_lshlrev_b32_e32 v113, 8, v10
+; GFX11-NEXT: v_lshlrev_b32_e32 v112, 8, v12
+; GFX11-NEXT: v_lshlrev_b32_e32 v130, 8, v14
+; GFX11-NEXT: v_lshlrev_b32_e32 v128, 8, v16
+; GFX11-NEXT: v_lshlrev_b32_e32 v134, 8, v18
+; GFX11-NEXT: v_lshlrev_b32_e32 v132, 8, v20
+; GFX11-NEXT: v_lshlrev_b32_e32 v161, 8, v22
+; GFX11-NEXT: v_lshlrev_b32_e32 v147, 8, v24
+; GFX11-NEXT: v_lshlrev_b32_e32 v166, 8, v26
+; GFX11-NEXT: v_lshlrev_b32_e32 v167, 8, v28
+; GFX11-NEXT: v_lshlrev_b32_e32 v180, 8, v30
+; GFX11-NEXT: v_lshlrev_b32_e32 v177, 8, v31
+; GFX11-NEXT: v_lshlrev_b32_e32 v42, 8, v41
+; GFX11-NEXT: s_waitcnt vmcnt(61)
+; GFX11-NEXT: v_lshlrev_b32_e32 v41, 8, v44
+; GFX11-NEXT: s_waitcnt vmcnt(60)
+; GFX11-NEXT: v_lshlrev_b32_e32 v45, 8, v45
+; GFX11-NEXT: s_waitcnt vmcnt(59)
+; GFX11-NEXT: v_lshlrev_b32_e32 v44, 8, v56
+; GFX11-NEXT: s_waitcnt vmcnt(58)
+; GFX11-NEXT: v_lshlrev_b32_e32 v59, 8, v59
+; GFX11-NEXT: s_waitcnt vmcnt(57)
+; GFX11-NEXT: v_lshlrev_b32_e32 v56, 8, v60
+; GFX11-NEXT: s_waitcnt vmcnt(56)
+; GFX11-NEXT: v_lshlrev_b32_e32 v60, 8, v61
+; GFX11-NEXT: s_waitcnt vmcnt(55)
+; GFX11-NEXT: v_lshlrev_b32_e32 v61, 8, v62
+; GFX11-NEXT: s_waitcnt vmcnt(54)
+; GFX11-NEXT: v_lshlrev_b32_e32 v63, 8, v63
+; GFX11-NEXT: s_waitcnt vmcnt(53)
+; GFX11-NEXT: v_lshlrev_b32_e32 v62, 8, v72
+; GFX11-NEXT: s_waitcnt vmcnt(52)
+; GFX11-NEXT: v_lshlrev_b32_e32 v73, 8, v73
+; GFX11-NEXT: s_waitcnt vmcnt(51)
+; GFX11-NEXT: v_lshlrev_b32_e32 v72, 8, v74
+; GFX11-NEXT: s_waitcnt vmcnt(50)
+; GFX11-NEXT: v_lshlrev_b32_e32 v75, 8, v75
+; GFX11-NEXT: s_waitcnt vmcnt(49)
+; GFX11-NEXT: v_lshlrev_b32_e32 v74, 8, v76
+; GFX11-NEXT: s_waitcnt vmcnt(48)
+; GFX11-NEXT: v_lshlrev_b32_e32 v77, 8, v77
+; GFX11-NEXT: s_waitcnt vmcnt(47)
+; GFX11-NEXT: v_lshlrev_b32_e32 v76, 8, v78
+; GFX11-NEXT: s_waitcnt vmcnt(46)
+; GFX11-NEXT: v_lshlrev_b32_e32 v78, 8, v79
+; GFX11-NEXT: s_waitcnt vmcnt(45)
+; GFX11-NEXT: v_lshlrev_b32_e32 v79, 8, v88
+; GFX11-NEXT: s_waitcnt vmcnt(44)
+; GFX11-NEXT: v_lshlrev_b32_e32 v89, 8, v89
+; GFX11-NEXT: s_waitcnt vmcnt(43)
+; GFX11-NEXT: v_lshlrev_b32_e32 v88, 8, v90
+; GFX11-NEXT: s_waitcnt vmcnt(42)
+; GFX11-NEXT: v_lshlrev_b32_e32 v91, 8, v91
+; GFX11-NEXT: s_waitcnt vmcnt(41)
+; GFX11-NEXT: v_lshlrev_b32_e32 v90, 8, v92
+; GFX11-NEXT: s_waitcnt vmcnt(40)
+; GFX11-NEXT: v_lshlrev_b32_e32 v92, 8, v93
+; GFX11-NEXT: s_waitcnt vmcnt(39)
+; GFX11-NEXT: v_lshlrev_b32_e32 v93, 8, v94
+; GFX11-NEXT: s_and_b32 s5, vcc_lo, exec_lo
+; GFX11-NEXT: s_cbranch_scc0 .LBB97_4
+; GFX11-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-NEXT: s_and_b32 s5, s28, 0xff
+; GFX11-NEXT: s_lshl_b32 s6, s29, 8
+; GFX11-NEXT: s_and_b32 s7, s2, 0xff
+; GFX11-NEXT: s_or_b32 s5, s5, s6
+; GFX11-NEXT: s_lshl_b32 s6, s1, 8
+; GFX11-NEXT: v_and_b32_e64 v5, 0xffff, s5
+; GFX11-NEXT: s_and_b32 s5, s0, 0xff
+; GFX11-NEXT: s_lshl_b32 s8, s3, 8
+; GFX11-NEXT: s_or_b32 s5, s5, s6
+; GFX11-NEXT: s_or_b32 s6, s7, s8
+; GFX11-NEXT: s_and_b32 s7, s16, 0xff
+; GFX11-NEXT: s_lshl_b32 s8, s17, 8
+; GFX11-NEXT: s_and_b32 s9, s18, 0xff
+; GFX11-NEXT: s_lshl_b32 s10, s19, 8
+; GFX11-NEXT: s_or_b32 s7, s7, s8
+; GFX11-NEXT: s_or_b32 s8, s9, s10
+; GFX11-NEXT: s_pack_ll_b32_b16 s5, s5, s6
+; GFX11-NEXT: s_pack_ll_b32_b16 s6, s7, s8
+; GFX11-NEXT: s_and_b32 s7, s20, 0xff
+; GFX11-NEXT: s_lshl_b32 s8, s21, 8
+; GFX11-NEXT: s_and_b32 s9, s22, 0xff
+; GFX11-NEXT: s_lshl_b32 s10, s23, 8
+; GFX11-NEXT: s_or_b32 s7, s7, s8
+; GFX11-NEXT: s_or_b32 s8, s9, s10
+; GFX11-NEXT: s_and_b32 s9, s24, 0xff
+; GFX11-NEXT: s_lshl_b32 s10, s25, 8
+; GFX11-NEXT: s_and_b32 s11, s26, 0xff
+; GFX11-NEXT: s_lshl_b32 s12, s27, 8
+; GFX11-NEXT: s_or_b32 s9, s9, s10
+; GFX11-NEXT: s_or_b32 s10, s11, s12
+; GFX11-NEXT: s_pack_ll_b32_b16 s7, s7, s8
+; GFX11-NEXT: s_pack_ll_b32_b16 s8, s9, s10
+; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v36
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v32
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v35
+; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v34
+; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v33
+; GFX11-NEXT: v_or_b32_e32 v2, v2, v68
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v64
+; GFX11-NEXT: v_or_b32_e32 v1, v1, v66
+; GFX11-NEXT: v_or_b32_e32 v6, v4, v67
+; GFX11-NEXT: v_or_b32_e32 v3, v3, v65
+; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX11-NEXT: v_lshl_or_b32 v4, v0, 16, v5
+; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v38
+; GFX11-NEXT: v_and_b32_e32 v7, 0xff, v39
+; GFX11-NEXT: v_lshl_or_b32 v6, v6, 16, v2
+; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v49
+; GFX11-NEXT: v_lshl_or_b32 v5, v3, 16, v1
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v37
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v70
+; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v50
+; GFX11-NEXT: v_or_b32_e32 v2, v2, v71
+; GFX11-NEXT: v_and_b32_e32 v8, 0xff, v48
+; GFX11-NEXT: v_or_b32_e32 v1, v1, v69
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT: v_or_b32_e32 v3, v3, v82
+; GFX11-NEXT: v_or_b32_e32 v9, v7, v80
+; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX11-NEXT: v_or_b32_e32 v10, v8, v81
+; GFX11-NEXT: v_lshl_or_b32 v7, v1, 16, v0
+; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v53
+; GFX11-NEXT: v_lshl_or_b32 v8, v9, 16, v2
+; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v55
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v51
+; GFX11-NEXT: v_lshl_or_b32 v9, v10, 16, v3
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v84
+; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v52
+; GFX11-NEXT: v_and_b32_e32 v10, 0xff, v54
+; GFX11-NEXT: v_or_b32_e32 v2, v2, v86
+; GFX11-NEXT: v_or_b32_e32 v1, v1, v83
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_and_b32_e32 v11, 0xff, v96
+; GFX11-NEXT: v_or_b32_e32 v3, v3, v85
+; GFX11-NEXT: v_or_b32_e32 v12, v10, v97
+; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX11-NEXT: v_lshl_or_b32 v10, v1, 16, v0
+; GFX11-NEXT: v_or_b32_e32 v0, v11, v87
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v99
+; GFX11-NEXT: v_and_b32_e32 v12, 0xffff, v12
+; GFX11-NEXT: v_lshl_or_b32 v11, v3, 16, v2
+; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v103
+; GFX11-NEXT: v_and_b32_e32 v14, 0xff, v114
+; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v98
+; GFX11-NEXT: v_lshl_or_b32 v12, v0, 16, v12
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v100
+; GFX11-NEXT: v_or_b32_e32 v3, v3, v113
+; GFX11-NEXT: v_or_b32_e32 v1, v1, v101
+; GFX11-NEXT: v_and_b32_e32 v13, 0xff, v116
+; GFX11-NEXT: v_or_b32_e32 v17, v14, v128
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v112
+; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX11-NEXT: v_and_b32_e32 v14, 0xff, v117
+; GFX11-NEXT: v_or_b32_e32 v2, v2, v102
+; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX11-NEXT: v_or_b32_e32 v13, v13, v130
+; GFX11-NEXT: v_and_b32_e32 v16, 0xff, v133
+; GFX11-NEXT: v_or_b32_e32 v20, v14, v132
+; GFX11-NEXT: v_lshl_or_b32 v14, v0, 16, v3
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v148
+; GFX11-NEXT: v_and_b32_e32 v15, 0xff, v119
+; GFX11-NEXT: v_and_b32_e32 v18, 0xff, v129
+; GFX11-NEXT: v_or_b32_e32 v16, v16, v161
+; GFX11-NEXT: v_and_b32_e32 v19, 0xffff, v13
+; GFX11-NEXT: v_lshl_or_b32 v13, v2, 16, v1
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v166
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v144
+; GFX11-NEXT: v_or_b32_e32 v15, v15, v134
+; GFX11-NEXT: v_or_b32_e32 v18, v18, v147
+; GFX11-NEXT: v_and_b32_e32 v22, 0xffff, v16
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT: v_or_b32_e32 v1, v1, v167
+; GFX11-NEXT: v_and_b32_e32 v21, 0xffff, v15
+; GFX11-NEXT: v_lshl_or_b32 v15, v17, 16, v19
+; GFX11-NEXT: v_lshl_or_b32 v17, v18, 16, v22
+; GFX11-NEXT: v_mov_b32_e32 v2, s7
+; GFX11-NEXT: v_lshl_or_b32 v18, v1, 16, v0
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v151
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v149
+; GFX11-NEXT: v_lshl_or_b32 v16, v20, 16, v21
+; GFX11-NEXT: v_mov_b32_e32 v3, s8
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v180
+; GFX11-NEXT: v_or_b32_e32 v1, v1, v177
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT: v_lshl_or_b32 v19, v1, 16, v0
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v165
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v162
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v42
+; GFX11-NEXT: v_or_b32_e32 v1, v1, v41
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT: v_lshl_or_b32 v20, v1, 16, v0
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v179
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v115
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v45
+; GFX11-NEXT: v_or_b32_e32 v1, v1, v44
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT: v_lshl_or_b32 v21, v1, 16, v0
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v131
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v118
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v59
+; GFX11-NEXT: v_or_b32_e32 v1, v1, v56
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT: v_lshl_or_b32 v22, v1, 16, v0
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v145
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v135
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v60
+; GFX11-NEXT: v_or_b32_e32 v1, v1, v61
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT: v_lshl_or_b32 v23, v1, 16, v0
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v150
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v146
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v63
+; GFX11-NEXT: v_or_b32_e32 v1, v1, v62
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT: v_lshl_or_b32 v24, v1, 16, v0
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v163
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v160
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v73
+; GFX11-NEXT: v_or_b32_e32 v1, v1, v72
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT: v_lshl_or_b32 v25, v1, 16, v0
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v176
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v164
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v75
+; GFX11-NEXT: v_or_b32_e32 v1, v1, v74
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT: v_lshl_or_b32 v26, v1, 16, v0
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v181
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v178
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v77
+; GFX11-NEXT: v_or_b32_e32 v1, v1, v76
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT: v_lshl_or_b32 v27, v1, 16, v0
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v183
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v182
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v78
+; GFX11-NEXT: v_or_b32_e32 v1, v1, v79
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT: v_lshl_or_b32 v28, v1, 16, v0
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v43
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v40
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v89
+; GFX11-NEXT: v_or_b32_e32 v1, v1, v88
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT: v_lshl_or_b32 v29, v1, 16, v0
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v47
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v46
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v91
+; GFX11-NEXT: v_or_b32_e32 v1, v1, v90
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT: v_lshl_or_b32 v30, v1, 16, v0
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v58
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v57
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v92
+; GFX11-NEXT: v_or_b32_e32 v1, v1, v93
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT: v_lshl_or_b32 v31, v1, 16, v0
+; GFX11-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, s6
+; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
+; GFX11-NEXT: s_cbranch_vccnz .LBB97_3
+; GFX11-NEXT: .LBB97_2: ; %cmp.true
+; GFX11-NEXT: s_add_i32 s28, s28, 3
+; GFX11-NEXT: s_lshl_b32 s5, s29, 8
+; GFX11-NEXT: s_and_b32 s4, s28, 0xff
+; GFX11-NEXT: s_add_i32 s24, s24, 3
+; GFX11-NEXT: s_or_b32 s4, s5, s4
+; GFX11-NEXT: s_and_b32 s5, s24, 0xff
+; GFX11-NEXT: s_lshl_b32 s6, s25, 8
+; GFX11-NEXT: s_add_i32 s26, s26, 3
+; GFX11-NEXT: s_or_b32 s5, s6, s5
+; GFX11-NEXT: s_and_b32 s6, s26, 0xff
+; GFX11-NEXT: s_lshl_b32 s7, s27, 8
+; GFX11-NEXT: s_add_i32 s20, s20, 3
+; GFX11-NEXT: s_or_b32 s6, s7, s6
+; GFX11-NEXT: s_and_b32 s7, s20, 0xff
+; GFX11-NEXT: s_lshl_b32 s8, s21, 8
+; GFX11-NEXT: s_add_i32 s22, s22, 3
+; GFX11-NEXT: s_or_b32 s7, s8, s7
+; GFX11-NEXT: s_and_b32 s8, s22, 0xff
+; GFX11-NEXT: s_lshl_b32 s9, s23, 8
+; GFX11-NEXT: s_add_i32 s16, s16, 3
+; GFX11-NEXT: s_or_b32 s8, s9, s8
+; GFX11-NEXT: s_and_b32 s9, s16, 0xff
+; GFX11-NEXT: s_lshl_b32 s10, s17, 8
+; GFX11-NEXT: s_add_i32 s18, s18, 3
+; GFX11-NEXT: s_add_i32 s0, s0, 3
+; GFX11-NEXT: s_add_i32 s2, s2, 3
+; GFX11-NEXT: s_or_b32 s9, s10, s9
+; GFX11-NEXT: s_and_b32 s10, s18, 0xff
+; GFX11-NEXT: s_lshl_b32 s11, s19, 8
+; GFX11-NEXT: s_and_b32 s0, s0, 0xff
+; GFX11-NEXT: s_lshl_b32 s1, s1, 8
+; GFX11-NEXT: s_and_b32 s2, s2, 0xff
+; GFX11-NEXT: s_lshl_b32 s3, s3, 8
+; GFX11-NEXT: s_or_b32 s10, s11, s10
+; GFX11-NEXT: s_or_b32 s0, s1, s0
+; GFX11-NEXT: s_or_b32 s1, s3, s2
+; GFX11-NEXT: s_addk_i32 s9, 0x300
+; GFX11-NEXT: s_addk_i32 s0, 0x300
+; GFX11-NEXT: s_addk_i32 s1, 0x300
+; GFX11-NEXT: s_addk_i32 s10, 0x300
+; GFX11-NEXT: s_pack_ll_b32_b16 s0, s0, s1
+; GFX11-NEXT: s_pack_ll_b32_b16 s1, s9, s10
+; GFX11-NEXT: s_waitcnt vmcnt(37)
+; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v58
+; GFX11-NEXT: s_addk_i32 s5, 0x300
+; GFX11-NEXT: s_addk_i32 s6, 0x300
+; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v57
+; GFX11-NEXT: s_pack_ll_b32_b16 s3, s5, s6
+; GFX11-NEXT: s_waitcnt vmcnt(35)
+; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v47
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0
+; GFX11-NEXT: s_addk_i32 s7, 0x300
+; GFX11-NEXT: s_addk_i32 s8, 0x300
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1
+; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v2
+; GFX11-NEXT: v_or_b32_e32 v0, v92, v0
+; GFX11-NEXT: s_pack_ll_b32_b16 s2, s7, s8
+; GFX11-NEXT: v_add_nc_u32_e32 v3, 3, v46
+; GFX11-NEXT: v_or_b32_e32 v1, v93, v1
+; GFX11-NEXT: v_or_b32_e32 v2, v91, v2
+; GFX11-NEXT: s_waitcnt vmcnt(33)
+; GFX11-NEXT: v_add_nc_u32_e32 v4, 3, v43
+; GFX11-NEXT: v_add_nc_u32_e32 v27, 0x300, v0
+; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v3
+; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v40
+; GFX11-NEXT: v_add_nc_u32_e32 v31, 0x300, v1
+; GFX11-NEXT: v_add_nc_u32_e32 v28, 0x300, v2
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v4
+; GFX11-NEXT: v_or_b32_e32 v3, v90, v3
+; GFX11-NEXT: s_waitcnt vmcnt(31)
+; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v183
+; GFX11-NEXT: v_add_nc_u32_e32 v4, 3, v182
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0
+; GFX11-NEXT: v_or_b32_e32 v1, v89, v1
+; GFX11-NEXT: v_add_nc_u32_e32 v30, 0x300, v3
+; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v2
+; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v4
+; GFX11-NEXT: s_waitcnt vmcnt(29)
+; GFX11-NEXT: v_add_nc_u32_e32 v4, 3, v181
+; GFX11-NEXT: v_or_b32_e32 v0, v88, v0
+; GFX11-NEXT: v_add_nc_u32_e32 v181, 0x300, v1
+; GFX11-NEXT: v_or_b32_e32 v1, v78, v2
+; GFX11-NEXT: v_or_b32_e32 v2, v79, v3
+; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v4
+; GFX11-NEXT: v_add_nc_u32_e32 v29, 0x300, v0
+; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v178
+; GFX11-NEXT: v_add_nc_u32_e32 v182, 0x300, v1
+; GFX11-NEXT: v_add_nc_u32_e32 v178, 0x300, v2
+; GFX11-NEXT: v_or_b32_e32 v1, v77, v3
+; GFX11-NEXT: s_waitcnt vmcnt(27)
+; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v176
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0
+; GFX11-NEXT: v_add_nc_u32_e32 v3, 3, v164
+; GFX11-NEXT: s_waitcnt vmcnt(25)
+; GFX11-NEXT: v_add_nc_u32_e32 v4, 3, v163
+; GFX11-NEXT: v_add_nc_u32_e32 v163, 0x300, v1
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v2
+; GFX11-NEXT: v_or_b32_e32 v0, v76, v0
+; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v3
+; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v4
+; GFX11-NEXT: v_add_nc_u32_e32 v4, 3, v160
+; GFX11-NEXT: v_or_b32_e32 v1, v75, v1
+; GFX11-NEXT: v_add_nc_u32_e32 v160, 0x300, v0
+; GFX11-NEXT: v_or_b32_e32 v0, v74, v2
+; GFX11-NEXT: v_or_b32_e32 v2, v73, v3
+; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v4
+; GFX11-NEXT: v_add_nc_u32_e32 v22, 0x300, v1
+; GFX11-NEXT: s_waitcnt vmcnt(23)
+; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v150
+; GFX11-NEXT: v_add_nc_u32_e32 v26, 0x300, v0
+; GFX11-NEXT: v_add_nc_u32_e32 v23, 0x300, v2
+; GFX11-NEXT: v_or_b32_e32 v0, v72, v3
+; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v146
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1
+; GFX11-NEXT: s_waitcnt vmcnt(21)
+; GFX11-NEXT: v_add_nc_u32_e32 v3, 3, v145
+; GFX11-NEXT: v_add_nc_u32_e32 v4, 3, v135
+; GFX11-NEXT: v_add_nc_u32_e32 v25, 0x300, v0
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v2
+; GFX11-NEXT: v_or_b32_e32 v1, v63, v1
+; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v3
+; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v4
+; GFX11-NEXT: s_waitcnt vmcnt(19)
+; GFX11-NEXT: v_add_nc_u32_e32 v4, 3, v131
+; GFX11-NEXT: v_or_b32_e32 v0, v62, v0
+; GFX11-NEXT: v_add_nc_u32_e32 v131, 0x300, v1
+; GFX11-NEXT: v_or_b32_e32 v1, v60, v2
+; GFX11-NEXT: v_or_b32_e32 v2, v61, v3
+; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v4
+; GFX11-NEXT: v_add_nc_u32_e32 v24, 0x300, v0
+; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v118
+; GFX11-NEXT: v_add_nc_u32_e32 v135, 0x300, v1
+; GFX11-NEXT: v_add_nc_u32_e32 v118, 0x300, v2
+; GFX11-NEXT: v_or_b32_e32 v1, v59, v3
+; GFX11-NEXT: s_waitcnt vmcnt(17)
+; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v179
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0
+; GFX11-NEXT: v_add_nc_u32_e32 v3, 3, v115
+; GFX11-NEXT: s_waitcnt vmcnt(15)
+; GFX11-NEXT: v_add_nc_u32_e32 v4, 3, v165
+; GFX11-NEXT: v_add_nc_u32_e32 v115, 0x300, v1
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v2
+; GFX11-NEXT: v_or_b32_e32 v0, v56, v0
+; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v3
+; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v4
+; GFX11-NEXT: v_add_nc_u32_e32 v4, 3, v162
+; GFX11-NEXT: v_or_b32_e32 v1, v45, v1
+; GFX11-NEXT: v_add_nc_u32_e32 v145, 0x300, v0
+; GFX11-NEXT: v_or_b32_e32 v0, v44, v2
+; GFX11-NEXT: v_or_b32_e32 v2, v42, v3
+; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v4
+; GFX11-NEXT: v_add_nc_u32_e32 v17, 0x300, v1
+; GFX11-NEXT: s_waitcnt vmcnt(13)
+; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v151
+; GFX11-NEXT: v_add_nc_u32_e32 v21, 0x300, v0
+; GFX11-NEXT: v_add_nc_u32_e32 v18, 0x300, v2
+; GFX11-NEXT: v_or_b32_e32 v0, v41, v3
+; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v149
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1
+; GFX11-NEXT: s_waitcnt vmcnt(11)
+; GFX11-NEXT: v_add_nc_u32_e32 v3, 3, v148
+; GFX11-NEXT: v_add_nc_u32_e32 v4, 3, v144
+; GFX11-NEXT: v_add_nc_u32_e32 v20, 0x300, v0
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v2
+; GFX11-NEXT: v_or_b32_e32 v1, v180, v1
+; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v3
+; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v4
+; GFX11-NEXT: s_waitcnt vmcnt(9)
+; GFX11-NEXT: v_add_nc_u32_e32 v4, 3, v133
+; GFX11-NEXT: v_or_b32_e32 v0, v177, v0
+; GFX11-NEXT: v_add_nc_u32_e32 v19, 0x300, v1
+; GFX11-NEXT: v_or_b32_e32 v1, v166, v2
+; GFX11-NEXT: v_or_b32_e32 v2, v167, v3
+; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v4
+; GFX11-NEXT: v_add_nc_u32_e32 v133, 0x300, v0
+; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v129
+; GFX11-NEXT: v_add_nc_u32_e32 v129, 0x300, v1
+; GFX11-NEXT: v_add_nc_u32_e32 v144, 0x300, v2
+; GFX11-NEXT: v_or_b32_e32 v1, v161, v3
+; GFX11-NEXT: s_waitcnt vmcnt(7)
+; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v119
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0
+; GFX11-NEXT: v_add_nc_u32_e32 v3, 3, v117
+; GFX11-NEXT: s_waitcnt vmcnt(5)
+; GFX11-NEXT: v_add_nc_u32_e32 v4, 3, v116
+; GFX11-NEXT: v_add_nc_u32_e32 v116, 0x300, v1
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v2
+; GFX11-NEXT: v_or_b32_e32 v0, v147, v0
+; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v3
+; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v4
+; GFX11-NEXT: v_add_nc_u32_e32 v4, 3, v114
+; GFX11-NEXT: s_waitcnt vmcnt(1)
+; GFX11-NEXT: v_add_nc_u32_e32 v5, 3, v99
+; GFX11-NEXT: v_add_nc_u32_e32 v114, 0x300, v0
+; GFX11-NEXT: v_or_b32_e32 v0, v132, v2
+; GFX11-NEXT: v_or_b32_e32 v2, v130, v3
+; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v4
+; GFX11-NEXT: v_add_nc_u32_e32 v4, 3, v103
+; GFX11-NEXT: v_add_nc_u32_e32 v6, 3, v98
+; GFX11-NEXT: v_and_b32_e32 v5, 0xff, v5
+; GFX11-NEXT: v_add_nc_u32_e32 v7, 3, v54
+; GFX11-NEXT: v_add_nc_u32_e32 v8, 3, v53
+; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v4
+; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v6
+; GFX11-NEXT: v_add_nc_u32_e32 v9, 3, v39
+; GFX11-NEXT: v_add_nc_u32_e32 v33, 3, v33
+; GFX11-NEXT: v_add_nc_u32_e32 v16, 0x300, v0
+; GFX11-NEXT: v_or_b32_e32 v4, v113, v4
+; GFX11-NEXT: v_add_nc_u32_e32 v0, 0x300, v2
+; GFX11-NEXT: v_or_b32_e32 v2, v128, v3
+; GFX11-NEXT: v_add_nc_u32_e32 v3, 3, v100
+; GFX11-NEXT: s_addk_i32 s4, 0x300
+; GFX11-NEXT: v_add_nc_u32_e32 v11, 0x300, v4
+; GFX11-NEXT: v_or_b32_e32 v4, v101, v5
+; GFX11-NEXT: v_or_b32_e32 v5, v102, v6
+; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v7
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_add_nc_u32_e32 v7, 3, v96
+; GFX11-NEXT: v_or_b32_e32 v1, v134, v1
+; GFX11-NEXT: v_add_nc_u32_e32 v12, 0x300, v4
+; GFX11-NEXT: v_add_nc_u32_e32 v13, 0x300, v5
+; GFX11-NEXT: v_or_b32_e32 v4, v97, v6
+; GFX11-NEXT: v_add_nc_u32_e32 v5, 3, v55
+; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v7
+; GFX11-NEXT: v_add_nc_u32_e32 v7, 3, v52
+; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v3
+; GFX11-NEXT: v_add_nc_u32_e32 v14, 0x300, v4
+; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v5
+; GFX11-NEXT: v_or_b32_e32 v5, v87, v6
+; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v7
+; GFX11-NEXT: v_and_b32_e32 v7, 0xff, v8
+; GFX11-NEXT: v_add_nc_u32_e32 v8, 3, v51
+; GFX11-NEXT: v_or_b32_e32 v4, v86, v4
+; GFX11-NEXT: v_add_nc_u32_e32 v15, 0x300, v5
+; GFX11-NEXT: v_or_b32_e32 v5, v85, v6
+; GFX11-NEXT: v_or_b32_e32 v6, v84, v7
+; GFX11-NEXT: v_and_b32_e32 v7, 0xff, v8
+; GFX11-NEXT: v_add_nc_u32_e32 v51, 0x300, v4
+; GFX11-NEXT: v_add_nc_u32_e32 v4, 3, v50
+; GFX11-NEXT: v_add_nc_u32_e32 v50, 0x300, v5
+; GFX11-NEXT: v_add_nc_u32_e32 v8, 3, v49
+; GFX11-NEXT: v_or_b32_e32 v5, v83, v7
+; GFX11-NEXT: v_add_nc_u32_e32 v7, 3, v48
+; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v4
+; GFX11-NEXT: v_add_nc_u32_e32 v6, 0x300, v6
+; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
+; GFX11-NEXT: v_add_nc_u32_e32 v10, 0x300, v5
+; GFX11-NEXT: v_and_b32_e32 v5, 0xff, v7
+; GFX11-NEXT: v_or_b32_e32 v4, v82, v4
+; GFX11-NEXT: v_and_b32_e32 v7, 0xff, v8
+; GFX11-NEXT: v_and_b32_e32 v8, 0xff, v9
+; GFX11-NEXT: v_add_nc_u32_e32 v9, 3, v38
+; GFX11-NEXT: v_or_b32_e32 v5, v81, v5
+; GFX11-NEXT: v_add_nc_u32_e32 v38, 0x300, v4
+; GFX11-NEXT: v_or_b32_e32 v4, v71, v7
+; GFX11-NEXT: v_or_b32_e32 v7, v80, v8
+; GFX11-NEXT: v_and_b32_e32 v8, 0xff, v9
+; GFX11-NEXT: v_add_nc_u32_e32 v9, 0x300, v5
+; GFX11-NEXT: v_add_nc_u32_e32 v5, 3, v37
+; GFX11-NEXT: v_add_nc_u32_e32 v37, 0x300, v4
+; GFX11-NEXT: v_add_nc_u32_e32 v39, 0x300, v7
+; GFX11-NEXT: v_or_b32_e32 v4, v70, v8
+; GFX11-NEXT: v_add_nc_u32_e32 v7, 3, v36
+; GFX11-NEXT: v_and_b32_e32 v5, 0xff, v5
+; GFX11-NEXT: v_add_nc_u32_e32 v8, 3, v34
+; GFX11-NEXT: v_add_nc_u32_e32 v34, 3, v35
+; GFX11-NEXT: v_add_nc_u32_e32 v35, 0x300, v4
+; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v7
+; GFX11-NEXT: v_or_b32_e32 v5, v69, v5
+; GFX11-NEXT: v_and_b32_e32 v7, 0xff, v8
+; GFX11-NEXT: v_and_b32_e32 v8, 0xff, v34
+; GFX11-NEXT: v_or_b32_e32 v3, v112, v3
+; GFX11-NEXT: v_or_b32_e32 v4, v68, v4
+; GFX11-NEXT: v_add_nc_u32_e32 v34, 0x300, v5
+; GFX11-NEXT: v_or_b32_e32 v5, v67, v7
+; GFX11-NEXT: v_or_b32_e32 v7, v66, v8
+; GFX11-NEXT: v_and_b32_e32 v8, 0xff, v33
+; GFX11-NEXT: v_add_nc_u32_e32 v33, 0x300, v4
+; GFX11-NEXT: v_add_nc_u32_e32 v4, 3, v32
+; GFX11-NEXT: v_add_nc_u32_e32 v32, 0x300, v5
+; GFX11-NEXT: v_add_nc_u32_e32 v5, 0x300, v7
+; GFX11-NEXT: v_or_b32_e32 v7, v65, v8
+; GFX11-NEXT: v_and_b32_e64 v8, 0xffff, s4
+; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v4
+; GFX11-NEXT: v_and_b32_e32 v36, 0xffff, v6
+; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x300, v7
+; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x300, v3
+; GFX11-NEXT: v_or_b32_e32 v4, v64, v4
+; GFX11-NEXT: v_and_b32_e32 v14, 0xffff, v14
+; GFX11-NEXT: v_lshl_or_b32 v10, v10, 16, v36
+; GFX11-NEXT: v_lshl_or_b32 v5, v7, 16, v5
+; GFX11-NEXT: v_and_b32_e32 v7, 0xffff, v33
+; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x300, v4
+; GFX11-NEXT: v_and_b32_e32 v33, 0xffff, v37
+; GFX11-NEXT: v_and_b32_e32 v36, 0xffff, v22
+; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
+; GFX11-NEXT: v_lshl_or_b32 v6, v32, 16, v7
+; GFX11-NEXT: v_lshl_or_b32 v4, v4, 16, v8
+; GFX11-NEXT: v_and_b32_e32 v8, 0xffff, v35
+; GFX11-NEXT: v_and_b32_e32 v32, 0xffff, v51
+; GFX11-NEXT: v_and_b32_e32 v35, 0xffff, v38
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT: v_and_b32_e32 v19, 0xffff, v19
+; GFX11-NEXT: v_lshl_or_b32 v7, v34, 16, v8
+; GFX11-NEXT: v_lshl_or_b32 v8, v39, 16, v33
+; GFX11-NEXT: v_and_b32_e32 v33, 0xffff, v12
+; GFX11-NEXT: v_and_b32_e32 v34, 0xffff, v11
+; GFX11-NEXT: v_lshl_or_b32 v11, v50, 16, v32
+; GFX11-NEXT: v_and_b32_e32 v32, 0xffff, v1
+; GFX11-NEXT: v_lshl_or_b32 v9, v9, 16, v35
+; GFX11-NEXT: v_lshl_or_b32 v12, v15, 16, v14
+; GFX11-NEXT: v_lshl_or_b32 v13, v13, 16, v33
+; GFX11-NEXT: v_lshl_or_b32 v14, v3, 16, v34
+; GFX11-NEXT: v_lshl_or_b32 v16, v16, 16, v32
+; GFX11-NEXT: v_and_b32_e32 v32, 0xffff, v116
+; GFX11-NEXT: v_and_b32_e32 v33, 0xffff, v129
+; GFX11-NEXT: v_and_b32_e32 v34, 0xffff, v18
+; GFX11-NEXT: v_and_b32_e32 v35, 0xffff, v17
+; GFX11-NEXT: v_lshl_or_b32 v26, v26, 16, v36
+; GFX11-NEXT: v_lshl_or_b32 v17, v114, 16, v32
+; GFX11-NEXT: v_lshl_or_b32 v18, v144, 16, v33
+; GFX11-NEXT: v_lshl_or_b32 v20, v20, 16, v34
+; GFX11-NEXT: v_lshl_or_b32 v21, v21, 16, v35
+; GFX11-NEXT: v_and_b32_e32 v32, 0xffff, v115
+; GFX11-NEXT: v_and_b32_e32 v33, 0xffff, v135
+; GFX11-NEXT: v_and_b32_e32 v34, 0xffff, v131
+; GFX11-NEXT: v_and_b32_e32 v35, 0xffff, v23
+; GFX11-NEXT: v_and_b32_e32 v36, 0xffff, v27
+; GFX11-NEXT: v_lshl_or_b32 v22, v145, 16, v32
+; GFX11-NEXT: v_lshl_or_b32 v23, v118, 16, v33
+; GFX11-NEXT: v_lshl_or_b32 v24, v24, 16, v34
+; GFX11-NEXT: v_lshl_or_b32 v25, v25, 16, v35
+; GFX11-NEXT: v_and_b32_e32 v32, 0xffff, v163
+; GFX11-NEXT: v_and_b32_e32 v33, 0xffff, v182
+; GFX11-NEXT: v_and_b32_e32 v34, 0xffff, v181
+; GFX11-NEXT: v_and_b32_e32 v35, 0xffff, v28
+; GFX11-NEXT: v_lshl_or_b32 v15, v2, 16, v0
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-NEXT: v_lshl_or_b32 v19, v133, 16, v19
+; GFX11-NEXT: v_lshl_or_b32 v27, v160, 16, v32
+; GFX11-NEXT: v_lshl_or_b32 v28, v178, 16, v33
+; GFX11-NEXT: v_lshl_or_b32 v29, v29, 16, v34
+; GFX11-NEXT: v_lshl_or_b32 v30, v30, 16, v35
+; GFX11-NEXT: v_lshl_or_b32 v31, v31, 16, v36
+; GFX11-NEXT: .LBB97_3: ; %end
+; GFX11-NEXT: s_clause 0x1e
+; GFX11-NEXT: scratch_load_b32 v94, off, s32 offset:320
+; GFX11-NEXT: scratch_load_b32 v93, off, s32 offset:324
+; GFX11-NEXT: scratch_load_b32 v92, off, s32 offset:328
+; GFX11-NEXT: scratch_load_b32 v91, off, s32 offset:332
+; GFX11-NEXT: scratch_load_b32 v90, off, s32 offset:336
+; GFX11-NEXT: scratch_load_b32 v89, off, s32 offset:340
+; GFX11-NEXT: scratch_load_b32 v88, off, s32 offset:344
+; GFX11-NEXT: scratch_load_b32 v79, off, s32 offset:348
+; GFX11-NEXT: scratch_load_b32 v78, off, s32 offset:352
+; GFX11-NEXT: scratch_load_b32 v77, off, s32 offset:356
+; GFX11-NEXT: scratch_load_b32 v76, off, s32 offset:360
+; GFX11-NEXT: scratch_load_b32 v75, off, s32 offset:364
+; GFX11-NEXT: scratch_load_b32 v74, off, s32 offset:368
+; GFX11-NEXT: scratch_load_b32 v73, off, s32 offset:372
+; GFX11-NEXT: scratch_load_b32 v72, off, s32 offset:376
+; GFX11-NEXT: scratch_load_b32 v63, off, s32 offset:380
+; GFX11-NEXT: scratch_load_b32 v62, off, s32 offset:384
+; GFX11-NEXT: scratch_load_b32 v61, off, s32 offset:388
+; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:392
+; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:396
+; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:400
+; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:404
+; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:408
+; GFX11-NEXT: scratch_load_b32 v47, off, s32 offset:412
+; GFX11-NEXT: scratch_load_b32 v46, off, s32 offset:416
+; GFX11-NEXT: scratch_load_b32 v45, off, s32 offset:420
+; GFX11-NEXT: scratch_load_b32 v44, off, s32 offset:424
+; GFX11-NEXT: scratch_load_b32 v43, off, s32 offset:428
+; GFX11-NEXT: scratch_load_b32 v42, off, s32 offset:432
+; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:436
+; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:440
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-NEXT: .LBB97_4:
+; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+; GFX11-NEXT: ; implicit-def: $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+; GFX11-NEXT: s_branch .LBB97_2
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.256bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.256bit.ll
index 38302a75fe26d..32ccaa73b3a8a 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.256bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.256bit.ll
@@ -6290,8 +6290,8 @@ define <8 x i32> @bitcast_v32i8_to_v8i32(<32 x i8> %a, i32 %b) {
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: s_clause 0x1
-; GFX11-TRUE16-NEXT: scratch_load_b32 v32, off, s32 offset:4
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32
+; GFX11-TRUE16-NEXT: scratch_load_b32 v31, off, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_load_u16 v32, off, s32
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.h, v21.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.h, v19.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.h, v17.l
@@ -6320,8 +6320,8 @@ define <8 x i32> @bitcast_v32i8_to_v8i32(<32 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v31.l
-; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v32
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v32.l
+; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v31
; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB26_3
; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow
@@ -13309,8 +13309,8 @@ define <8 x float> @bitcast_v32i8_to_v8f32(<32 x i8> %a, i32 %b) {
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: s_clause 0x1
-; GFX11-TRUE16-NEXT: scratch_load_b32 v32, off, s32 offset:4
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32
+; GFX11-TRUE16-NEXT: scratch_load_b32 v31, off, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_load_u16 v32, off, s32
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.h, v21.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.h, v19.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.h, v17.l
@@ -13339,8 +13339,8 @@ define <8 x float> @bitcast_v32i8_to_v8f32(<32 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v31.l
-; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v32
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v32.l
+; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v31
; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB50_3
; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow
@@ -19846,8 +19846,8 @@ define <4 x i64> @bitcast_v32i8_to_v4i64(<32 x i8> %a, i32 %b) {
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: s_clause 0x1
-; GFX11-TRUE16-NEXT: scratch_load_b32 v32, off, s32 offset:4
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32
+; GFX11-TRUE16-NEXT: scratch_load_b32 v31, off, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_load_u16 v32, off, s32
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.h, v21.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.h, v19.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.h, v17.l
@@ -19876,8 +19876,8 @@ define <4 x i64> @bitcast_v32i8_to_v4i64(<32 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v31.l
-; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v32
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v32.l
+; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v31
; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB70_3
; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow
@@ -25873,8 +25873,8 @@ define <4 x double> @bitcast_v32i8_to_v4f64(<32 x i8> %a, i32 %b) {
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: s_clause 0x1
-; GFX11-TRUE16-NEXT: scratch_load_b32 v32, off, s32 offset:4
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32
+; GFX11-TRUE16-NEXT: scratch_load_b32 v31, off, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_load_u16 v32, off, s32
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.h, v21.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.h, v19.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.h, v17.l
@@ -25903,8 +25903,8 @@ define <4 x double> @bitcast_v32i8_to_v4f64(<32 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v31.l
-; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v32
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v32.l
+; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v31
; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB86_3
; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow
@@ -31625,8 +31625,8 @@ define <16 x i16> @bitcast_v32i8_to_v16i16(<32 x i8> %a, i32 %b) {
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: s_clause 0x1
-; GFX11-TRUE16-NEXT: scratch_load_b32 v32, off, s32 offset:4
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32
+; GFX11-TRUE16-NEXT: scratch_load_b32 v31, off, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_load_u16 v32, off, s32
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.h, v17.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.h, v13.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, v11.l
@@ -31655,8 +31655,8 @@ define <16 x i16> @bitcast_v32i8_to_v16i16(<32 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v31.l
-; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v32
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v32.l
+; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v31
; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB98_3
; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow
@@ -36525,8 +36525,8 @@ define <16 x half> @bitcast_v32i8_to_v16f16(<32 x i8> %a, i32 %b) {
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: s_clause 0x1
-; GFX11-TRUE16-NEXT: scratch_load_b32 v32, off, s32 offset:4
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32
+; GFX11-TRUE16-NEXT: scratch_load_b32 v31, off, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_load_u16 v32, off, s32
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.h, v17.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.h, v13.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, v11.l
@@ -36555,8 +36555,8 @@ define <16 x half> @bitcast_v32i8_to_v16f16(<32 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v31.l
-; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v32
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v32.l
+; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v31
; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB106_3
; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow
@@ -40370,8 +40370,8 @@ define <16 x bfloat> @bitcast_v32i8_to_v16bf16(<32 x i8> %a, i32 %b) {
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: s_clause 0x1
-; GFX11-TRUE16-NEXT: scratch_load_b32 v32, off, s32 offset:4
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32
+; GFX11-TRUE16-NEXT: scratch_load_b32 v31, off, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_load_u16 v32, off, s32
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.h, v17.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.h, v13.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, v11.l
@@ -40400,8 +40400,8 @@ define <16 x bfloat> @bitcast_v32i8_to_v16bf16(<32 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v31.l
-; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v32
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v32.l
+; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v31
; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB110_3
; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll
index 436b1a038b274..bb4fd7b6f1e88 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll
@@ -5019,21 +5019,22 @@ define <10 x i32> @bitcast_v40i8_to_v10i32(<40 x i8> %a, i32 %b) {
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: s_clause 0x9
-; GFX11-TRUE16-NEXT: scratch_load_b32 v36, off, s32 offset:36
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v33, off, s32 offset:8
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:16
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v34, off, s32 offset:24
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:32
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:28
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:20
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:12
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_load_b32 v35, off, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_load_u16 v36, off, s32
+; GFX11-TRUE16-NEXT: scratch_load_u16 v37, off, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_load_u16 v38, off, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_load_u16 v39, off, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_load_u16 v48, off, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_load_u16 v31, off, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_load_u16 v32, off, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_load_u16 v33, off, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_load_u16 v34, off, s32 offset:4
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, v25.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.h, v23.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.h, v21.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.h, v19.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.h, v17.l
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(3)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.h, v17.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.h, v14.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v12.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v10.l
@@ -5050,7 +5051,7 @@ define <10 x i32> @bitcast_v40i8_to_v10i32(<40 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v11.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v13.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v15.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.l, 8, v35.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.l, 8, v31.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v30.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.l, 8, v29.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v28.h
@@ -5059,17 +5060,12 @@ define <10 x i32> @bitcast_v40i8_to_v10i32(<40 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v29.l
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(8)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v33.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v33.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(6)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v34.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(5)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.l, 8, v34.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(4)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v35.l
-; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v36
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v36.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v37.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v38.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.l, 8, v39.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v48.l
+; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v35
; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB14_3
; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow
@@ -5119,14 +5115,14 @@ define <10 x i32> @bitcast_v40i8_to_v10i32(<40 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v13.h
; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v30.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v32.h
-; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v32.l
+; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v34.l
+; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v33.l
; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v27, v4
; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v5.l, v14.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v27.h
; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v12.h
; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v11.h
-; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v31.h
+; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v32.l
; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v31.l
; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v27, v5
; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v6.l, v13.l
@@ -5147,10 +5143,10 @@ define <10 x i32> @bitcast_v40i8_to_v10i32(<40 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_lo16
@@ -5265,14 +5261,14 @@ define <10 x i32> @bitcast_v40i8_to_v10i32(<40 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v12.l, v6.l
; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v12.h, v6.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v32.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v32.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v34.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v33.l, 3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v25, v9
; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v7.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, 0x300, v7.h
; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v8.l
; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v8.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v31.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v32.l, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v31.l, 3
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v25.h
; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v11.l, v7.l
@@ -11979,21 +11975,22 @@ define <10 x float> @bitcast_v40i8_to_v10f32(<40 x i8> %a, i32 %b) {
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: s_clause 0x9
-; GFX11-TRUE16-NEXT: scratch_load_b32 v36, off, s32 offset:36
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v33, off, s32 offset:8
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:16
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v34, off, s32 offset:24
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:32
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:28
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:20
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:12
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_load_b32 v35, off, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_load_u16 v36, off, s32
+; GFX11-TRUE16-NEXT: scratch_load_u16 v37, off, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_load_u16 v38, off, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_load_u16 v39, off, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_load_u16 v48, off, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_load_u16 v31, off, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_load_u16 v32, off, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_load_u16 v33, off, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_load_u16 v34, off, s32 offset:4
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, v25.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.h, v23.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.h, v21.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.h, v19.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.h, v17.l
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(3)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.h, v17.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.h, v14.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v12.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v10.l
@@ -12010,7 +12007,7 @@ define <10 x float> @bitcast_v40i8_to_v10f32(<40 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v11.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v13.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v15.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.l, 8, v35.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.l, 8, v31.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v30.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.l, 8, v29.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v28.h
@@ -12019,17 +12016,12 @@ define <10 x float> @bitcast_v40i8_to_v10f32(<40 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v29.l
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(8)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v33.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v33.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(6)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v34.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(5)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.l, 8, v34.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(4)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v35.l
-; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v36
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v36.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v37.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v38.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.l, 8, v39.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v48.l
+; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v35
; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB34_3
; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow
@@ -12079,14 +12071,14 @@ define <10 x float> @bitcast_v40i8_to_v10f32(<40 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v13.h
; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v30.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v32.h
-; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v32.l
+; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v34.l
+; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v33.l
; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v27, v4
; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v5.l, v14.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v27.h
; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v12.h
; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v11.h
-; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v31.h
+; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v32.l
; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v31.l
; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v27, v5
; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v6.l, v13.l
@@ -12107,10 +12099,10 @@ define <10 x float> @bitcast_v40i8_to_v10f32(<40 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_lo16
@@ -12225,14 +12217,14 @@ define <10 x float> @bitcast_v40i8_to_v10f32(<40 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v12.l, v6.l
; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v12.h, v6.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v32.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v32.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v34.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v33.l, 3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v25, v9
; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v7.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, 0x300, v7.h
; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v8.l
; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v8.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v31.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v32.l, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v31.l, 3
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v25.h
; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v11.l, v7.l
@@ -18549,17 +18541,17 @@ define <20 x i16> @bitcast_v40i8_to_v20i16(<40 x i8> %a, i32 %b) {
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: s_clause 0x9
-; GFX11-TRUE16-NEXT: scratch_load_b32 v37, off, s32 offset:36
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v33, off, s32
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:8
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v34, off, s32 offset:16
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:24
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v36, off, s32 offset:32
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:28
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:20
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:12
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:4
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.h, v29.l
+; GFX11-TRUE16-NEXT: scratch_load_b32 v32, off, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_load_u16 v33, off, s32
+; GFX11-TRUE16-NEXT: scratch_load_u16 v38, off, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_load_u16 v39, off, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_load_u16 v48, off, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_load_u16 v49, off, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_load_u16 v34, off, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_load_u16 v36, off, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_load_u16 v35, off, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_load_u16 v37, off, s32 offset:4
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.h, v29.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.h, v27.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.h, v25.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.h, v23.l
@@ -18592,21 +18584,22 @@ define <20 x i16> @bitcast_v40i8_to_v20i16(<40 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v29.l, 8, v29.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v29.h, 8, v29.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v30.h, 8, v30.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.l, 8, v28.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v35.h, 8, v35.h
-; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v31.l, 8, v28.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(8)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v28.h, 8, v33.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v31.h
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v32
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v28.h, 8, v33.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v34.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v31.h, 8, v38.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(6)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.l, 8, v34.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v32.l, 8, v39.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(5)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v35.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v32.h, 8, v48.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(4)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v35.l, 8, v36.l
-; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v37
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.l, 8, v49.l
+; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB50_3
; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow
@@ -18634,10 +18627,10 @@ define <20 x i16> @bitcast_v40i8_to_v20i16(<40 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v28.l
; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v30.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v32.l
-; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v31.l
-; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v32.h
-; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v31.h
+; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v37.l
+; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v35.l
+; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v36.l
+; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v34.l
; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v18.l
; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v18.h
; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v19.h
@@ -18651,13 +18644,13 @@ define <20 x i16> @bitcast_v40i8_to_v20i16(<40 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v29.l
; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v29.h
; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v30.h
-; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v33.l
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v35.h
+; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v31.l
+; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v33.h
; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v28.h
-; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v34.l
-; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v34.h
-; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v35.l
+; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v31.h
+; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v32.l
+; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v32.h
+; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v33.l
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_hi16
@@ -18674,10 +18667,10 @@ define <20 x i16> @bitcast_v40i8_to_v20i16(<40 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_hi16
@@ -18691,33 +18684,33 @@ define <20 x i16> @bitcast_v40i8_to_v20i16(<40 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB50_2
; GFX11-TRUE16-NEXT: .LBB50_4: ; %cmp.true
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(2)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v32.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v31.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v36.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v34.l, 3
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v32.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v31.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v37.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v35.l, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v28.l, 3
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v34.h, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v35.l, v0.h
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v33.h, v1.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v34.l, v1.h
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v35.h, v2.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v32.h, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v33.l, v0.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v31.h, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v32.l, v1.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v33.h, v2.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v0.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, 0x300, v0.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, 0x300, v1.l
@@ -18734,7 +18727,7 @@ define <20 x i16> @bitcast_v40i8_to_v20i16(<40 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v23.l, 3
; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v28.h, v0.l
; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v30.h, v0.h
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v33.l, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v31.l, v1.l
; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v29.l, v1.h
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x300, v0.l
@@ -24630,17 +24623,17 @@ define <20 x half> @bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) {
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: s_clause 0x9
-; GFX11-TRUE16-NEXT: scratch_load_b32 v37, off, s32 offset:36
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v33, off, s32
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:8
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v34, off, s32 offset:16
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:24
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v36, off, s32 offset:32
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:28
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:20
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:12
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:4
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.h, v29.l
+; GFX11-TRUE16-NEXT: scratch_load_b32 v32, off, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_load_u16 v33, off, s32
+; GFX11-TRUE16-NEXT: scratch_load_u16 v38, off, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_load_u16 v39, off, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_load_u16 v48, off, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_load_u16 v49, off, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_load_u16 v34, off, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_load_u16 v36, off, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_load_u16 v35, off, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_load_u16 v37, off, s32 offset:4
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.h, v29.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.h, v27.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.h, v25.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.h, v23.l
@@ -24673,21 +24666,22 @@ define <20 x half> @bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v29.l, 8, v29.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v29.h, 8, v29.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v30.h, 8, v30.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.l, 8, v28.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v35.h, 8, v35.h
-; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v31.l, 8, v28.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(8)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v28.h, 8, v33.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v31.h
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v32
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v28.h, 8, v33.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v34.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v31.h, 8, v38.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(6)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.l, 8, v34.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v32.l, 8, v39.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(5)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v35.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v32.h, 8, v48.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(4)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v35.l, 8, v36.l
-; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v37
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.l, 8, v49.l
+; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB62_3
; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow
@@ -24715,10 +24709,10 @@ define <20 x half> @bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v28.l
; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v30.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v32.l
-; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v31.l
-; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v32.h
-; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v31.h
+; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v37.l
+; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v35.l
+; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v36.l
+; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v34.l
; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v18.l
; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v18.h
; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v19.h
@@ -24732,13 +24726,13 @@ define <20 x half> @bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v29.l
; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v29.h
; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v30.h
-; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v33.l
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v35.h
+; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v31.l
+; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v33.h
; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v28.h
-; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v34.l
-; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v34.h
-; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v35.l
+; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v31.h
+; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v32.l
+; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v32.h
+; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v33.l
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_hi16
@@ -24755,10 +24749,10 @@ define <20 x half> @bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_hi16
@@ -24772,33 +24766,33 @@ define <20 x half> @bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB62_2
; GFX11-TRUE16-NEXT: .LBB62_4: ; %cmp.true
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(2)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v32.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v31.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v36.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v34.l, 3
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v32.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v31.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v37.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v35.l, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v28.l, 3
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v34.h, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v35.l, v0.h
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v33.h, v1.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v34.l, v1.h
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v35.h, v2.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v32.h, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v33.l, v0.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v31.h, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v32.l, v1.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v33.h, v2.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v0.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, 0x300, v0.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, 0x300, v1.l
@@ -24815,7 +24809,7 @@ define <20 x half> @bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v23.l, 3
; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v28.h, v0.l
; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v30.h, v0.h
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v33.l, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v31.l, v1.l
; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v29.l, v1.h
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x300, v0.l
@@ -28750,20 +28744,24 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) {
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: s_clause 0x9
-; GFX11-TRUE16-NEXT: scratch_load_b32 v49, off, s32 offset:36
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v36, off, s32
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v36, off, s32 offset:8
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v37, off, s32 offset:16
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v37, off, s32 offset:24
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:32
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:28
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:20
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:12
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:4
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.h, v29.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v27.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.h, v25.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, v23.l
+; GFX11-TRUE16-NEXT: scratch_load_b32 v38, off, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_load_u16 v39, off, s32
+; GFX11-TRUE16-NEXT: scratch_load_u16 v48, off, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_load_u16 v49, off, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_load_u16 v50, off, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_load_u16 v51, off, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_load_u16 v34, off, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_load_u16 v35, off, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_load_u16 v36, off, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_load_u16 v37, off, s32 offset:4
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(3)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.h, v29.l
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(2)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.h, v27.l
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v25.l
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.h, v23.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.h, v21.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.h, v18.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v16.l
@@ -28773,12 +28771,12 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, v8.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.h, v6.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.h, v4.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v2.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.h, v0.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.l, 8, v1.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v3.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.l, 8, v5.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v7.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v2.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.h, v0.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v32.l, 8, v1.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v32.h, 8, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v31.l, 8, v5.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v31.h, 8, v7.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v28.h, 8, v9.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v29.l, 8, v11.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.h, 8, v13.l
@@ -28786,23 +28784,18 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.l, 8, v17.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.h, 8, v19.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v21.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v48.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v39.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v38.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v37.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v36.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v35.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v34.h
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(8)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v36.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v36.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(6)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v37.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(5)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v37.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(4)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v38.l
-; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v49
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v39.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v48.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v49.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v50.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v51.l
+; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v38
; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB72_3
; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow
@@ -28810,23 +28803,22 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB72_4
; GFX11-TRUE16-NEXT: .LBB72_2: ; %end
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-TRUE16-NEXT: .LBB72_3: ; %cmp.false
-; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v35.h
-; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v35.l
+; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v33.h
+; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v33.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, 0
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v30.h
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v29.h
-; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v0.l, v34.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v34.h
+; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v0.l, v32.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v32.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v10.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v1.l, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v1.l, v31.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v10.h
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v27.h
; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v27.l
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v10, v0
-; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v1.h, v33.l
+; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v1.h, v31.l
; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v21.l
; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v20.h
; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v29.l
@@ -28851,21 +28843,20 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v10.h
; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v19.h
; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v30.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v32.h
-; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v32.l
+; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v37.l
+; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v36.l
; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v10, v4
; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v5.l, v21.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v10.h
; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v18.h
; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v17.h
-; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v31.h
-; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v31.l
+; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v35.l
+; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v34.l
; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v10, v5
; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v6.l, v19.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v10.h
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_hi16
@@ -28880,14 +28871,14 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_hi16
@@ -28921,8 +28912,8 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB72_2
; GFX11-TRUE16-NEXT: .LBB72_4: ; %cmp.true
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v35.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v35.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v33.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v33.l, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v30.h, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v29.h, 3
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, 0
@@ -28931,10 +28922,10 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v10.h
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v34.l, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v34.h, v0.h
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v33.h, v1.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v33.l, v1.h
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v32.l, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v32.h, v0.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v31.h, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v31.l, v1.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v27.h, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v0.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v0.h
@@ -28997,16 +28988,15 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v10.h
; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v18.l, v6.l
; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v18.h, v6.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v32.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v32.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v37.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v36.l, 3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v10, v9
; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v7.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v7.h
; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v8.l
; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v8.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v31.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v31.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v35.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v34.l, 3
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v10.h
; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v17.l, v7.l
; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v17.h, v7.h
@@ -32861,20 +32851,24 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) {
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: s_clause 0x9
-; GFX11-TRUE16-NEXT: scratch_load_b32 v49, off, s32 offset:36
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v36, off, s32
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v36, off, s32 offset:8
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v37, off, s32 offset:16
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v37, off, s32 offset:24
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:32
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:28
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:20
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:12
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:4
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.h, v29.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v27.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.h, v25.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, v23.l
+; GFX11-TRUE16-NEXT: scratch_load_b32 v38, off, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_load_u16 v39, off, s32
+; GFX11-TRUE16-NEXT: scratch_load_u16 v48, off, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_load_u16 v49, off, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_load_u16 v50, off, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_load_u16 v51, off, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_load_u16 v34, off, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_load_u16 v35, off, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_load_u16 v36, off, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_load_u16 v37, off, s32 offset:4
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(3)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.h, v29.l
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(2)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.h, v27.l
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v25.l
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.h, v23.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.h, v21.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.h, v18.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v16.l
@@ -32884,12 +32878,12 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, v8.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.h, v6.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.h, v4.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v2.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.h, v0.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.l, 8, v1.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v3.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.l, 8, v5.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v7.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v2.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.h, v0.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v32.l, 8, v1.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v32.h, 8, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v31.l, 8, v5.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v31.h, 8, v7.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v28.h, 8, v9.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v29.l, 8, v11.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.h, 8, v13.l
@@ -32897,23 +32891,18 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.l, 8, v17.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.h, 8, v19.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v21.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v48.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v39.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v38.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v37.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v36.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v35.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v34.h
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(8)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v36.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v36.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(6)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v37.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(5)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v37.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(4)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v38.l
-; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v49
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v39.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v48.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v49.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v50.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v51.l
+; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v38
; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB76_3
; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow
@@ -32921,23 +32910,22 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB76_4
; GFX11-TRUE16-NEXT: .LBB76_2: ; %end
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-TRUE16-NEXT: .LBB76_3: ; %cmp.false
-; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v35.h
-; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v35.l
+; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v33.h
+; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v33.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, 0
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v30.h
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v29.h
-; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v0.l, v34.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v34.h
+; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v0.l, v32.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v32.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v10.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v1.l, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v1.l, v31.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v10.h
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v27.h
; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v27.l
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v10, v0
-; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v1.h, v33.l
+; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v1.h, v31.l
; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v21.l
; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v20.h
; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v29.l
@@ -32962,21 +32950,20 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v10.h
; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v19.h
; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v30.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v32.h
-; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v32.l
+; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v37.l
+; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v36.l
; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v10, v4
; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v5.l, v21.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v10.h
; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v18.h
; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v17.h
-; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v31.h
-; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v31.l
+; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v35.l
+; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v34.l
; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v10, v5
; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v6.l, v19.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v10.h
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_hi16
@@ -32991,14 +32978,14 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_hi16
@@ -33032,8 +33019,8 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB76_2
; GFX11-TRUE16-NEXT: .LBB76_4: ; %cmp.true
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v35.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v35.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v33.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v33.l, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v30.h, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v29.h, 3
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, 0
@@ -33042,10 +33029,10 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v10.h
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v34.l, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v34.h, v0.h
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v33.h, v1.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v33.l, v1.h
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v32.l, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v32.h, v0.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v31.h, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v31.l, v1.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v27.h, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v0.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v0.h
@@ -33108,16 +33095,15 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v10.h
; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v18.l, v6.l
; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v18.h, v6.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v32.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v32.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v37.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v36.l, 3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v10, v9
; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v7.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v7.h
; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v8.l
; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v8.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v31.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v31.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v35.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v34.l, 3
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v10.h
; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v17.l, v7.l
; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v17.h, v7.h
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll
index 35d135b123969..fd62d4087c460 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll
@@ -12446,107 +12446,107 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) {
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: s_clause 0x1f
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v64, off, s32 offset:128
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:124
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v64, off, s32 offset:120
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:116
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v65, off, s32 offset:112
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:108
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v65, off, s32 offset:104
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:100
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v66, off, s32 offset:96
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:92
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v66, off, s32 offset:88
-; GFX11-TRUE16-NEXT: scratch_load_b32 v81, off, s32 offset:132
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v67, off, s32
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v67, off, s32 offset:8
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v68, off, s32 offset:16
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v68, off, s32 offset:24
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v69, off, s32 offset:32
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v69, off, s32 offset:40
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v70, off, s32 offset:48
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v70, off, s32 offset:56
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v71, off, s32 offset:64
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v71, off, s32 offset:72
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v80, off, s32 offset:80
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v33, off, s32 offset:84
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:76
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v34, off, s32 offset:68
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:60
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v35, off, s32 offset:52
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v36, off, s32 offset:44
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v36, off, s32 offset:36
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v37, off, s32 offset:28
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v37, off, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_load_u16 v80, off, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_load_u16 v48, off, s32 offset:124
+; GFX11-TRUE16-NEXT: scratch_load_u16 v81, off, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_load_u16 v49, off, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_load_u16 v82, off, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_load_u16 v50, off, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_load_u16 v83, off, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_load_u16 v51, off, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_load_u16 v84, off, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_load_u16 v52, off, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_load_u16 v85, off, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_load_b32 v86, off, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_load_u16 v87, off, s32
+; GFX11-TRUE16-NEXT: scratch_load_u16 v96, off, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_load_u16 v97, off, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_load_u16 v98, off, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_load_u16 v99, off, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_load_u16 v100, off, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_load_u16 v101, off, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_load_u16 v102, off, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_load_u16 v103, off, s32 offset:64
+; GFX11-TRUE16-NEXT: scratch_load_u16 v112, off, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_load_u16 v113, off, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_load_u16 v53, off, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_load_u16 v54, off, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_load_u16 v55, off, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_load_u16 v64, off, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_load_u16 v65, off, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_load_u16 v66, off, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_load_u16 v67, off, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_load_u16 v68, off, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_load_u16 v69, off, s32 offset:20
; GFX11-TRUE16-NEXT: s_clause 0x1
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:12
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v38, off, s32 offset:4
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v80.h, v29.l
+; GFX11-TRUE16-NEXT: scratch_load_u16 v70, off, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_load_u16 v71, off, s32 offset:4
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(32)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.h, v29.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, v27.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, v22.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.h, v20.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.h, v18.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v16.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v14.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.h, v12.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.l, v10.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.h, v8.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.h, v6.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.l, v4.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.h, v2.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.h, v0.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.h, 8, v1.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.l, 8, v3.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.l, 8, v5.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.l, 8, v7.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.l, 8, v9.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.h, 8, v11.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.l, 8, v13.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.h, 8, v15.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v39.h, 8, v17.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v48.l, 8, v19.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, v14.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.h, v12.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v10.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.h, v8.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v6.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v4.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.h, v2.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.h, v0.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v38.h, 8, v1.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v39.l, 8, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v37.l, 8, v5.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v38.l, 8, v7.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v35.l, 8, v9.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v35.h, 8, v11.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.l, 8, v13.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v15.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v31.h, 8, v17.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v32.l, 8, v19.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v29.h, 8, v21.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v30.h, 8, v23.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v27.l, 8, v25.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v27.h, 8, v27.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.l, 8, v80.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.l, 8, v48.h
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(33)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v64.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v80.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(31)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v64.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v81.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(29)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v65.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v82.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(27)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v65.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v83.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(25)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v66.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v84.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(23)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v66.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v85.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(21)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.h, 8, v67.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.h, 8, v87.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(20)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.l, 8, v67.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.l, 8, v96.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(19)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.h, 8, v68.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.h, 8, v97.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(18)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.l, 8, v68.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.l, 8, v98.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(17)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v69.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v99.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(16)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v69.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v100.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(15)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v70.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v101.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(14)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.l, 8, v70.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.l, 8, v102.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(13)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v71.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v103.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(12)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v71.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v112.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(11)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v80.l
-; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v81
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v113.l
+; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v86
; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB26_3
; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow
@@ -12557,95 +12557,95 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-TRUE16-NEXT: .LBB26_3: ; %cmp.false
-; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v55.h
-; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v53.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.h, 0
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v52.h
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v52.l
-; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v0.l, v54.h
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v55.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v64.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v1.l, v54.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v64.h
-; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v49.h
-; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v49.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v64, v0
-; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v1.h, v53.l
+; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v39.h
+; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v37.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v80.h, 0
+; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v36.h
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v36.l
+; GFX11-TRUE16-NEXT: v_or_b16 v80.l, v0.l, v38.h
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v39.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v80.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v1.l, v38.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v80.h
+; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v33.h
+; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v33.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v80, v0
+; GFX11-TRUE16-NEXT: v_or_b16 v80.l, v1.h, v37.l
; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v29.l
; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v28.h
-; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v51.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v35.h
; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v26.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v64, v3
-; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v2.l, v51.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v64.h
-; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v48.h
-; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v39.l
-; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v48.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v80, v3
+; GFX11-TRUE16-NEXT: v_or_b16 v80.l, v2.l, v35.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v80.h
+; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v32.h
+; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v31.l
+; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v32.l
; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v24.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v64, v2
-; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v3.l, v50.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v50.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v64.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v80, v2
+; GFX11-TRUE16-NEXT: v_or_b16 v80.l, v3.l, v34.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v34.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v80.h
; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v30.h
; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v24.l
; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v26.l
; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v28.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v64, v3
-; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v4.l, v39.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v64.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v80, v3
+; GFX11-TRUE16-NEXT: v_or_b16 v80.l, v4.l, v31.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v80.h
; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v27.h
; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v30.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v38.h
-; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v38.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v64, v4
-; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v5.l, v29.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v64.h
+; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v71.l
+; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v70.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v80, v4
+; GFX11-TRUE16-NEXT: v_or_b16 v80.l, v5.l, v29.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v80.h
; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v25.h
; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v23.h
-; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v37.h
-; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v37.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v64, v5
-; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v6.l, v27.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v64.h
-; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v36.h
+; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v69.l
+; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v68.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v80, v5
+; GFX11-TRUE16-NEXT: v_or_b16 v80.l, v6.l, v27.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v80.h
+; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v67.l
; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v22.h
-; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v36.l
-; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v35.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v64, v6
-; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v7.l, v25.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v64.h
+; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v66.l
+; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v65.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v80, v6
+; GFX11-TRUE16-NEXT: v_or_b16 v80.l, v7.l, v25.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v80.h
; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v21.h
-; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v35.l
-; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v34.h
-; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v34.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v64, v7
-; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v8.l, v23.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v64.h
+; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v64.l
+; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v55.l
+; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v54.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v80, v7
+; GFX11-TRUE16-NEXT: v_or_b16 v80.l, v8.l, v23.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v80.h
; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v11.h, v20.h
; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v19.h
-; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v33.h
-; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v33.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v64, v8
-; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v9.l, v22.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v64.h
-; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v32.h
+; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v53.l
+; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v52.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v80, v8
+; GFX11-TRUE16-NEXT: v_or_b16 v80.l, v9.l, v22.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v80.h
+; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v51.l
; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v13.h, v18.h
-; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v32.l
-; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v31.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v64, v9
-; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v10.l, v21.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v64.h
+; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v50.l
+; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v49.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v80, v9
+; GFX11-TRUE16-NEXT: v_or_b16 v80.l, v10.l, v21.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v80.h
; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v17.h
-; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v31.l
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16
+; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v48.l
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_hi16
@@ -12654,32 +12654,32 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_lo16
@@ -12697,199 +12697,199 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_hi16
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v64, v10
-; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v11.l, v20.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v64.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v80, v10
+; GFX11-TRUE16-NEXT: v_or_b16 v80.l, v11.l, v20.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v80.h
; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v15.h, v16.h
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_hi16
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v64, v11
-; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v12.l, v19.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v64.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v80, v11
+; GFX11-TRUE16-NEXT: v_or_b16 v80.l, v12.l, v19.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v80.h
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v64, v12
-; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v13.l, v18.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v64.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v80, v12
+; GFX11-TRUE16-NEXT: v_or_b16 v80.l, v13.l, v18.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v80.h
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v64, v13
-; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v14.l, v17.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v64.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v80, v13
+; GFX11-TRUE16-NEXT: v_or_b16 v80.l, v14.l, v17.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v80.h
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v64, v14
-; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v15.l, v16.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v64.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v80, v14
+; GFX11-TRUE16-NEXT: v_or_b16 v80.l, v15.l, v16.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v80.h
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_lo16
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v64, v15
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v80, v15
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB26_2
; GFX11-TRUE16-NEXT: .LBB26_4: ; %cmp.true
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v55.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v53.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v52.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v52.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.h, 0
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v39.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v37.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v36.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v36.l, 3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, 0
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v52.h
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v54.h, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v55.l, v0.h
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v54.l, v1.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v53.l, v1.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v49.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v0.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v36.h
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v38.h, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v39.l, v0.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v38.l, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v37.l, v1.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v33.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v36.l, 0x300, v0.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v0.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v49.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v33.l, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, 0x300, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v48.h, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v52, v3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v1.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v32.h, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v36, v3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v36.l, 0x300, v1.h
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v39.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v52.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v51.l, v1.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v31.l, 3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v36.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v35.l, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v51.h, v1.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v35.h, v1.h
; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v52, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v3.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v52.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v36, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v36.l, 0x300, v3.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v36.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v3.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v50.l, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v50.h, v2.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v34.l, v2.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v34.h, v2.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v29.l, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v28.h, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v52, v5
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v3.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v36, v5
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v36.l, 0x300, v3.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v3.h
; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v4.l
; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v4.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v26.h, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v24.h, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v52.h
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v39.h, v3.l
-; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v48.l, v3.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v36.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v31.h, v3.l
+; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v32.l, v3.h
; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l
; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v52, v6
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v5.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v36, v6
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v36.l, 0x300, v5.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x300, v5.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v52.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v36.h
; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v29.h, v4.l
; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v30.h, v4.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v24.l, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v26.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v52, v7
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v5.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v36, v7
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v36.l, 0x300, v5.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, 0x300, v5.h
; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v6.l
; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v6.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v28.l, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v30.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v52.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v36.h
; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v27.l, v5.l
; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v27.h, v5.h
; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l
; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v52, v8
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v7.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v36, v8
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v36.l, 0x300, v7.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, 0x300, v7.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v52.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v36.h
; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v25.l, v6.l
; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v25.h, v6.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v38.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v38.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v52, v9
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v7.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v71.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v70.l, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v36, v9
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v36.l, 0x300, v7.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, 0x300, v7.h
; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v8.l
; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v8.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v37.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v37.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v52.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v69.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v68.l, 3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v36.h
; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v23.l, v7.l
; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v23.h, v7.h
; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l
; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v52, v10
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v9.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v36, v10
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v36.l, 0x300, v9.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v9.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v52.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v36.h
; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v22.l, v8.l
; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v22.h, v8.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, v36.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, v36.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v52, v11
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v9.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, v67.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, v66.l, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v36, v11
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v36.l, 0x300, v9.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, 0x300, v9.h
; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v10.l
; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v10.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, v35.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, v35.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v52.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, v65.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, v64.l, 3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v36.h
; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v21.l, v9.l
; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v21.h, v9.h
; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l
; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v10.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v52, v12
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v11.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v36, v12
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v36.l, 0x300, v11.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, 0x300, v11.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v52.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v36.h
; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v20.l, v10.l
; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v20.h, v10.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v34.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v34.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v52, v13
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v11.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v55.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v54.l, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v36, v13
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v36.l, 0x300, v11.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, 0x300, v11.h
; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v12.l
; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v12.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v33.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v33.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v52.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v53.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v52.l, 3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v36.h
; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v19.l, v11.l
; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v19.h, v11.h
; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l
; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v52, v14
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v13.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v36, v14
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v36.l, 0x300, v13.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.h, 0x300, v13.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v52.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v36.h
; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v18.l, v12.l
; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v18.h, v12.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v32.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, v32.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v52, v15
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v13.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v51.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, v50.l, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v36, v15
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v36.l, 0x300, v13.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, 0x300, v13.h
; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v14.l
; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v14.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v31.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, v31.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v52.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v49.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, v48.l, 3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v36.h
; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v17.l, v13.l
; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v17.h, v13.h
; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l
; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v14.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v52, v18
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v15.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v36, v18
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v36.l, 0x300, v15.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.h, 0x300, v15.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v52.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v36.h
; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v16.l, v14.l
; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v16.h, v14.h
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v52, v17
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v15.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v36, v17
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v36.l, 0x300, v15.l
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.h, 0x300, v15.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v52.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v52, v15
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v36.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v36, v15
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -14669,775 +14669,390 @@ define inreg <16 x i32> @bitcast_v64i8_to_v16i32_scalar(<64 x i8> inreg %a, i32
; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
; GFX9-NEXT: s_branch .LBB27_2
;
-; GFX11-TRUE16-LABEL: bitcast_v64i8_to_v16i32_scalar:
-; GFX11-TRUE16: ; %bb.0:
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v38, v14 :: v_dual_mov_b32 v37, v12
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v36, v10 :: v_dual_mov_b32 v35, v8
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v34, v6 :: v_dual_mov_b32 v33, v4
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v32, v2 :: v_dual_mov_b32 v31, v0
-; GFX11-TRUE16-NEXT: s_clause 0xf
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v0, off, s32 offset:56
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v39, off, s32 offset:52
-; GFX11-TRUE16-NEXT: scratch_load_b32 v2, off, s32 offset:60
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v4, off, s32
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v6, off, s32 offset:8
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v8, off, s32 offset:16
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v10, off, s32 offset:24
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v12, off, s32 offset:32
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v14, off, s32 offset:40
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v86, off, s32 offset:48
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v48, off, s32 offset:44
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v49, off, s32 offset:36
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v50, off, s32 offset:28
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v51, off, s32 offset:20
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v52, off, s32 offset:12
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v53, off, s32 offset:4
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v83, 8, v1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v84, 8, v3
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v85, 8, v5
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v70, 8, v7
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v71, 8, v9
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v80, 8, v11
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v81, 8, v13
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v82, 8, v15
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v65, 8, v17
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v66, 8, v19
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v67, 8, v21
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v68, 8, v23
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v69, 8, v25
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v27, 8, v27
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v29, 8, v29
-; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(15)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 8, v0
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(13)
-; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(12)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v54, 8, v4
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(11)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v55, 8, v6
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(10)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v64, 8, v8
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(9)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v10
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(8)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 8, v12
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 8, v14
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(6)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 8, v86
-; GFX11-TRUE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo
-; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB27_4
-; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-TRUE16-NEXT: s_and_b32 s5, s0, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s1, 8
-; GFX11-TRUE16-NEXT: s_and_b32 s7, s2, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s3, 8
-; GFX11-TRUE16-NEXT: s_or_b32 s5, s5, s6
-; GFX11-TRUE16-NEXT: s_or_b32 s6, s7, s8
-; GFX11-TRUE16-NEXT: s_and_b32 s5, s5, 0xffff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s6, 16
-; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s17, 8
-; GFX11-TRUE16-NEXT: s_or_b32 s5, s5, s6
-; GFX11-TRUE16-NEXT: s_and_b32 s6, s16, 0xff
-; GFX11-TRUE16-NEXT: s_and_b32 s8, s18, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s19, 8
-; GFX11-TRUE16-NEXT: s_or_b32 s6, s6, s7
-; GFX11-TRUE16-NEXT: s_or_b32 s7, s8, s9
-; GFX11-TRUE16-NEXT: s_and_b32 s6, s6, 0xffff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s7, 16
-; GFX11-TRUE16-NEXT: s_and_b32 s8, s20, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s21, 8
-; GFX11-TRUE16-NEXT: s_or_b32 s6, s6, s7
-; GFX11-TRUE16-NEXT: s_or_b32 s7, s8, s9
-; GFX11-TRUE16-NEXT: s_and_b32 s8, s22, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s23, 8
-; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s25, 8
-; GFX11-TRUE16-NEXT: s_or_b32 s8, s8, s9
-; GFX11-TRUE16-NEXT: s_and_b32 s9, s24, 0xff
-; GFX11-TRUE16-NEXT: s_and_b32 s7, s7, 0xffff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s8, 16
-; GFX11-TRUE16-NEXT: s_or_b32 s9, s9, s10
-; GFX11-TRUE16-NEXT: s_or_b32 s7, s7, s8
-; GFX11-TRUE16-NEXT: s_and_b32 s8, s9, 0xffff
-; GFX11-TRUE16-NEXT: s_and_b32 s9, s26, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s27, 8
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v31
-; GFX11-TRUE16-NEXT: s_or_b32 s9, s9, s10
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v32
-; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s9, 16
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v38
-; GFX11-TRUE16-NEXT: s_or_b32 s8, s8, s9
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v33
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v83
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v84
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xff, v22
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v24
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v85
-; GFX11-TRUE16-NEXT: s_and_b32 s11, s28, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s12, s29, 8
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v6, v82
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v10, v68
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v11, v69
-; GFX11-TRUE16-NEXT: s_or_b32 s10, s11, s12
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v1, v2
-; GFX11-TRUE16-NEXT: s_and_b32 s10, s10, 0xffff
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v35
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, s10, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v34
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v36
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v37
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v6
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v16
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v18
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xff, v20
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v70
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v71
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v80
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v81
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v6, v65
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v7, v66
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v9, v67
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v10, v11
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v52
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v6
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v7
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v11, v64
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v0, v1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v2, v3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v8, v12
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v13, v9
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v26
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v28
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v30
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v53
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v11
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v51
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xff, v50
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v49
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xff, v48
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v86, 0xff, v39
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v27
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v29
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v54
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v55
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v11, v17
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v12, v19
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v14, v21
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v15, v23
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v86, v86, v25
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v87, 16, v11
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v96, 0xffff, v12
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 16, v14
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v15
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v86, 16, v86
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v0, v1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v2, v3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v13, v87
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v96, v14
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v15, v86
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, s6
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s7 :: v_dual_mov_b32 v3, s8
-; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
-; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB27_3
-; GFX11-TRUE16-NEXT: .LBB27_2: ; %cmp.true
-; GFX11-TRUE16-NEXT: s_add_i32 s0, s0, 3
-; GFX11-TRUE16-NEXT: s_add_i32 s2, s2, 3
-; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 8
-; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8
-; GFX11-TRUE16-NEXT: s_or_b32 s0, s1, s0
-; GFX11-TRUE16-NEXT: s_or_b32 s1, s3, s2
-; GFX11-TRUE16-NEXT: s_addk_i32 s0, 0x300
-; GFX11-TRUE16-NEXT: s_addk_i32 s1, 0x300
-; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xffff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 16
-; GFX11-TRUE16-NEXT: s_add_i32 s16, s16, 3
-; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s1
-; GFX11-TRUE16-NEXT: s_and_b32 s1, s16, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s17, 8
-; GFX11-TRUE16-NEXT: s_add_i32 s18, s18, 3
-; GFX11-TRUE16-NEXT: s_or_b32 s1, s2, s1
-; GFX11-TRUE16-NEXT: s_and_b32 s2, s18, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s19, 8
-; GFX11-TRUE16-NEXT: s_addk_i32 s1, 0x300
-; GFX11-TRUE16-NEXT: s_or_b32 s2, s3, s2
-; GFX11-TRUE16-NEXT: s_add_i32 s20, s20, 3
-; GFX11-TRUE16-NEXT: s_addk_i32 s2, 0x300
-; GFX11-TRUE16-NEXT: s_and_b32 s1, s1, 0xffff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s2, 16
-; GFX11-TRUE16-NEXT: s_and_b32 s3, s20, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s21, 8
-; GFX11-TRUE16-NEXT: s_add_i32 s22, s22, 3
-; GFX11-TRUE16-NEXT: s_or_b32 s1, s1, s2
-; GFX11-TRUE16-NEXT: s_or_b32 s2, s4, s3
-; GFX11-TRUE16-NEXT: s_and_b32 s3, s22, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s23, 8
-; GFX11-TRUE16-NEXT: s_add_i32 s24, s24, 3
-; GFX11-TRUE16-NEXT: s_or_b32 s3, s4, s3
-; GFX11-TRUE16-NEXT: s_and_b32 s4, s24, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s25, 8
-; GFX11-TRUE16-NEXT: s_addk_i32 s2, 0x300
-; GFX11-TRUE16-NEXT: s_addk_i32 s3, 0x300
-; GFX11-TRUE16-NEXT: s_or_b32 s4, s5, s4
-; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xffff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 16
-; GFX11-TRUE16-NEXT: s_addk_i32 s4, 0x300
-; GFX11-TRUE16-NEXT: s_add_i32 s26, s26, 3
-; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3
-; GFX11-TRUE16-NEXT: s_and_b32 s3, s4, 0xffff
-; GFX11-TRUE16-NEXT: s_and_b32 s4, s26, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s27, 8
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v31
-; GFX11-TRUE16-NEXT: s_or_b32 s4, s5, s4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v32
-; GFX11-TRUE16-NEXT: s_addk_i32 s4, 0x300
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v38
-; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s4, 16
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT: s_or_b32 s3, s3, s4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v33
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 3, v22
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 3, v24
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v83, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v2
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v84, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v6
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xff, v10
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v11
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v85, v2
-; GFX11-TRUE16-NEXT: s_add_i32 s28, s28, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v82, v6
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v68, v10
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v69, v11
-; GFX11-TRUE16-NEXT: s_and_b32 s6, s28, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s29, 8
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX11-TRUE16-NEXT: s_or_b32 s5, s7, s6
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x300, v6
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 0x300, v10
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 0x300, v11
-; GFX11-TRUE16-NEXT: s_addk_i32 s5, 0x300
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v1, v2
-; GFX11-TRUE16-NEXT: s_and_b32 s5, s5, 0xffff
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v35
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, s5, v0
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v34
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v36
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v37
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v6
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v16
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v18
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v20
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v6
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v7
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xff, v9
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v10, v11
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 3, v52
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v70, v0
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v71, v1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v80, v2
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v81, v3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v65, v6
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v66, v7
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v67, v9
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v11
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x300, v6
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 0x300, v7
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 0x300, v9
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v64, v11
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v6
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v7
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 0x300, v11
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v0, v1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v2, v3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v8, v12
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v13, v9
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v26
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v28
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v30
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v53
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v11
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 3, v51
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 3, v50
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, 3, v49
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v15, 3, v48
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 3, v39
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v11
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xff, v12
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v14
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xff, v15
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xff, v16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v27, v0
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v29, v1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v54, v2
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v55, v3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v17, v11
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v19, v12
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v21, v14
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v23, v15
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v25, v16
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 0x300, v11
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 0x300, v12
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, 0x300, v14
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v15, 0x300, v15
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 0x300, v16
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v11
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v12
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 16, v14
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v15
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 16, v16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v0, v1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v2, v3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v13, v17
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v18, v14
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v15, v16
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX11-TRUE16-NEXT: .LBB27_3: ; %end
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
-; GFX11-TRUE16-NEXT: .LBB27_4:
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
-; GFX11-TRUE16-NEXT: s_branch .LBB27_2
-;
-; GFX11-FAKE16-LABEL: bitcast_v64i8_to_v16i32_scalar:
-; GFX11-FAKE16: ; %bb.0:
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v38, v14 :: v_dual_mov_b32 v37, v12
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v36, v10 :: v_dual_mov_b32 v35, v8
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v34, v6 :: v_dual_mov_b32 v33, v4
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v32, v2 :: v_dual_mov_b32 v31, v0
-; GFX11-FAKE16-NEXT: s_clause 0xf
-; GFX11-FAKE16-NEXT: scratch_load_u16 v0, off, s32 offset:56
-; GFX11-FAKE16-NEXT: scratch_load_u16 v39, off, s32 offset:52
-; GFX11-FAKE16-NEXT: scratch_load_b32 v2, off, s32 offset:60
-; GFX11-FAKE16-NEXT: scratch_load_u16 v4, off, s32
-; GFX11-FAKE16-NEXT: scratch_load_u16 v6, off, s32 offset:8
-; GFX11-FAKE16-NEXT: scratch_load_u16 v8, off, s32 offset:16
-; GFX11-FAKE16-NEXT: scratch_load_u16 v10, off, s32 offset:24
-; GFX11-FAKE16-NEXT: scratch_load_u16 v12, off, s32 offset:32
-; GFX11-FAKE16-NEXT: scratch_load_u16 v14, off, s32 offset:40
-; GFX11-FAKE16-NEXT: scratch_load_u16 v86, off, s32 offset:48
-; GFX11-FAKE16-NEXT: scratch_load_u16 v48, off, s32 offset:44
-; GFX11-FAKE16-NEXT: scratch_load_u16 v49, off, s32 offset:36
-; GFX11-FAKE16-NEXT: scratch_load_u16 v50, off, s32 offset:28
-; GFX11-FAKE16-NEXT: scratch_load_u16 v51, off, s32 offset:20
-; GFX11-FAKE16-NEXT: scratch_load_u16 v52, off, s32 offset:12
-; GFX11-FAKE16-NEXT: scratch_load_u16 v53, off, s32 offset:4
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v83, 8, v1
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v84, 8, v3
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v85, 8, v5
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v70, 8, v7
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v71, 8, v9
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v80, 8, v11
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v81, 8, v13
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v82, 8, v15
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v65, 8, v17
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v66, 8, v19
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v67, 8, v21
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v68, 8, v23
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v69, 8, v25
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v27, 8, v27
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v29, 8, v29
-; GFX11-FAKE16-NEXT: s_mov_b32 s4, 0
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(15)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v25, 8, v0
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(13)
-; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(12)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v54, 8, v4
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(11)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v55, 8, v6
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(10)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v64, 8, v8
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(9)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v17, 8, v10
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(8)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v19, 8, v12
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(7)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v21, 8, v14
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(6)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v23, 8, v86
-; GFX11-FAKE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo
-; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB27_4
-; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-FAKE16-NEXT: s_and_b32 s5, s0, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s1, 8
-; GFX11-FAKE16-NEXT: s_and_b32 s7, s2, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s3, 8
-; GFX11-FAKE16-NEXT: s_or_b32 s5, s5, s6
-; GFX11-FAKE16-NEXT: s_or_b32 s6, s7, s8
-; GFX11-FAKE16-NEXT: s_and_b32 s5, s5, 0xffff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s6, 16
-; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s17, 8
-; GFX11-FAKE16-NEXT: s_or_b32 s5, s5, s6
-; GFX11-FAKE16-NEXT: s_and_b32 s6, s16, 0xff
-; GFX11-FAKE16-NEXT: s_and_b32 s8, s18, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s9, s19, 8
-; GFX11-FAKE16-NEXT: s_or_b32 s6, s6, s7
-; GFX11-FAKE16-NEXT: s_or_b32 s7, s8, s9
-; GFX11-FAKE16-NEXT: s_and_b32 s6, s6, 0xffff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s7, 16
-; GFX11-FAKE16-NEXT: s_and_b32 s8, s20, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s9, s21, 8
-; GFX11-FAKE16-NEXT: s_or_b32 s6, s6, s7
-; GFX11-FAKE16-NEXT: s_or_b32 s7, s8, s9
-; GFX11-FAKE16-NEXT: s_and_b32 s8, s22, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s9, s23, 8
-; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s25, 8
-; GFX11-FAKE16-NEXT: s_or_b32 s8, s8, s9
-; GFX11-FAKE16-NEXT: s_and_b32 s9, s24, 0xff
-; GFX11-FAKE16-NEXT: s_and_b32 s7, s7, 0xffff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s8, 16
-; GFX11-FAKE16-NEXT: s_or_b32 s9, s9, s10
-; GFX11-FAKE16-NEXT: s_or_b32 s7, s7, s8
-; GFX11-FAKE16-NEXT: s_and_b32 s8, s9, 0xffff
-; GFX11-FAKE16-NEXT: s_and_b32 s9, s26, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s27, 8
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v31
-; GFX11-FAKE16-NEXT: s_or_b32 s9, s9, s10
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v32
-; GFX11-FAKE16-NEXT: s_lshl_b32 s9, s9, 16
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v38
-; GFX11-FAKE16-NEXT: s_or_b32 s8, s8, s9
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v33
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v83
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v84
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v22
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v24
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v85
-; GFX11-FAKE16-NEXT: s_and_b32 s11, s28, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s12, s29, 8
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v6, v82
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v68
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v11, v69
-; GFX11-FAKE16-NEXT: s_or_b32 s10, s11, s12
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v1, v2
-; GFX11-FAKE16-NEXT: s_and_b32 s10, s10, 0xffff
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v35
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, s10, v0
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v34
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v36
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v37
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff, v6
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v16
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v18
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v20
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v10
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v70
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v71
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v80
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v81
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v6, v65
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v66
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v9, v67
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v11
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1)
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v52
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v12, 16, v6
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xffff, v7
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v11, v64
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v0, v1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v2, v3
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v8, v12
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v13, v9
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v26
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v28
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v30
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v53
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xffff, v11
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v51
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v50
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v49
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v48
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v86, 0xff, v39
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v27
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v29
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v54
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v55
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v11, v17
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v12, v19
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v14, v21
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v15, v23
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v86, v86, v25
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v87, 16, v11
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v96, 0xffff, v12
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v14, 16, v14
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff, v15
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v86, 16, v86
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v0, v1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v2, v3
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v13, v87
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v96, v14
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v15, v86
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, s6
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s7 :: v_dual_mov_b32 v3, s8
-; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
-; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB27_3
-; GFX11-FAKE16-NEXT: .LBB27_2: ; %cmp.true
-; GFX11-FAKE16-NEXT: s_add_i32 s0, s0, 3
-; GFX11-FAKE16-NEXT: s_add_i32 s2, s2, 3
-; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 8
-; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 8
-; GFX11-FAKE16-NEXT: s_or_b32 s0, s1, s0
-; GFX11-FAKE16-NEXT: s_or_b32 s1, s3, s2
-; GFX11-FAKE16-NEXT: s_addk_i32 s0, 0x300
-; GFX11-FAKE16-NEXT: s_addk_i32 s1, 0x300
-; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xffff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 16
-; GFX11-FAKE16-NEXT: s_add_i32 s16, s16, 3
-; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s1
-; GFX11-FAKE16-NEXT: s_and_b32 s1, s16, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s17, 8
-; GFX11-FAKE16-NEXT: s_add_i32 s18, s18, 3
-; GFX11-FAKE16-NEXT: s_or_b32 s1, s2, s1
-; GFX11-FAKE16-NEXT: s_and_b32 s2, s18, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s19, 8
-; GFX11-FAKE16-NEXT: s_addk_i32 s1, 0x300
-; GFX11-FAKE16-NEXT: s_or_b32 s2, s3, s2
-; GFX11-FAKE16-NEXT: s_add_i32 s20, s20, 3
-; GFX11-FAKE16-NEXT: s_addk_i32 s2, 0x300
-; GFX11-FAKE16-NEXT: s_and_b32 s1, s1, 0xffff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s2, 16
-; GFX11-FAKE16-NEXT: s_and_b32 s3, s20, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s21, 8
-; GFX11-FAKE16-NEXT: s_add_i32 s22, s22, 3
-; GFX11-FAKE16-NEXT: s_or_b32 s1, s1, s2
-; GFX11-FAKE16-NEXT: s_or_b32 s2, s4, s3
-; GFX11-FAKE16-NEXT: s_and_b32 s3, s22, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s23, 8
-; GFX11-FAKE16-NEXT: s_add_i32 s24, s24, 3
-; GFX11-FAKE16-NEXT: s_or_b32 s3, s4, s3
-; GFX11-FAKE16-NEXT: s_and_b32 s4, s24, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s25, 8
-; GFX11-FAKE16-NEXT: s_addk_i32 s2, 0x300
-; GFX11-FAKE16-NEXT: s_addk_i32 s3, 0x300
-; GFX11-FAKE16-NEXT: s_or_b32 s4, s5, s4
-; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xffff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 16
-; GFX11-FAKE16-NEXT: s_addk_i32 s4, 0x300
-; GFX11-FAKE16-NEXT: s_add_i32 s26, s26, 3
-; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3
-; GFX11-FAKE16-NEXT: s_and_b32 s3, s4, 0xffff
-; GFX11-FAKE16-NEXT: s_and_b32 s4, s26, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s27, 8
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v31
-; GFX11-FAKE16-NEXT: s_or_b32 s4, s5, s4
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v32
-; GFX11-FAKE16-NEXT: s_addk_i32 s4, 0x300
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 3, v38
-; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s4, 16
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-FAKE16-NEXT: s_or_b32 s3, s3, s4
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v33
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, 3, v22
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, 3, v24
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v83, v0
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v84, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v6
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v10
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v11
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v85, v2
-; GFX11-FAKE16-NEXT: s_add_i32 s28, s28, 3
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v82, v6
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v68, v10
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v69, v11
-; GFX11-FAKE16-NEXT: s_and_b32 s6, s28, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s29, 8
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX11-FAKE16-NEXT: s_or_b32 s5, s7, s6
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 0x300, v6
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, 0x300, v10
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, 0x300, v11
-; GFX11-FAKE16-NEXT: s_addk_i32 s5, 0x300
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v1, v2
-; GFX11-FAKE16-NEXT: s_and_b32 s5, s5, 0xffff
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v35
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, s5, v0
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v34
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v36
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v37
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff, v6
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 3, v16
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 3, v18
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, 3, v20
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v10
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v6
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v7
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v9
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v11
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1)
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, 3, v52
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v70, v0
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v71, v1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v80, v2
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v81, v3
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v65, v6
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v66, v7
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v67, v9
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v11
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 0x300, v6
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 0x300, v7
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, 0x300, v9
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v64, v11
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v12, 16, v6
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xffff, v7
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, 0x300, v11
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v0, v1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v2, v3
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v8, v12
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v13, v9
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v26
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v28
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v30
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v53
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xffff, v11
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, 3, v51
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, 3, v50
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v14, 3, v49
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v15, 3, v48
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v16, 3, v39
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v11
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v12
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v14
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v15
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v16
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v27, v0
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v29, v1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v54, v2
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v55, v3
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v17, v11
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v19, v12
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v21, v14
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v23, v15
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v25, v16
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, 0x300, v11
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, 0x300, v12
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v14, 0x300, v14
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v15, 0x300, v15
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v16, 0x300, v16
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v17, 16, v11
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v12
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v14, 16, v14
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff, v15
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v16, 16, v16
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v0, v1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v2, v3
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v13, v17
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v18, v14
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v15, v16
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX11-FAKE16-NEXT: .LBB27_3: ; %end
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
-; GFX11-FAKE16-NEXT: .LBB27_4:
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
-; GFX11-FAKE16-NEXT: s_branch .LBB27_2
+; GFX11-LABEL: bitcast_v64i8_to_v16i32_scalar:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_dual_mov_b32 v38, v14 :: v_dual_mov_b32 v37, v12
+; GFX11-NEXT: v_dual_mov_b32 v36, v10 :: v_dual_mov_b32 v35, v8
+; GFX11-NEXT: v_dual_mov_b32 v34, v6 :: v_dual_mov_b32 v33, v4
+; GFX11-NEXT: v_dual_mov_b32 v32, v2 :: v_dual_mov_b32 v31, v0
+; GFX11-NEXT: s_clause 0xf
+; GFX11-NEXT: scratch_load_u16 v0, off, s32 offset:56
+; GFX11-NEXT: scratch_load_u16 v39, off, s32 offset:52
+; GFX11-NEXT: scratch_load_b32 v2, off, s32 offset:60
+; GFX11-NEXT: scratch_load_u16 v4, off, s32
+; GFX11-NEXT: scratch_load_u16 v6, off, s32 offset:8
+; GFX11-NEXT: scratch_load_u16 v8, off, s32 offset:16
+; GFX11-NEXT: scratch_load_u16 v10, off, s32 offset:24
+; GFX11-NEXT: scratch_load_u16 v12, off, s32 offset:32
+; GFX11-NEXT: scratch_load_u16 v14, off, s32 offset:40
+; GFX11-NEXT: scratch_load_u16 v86, off, s32 offset:48
+; GFX11-NEXT: scratch_load_u16 v48, off, s32 offset:44
+; GFX11-NEXT: scratch_load_u16 v49, off, s32 offset:36
+; GFX11-NEXT: scratch_load_u16 v50, off, s32 offset:28
+; GFX11-NEXT: scratch_load_u16 v51, off, s32 offset:20
+; GFX11-NEXT: scratch_load_u16 v52, off, s32 offset:12
+; GFX11-NEXT: scratch_load_u16 v53, off, s32 offset:4
+; GFX11-NEXT: v_lshlrev_b32_e32 v83, 8, v1
+; GFX11-NEXT: v_lshlrev_b32_e32 v84, 8, v3
+; GFX11-NEXT: v_lshlrev_b32_e32 v85, 8, v5
+; GFX11-NEXT: v_lshlrev_b32_e32 v70, 8, v7
+; GFX11-NEXT: v_lshlrev_b32_e32 v71, 8, v9
+; GFX11-NEXT: v_lshlrev_b32_e32 v80, 8, v11
+; GFX11-NEXT: v_lshlrev_b32_e32 v81, 8, v13
+; GFX11-NEXT: v_lshlrev_b32_e32 v82, 8, v15
+; GFX11-NEXT: v_lshlrev_b32_e32 v65, 8, v17
+; GFX11-NEXT: v_lshlrev_b32_e32 v66, 8, v19
+; GFX11-NEXT: v_lshlrev_b32_e32 v67, 8, v21
+; GFX11-NEXT: v_lshlrev_b32_e32 v68, 8, v23
+; GFX11-NEXT: v_lshlrev_b32_e32 v69, 8, v25
+; GFX11-NEXT: v_lshlrev_b32_e32 v27, 8, v27
+; GFX11-NEXT: v_lshlrev_b32_e32 v29, 8, v29
+; GFX11-NEXT: s_mov_b32 s4, 0
+; GFX11-NEXT: s_waitcnt vmcnt(15)
+; GFX11-NEXT: v_lshlrev_b32_e32 v25, 8, v0
+; GFX11-NEXT: s_waitcnt vmcnt(13)
+; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2
+; GFX11-NEXT: s_waitcnt vmcnt(12)
+; GFX11-NEXT: v_lshlrev_b32_e32 v54, 8, v4
+; GFX11-NEXT: s_waitcnt vmcnt(11)
+; GFX11-NEXT: v_lshlrev_b32_e32 v55, 8, v6
+; GFX11-NEXT: s_waitcnt vmcnt(10)
+; GFX11-NEXT: v_lshlrev_b32_e32 v64, 8, v8
+; GFX11-NEXT: s_waitcnt vmcnt(9)
+; GFX11-NEXT: v_lshlrev_b32_e32 v17, 8, v10
+; GFX11-NEXT: s_waitcnt vmcnt(8)
+; GFX11-NEXT: v_lshlrev_b32_e32 v19, 8, v12
+; GFX11-NEXT: s_waitcnt vmcnt(7)
+; GFX11-NEXT: v_lshlrev_b32_e32 v21, 8, v14
+; GFX11-NEXT: s_waitcnt vmcnt(6)
+; GFX11-NEXT: v_lshlrev_b32_e32 v23, 8, v86
+; GFX11-NEXT: s_and_b32 s5, vcc_lo, exec_lo
+; GFX11-NEXT: s_cbranch_scc0 .LBB27_4
+; GFX11-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-NEXT: s_and_b32 s5, s0, 0xff
+; GFX11-NEXT: s_lshl_b32 s6, s1, 8
+; GFX11-NEXT: s_and_b32 s7, s2, 0xff
+; GFX11-NEXT: s_lshl_b32 s8, s3, 8
+; GFX11-NEXT: s_or_b32 s5, s5, s6
+; GFX11-NEXT: s_or_b32 s6, s7, s8
+; GFX11-NEXT: s_and_b32 s5, s5, 0xffff
+; GFX11-NEXT: s_lshl_b32 s6, s6, 16
+; GFX11-NEXT: s_lshl_b32 s7, s17, 8
+; GFX11-NEXT: s_or_b32 s5, s5, s6
+; GFX11-NEXT: s_and_b32 s6, s16, 0xff
+; GFX11-NEXT: s_and_b32 s8, s18, 0xff
+; GFX11-NEXT: s_lshl_b32 s9, s19, 8
+; GFX11-NEXT: s_or_b32 s6, s6, s7
+; GFX11-NEXT: s_or_b32 s7, s8, s9
+; GFX11-NEXT: s_and_b32 s6, s6, 0xffff
+; GFX11-NEXT: s_lshl_b32 s7, s7, 16
+; GFX11-NEXT: s_and_b32 s8, s20, 0xff
+; GFX11-NEXT: s_lshl_b32 s9, s21, 8
+; GFX11-NEXT: s_or_b32 s6, s6, s7
+; GFX11-NEXT: s_or_b32 s7, s8, s9
+; GFX11-NEXT: s_and_b32 s8, s22, 0xff
+; GFX11-NEXT: s_lshl_b32 s9, s23, 8
+; GFX11-NEXT: s_lshl_b32 s10, s25, 8
+; GFX11-NEXT: s_or_b32 s8, s8, s9
+; GFX11-NEXT: s_and_b32 s9, s24, 0xff
+; GFX11-NEXT: s_and_b32 s7, s7, 0xffff
+; GFX11-NEXT: s_lshl_b32 s8, s8, 16
+; GFX11-NEXT: s_or_b32 s9, s9, s10
+; GFX11-NEXT: s_or_b32 s7, s7, s8
+; GFX11-NEXT: s_and_b32 s8, s9, 0xffff
+; GFX11-NEXT: s_and_b32 s9, s26, 0xff
+; GFX11-NEXT: s_lshl_b32 s10, s27, 8
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v31
+; GFX11-NEXT: s_or_b32 s9, s9, s10
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v32
+; GFX11-NEXT: s_lshl_b32 s9, s9, 16
+; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v38
+; GFX11-NEXT: s_or_b32 s8, s8, s9
+; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v33
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v83
+; GFX11-NEXT: v_or_b32_e32 v1, v1, v84
+; GFX11-NEXT: v_and_b32_e32 v10, 0xff, v22
+; GFX11-NEXT: v_and_b32_e32 v11, 0xff, v24
+; GFX11-NEXT: v_or_b32_e32 v2, v2, v85
+; GFX11-NEXT: s_and_b32 s11, s28, 0xff
+; GFX11-NEXT: s_lshl_b32 s12, s29, 8
+; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-NEXT: v_or_b32_e32 v6, v6, v82
+; GFX11-NEXT: v_or_b32_e32 v10, v10, v68
+; GFX11-NEXT: v_or_b32_e32 v11, v11, v69
+; GFX11-NEXT: s_or_b32 s10, s11, s12
+; GFX11-NEXT: v_or_b32_e32 v5, v1, v2
+; GFX11-NEXT: s_and_b32 s10, s10, 0xffff
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v35
+; GFX11-NEXT: v_or_b32_e32 v4, s10, v0
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v34
+; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v36
+; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v37
+; GFX11-NEXT: v_and_b32_e32 v8, 0xffff, v6
+; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v16
+; GFX11-NEXT: v_and_b32_e32 v7, 0xff, v18
+; GFX11-NEXT: v_and_b32_e32 v9, 0xff, v20
+; GFX11-NEXT: v_and_b32_e32 v10, 0xffff, v10
+; GFX11-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v70
+; GFX11-NEXT: v_or_b32_e32 v1, v1, v71
+; GFX11-NEXT: v_or_b32_e32 v2, v2, v80
+; GFX11-NEXT: v_or_b32_e32 v3, v3, v81
+; GFX11-NEXT: v_or_b32_e32 v6, v6, v65
+; GFX11-NEXT: v_or_b32_e32 v7, v7, v66
+; GFX11-NEXT: v_or_b32_e32 v9, v9, v67
+; GFX11-NEXT: v_or_b32_e32 v10, v10, v11
+; GFX11-NEXT: s_waitcnt vmcnt(1)
+; GFX11-NEXT: v_and_b32_e32 v11, 0xff, v52
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-NEXT: v_lshlrev_b32_e32 v12, 16, v6
+; GFX11-NEXT: v_and_b32_e32 v13, 0xffff, v7
+; GFX11-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX11-NEXT: v_or_b32_e32 v11, v11, v64
+; GFX11-NEXT: v_or_b32_e32 v6, v0, v1
+; GFX11-NEXT: v_or_b32_e32 v7, v2, v3
+; GFX11-NEXT: v_or_b32_e32 v8, v8, v12
+; GFX11-NEXT: v_or_b32_e32 v9, v13, v9
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v26
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v28
+; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v30
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v53
+; GFX11-NEXT: v_and_b32_e32 v13, 0xffff, v11
+; GFX11-NEXT: v_and_b32_e32 v11, 0xff, v51
+; GFX11-NEXT: v_and_b32_e32 v12, 0xff, v50
+; GFX11-NEXT: v_and_b32_e32 v14, 0xff, v49
+; GFX11-NEXT: v_and_b32_e32 v15, 0xff, v48
+; GFX11-NEXT: v_and_b32_e32 v86, 0xff, v39
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v27
+; GFX11-NEXT: v_or_b32_e32 v1, v1, v29
+; GFX11-NEXT: v_or_b32_e32 v2, v2, v54
+; GFX11-NEXT: v_or_b32_e32 v3, v3, v55
+; GFX11-NEXT: v_or_b32_e32 v11, v11, v17
+; GFX11-NEXT: v_or_b32_e32 v12, v12, v19
+; GFX11-NEXT: v_or_b32_e32 v14, v14, v21
+; GFX11-NEXT: v_or_b32_e32 v15, v15, v23
+; GFX11-NEXT: v_or_b32_e32 v86, v86, v25
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-NEXT: v_lshlrev_b32_e32 v87, 16, v11
+; GFX11-NEXT: v_and_b32_e32 v96, 0xffff, v12
+; GFX11-NEXT: v_lshlrev_b32_e32 v14, 16, v14
+; GFX11-NEXT: v_and_b32_e32 v15, 0xffff, v15
+; GFX11-NEXT: v_lshlrev_b32_e32 v86, 16, v86
+; GFX11-NEXT: v_or_b32_e32 v11, v0, v1
+; GFX11-NEXT: v_or_b32_e32 v12, v2, v3
+; GFX11-NEXT: v_or_b32_e32 v13, v13, v87
+; GFX11-NEXT: v_or_b32_e32 v14, v96, v14
+; GFX11-NEXT: v_or_b32_e32 v15, v15, v86
+; GFX11-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, s6
+; GFX11-NEXT: v_dual_mov_b32 v2, s7 :: v_dual_mov_b32 v3, s8
+; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
+; GFX11-NEXT: s_cbranch_vccnz .LBB27_3
+; GFX11-NEXT: .LBB27_2: ; %cmp.true
+; GFX11-NEXT: s_add_i32 s0, s0, 3
+; GFX11-NEXT: s_add_i32 s2, s2, 3
+; GFX11-NEXT: s_and_b32 s0, s0, 0xff
+; GFX11-NEXT: s_lshl_b32 s1, s1, 8
+; GFX11-NEXT: s_and_b32 s2, s2, 0xff
+; GFX11-NEXT: s_lshl_b32 s3, s3, 8
+; GFX11-NEXT: s_or_b32 s0, s1, s0
+; GFX11-NEXT: s_or_b32 s1, s3, s2
+; GFX11-NEXT: s_addk_i32 s0, 0x300
+; GFX11-NEXT: s_addk_i32 s1, 0x300
+; GFX11-NEXT: s_and_b32 s0, s0, 0xffff
+; GFX11-NEXT: s_lshl_b32 s1, s1, 16
+; GFX11-NEXT: s_add_i32 s16, s16, 3
+; GFX11-NEXT: s_or_b32 s0, s0, s1
+; GFX11-NEXT: s_and_b32 s1, s16, 0xff
+; GFX11-NEXT: s_lshl_b32 s2, s17, 8
+; GFX11-NEXT: s_add_i32 s18, s18, 3
+; GFX11-NEXT: s_or_b32 s1, s2, s1
+; GFX11-NEXT: s_and_b32 s2, s18, 0xff
+; GFX11-NEXT: s_lshl_b32 s3, s19, 8
+; GFX11-NEXT: s_addk_i32 s1, 0x300
+; GFX11-NEXT: s_or_b32 s2, s3, s2
+; GFX11-NEXT: s_add_i32 s20, s20, 3
+; GFX11-NEXT: s_addk_i32 s2, 0x300
+; GFX11-NEXT: s_and_b32 s1, s1, 0xffff
+; GFX11-NEXT: s_lshl_b32 s2, s2, 16
+; GFX11-NEXT: s_and_b32 s3, s20, 0xff
+; GFX11-NEXT: s_lshl_b32 s4, s21, 8
+; GFX11-NEXT: s_add_i32 s22, s22, 3
+; GFX11-NEXT: s_or_b32 s1, s1, s2
+; GFX11-NEXT: s_or_b32 s2, s4, s3
+; GFX11-NEXT: s_and_b32 s3, s22, 0xff
+; GFX11-NEXT: s_lshl_b32 s4, s23, 8
+; GFX11-NEXT: s_add_i32 s24, s24, 3
+; GFX11-NEXT: s_or_b32 s3, s4, s3
+; GFX11-NEXT: s_and_b32 s4, s24, 0xff
+; GFX11-NEXT: s_lshl_b32 s5, s25, 8
+; GFX11-NEXT: s_addk_i32 s2, 0x300
+; GFX11-NEXT: s_addk_i32 s3, 0x300
+; GFX11-NEXT: s_or_b32 s4, s5, s4
+; GFX11-NEXT: s_and_b32 s2, s2, 0xffff
+; GFX11-NEXT: s_lshl_b32 s3, s3, 16
+; GFX11-NEXT: s_addk_i32 s4, 0x300
+; GFX11-NEXT: s_add_i32 s26, s26, 3
+; GFX11-NEXT: s_or_b32 s2, s2, s3
+; GFX11-NEXT: s_and_b32 s3, s4, 0xffff
+; GFX11-NEXT: s_and_b32 s4, s26, 0xff
+; GFX11-NEXT: s_lshl_b32 s5, s27, 8
+; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v31
+; GFX11-NEXT: s_or_b32 s4, s5, s4
+; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v32
+; GFX11-NEXT: s_addk_i32 s4, 0x300
+; GFX11-NEXT: v_add_nc_u32_e32 v6, 3, v38
+; GFX11-NEXT: s_lshl_b32 s4, s4, 16
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0
+; GFX11-NEXT: s_or_b32 s3, s3, s4
+; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v33
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1
+; GFX11-NEXT: v_add_nc_u32_e32 v10, 3, v22
+; GFX11-NEXT: v_add_nc_u32_e32 v11, 3, v24
+; GFX11-NEXT: v_or_b32_e32 v0, v83, v0
+; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v2
+; GFX11-NEXT: v_or_b32_e32 v1, v84, v1
+; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v6
+; GFX11-NEXT: v_and_b32_e32 v10, 0xff, v10
+; GFX11-NEXT: v_and_b32_e32 v11, 0xff, v11
+; GFX11-NEXT: v_or_b32_e32 v2, v85, v2
+; GFX11-NEXT: s_add_i32 s28, s28, 3
+; GFX11-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
+; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
+; GFX11-NEXT: v_or_b32_e32 v6, v82, v6
+; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
+; GFX11-NEXT: v_or_b32_e32 v10, v68, v10
+; GFX11-NEXT: v_or_b32_e32 v11, v69, v11
+; GFX11-NEXT: s_and_b32 s6, s28, 0xff
+; GFX11-NEXT: s_lshl_b32 s7, s29, 8
+; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-NEXT: s_or_b32 s5, s7, s6
+; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-NEXT: v_add_nc_u32_e32 v6, 0x300, v6
+; GFX11-NEXT: v_add_nc_u32_e32 v10, 0x300, v10
+; GFX11-NEXT: v_add_nc_u32_e32 v11, 0x300, v11
+; GFX11-NEXT: s_addk_i32 s5, 0x300
+; GFX11-NEXT: v_or_b32_e32 v5, v1, v2
+; GFX11-NEXT: s_and_b32 s5, s5, 0xffff
+; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v35
+; GFX11-NEXT: v_or_b32_e32 v4, s5, v0
+; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v34
+; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v36
+; GFX11-NEXT: v_add_nc_u32_e32 v3, 3, v37
+; GFX11-NEXT: v_and_b32_e32 v8, 0xffff, v6
+; GFX11-NEXT: v_add_nc_u32_e32 v6, 3, v16
+; GFX11-NEXT: v_add_nc_u32_e32 v7, 3, v18
+; GFX11-NEXT: v_add_nc_u32_e32 v9, 3, v20
+; GFX11-NEXT: v_and_b32_e32 v10, 0xffff, v10
+; GFX11-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1
+; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v2
+; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v3
+; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v6
+; GFX11-NEXT: v_and_b32_e32 v7, 0xff, v7
+; GFX11-NEXT: v_and_b32_e32 v9, 0xff, v9
+; GFX11-NEXT: v_or_b32_e32 v10, v10, v11
+; GFX11-NEXT: s_waitcnt vmcnt(1)
+; GFX11-NEXT: v_add_nc_u32_e32 v11, 3, v52
+; GFX11-NEXT: v_or_b32_e32 v0, v70, v0
+; GFX11-NEXT: v_or_b32_e32 v1, v71, v1
+; GFX11-NEXT: v_or_b32_e32 v2, v80, v2
+; GFX11-NEXT: v_or_b32_e32 v3, v81, v3
+; GFX11-NEXT: v_or_b32_e32 v6, v65, v6
+; GFX11-NEXT: v_or_b32_e32 v7, v66, v7
+; GFX11-NEXT: v_or_b32_e32 v9, v67, v9
+; GFX11-NEXT: v_and_b32_e32 v11, 0xff, v11
+; GFX11-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
+; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
+; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
+; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x300, v3
+; GFX11-NEXT: v_add_nc_u32_e32 v6, 0x300, v6
+; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x300, v7
+; GFX11-NEXT: v_add_nc_u32_e32 v9, 0x300, v9
+; GFX11-NEXT: v_or_b32_e32 v11, v64, v11
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-NEXT: v_lshlrev_b32_e32 v12, 16, v6
+; GFX11-NEXT: v_and_b32_e32 v13, 0xffff, v7
+; GFX11-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX11-NEXT: v_add_nc_u32_e32 v11, 0x300, v11
+; GFX11-NEXT: v_or_b32_e32 v6, v0, v1
+; GFX11-NEXT: v_or_b32_e32 v7, v2, v3
+; GFX11-NEXT: v_or_b32_e32 v8, v8, v12
+; GFX11-NEXT: v_or_b32_e32 v9, v13, v9
+; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v26
+; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v28
+; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v30
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_add_nc_u32_e32 v3, 3, v53
+; GFX11-NEXT: v_and_b32_e32 v13, 0xffff, v11
+; GFX11-NEXT: v_add_nc_u32_e32 v11, 3, v51
+; GFX11-NEXT: v_add_nc_u32_e32 v12, 3, v50
+; GFX11-NEXT: v_add_nc_u32_e32 v14, 3, v49
+; GFX11-NEXT: v_add_nc_u32_e32 v15, 3, v48
+; GFX11-NEXT: v_add_nc_u32_e32 v16, 3, v39
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1
+; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v2
+; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v3
+; GFX11-NEXT: v_and_b32_e32 v11, 0xff, v11
+; GFX11-NEXT: v_and_b32_e32 v12, 0xff, v12
+; GFX11-NEXT: v_and_b32_e32 v14, 0xff, v14
+; GFX11-NEXT: v_and_b32_e32 v15, 0xff, v15
+; GFX11-NEXT: v_and_b32_e32 v16, 0xff, v16
+; GFX11-NEXT: v_or_b32_e32 v0, v27, v0
+; GFX11-NEXT: v_or_b32_e32 v1, v29, v1
+; GFX11-NEXT: v_or_b32_e32 v2, v54, v2
+; GFX11-NEXT: v_or_b32_e32 v3, v55, v3
+; GFX11-NEXT: v_or_b32_e32 v11, v17, v11
+; GFX11-NEXT: v_or_b32_e32 v12, v19, v12
+; GFX11-NEXT: v_or_b32_e32 v14, v21, v14
+; GFX11-NEXT: v_or_b32_e32 v15, v23, v15
+; GFX11-NEXT: v_or_b32_e32 v16, v25, v16
+; GFX11-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
+; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
+; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
+; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x300, v3
+; GFX11-NEXT: v_add_nc_u32_e32 v11, 0x300, v11
+; GFX11-NEXT: v_add_nc_u32_e32 v12, 0x300, v12
+; GFX11-NEXT: v_add_nc_u32_e32 v14, 0x300, v14
+; GFX11-NEXT: v_add_nc_u32_e32 v15, 0x300, v15
+; GFX11-NEXT: v_add_nc_u32_e32 v16, 0x300, v16
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-NEXT: v_lshlrev_b32_e32 v17, 16, v11
+; GFX11-NEXT: v_and_b32_e32 v18, 0xffff, v12
+; GFX11-NEXT: v_lshlrev_b32_e32 v14, 16, v14
+; GFX11-NEXT: v_and_b32_e32 v15, 0xffff, v15
+; GFX11-NEXT: v_lshlrev_b32_e32 v16, 16, v16
+; GFX11-NEXT: v_or_b32_e32 v11, v0, v1
+; GFX11-NEXT: v_or_b32_e32 v12, v2, v3
+; GFX11-NEXT: v_or_b32_e32 v13, v13, v17
+; GFX11-NEXT: v_or_b32_e32 v14, v18, v14
+; GFX11-NEXT: v_or_b32_e32 v15, v15, v16
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-NEXT: .LBB27_3: ; %end
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-NEXT: .LBB27_4:
+; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+; GFX11-NEXT: s_branch .LBB27_2
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -27320,107 +26935,107 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) {
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: s_clause 0x1f
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v64, off, s32 offset:128
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:124
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v64, off, s32 offset:120
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:116
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v65, off, s32 offset:112
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:108
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v65, off, s32 offset:104
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:100
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v66, off, s32 offset:96
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:92
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v66, off, s32 offset:88
-; GFX11-TRUE16-NEXT: scratch_load_b32 v81, off, s32 offset:132
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v67, off, s32
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v67, off, s32 offset:8
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v68, off, s32 offset:16
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v68, off, s32 offset:24
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v69, off, s32 offset:32
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v69, off, s32 offset:40
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v70, off, s32 offset:48
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v70, off, s32 offset:56
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v71, off, s32 offset:64
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v71, off, s32 offset:72
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v80, off, s32 offset:80
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v33, off, s32 offset:84
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:76
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v34, off, s32 offset:68
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:60
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v35, off, s32 offset:52
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v36, off, s32 offset:44
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v36, off, s32 offset:36
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v37, off, s32 offset:28
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v37, off, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_load_u16 v80, off, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_load_u16 v48, off, s32 offset:124
+; GFX11-TRUE16-NEXT: scratch_load_u16 v81, off, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_load_u16 v49, off, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_load_u16 v82, off, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_load_u16 v50, off, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_load_u16 v83, off, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_load_u16 v51, off, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_load_u16 v84, off, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_load_u16 v52, off, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_load_u16 v85, off, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_load_b32 v86, off, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_load_u16 v87, off, s32
+; GFX11-TRUE16-NEXT: scratch_load_u16 v96, off, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_load_u16 v97, off, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_load_u16 v98, off, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_load_u16 v99, off, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_load_u16 v100, off, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_load_u16 v101, off, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_load_u16 v102, off, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_load_u16 v103, off, s32 offset:64
+; GFX11-TRUE16-NEXT: scratch_load_u16 v112, off, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_load_u16 v113, off, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_load_u16 v53, off, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_load_u16 v54, off, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_load_u16 v55, off, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_load_u16 v64, off, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_load_u16 v65, off, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_load_u16 v66, off, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_load_u16 v67, off, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_load_u16 v68, off, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_load_u16 v69, off, s32 offset:20
; GFX11-TRUE16-NEXT: s_clause 0x1
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:12
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v38, off, s32 offset:4
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v80.h, v29.l
+; GFX11-TRUE16-NEXT: scratch_load_u16 v70, off, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_load_u16 v71, off, s32 offset:4
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(32)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.h, v29.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, v27.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, v22.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.h, v20.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.h, v18.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v16.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v14.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.h, v12.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.l, v10.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.h, v8.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.h, v6.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.l, v4.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.h, v2.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.h, v0.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.h, 8, v1.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.l, 8, v3.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.l, 8, v5.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.l, 8, v7.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.l, 8, v9.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.h, 8, v11.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.l, 8, v13.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.h, 8, v15.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v39.h, 8, v17.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v48.l, 8, v19.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, v14.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.h, v12.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v10.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.h, v8.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v6.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v4.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.h, v2.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.h, v0.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v38.h, 8, v1.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v39.l, 8, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v37.l, 8, v5.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v38.l, 8, v7.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v35.l, 8, v9.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v35.h, 8, v11.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.l, 8, v13.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v15.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v31.h, 8, v17.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v32.l, 8, v19.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v29.h, 8, v21.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v30.h, 8, v23.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v27.l, 8, v25.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v27.h, 8, v27.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.l, 8, v80.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.l, 8, v48.h
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(33)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v64.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v80.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(31)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v64.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v81.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(29)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v65.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v82.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(27)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v65.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v83.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(25)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v66.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v84.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(23)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v66.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v85.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(21)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.h, 8, v67.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.h, 8, v87.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(20)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.l, 8, v67.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.l, 8, v96.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(19)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.h, 8, v68.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.h, 8, v97.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(18)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.l, 8, v68.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.l, 8, v98.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(17)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v69.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v99.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(16)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v69.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v100.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(15)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v70.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v101.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(14)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.l, 8, v70.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.l, 8, v102.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(13)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v71.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v103.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(12)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v71.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v112.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(11)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v80.l
-; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v81
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v113.l
+; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v86
; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB50_3
; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow
@@ -27431,95 +27046,95 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-TRUE16-NEXT: .LBB50_3: ; %cmp.false
-; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v55.h
-; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v53.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.h, 0
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v52.h
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v52.l
-; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v0.l, v54.h
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v55.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v64.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v1.l, v54.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v64.h
-; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v49.h
-; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v49.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v64, v0
-; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v1.h, v53.l
+; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v39.h
+; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v37.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v80.h, 0
+; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v36.h
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v36.l
+; GFX11-TRUE16-NEXT: v_or_b16 v80.l, v0.l, v38.h
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v39.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v80.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v1.l, v38.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v80.h
+; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v33.h
+; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v33.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v80, v0
+; GFX11-TRUE16-NEXT: v_or_b16 v80.l, v1.h, v37.l
; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v29.l
; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v28.h
-; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v51.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v35.h
; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v26.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v64, v3
-; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v2.l, v51.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v64.h
-; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v48.h
-; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v39.l
-; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v48.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v80, v3
+; GFX11-TRUE16-NEXT: v_or_b16 v80.l, v2.l, v35.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v80.h
+; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v32.h
+; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v31.l
+; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v32.l
; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v24.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v64, v2
-; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v3.l, v50.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v50.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v64.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v80, v2
+; GFX11-TRUE16-NEXT: v_or_b16 v80.l, v3.l, v34.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v34.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v80.h
; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v30.h
; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v24.l
; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v26.l
; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v28.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v64, v3
-; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v4.l, v39.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v64.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v80, v3
+; GFX11-TRUE16-NEXT: v_or_b16 v80.l, v4.l, v31.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v80.h
; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v27.h
; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v30.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v38.h
-; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v38.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v64, v4
-; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v5.l, v29.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v64.h
+; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v71.l
+; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v70.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v80, v4
+; GFX11-TRUE16-NEXT: v_or_b16 v80.l, v5.l, v29.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v80.h
; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v25.h
; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v23.h
-; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v37.h
-; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v37.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v64, v5
-; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v6.l, v27.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v64.h
-; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v36.h
+; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v69.l
+; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v68.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v80, v5
+; GFX11-TRUE16-NEXT: v_or_b16 v80.l, v6.l, v27.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v80.h
+; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v67.l
; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v22.h
-; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v36.l
-; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v35.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v64, v6
-; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v7.l, v25.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v64.h
+; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v66.l
+; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v65.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v80, v6
+; GFX11-TRUE16-NEXT: v_or_b16 v80.l, v7.l, v25.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v80.h
; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v21.h
-; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v35.l
-; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v34.h
-; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v34.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v64, v7
-; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v8.l, v23.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v64.h
+; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v64.l
+; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v55.l
+; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v54.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v80, v7
+; GFX11-TRUE16-NEXT: v_or_b16 v80.l, v8.l, v23.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v80.h
; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v11.h, v20.h
; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v19.h
-; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v33.h
-; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v33.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v64, v8
-; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v9.l, v22.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v64.h
-; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v32.h
+; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v53.l
+; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v52.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v80, v8
+; GFX11-TRUE16-NEXT: v_or_b16 v80.l, v9.l, v22.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v80.h
+; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v51.l
; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v13.h, v18.h
-; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v32.l
-; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v31.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v64, v9
-; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v10.l, v21.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v64.h
+; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v50.l
+; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v49.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v80, v9
+; GFX11-TRUE16-NEXT: v_or_b16 v80.l, v10.l, v21.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v80.h
; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v17.h
-; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v31.l
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16
+; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v48.l
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_hi16
@@ -27528,32 +27143,32 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_lo16
@@ -27571,199 +27186,199 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_hi16
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v64, v10
-; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v11.l, v20.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v64.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v80, v10
+; GFX11-TRUE16-NEXT: v_or_b16 v80.l, v11.l, v20.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v80.h
; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v15.h, v16.h
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_hi16
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v64, v11
-; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v12.l, v19.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v64.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v80, v11
+; GFX11-TRUE16-NEXT: v_or_b16 v80.l, v12.l, v19.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v80.h
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v64, v12
-; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v13.l, v18.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v64.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v80, v12
+; GFX11-TRUE16-NEXT: v_or_b16 v80.l, v13.l, v18.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v80.h
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v64, v13
-; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v14.l, v17.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v64.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v80, v13
+; GFX11-TRUE16-NEXT: v_or_b16 v80.l, v14.l, v17.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v80.h
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v64, v14
-; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v15.l, v16.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v64.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v80, v14
+; GFX11-TRUE16-NEXT: v_or_b16 v80.l, v15.l, v16.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v80.h
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_lo16
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v64, v15
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v80, v15
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB50_2
; GFX11-TRUE16-NEXT: .LBB50_4: ; %cmp.true
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v55.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v53.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v52.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v52.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.h, 0
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v39.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v37.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v36.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v36.l, 3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, 0
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v52.h
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v54.h, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v55.l, v0.h
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v54.l, v1.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v53.l, v1.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v49.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v0.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v36.h
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v38.h, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v39.l, v0.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v38.l, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v37.l, v1.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v33.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v36.l, 0x300, v0.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v0.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v49.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v33.l, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, 0x300, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v48.h, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v52, v3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v1.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v32.h, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v36, v3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v36.l, 0x300, v1.h
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v39.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v52.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v51.l, v1.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v31.l, 3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v36.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v35.l, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v51.h, v1.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v35.h, v1.h
; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v52, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v3.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v52.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v36, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v36.l, 0x300, v3.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v36.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v3.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v50.l, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v50.h, v2.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v34.l, v2.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v34.h, v2.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v29.l, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v28.h, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v52, v5
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v3.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v36, v5
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v36.l, 0x300, v3.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v3.h
; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v4.l
; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v4.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v26.h, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v24.h, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v52.h
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v39.h, v3.l
-; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v48.l, v3.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v36.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v31.h, v3.l
+; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v32.l, v3.h
; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l
; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v52, v6
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v5.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v36, v6
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v36.l, 0x300, v5.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x300, v5.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v52.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v36.h
; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v29.h, v4.l
; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v30.h, v4.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v24.l, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v26.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v52, v7
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v5.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v36, v7
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v36.l, 0x300, v5.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, 0x300, v5.h
; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v6.l
; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v6.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v28.l, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v30.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v52.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v36.h
; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v27.l, v5.l
; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v27.h, v5.h
; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l
; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v52, v8
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v7.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v36, v8
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v36.l, 0x300, v7.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, 0x300, v7.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v52.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v36.h
; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v25.l, v6.l
; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v25.h, v6.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v38.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v38.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v52, v9
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v7.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v71.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v70.l, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v36, v9
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v36.l, 0x300, v7.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, 0x300, v7.h
; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v8.l
; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v8.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v37.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v37.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v52.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v69.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v68.l, 3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v36.h
; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v23.l, v7.l
; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v23.h, v7.h
; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l
; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v52, v10
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v9.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v36, v10
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v36.l, 0x300, v9.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v9.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v52.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v36.h
; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v22.l, v8.l
; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v22.h, v8.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, v36.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, v36.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v52, v11
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v9.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, v67.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, v66.l, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v36, v11
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v36.l, 0x300, v9.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, 0x300, v9.h
; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v10.l
; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v10.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, v35.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, v35.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v52.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, v65.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, v64.l, 3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v36.h
; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v21.l, v9.l
; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v21.h, v9.h
; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l
; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v10.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v52, v12
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v11.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v36, v12
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v36.l, 0x300, v11.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, 0x300, v11.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v52.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v36.h
; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v20.l, v10.l
; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v20.h, v10.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v34.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v34.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v52, v13
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v11.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v55.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v54.l, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v36, v13
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v36.l, 0x300, v11.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, 0x300, v11.h
; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v12.l
; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v12.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v33.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v33.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v52.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v53.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v52.l, 3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v36.h
; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v19.l, v11.l
; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v19.h, v11.h
; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l
; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v52, v14
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v13.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v36, v14
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v36.l, 0x300, v13.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.h, 0x300, v13.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v52.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v36.h
; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v18.l, v12.l
; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v18.h, v12.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v32.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, v32.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v52, v15
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v13.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v51.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, v50.l, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v36, v15
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v36.l, 0x300, v13.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, 0x300, v13.h
; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v14.l
; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v14.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v31.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, v31.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v52.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v49.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, v48.l, 3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v36.h
; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v17.l, v13.l
; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v17.h, v13.h
; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l
; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v14.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v52, v18
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v15.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v36, v18
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v36.l, 0x300, v15.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.h, 0x300, v15.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v52.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v36.h
; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v16.l, v14.l
; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v16.h, v14.h
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v52, v17
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v15.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v36, v17
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v36.l, 0x300, v15.l
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.h, 0x300, v15.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v52.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v52, v15
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v36.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v36, v15
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -29543,775 +29158,390 @@ define inreg <16 x float> @bitcast_v64i8_to_v16f32_scalar(<64 x i8> inreg %a, i3
; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
; GFX9-NEXT: s_branch .LBB51_2
;
-; GFX11-TRUE16-LABEL: bitcast_v64i8_to_v16f32_scalar:
-; GFX11-TRUE16: ; %bb.0:
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v38, v14 :: v_dual_mov_b32 v37, v12
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v36, v10 :: v_dual_mov_b32 v35, v8
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v34, v6 :: v_dual_mov_b32 v33, v4
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v32, v2 :: v_dual_mov_b32 v31, v0
-; GFX11-TRUE16-NEXT: s_clause 0xf
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v0, off, s32 offset:56
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v39, off, s32 offset:52
-; GFX11-TRUE16-NEXT: scratch_load_b32 v2, off, s32 offset:60
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v4, off, s32
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v6, off, s32 offset:8
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v8, off, s32 offset:16
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v10, off, s32 offset:24
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v12, off, s32 offset:32
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v14, off, s32 offset:40
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v86, off, s32 offset:48
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v48, off, s32 offset:44
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v49, off, s32 offset:36
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v50, off, s32 offset:28
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v51, off, s32 offset:20
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v52, off, s32 offset:12
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v53, off, s32 offset:4
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v83, 8, v1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v84, 8, v3
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v85, 8, v5
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v70, 8, v7
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v71, 8, v9
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v80, 8, v11
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v81, 8, v13
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v82, 8, v15
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v65, 8, v17
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v66, 8, v19
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v67, 8, v21
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v68, 8, v23
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v69, 8, v25
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v27, 8, v27
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v29, 8, v29
-; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(15)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 8, v0
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(13)
-; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(12)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v54, 8, v4
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(11)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v55, 8, v6
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(10)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v64, 8, v8
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(9)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v10
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(8)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 8, v12
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 8, v14
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(6)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 8, v86
-; GFX11-TRUE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo
-; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB51_4
-; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-TRUE16-NEXT: s_and_b32 s5, s0, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s1, 8
-; GFX11-TRUE16-NEXT: s_and_b32 s7, s2, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s3, 8
-; GFX11-TRUE16-NEXT: s_or_b32 s5, s5, s6
-; GFX11-TRUE16-NEXT: s_or_b32 s6, s7, s8
-; GFX11-TRUE16-NEXT: s_and_b32 s5, s5, 0xffff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s6, 16
-; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s17, 8
-; GFX11-TRUE16-NEXT: s_or_b32 s5, s5, s6
-; GFX11-TRUE16-NEXT: s_and_b32 s6, s16, 0xff
-; GFX11-TRUE16-NEXT: s_and_b32 s8, s18, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s19, 8
-; GFX11-TRUE16-NEXT: s_or_b32 s6, s6, s7
-; GFX11-TRUE16-NEXT: s_or_b32 s7, s8, s9
-; GFX11-TRUE16-NEXT: s_and_b32 s6, s6, 0xffff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s7, 16
-; GFX11-TRUE16-NEXT: s_and_b32 s8, s20, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s21, 8
-; GFX11-TRUE16-NEXT: s_or_b32 s6, s6, s7
-; GFX11-TRUE16-NEXT: s_or_b32 s7, s8, s9
-; GFX11-TRUE16-NEXT: s_and_b32 s8, s22, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s23, 8
-; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s25, 8
-; GFX11-TRUE16-NEXT: s_or_b32 s8, s8, s9
-; GFX11-TRUE16-NEXT: s_and_b32 s9, s24, 0xff
-; GFX11-TRUE16-NEXT: s_and_b32 s7, s7, 0xffff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s8, 16
-; GFX11-TRUE16-NEXT: s_or_b32 s9, s9, s10
-; GFX11-TRUE16-NEXT: s_or_b32 s7, s7, s8
-; GFX11-TRUE16-NEXT: s_and_b32 s8, s9, 0xffff
-; GFX11-TRUE16-NEXT: s_and_b32 s9, s26, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s27, 8
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v31
-; GFX11-TRUE16-NEXT: s_or_b32 s9, s9, s10
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v32
-; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s9, 16
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v38
-; GFX11-TRUE16-NEXT: s_or_b32 s8, s8, s9
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v33
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v83
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v84
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xff, v22
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v24
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v85
-; GFX11-TRUE16-NEXT: s_and_b32 s11, s28, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s12, s29, 8
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v6, v82
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v10, v68
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v11, v69
-; GFX11-TRUE16-NEXT: s_or_b32 s10, s11, s12
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v1, v2
-; GFX11-TRUE16-NEXT: s_and_b32 s10, s10, 0xffff
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v35
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, s10, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v34
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v36
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v37
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v6
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v16
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v18
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xff, v20
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v70
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v71
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v80
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v81
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v6, v65
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v7, v66
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v9, v67
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v10, v11
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v52
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v6
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v7
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v11, v64
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v0, v1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v2, v3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v8, v12
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v13, v9
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v26
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v28
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v30
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v53
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v11
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v51
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xff, v50
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v49
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xff, v48
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v86, 0xff, v39
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v27
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v29
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v54
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v55
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v11, v17
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v12, v19
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v14, v21
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v15, v23
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v86, v86, v25
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v87, 16, v11
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v96, 0xffff, v12
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 16, v14
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v15
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v86, 16, v86
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v0, v1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v2, v3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v13, v87
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v96, v14
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v15, v86
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, s6
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s7 :: v_dual_mov_b32 v3, s8
-; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
-; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB51_3
-; GFX11-TRUE16-NEXT: .LBB51_2: ; %cmp.true
-; GFX11-TRUE16-NEXT: s_add_i32 s0, s0, 3
-; GFX11-TRUE16-NEXT: s_add_i32 s2, s2, 3
-; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 8
-; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8
-; GFX11-TRUE16-NEXT: s_or_b32 s0, s1, s0
-; GFX11-TRUE16-NEXT: s_or_b32 s1, s3, s2
-; GFX11-TRUE16-NEXT: s_addk_i32 s0, 0x300
-; GFX11-TRUE16-NEXT: s_addk_i32 s1, 0x300
-; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xffff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 16
-; GFX11-TRUE16-NEXT: s_add_i32 s16, s16, 3
-; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s1
-; GFX11-TRUE16-NEXT: s_and_b32 s1, s16, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s17, 8
-; GFX11-TRUE16-NEXT: s_add_i32 s18, s18, 3
-; GFX11-TRUE16-NEXT: s_or_b32 s1, s2, s1
-; GFX11-TRUE16-NEXT: s_and_b32 s2, s18, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s19, 8
-; GFX11-TRUE16-NEXT: s_addk_i32 s1, 0x300
-; GFX11-TRUE16-NEXT: s_or_b32 s2, s3, s2
-; GFX11-TRUE16-NEXT: s_add_i32 s20, s20, 3
-; GFX11-TRUE16-NEXT: s_addk_i32 s2, 0x300
-; GFX11-TRUE16-NEXT: s_and_b32 s1, s1, 0xffff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s2, 16
-; GFX11-TRUE16-NEXT: s_and_b32 s3, s20, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s21, 8
-; GFX11-TRUE16-NEXT: s_add_i32 s22, s22, 3
-; GFX11-TRUE16-NEXT: s_or_b32 s1, s1, s2
-; GFX11-TRUE16-NEXT: s_or_b32 s2, s4, s3
-; GFX11-TRUE16-NEXT: s_and_b32 s3, s22, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s23, 8
-; GFX11-TRUE16-NEXT: s_add_i32 s24, s24, 3
-; GFX11-TRUE16-NEXT: s_or_b32 s3, s4, s3
-; GFX11-TRUE16-NEXT: s_and_b32 s4, s24, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s25, 8
-; GFX11-TRUE16-NEXT: s_addk_i32 s2, 0x300
-; GFX11-TRUE16-NEXT: s_addk_i32 s3, 0x300
-; GFX11-TRUE16-NEXT: s_or_b32 s4, s5, s4
-; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xffff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 16
-; GFX11-TRUE16-NEXT: s_addk_i32 s4, 0x300
-; GFX11-TRUE16-NEXT: s_add_i32 s26, s26, 3
-; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3
-; GFX11-TRUE16-NEXT: s_and_b32 s3, s4, 0xffff
-; GFX11-TRUE16-NEXT: s_and_b32 s4, s26, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s27, 8
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v31
-; GFX11-TRUE16-NEXT: s_or_b32 s4, s5, s4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v32
-; GFX11-TRUE16-NEXT: s_addk_i32 s4, 0x300
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v38
-; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s4, 16
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT: s_or_b32 s3, s3, s4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v33
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 3, v22
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 3, v24
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v83, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v2
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v84, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v6
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xff, v10
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v11
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v85, v2
-; GFX11-TRUE16-NEXT: s_add_i32 s28, s28, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v82, v6
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v68, v10
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v69, v11
-; GFX11-TRUE16-NEXT: s_and_b32 s6, s28, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s29, 8
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX11-TRUE16-NEXT: s_or_b32 s5, s7, s6
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x300, v6
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 0x300, v10
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 0x300, v11
-; GFX11-TRUE16-NEXT: s_addk_i32 s5, 0x300
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v1, v2
-; GFX11-TRUE16-NEXT: s_and_b32 s5, s5, 0xffff
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v35
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, s5, v0
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v34
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v36
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v37
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v6
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v16
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v18
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v20
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v6
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v7
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xff, v9
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v10, v11
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 3, v52
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v70, v0
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v71, v1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v80, v2
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v81, v3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v65, v6
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v66, v7
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v67, v9
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v11
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x300, v6
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 0x300, v7
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 0x300, v9
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v64, v11
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v6
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v7
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 0x300, v11
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v0, v1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v2, v3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v8, v12
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v13, v9
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v26
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v28
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v30
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v53
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v11
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 3, v51
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 3, v50
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, 3, v49
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v15, 3, v48
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 3, v39
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v11
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xff, v12
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v14
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xff, v15
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xff, v16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v27, v0
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v29, v1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v54, v2
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v55, v3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v17, v11
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v19, v12
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v21, v14
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v23, v15
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v25, v16
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 0x300, v11
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 0x300, v12
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, 0x300, v14
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v15, 0x300, v15
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 0x300, v16
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v11
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v12
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 16, v14
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v15
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 16, v16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v0, v1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v2, v3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v13, v17
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v18, v14
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v15, v16
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX11-TRUE16-NEXT: .LBB51_3: ; %end
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
-; GFX11-TRUE16-NEXT: .LBB51_4:
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
-; GFX11-TRUE16-NEXT: s_branch .LBB51_2
-;
-; GFX11-FAKE16-LABEL: bitcast_v64i8_to_v16f32_scalar:
-; GFX11-FAKE16: ; %bb.0:
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v38, v14 :: v_dual_mov_b32 v37, v12
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v36, v10 :: v_dual_mov_b32 v35, v8
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v34, v6 :: v_dual_mov_b32 v33, v4
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v32, v2 :: v_dual_mov_b32 v31, v0
-; GFX11-FAKE16-NEXT: s_clause 0xf
-; GFX11-FAKE16-NEXT: scratch_load_u16 v0, off, s32 offset:56
-; GFX11-FAKE16-NEXT: scratch_load_u16 v39, off, s32 offset:52
-; GFX11-FAKE16-NEXT: scratch_load_b32 v2, off, s32 offset:60
-; GFX11-FAKE16-NEXT: scratch_load_u16 v4, off, s32
-; GFX11-FAKE16-NEXT: scratch_load_u16 v6, off, s32 offset:8
-; GFX11-FAKE16-NEXT: scratch_load_u16 v8, off, s32 offset:16
-; GFX11-FAKE16-NEXT: scratch_load_u16 v10, off, s32 offset:24
-; GFX11-FAKE16-NEXT: scratch_load_u16 v12, off, s32 offset:32
-; GFX11-FAKE16-NEXT: scratch_load_u16 v14, off, s32 offset:40
-; GFX11-FAKE16-NEXT: scratch_load_u16 v86, off, s32 offset:48
-; GFX11-FAKE16-NEXT: scratch_load_u16 v48, off, s32 offset:44
-; GFX11-FAKE16-NEXT: scratch_load_u16 v49, off, s32 offset:36
-; GFX11-FAKE16-NEXT: scratch_load_u16 v50, off, s32 offset:28
-; GFX11-FAKE16-NEXT: scratch_load_u16 v51, off, s32 offset:20
-; GFX11-FAKE16-NEXT: scratch_load_u16 v52, off, s32 offset:12
-; GFX11-FAKE16-NEXT: scratch_load_u16 v53, off, s32 offset:4
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v83, 8, v1
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v84, 8, v3
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v85, 8, v5
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v70, 8, v7
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v71, 8, v9
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v80, 8, v11
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v81, 8, v13
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v82, 8, v15
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v65, 8, v17
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v66, 8, v19
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v67, 8, v21
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v68, 8, v23
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v69, 8, v25
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v27, 8, v27
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v29, 8, v29
-; GFX11-FAKE16-NEXT: s_mov_b32 s4, 0
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(15)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v25, 8, v0
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(13)
-; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(12)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v54, 8, v4
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(11)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v55, 8, v6
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(10)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v64, 8, v8
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(9)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v17, 8, v10
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(8)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v19, 8, v12
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(7)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v21, 8, v14
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(6)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v23, 8, v86
-; GFX11-FAKE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo
-; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB51_4
-; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-FAKE16-NEXT: s_and_b32 s5, s0, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s1, 8
-; GFX11-FAKE16-NEXT: s_and_b32 s7, s2, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s3, 8
-; GFX11-FAKE16-NEXT: s_or_b32 s5, s5, s6
-; GFX11-FAKE16-NEXT: s_or_b32 s6, s7, s8
-; GFX11-FAKE16-NEXT: s_and_b32 s5, s5, 0xffff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s6, 16
-; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s17, 8
-; GFX11-FAKE16-NEXT: s_or_b32 s5, s5, s6
-; GFX11-FAKE16-NEXT: s_and_b32 s6, s16, 0xff
-; GFX11-FAKE16-NEXT: s_and_b32 s8, s18, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s9, s19, 8
-; GFX11-FAKE16-NEXT: s_or_b32 s6, s6, s7
-; GFX11-FAKE16-NEXT: s_or_b32 s7, s8, s9
-; GFX11-FAKE16-NEXT: s_and_b32 s6, s6, 0xffff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s7, 16
-; GFX11-FAKE16-NEXT: s_and_b32 s8, s20, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s9, s21, 8
-; GFX11-FAKE16-NEXT: s_or_b32 s6, s6, s7
-; GFX11-FAKE16-NEXT: s_or_b32 s7, s8, s9
-; GFX11-FAKE16-NEXT: s_and_b32 s8, s22, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s9, s23, 8
-; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s25, 8
-; GFX11-FAKE16-NEXT: s_or_b32 s8, s8, s9
-; GFX11-FAKE16-NEXT: s_and_b32 s9, s24, 0xff
-; GFX11-FAKE16-NEXT: s_and_b32 s7, s7, 0xffff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s8, 16
-; GFX11-FAKE16-NEXT: s_or_b32 s9, s9, s10
-; GFX11-FAKE16-NEXT: s_or_b32 s7, s7, s8
-; GFX11-FAKE16-NEXT: s_and_b32 s8, s9, 0xffff
-; GFX11-FAKE16-NEXT: s_and_b32 s9, s26, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s27, 8
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v31
-; GFX11-FAKE16-NEXT: s_or_b32 s9, s9, s10
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v32
-; GFX11-FAKE16-NEXT: s_lshl_b32 s9, s9, 16
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v38
-; GFX11-FAKE16-NEXT: s_or_b32 s8, s8, s9
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v33
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v83
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v84
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v22
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v24
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v85
-; GFX11-FAKE16-NEXT: s_and_b32 s11, s28, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s12, s29, 8
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v6, v82
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v68
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v11, v69
-; GFX11-FAKE16-NEXT: s_or_b32 s10, s11, s12
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v1, v2
-; GFX11-FAKE16-NEXT: s_and_b32 s10, s10, 0xffff
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v35
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, s10, v0
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v34
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v36
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v37
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff, v6
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v16
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v18
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v20
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v10
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v70
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v71
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v80
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v81
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v6, v65
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v66
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v9, v67
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v11
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1)
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v52
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v12, 16, v6
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xffff, v7
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v11, v64
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v0, v1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v2, v3
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v8, v12
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v13, v9
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v26
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v28
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v30
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v53
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xffff, v11
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v51
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v50
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v49
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v48
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v86, 0xff, v39
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v27
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v29
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v54
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v55
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v11, v17
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v12, v19
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v14, v21
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v15, v23
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v86, v86, v25
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v87, 16, v11
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v96, 0xffff, v12
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v14, 16, v14
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff, v15
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v86, 16, v86
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v0, v1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v2, v3
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v13, v87
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v96, v14
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v15, v86
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, s6
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s7 :: v_dual_mov_b32 v3, s8
-; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
-; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB51_3
-; GFX11-FAKE16-NEXT: .LBB51_2: ; %cmp.true
-; GFX11-FAKE16-NEXT: s_add_i32 s0, s0, 3
-; GFX11-FAKE16-NEXT: s_add_i32 s2, s2, 3
-; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 8
-; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 8
-; GFX11-FAKE16-NEXT: s_or_b32 s0, s1, s0
-; GFX11-FAKE16-NEXT: s_or_b32 s1, s3, s2
-; GFX11-FAKE16-NEXT: s_addk_i32 s0, 0x300
-; GFX11-FAKE16-NEXT: s_addk_i32 s1, 0x300
-; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xffff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 16
-; GFX11-FAKE16-NEXT: s_add_i32 s16, s16, 3
-; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s1
-; GFX11-FAKE16-NEXT: s_and_b32 s1, s16, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s17, 8
-; GFX11-FAKE16-NEXT: s_add_i32 s18, s18, 3
-; GFX11-FAKE16-NEXT: s_or_b32 s1, s2, s1
-; GFX11-FAKE16-NEXT: s_and_b32 s2, s18, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s19, 8
-; GFX11-FAKE16-NEXT: s_addk_i32 s1, 0x300
-; GFX11-FAKE16-NEXT: s_or_b32 s2, s3, s2
-; GFX11-FAKE16-NEXT: s_add_i32 s20, s20, 3
-; GFX11-FAKE16-NEXT: s_addk_i32 s2, 0x300
-; GFX11-FAKE16-NEXT: s_and_b32 s1, s1, 0xffff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s2, 16
-; GFX11-FAKE16-NEXT: s_and_b32 s3, s20, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s21, 8
-; GFX11-FAKE16-NEXT: s_add_i32 s22, s22, 3
-; GFX11-FAKE16-NEXT: s_or_b32 s1, s1, s2
-; GFX11-FAKE16-NEXT: s_or_b32 s2, s4, s3
-; GFX11-FAKE16-NEXT: s_and_b32 s3, s22, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s23, 8
-; GFX11-FAKE16-NEXT: s_add_i32 s24, s24, 3
-; GFX11-FAKE16-NEXT: s_or_b32 s3, s4, s3
-; GFX11-FAKE16-NEXT: s_and_b32 s4, s24, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s25, 8
-; GFX11-FAKE16-NEXT: s_addk_i32 s2, 0x300
-; GFX11-FAKE16-NEXT: s_addk_i32 s3, 0x300
-; GFX11-FAKE16-NEXT: s_or_b32 s4, s5, s4
-; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xffff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 16
-; GFX11-FAKE16-NEXT: s_addk_i32 s4, 0x300
-; GFX11-FAKE16-NEXT: s_add_i32 s26, s26, 3
-; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3
-; GFX11-FAKE16-NEXT: s_and_b32 s3, s4, 0xffff
-; GFX11-FAKE16-NEXT: s_and_b32 s4, s26, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s27, 8
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v31
-; GFX11-FAKE16-NEXT: s_or_b32 s4, s5, s4
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v32
-; GFX11-FAKE16-NEXT: s_addk_i32 s4, 0x300
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 3, v38
-; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s4, 16
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-FAKE16-NEXT: s_or_b32 s3, s3, s4
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v33
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, 3, v22
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, 3, v24
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v83, v0
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v84, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v6
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v10
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v11
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v85, v2
-; GFX11-FAKE16-NEXT: s_add_i32 s28, s28, 3
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v82, v6
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v68, v10
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v69, v11
-; GFX11-FAKE16-NEXT: s_and_b32 s6, s28, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s29, 8
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX11-FAKE16-NEXT: s_or_b32 s5, s7, s6
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 0x300, v6
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, 0x300, v10
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, 0x300, v11
-; GFX11-FAKE16-NEXT: s_addk_i32 s5, 0x300
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v1, v2
-; GFX11-FAKE16-NEXT: s_and_b32 s5, s5, 0xffff
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v35
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, s5, v0
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v34
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v36
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v37
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff, v6
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 3, v16
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 3, v18
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, 3, v20
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v10
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v6
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v7
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v9
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v11
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1)
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, 3, v52
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v70, v0
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v71, v1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v80, v2
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v81, v3
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v65, v6
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v66, v7
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v67, v9
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v11
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 0x300, v6
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 0x300, v7
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, 0x300, v9
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v64, v11
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v12, 16, v6
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xffff, v7
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, 0x300, v11
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v0, v1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v2, v3
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v8, v12
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v13, v9
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v26
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v28
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v30
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v53
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xffff, v11
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, 3, v51
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, 3, v50
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v14, 3, v49
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v15, 3, v48
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v16, 3, v39
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v11
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v12
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v14
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v15
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v16
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v27, v0
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v29, v1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v54, v2
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v55, v3
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v17, v11
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v19, v12
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v21, v14
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v23, v15
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v25, v16
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, 0x300, v11
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, 0x300, v12
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v14, 0x300, v14
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v15, 0x300, v15
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v16, 0x300, v16
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v17, 16, v11
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v12
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v14, 16, v14
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff, v15
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v16, 16, v16
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v0, v1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v2, v3
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v13, v17
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v18, v14
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v15, v16
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX11-FAKE16-NEXT: .LBB51_3: ; %end
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
-; GFX11-FAKE16-NEXT: .LBB51_4:
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
-; GFX11-FAKE16-NEXT: s_branch .LBB51_2
+; GFX11-LABEL: bitcast_v64i8_to_v16f32_scalar:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_dual_mov_b32 v38, v14 :: v_dual_mov_b32 v37, v12
+; GFX11-NEXT: v_dual_mov_b32 v36, v10 :: v_dual_mov_b32 v35, v8
+; GFX11-NEXT: v_dual_mov_b32 v34, v6 :: v_dual_mov_b32 v33, v4
+; GFX11-NEXT: v_dual_mov_b32 v32, v2 :: v_dual_mov_b32 v31, v0
+; GFX11-NEXT: s_clause 0xf
+; GFX11-NEXT: scratch_load_u16 v0, off, s32 offset:56
+; GFX11-NEXT: scratch_load_u16 v39, off, s32 offset:52
+; GFX11-NEXT: scratch_load_b32 v2, off, s32 offset:60
+; GFX11-NEXT: scratch_load_u16 v4, off, s32
+; GFX11-NEXT: scratch_load_u16 v6, off, s32 offset:8
+; GFX11-NEXT: scratch_load_u16 v8, off, s32 offset:16
+; GFX11-NEXT: scratch_load_u16 v10, off, s32 offset:24
+; GFX11-NEXT: scratch_load_u16 v12, off, s32 offset:32
+; GFX11-NEXT: scratch_load_u16 v14, off, s32 offset:40
+; GFX11-NEXT: scratch_load_u16 v86, off, s32 offset:48
+; GFX11-NEXT: scratch_load_u16 v48, off, s32 offset:44
+; GFX11-NEXT: scratch_load_u16 v49, off, s32 offset:36
+; GFX11-NEXT: scratch_load_u16 v50, off, s32 offset:28
+; GFX11-NEXT: scratch_load_u16 v51, off, s32 offset:20
+; GFX11-NEXT: scratch_load_u16 v52, off, s32 offset:12
+; GFX11-NEXT: scratch_load_u16 v53, off, s32 offset:4
+; GFX11-NEXT: v_lshlrev_b32_e32 v83, 8, v1
+; GFX11-NEXT: v_lshlrev_b32_e32 v84, 8, v3
+; GFX11-NEXT: v_lshlrev_b32_e32 v85, 8, v5
+; GFX11-NEXT: v_lshlrev_b32_e32 v70, 8, v7
+; GFX11-NEXT: v_lshlrev_b32_e32 v71, 8, v9
+; GFX11-NEXT: v_lshlrev_b32_e32 v80, 8, v11
+; GFX11-NEXT: v_lshlrev_b32_e32 v81, 8, v13
+; GFX11-NEXT: v_lshlrev_b32_e32 v82, 8, v15
+; GFX11-NEXT: v_lshlrev_b32_e32 v65, 8, v17
+; GFX11-NEXT: v_lshlrev_b32_e32 v66, 8, v19
+; GFX11-NEXT: v_lshlrev_b32_e32 v67, 8, v21
+; GFX11-NEXT: v_lshlrev_b32_e32 v68, 8, v23
+; GFX11-NEXT: v_lshlrev_b32_e32 v69, 8, v25
+; GFX11-NEXT: v_lshlrev_b32_e32 v27, 8, v27
+; GFX11-NEXT: v_lshlrev_b32_e32 v29, 8, v29
+; GFX11-NEXT: s_mov_b32 s4, 0
+; GFX11-NEXT: s_waitcnt vmcnt(15)
+; GFX11-NEXT: v_lshlrev_b32_e32 v25, 8, v0
+; GFX11-NEXT: s_waitcnt vmcnt(13)
+; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2
+; GFX11-NEXT: s_waitcnt vmcnt(12)
+; GFX11-NEXT: v_lshlrev_b32_e32 v54, 8, v4
+; GFX11-NEXT: s_waitcnt vmcnt(11)
+; GFX11-NEXT: v_lshlrev_b32_e32 v55, 8, v6
+; GFX11-NEXT: s_waitcnt vmcnt(10)
+; GFX11-NEXT: v_lshlrev_b32_e32 v64, 8, v8
+; GFX11-NEXT: s_waitcnt vmcnt(9)
+; GFX11-NEXT: v_lshlrev_b32_e32 v17, 8, v10
+; GFX11-NEXT: s_waitcnt vmcnt(8)
+; GFX11-NEXT: v_lshlrev_b32_e32 v19, 8, v12
+; GFX11-NEXT: s_waitcnt vmcnt(7)
+; GFX11-NEXT: v_lshlrev_b32_e32 v21, 8, v14
+; GFX11-NEXT: s_waitcnt vmcnt(6)
+; GFX11-NEXT: v_lshlrev_b32_e32 v23, 8, v86
+; GFX11-NEXT: s_and_b32 s5, vcc_lo, exec_lo
+; GFX11-NEXT: s_cbranch_scc0 .LBB51_4
+; GFX11-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-NEXT: s_and_b32 s5, s0, 0xff
+; GFX11-NEXT: s_lshl_b32 s6, s1, 8
+; GFX11-NEXT: s_and_b32 s7, s2, 0xff
+; GFX11-NEXT: s_lshl_b32 s8, s3, 8
+; GFX11-NEXT: s_or_b32 s5, s5, s6
+; GFX11-NEXT: s_or_b32 s6, s7, s8
+; GFX11-NEXT: s_and_b32 s5, s5, 0xffff
+; GFX11-NEXT: s_lshl_b32 s6, s6, 16
+; GFX11-NEXT: s_lshl_b32 s7, s17, 8
+; GFX11-NEXT: s_or_b32 s5, s5, s6
+; GFX11-NEXT: s_and_b32 s6, s16, 0xff
+; GFX11-NEXT: s_and_b32 s8, s18, 0xff
+; GFX11-NEXT: s_lshl_b32 s9, s19, 8
+; GFX11-NEXT: s_or_b32 s6, s6, s7
+; GFX11-NEXT: s_or_b32 s7, s8, s9
+; GFX11-NEXT: s_and_b32 s6, s6, 0xffff
+; GFX11-NEXT: s_lshl_b32 s7, s7, 16
+; GFX11-NEXT: s_and_b32 s8, s20, 0xff
+; GFX11-NEXT: s_lshl_b32 s9, s21, 8
+; GFX11-NEXT: s_or_b32 s6, s6, s7
+; GFX11-NEXT: s_or_b32 s7, s8, s9
+; GFX11-NEXT: s_and_b32 s8, s22, 0xff
+; GFX11-NEXT: s_lshl_b32 s9, s23, 8
+; GFX11-NEXT: s_lshl_b32 s10, s25, 8
+; GFX11-NEXT: s_or_b32 s8, s8, s9
+; GFX11-NEXT: s_and_b32 s9, s24, 0xff
+; GFX11-NEXT: s_and_b32 s7, s7, 0xffff
+; GFX11-NEXT: s_lshl_b32 s8, s8, 16
+; GFX11-NEXT: s_or_b32 s9, s9, s10
+; GFX11-NEXT: s_or_b32 s7, s7, s8
+; GFX11-NEXT: s_and_b32 s8, s9, 0xffff
+; GFX11-NEXT: s_and_b32 s9, s26, 0xff
+; GFX11-NEXT: s_lshl_b32 s10, s27, 8
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v31
+; GFX11-NEXT: s_or_b32 s9, s9, s10
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v32
+; GFX11-NEXT: s_lshl_b32 s9, s9, 16
+; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v38
+; GFX11-NEXT: s_or_b32 s8, s8, s9
+; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v33
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v83
+; GFX11-NEXT: v_or_b32_e32 v1, v1, v84
+; GFX11-NEXT: v_and_b32_e32 v10, 0xff, v22
+; GFX11-NEXT: v_and_b32_e32 v11, 0xff, v24
+; GFX11-NEXT: v_or_b32_e32 v2, v2, v85
+; GFX11-NEXT: s_and_b32 s11, s28, 0xff
+; GFX11-NEXT: s_lshl_b32 s12, s29, 8
+; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-NEXT: v_or_b32_e32 v6, v6, v82
+; GFX11-NEXT: v_or_b32_e32 v10, v10, v68
+; GFX11-NEXT: v_or_b32_e32 v11, v11, v69
+; GFX11-NEXT: s_or_b32 s10, s11, s12
+; GFX11-NEXT: v_or_b32_e32 v5, v1, v2
+; GFX11-NEXT: s_and_b32 s10, s10, 0xffff
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v35
+; GFX11-NEXT: v_or_b32_e32 v4, s10, v0
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v34
+; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v36
+; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v37
+; GFX11-NEXT: v_and_b32_e32 v8, 0xffff, v6
+; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v16
+; GFX11-NEXT: v_and_b32_e32 v7, 0xff, v18
+; GFX11-NEXT: v_and_b32_e32 v9, 0xff, v20
+; GFX11-NEXT: v_and_b32_e32 v10, 0xffff, v10
+; GFX11-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v70
+; GFX11-NEXT: v_or_b32_e32 v1, v1, v71
+; GFX11-NEXT: v_or_b32_e32 v2, v2, v80
+; GFX11-NEXT: v_or_b32_e32 v3, v3, v81
+; GFX11-NEXT: v_or_b32_e32 v6, v6, v65
+; GFX11-NEXT: v_or_b32_e32 v7, v7, v66
+; GFX11-NEXT: v_or_b32_e32 v9, v9, v67
+; GFX11-NEXT: v_or_b32_e32 v10, v10, v11
+; GFX11-NEXT: s_waitcnt vmcnt(1)
+; GFX11-NEXT: v_and_b32_e32 v11, 0xff, v52
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-NEXT: v_lshlrev_b32_e32 v12, 16, v6
+; GFX11-NEXT: v_and_b32_e32 v13, 0xffff, v7
+; GFX11-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX11-NEXT: v_or_b32_e32 v11, v11, v64
+; GFX11-NEXT: v_or_b32_e32 v6, v0, v1
+; GFX11-NEXT: v_or_b32_e32 v7, v2, v3
+; GFX11-NEXT: v_or_b32_e32 v8, v8, v12
+; GFX11-NEXT: v_or_b32_e32 v9, v13, v9
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v26
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v28
+; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v30
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v53
+; GFX11-NEXT: v_and_b32_e32 v13, 0xffff, v11
+; GFX11-NEXT: v_and_b32_e32 v11, 0xff, v51
+; GFX11-NEXT: v_and_b32_e32 v12, 0xff, v50
+; GFX11-NEXT: v_and_b32_e32 v14, 0xff, v49
+; GFX11-NEXT: v_and_b32_e32 v15, 0xff, v48
+; GFX11-NEXT: v_and_b32_e32 v86, 0xff, v39
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v27
+; GFX11-NEXT: v_or_b32_e32 v1, v1, v29
+; GFX11-NEXT: v_or_b32_e32 v2, v2, v54
+; GFX11-NEXT: v_or_b32_e32 v3, v3, v55
+; GFX11-NEXT: v_or_b32_e32 v11, v11, v17
+; GFX11-NEXT: v_or_b32_e32 v12, v12, v19
+; GFX11-NEXT: v_or_b32_e32 v14, v14, v21
+; GFX11-NEXT: v_or_b32_e32 v15, v15, v23
+; GFX11-NEXT: v_or_b32_e32 v86, v86, v25
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-NEXT: v_lshlrev_b32_e32 v87, 16, v11
+; GFX11-NEXT: v_and_b32_e32 v96, 0xffff, v12
+; GFX11-NEXT: v_lshlrev_b32_e32 v14, 16, v14
+; GFX11-NEXT: v_and_b32_e32 v15, 0xffff, v15
+; GFX11-NEXT: v_lshlrev_b32_e32 v86, 16, v86
+; GFX11-NEXT: v_or_b32_e32 v11, v0, v1
+; GFX11-NEXT: v_or_b32_e32 v12, v2, v3
+; GFX11-NEXT: v_or_b32_e32 v13, v13, v87
+; GFX11-NEXT: v_or_b32_e32 v14, v96, v14
+; GFX11-NEXT: v_or_b32_e32 v15, v15, v86
+; GFX11-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, s6
+; GFX11-NEXT: v_dual_mov_b32 v2, s7 :: v_dual_mov_b32 v3, s8
+; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
+; GFX11-NEXT: s_cbranch_vccnz .LBB51_3
+; GFX11-NEXT: .LBB51_2: ; %cmp.true
+; GFX11-NEXT: s_add_i32 s0, s0, 3
+; GFX11-NEXT: s_add_i32 s2, s2, 3
+; GFX11-NEXT: s_and_b32 s0, s0, 0xff
+; GFX11-NEXT: s_lshl_b32 s1, s1, 8
+; GFX11-NEXT: s_and_b32 s2, s2, 0xff
+; GFX11-NEXT: s_lshl_b32 s3, s3, 8
+; GFX11-NEXT: s_or_b32 s0, s1, s0
+; GFX11-NEXT: s_or_b32 s1, s3, s2
+; GFX11-NEXT: s_addk_i32 s0, 0x300
+; GFX11-NEXT: s_addk_i32 s1, 0x300
+; GFX11-NEXT: s_and_b32 s0, s0, 0xffff
+; GFX11-NEXT: s_lshl_b32 s1, s1, 16
+; GFX11-NEXT: s_add_i32 s16, s16, 3
+; GFX11-NEXT: s_or_b32 s0, s0, s1
+; GFX11-NEXT: s_and_b32 s1, s16, 0xff
+; GFX11-NEXT: s_lshl_b32 s2, s17, 8
+; GFX11-NEXT: s_add_i32 s18, s18, 3
+; GFX11-NEXT: s_or_b32 s1, s2, s1
+; GFX11-NEXT: s_and_b32 s2, s18, 0xff
+; GFX11-NEXT: s_lshl_b32 s3, s19, 8
+; GFX11-NEXT: s_addk_i32 s1, 0x300
+; GFX11-NEXT: s_or_b32 s2, s3, s2
+; GFX11-NEXT: s_add_i32 s20, s20, 3
+; GFX11-NEXT: s_addk_i32 s2, 0x300
+; GFX11-NEXT: s_and_b32 s1, s1, 0xffff
+; GFX11-NEXT: s_lshl_b32 s2, s2, 16
+; GFX11-NEXT: s_and_b32 s3, s20, 0xff
+; GFX11-NEXT: s_lshl_b32 s4, s21, 8
+; GFX11-NEXT: s_add_i32 s22, s22, 3
+; GFX11-NEXT: s_or_b32 s1, s1, s2
+; GFX11-NEXT: s_or_b32 s2, s4, s3
+; GFX11-NEXT: s_and_b32 s3, s22, 0xff
+; GFX11-NEXT: s_lshl_b32 s4, s23, 8
+; GFX11-NEXT: s_add_i32 s24, s24, 3
+; GFX11-NEXT: s_or_b32 s3, s4, s3
+; GFX11-NEXT: s_and_b32 s4, s24, 0xff
+; GFX11-NEXT: s_lshl_b32 s5, s25, 8
+; GFX11-NEXT: s_addk_i32 s2, 0x300
+; GFX11-NEXT: s_addk_i32 s3, 0x300
+; GFX11-NEXT: s_or_b32 s4, s5, s4
+; GFX11-NEXT: s_and_b32 s2, s2, 0xffff
+; GFX11-NEXT: s_lshl_b32 s3, s3, 16
+; GFX11-NEXT: s_addk_i32 s4, 0x300
+; GFX11-NEXT: s_add_i32 s26, s26, 3
+; GFX11-NEXT: s_or_b32 s2, s2, s3
+; GFX11-NEXT: s_and_b32 s3, s4, 0xffff
+; GFX11-NEXT: s_and_b32 s4, s26, 0xff
+; GFX11-NEXT: s_lshl_b32 s5, s27, 8
+; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v31
+; GFX11-NEXT: s_or_b32 s4, s5, s4
+; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v32
+; GFX11-NEXT: s_addk_i32 s4, 0x300
+; GFX11-NEXT: v_add_nc_u32_e32 v6, 3, v38
+; GFX11-NEXT: s_lshl_b32 s4, s4, 16
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0
+; GFX11-NEXT: s_or_b32 s3, s3, s4
+; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v33
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1
+; GFX11-NEXT: v_add_nc_u32_e32 v10, 3, v22
+; GFX11-NEXT: v_add_nc_u32_e32 v11, 3, v24
+; GFX11-NEXT: v_or_b32_e32 v0, v83, v0
+; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v2
+; GFX11-NEXT: v_or_b32_e32 v1, v84, v1
+; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v6
+; GFX11-NEXT: v_and_b32_e32 v10, 0xff, v10
+; GFX11-NEXT: v_and_b32_e32 v11, 0xff, v11
+; GFX11-NEXT: v_or_b32_e32 v2, v85, v2
+; GFX11-NEXT: s_add_i32 s28, s28, 3
+; GFX11-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
+; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
+; GFX11-NEXT: v_or_b32_e32 v6, v82, v6
+; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
+; GFX11-NEXT: v_or_b32_e32 v10, v68, v10
+; GFX11-NEXT: v_or_b32_e32 v11, v69, v11
+; GFX11-NEXT: s_and_b32 s6, s28, 0xff
+; GFX11-NEXT: s_lshl_b32 s7, s29, 8
+; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-NEXT: s_or_b32 s5, s7, s6
+; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-NEXT: v_add_nc_u32_e32 v6, 0x300, v6
+; GFX11-NEXT: v_add_nc_u32_e32 v10, 0x300, v10
+; GFX11-NEXT: v_add_nc_u32_e32 v11, 0x300, v11
+; GFX11-NEXT: s_addk_i32 s5, 0x300
+; GFX11-NEXT: v_or_b32_e32 v5, v1, v2
+; GFX11-NEXT: s_and_b32 s5, s5, 0xffff
+; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v35
+; GFX11-NEXT: v_or_b32_e32 v4, s5, v0
+; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v34
+; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v36
+; GFX11-NEXT: v_add_nc_u32_e32 v3, 3, v37
+; GFX11-NEXT: v_and_b32_e32 v8, 0xffff, v6
+; GFX11-NEXT: v_add_nc_u32_e32 v6, 3, v16
+; GFX11-NEXT: v_add_nc_u32_e32 v7, 3, v18
+; GFX11-NEXT: v_add_nc_u32_e32 v9, 3, v20
+; GFX11-NEXT: v_and_b32_e32 v10, 0xffff, v10
+; GFX11-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1
+; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v2
+; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v3
+; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v6
+; GFX11-NEXT: v_and_b32_e32 v7, 0xff, v7
+; GFX11-NEXT: v_and_b32_e32 v9, 0xff, v9
+; GFX11-NEXT: v_or_b32_e32 v10, v10, v11
+; GFX11-NEXT: s_waitcnt vmcnt(1)
+; GFX11-NEXT: v_add_nc_u32_e32 v11, 3, v52
+; GFX11-NEXT: v_or_b32_e32 v0, v70, v0
+; GFX11-NEXT: v_or_b32_e32 v1, v71, v1
+; GFX11-NEXT: v_or_b32_e32 v2, v80, v2
+; GFX11-NEXT: v_or_b32_e32 v3, v81, v3
+; GFX11-NEXT: v_or_b32_e32 v6, v65, v6
+; GFX11-NEXT: v_or_b32_e32 v7, v66, v7
+; GFX11-NEXT: v_or_b32_e32 v9, v67, v9
+; GFX11-NEXT: v_and_b32_e32 v11, 0xff, v11
+; GFX11-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
+; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
+; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
+; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x300, v3
+; GFX11-NEXT: v_add_nc_u32_e32 v6, 0x300, v6
+; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x300, v7
+; GFX11-NEXT: v_add_nc_u32_e32 v9, 0x300, v9
+; GFX11-NEXT: v_or_b32_e32 v11, v64, v11
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-NEXT: v_lshlrev_b32_e32 v12, 16, v6
+; GFX11-NEXT: v_and_b32_e32 v13, 0xffff, v7
+; GFX11-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX11-NEXT: v_add_nc_u32_e32 v11, 0x300, v11
+; GFX11-NEXT: v_or_b32_e32 v6, v0, v1
+; GFX11-NEXT: v_or_b32_e32 v7, v2, v3
+; GFX11-NEXT: v_or_b32_e32 v8, v8, v12
+; GFX11-NEXT: v_or_b32_e32 v9, v13, v9
+; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v26
+; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v28
+; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v30
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_add_nc_u32_e32 v3, 3, v53
+; GFX11-NEXT: v_and_b32_e32 v13, 0xffff, v11
+; GFX11-NEXT: v_add_nc_u32_e32 v11, 3, v51
+; GFX11-NEXT: v_add_nc_u32_e32 v12, 3, v50
+; GFX11-NEXT: v_add_nc_u32_e32 v14, 3, v49
+; GFX11-NEXT: v_add_nc_u32_e32 v15, 3, v48
+; GFX11-NEXT: v_add_nc_u32_e32 v16, 3, v39
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1
+; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v2
+; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v3
+; GFX11-NEXT: v_and_b32_e32 v11, 0xff, v11
+; GFX11-NEXT: v_and_b32_e32 v12, 0xff, v12
+; GFX11-NEXT: v_and_b32_e32 v14, 0xff, v14
+; GFX11-NEXT: v_and_b32_e32 v15, 0xff, v15
+; GFX11-NEXT: v_and_b32_e32 v16, 0xff, v16
+; GFX11-NEXT: v_or_b32_e32 v0, v27, v0
+; GFX11-NEXT: v_or_b32_e32 v1, v29, v1
+; GFX11-NEXT: v_or_b32_e32 v2, v54, v2
+; GFX11-NEXT: v_or_b32_e32 v3, v55, v3
+; GFX11-NEXT: v_or_b32_e32 v11, v17, v11
+; GFX11-NEXT: v_or_b32_e32 v12, v19, v12
+; GFX11-NEXT: v_or_b32_e32 v14, v21, v14
+; GFX11-NEXT: v_or_b32_e32 v15, v23, v15
+; GFX11-NEXT: v_or_b32_e32 v16, v25, v16
+; GFX11-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
+; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
+; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
+; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x300, v3
+; GFX11-NEXT: v_add_nc_u32_e32 v11, 0x300, v11
+; GFX11-NEXT: v_add_nc_u32_e32 v12, 0x300, v12
+; GFX11-NEXT: v_add_nc_u32_e32 v14, 0x300, v14
+; GFX11-NEXT: v_add_nc_u32_e32 v15, 0x300, v15
+; GFX11-NEXT: v_add_nc_u32_e32 v16, 0x300, v16
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-NEXT: v_lshlrev_b32_e32 v17, 16, v11
+; GFX11-NEXT: v_and_b32_e32 v18, 0xffff, v12
+; GFX11-NEXT: v_lshlrev_b32_e32 v14, 16, v14
+; GFX11-NEXT: v_and_b32_e32 v15, 0xffff, v15
+; GFX11-NEXT: v_lshlrev_b32_e32 v16, 16, v16
+; GFX11-NEXT: v_or_b32_e32 v11, v0, v1
+; GFX11-NEXT: v_or_b32_e32 v12, v2, v3
+; GFX11-NEXT: v_or_b32_e32 v13, v13, v17
+; GFX11-NEXT: v_or_b32_e32 v14, v18, v14
+; GFX11-NEXT: v_or_b32_e32 v15, v15, v16
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-NEXT: .LBB51_3: ; %end
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-NEXT: .LBB51_4:
+; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+; GFX11-NEXT: s_branch .LBB51_2
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -41466,107 +40696,107 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) {
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: s_clause 0x1f
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v64, off, s32 offset:128
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:124
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v64, off, s32 offset:120
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:116
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v65, off, s32 offset:112
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:108
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v65, off, s32 offset:104
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:100
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v66, off, s32 offset:96
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:92
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v66, off, s32 offset:88
-; GFX11-TRUE16-NEXT: scratch_load_b32 v81, off, s32 offset:132
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v67, off, s32
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v67, off, s32 offset:8
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v68, off, s32 offset:16
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v68, off, s32 offset:24
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v69, off, s32 offset:32
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v69, off, s32 offset:40
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v70, off, s32 offset:48
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v70, off, s32 offset:56
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v71, off, s32 offset:64
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v71, off, s32 offset:72
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v80, off, s32 offset:80
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v33, off, s32 offset:84
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:76
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v34, off, s32 offset:68
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:60
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v35, off, s32 offset:52
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v36, off, s32 offset:44
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v36, off, s32 offset:36
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v37, off, s32 offset:28
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v37, off, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_load_u16 v80, off, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_load_u16 v48, off, s32 offset:124
+; GFX11-TRUE16-NEXT: scratch_load_u16 v81, off, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_load_u16 v49, off, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_load_u16 v82, off, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_load_u16 v50, off, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_load_u16 v83, off, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_load_u16 v51, off, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_load_u16 v84, off, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_load_u16 v52, off, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_load_u16 v85, off, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_load_b32 v86, off, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_load_u16 v87, off, s32
+; GFX11-TRUE16-NEXT: scratch_load_u16 v96, off, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_load_u16 v97, off, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_load_u16 v98, off, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_load_u16 v99, off, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_load_u16 v100, off, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_load_u16 v101, off, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_load_u16 v102, off, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_load_u16 v103, off, s32 offset:64
+; GFX11-TRUE16-NEXT: scratch_load_u16 v112, off, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_load_u16 v113, off, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_load_u16 v53, off, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_load_u16 v54, off, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_load_u16 v55, off, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_load_u16 v64, off, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_load_u16 v65, off, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_load_u16 v66, off, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_load_u16 v67, off, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_load_u16 v68, off, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_load_u16 v69, off, s32 offset:20
; GFX11-TRUE16-NEXT: s_clause 0x1
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:12
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v38, off, s32 offset:4
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v80.h, v29.l
+; GFX11-TRUE16-NEXT: scratch_load_u16 v70, off, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_load_u16 v71, off, s32 offset:4
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(32)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.h, v29.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, v27.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, v22.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.h, v20.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.h, v18.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v16.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v14.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.h, v12.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.l, v10.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.h, v8.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.h, v6.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.l, v4.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.h, v2.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.h, v0.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.h, 8, v1.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.l, 8, v3.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.l, 8, v5.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.l, 8, v7.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.l, 8, v9.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.h, 8, v11.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.l, 8, v13.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.h, 8, v15.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v39.h, 8, v17.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v48.l, 8, v19.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, v14.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.h, v12.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v10.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.h, v8.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v6.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v4.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.h, v2.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.h, v0.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v38.h, 8, v1.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v39.l, 8, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v37.l, 8, v5.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v38.l, 8, v7.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v35.l, 8, v9.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v35.h, 8, v11.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.l, 8, v13.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v15.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v31.h, 8, v17.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v32.l, 8, v19.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v29.h, 8, v21.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v30.h, 8, v23.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v27.l, 8, v25.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v27.h, 8, v27.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.l, 8, v80.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.l, 8, v48.h
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(33)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v64.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v80.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(31)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v64.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v81.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(29)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v65.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v82.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(27)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v65.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v83.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(25)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v66.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v84.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(23)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v66.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v85.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(21)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.h, 8, v67.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.h, 8, v87.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(20)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.l, 8, v67.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.l, 8, v96.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(19)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.h, 8, v68.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.h, 8, v97.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(18)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.l, 8, v68.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.l, 8, v98.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(17)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v69.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v99.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(16)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v69.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v100.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(15)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v70.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v101.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(14)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.l, 8, v70.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.l, 8, v102.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(13)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v71.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v103.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(12)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v71.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v112.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(11)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v80.l
-; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v81
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v113.l
+; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v86
; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB70_3
; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow
@@ -41577,95 +40807,95 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-TRUE16-NEXT: .LBB70_3: ; %cmp.false
-; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v55.h
-; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v53.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.h, 0
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v52.h
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v52.l
-; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v0.l, v54.h
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v55.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v64.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v1.l, v54.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v64.h
-; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v49.h
-; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v49.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v64, v0
-; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v1.h, v53.l
+; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v39.h
+; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v37.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v80.h, 0
+; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v36.h
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v36.l
+; GFX11-TRUE16-NEXT: v_or_b16 v80.l, v0.l, v38.h
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v39.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v80.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v1.l, v38.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v80.h
+; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v33.h
+; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v33.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v80, v0
+; GFX11-TRUE16-NEXT: v_or_b16 v80.l, v1.h, v37.l
; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v29.l
; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v28.h
-; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v51.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v35.h
; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v26.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v64, v3
-; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v2.l, v51.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v64.h
-; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v48.h
-; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v39.l
-; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v48.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v80, v3
+; GFX11-TRUE16-NEXT: v_or_b16 v80.l, v2.l, v35.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v80.h
+; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v32.h
+; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v31.l
+; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v32.l
; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v24.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v64, v2
-; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v3.l, v50.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v50.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v64.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v80, v2
+; GFX11-TRUE16-NEXT: v_or_b16 v80.l, v3.l, v34.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v34.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v80.h
; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v30.h
; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v24.l
; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v26.l
; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v28.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v64, v3
-; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v4.l, v39.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v64.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v80, v3
+; GFX11-TRUE16-NEXT: v_or_b16 v80.l, v4.l, v31.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v80.h
; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v27.h
; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v30.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v38.h
-; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v38.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v64, v4
-; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v5.l, v29.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v64.h
+; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v71.l
+; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v70.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v80, v4
+; GFX11-TRUE16-NEXT: v_or_b16 v80.l, v5.l, v29.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v80.h
; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v25.h
; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v23.h
-; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v37.h
-; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v37.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v64, v5
-; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v6.l, v27.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v64.h
-; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v36.h
+; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v69.l
+; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v68.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v80, v5
+; GFX11-TRUE16-NEXT: v_or_b16 v80.l, v6.l, v27.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v80.h
+; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v67.l
; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v22.h
-; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v36.l
-; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v35.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v64, v6
-; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v7.l, v25.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v64.h
+; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v66.l
+; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v65.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v80, v6
+; GFX11-TRUE16-NEXT: v_or_b16 v80.l, v7.l, v25.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v80.h
; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v21.h
-; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v35.l
-; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v34.h
-; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v34.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v64, v7
-; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v8.l, v23.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v64.h
+; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v64.l
+; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v55.l
+; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v54.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v80, v7
+; GFX11-TRUE16-NEXT: v_or_b16 v80.l, v8.l, v23.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v80.h
; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v11.h, v20.h
; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v19.h
-; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v33.h
-; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v33.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v64, v8
-; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v9.l, v22.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v64.h
-; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v32.h
+; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v53.l
+; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v52.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v80, v8
+; GFX11-TRUE16-NEXT: v_or_b16 v80.l, v9.l, v22.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v80.h
+; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v51.l
; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v13.h, v18.h
-; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v32.l
-; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v31.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v64, v9
-; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v10.l, v21.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v64.h
+; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v50.l
+; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v49.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v80, v9
+; GFX11-TRUE16-NEXT: v_or_b16 v80.l, v10.l, v21.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v80.h
; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v17.h
-; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v31.l
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16
+; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v48.l
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_hi16
@@ -41674,32 +40904,32 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_lo16
@@ -41717,199 +40947,199 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_hi16
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v64, v10
-; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v11.l, v20.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v64.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v80, v10
+; GFX11-TRUE16-NEXT: v_or_b16 v80.l, v11.l, v20.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v80.h
; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v15.h, v16.h
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_hi16
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v64, v11
-; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v12.l, v19.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v64.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v80, v11
+; GFX11-TRUE16-NEXT: v_or_b16 v80.l, v12.l, v19.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v80.h
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v64, v12
-; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v13.l, v18.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v64.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v80, v12
+; GFX11-TRUE16-NEXT: v_or_b16 v80.l, v13.l, v18.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v80.h
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v64, v13
-; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v14.l, v17.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v64.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v80, v13
+; GFX11-TRUE16-NEXT: v_or_b16 v80.l, v14.l, v17.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v80.h
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v64, v14
-; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v15.l, v16.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v64.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v80, v14
+; GFX11-TRUE16-NEXT: v_or_b16 v80.l, v15.l, v16.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v80.h
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_lo16
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v64, v15
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v80, v15
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB70_2
; GFX11-TRUE16-NEXT: .LBB70_4: ; %cmp.true
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v55.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v53.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v52.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v52.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.h, 0
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v39.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v37.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v36.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v36.l, 3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, 0
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v52.h
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v54.h, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v55.l, v0.h
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v54.l, v1.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v53.l, v1.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v49.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v0.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v36.h
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v38.h, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v39.l, v0.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v38.l, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v37.l, v1.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v33.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v36.l, 0x300, v0.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v0.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v49.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v33.l, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, 0x300, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v48.h, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v52, v3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v1.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v32.h, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v36, v3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v36.l, 0x300, v1.h
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v39.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v52.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v51.l, v1.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v31.l, 3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v36.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v35.l, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v51.h, v1.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v35.h, v1.h
; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v52, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v3.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v52.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v36, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v36.l, 0x300, v3.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v36.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v3.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v50.l, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v50.h, v2.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v34.l, v2.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v34.h, v2.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v29.l, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v28.h, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v52, v5
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v3.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v36, v5
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v36.l, 0x300, v3.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v3.h
; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v4.l
; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v4.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v26.h, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v24.h, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v52.h
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v39.h, v3.l
-; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v48.l, v3.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v36.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v31.h, v3.l
+; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v32.l, v3.h
; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l
; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v52, v6
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v5.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v36, v6
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v36.l, 0x300, v5.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x300, v5.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v52.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v36.h
; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v29.h, v4.l
; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v30.h, v4.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v24.l, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v26.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v52, v7
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v5.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v36, v7
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v36.l, 0x300, v5.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, 0x300, v5.h
; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v6.l
; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v6.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v28.l, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v30.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v52.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v36.h
; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v27.l, v5.l
; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v27.h, v5.h
; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l
; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v52, v8
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v7.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v36, v8
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v36.l, 0x300, v7.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, 0x300, v7.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v52.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v36.h
; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v25.l, v6.l
; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v25.h, v6.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v38.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v38.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v52, v9
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v7.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v71.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v70.l, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v36, v9
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v36.l, 0x300, v7.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, 0x300, v7.h
; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v8.l
; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v8.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v37.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v37.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v52.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v69.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v68.l, 3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v36.h
; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v23.l, v7.l
; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v23.h, v7.h
; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l
; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v52, v10
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v9.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v36, v10
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v36.l, 0x300, v9.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v9.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v52.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v36.h
; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v22.l, v8.l
; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v22.h, v8.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, v36.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, v36.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v52, v11
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v9.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, v67.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, v66.l, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v36, v11
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v36.l, 0x300, v9.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, 0x300, v9.h
; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v10.l
; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v10.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, v35.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, v35.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v52.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, v65.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, v64.l, 3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v36.h
; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v21.l, v9.l
; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v21.h, v9.h
; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l
; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v10.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v52, v12
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v11.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v36, v12
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v36.l, 0x300, v11.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, 0x300, v11.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v52.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v36.h
; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v20.l, v10.l
; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v20.h, v10.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v34.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v34.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v52, v13
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v11.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v55.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v54.l, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v36, v13
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v36.l, 0x300, v11.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, 0x300, v11.h
; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v12.l
; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v12.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v33.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v33.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v52.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v53.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v52.l, 3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v36.h
; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v19.l, v11.l
; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v19.h, v11.h
; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l
; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v52, v14
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v13.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v36, v14
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v36.l, 0x300, v13.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.h, 0x300, v13.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v52.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v36.h
; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v18.l, v12.l
; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v18.h, v12.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v32.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, v32.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v52, v15
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v13.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v51.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, v50.l, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v36, v15
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v36.l, 0x300, v13.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, 0x300, v13.h
; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v14.l
; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v14.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v31.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, v31.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v52.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v49.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, v48.l, 3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v36.h
; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v17.l, v13.l
; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v17.h, v13.h
; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l
; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v14.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v52, v18
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v15.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v36, v18
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v36.l, 0x300, v15.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.h, 0x300, v15.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v52.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v36.h
; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v16.l, v14.l
; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v16.h, v14.h
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v52, v17
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v15.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v36, v17
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v36.l, 0x300, v15.l
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.h, 0x300, v15.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v52.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v52, v15
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v36.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v36, v15
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -43689,775 +42919,390 @@ define inreg <8 x i64> @bitcast_v64i8_to_v8i64_scalar(<64 x i8> inreg %a, i32 in
; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
; GFX9-NEXT: s_branch .LBB71_2
;
-; GFX11-TRUE16-LABEL: bitcast_v64i8_to_v8i64_scalar:
-; GFX11-TRUE16: ; %bb.0:
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v38, v14 :: v_dual_mov_b32 v37, v12
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v36, v10 :: v_dual_mov_b32 v35, v8
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v34, v6 :: v_dual_mov_b32 v33, v4
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v32, v2 :: v_dual_mov_b32 v31, v0
-; GFX11-TRUE16-NEXT: s_clause 0xf
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v0, off, s32 offset:56
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v39, off, s32 offset:52
-; GFX11-TRUE16-NEXT: scratch_load_b32 v2, off, s32 offset:60
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v4, off, s32
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v6, off, s32 offset:8
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v8, off, s32 offset:16
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v10, off, s32 offset:24
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v12, off, s32 offset:32
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v14, off, s32 offset:40
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v86, off, s32 offset:48
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v48, off, s32 offset:44
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v49, off, s32 offset:36
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v50, off, s32 offset:28
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v51, off, s32 offset:20
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v52, off, s32 offset:12
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v53, off, s32 offset:4
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v83, 8, v1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v84, 8, v3
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v85, 8, v5
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v70, 8, v7
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v71, 8, v9
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v80, 8, v11
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v81, 8, v13
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v82, 8, v15
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v65, 8, v17
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v66, 8, v19
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v67, 8, v21
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v68, 8, v23
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v69, 8, v25
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v27, 8, v27
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v29, 8, v29
-; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(15)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 8, v0
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(13)
-; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(12)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v54, 8, v4
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(11)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v55, 8, v6
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(10)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v64, 8, v8
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(9)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v10
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(8)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 8, v12
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 8, v14
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(6)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 8, v86
-; GFX11-TRUE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo
-; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB71_4
-; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-TRUE16-NEXT: s_and_b32 s5, s0, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s1, 8
-; GFX11-TRUE16-NEXT: s_and_b32 s7, s2, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s3, 8
-; GFX11-TRUE16-NEXT: s_or_b32 s5, s5, s6
-; GFX11-TRUE16-NEXT: s_or_b32 s6, s7, s8
-; GFX11-TRUE16-NEXT: s_and_b32 s5, s5, 0xffff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s6, 16
-; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s17, 8
-; GFX11-TRUE16-NEXT: s_or_b32 s5, s5, s6
-; GFX11-TRUE16-NEXT: s_and_b32 s6, s16, 0xff
-; GFX11-TRUE16-NEXT: s_and_b32 s8, s18, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s19, 8
-; GFX11-TRUE16-NEXT: s_or_b32 s6, s6, s7
-; GFX11-TRUE16-NEXT: s_or_b32 s7, s8, s9
-; GFX11-TRUE16-NEXT: s_and_b32 s6, s6, 0xffff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s7, 16
-; GFX11-TRUE16-NEXT: s_and_b32 s8, s20, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s21, 8
-; GFX11-TRUE16-NEXT: s_or_b32 s6, s6, s7
-; GFX11-TRUE16-NEXT: s_or_b32 s7, s8, s9
-; GFX11-TRUE16-NEXT: s_and_b32 s8, s22, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s23, 8
-; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s25, 8
-; GFX11-TRUE16-NEXT: s_or_b32 s8, s8, s9
-; GFX11-TRUE16-NEXT: s_and_b32 s9, s24, 0xff
-; GFX11-TRUE16-NEXT: s_and_b32 s7, s7, 0xffff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s8, 16
-; GFX11-TRUE16-NEXT: s_or_b32 s9, s9, s10
-; GFX11-TRUE16-NEXT: s_or_b32 s7, s7, s8
-; GFX11-TRUE16-NEXT: s_and_b32 s8, s9, 0xffff
-; GFX11-TRUE16-NEXT: s_and_b32 s9, s26, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s27, 8
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v31
-; GFX11-TRUE16-NEXT: s_or_b32 s9, s9, s10
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v32
-; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s9, 16
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v38
-; GFX11-TRUE16-NEXT: s_or_b32 s8, s8, s9
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v33
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v83
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v84
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xff, v22
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v24
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v85
-; GFX11-TRUE16-NEXT: s_and_b32 s11, s28, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s12, s29, 8
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v6, v82
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v10, v68
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v11, v69
-; GFX11-TRUE16-NEXT: s_or_b32 s10, s11, s12
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v1, v2
-; GFX11-TRUE16-NEXT: s_and_b32 s10, s10, 0xffff
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v35
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, s10, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v34
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v36
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v37
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v6
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v16
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v18
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xff, v20
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v70
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v71
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v80
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v81
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v6, v65
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v7, v66
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v9, v67
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v10, v11
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v52
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v6
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v7
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v11, v64
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v0, v1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v2, v3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v8, v12
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v13, v9
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v26
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v28
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v30
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v53
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v11
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v51
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xff, v50
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v49
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xff, v48
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v86, 0xff, v39
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v27
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v29
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v54
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v55
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v11, v17
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v12, v19
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v14, v21
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v15, v23
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v86, v86, v25
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v87, 16, v11
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v96, 0xffff, v12
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 16, v14
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v15
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v86, 16, v86
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v0, v1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v2, v3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v13, v87
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v96, v14
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v15, v86
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, s6
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s7 :: v_dual_mov_b32 v3, s8
-; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
-; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB71_3
-; GFX11-TRUE16-NEXT: .LBB71_2: ; %cmp.true
-; GFX11-TRUE16-NEXT: s_add_i32 s0, s0, 3
-; GFX11-TRUE16-NEXT: s_add_i32 s2, s2, 3
-; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 8
-; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8
-; GFX11-TRUE16-NEXT: s_or_b32 s0, s1, s0
-; GFX11-TRUE16-NEXT: s_or_b32 s1, s3, s2
-; GFX11-TRUE16-NEXT: s_addk_i32 s0, 0x300
-; GFX11-TRUE16-NEXT: s_addk_i32 s1, 0x300
-; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xffff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 16
-; GFX11-TRUE16-NEXT: s_add_i32 s16, s16, 3
-; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s1
-; GFX11-TRUE16-NEXT: s_and_b32 s1, s16, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s17, 8
-; GFX11-TRUE16-NEXT: s_add_i32 s18, s18, 3
-; GFX11-TRUE16-NEXT: s_or_b32 s1, s2, s1
-; GFX11-TRUE16-NEXT: s_and_b32 s2, s18, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s19, 8
-; GFX11-TRUE16-NEXT: s_addk_i32 s1, 0x300
-; GFX11-TRUE16-NEXT: s_or_b32 s2, s3, s2
-; GFX11-TRUE16-NEXT: s_add_i32 s20, s20, 3
-; GFX11-TRUE16-NEXT: s_addk_i32 s2, 0x300
-; GFX11-TRUE16-NEXT: s_and_b32 s1, s1, 0xffff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s2, 16
-; GFX11-TRUE16-NEXT: s_and_b32 s3, s20, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s21, 8
-; GFX11-TRUE16-NEXT: s_add_i32 s22, s22, 3
-; GFX11-TRUE16-NEXT: s_or_b32 s1, s1, s2
-; GFX11-TRUE16-NEXT: s_or_b32 s2, s4, s3
-; GFX11-TRUE16-NEXT: s_and_b32 s3, s22, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s23, 8
-; GFX11-TRUE16-NEXT: s_add_i32 s24, s24, 3
-; GFX11-TRUE16-NEXT: s_or_b32 s3, s4, s3
-; GFX11-TRUE16-NEXT: s_and_b32 s4, s24, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s25, 8
-; GFX11-TRUE16-NEXT: s_addk_i32 s2, 0x300
-; GFX11-TRUE16-NEXT: s_addk_i32 s3, 0x300
-; GFX11-TRUE16-NEXT: s_or_b32 s4, s5, s4
-; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xffff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 16
-; GFX11-TRUE16-NEXT: s_addk_i32 s4, 0x300
-; GFX11-TRUE16-NEXT: s_add_i32 s26, s26, 3
-; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3
-; GFX11-TRUE16-NEXT: s_and_b32 s3, s4, 0xffff
-; GFX11-TRUE16-NEXT: s_and_b32 s4, s26, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s27, 8
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v31
-; GFX11-TRUE16-NEXT: s_or_b32 s4, s5, s4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v32
-; GFX11-TRUE16-NEXT: s_addk_i32 s4, 0x300
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v38
-; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s4, 16
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT: s_or_b32 s3, s3, s4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v33
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 3, v22
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 3, v24
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v83, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v2
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v84, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v6
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xff, v10
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v11
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v85, v2
-; GFX11-TRUE16-NEXT: s_add_i32 s28, s28, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v82, v6
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v68, v10
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v69, v11
-; GFX11-TRUE16-NEXT: s_and_b32 s6, s28, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s29, 8
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX11-TRUE16-NEXT: s_or_b32 s5, s7, s6
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x300, v6
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 0x300, v10
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 0x300, v11
-; GFX11-TRUE16-NEXT: s_addk_i32 s5, 0x300
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v1, v2
-; GFX11-TRUE16-NEXT: s_and_b32 s5, s5, 0xffff
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v35
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, s5, v0
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v34
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v36
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v37
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v6
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v16
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v18
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v20
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v6
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v7
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xff, v9
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v10, v11
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 3, v52
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v70, v0
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v71, v1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v80, v2
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v81, v3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v65, v6
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v66, v7
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v67, v9
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v11
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x300, v6
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 0x300, v7
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 0x300, v9
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v64, v11
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v6
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v7
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 0x300, v11
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v0, v1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v2, v3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v8, v12
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v13, v9
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v26
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v28
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v30
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v53
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v11
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 3, v51
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 3, v50
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, 3, v49
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v15, 3, v48
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 3, v39
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v11
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xff, v12
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v14
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xff, v15
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xff, v16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v27, v0
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v29, v1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v54, v2
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v55, v3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v17, v11
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v19, v12
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v21, v14
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v23, v15
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v25, v16
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 0x300, v11
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 0x300, v12
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, 0x300, v14
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v15, 0x300, v15
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 0x300, v16
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v11
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v12
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 16, v14
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v15
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 16, v16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v0, v1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v2, v3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v13, v17
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v18, v14
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v15, v16
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX11-TRUE16-NEXT: .LBB71_3: ; %end
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
-; GFX11-TRUE16-NEXT: .LBB71_4:
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
-; GFX11-TRUE16-NEXT: s_branch .LBB71_2
-;
-; GFX11-FAKE16-LABEL: bitcast_v64i8_to_v8i64_scalar:
-; GFX11-FAKE16: ; %bb.0:
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v38, v14 :: v_dual_mov_b32 v37, v12
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v36, v10 :: v_dual_mov_b32 v35, v8
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v34, v6 :: v_dual_mov_b32 v33, v4
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v32, v2 :: v_dual_mov_b32 v31, v0
-; GFX11-FAKE16-NEXT: s_clause 0xf
-; GFX11-FAKE16-NEXT: scratch_load_u16 v0, off, s32 offset:56
-; GFX11-FAKE16-NEXT: scratch_load_u16 v39, off, s32 offset:52
-; GFX11-FAKE16-NEXT: scratch_load_b32 v2, off, s32 offset:60
-; GFX11-FAKE16-NEXT: scratch_load_u16 v4, off, s32
-; GFX11-FAKE16-NEXT: scratch_load_u16 v6, off, s32 offset:8
-; GFX11-FAKE16-NEXT: scratch_load_u16 v8, off, s32 offset:16
-; GFX11-FAKE16-NEXT: scratch_load_u16 v10, off, s32 offset:24
-; GFX11-FAKE16-NEXT: scratch_load_u16 v12, off, s32 offset:32
-; GFX11-FAKE16-NEXT: scratch_load_u16 v14, off, s32 offset:40
-; GFX11-FAKE16-NEXT: scratch_load_u16 v86, off, s32 offset:48
-; GFX11-FAKE16-NEXT: scratch_load_u16 v48, off, s32 offset:44
-; GFX11-FAKE16-NEXT: scratch_load_u16 v49, off, s32 offset:36
-; GFX11-FAKE16-NEXT: scratch_load_u16 v50, off, s32 offset:28
-; GFX11-FAKE16-NEXT: scratch_load_u16 v51, off, s32 offset:20
-; GFX11-FAKE16-NEXT: scratch_load_u16 v52, off, s32 offset:12
-; GFX11-FAKE16-NEXT: scratch_load_u16 v53, off, s32 offset:4
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v83, 8, v1
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v84, 8, v3
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v85, 8, v5
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v70, 8, v7
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v71, 8, v9
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v80, 8, v11
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v81, 8, v13
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v82, 8, v15
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v65, 8, v17
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v66, 8, v19
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v67, 8, v21
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v68, 8, v23
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v69, 8, v25
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v27, 8, v27
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v29, 8, v29
-; GFX11-FAKE16-NEXT: s_mov_b32 s4, 0
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(15)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v25, 8, v0
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(13)
-; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(12)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v54, 8, v4
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(11)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v55, 8, v6
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(10)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v64, 8, v8
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(9)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v17, 8, v10
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(8)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v19, 8, v12
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(7)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v21, 8, v14
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(6)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v23, 8, v86
-; GFX11-FAKE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo
-; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB71_4
-; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-FAKE16-NEXT: s_and_b32 s5, s0, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s1, 8
-; GFX11-FAKE16-NEXT: s_and_b32 s7, s2, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s3, 8
-; GFX11-FAKE16-NEXT: s_or_b32 s5, s5, s6
-; GFX11-FAKE16-NEXT: s_or_b32 s6, s7, s8
-; GFX11-FAKE16-NEXT: s_and_b32 s5, s5, 0xffff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s6, 16
-; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s17, 8
-; GFX11-FAKE16-NEXT: s_or_b32 s5, s5, s6
-; GFX11-FAKE16-NEXT: s_and_b32 s6, s16, 0xff
-; GFX11-FAKE16-NEXT: s_and_b32 s8, s18, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s9, s19, 8
-; GFX11-FAKE16-NEXT: s_or_b32 s6, s6, s7
-; GFX11-FAKE16-NEXT: s_or_b32 s7, s8, s9
-; GFX11-FAKE16-NEXT: s_and_b32 s6, s6, 0xffff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s7, 16
-; GFX11-FAKE16-NEXT: s_and_b32 s8, s20, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s9, s21, 8
-; GFX11-FAKE16-NEXT: s_or_b32 s6, s6, s7
-; GFX11-FAKE16-NEXT: s_or_b32 s7, s8, s9
-; GFX11-FAKE16-NEXT: s_and_b32 s8, s22, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s9, s23, 8
-; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s25, 8
-; GFX11-FAKE16-NEXT: s_or_b32 s8, s8, s9
-; GFX11-FAKE16-NEXT: s_and_b32 s9, s24, 0xff
-; GFX11-FAKE16-NEXT: s_and_b32 s7, s7, 0xffff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s8, 16
-; GFX11-FAKE16-NEXT: s_or_b32 s9, s9, s10
-; GFX11-FAKE16-NEXT: s_or_b32 s7, s7, s8
-; GFX11-FAKE16-NEXT: s_and_b32 s8, s9, 0xffff
-; GFX11-FAKE16-NEXT: s_and_b32 s9, s26, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s27, 8
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v31
-; GFX11-FAKE16-NEXT: s_or_b32 s9, s9, s10
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v32
-; GFX11-FAKE16-NEXT: s_lshl_b32 s9, s9, 16
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v38
-; GFX11-FAKE16-NEXT: s_or_b32 s8, s8, s9
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v33
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v83
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v84
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v22
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v24
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v85
-; GFX11-FAKE16-NEXT: s_and_b32 s11, s28, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s12, s29, 8
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v6, v82
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v68
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v11, v69
-; GFX11-FAKE16-NEXT: s_or_b32 s10, s11, s12
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v1, v2
-; GFX11-FAKE16-NEXT: s_and_b32 s10, s10, 0xffff
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v35
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, s10, v0
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v34
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v36
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v37
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff, v6
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v16
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v18
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v20
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v10
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v70
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v71
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v80
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v81
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v6, v65
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v66
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v9, v67
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v11
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1)
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v52
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v12, 16, v6
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xffff, v7
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v11, v64
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v0, v1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v2, v3
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v8, v12
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v13, v9
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v26
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v28
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v30
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v53
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xffff, v11
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v51
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v50
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v49
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v48
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v86, 0xff, v39
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v27
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v29
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v54
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v55
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v11, v17
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v12, v19
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v14, v21
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v15, v23
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v86, v86, v25
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v87, 16, v11
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v96, 0xffff, v12
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v14, 16, v14
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff, v15
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v86, 16, v86
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v0, v1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v2, v3
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v13, v87
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v96, v14
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v15, v86
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, s6
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s7 :: v_dual_mov_b32 v3, s8
-; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
-; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB71_3
-; GFX11-FAKE16-NEXT: .LBB71_2: ; %cmp.true
-; GFX11-FAKE16-NEXT: s_add_i32 s0, s0, 3
-; GFX11-FAKE16-NEXT: s_add_i32 s2, s2, 3
-; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 8
-; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 8
-; GFX11-FAKE16-NEXT: s_or_b32 s0, s1, s0
-; GFX11-FAKE16-NEXT: s_or_b32 s1, s3, s2
-; GFX11-FAKE16-NEXT: s_addk_i32 s0, 0x300
-; GFX11-FAKE16-NEXT: s_addk_i32 s1, 0x300
-; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xffff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 16
-; GFX11-FAKE16-NEXT: s_add_i32 s16, s16, 3
-; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s1
-; GFX11-FAKE16-NEXT: s_and_b32 s1, s16, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s17, 8
-; GFX11-FAKE16-NEXT: s_add_i32 s18, s18, 3
-; GFX11-FAKE16-NEXT: s_or_b32 s1, s2, s1
-; GFX11-FAKE16-NEXT: s_and_b32 s2, s18, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s19, 8
-; GFX11-FAKE16-NEXT: s_addk_i32 s1, 0x300
-; GFX11-FAKE16-NEXT: s_or_b32 s2, s3, s2
-; GFX11-FAKE16-NEXT: s_add_i32 s20, s20, 3
-; GFX11-FAKE16-NEXT: s_addk_i32 s2, 0x300
-; GFX11-FAKE16-NEXT: s_and_b32 s1, s1, 0xffff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s2, 16
-; GFX11-FAKE16-NEXT: s_and_b32 s3, s20, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s21, 8
-; GFX11-FAKE16-NEXT: s_add_i32 s22, s22, 3
-; GFX11-FAKE16-NEXT: s_or_b32 s1, s1, s2
-; GFX11-FAKE16-NEXT: s_or_b32 s2, s4, s3
-; GFX11-FAKE16-NEXT: s_and_b32 s3, s22, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s23, 8
-; GFX11-FAKE16-NEXT: s_add_i32 s24, s24, 3
-; GFX11-FAKE16-NEXT: s_or_b32 s3, s4, s3
-; GFX11-FAKE16-NEXT: s_and_b32 s4, s24, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s25, 8
-; GFX11-FAKE16-NEXT: s_addk_i32 s2, 0x300
-; GFX11-FAKE16-NEXT: s_addk_i32 s3, 0x300
-; GFX11-FAKE16-NEXT: s_or_b32 s4, s5, s4
-; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xffff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 16
-; GFX11-FAKE16-NEXT: s_addk_i32 s4, 0x300
-; GFX11-FAKE16-NEXT: s_add_i32 s26, s26, 3
-; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3
-; GFX11-FAKE16-NEXT: s_and_b32 s3, s4, 0xffff
-; GFX11-FAKE16-NEXT: s_and_b32 s4, s26, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s27, 8
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v31
-; GFX11-FAKE16-NEXT: s_or_b32 s4, s5, s4
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v32
-; GFX11-FAKE16-NEXT: s_addk_i32 s4, 0x300
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 3, v38
-; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s4, 16
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-FAKE16-NEXT: s_or_b32 s3, s3, s4
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v33
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, 3, v22
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, 3, v24
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v83, v0
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v84, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v6
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v10
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v11
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v85, v2
-; GFX11-FAKE16-NEXT: s_add_i32 s28, s28, 3
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v82, v6
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v68, v10
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v69, v11
-; GFX11-FAKE16-NEXT: s_and_b32 s6, s28, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s29, 8
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX11-FAKE16-NEXT: s_or_b32 s5, s7, s6
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 0x300, v6
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, 0x300, v10
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, 0x300, v11
-; GFX11-FAKE16-NEXT: s_addk_i32 s5, 0x300
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v1, v2
-; GFX11-FAKE16-NEXT: s_and_b32 s5, s5, 0xffff
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v35
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, s5, v0
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v34
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v36
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v37
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff, v6
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 3, v16
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 3, v18
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, 3, v20
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v10
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v6
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v7
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v9
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v11
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1)
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, 3, v52
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v70, v0
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v71, v1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v80, v2
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v81, v3
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v65, v6
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v66, v7
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v67, v9
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v11
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 0x300, v6
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 0x300, v7
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, 0x300, v9
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v64, v11
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v12, 16, v6
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xffff, v7
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, 0x300, v11
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v0, v1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v2, v3
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v8, v12
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v13, v9
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v26
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v28
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v30
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v53
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xffff, v11
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, 3, v51
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, 3, v50
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v14, 3, v49
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v15, 3, v48
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v16, 3, v39
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v11
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v12
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v14
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v15
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v16
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v27, v0
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v29, v1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v54, v2
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v55, v3
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v17, v11
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v19, v12
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v21, v14
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v23, v15
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v25, v16
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, 0x300, v11
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, 0x300, v12
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v14, 0x300, v14
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v15, 0x300, v15
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v16, 0x300, v16
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v17, 16, v11
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v12
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v14, 16, v14
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff, v15
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v16, 16, v16
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v0, v1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v2, v3
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v13, v17
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v18, v14
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v15, v16
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX11-FAKE16-NEXT: .LBB71_3: ; %end
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
-; GFX11-FAKE16-NEXT: .LBB71_4:
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
-; GFX11-FAKE16-NEXT: s_branch .LBB71_2
+; GFX11-LABEL: bitcast_v64i8_to_v8i64_scalar:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_dual_mov_b32 v38, v14 :: v_dual_mov_b32 v37, v12
+; GFX11-NEXT: v_dual_mov_b32 v36, v10 :: v_dual_mov_b32 v35, v8
+; GFX11-NEXT: v_dual_mov_b32 v34, v6 :: v_dual_mov_b32 v33, v4
+; GFX11-NEXT: v_dual_mov_b32 v32, v2 :: v_dual_mov_b32 v31, v0
+; GFX11-NEXT: s_clause 0xf
+; GFX11-NEXT: scratch_load_u16 v0, off, s32 offset:56
+; GFX11-NEXT: scratch_load_u16 v39, off, s32 offset:52
+; GFX11-NEXT: scratch_load_b32 v2, off, s32 offset:60
+; GFX11-NEXT: scratch_load_u16 v4, off, s32
+; GFX11-NEXT: scratch_load_u16 v6, off, s32 offset:8
+; GFX11-NEXT: scratch_load_u16 v8, off, s32 offset:16
+; GFX11-NEXT: scratch_load_u16 v10, off, s32 offset:24
+; GFX11-NEXT: scratch_load_u16 v12, off, s32 offset:32
+; GFX11-NEXT: scratch_load_u16 v14, off, s32 offset:40
+; GFX11-NEXT: scratch_load_u16 v86, off, s32 offset:48
+; GFX11-NEXT: scratch_load_u16 v48, off, s32 offset:44
+; GFX11-NEXT: scratch_load_u16 v49, off, s32 offset:36
+; GFX11-NEXT: scratch_load_u16 v50, off, s32 offset:28
+; GFX11-NEXT: scratch_load_u16 v51, off, s32 offset:20
+; GFX11-NEXT: scratch_load_u16 v52, off, s32 offset:12
+; GFX11-NEXT: scratch_load_u16 v53, off, s32 offset:4
+; GFX11-NEXT: v_lshlrev_b32_e32 v83, 8, v1
+; GFX11-NEXT: v_lshlrev_b32_e32 v84, 8, v3
+; GFX11-NEXT: v_lshlrev_b32_e32 v85, 8, v5
+; GFX11-NEXT: v_lshlrev_b32_e32 v70, 8, v7
+; GFX11-NEXT: v_lshlrev_b32_e32 v71, 8, v9
+; GFX11-NEXT: v_lshlrev_b32_e32 v80, 8, v11
+; GFX11-NEXT: v_lshlrev_b32_e32 v81, 8, v13
+; GFX11-NEXT: v_lshlrev_b32_e32 v82, 8, v15
+; GFX11-NEXT: v_lshlrev_b32_e32 v65, 8, v17
+; GFX11-NEXT: v_lshlrev_b32_e32 v66, 8, v19
+; GFX11-NEXT: v_lshlrev_b32_e32 v67, 8, v21
+; GFX11-NEXT: v_lshlrev_b32_e32 v68, 8, v23
+; GFX11-NEXT: v_lshlrev_b32_e32 v69, 8, v25
+; GFX11-NEXT: v_lshlrev_b32_e32 v27, 8, v27
+; GFX11-NEXT: v_lshlrev_b32_e32 v29, 8, v29
+; GFX11-NEXT: s_mov_b32 s4, 0
+; GFX11-NEXT: s_waitcnt vmcnt(15)
+; GFX11-NEXT: v_lshlrev_b32_e32 v25, 8, v0
+; GFX11-NEXT: s_waitcnt vmcnt(13)
+; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2
+; GFX11-NEXT: s_waitcnt vmcnt(12)
+; GFX11-NEXT: v_lshlrev_b32_e32 v54, 8, v4
+; GFX11-NEXT: s_waitcnt vmcnt(11)
+; GFX11-NEXT: v_lshlrev_b32_e32 v55, 8, v6
+; GFX11-NEXT: s_waitcnt vmcnt(10)
+; GFX11-NEXT: v_lshlrev_b32_e32 v64, 8, v8
+; GFX11-NEXT: s_waitcnt vmcnt(9)
+; GFX11-NEXT: v_lshlrev_b32_e32 v17, 8, v10
+; GFX11-NEXT: s_waitcnt vmcnt(8)
+; GFX11-NEXT: v_lshlrev_b32_e32 v19, 8, v12
+; GFX11-NEXT: s_waitcnt vmcnt(7)
+; GFX11-NEXT: v_lshlrev_b32_e32 v21, 8, v14
+; GFX11-NEXT: s_waitcnt vmcnt(6)
+; GFX11-NEXT: v_lshlrev_b32_e32 v23, 8, v86
+; GFX11-NEXT: s_and_b32 s5, vcc_lo, exec_lo
+; GFX11-NEXT: s_cbranch_scc0 .LBB71_4
+; GFX11-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-NEXT: s_and_b32 s5, s0, 0xff
+; GFX11-NEXT: s_lshl_b32 s6, s1, 8
+; GFX11-NEXT: s_and_b32 s7, s2, 0xff
+; GFX11-NEXT: s_lshl_b32 s8, s3, 8
+; GFX11-NEXT: s_or_b32 s5, s5, s6
+; GFX11-NEXT: s_or_b32 s6, s7, s8
+; GFX11-NEXT: s_and_b32 s5, s5, 0xffff
+; GFX11-NEXT: s_lshl_b32 s6, s6, 16
+; GFX11-NEXT: s_lshl_b32 s7, s17, 8
+; GFX11-NEXT: s_or_b32 s5, s5, s6
+; GFX11-NEXT: s_and_b32 s6, s16, 0xff
+; GFX11-NEXT: s_and_b32 s8, s18, 0xff
+; GFX11-NEXT: s_lshl_b32 s9, s19, 8
+; GFX11-NEXT: s_or_b32 s6, s6, s7
+; GFX11-NEXT: s_or_b32 s7, s8, s9
+; GFX11-NEXT: s_and_b32 s6, s6, 0xffff
+; GFX11-NEXT: s_lshl_b32 s7, s7, 16
+; GFX11-NEXT: s_and_b32 s8, s20, 0xff
+; GFX11-NEXT: s_lshl_b32 s9, s21, 8
+; GFX11-NEXT: s_or_b32 s6, s6, s7
+; GFX11-NEXT: s_or_b32 s7, s8, s9
+; GFX11-NEXT: s_and_b32 s8, s22, 0xff
+; GFX11-NEXT: s_lshl_b32 s9, s23, 8
+; GFX11-NEXT: s_lshl_b32 s10, s25, 8
+; GFX11-NEXT: s_or_b32 s8, s8, s9
+; GFX11-NEXT: s_and_b32 s9, s24, 0xff
+; GFX11-NEXT: s_and_b32 s7, s7, 0xffff
+; GFX11-NEXT: s_lshl_b32 s8, s8, 16
+; GFX11-NEXT: s_or_b32 s9, s9, s10
+; GFX11-NEXT: s_or_b32 s7, s7, s8
+; GFX11-NEXT: s_and_b32 s8, s9, 0xffff
+; GFX11-NEXT: s_and_b32 s9, s26, 0xff
+; GFX11-NEXT: s_lshl_b32 s10, s27, 8
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v31
+; GFX11-NEXT: s_or_b32 s9, s9, s10
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v32
+; GFX11-NEXT: s_lshl_b32 s9, s9, 16
+; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v38
+; GFX11-NEXT: s_or_b32 s8, s8, s9
+; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v33
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v83
+; GFX11-NEXT: v_or_b32_e32 v1, v1, v84
+; GFX11-NEXT: v_and_b32_e32 v10, 0xff, v22
+; GFX11-NEXT: v_and_b32_e32 v11, 0xff, v24
+; GFX11-NEXT: v_or_b32_e32 v2, v2, v85
+; GFX11-NEXT: s_and_b32 s11, s28, 0xff
+; GFX11-NEXT: s_lshl_b32 s12, s29, 8
+; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-NEXT: v_or_b32_e32 v6, v6, v82
+; GFX11-NEXT: v_or_b32_e32 v10, v10, v68
+; GFX11-NEXT: v_or_b32_e32 v11, v11, v69
+; GFX11-NEXT: s_or_b32 s10, s11, s12
+; GFX11-NEXT: v_or_b32_e32 v5, v1, v2
+; GFX11-NEXT: s_and_b32 s10, s10, 0xffff
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v35
+; GFX11-NEXT: v_or_b32_e32 v4, s10, v0
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v34
+; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v36
+; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v37
+; GFX11-NEXT: v_and_b32_e32 v8, 0xffff, v6
+; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v16
+; GFX11-NEXT: v_and_b32_e32 v7, 0xff, v18
+; GFX11-NEXT: v_and_b32_e32 v9, 0xff, v20
+; GFX11-NEXT: v_and_b32_e32 v10, 0xffff, v10
+; GFX11-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v70
+; GFX11-NEXT: v_or_b32_e32 v1, v1, v71
+; GFX11-NEXT: v_or_b32_e32 v2, v2, v80
+; GFX11-NEXT: v_or_b32_e32 v3, v3, v81
+; GFX11-NEXT: v_or_b32_e32 v6, v6, v65
+; GFX11-NEXT: v_or_b32_e32 v7, v7, v66
+; GFX11-NEXT: v_or_b32_e32 v9, v9, v67
+; GFX11-NEXT: v_or_b32_e32 v10, v10, v11
+; GFX11-NEXT: s_waitcnt vmcnt(1)
+; GFX11-NEXT: v_and_b32_e32 v11, 0xff, v52
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-NEXT: v_lshlrev_b32_e32 v12, 16, v6
+; GFX11-NEXT: v_and_b32_e32 v13, 0xffff, v7
+; GFX11-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX11-NEXT: v_or_b32_e32 v11, v11, v64
+; GFX11-NEXT: v_or_b32_e32 v6, v0, v1
+; GFX11-NEXT: v_or_b32_e32 v7, v2, v3
+; GFX11-NEXT: v_or_b32_e32 v8, v8, v12
+; GFX11-NEXT: v_or_b32_e32 v9, v13, v9
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v26
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v28
+; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v30
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v53
+; GFX11-NEXT: v_and_b32_e32 v13, 0xffff, v11
+; GFX11-NEXT: v_and_b32_e32 v11, 0xff, v51
+; GFX11-NEXT: v_and_b32_e32 v12, 0xff, v50
+; GFX11-NEXT: v_and_b32_e32 v14, 0xff, v49
+; GFX11-NEXT: v_and_b32_e32 v15, 0xff, v48
+; GFX11-NEXT: v_and_b32_e32 v86, 0xff, v39
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v27
+; GFX11-NEXT: v_or_b32_e32 v1, v1, v29
+; GFX11-NEXT: v_or_b32_e32 v2, v2, v54
+; GFX11-NEXT: v_or_b32_e32 v3, v3, v55
+; GFX11-NEXT: v_or_b32_e32 v11, v11, v17
+; GFX11-NEXT: v_or_b32_e32 v12, v12, v19
+; GFX11-NEXT: v_or_b32_e32 v14, v14, v21
+; GFX11-NEXT: v_or_b32_e32 v15, v15, v23
+; GFX11-NEXT: v_or_b32_e32 v86, v86, v25
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-NEXT: v_lshlrev_b32_e32 v87, 16, v11
+; GFX11-NEXT: v_and_b32_e32 v96, 0xffff, v12
+; GFX11-NEXT: v_lshlrev_b32_e32 v14, 16, v14
+; GFX11-NEXT: v_and_b32_e32 v15, 0xffff, v15
+; GFX11-NEXT: v_lshlrev_b32_e32 v86, 16, v86
+; GFX11-NEXT: v_or_b32_e32 v11, v0, v1
+; GFX11-NEXT: v_or_b32_e32 v12, v2, v3
+; GFX11-NEXT: v_or_b32_e32 v13, v13, v87
+; GFX11-NEXT: v_or_b32_e32 v14, v96, v14
+; GFX11-NEXT: v_or_b32_e32 v15, v15, v86
+; GFX11-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, s6
+; GFX11-NEXT: v_dual_mov_b32 v2, s7 :: v_dual_mov_b32 v3, s8
+; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
+; GFX11-NEXT: s_cbranch_vccnz .LBB71_3
+; GFX11-NEXT: .LBB71_2: ; %cmp.true
+; GFX11-NEXT: s_add_i32 s0, s0, 3
+; GFX11-NEXT: s_add_i32 s2, s2, 3
+; GFX11-NEXT: s_and_b32 s0, s0, 0xff
+; GFX11-NEXT: s_lshl_b32 s1, s1, 8
+; GFX11-NEXT: s_and_b32 s2, s2, 0xff
+; GFX11-NEXT: s_lshl_b32 s3, s3, 8
+; GFX11-NEXT: s_or_b32 s0, s1, s0
+; GFX11-NEXT: s_or_b32 s1, s3, s2
+; GFX11-NEXT: s_addk_i32 s0, 0x300
+; GFX11-NEXT: s_addk_i32 s1, 0x300
+; GFX11-NEXT: s_and_b32 s0, s0, 0xffff
+; GFX11-NEXT: s_lshl_b32 s1, s1, 16
+; GFX11-NEXT: s_add_i32 s16, s16, 3
+; GFX11-NEXT: s_or_b32 s0, s0, s1
+; GFX11-NEXT: s_and_b32 s1, s16, 0xff
+; GFX11-NEXT: s_lshl_b32 s2, s17, 8
+; GFX11-NEXT: s_add_i32 s18, s18, 3
+; GFX11-NEXT: s_or_b32 s1, s2, s1
+; GFX11-NEXT: s_and_b32 s2, s18, 0xff
+; GFX11-NEXT: s_lshl_b32 s3, s19, 8
+; GFX11-NEXT: s_addk_i32 s1, 0x300
+; GFX11-NEXT: s_or_b32 s2, s3, s2
+; GFX11-NEXT: s_add_i32 s20, s20, 3
+; GFX11-NEXT: s_addk_i32 s2, 0x300
+; GFX11-NEXT: s_and_b32 s1, s1, 0xffff
+; GFX11-NEXT: s_lshl_b32 s2, s2, 16
+; GFX11-NEXT: s_and_b32 s3, s20, 0xff
+; GFX11-NEXT: s_lshl_b32 s4, s21, 8
+; GFX11-NEXT: s_add_i32 s22, s22, 3
+; GFX11-NEXT: s_or_b32 s1, s1, s2
+; GFX11-NEXT: s_or_b32 s2, s4, s3
+; GFX11-NEXT: s_and_b32 s3, s22, 0xff
+; GFX11-NEXT: s_lshl_b32 s4, s23, 8
+; GFX11-NEXT: s_add_i32 s24, s24, 3
+; GFX11-NEXT: s_or_b32 s3, s4, s3
+; GFX11-NEXT: s_and_b32 s4, s24, 0xff
+; GFX11-NEXT: s_lshl_b32 s5, s25, 8
+; GFX11-NEXT: s_addk_i32 s2, 0x300
+; GFX11-NEXT: s_addk_i32 s3, 0x300
+; GFX11-NEXT: s_or_b32 s4, s5, s4
+; GFX11-NEXT: s_and_b32 s2, s2, 0xffff
+; GFX11-NEXT: s_lshl_b32 s3, s3, 16
+; GFX11-NEXT: s_addk_i32 s4, 0x300
+; GFX11-NEXT: s_add_i32 s26, s26, 3
+; GFX11-NEXT: s_or_b32 s2, s2, s3
+; GFX11-NEXT: s_and_b32 s3, s4, 0xffff
+; GFX11-NEXT: s_and_b32 s4, s26, 0xff
+; GFX11-NEXT: s_lshl_b32 s5, s27, 8
+; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v31
+; GFX11-NEXT: s_or_b32 s4, s5, s4
+; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v32
+; GFX11-NEXT: s_addk_i32 s4, 0x300
+; GFX11-NEXT: v_add_nc_u32_e32 v6, 3, v38
+; GFX11-NEXT: s_lshl_b32 s4, s4, 16
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0
+; GFX11-NEXT: s_or_b32 s3, s3, s4
+; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v33
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1
+; GFX11-NEXT: v_add_nc_u32_e32 v10, 3, v22
+; GFX11-NEXT: v_add_nc_u32_e32 v11, 3, v24
+; GFX11-NEXT: v_or_b32_e32 v0, v83, v0
+; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v2
+; GFX11-NEXT: v_or_b32_e32 v1, v84, v1
+; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v6
+; GFX11-NEXT: v_and_b32_e32 v10, 0xff, v10
+; GFX11-NEXT: v_and_b32_e32 v11, 0xff, v11
+; GFX11-NEXT: v_or_b32_e32 v2, v85, v2
+; GFX11-NEXT: s_add_i32 s28, s28, 3
+; GFX11-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
+; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
+; GFX11-NEXT: v_or_b32_e32 v6, v82, v6
+; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
+; GFX11-NEXT: v_or_b32_e32 v10, v68, v10
+; GFX11-NEXT: v_or_b32_e32 v11, v69, v11
+; GFX11-NEXT: s_and_b32 s6, s28, 0xff
+; GFX11-NEXT: s_lshl_b32 s7, s29, 8
+; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-NEXT: s_or_b32 s5, s7, s6
+; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-NEXT: v_add_nc_u32_e32 v6, 0x300, v6
+; GFX11-NEXT: v_add_nc_u32_e32 v10, 0x300, v10
+; GFX11-NEXT: v_add_nc_u32_e32 v11, 0x300, v11
+; GFX11-NEXT: s_addk_i32 s5, 0x300
+; GFX11-NEXT: v_or_b32_e32 v5, v1, v2
+; GFX11-NEXT: s_and_b32 s5, s5, 0xffff
+; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v35
+; GFX11-NEXT: v_or_b32_e32 v4, s5, v0
+; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v34
+; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v36
+; GFX11-NEXT: v_add_nc_u32_e32 v3, 3, v37
+; GFX11-NEXT: v_and_b32_e32 v8, 0xffff, v6
+; GFX11-NEXT: v_add_nc_u32_e32 v6, 3, v16
+; GFX11-NEXT: v_add_nc_u32_e32 v7, 3, v18
+; GFX11-NEXT: v_add_nc_u32_e32 v9, 3, v20
+; GFX11-NEXT: v_and_b32_e32 v10, 0xffff, v10
+; GFX11-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1
+; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v2
+; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v3
+; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v6
+; GFX11-NEXT: v_and_b32_e32 v7, 0xff, v7
+; GFX11-NEXT: v_and_b32_e32 v9, 0xff, v9
+; GFX11-NEXT: v_or_b32_e32 v10, v10, v11
+; GFX11-NEXT: s_waitcnt vmcnt(1)
+; GFX11-NEXT: v_add_nc_u32_e32 v11, 3, v52
+; GFX11-NEXT: v_or_b32_e32 v0, v70, v0
+; GFX11-NEXT: v_or_b32_e32 v1, v71, v1
+; GFX11-NEXT: v_or_b32_e32 v2, v80, v2
+; GFX11-NEXT: v_or_b32_e32 v3, v81, v3
+; GFX11-NEXT: v_or_b32_e32 v6, v65, v6
+; GFX11-NEXT: v_or_b32_e32 v7, v66, v7
+; GFX11-NEXT: v_or_b32_e32 v9, v67, v9
+; GFX11-NEXT: v_and_b32_e32 v11, 0xff, v11
+; GFX11-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
+; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
+; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
+; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x300, v3
+; GFX11-NEXT: v_add_nc_u32_e32 v6, 0x300, v6
+; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x300, v7
+; GFX11-NEXT: v_add_nc_u32_e32 v9, 0x300, v9
+; GFX11-NEXT: v_or_b32_e32 v11, v64, v11
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-NEXT: v_lshlrev_b32_e32 v12, 16, v6
+; GFX11-NEXT: v_and_b32_e32 v13, 0xffff, v7
+; GFX11-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX11-NEXT: v_add_nc_u32_e32 v11, 0x300, v11
+; GFX11-NEXT: v_or_b32_e32 v6, v0, v1
+; GFX11-NEXT: v_or_b32_e32 v7, v2, v3
+; GFX11-NEXT: v_or_b32_e32 v8, v8, v12
+; GFX11-NEXT: v_or_b32_e32 v9, v13, v9
+; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v26
+; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v28
+; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v30
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_add_nc_u32_e32 v3, 3, v53
+; GFX11-NEXT: v_and_b32_e32 v13, 0xffff, v11
+; GFX11-NEXT: v_add_nc_u32_e32 v11, 3, v51
+; GFX11-NEXT: v_add_nc_u32_e32 v12, 3, v50
+; GFX11-NEXT: v_add_nc_u32_e32 v14, 3, v49
+; GFX11-NEXT: v_add_nc_u32_e32 v15, 3, v48
+; GFX11-NEXT: v_add_nc_u32_e32 v16, 3, v39
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1
+; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v2
+; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v3
+; GFX11-NEXT: v_and_b32_e32 v11, 0xff, v11
+; GFX11-NEXT: v_and_b32_e32 v12, 0xff, v12
+; GFX11-NEXT: v_and_b32_e32 v14, 0xff, v14
+; GFX11-NEXT: v_and_b32_e32 v15, 0xff, v15
+; GFX11-NEXT: v_and_b32_e32 v16, 0xff, v16
+; GFX11-NEXT: v_or_b32_e32 v0, v27, v0
+; GFX11-NEXT: v_or_b32_e32 v1, v29, v1
+; GFX11-NEXT: v_or_b32_e32 v2, v54, v2
+; GFX11-NEXT: v_or_b32_e32 v3, v55, v3
+; GFX11-NEXT: v_or_b32_e32 v11, v17, v11
+; GFX11-NEXT: v_or_b32_e32 v12, v19, v12
+; GFX11-NEXT: v_or_b32_e32 v14, v21, v14
+; GFX11-NEXT: v_or_b32_e32 v15, v23, v15
+; GFX11-NEXT: v_or_b32_e32 v16, v25, v16
+; GFX11-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
+; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
+; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
+; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x300, v3
+; GFX11-NEXT: v_add_nc_u32_e32 v11, 0x300, v11
+; GFX11-NEXT: v_add_nc_u32_e32 v12, 0x300, v12
+; GFX11-NEXT: v_add_nc_u32_e32 v14, 0x300, v14
+; GFX11-NEXT: v_add_nc_u32_e32 v15, 0x300, v15
+; GFX11-NEXT: v_add_nc_u32_e32 v16, 0x300, v16
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-NEXT: v_lshlrev_b32_e32 v17, 16, v11
+; GFX11-NEXT: v_and_b32_e32 v18, 0xffff, v12
+; GFX11-NEXT: v_lshlrev_b32_e32 v14, 16, v14
+; GFX11-NEXT: v_and_b32_e32 v15, 0xffff, v15
+; GFX11-NEXT: v_lshlrev_b32_e32 v16, 16, v16
+; GFX11-NEXT: v_or_b32_e32 v11, v0, v1
+; GFX11-NEXT: v_or_b32_e32 v12, v2, v3
+; GFX11-NEXT: v_or_b32_e32 v13, v13, v17
+; GFX11-NEXT: v_or_b32_e32 v14, v18, v14
+; GFX11-NEXT: v_or_b32_e32 v15, v15, v16
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-NEXT: .LBB71_3: ; %end
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-NEXT: .LBB71_4:
+; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+; GFX11-NEXT: s_branch .LBB71_2
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -54758,107 +53603,107 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) {
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: s_clause 0x1f
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v64, off, s32 offset:128
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:124
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v64, off, s32 offset:120
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:116
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v65, off, s32 offset:112
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:108
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v65, off, s32 offset:104
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:100
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v66, off, s32 offset:96
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:92
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v66, off, s32 offset:88
-; GFX11-TRUE16-NEXT: scratch_load_b32 v81, off, s32 offset:132
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v67, off, s32
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v67, off, s32 offset:8
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v68, off, s32 offset:16
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v68, off, s32 offset:24
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v69, off, s32 offset:32
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v69, off, s32 offset:40
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v70, off, s32 offset:48
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v70, off, s32 offset:56
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v71, off, s32 offset:64
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v71, off, s32 offset:72
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v80, off, s32 offset:80
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v33, off, s32 offset:84
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:76
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v34, off, s32 offset:68
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:60
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v35, off, s32 offset:52
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v36, off, s32 offset:44
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v36, off, s32 offset:36
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v37, off, s32 offset:28
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v37, off, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_load_u16 v80, off, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_load_u16 v48, off, s32 offset:124
+; GFX11-TRUE16-NEXT: scratch_load_u16 v81, off, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_load_u16 v49, off, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_load_u16 v82, off, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_load_u16 v50, off, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_load_u16 v83, off, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_load_u16 v51, off, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_load_u16 v84, off, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_load_u16 v52, off, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_load_u16 v85, off, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_load_b32 v86, off, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_load_u16 v87, off, s32
+; GFX11-TRUE16-NEXT: scratch_load_u16 v96, off, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_load_u16 v97, off, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_load_u16 v98, off, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_load_u16 v99, off, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_load_u16 v100, off, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_load_u16 v101, off, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_load_u16 v102, off, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_load_u16 v103, off, s32 offset:64
+; GFX11-TRUE16-NEXT: scratch_load_u16 v112, off, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_load_u16 v113, off, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_load_u16 v53, off, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_load_u16 v54, off, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_load_u16 v55, off, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_load_u16 v64, off, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_load_u16 v65, off, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_load_u16 v66, off, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_load_u16 v67, off, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_load_u16 v68, off, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_load_u16 v69, off, s32 offset:20
; GFX11-TRUE16-NEXT: s_clause 0x1
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:12
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v38, off, s32 offset:4
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v80.h, v29.l
+; GFX11-TRUE16-NEXT: scratch_load_u16 v70, off, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_load_u16 v71, off, s32 offset:4
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(32)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.h, v29.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, v27.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, v22.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.h, v20.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.h, v18.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v16.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v14.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.h, v12.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.l, v10.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.h, v8.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.h, v6.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.l, v4.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.h, v2.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.h, v0.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.h, 8, v1.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.l, 8, v3.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.l, 8, v5.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.l, 8, v7.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.l, 8, v9.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.h, 8, v11.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.l, 8, v13.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.h, 8, v15.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v39.h, 8, v17.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v48.l, 8, v19.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, v14.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.h, v12.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v10.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.h, v8.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v6.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v4.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.h, v2.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.h, v0.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v38.h, 8, v1.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v39.l, 8, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v37.l, 8, v5.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v38.l, 8, v7.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v35.l, 8, v9.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v35.h, 8, v11.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.l, 8, v13.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v15.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v31.h, 8, v17.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v32.l, 8, v19.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v29.h, 8, v21.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v30.h, 8, v23.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v27.l, 8, v25.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v27.h, 8, v27.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.l, 8, v80.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.l, 8, v48.h
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(33)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v64.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v80.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(31)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v64.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v81.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(29)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v65.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v82.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(27)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v65.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v83.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(25)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v66.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v84.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(23)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v66.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v85.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(21)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.h, 8, v67.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.h, 8, v87.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(20)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.l, 8, v67.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.l, 8, v96.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(19)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.h, 8, v68.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.h, 8, v97.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(18)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.l, 8, v68.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.l, 8, v98.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(17)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v69.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v99.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(16)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v69.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v100.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(15)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v70.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v101.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(14)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.l, 8, v70.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.l, 8, v102.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(13)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v71.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v103.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(12)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v71.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v112.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(11)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v80.l
-; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v81
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v113.l
+; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v86
; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB86_3
; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow
@@ -54869,95 +53714,95 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-TRUE16-NEXT: .LBB86_3: ; %cmp.false
-; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v55.h
-; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v53.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.h, 0
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v52.h
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v52.l
-; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v0.l, v54.h
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v55.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v64.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v1.l, v54.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v64.h
-; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v49.h
-; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v49.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v64, v0
-; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v1.h, v53.l
+; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v39.h
+; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v37.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v80.h, 0
+; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v36.h
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v36.l
+; GFX11-TRUE16-NEXT: v_or_b16 v80.l, v0.l, v38.h
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v39.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v80.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v1.l, v38.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v80.h
+; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v33.h
+; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v33.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v80, v0
+; GFX11-TRUE16-NEXT: v_or_b16 v80.l, v1.h, v37.l
; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v29.l
; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v28.h
-; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v51.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v35.h
; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v26.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v64, v3
-; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v2.l, v51.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v64.h
-; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v48.h
-; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v39.l
-; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v48.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v80, v3
+; GFX11-TRUE16-NEXT: v_or_b16 v80.l, v2.l, v35.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v80.h
+; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v32.h
+; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v31.l
+; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v32.l
; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v24.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v64, v2
-; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v3.l, v50.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v50.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v64.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v80, v2
+; GFX11-TRUE16-NEXT: v_or_b16 v80.l, v3.l, v34.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v34.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v80.h
; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v30.h
; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v24.l
; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v26.l
; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v28.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v64, v3
-; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v4.l, v39.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v64.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v80, v3
+; GFX11-TRUE16-NEXT: v_or_b16 v80.l, v4.l, v31.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v80.h
; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v27.h
; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v30.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v38.h
-; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v38.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v64, v4
-; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v5.l, v29.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v64.h
+; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v71.l
+; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v70.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v80, v4
+; GFX11-TRUE16-NEXT: v_or_b16 v80.l, v5.l, v29.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v80.h
; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v25.h
; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v23.h
-; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v37.h
-; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v37.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v64, v5
-; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v6.l, v27.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v64.h
-; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v36.h
+; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v69.l
+; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v68.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v80, v5
+; GFX11-TRUE16-NEXT: v_or_b16 v80.l, v6.l, v27.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v80.h
+; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v67.l
; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v22.h
-; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v36.l
-; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v35.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v64, v6
-; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v7.l, v25.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v64.h
+; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v66.l
+; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v65.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v80, v6
+; GFX11-TRUE16-NEXT: v_or_b16 v80.l, v7.l, v25.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v80.h
; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v21.h
-; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v35.l
-; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v34.h
-; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v34.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v64, v7
-; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v8.l, v23.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v64.h
+; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v64.l
+; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v55.l
+; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v54.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v80, v7
+; GFX11-TRUE16-NEXT: v_or_b16 v80.l, v8.l, v23.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v80.h
; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v11.h, v20.h
; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v19.h
-; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v33.h
-; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v33.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v64, v8
-; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v9.l, v22.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v64.h
-; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v32.h
+; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v53.l
+; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v52.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v80, v8
+; GFX11-TRUE16-NEXT: v_or_b16 v80.l, v9.l, v22.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v80.h
+; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v51.l
; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v13.h, v18.h
-; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v32.l
-; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v31.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v64, v9
-; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v10.l, v21.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v64.h
+; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v50.l
+; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v49.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v80, v9
+; GFX11-TRUE16-NEXT: v_or_b16 v80.l, v10.l, v21.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v80.h
; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v17.h
-; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v31.l
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16
+; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v48.l
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_hi16
@@ -54966,32 +53811,32 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_lo16
@@ -55009,199 +53854,199 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_hi16
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v64, v10
-; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v11.l, v20.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v64.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v80, v10
+; GFX11-TRUE16-NEXT: v_or_b16 v80.l, v11.l, v20.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v80.h
; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v15.h, v16.h
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_hi16
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v64, v11
-; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v12.l, v19.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v64.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v80, v11
+; GFX11-TRUE16-NEXT: v_or_b16 v80.l, v12.l, v19.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v80.h
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v64, v12
-; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v13.l, v18.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v64.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v80, v12
+; GFX11-TRUE16-NEXT: v_or_b16 v80.l, v13.l, v18.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v80.h
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v64, v13
-; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v14.l, v17.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v64.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v80, v13
+; GFX11-TRUE16-NEXT: v_or_b16 v80.l, v14.l, v17.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v80.h
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v64, v14
-; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v15.l, v16.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v64.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v80, v14
+; GFX11-TRUE16-NEXT: v_or_b16 v80.l, v15.l, v16.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v80.h
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_lo16
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v64, v15
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v80, v15
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB86_2
; GFX11-TRUE16-NEXT: .LBB86_4: ; %cmp.true
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v55.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v53.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v52.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v52.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.h, 0
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v39.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v37.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v36.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v36.l, 3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, 0
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v52.h
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v54.h, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v55.l, v0.h
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v54.l, v1.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v53.l, v1.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v49.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v0.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v36.h
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v38.h, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v39.l, v0.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v38.l, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v37.l, v1.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v33.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v36.l, 0x300, v0.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v0.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v49.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v33.l, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, 0x300, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v48.h, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v52, v3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v1.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v32.h, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v36, v3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v36.l, 0x300, v1.h
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v39.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v52.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v51.l, v1.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v31.l, 3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v36.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v35.l, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v51.h, v1.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v35.h, v1.h
; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v52, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v3.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v52.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v36, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v36.l, 0x300, v3.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v36.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v3.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v50.l, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v50.h, v2.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v34.l, v2.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v34.h, v2.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v29.l, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v28.h, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v52, v5
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v3.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v36, v5
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v36.l, 0x300, v3.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v3.h
; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v4.l
; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v4.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v26.h, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v24.h, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v52.h
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v39.h, v3.l
-; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v48.l, v3.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v36.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v31.h, v3.l
+; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v32.l, v3.h
; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l
; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v52, v6
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v5.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v36, v6
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v36.l, 0x300, v5.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x300, v5.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v52.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v36.h
; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v29.h, v4.l
; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v30.h, v4.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v24.l, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v26.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v52, v7
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v5.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v36, v7
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v36.l, 0x300, v5.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, 0x300, v5.h
; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v6.l
; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v6.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v28.l, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v30.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v52.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v36.h
; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v27.l, v5.l
; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v27.h, v5.h
; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l
; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v52, v8
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v7.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v36, v8
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v36.l, 0x300, v7.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, 0x300, v7.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v52.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v36.h
; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v25.l, v6.l
; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v25.h, v6.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v38.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v38.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v52, v9
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v7.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v71.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v70.l, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v36, v9
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v36.l, 0x300, v7.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, 0x300, v7.h
; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v8.l
; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v8.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v37.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v37.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v52.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v69.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v68.l, 3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v36.h
; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v23.l, v7.l
; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v23.h, v7.h
; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l
; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v52, v10
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v9.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v36, v10
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v36.l, 0x300, v9.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v9.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v52.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v36.h
; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v22.l, v8.l
; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v22.h, v8.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, v36.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, v36.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v52, v11
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v9.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, v67.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, v66.l, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v36, v11
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v36.l, 0x300, v9.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, 0x300, v9.h
; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v10.l
; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v10.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, v35.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, v35.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v52.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, v65.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, v64.l, 3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v36.h
; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v21.l, v9.l
; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v21.h, v9.h
; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l
; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v10.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v52, v12
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v11.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v36, v12
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v36.l, 0x300, v11.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, 0x300, v11.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v52.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v36.h
; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v20.l, v10.l
; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v20.h, v10.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v34.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v34.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v52, v13
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v11.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v55.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v54.l, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v36, v13
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v36.l, 0x300, v11.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, 0x300, v11.h
; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v12.l
; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v12.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v33.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v33.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v52.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v53.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v52.l, 3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v36.h
; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v19.l, v11.l
; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v19.h, v11.h
; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l
; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v52, v14
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v13.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v36, v14
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v36.l, 0x300, v13.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.h, 0x300, v13.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v52.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v36.h
; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v18.l, v12.l
; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v18.h, v12.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v32.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, v32.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v52, v15
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v13.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v51.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, v50.l, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v36, v15
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v36.l, 0x300, v13.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, 0x300, v13.h
; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v14.l
; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v14.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v31.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, v31.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v52.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v49.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, v48.l, 3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v36.h
; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v17.l, v13.l
; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v17.h, v13.h
; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l
; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v14.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v52, v18
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v15.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v36, v18
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v36.l, 0x300, v15.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.h, 0x300, v15.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v52.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v36.h
; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v16.l, v14.l
; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v16.h, v14.h
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v52, v17
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v15.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v36, v17
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v36.l, 0x300, v15.l
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.h, 0x300, v15.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v52.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v52, v15
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v36.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v36, v15
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -56981,775 +55826,390 @@ define inreg <8 x double> @bitcast_v64i8_to_v8f64_scalar(<64 x i8> inreg %a, i32
; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
; GFX9-NEXT: s_branch .LBB87_2
;
-; GFX11-TRUE16-LABEL: bitcast_v64i8_to_v8f64_scalar:
-; GFX11-TRUE16: ; %bb.0:
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v38, v14 :: v_dual_mov_b32 v37, v12
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v36, v10 :: v_dual_mov_b32 v35, v8
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v34, v6 :: v_dual_mov_b32 v33, v4
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v32, v2 :: v_dual_mov_b32 v31, v0
-; GFX11-TRUE16-NEXT: s_clause 0xf
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v0, off, s32 offset:56
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v39, off, s32 offset:52
-; GFX11-TRUE16-NEXT: scratch_load_b32 v2, off, s32 offset:60
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v4, off, s32
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v6, off, s32 offset:8
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v8, off, s32 offset:16
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v10, off, s32 offset:24
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v12, off, s32 offset:32
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v14, off, s32 offset:40
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v86, off, s32 offset:48
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v48, off, s32 offset:44
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v49, off, s32 offset:36
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v50, off, s32 offset:28
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v51, off, s32 offset:20
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v52, off, s32 offset:12
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v53, off, s32 offset:4
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v83, 8, v1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v84, 8, v3
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v85, 8, v5
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v70, 8, v7
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v71, 8, v9
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v80, 8, v11
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v81, 8, v13
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v82, 8, v15
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v65, 8, v17
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v66, 8, v19
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v67, 8, v21
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v68, 8, v23
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v69, 8, v25
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v27, 8, v27
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v29, 8, v29
-; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(15)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 8, v0
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(13)
-; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(12)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v54, 8, v4
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(11)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v55, 8, v6
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(10)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v64, 8, v8
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(9)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v10
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(8)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 8, v12
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 8, v14
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(6)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 8, v86
-; GFX11-TRUE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo
-; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB87_4
-; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-TRUE16-NEXT: s_and_b32 s5, s0, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s1, 8
-; GFX11-TRUE16-NEXT: s_and_b32 s7, s2, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s3, 8
-; GFX11-TRUE16-NEXT: s_or_b32 s5, s5, s6
-; GFX11-TRUE16-NEXT: s_or_b32 s6, s7, s8
-; GFX11-TRUE16-NEXT: s_and_b32 s5, s5, 0xffff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s6, 16
-; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s17, 8
-; GFX11-TRUE16-NEXT: s_or_b32 s5, s5, s6
-; GFX11-TRUE16-NEXT: s_and_b32 s6, s16, 0xff
-; GFX11-TRUE16-NEXT: s_and_b32 s8, s18, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s19, 8
-; GFX11-TRUE16-NEXT: s_or_b32 s6, s6, s7
-; GFX11-TRUE16-NEXT: s_or_b32 s7, s8, s9
-; GFX11-TRUE16-NEXT: s_and_b32 s6, s6, 0xffff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s7, 16
-; GFX11-TRUE16-NEXT: s_and_b32 s8, s20, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s21, 8
-; GFX11-TRUE16-NEXT: s_or_b32 s6, s6, s7
-; GFX11-TRUE16-NEXT: s_or_b32 s7, s8, s9
-; GFX11-TRUE16-NEXT: s_and_b32 s8, s22, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s23, 8
-; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s25, 8
-; GFX11-TRUE16-NEXT: s_or_b32 s8, s8, s9
-; GFX11-TRUE16-NEXT: s_and_b32 s9, s24, 0xff
-; GFX11-TRUE16-NEXT: s_and_b32 s7, s7, 0xffff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s8, 16
-; GFX11-TRUE16-NEXT: s_or_b32 s9, s9, s10
-; GFX11-TRUE16-NEXT: s_or_b32 s7, s7, s8
-; GFX11-TRUE16-NEXT: s_and_b32 s8, s9, 0xffff
-; GFX11-TRUE16-NEXT: s_and_b32 s9, s26, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s27, 8
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v31
-; GFX11-TRUE16-NEXT: s_or_b32 s9, s9, s10
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v32
-; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s9, 16
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v38
-; GFX11-TRUE16-NEXT: s_or_b32 s8, s8, s9
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v33
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v83
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v84
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xff, v22
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v24
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v85
-; GFX11-TRUE16-NEXT: s_and_b32 s11, s28, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s12, s29, 8
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v6, v82
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v10, v68
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v11, v69
-; GFX11-TRUE16-NEXT: s_or_b32 s10, s11, s12
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v1, v2
-; GFX11-TRUE16-NEXT: s_and_b32 s10, s10, 0xffff
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v35
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, s10, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v34
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v36
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v37
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v6
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v16
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v18
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xff, v20
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v70
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v71
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v80
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v81
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v6, v65
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v7, v66
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v9, v67
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v10, v11
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v52
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v6
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v7
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v11, v64
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v0, v1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v2, v3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v8, v12
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v13, v9
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v26
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v28
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v30
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v53
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v11
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v51
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xff, v50
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v49
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xff, v48
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v86, 0xff, v39
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v27
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v29
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v54
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v55
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v11, v17
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v12, v19
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v14, v21
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v15, v23
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v86, v86, v25
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v87, 16, v11
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v96, 0xffff, v12
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 16, v14
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v15
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v86, 16, v86
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v0, v1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v2, v3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v13, v87
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v96, v14
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v15, v86
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, s6
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s7 :: v_dual_mov_b32 v3, s8
-; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
-; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB87_3
-; GFX11-TRUE16-NEXT: .LBB87_2: ; %cmp.true
-; GFX11-TRUE16-NEXT: s_add_i32 s0, s0, 3
-; GFX11-TRUE16-NEXT: s_add_i32 s2, s2, 3
-; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 8
-; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8
-; GFX11-TRUE16-NEXT: s_or_b32 s0, s1, s0
-; GFX11-TRUE16-NEXT: s_or_b32 s1, s3, s2
-; GFX11-TRUE16-NEXT: s_addk_i32 s0, 0x300
-; GFX11-TRUE16-NEXT: s_addk_i32 s1, 0x300
-; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xffff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 16
-; GFX11-TRUE16-NEXT: s_add_i32 s16, s16, 3
-; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s1
-; GFX11-TRUE16-NEXT: s_and_b32 s1, s16, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s17, 8
-; GFX11-TRUE16-NEXT: s_add_i32 s18, s18, 3
-; GFX11-TRUE16-NEXT: s_or_b32 s1, s2, s1
-; GFX11-TRUE16-NEXT: s_and_b32 s2, s18, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s19, 8
-; GFX11-TRUE16-NEXT: s_addk_i32 s1, 0x300
-; GFX11-TRUE16-NEXT: s_or_b32 s2, s3, s2
-; GFX11-TRUE16-NEXT: s_add_i32 s20, s20, 3
-; GFX11-TRUE16-NEXT: s_addk_i32 s2, 0x300
-; GFX11-TRUE16-NEXT: s_and_b32 s1, s1, 0xffff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s2, 16
-; GFX11-TRUE16-NEXT: s_and_b32 s3, s20, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s21, 8
-; GFX11-TRUE16-NEXT: s_add_i32 s22, s22, 3
-; GFX11-TRUE16-NEXT: s_or_b32 s1, s1, s2
-; GFX11-TRUE16-NEXT: s_or_b32 s2, s4, s3
-; GFX11-TRUE16-NEXT: s_and_b32 s3, s22, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s23, 8
-; GFX11-TRUE16-NEXT: s_add_i32 s24, s24, 3
-; GFX11-TRUE16-NEXT: s_or_b32 s3, s4, s3
-; GFX11-TRUE16-NEXT: s_and_b32 s4, s24, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s25, 8
-; GFX11-TRUE16-NEXT: s_addk_i32 s2, 0x300
-; GFX11-TRUE16-NEXT: s_addk_i32 s3, 0x300
-; GFX11-TRUE16-NEXT: s_or_b32 s4, s5, s4
-; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xffff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 16
-; GFX11-TRUE16-NEXT: s_addk_i32 s4, 0x300
-; GFX11-TRUE16-NEXT: s_add_i32 s26, s26, 3
-; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3
-; GFX11-TRUE16-NEXT: s_and_b32 s3, s4, 0xffff
-; GFX11-TRUE16-NEXT: s_and_b32 s4, s26, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s27, 8
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v31
-; GFX11-TRUE16-NEXT: s_or_b32 s4, s5, s4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v32
-; GFX11-TRUE16-NEXT: s_addk_i32 s4, 0x300
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v38
-; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s4, 16
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT: s_or_b32 s3, s3, s4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v33
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 3, v22
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 3, v24
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v83, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v2
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v84, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v6
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xff, v10
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v11
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v85, v2
-; GFX11-TRUE16-NEXT: s_add_i32 s28, s28, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v82, v6
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v68, v10
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v69, v11
-; GFX11-TRUE16-NEXT: s_and_b32 s6, s28, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s29, 8
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX11-TRUE16-NEXT: s_or_b32 s5, s7, s6
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x300, v6
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 0x300, v10
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 0x300, v11
-; GFX11-TRUE16-NEXT: s_addk_i32 s5, 0x300
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v1, v2
-; GFX11-TRUE16-NEXT: s_and_b32 s5, s5, 0xffff
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v35
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, s5, v0
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v34
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v36
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v37
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v6
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v16
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v18
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v20
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v6
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v7
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xff, v9
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v10, v11
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 3, v52
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v70, v0
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v71, v1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v80, v2
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v81, v3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v65, v6
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v66, v7
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v67, v9
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v11
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x300, v6
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 0x300, v7
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 0x300, v9
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v64, v11
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v6
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v7
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 0x300, v11
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v0, v1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v2, v3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v8, v12
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v13, v9
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v26
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v28
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v30
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v53
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v11
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 3, v51
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 3, v50
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, 3, v49
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v15, 3, v48
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 3, v39
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v11
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xff, v12
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v14
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xff, v15
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xff, v16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v27, v0
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v29, v1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v54, v2
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v55, v3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v17, v11
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v19, v12
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v21, v14
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v23, v15
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v25, v16
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 0x300, v11
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 0x300, v12
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, 0x300, v14
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v15, 0x300, v15
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 0x300, v16
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v11
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v12
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 16, v14
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v15
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 16, v16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v0, v1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v2, v3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v13, v17
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v18, v14
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v15, v16
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX11-TRUE16-NEXT: .LBB87_3: ; %end
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
-; GFX11-TRUE16-NEXT: .LBB87_4:
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
-; GFX11-TRUE16-NEXT: s_branch .LBB87_2
-;
-; GFX11-FAKE16-LABEL: bitcast_v64i8_to_v8f64_scalar:
-; GFX11-FAKE16: ; %bb.0:
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v38, v14 :: v_dual_mov_b32 v37, v12
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v36, v10 :: v_dual_mov_b32 v35, v8
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v34, v6 :: v_dual_mov_b32 v33, v4
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v32, v2 :: v_dual_mov_b32 v31, v0
-; GFX11-FAKE16-NEXT: s_clause 0xf
-; GFX11-FAKE16-NEXT: scratch_load_u16 v0, off, s32 offset:56
-; GFX11-FAKE16-NEXT: scratch_load_u16 v39, off, s32 offset:52
-; GFX11-FAKE16-NEXT: scratch_load_b32 v2, off, s32 offset:60
-; GFX11-FAKE16-NEXT: scratch_load_u16 v4, off, s32
-; GFX11-FAKE16-NEXT: scratch_load_u16 v6, off, s32 offset:8
-; GFX11-FAKE16-NEXT: scratch_load_u16 v8, off, s32 offset:16
-; GFX11-FAKE16-NEXT: scratch_load_u16 v10, off, s32 offset:24
-; GFX11-FAKE16-NEXT: scratch_load_u16 v12, off, s32 offset:32
-; GFX11-FAKE16-NEXT: scratch_load_u16 v14, off, s32 offset:40
-; GFX11-FAKE16-NEXT: scratch_load_u16 v86, off, s32 offset:48
-; GFX11-FAKE16-NEXT: scratch_load_u16 v48, off, s32 offset:44
-; GFX11-FAKE16-NEXT: scratch_load_u16 v49, off, s32 offset:36
-; GFX11-FAKE16-NEXT: scratch_load_u16 v50, off, s32 offset:28
-; GFX11-FAKE16-NEXT: scratch_load_u16 v51, off, s32 offset:20
-; GFX11-FAKE16-NEXT: scratch_load_u16 v52, off, s32 offset:12
-; GFX11-FAKE16-NEXT: scratch_load_u16 v53, off, s32 offset:4
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v83, 8, v1
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v84, 8, v3
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v85, 8, v5
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v70, 8, v7
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v71, 8, v9
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v80, 8, v11
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v81, 8, v13
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v82, 8, v15
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v65, 8, v17
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v66, 8, v19
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v67, 8, v21
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v68, 8, v23
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v69, 8, v25
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v27, 8, v27
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v29, 8, v29
-; GFX11-FAKE16-NEXT: s_mov_b32 s4, 0
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(15)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v25, 8, v0
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(13)
-; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(12)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v54, 8, v4
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(11)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v55, 8, v6
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(10)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v64, 8, v8
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(9)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v17, 8, v10
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(8)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v19, 8, v12
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(7)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v21, 8, v14
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(6)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v23, 8, v86
-; GFX11-FAKE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo
-; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB87_4
-; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-FAKE16-NEXT: s_and_b32 s5, s0, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s1, 8
-; GFX11-FAKE16-NEXT: s_and_b32 s7, s2, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s3, 8
-; GFX11-FAKE16-NEXT: s_or_b32 s5, s5, s6
-; GFX11-FAKE16-NEXT: s_or_b32 s6, s7, s8
-; GFX11-FAKE16-NEXT: s_and_b32 s5, s5, 0xffff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s6, 16
-; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s17, 8
-; GFX11-FAKE16-NEXT: s_or_b32 s5, s5, s6
-; GFX11-FAKE16-NEXT: s_and_b32 s6, s16, 0xff
-; GFX11-FAKE16-NEXT: s_and_b32 s8, s18, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s9, s19, 8
-; GFX11-FAKE16-NEXT: s_or_b32 s6, s6, s7
-; GFX11-FAKE16-NEXT: s_or_b32 s7, s8, s9
-; GFX11-FAKE16-NEXT: s_and_b32 s6, s6, 0xffff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s7, 16
-; GFX11-FAKE16-NEXT: s_and_b32 s8, s20, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s9, s21, 8
-; GFX11-FAKE16-NEXT: s_or_b32 s6, s6, s7
-; GFX11-FAKE16-NEXT: s_or_b32 s7, s8, s9
-; GFX11-FAKE16-NEXT: s_and_b32 s8, s22, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s9, s23, 8
-; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s25, 8
-; GFX11-FAKE16-NEXT: s_or_b32 s8, s8, s9
-; GFX11-FAKE16-NEXT: s_and_b32 s9, s24, 0xff
-; GFX11-FAKE16-NEXT: s_and_b32 s7, s7, 0xffff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s8, 16
-; GFX11-FAKE16-NEXT: s_or_b32 s9, s9, s10
-; GFX11-FAKE16-NEXT: s_or_b32 s7, s7, s8
-; GFX11-FAKE16-NEXT: s_and_b32 s8, s9, 0xffff
-; GFX11-FAKE16-NEXT: s_and_b32 s9, s26, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s27, 8
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v31
-; GFX11-FAKE16-NEXT: s_or_b32 s9, s9, s10
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v32
-; GFX11-FAKE16-NEXT: s_lshl_b32 s9, s9, 16
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v38
-; GFX11-FAKE16-NEXT: s_or_b32 s8, s8, s9
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v33
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v83
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v84
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v22
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v24
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v85
-; GFX11-FAKE16-NEXT: s_and_b32 s11, s28, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s12, s29, 8
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v6, v82
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v68
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v11, v69
-; GFX11-FAKE16-NEXT: s_or_b32 s10, s11, s12
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v1, v2
-; GFX11-FAKE16-NEXT: s_and_b32 s10, s10, 0xffff
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v35
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, s10, v0
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v34
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v36
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v37
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff, v6
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v16
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v18
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v20
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v10
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v70
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v71
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v80
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v81
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v6, v65
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v7, v66
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v9, v67
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v11
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1)
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v52
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v12, 16, v6
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xffff, v7
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v11, v64
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v0, v1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v2, v3
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v8, v12
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v13, v9
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v26
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v28
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v30
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v53
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xffff, v11
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v51
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v50
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v49
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v48
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v86, 0xff, v39
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v27
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v29
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v54
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v55
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v11, v17
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v12, v19
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v14, v21
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v15, v23
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v86, v86, v25
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v87, 16, v11
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v96, 0xffff, v12
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v14, 16, v14
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff, v15
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v86, 16, v86
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v0, v1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v2, v3
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v13, v87
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v96, v14
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v15, v86
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, s6
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s7 :: v_dual_mov_b32 v3, s8
-; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
-; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB87_3
-; GFX11-FAKE16-NEXT: .LBB87_2: ; %cmp.true
-; GFX11-FAKE16-NEXT: s_add_i32 s0, s0, 3
-; GFX11-FAKE16-NEXT: s_add_i32 s2, s2, 3
-; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 8
-; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 8
-; GFX11-FAKE16-NEXT: s_or_b32 s0, s1, s0
-; GFX11-FAKE16-NEXT: s_or_b32 s1, s3, s2
-; GFX11-FAKE16-NEXT: s_addk_i32 s0, 0x300
-; GFX11-FAKE16-NEXT: s_addk_i32 s1, 0x300
-; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xffff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 16
-; GFX11-FAKE16-NEXT: s_add_i32 s16, s16, 3
-; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s1
-; GFX11-FAKE16-NEXT: s_and_b32 s1, s16, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s17, 8
-; GFX11-FAKE16-NEXT: s_add_i32 s18, s18, 3
-; GFX11-FAKE16-NEXT: s_or_b32 s1, s2, s1
-; GFX11-FAKE16-NEXT: s_and_b32 s2, s18, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s19, 8
-; GFX11-FAKE16-NEXT: s_addk_i32 s1, 0x300
-; GFX11-FAKE16-NEXT: s_or_b32 s2, s3, s2
-; GFX11-FAKE16-NEXT: s_add_i32 s20, s20, 3
-; GFX11-FAKE16-NEXT: s_addk_i32 s2, 0x300
-; GFX11-FAKE16-NEXT: s_and_b32 s1, s1, 0xffff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s2, 16
-; GFX11-FAKE16-NEXT: s_and_b32 s3, s20, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s21, 8
-; GFX11-FAKE16-NEXT: s_add_i32 s22, s22, 3
-; GFX11-FAKE16-NEXT: s_or_b32 s1, s1, s2
-; GFX11-FAKE16-NEXT: s_or_b32 s2, s4, s3
-; GFX11-FAKE16-NEXT: s_and_b32 s3, s22, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s23, 8
-; GFX11-FAKE16-NEXT: s_add_i32 s24, s24, 3
-; GFX11-FAKE16-NEXT: s_or_b32 s3, s4, s3
-; GFX11-FAKE16-NEXT: s_and_b32 s4, s24, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s25, 8
-; GFX11-FAKE16-NEXT: s_addk_i32 s2, 0x300
-; GFX11-FAKE16-NEXT: s_addk_i32 s3, 0x300
-; GFX11-FAKE16-NEXT: s_or_b32 s4, s5, s4
-; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xffff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 16
-; GFX11-FAKE16-NEXT: s_addk_i32 s4, 0x300
-; GFX11-FAKE16-NEXT: s_add_i32 s26, s26, 3
-; GFX11-FAKE16-NEXT: s_or_b32 s2, s2, s3
-; GFX11-FAKE16-NEXT: s_and_b32 s3, s4, 0xffff
-; GFX11-FAKE16-NEXT: s_and_b32 s4, s26, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s27, 8
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v31
-; GFX11-FAKE16-NEXT: s_or_b32 s4, s5, s4
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v32
-; GFX11-FAKE16-NEXT: s_addk_i32 s4, 0x300
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 3, v38
-; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s4, 16
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-FAKE16-NEXT: s_or_b32 s3, s3, s4
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v33
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, 3, v22
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, 3, v24
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v83, v0
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v84, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v6
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xff, v10
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v11
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v85, v2
-; GFX11-FAKE16-NEXT: s_add_i32 s28, s28, 3
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v82, v6
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v68, v10
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v69, v11
-; GFX11-FAKE16-NEXT: s_and_b32 s6, s28, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s29, 8
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX11-FAKE16-NEXT: s_or_b32 s5, s7, s6
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 0x300, v6
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, 0x300, v10
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, 0x300, v11
-; GFX11-FAKE16-NEXT: s_addk_i32 s5, 0x300
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v1, v2
-; GFX11-FAKE16-NEXT: s_and_b32 s5, s5, 0xffff
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v35
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, s5, v0
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v34
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v36
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v37
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff, v6
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 3, v16
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 3, v18
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, 3, v20
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v10
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v6
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v7
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v9
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v11
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1)
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, 3, v52
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v70, v0
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v71, v1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v80, v2
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v81, v3
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v65, v6
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v66, v7
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v67, v9
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v11
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 0x300, v6
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 0x300, v7
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, 0x300, v9
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v64, v11
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v12, 16, v6
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xffff, v7
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, 0x300, v11
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v0, v1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v2, v3
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v8, v12
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v13, v9
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v26
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v28
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v30
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v53
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xffff, v11
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, 3, v51
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, 3, v50
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v14, 3, v49
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v15, 3, v48
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v16, 3, v39
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v11
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v12
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v14
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xff, v15
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xff, v16
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v27, v0
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v29, v1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v54, v2
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v55, v3
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v17, v11
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v19, v12
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v21, v14
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v23, v15
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v25, v16
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, 0x300, v11
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, 0x300, v12
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v14, 0x300, v14
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v15, 0x300, v15
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v16, 0x300, v16
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v17, 16, v11
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff, v12
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v14, 16, v14
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff, v15
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v16, 16, v16
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v0, v1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v2, v3
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v13, v17
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v18, v14
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v15, v16
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX11-FAKE16-NEXT: .LBB87_3: ; %end
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
-; GFX11-FAKE16-NEXT: .LBB87_4:
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
-; GFX11-FAKE16-NEXT: s_branch .LBB87_2
+; GFX11-LABEL: bitcast_v64i8_to_v8f64_scalar:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_dual_mov_b32 v38, v14 :: v_dual_mov_b32 v37, v12
+; GFX11-NEXT: v_dual_mov_b32 v36, v10 :: v_dual_mov_b32 v35, v8
+; GFX11-NEXT: v_dual_mov_b32 v34, v6 :: v_dual_mov_b32 v33, v4
+; GFX11-NEXT: v_dual_mov_b32 v32, v2 :: v_dual_mov_b32 v31, v0
+; GFX11-NEXT: s_clause 0xf
+; GFX11-NEXT: scratch_load_u16 v0, off, s32 offset:56
+; GFX11-NEXT: scratch_load_u16 v39, off, s32 offset:52
+; GFX11-NEXT: scratch_load_b32 v2, off, s32 offset:60
+; GFX11-NEXT: scratch_load_u16 v4, off, s32
+; GFX11-NEXT: scratch_load_u16 v6, off, s32 offset:8
+; GFX11-NEXT: scratch_load_u16 v8, off, s32 offset:16
+; GFX11-NEXT: scratch_load_u16 v10, off, s32 offset:24
+; GFX11-NEXT: scratch_load_u16 v12, off, s32 offset:32
+; GFX11-NEXT: scratch_load_u16 v14, off, s32 offset:40
+; GFX11-NEXT: scratch_load_u16 v86, off, s32 offset:48
+; GFX11-NEXT: scratch_load_u16 v48, off, s32 offset:44
+; GFX11-NEXT: scratch_load_u16 v49, off, s32 offset:36
+; GFX11-NEXT: scratch_load_u16 v50, off, s32 offset:28
+; GFX11-NEXT: scratch_load_u16 v51, off, s32 offset:20
+; GFX11-NEXT: scratch_load_u16 v52, off, s32 offset:12
+; GFX11-NEXT: scratch_load_u16 v53, off, s32 offset:4
+; GFX11-NEXT: v_lshlrev_b32_e32 v83, 8, v1
+; GFX11-NEXT: v_lshlrev_b32_e32 v84, 8, v3
+; GFX11-NEXT: v_lshlrev_b32_e32 v85, 8, v5
+; GFX11-NEXT: v_lshlrev_b32_e32 v70, 8, v7
+; GFX11-NEXT: v_lshlrev_b32_e32 v71, 8, v9
+; GFX11-NEXT: v_lshlrev_b32_e32 v80, 8, v11
+; GFX11-NEXT: v_lshlrev_b32_e32 v81, 8, v13
+; GFX11-NEXT: v_lshlrev_b32_e32 v82, 8, v15
+; GFX11-NEXT: v_lshlrev_b32_e32 v65, 8, v17
+; GFX11-NEXT: v_lshlrev_b32_e32 v66, 8, v19
+; GFX11-NEXT: v_lshlrev_b32_e32 v67, 8, v21
+; GFX11-NEXT: v_lshlrev_b32_e32 v68, 8, v23
+; GFX11-NEXT: v_lshlrev_b32_e32 v69, 8, v25
+; GFX11-NEXT: v_lshlrev_b32_e32 v27, 8, v27
+; GFX11-NEXT: v_lshlrev_b32_e32 v29, 8, v29
+; GFX11-NEXT: s_mov_b32 s4, 0
+; GFX11-NEXT: s_waitcnt vmcnt(15)
+; GFX11-NEXT: v_lshlrev_b32_e32 v25, 8, v0
+; GFX11-NEXT: s_waitcnt vmcnt(13)
+; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2
+; GFX11-NEXT: s_waitcnt vmcnt(12)
+; GFX11-NEXT: v_lshlrev_b32_e32 v54, 8, v4
+; GFX11-NEXT: s_waitcnt vmcnt(11)
+; GFX11-NEXT: v_lshlrev_b32_e32 v55, 8, v6
+; GFX11-NEXT: s_waitcnt vmcnt(10)
+; GFX11-NEXT: v_lshlrev_b32_e32 v64, 8, v8
+; GFX11-NEXT: s_waitcnt vmcnt(9)
+; GFX11-NEXT: v_lshlrev_b32_e32 v17, 8, v10
+; GFX11-NEXT: s_waitcnt vmcnt(8)
+; GFX11-NEXT: v_lshlrev_b32_e32 v19, 8, v12
+; GFX11-NEXT: s_waitcnt vmcnt(7)
+; GFX11-NEXT: v_lshlrev_b32_e32 v21, 8, v14
+; GFX11-NEXT: s_waitcnt vmcnt(6)
+; GFX11-NEXT: v_lshlrev_b32_e32 v23, 8, v86
+; GFX11-NEXT: s_and_b32 s5, vcc_lo, exec_lo
+; GFX11-NEXT: s_cbranch_scc0 .LBB87_4
+; GFX11-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-NEXT: s_and_b32 s5, s0, 0xff
+; GFX11-NEXT: s_lshl_b32 s6, s1, 8
+; GFX11-NEXT: s_and_b32 s7, s2, 0xff
+; GFX11-NEXT: s_lshl_b32 s8, s3, 8
+; GFX11-NEXT: s_or_b32 s5, s5, s6
+; GFX11-NEXT: s_or_b32 s6, s7, s8
+; GFX11-NEXT: s_and_b32 s5, s5, 0xffff
+; GFX11-NEXT: s_lshl_b32 s6, s6, 16
+; GFX11-NEXT: s_lshl_b32 s7, s17, 8
+; GFX11-NEXT: s_or_b32 s5, s5, s6
+; GFX11-NEXT: s_and_b32 s6, s16, 0xff
+; GFX11-NEXT: s_and_b32 s8, s18, 0xff
+; GFX11-NEXT: s_lshl_b32 s9, s19, 8
+; GFX11-NEXT: s_or_b32 s6, s6, s7
+; GFX11-NEXT: s_or_b32 s7, s8, s9
+; GFX11-NEXT: s_and_b32 s6, s6, 0xffff
+; GFX11-NEXT: s_lshl_b32 s7, s7, 16
+; GFX11-NEXT: s_and_b32 s8, s20, 0xff
+; GFX11-NEXT: s_lshl_b32 s9, s21, 8
+; GFX11-NEXT: s_or_b32 s6, s6, s7
+; GFX11-NEXT: s_or_b32 s7, s8, s9
+; GFX11-NEXT: s_and_b32 s8, s22, 0xff
+; GFX11-NEXT: s_lshl_b32 s9, s23, 8
+; GFX11-NEXT: s_lshl_b32 s10, s25, 8
+; GFX11-NEXT: s_or_b32 s8, s8, s9
+; GFX11-NEXT: s_and_b32 s9, s24, 0xff
+; GFX11-NEXT: s_and_b32 s7, s7, 0xffff
+; GFX11-NEXT: s_lshl_b32 s8, s8, 16
+; GFX11-NEXT: s_or_b32 s9, s9, s10
+; GFX11-NEXT: s_or_b32 s7, s7, s8
+; GFX11-NEXT: s_and_b32 s8, s9, 0xffff
+; GFX11-NEXT: s_and_b32 s9, s26, 0xff
+; GFX11-NEXT: s_lshl_b32 s10, s27, 8
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v31
+; GFX11-NEXT: s_or_b32 s9, s9, s10
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v32
+; GFX11-NEXT: s_lshl_b32 s9, s9, 16
+; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v38
+; GFX11-NEXT: s_or_b32 s8, s8, s9
+; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v33
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v83
+; GFX11-NEXT: v_or_b32_e32 v1, v1, v84
+; GFX11-NEXT: v_and_b32_e32 v10, 0xff, v22
+; GFX11-NEXT: v_and_b32_e32 v11, 0xff, v24
+; GFX11-NEXT: v_or_b32_e32 v2, v2, v85
+; GFX11-NEXT: s_and_b32 s11, s28, 0xff
+; GFX11-NEXT: s_lshl_b32 s12, s29, 8
+; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-NEXT: v_or_b32_e32 v6, v6, v82
+; GFX11-NEXT: v_or_b32_e32 v10, v10, v68
+; GFX11-NEXT: v_or_b32_e32 v11, v11, v69
+; GFX11-NEXT: s_or_b32 s10, s11, s12
+; GFX11-NEXT: v_or_b32_e32 v5, v1, v2
+; GFX11-NEXT: s_and_b32 s10, s10, 0xffff
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v35
+; GFX11-NEXT: v_or_b32_e32 v4, s10, v0
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v34
+; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v36
+; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v37
+; GFX11-NEXT: v_and_b32_e32 v8, 0xffff, v6
+; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v16
+; GFX11-NEXT: v_and_b32_e32 v7, 0xff, v18
+; GFX11-NEXT: v_and_b32_e32 v9, 0xff, v20
+; GFX11-NEXT: v_and_b32_e32 v10, 0xffff, v10
+; GFX11-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v70
+; GFX11-NEXT: v_or_b32_e32 v1, v1, v71
+; GFX11-NEXT: v_or_b32_e32 v2, v2, v80
+; GFX11-NEXT: v_or_b32_e32 v3, v3, v81
+; GFX11-NEXT: v_or_b32_e32 v6, v6, v65
+; GFX11-NEXT: v_or_b32_e32 v7, v7, v66
+; GFX11-NEXT: v_or_b32_e32 v9, v9, v67
+; GFX11-NEXT: v_or_b32_e32 v10, v10, v11
+; GFX11-NEXT: s_waitcnt vmcnt(1)
+; GFX11-NEXT: v_and_b32_e32 v11, 0xff, v52
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-NEXT: v_lshlrev_b32_e32 v12, 16, v6
+; GFX11-NEXT: v_and_b32_e32 v13, 0xffff, v7
+; GFX11-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX11-NEXT: v_or_b32_e32 v11, v11, v64
+; GFX11-NEXT: v_or_b32_e32 v6, v0, v1
+; GFX11-NEXT: v_or_b32_e32 v7, v2, v3
+; GFX11-NEXT: v_or_b32_e32 v8, v8, v12
+; GFX11-NEXT: v_or_b32_e32 v9, v13, v9
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v26
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v28
+; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v30
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v53
+; GFX11-NEXT: v_and_b32_e32 v13, 0xffff, v11
+; GFX11-NEXT: v_and_b32_e32 v11, 0xff, v51
+; GFX11-NEXT: v_and_b32_e32 v12, 0xff, v50
+; GFX11-NEXT: v_and_b32_e32 v14, 0xff, v49
+; GFX11-NEXT: v_and_b32_e32 v15, 0xff, v48
+; GFX11-NEXT: v_and_b32_e32 v86, 0xff, v39
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v27
+; GFX11-NEXT: v_or_b32_e32 v1, v1, v29
+; GFX11-NEXT: v_or_b32_e32 v2, v2, v54
+; GFX11-NEXT: v_or_b32_e32 v3, v3, v55
+; GFX11-NEXT: v_or_b32_e32 v11, v11, v17
+; GFX11-NEXT: v_or_b32_e32 v12, v12, v19
+; GFX11-NEXT: v_or_b32_e32 v14, v14, v21
+; GFX11-NEXT: v_or_b32_e32 v15, v15, v23
+; GFX11-NEXT: v_or_b32_e32 v86, v86, v25
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-NEXT: v_lshlrev_b32_e32 v87, 16, v11
+; GFX11-NEXT: v_and_b32_e32 v96, 0xffff, v12
+; GFX11-NEXT: v_lshlrev_b32_e32 v14, 16, v14
+; GFX11-NEXT: v_and_b32_e32 v15, 0xffff, v15
+; GFX11-NEXT: v_lshlrev_b32_e32 v86, 16, v86
+; GFX11-NEXT: v_or_b32_e32 v11, v0, v1
+; GFX11-NEXT: v_or_b32_e32 v12, v2, v3
+; GFX11-NEXT: v_or_b32_e32 v13, v13, v87
+; GFX11-NEXT: v_or_b32_e32 v14, v96, v14
+; GFX11-NEXT: v_or_b32_e32 v15, v15, v86
+; GFX11-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, s6
+; GFX11-NEXT: v_dual_mov_b32 v2, s7 :: v_dual_mov_b32 v3, s8
+; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
+; GFX11-NEXT: s_cbranch_vccnz .LBB87_3
+; GFX11-NEXT: .LBB87_2: ; %cmp.true
+; GFX11-NEXT: s_add_i32 s0, s0, 3
+; GFX11-NEXT: s_add_i32 s2, s2, 3
+; GFX11-NEXT: s_and_b32 s0, s0, 0xff
+; GFX11-NEXT: s_lshl_b32 s1, s1, 8
+; GFX11-NEXT: s_and_b32 s2, s2, 0xff
+; GFX11-NEXT: s_lshl_b32 s3, s3, 8
+; GFX11-NEXT: s_or_b32 s0, s1, s0
+; GFX11-NEXT: s_or_b32 s1, s3, s2
+; GFX11-NEXT: s_addk_i32 s0, 0x300
+; GFX11-NEXT: s_addk_i32 s1, 0x300
+; GFX11-NEXT: s_and_b32 s0, s0, 0xffff
+; GFX11-NEXT: s_lshl_b32 s1, s1, 16
+; GFX11-NEXT: s_add_i32 s16, s16, 3
+; GFX11-NEXT: s_or_b32 s0, s0, s1
+; GFX11-NEXT: s_and_b32 s1, s16, 0xff
+; GFX11-NEXT: s_lshl_b32 s2, s17, 8
+; GFX11-NEXT: s_add_i32 s18, s18, 3
+; GFX11-NEXT: s_or_b32 s1, s2, s1
+; GFX11-NEXT: s_and_b32 s2, s18, 0xff
+; GFX11-NEXT: s_lshl_b32 s3, s19, 8
+; GFX11-NEXT: s_addk_i32 s1, 0x300
+; GFX11-NEXT: s_or_b32 s2, s3, s2
+; GFX11-NEXT: s_add_i32 s20, s20, 3
+; GFX11-NEXT: s_addk_i32 s2, 0x300
+; GFX11-NEXT: s_and_b32 s1, s1, 0xffff
+; GFX11-NEXT: s_lshl_b32 s2, s2, 16
+; GFX11-NEXT: s_and_b32 s3, s20, 0xff
+; GFX11-NEXT: s_lshl_b32 s4, s21, 8
+; GFX11-NEXT: s_add_i32 s22, s22, 3
+; GFX11-NEXT: s_or_b32 s1, s1, s2
+; GFX11-NEXT: s_or_b32 s2, s4, s3
+; GFX11-NEXT: s_and_b32 s3, s22, 0xff
+; GFX11-NEXT: s_lshl_b32 s4, s23, 8
+; GFX11-NEXT: s_add_i32 s24, s24, 3
+; GFX11-NEXT: s_or_b32 s3, s4, s3
+; GFX11-NEXT: s_and_b32 s4, s24, 0xff
+; GFX11-NEXT: s_lshl_b32 s5, s25, 8
+; GFX11-NEXT: s_addk_i32 s2, 0x300
+; GFX11-NEXT: s_addk_i32 s3, 0x300
+; GFX11-NEXT: s_or_b32 s4, s5, s4
+; GFX11-NEXT: s_and_b32 s2, s2, 0xffff
+; GFX11-NEXT: s_lshl_b32 s3, s3, 16
+; GFX11-NEXT: s_addk_i32 s4, 0x300
+; GFX11-NEXT: s_add_i32 s26, s26, 3
+; GFX11-NEXT: s_or_b32 s2, s2, s3
+; GFX11-NEXT: s_and_b32 s3, s4, 0xffff
+; GFX11-NEXT: s_and_b32 s4, s26, 0xff
+; GFX11-NEXT: s_lshl_b32 s5, s27, 8
+; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v31
+; GFX11-NEXT: s_or_b32 s4, s5, s4
+; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v32
+; GFX11-NEXT: s_addk_i32 s4, 0x300
+; GFX11-NEXT: v_add_nc_u32_e32 v6, 3, v38
+; GFX11-NEXT: s_lshl_b32 s4, s4, 16
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0
+; GFX11-NEXT: s_or_b32 s3, s3, s4
+; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v33
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1
+; GFX11-NEXT: v_add_nc_u32_e32 v10, 3, v22
+; GFX11-NEXT: v_add_nc_u32_e32 v11, 3, v24
+; GFX11-NEXT: v_or_b32_e32 v0, v83, v0
+; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v2
+; GFX11-NEXT: v_or_b32_e32 v1, v84, v1
+; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v6
+; GFX11-NEXT: v_and_b32_e32 v10, 0xff, v10
+; GFX11-NEXT: v_and_b32_e32 v11, 0xff, v11
+; GFX11-NEXT: v_or_b32_e32 v2, v85, v2
+; GFX11-NEXT: s_add_i32 s28, s28, 3
+; GFX11-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
+; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
+; GFX11-NEXT: v_or_b32_e32 v6, v82, v6
+; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
+; GFX11-NEXT: v_or_b32_e32 v10, v68, v10
+; GFX11-NEXT: v_or_b32_e32 v11, v69, v11
+; GFX11-NEXT: s_and_b32 s6, s28, 0xff
+; GFX11-NEXT: s_lshl_b32 s7, s29, 8
+; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-NEXT: s_or_b32 s5, s7, s6
+; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-NEXT: v_add_nc_u32_e32 v6, 0x300, v6
+; GFX11-NEXT: v_add_nc_u32_e32 v10, 0x300, v10
+; GFX11-NEXT: v_add_nc_u32_e32 v11, 0x300, v11
+; GFX11-NEXT: s_addk_i32 s5, 0x300
+; GFX11-NEXT: v_or_b32_e32 v5, v1, v2
+; GFX11-NEXT: s_and_b32 s5, s5, 0xffff
+; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v35
+; GFX11-NEXT: v_or_b32_e32 v4, s5, v0
+; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v34
+; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v36
+; GFX11-NEXT: v_add_nc_u32_e32 v3, 3, v37
+; GFX11-NEXT: v_and_b32_e32 v8, 0xffff, v6
+; GFX11-NEXT: v_add_nc_u32_e32 v6, 3, v16
+; GFX11-NEXT: v_add_nc_u32_e32 v7, 3, v18
+; GFX11-NEXT: v_add_nc_u32_e32 v9, 3, v20
+; GFX11-NEXT: v_and_b32_e32 v10, 0xffff, v10
+; GFX11-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1
+; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v2
+; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v3
+; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v6
+; GFX11-NEXT: v_and_b32_e32 v7, 0xff, v7
+; GFX11-NEXT: v_and_b32_e32 v9, 0xff, v9
+; GFX11-NEXT: v_or_b32_e32 v10, v10, v11
+; GFX11-NEXT: s_waitcnt vmcnt(1)
+; GFX11-NEXT: v_add_nc_u32_e32 v11, 3, v52
+; GFX11-NEXT: v_or_b32_e32 v0, v70, v0
+; GFX11-NEXT: v_or_b32_e32 v1, v71, v1
+; GFX11-NEXT: v_or_b32_e32 v2, v80, v2
+; GFX11-NEXT: v_or_b32_e32 v3, v81, v3
+; GFX11-NEXT: v_or_b32_e32 v6, v65, v6
+; GFX11-NEXT: v_or_b32_e32 v7, v66, v7
+; GFX11-NEXT: v_or_b32_e32 v9, v67, v9
+; GFX11-NEXT: v_and_b32_e32 v11, 0xff, v11
+; GFX11-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
+; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
+; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
+; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x300, v3
+; GFX11-NEXT: v_add_nc_u32_e32 v6, 0x300, v6
+; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x300, v7
+; GFX11-NEXT: v_add_nc_u32_e32 v9, 0x300, v9
+; GFX11-NEXT: v_or_b32_e32 v11, v64, v11
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-NEXT: v_lshlrev_b32_e32 v12, 16, v6
+; GFX11-NEXT: v_and_b32_e32 v13, 0xffff, v7
+; GFX11-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX11-NEXT: v_add_nc_u32_e32 v11, 0x300, v11
+; GFX11-NEXT: v_or_b32_e32 v6, v0, v1
+; GFX11-NEXT: v_or_b32_e32 v7, v2, v3
+; GFX11-NEXT: v_or_b32_e32 v8, v8, v12
+; GFX11-NEXT: v_or_b32_e32 v9, v13, v9
+; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v26
+; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v28
+; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v30
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_add_nc_u32_e32 v3, 3, v53
+; GFX11-NEXT: v_and_b32_e32 v13, 0xffff, v11
+; GFX11-NEXT: v_add_nc_u32_e32 v11, 3, v51
+; GFX11-NEXT: v_add_nc_u32_e32 v12, 3, v50
+; GFX11-NEXT: v_add_nc_u32_e32 v14, 3, v49
+; GFX11-NEXT: v_add_nc_u32_e32 v15, 3, v48
+; GFX11-NEXT: v_add_nc_u32_e32 v16, 3, v39
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1
+; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v2
+; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v3
+; GFX11-NEXT: v_and_b32_e32 v11, 0xff, v11
+; GFX11-NEXT: v_and_b32_e32 v12, 0xff, v12
+; GFX11-NEXT: v_and_b32_e32 v14, 0xff, v14
+; GFX11-NEXT: v_and_b32_e32 v15, 0xff, v15
+; GFX11-NEXT: v_and_b32_e32 v16, 0xff, v16
+; GFX11-NEXT: v_or_b32_e32 v0, v27, v0
+; GFX11-NEXT: v_or_b32_e32 v1, v29, v1
+; GFX11-NEXT: v_or_b32_e32 v2, v54, v2
+; GFX11-NEXT: v_or_b32_e32 v3, v55, v3
+; GFX11-NEXT: v_or_b32_e32 v11, v17, v11
+; GFX11-NEXT: v_or_b32_e32 v12, v19, v12
+; GFX11-NEXT: v_or_b32_e32 v14, v21, v14
+; GFX11-NEXT: v_or_b32_e32 v15, v23, v15
+; GFX11-NEXT: v_or_b32_e32 v16, v25, v16
+; GFX11-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
+; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
+; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
+; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x300, v3
+; GFX11-NEXT: v_add_nc_u32_e32 v11, 0x300, v11
+; GFX11-NEXT: v_add_nc_u32_e32 v12, 0x300, v12
+; GFX11-NEXT: v_add_nc_u32_e32 v14, 0x300, v14
+; GFX11-NEXT: v_add_nc_u32_e32 v15, 0x300, v15
+; GFX11-NEXT: v_add_nc_u32_e32 v16, 0x300, v16
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-NEXT: v_lshlrev_b32_e32 v17, 16, v11
+; GFX11-NEXT: v_and_b32_e32 v18, 0xffff, v12
+; GFX11-NEXT: v_lshlrev_b32_e32 v14, 16, v14
+; GFX11-NEXT: v_and_b32_e32 v15, 0xffff, v15
+; GFX11-NEXT: v_lshlrev_b32_e32 v16, 16, v16
+; GFX11-NEXT: v_or_b32_e32 v11, v0, v1
+; GFX11-NEXT: v_or_b32_e32 v12, v2, v3
+; GFX11-NEXT: v_or_b32_e32 v13, v13, v17
+; GFX11-NEXT: v_or_b32_e32 v14, v18, v14
+; GFX11-NEXT: v_or_b32_e32 v15, v15, v16
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-NEXT: .LBB87_3: ; %end
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-NEXT: .LBB87_4:
+; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+; GFX11-NEXT: s_branch .LBB87_2
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -68419,45 +66879,45 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) {
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: s_clause 0x1f
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v55, off, s32 offset:128
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:124
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v55, off, s32 offset:120
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:116
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v53, off, s32 offset:112
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:108
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v53, off, s32 offset:104
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:100
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v52, off, s32 offset:96
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:92
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v54, off, s32 offset:88
-; GFX11-TRUE16-NEXT: scratch_load_b32 v66, off, s32 offset:132
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v39, off, s32
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v48, off, s32 offset:8
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v48, off, s32 offset:16
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v50, off, s32 offset:24
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v50, off, s32 offset:32
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v51, off, s32 offset:40
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v52, off, s32 offset:48
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v54, off, s32 offset:56
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v64, off, s32 offset:64
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v64, off, s32 offset:72
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v65, off, s32 offset:80
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v38, off, s32 offset:84
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v36, off, s32 offset:76
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:68
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v35, off, s32 offset:60
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v37, off, s32 offset:52
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v36, off, s32 offset:44
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v37, off, s32 offset:36
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:28
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v34, off, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_load_u16 v80, off, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_load_u16 v48, off, s32 offset:124
+; GFX11-TRUE16-NEXT: scratch_load_u16 v39, off, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_load_u16 v51, off, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_load_u16 v81, off, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_load_u16 v49, off, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_load_u16 v37, off, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_load_u16 v52, off, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_load_u16 v82, off, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_load_u16 v50, off, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_load_u16 v83, off, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_load_b32 v32, off, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_load_u16 v34, off, s32
+; GFX11-TRUE16-NEXT: scratch_load_u16 v35, off, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_load_u16 v36, off, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_load_u16 v38, off, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_load_u16 v84, off, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_load_u16 v85, off, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_load_u16 v86, off, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_load_u16 v87, off, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_load_u16 v96, off, s32 offset:64
+; GFX11-TRUE16-NEXT: scratch_load_u16 v97, off, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_load_u16 v98, off, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_load_u16 v70, off, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_load_u16 v54, off, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_load_u16 v71, off, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_load_u16 v64, off, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_load_u16 v67, off, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_load_u16 v66, off, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_load_u16 v69, off, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_load_u16 v53, off, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_load_u16 v65, off, s32 offset:20
; GFX11-TRUE16-NEXT: s_clause 0x1
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v33, off, s32 offset:12
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_load_u16 v55, off, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_load_u16 v68, off, s32 offset:4
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.h, v29.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v27.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.h, v25.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v23.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, v23.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.h, v21.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.h, v19.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.h, v17.l
@@ -68483,47 +66943,42 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.h, 8, v25.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v26.h, 8, v26.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v30.h, 8, v30.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v39.l, 8, v39.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v31.l, 8, v31.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v28.h, 8, v28.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v29.l, 8, v29.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v49.l, 8, v29.h
-; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.l, 8, v29.h
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(33)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.h, 8, v55.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(31)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.l, 8, v55.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(29)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.h, 8, v53.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v39.h, 8, v80.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v39.l, 8, v39.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(27)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.l, 8, v53.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(25)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.h, 8, v52.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(23)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.h, 8, v54.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(21)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v29.h, 8, v39.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(20)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v49.h, 8, v48.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v37.h, 8, v81.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v37.l, 8, v37.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(19)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v39.h, 8, v48.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v36.h, 8, v82.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(18)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v48.l, 8, v50.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v38.h, 8, v83.l
+; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v32
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v29.h, 8, v34.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v35.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v31.h, 8, v36.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v32.l, 8, v38.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(17)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v48.h, 8, v50.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v32.h, 8, v84.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(16)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.h, 8, v51.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v35.h, 8, v85.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(15)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.l, 8, v52.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v36.l, 8, v86.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(14)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.l, 8, v54.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.l, 8, v87.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(13)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.h, 8, v64.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v96.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(12)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.l, 8, v64.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v38.l, 8, v97.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(11)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.l, 8, v65.l
-; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v66
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v35.l, 8, v98.l
+; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB98_3
; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow
@@ -68551,22 +67006,22 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v28.l
; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v30.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v35.l
-; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v33.h
-; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v34.h
-; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v34.l
-; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v37.h
-; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v36.l
-; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v37.l
-; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v35.h
-; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v38.l
-; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v36.h
-; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v38.h
-; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v31.l
-; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v32.h
-; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v31.h
-; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v33.l
-; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v32.l
+; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v68.l
+; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v55.l
+; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v65.l
+; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v53.l
+; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v69.l
+; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v66.l
+; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v67.l
+; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v64.l
+; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v71.l
+; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v54.l
+; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v70.l
+; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v50.l
+; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v52.l
+; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v49.l
+; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v51.l
+; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v48.l
; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v23.l
; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v23.h
; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v20.l
@@ -68578,27 +67033,27 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v25.h
; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v26.h
; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v30.h
-; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v39.l
+; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v31.l
; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v28.h
; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v29.l
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v49.l
+; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v33.l
; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v29.h
-; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v49.h
-; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v39.h
-; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v48.l
-; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v48.h
-; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v51.h
-; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v52.l
-; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v11.l, v50.l
-; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v11.h, v50.h
-; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v54.l
-; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v51.l
-; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v54.h
-; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v13.h, v52.h
-; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v14.l, v53.l
-; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v53.h
-; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v15.l, v55.l
-; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v15.h, v55.h
+; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v31.h
+; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v32.l
+; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v32.h
+; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v35.h
+; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v36.l
+; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v11.l, v34.l
+; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v11.h, v34.h
+; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v38.l
+; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v35.l
+; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v38.h
+; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v13.h, v36.h
+; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v14.l, v37.l
+; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v37.h
+; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v15.l, v39.l
+; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v15.h, v39.h
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16
@@ -68615,22 +67070,22 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_lo16
@@ -68642,46 +67097,46 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_hi16
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB98_2
; GFX11-TRUE16-NEXT: .LBB98_4: ; %cmp.true
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v33.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v32.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v32.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v31.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v31.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v51.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v48.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v52.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v49.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v50.l, 3
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(8)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v38.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v71.l, 3
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v55.l, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v55.h, v0.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v38.h, 3
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v53.l, v1.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v53.h, v1.h
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v39.l, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v39.h, v0.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v70.l, 3
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v37.l, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v37.h, v1.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.l, 0x300, v0.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.h, 0x300, v0.h
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v2.h
@@ -68689,46 +67144,46 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, 0x300, v1.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, 0x300, v1.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v35.h, 3
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v52.h, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v54.l, v0.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v64.l, 3
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v36.h, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v38.l, v0.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(4)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v37.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v69.l, 3
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, 0x300, v0.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, 0x300, v0.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v36.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v37.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v54.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v67.l, 3
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v54.h, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v50.h, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v38.h, v2.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v34.h, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
-; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v51.h, v1.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v35.h, v1.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, 0x300, v2.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v36.l, 3
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v51.l, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v50.l, v0.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v66.l, 3
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v35.l, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v34.l, v0.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v1.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v1.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v35.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v68.l, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, 0x300, v0.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, 0x300, v0.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v34.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v34.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v33.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v65.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v53.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v55.l, 3
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v52.l, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v49.h, v1.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v48.l, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v48.h, v0.h
-; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v39.h, v1.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v36.l, v2.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v33.h, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v32.l, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v32.h, v0.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v31.h, v1.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, 0x300, v2.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v28.l, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v0.l
@@ -68744,7 +67199,7 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v49.l, v2.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v33.l, v2.l
; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v29.h, v0.l
; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v28.h, v0.h
; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v29.l, v1.l
@@ -68764,7 +67219,7 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v39.l, v2.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v31.l, v2.l
; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v25.h, v0.l
; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v26.h, v0.h
; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v27.h, v1.l
@@ -70501,695 +68956,350 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32
; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
; GFX9-NEXT: s_branch .LBB99_2
;
-; GFX11-TRUE16-LABEL: bitcast_v64i8_to_v32i16_scalar:
-; GFX11-TRUE16: ; %bb.0:
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v34, v14 :: v_dual_mov_b32 v31, v8
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v36, v12 :: v_dual_mov_b32 v33, v6
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v32, v10 :: v_dual_mov_b32 v35, v0
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v38, v4 :: v_dual_mov_b32 v37, v2
-; GFX11-TRUE16-NEXT: s_clause 0xf
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v0, off, s32 offset:56
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v64, off, s32 offset:52
-; GFX11-TRUE16-NEXT: scratch_load_b32 v2, off, s32 offset:60
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v4, off, s32
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v6, off, s32 offset:8
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v8, off, s32 offset:16
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v10, off, s32 offset:24
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v12, off, s32 offset:32
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v14, off, s32 offset:40
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v84, off, s32 offset:48
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v82, off, s32 offset:44
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v69, off, s32 offset:36
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v80, off, s32 offset:28
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v67, off, s32 offset:20
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v68, off, s32 offset:12
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v65, off, s32 offset:4
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v39, 8, v1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v49, 8, v3
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v48, 8, v5
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v51, 8, v7
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v50, 8, v9
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v53, 8, v11
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v52, 8, v13
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v54, 8, v15
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v17
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v55, 8, v19
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 8, v21
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 8, v23
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 8, v25
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v27, 8, v27
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 8, v29
-; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(15)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v85, 8, v0
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(13)
-; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(12)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v66, 8, v4
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(11)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v29, 8, v6
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(10)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v70, 8, v8
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(9)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v71, 8, v10
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(8)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v83, 8, v12
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v81, 8, v14
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(6)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v84, 8, v84
-; GFX11-TRUE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo
-; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB99_4
-; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-TRUE16-NEXT: s_and_b32 s5, s0, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s1, 8
-; GFX11-TRUE16-NEXT: s_and_b32 s7, s2, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s3, 8
-; GFX11-TRUE16-NEXT: s_or_b32 s5, s5, s6
-; GFX11-TRUE16-NEXT: s_or_b32 s6, s7, s8
-; GFX11-TRUE16-NEXT: s_and_b32 s7, s16, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s17, 8
-; GFX11-TRUE16-NEXT: s_and_b32 s9, s18, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s19, 8
-; GFX11-TRUE16-NEXT: s_or_b32 s7, s7, s8
-; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s10
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s5, s6
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s7, s8
-; GFX11-TRUE16-NEXT: s_and_b32 s7, s20, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s21, 8
-; GFX11-TRUE16-NEXT: s_and_b32 s9, s22, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s23, 8
-; GFX11-TRUE16-NEXT: s_or_b32 s7, s7, s8
-; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s10
-; GFX11-TRUE16-NEXT: s_and_b32 s9, s24, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s25, 8
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v35
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s7, s8
-; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s10
-; GFX11-TRUE16-NEXT: s_and_b32 s10, s28, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s11, s29, 8
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v39
-; GFX11-TRUE16-NEXT: s_or_b32 s10, s10, s11
-; GFX11-TRUE16-NEXT: s_and_b32 s9, s26, 0xff
-; GFX11-TRUE16-NEXT: v_and_b32_e64 v1, 0xffff, s10
-; GFX11-TRUE16-NEXT: s_lshl_b32 s12, s27, 8
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v38
-; GFX11-TRUE16-NEXT: s_or_b32 s9, s9, s12
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v31
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s8, s9
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v37
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v4, v0, 16, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v33
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v48
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v36
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v2, v49
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v32
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v51
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v5, v50
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v6, v52
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v53
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xff, v24
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v68
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v5, v3, 16, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v34
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v16
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v6, v7, 16, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v18
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v7, v8, 16, v2
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v54
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v17
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v20
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v22
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v55
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xff, v67
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v19
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v8, v23
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v8, v2, 16, v1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v9, v21
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v26
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v9, v3, 16, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v30
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xff, v80
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v82
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v28
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v27
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v10, v1, 16, v10
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v65
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v66
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v11, v70
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v12, v71
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xff, v69
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v13, v83
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v86, 0xff, v64
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v14, v84
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v25
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v29
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v87, 0xffff, v11
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v96, v12, v81
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v97, 0xffff, v13
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v86, v86, v85
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v98, 0xffff, v14
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v11, v0, 16, v2
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v12, v1, 16, v3
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v13, v15, 16, v87
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v14, v96, 16, v97
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v15, v86, 16, v98
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, s6
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s7 :: v_dual_mov_b32 v3, s8
-; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
-; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB99_3
-; GFX11-TRUE16-NEXT: .LBB99_2: ; %cmp.true
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v68
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v67
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v30
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v65
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v22
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v5
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v6
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v16
-; GFX11-TRUE16-NEXT: s_add_i32 s28, s28, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v70, v4
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v71, v5
-; GFX11-TRUE16-NEXT: s_and_b32 s4, s28, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s29, 8
-; GFX11-TRUE16-NEXT: s_add_i32 s24, s24, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 0x300, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v13, 0x300, v5
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v66, v6
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v26
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v7
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v28
-; GFX11-TRUE16-NEXT: s_or_b32 s4, s5, s4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 0x300, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v5
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v29, v6
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v7
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v8
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v24
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v27, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, 0x300, v5
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v25, v6
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v23, v7
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v8
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v15, 0x300, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v18
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v18, 0x300, v5
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v34
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v21, v7
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v20
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v4
-; GFX11-TRUE16-NEXT: s_and_b32 s5, s24, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s25, 8
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 0x300, v5
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v7
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v55, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v8
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v9
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v32
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v19, v5
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 0x300, v4
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v54, v7
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v17, v8
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v9
-; GFX11-TRUE16-NEXT: s_add_i32 s26, s26, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 0x300, v5
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v36
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v17, 0x300, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v19, 0x300, v7
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v53, v8
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v33
-; GFX11-TRUE16-NEXT: s_or_b32 s5, s6, s5
-; GFX11-TRUE16-NEXT: s_and_b32 s6, s26, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s27, 8
-; GFX11-TRUE16-NEXT: s_add_i32 s20, s20, 3
-; GFX11-TRUE16-NEXT: s_or_b32 s6, s7, s6
-; GFX11-TRUE16-NEXT: s_and_b32 s7, s20, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s21, 8
-; GFX11-TRUE16-NEXT: s_add_i32 s22, s22, 3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v5
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v31
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v20, 3, v37
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v21, 0x300, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v7
-; GFX11-TRUE16-NEXT: s_or_b32 s7, s8, s7
-; GFX11-TRUE16-NEXT: s_and_b32 s8, s22, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s23, 8
-; GFX11-TRUE16-NEXT: s_add_i32 s16, s16, 3
-; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s8
-; GFX11-TRUE16-NEXT: s_and_b32 s9, s16, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s17, 8
-; GFX11-TRUE16-NEXT: s_add_i32 s18, s18, 3
-; GFX11-TRUE16-NEXT: s_add_i32 s0, s0, 3
-; GFX11-TRUE16-NEXT: s_add_i32 s2, s2, 3
-; GFX11-TRUE16-NEXT: s_or_b32 s9, s10, s9
-; GFX11-TRUE16-NEXT: s_and_b32 s10, s18, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s11, s19, 8
-; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 8
-; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v52, v5
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v8
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v20
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v20, 3, v38
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v51, v4
-; GFX11-TRUE16-NEXT: s_or_b32 s10, s11, s10
-; GFX11-TRUE16-NEXT: s_or_b32 s0, s1, s0
-; GFX11-TRUE16-NEXT: s_or_b32 s1, s3, s2
-; GFX11-TRUE16-NEXT: s_addk_i32 s5, 0x300
-; GFX11-TRUE16-NEXT: s_addk_i32 s6, 0x300
-; GFX11-TRUE16-NEXT: s_addk_i32 s9, 0x300
-; GFX11-TRUE16-NEXT: s_addk_i32 s0, 0x300
-; GFX11-TRUE16-NEXT: s_addk_i32 s1, 0x300
-; GFX11-TRUE16-NEXT: s_addk_i32 s10, 0x300
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v22, 0x300, v5
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v50, v7
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v49, v8
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v20
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v20, 0x300, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v35
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s1
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s9, s10
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v82
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s5, s6
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v80
-; GFX11-TRUE16-NEXT: s_addk_i32 s7, 0x300
-; GFX11-TRUE16-NEXT: s_addk_i32 s8, 0x300
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v64
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v4
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s7, s8
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v69
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v84, v0
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v83, v2
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v23, 0x300, v5
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 0x300, v7
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v48, v8
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v39, v4
-; GFX11-TRUE16-NEXT: s_addk_i32 s4, 0x300
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v85, v1
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v81, v3
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x300, v6
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 0x300, v7
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x300, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e64 v8, 0xffff, s4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v16
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v4, v4, 16, v8
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v5, v7, 16, v5
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v20
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v21
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v17
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v6
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v9, v9, 16, v16
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v15
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v12
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v11
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v6, v23, 16, v7
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v7, v22, 16, v8
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v8, v19, 16, v17
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v10, v10, 16, v20
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v11, v18, 16, v15
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v12, v14, 16, v12
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v13, v13, 16, v16
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v14, v3, 16, v2
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v15, v1, 16, v0
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX11-TRUE16-NEXT: .LBB99_3: ; %end
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
-; GFX11-TRUE16-NEXT: .LBB99_4:
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
-; GFX11-TRUE16-NEXT: s_branch .LBB99_2
-;
-; GFX11-FAKE16-LABEL: bitcast_v64i8_to_v32i16_scalar:
-; GFX11-FAKE16: ; %bb.0:
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v34, v14 :: v_dual_mov_b32 v31, v8
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v36, v12 :: v_dual_mov_b32 v33, v6
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v32, v10 :: v_dual_mov_b32 v35, v0
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v38, v4 :: v_dual_mov_b32 v37, v2
-; GFX11-FAKE16-NEXT: s_clause 0xf
-; GFX11-FAKE16-NEXT: scratch_load_u16 v0, off, s32 offset:56
-; GFX11-FAKE16-NEXT: scratch_load_u16 v65, off, s32 offset:52
-; GFX11-FAKE16-NEXT: scratch_load_b32 v2, off, s32 offset:60
-; GFX11-FAKE16-NEXT: scratch_load_u16 v4, off, s32
-; GFX11-FAKE16-NEXT: scratch_load_u16 v6, off, s32 offset:8
-; GFX11-FAKE16-NEXT: scratch_load_u16 v8, off, s32 offset:16
-; GFX11-FAKE16-NEXT: scratch_load_u16 v10, off, s32 offset:24
-; GFX11-FAKE16-NEXT: scratch_load_u16 v12, off, s32 offset:32
-; GFX11-FAKE16-NEXT: scratch_load_u16 v14, off, s32 offset:40
-; GFX11-FAKE16-NEXT: scratch_load_u16 v84, off, s32 offset:48
-; GFX11-FAKE16-NEXT: scratch_load_u16 v82, off, s32 offset:44
-; GFX11-FAKE16-NEXT: scratch_load_u16 v69, off, s32 offset:36
-; GFX11-FAKE16-NEXT: scratch_load_u16 v80, off, s32 offset:28
-; GFX11-FAKE16-NEXT: scratch_load_u16 v67, off, s32 offset:20
-; GFX11-FAKE16-NEXT: scratch_load_u16 v68, off, s32 offset:12
-; GFX11-FAKE16-NEXT: scratch_load_u16 v64, off, s32 offset:4
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 8, v1
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v49, 8, v3
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v48, 8, v5
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v51, 8, v7
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v50, 8, v9
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v53, 8, v11
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v52, 8, v13
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v54, 8, v15
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v17, 8, v17
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v55, 8, v19
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v19, 8, v21
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v23, 8, v23
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v21, 8, v25
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v27, 8, v27
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v25, 8, v29
-; GFX11-FAKE16-NEXT: s_mov_b32 s4, 0
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(15)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v85, 8, v0
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(13)
-; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(12)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v66, 8, v4
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(11)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v29, 8, v6
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(10)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v70, 8, v8
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(9)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v71, 8, v10
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(8)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v83, 8, v12
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(7)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v81, 8, v14
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(6)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v84, 8, v84
-; GFX11-FAKE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo
-; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB99_4
-; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-FAKE16-NEXT: s_and_b32 s5, s0, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s1, 8
-; GFX11-FAKE16-NEXT: s_and_b32 s7, s2, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s3, 8
-; GFX11-FAKE16-NEXT: s_or_b32 s5, s5, s6
-; GFX11-FAKE16-NEXT: s_or_b32 s6, s7, s8
-; GFX11-FAKE16-NEXT: s_and_b32 s7, s16, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s17, 8
-; GFX11-FAKE16-NEXT: s_and_b32 s9, s18, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s19, 8
-; GFX11-FAKE16-NEXT: s_or_b32 s7, s7, s8
-; GFX11-FAKE16-NEXT: s_or_b32 s8, s9, s10
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s5, s6
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s7, s8
-; GFX11-FAKE16-NEXT: s_and_b32 s7, s20, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s21, 8
-; GFX11-FAKE16-NEXT: s_and_b32 s9, s22, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s23, 8
-; GFX11-FAKE16-NEXT: s_or_b32 s7, s7, s8
-; GFX11-FAKE16-NEXT: s_or_b32 s8, s9, s10
-; GFX11-FAKE16-NEXT: s_and_b32 s9, s24, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s25, 8
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v35
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s7, s8
-; GFX11-FAKE16-NEXT: s_or_b32 s8, s9, s10
-; GFX11-FAKE16-NEXT: s_and_b32 s10, s28, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s11, s29, 8
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v39
-; GFX11-FAKE16-NEXT: s_or_b32 s10, s10, s11
-; GFX11-FAKE16-NEXT: s_and_b32 s9, s26, 0xff
-; GFX11-FAKE16-NEXT: v_and_b32_e64 v1, 0xffff, s10
-; GFX11-FAKE16-NEXT: s_lshl_b32 s12, s27, 8
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v38
-; GFX11-FAKE16-NEXT: s_or_b32 s9, s9, s12
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v31
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s8, s9
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v37
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v4, v0, 16, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v33
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v48
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v36
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v2, v49
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v32
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v51
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v5, v50
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v6, v52
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v53
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v24
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1)
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v68
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v5, v3, 16, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v34
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v16
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v6, v7, 16, v0
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v18
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v7, v8, 16, v2
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v54
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v3, v17
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v20
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v22
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v55
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v67
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v19
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v8, v23
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v8, v2, 16, v1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v9, v21
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v26
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v10
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v9, v3, 16, v0
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v30
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v80
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v82
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v28
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v27
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v10, v1, 16, v10
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v64
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v66
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v11, v70
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v12, v71
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v69
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v13, v83
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v86, 0xff, v65
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v14, v84
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v25
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v29
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v87, 0xffff, v11
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v96, v12, v81
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v97, 0xffff, v13
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v86, v86, v85
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v98, 0xffff, v14
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v11, v0, 16, v2
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v12, v1, 16, v3
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v13, v15, 16, v87
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v14, v96, 16, v97
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v86, 16, v98
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, s6
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s7 :: v_dual_mov_b32 v3, s8
-; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
-; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB99_3
-; GFX11-FAKE16-NEXT: .LBB99_2: ; %cmp.true
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1)
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v68
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 3, v67
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 3, v30
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 3, v64
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 3, v22
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v5
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v6
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, 3, v16
-; GFX11-FAKE16-NEXT: s_add_i32 s28, s28, 3
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v70, v4
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v71, v5
-; GFX11-FAKE16-NEXT: s_and_b32 s4, s28, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s29, 8
-; GFX11-FAKE16-NEXT: s_add_i32 s24, s24, 3
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, 0x300, v4
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v13, 0x300, v5
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v66, v6
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 3, v26
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v7
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 3, v28
-; GFX11-FAKE16-NEXT: s_or_b32 s4, s5, s4
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, 0x300, v4
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v5
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v29, v6
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v7
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v8
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 3, v24
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v27, v4
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v14, 0x300, v5
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v25, v6
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v23, v7
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v8
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v15, 0x300, v4
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v18
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v18, 0x300, v5
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 3, v34
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v21, v7
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 3, v20
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4
-; GFX11-FAKE16-NEXT: s_and_b32 s5, s24, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s25, 8
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, 0x300, v5
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v7
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v55, v4
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v8
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v9
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, 3, v32
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v19, v5
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v16, 0x300, v4
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v54, v7
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v17, v8
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v9
-; GFX11-FAKE16-NEXT: s_add_i32 s26, s26, 3
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, 0x300, v5
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 3, v36
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 0x300, v4
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v19, 0x300, v7
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v53, v8
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 3, v33
-; GFX11-FAKE16-NEXT: s_or_b32 s5, s6, s5
-; GFX11-FAKE16-NEXT: s_and_b32 s6, s26, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s27, 8
-; GFX11-FAKE16-NEXT: s_add_i32 s20, s20, 3
-; GFX11-FAKE16-NEXT: s_or_b32 s6, s7, s6
-; GFX11-FAKE16-NEXT: s_and_b32 s7, s20, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s21, 8
-; GFX11-FAKE16-NEXT: s_add_i32 s22, s22, 3
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v5
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 3, v31
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v20, 3, v37
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v21, 0x300, v4
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v7
-; GFX11-FAKE16-NEXT: s_or_b32 s7, s8, s7
-; GFX11-FAKE16-NEXT: s_and_b32 s8, s22, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s9, s23, 8
-; GFX11-FAKE16-NEXT: s_add_i32 s16, s16, 3
-; GFX11-FAKE16-NEXT: s_or_b32 s8, s9, s8
-; GFX11-FAKE16-NEXT: s_and_b32 s9, s16, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s17, 8
-; GFX11-FAKE16-NEXT: s_add_i32 s18, s18, 3
-; GFX11-FAKE16-NEXT: s_add_i32 s0, s0, 3
-; GFX11-FAKE16-NEXT: s_add_i32 s2, s2, 3
-; GFX11-FAKE16-NEXT: s_or_b32 s9, s10, s9
-; GFX11-FAKE16-NEXT: s_and_b32 s10, s18, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s11, s19, 8
-; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 8
-; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 8
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v52, v5
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v8
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v20
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v20, 3, v38
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v51, v4
-; GFX11-FAKE16-NEXT: s_or_b32 s10, s11, s10
-; GFX11-FAKE16-NEXT: s_or_b32 s0, s1, s0
-; GFX11-FAKE16-NEXT: s_or_b32 s1, s3, s2
-; GFX11-FAKE16-NEXT: s_addk_i32 s5, 0x300
-; GFX11-FAKE16-NEXT: s_addk_i32 s6, 0x300
-; GFX11-FAKE16-NEXT: s_addk_i32 s9, 0x300
-; GFX11-FAKE16-NEXT: s_addk_i32 s0, 0x300
-; GFX11-FAKE16-NEXT: s_addk_i32 s1, 0x300
-; GFX11-FAKE16-NEXT: s_addk_i32 s10, 0x300
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v22, 0x300, v5
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v50, v7
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v49, v8
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v20
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v20, 0x300, v4
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v35
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s1
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s9, s10
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v82
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s5, s6
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v80
-; GFX11-FAKE16-NEXT: s_addk_i32 s7, 0x300
-; GFX11-FAKE16-NEXT: s_addk_i32 s8, 0x300
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v65
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s7, s8
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v69
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v84, v0
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v83, v2
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v23, 0x300, v5
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 0x300, v7
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v48, v8
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v39, v4
-; GFX11-FAKE16-NEXT: s_addk_i32 s4, 0x300
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v85, v1
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v81, v3
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 0x300, v6
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 0x300, v7
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x300, v4
-; GFX11-FAKE16-NEXT: v_and_b32_e64 v8, 0xffff, s4
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xffff, v16
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v4, v4, 16, v8
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v5, v7, 16, v5
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v20
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff, v21
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff, v17
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff, v6
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v9, v9, 16, v16
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff, v15
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff, v12
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xffff, v11
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v6, v23, 16, v7
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v7, v22, 16, v8
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v8, v19, 16, v17
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v10, v10, 16, v20
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v11, v18, 16, v15
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v12, v14, 16, v12
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v13, v13, 16, v16
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v14, v3, 16, v2
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v1, 16, v0
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX11-FAKE16-NEXT: .LBB99_3: ; %end
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
-; GFX11-FAKE16-NEXT: .LBB99_4:
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
-; GFX11-FAKE16-NEXT: s_branch .LBB99_2
+; GFX11-LABEL: bitcast_v64i8_to_v32i16_scalar:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_dual_mov_b32 v34, v14 :: v_dual_mov_b32 v31, v8
+; GFX11-NEXT: v_dual_mov_b32 v36, v12 :: v_dual_mov_b32 v33, v6
+; GFX11-NEXT: v_dual_mov_b32 v32, v10 :: v_dual_mov_b32 v35, v0
+; GFX11-NEXT: v_dual_mov_b32 v38, v4 :: v_dual_mov_b32 v37, v2
+; GFX11-NEXT: s_clause 0xf
+; GFX11-NEXT: scratch_load_u16 v0, off, s32 offset:56
+; GFX11-NEXT: scratch_load_u16 v65, off, s32 offset:52
+; GFX11-NEXT: scratch_load_b32 v2, off, s32 offset:60
+; GFX11-NEXT: scratch_load_u16 v4, off, s32
+; GFX11-NEXT: scratch_load_u16 v6, off, s32 offset:8
+; GFX11-NEXT: scratch_load_u16 v8, off, s32 offset:16
+; GFX11-NEXT: scratch_load_u16 v10, off, s32 offset:24
+; GFX11-NEXT: scratch_load_u16 v12, off, s32 offset:32
+; GFX11-NEXT: scratch_load_u16 v14, off, s32 offset:40
+; GFX11-NEXT: scratch_load_u16 v84, off, s32 offset:48
+; GFX11-NEXT: scratch_load_u16 v82, off, s32 offset:44
+; GFX11-NEXT: scratch_load_u16 v69, off, s32 offset:36
+; GFX11-NEXT: scratch_load_u16 v80, off, s32 offset:28
+; GFX11-NEXT: scratch_load_u16 v67, off, s32 offset:20
+; GFX11-NEXT: scratch_load_u16 v68, off, s32 offset:12
+; GFX11-NEXT: scratch_load_u16 v64, off, s32 offset:4
+; GFX11-NEXT: v_lshlrev_b32_e32 v39, 8, v1
+; GFX11-NEXT: v_lshlrev_b32_e32 v49, 8, v3
+; GFX11-NEXT: v_lshlrev_b32_e32 v48, 8, v5
+; GFX11-NEXT: v_lshlrev_b32_e32 v51, 8, v7
+; GFX11-NEXT: v_lshlrev_b32_e32 v50, 8, v9
+; GFX11-NEXT: v_lshlrev_b32_e32 v53, 8, v11
+; GFX11-NEXT: v_lshlrev_b32_e32 v52, 8, v13
+; GFX11-NEXT: v_lshlrev_b32_e32 v54, 8, v15
+; GFX11-NEXT: v_lshlrev_b32_e32 v17, 8, v17
+; GFX11-NEXT: v_lshlrev_b32_e32 v55, 8, v19
+; GFX11-NEXT: v_lshlrev_b32_e32 v19, 8, v21
+; GFX11-NEXT: v_lshlrev_b32_e32 v23, 8, v23
+; GFX11-NEXT: v_lshlrev_b32_e32 v21, 8, v25
+; GFX11-NEXT: v_lshlrev_b32_e32 v27, 8, v27
+; GFX11-NEXT: v_lshlrev_b32_e32 v25, 8, v29
+; GFX11-NEXT: s_mov_b32 s4, 0
+; GFX11-NEXT: s_waitcnt vmcnt(15)
+; GFX11-NEXT: v_lshlrev_b32_e32 v85, 8, v0
+; GFX11-NEXT: s_waitcnt vmcnt(13)
+; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2
+; GFX11-NEXT: s_waitcnt vmcnt(12)
+; GFX11-NEXT: v_lshlrev_b32_e32 v66, 8, v4
+; GFX11-NEXT: s_waitcnt vmcnt(11)
+; GFX11-NEXT: v_lshlrev_b32_e32 v29, 8, v6
+; GFX11-NEXT: s_waitcnt vmcnt(10)
+; GFX11-NEXT: v_lshlrev_b32_e32 v70, 8, v8
+; GFX11-NEXT: s_waitcnt vmcnt(9)
+; GFX11-NEXT: v_lshlrev_b32_e32 v71, 8, v10
+; GFX11-NEXT: s_waitcnt vmcnt(8)
+; GFX11-NEXT: v_lshlrev_b32_e32 v83, 8, v12
+; GFX11-NEXT: s_waitcnt vmcnt(7)
+; GFX11-NEXT: v_lshlrev_b32_e32 v81, 8, v14
+; GFX11-NEXT: s_waitcnt vmcnt(6)
+; GFX11-NEXT: v_lshlrev_b32_e32 v84, 8, v84
+; GFX11-NEXT: s_and_b32 s5, vcc_lo, exec_lo
+; GFX11-NEXT: s_cbranch_scc0 .LBB99_4
+; GFX11-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-NEXT: s_and_b32 s5, s0, 0xff
+; GFX11-NEXT: s_lshl_b32 s6, s1, 8
+; GFX11-NEXT: s_and_b32 s7, s2, 0xff
+; GFX11-NEXT: s_lshl_b32 s8, s3, 8
+; GFX11-NEXT: s_or_b32 s5, s5, s6
+; GFX11-NEXT: s_or_b32 s6, s7, s8
+; GFX11-NEXT: s_and_b32 s7, s16, 0xff
+; GFX11-NEXT: s_lshl_b32 s8, s17, 8
+; GFX11-NEXT: s_and_b32 s9, s18, 0xff
+; GFX11-NEXT: s_lshl_b32 s10, s19, 8
+; GFX11-NEXT: s_or_b32 s7, s7, s8
+; GFX11-NEXT: s_or_b32 s8, s9, s10
+; GFX11-NEXT: s_pack_ll_b32_b16 s5, s5, s6
+; GFX11-NEXT: s_pack_ll_b32_b16 s6, s7, s8
+; GFX11-NEXT: s_and_b32 s7, s20, 0xff
+; GFX11-NEXT: s_lshl_b32 s8, s21, 8
+; GFX11-NEXT: s_and_b32 s9, s22, 0xff
+; GFX11-NEXT: s_lshl_b32 s10, s23, 8
+; GFX11-NEXT: s_or_b32 s7, s7, s8
+; GFX11-NEXT: s_or_b32 s8, s9, s10
+; GFX11-NEXT: s_and_b32 s9, s24, 0xff
+; GFX11-NEXT: s_lshl_b32 s10, s25, 8
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v35
+; GFX11-NEXT: s_pack_ll_b32_b16 s7, s7, s8
+; GFX11-NEXT: s_or_b32 s8, s9, s10
+; GFX11-NEXT: s_and_b32 s10, s28, 0xff
+; GFX11-NEXT: s_lshl_b32 s11, s29, 8
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v39
+; GFX11-NEXT: s_or_b32 s10, s10, s11
+; GFX11-NEXT: s_and_b32 s9, s26, 0xff
+; GFX11-NEXT: v_and_b32_e64 v1, 0xffff, s10
+; GFX11-NEXT: s_lshl_b32 s12, s27, 8
+; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v38
+; GFX11-NEXT: s_or_b32 s9, s9, s12
+; GFX11-NEXT: v_and_b32_e32 v5, 0xff, v31
+; GFX11-NEXT: s_pack_ll_b32_b16 s8, s8, s9
+; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v37
+; GFX11-NEXT: v_lshl_or_b32 v4, v0, 16, v1
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v33
+; GFX11-NEXT: v_or_b32_e32 v3, v3, v48
+; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v36
+; GFX11-NEXT: v_or_b32_e32 v1, v2, v49
+; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v32
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v51
+; GFX11-NEXT: v_or_b32_e32 v7, v5, v50
+; GFX11-NEXT: v_or_b32_e32 v8, v6, v52
+; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX11-NEXT: v_or_b32_e32 v2, v2, v53
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT: v_and_b32_e32 v9, 0xff, v24
+; GFX11-NEXT: s_waitcnt vmcnt(1)
+; GFX11-NEXT: v_and_b32_e32 v11, 0xff, v68
+; GFX11-NEXT: v_lshl_or_b32 v5, v3, 16, v1
+; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v34
+; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v16
+; GFX11-NEXT: v_lshl_or_b32 v6, v7, 16, v0
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v18
+; GFX11-NEXT: v_lshl_or_b32 v7, v8, 16, v2
+; GFX11-NEXT: v_or_b32_e32 v1, v1, v54
+; GFX11-NEXT: v_or_b32_e32 v2, v3, v17
+; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v20
+; GFX11-NEXT: v_and_b32_e32 v8, 0xff, v22
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v55
+; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX11-NEXT: v_and_b32_e32 v12, 0xff, v67
+; GFX11-NEXT: v_or_b32_e32 v3, v3, v19
+; GFX11-NEXT: v_or_b32_e32 v10, v8, v23
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT: v_lshl_or_b32 v8, v2, 16, v1
+; GFX11-NEXT: v_or_b32_e32 v1, v9, v21
+; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v26
+; GFX11-NEXT: v_and_b32_e32 v10, 0xffff, v10
+; GFX11-NEXT: v_lshl_or_b32 v9, v3, 16, v0
+; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v30
+; GFX11-NEXT: v_and_b32_e32 v13, 0xff, v80
+; GFX11-NEXT: v_and_b32_e32 v14, 0xff, v82
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v28
+; GFX11-NEXT: v_or_b32_e32 v2, v2, v27
+; GFX11-NEXT: v_lshl_or_b32 v10, v1, 16, v10
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v64
+; GFX11-NEXT: v_or_b32_e32 v3, v3, v66
+; GFX11-NEXT: v_or_b32_e32 v11, v11, v70
+; GFX11-NEXT: v_or_b32_e32 v15, v12, v71
+; GFX11-NEXT: v_and_b32_e32 v12, 0xff, v69
+; GFX11-NEXT: v_or_b32_e32 v13, v13, v83
+; GFX11-NEXT: v_and_b32_e32 v86, 0xff, v65
+; GFX11-NEXT: v_or_b32_e32 v14, v14, v84
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v25
+; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX11-NEXT: v_or_b32_e32 v1, v1, v29
+; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX11-NEXT: v_and_b32_e32 v87, 0xffff, v11
+; GFX11-NEXT: v_or_b32_e32 v96, v12, v81
+; GFX11-NEXT: v_and_b32_e32 v97, 0xffff, v13
+; GFX11-NEXT: v_or_b32_e32 v86, v86, v85
+; GFX11-NEXT: v_and_b32_e32 v98, 0xffff, v14
+; GFX11-NEXT: v_lshl_or_b32 v11, v0, 16, v2
+; GFX11-NEXT: v_lshl_or_b32 v12, v1, 16, v3
+; GFX11-NEXT: v_lshl_or_b32 v13, v15, 16, v87
+; GFX11-NEXT: v_lshl_or_b32 v14, v96, 16, v97
+; GFX11-NEXT: v_lshl_or_b32 v15, v86, 16, v98
+; GFX11-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, s6
+; GFX11-NEXT: v_dual_mov_b32 v2, s7 :: v_dual_mov_b32 v3, s8
+; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
+; GFX11-NEXT: s_cbranch_vccnz .LBB99_3
+; GFX11-NEXT: .LBB99_2: ; %cmp.true
+; GFX11-NEXT: s_waitcnt vmcnt(1)
+; GFX11-NEXT: v_add_nc_u32_e32 v4, 3, v68
+; GFX11-NEXT: v_add_nc_u32_e32 v5, 3, v67
+; GFX11-NEXT: v_add_nc_u32_e32 v6, 3, v30
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_add_nc_u32_e32 v7, 3, v64
+; GFX11-NEXT: v_add_nc_u32_e32 v8, 3, v22
+; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v4
+; GFX11-NEXT: v_and_b32_e32 v5, 0xff, v5
+; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v6
+; GFX11-NEXT: v_add_nc_u32_e32 v9, 3, v16
+; GFX11-NEXT: s_add_i32 s28, s28, 3
+; GFX11-NEXT: v_or_b32_e32 v4, v70, v4
+; GFX11-NEXT: v_or_b32_e32 v5, v71, v5
+; GFX11-NEXT: s_and_b32 s4, s28, 0xff
+; GFX11-NEXT: s_lshl_b32 s5, s29, 8
+; GFX11-NEXT: s_add_i32 s24, s24, 3
+; GFX11-NEXT: v_add_nc_u32_e32 v11, 0x300, v4
+; GFX11-NEXT: v_add_nc_u32_e32 v13, 0x300, v5
+; GFX11-NEXT: v_or_b32_e32 v4, v66, v6
+; GFX11-NEXT: v_add_nc_u32_e32 v5, 3, v26
+; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v7
+; GFX11-NEXT: v_add_nc_u32_e32 v7, 3, v28
+; GFX11-NEXT: s_or_b32 s4, s5, s4
+; GFX11-NEXT: v_add_nc_u32_e32 v12, 0x300, v4
+; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v5
+; GFX11-NEXT: v_or_b32_e32 v5, v29, v6
+; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v7
+; GFX11-NEXT: v_and_b32_e32 v7, 0xff, v8
+; GFX11-NEXT: v_add_nc_u32_e32 v8, 3, v24
+; GFX11-NEXT: v_or_b32_e32 v4, v27, v4
+; GFX11-NEXT: v_add_nc_u32_e32 v14, 0x300, v5
+; GFX11-NEXT: v_or_b32_e32 v5, v25, v6
+; GFX11-NEXT: v_or_b32_e32 v6, v23, v7
+; GFX11-NEXT: v_and_b32_e32 v7, 0xff, v8
+; GFX11-NEXT: v_add_nc_u32_e32 v15, 0x300, v4
+; GFX11-NEXT: v_add_nc_u32_e32 v4, 3, v18
+; GFX11-NEXT: v_add_nc_u32_e32 v18, 0x300, v5
+; GFX11-NEXT: v_add_nc_u32_e32 v8, 3, v34
+; GFX11-NEXT: v_or_b32_e32 v5, v21, v7
+; GFX11-NEXT: v_add_nc_u32_e32 v7, 3, v20
+; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v4
+; GFX11-NEXT: s_and_b32 s5, s24, 0xff
+; GFX11-NEXT: s_lshl_b32 s6, s25, 8
+; GFX11-NEXT: v_add_nc_u32_e32 v10, 0x300, v5
+; GFX11-NEXT: v_and_b32_e32 v5, 0xff, v7
+; GFX11-NEXT: v_or_b32_e32 v4, v55, v4
+; GFX11-NEXT: v_and_b32_e32 v7, 0xff, v8
+; GFX11-NEXT: v_and_b32_e32 v8, 0xff, v9
+; GFX11-NEXT: v_add_nc_u32_e32 v9, 3, v32
+; GFX11-NEXT: v_or_b32_e32 v5, v19, v5
+; GFX11-NEXT: v_add_nc_u32_e32 v16, 0x300, v4
+; GFX11-NEXT: v_or_b32_e32 v4, v54, v7
+; GFX11-NEXT: v_or_b32_e32 v7, v17, v8
+; GFX11-NEXT: v_and_b32_e32 v8, 0xff, v9
+; GFX11-NEXT: s_add_i32 s26, s26, 3
+; GFX11-NEXT: v_add_nc_u32_e32 v9, 0x300, v5
+; GFX11-NEXT: v_add_nc_u32_e32 v5, 3, v36
+; GFX11-NEXT: v_add_nc_u32_e32 v17, 0x300, v4
+; GFX11-NEXT: v_add_nc_u32_e32 v19, 0x300, v7
+; GFX11-NEXT: v_or_b32_e32 v4, v53, v8
+; GFX11-NEXT: v_add_nc_u32_e32 v7, 3, v33
+; GFX11-NEXT: s_or_b32 s5, s6, s5
+; GFX11-NEXT: s_and_b32 s6, s26, 0xff
+; GFX11-NEXT: s_lshl_b32 s7, s27, 8
+; GFX11-NEXT: s_add_i32 s20, s20, 3
+; GFX11-NEXT: s_or_b32 s6, s7, s6
+; GFX11-NEXT: s_and_b32 s7, s20, 0xff
+; GFX11-NEXT: s_lshl_b32 s8, s21, 8
+; GFX11-NEXT: s_add_i32 s22, s22, 3
+; GFX11-NEXT: v_and_b32_e32 v5, 0xff, v5
+; GFX11-NEXT: v_add_nc_u32_e32 v8, 3, v31
+; GFX11-NEXT: v_add_nc_u32_e32 v20, 3, v37
+; GFX11-NEXT: v_add_nc_u32_e32 v21, 0x300, v4
+; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v7
+; GFX11-NEXT: s_or_b32 s7, s8, s7
+; GFX11-NEXT: s_and_b32 s8, s22, 0xff
+; GFX11-NEXT: s_lshl_b32 s9, s23, 8
+; GFX11-NEXT: s_add_i32 s16, s16, 3
+; GFX11-NEXT: s_or_b32 s8, s9, s8
+; GFX11-NEXT: s_and_b32 s9, s16, 0xff
+; GFX11-NEXT: s_lshl_b32 s10, s17, 8
+; GFX11-NEXT: s_add_i32 s18, s18, 3
+; GFX11-NEXT: s_add_i32 s0, s0, 3
+; GFX11-NEXT: s_add_i32 s2, s2, 3
+; GFX11-NEXT: s_or_b32 s9, s10, s9
+; GFX11-NEXT: s_and_b32 s10, s18, 0xff
+; GFX11-NEXT: s_lshl_b32 s11, s19, 8
+; GFX11-NEXT: s_and_b32 s0, s0, 0xff
+; GFX11-NEXT: s_lshl_b32 s1, s1, 8
+; GFX11-NEXT: s_and_b32 s2, s2, 0xff
+; GFX11-NEXT: s_lshl_b32 s3, s3, 8
+; GFX11-NEXT: v_or_b32_e32 v5, v52, v5
+; GFX11-NEXT: v_and_b32_e32 v7, 0xff, v8
+; GFX11-NEXT: v_and_b32_e32 v8, 0xff, v20
+; GFX11-NEXT: v_add_nc_u32_e32 v20, 3, v38
+; GFX11-NEXT: v_or_b32_e32 v4, v51, v4
+; GFX11-NEXT: s_or_b32 s10, s11, s10
+; GFX11-NEXT: s_or_b32 s0, s1, s0
+; GFX11-NEXT: s_or_b32 s1, s3, s2
+; GFX11-NEXT: s_addk_i32 s5, 0x300
+; GFX11-NEXT: s_addk_i32 s6, 0x300
+; GFX11-NEXT: s_addk_i32 s9, 0x300
+; GFX11-NEXT: s_addk_i32 s0, 0x300
+; GFX11-NEXT: s_addk_i32 s1, 0x300
+; GFX11-NEXT: s_addk_i32 s10, 0x300
+; GFX11-NEXT: v_add_nc_u32_e32 v22, 0x300, v5
+; GFX11-NEXT: v_or_b32_e32 v5, v50, v7
+; GFX11-NEXT: v_or_b32_e32 v7, v49, v8
+; GFX11-NEXT: v_and_b32_e32 v8, 0xff, v20
+; GFX11-NEXT: v_add_nc_u32_e32 v20, 0x300, v4
+; GFX11-NEXT: v_add_nc_u32_e32 v4, 3, v35
+; GFX11-NEXT: s_pack_ll_b32_b16 s0, s0, s1
+; GFX11-NEXT: s_pack_ll_b32_b16 s1, s9, s10
+; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v82
+; GFX11-NEXT: s_pack_ll_b32_b16 s3, s5, s6
+; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v80
+; GFX11-NEXT: s_addk_i32 s7, 0x300
+; GFX11-NEXT: s_addk_i32 s8, 0x300
+; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v65
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0
+; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v2
+; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v4
+; GFX11-NEXT: s_pack_ll_b32_b16 s2, s7, s8
+; GFX11-NEXT: v_add_nc_u32_e32 v3, 3, v69
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1
+; GFX11-NEXT: v_or_b32_e32 v0, v84, v0
+; GFX11-NEXT: v_or_b32_e32 v2, v83, v2
+; GFX11-NEXT: v_add_nc_u32_e32 v23, 0x300, v5
+; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v3
+; GFX11-NEXT: v_add_nc_u32_e32 v5, 0x300, v7
+; GFX11-NEXT: v_or_b32_e32 v7, v48, v8
+; GFX11-NEXT: v_or_b32_e32 v4, v39, v4
+; GFX11-NEXT: s_addk_i32 s4, 0x300
+; GFX11-NEXT: v_or_b32_e32 v1, v85, v1
+; GFX11-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
+; GFX11-NEXT: v_or_b32_e32 v3, v81, v3
+; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
+; GFX11-NEXT: v_add_nc_u32_e32 v6, 0x300, v6
+; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x300, v7
+; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x300, v4
+; GFX11-NEXT: v_and_b32_e64 v8, 0xffff, s4
+; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX11-NEXT: v_and_b32_e32 v16, 0xffff, v16
+; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
+; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x300, v3
+; GFX11-NEXT: v_lshl_or_b32 v4, v4, 16, v8
+; GFX11-NEXT: v_lshl_or_b32 v5, v7, 16, v5
+; GFX11-NEXT: v_and_b32_e32 v7, 0xffff, v20
+; GFX11-NEXT: v_and_b32_e32 v8, 0xffff, v21
+; GFX11-NEXT: v_and_b32_e32 v17, 0xffff, v17
+; GFX11-NEXT: v_and_b32_e32 v20, 0xffff, v6
+; GFX11-NEXT: v_lshl_or_b32 v9, v9, 16, v16
+; GFX11-NEXT: v_and_b32_e32 v15, 0xffff, v15
+; GFX11-NEXT: v_and_b32_e32 v12, 0xffff, v12
+; GFX11-NEXT: v_and_b32_e32 v16, 0xffff, v11
+; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT: v_lshl_or_b32 v6, v23, 16, v7
+; GFX11-NEXT: v_lshl_or_b32 v7, v22, 16, v8
+; GFX11-NEXT: v_lshl_or_b32 v8, v19, 16, v17
+; GFX11-NEXT: v_lshl_or_b32 v10, v10, 16, v20
+; GFX11-NEXT: v_lshl_or_b32 v11, v18, 16, v15
+; GFX11-NEXT: v_lshl_or_b32 v12, v14, 16, v12
+; GFX11-NEXT: v_lshl_or_b32 v13, v13, 16, v16
+; GFX11-NEXT: v_lshl_or_b32 v14, v3, 16, v2
+; GFX11-NEXT: v_lshl_or_b32 v15, v1, 16, v0
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-NEXT: .LBB99_3: ; %end
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-NEXT: .LBB99_4:
+; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+; GFX11-NEXT: s_branch .LBB99_2
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -80636,45 +78746,45 @@ define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) {
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: s_clause 0x1f
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v55, off, s32 offset:128
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:124
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v55, off, s32 offset:120
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:116
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v53, off, s32 offset:112
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:108
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v53, off, s32 offset:104
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:100
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v52, off, s32 offset:96
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:92
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v54, off, s32 offset:88
-; GFX11-TRUE16-NEXT: scratch_load_b32 v66, off, s32 offset:132
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v39, off, s32
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v48, off, s32 offset:8
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v48, off, s32 offset:16
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v50, off, s32 offset:24
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v50, off, s32 offset:32
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v51, off, s32 offset:40
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v52, off, s32 offset:48
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v54, off, s32 offset:56
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v64, off, s32 offset:64
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v64, off, s32 offset:72
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v65, off, s32 offset:80
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v38, off, s32 offset:84
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v36, off, s32 offset:76
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:68
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v35, off, s32 offset:60
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v37, off, s32 offset:52
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v36, off, s32 offset:44
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v37, off, s32 offset:36
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:28
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v34, off, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_load_u16 v80, off, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_load_u16 v48, off, s32 offset:124
+; GFX11-TRUE16-NEXT: scratch_load_u16 v39, off, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_load_u16 v51, off, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_load_u16 v81, off, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_load_u16 v49, off, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_load_u16 v37, off, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_load_u16 v52, off, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_load_u16 v82, off, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_load_u16 v50, off, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_load_u16 v83, off, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_load_b32 v32, off, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_load_u16 v34, off, s32
+; GFX11-TRUE16-NEXT: scratch_load_u16 v35, off, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_load_u16 v36, off, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_load_u16 v38, off, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_load_u16 v84, off, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_load_u16 v85, off, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_load_u16 v86, off, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_load_u16 v87, off, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_load_u16 v96, off, s32 offset:64
+; GFX11-TRUE16-NEXT: scratch_load_u16 v97, off, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_load_u16 v98, off, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_load_u16 v70, off, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_load_u16 v54, off, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_load_u16 v71, off, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_load_u16 v64, off, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_load_u16 v67, off, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_load_u16 v66, off, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_load_u16 v69, off, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_load_u16 v53, off, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_load_u16 v65, off, s32 offset:20
; GFX11-TRUE16-NEXT: s_clause 0x1
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v33, off, s32 offset:12
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_load_u16 v55, off, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_load_u16 v68, off, s32 offset:4
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.h, v29.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v27.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.h, v25.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v23.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, v23.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.h, v21.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.h, v19.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.h, v17.l
@@ -80700,47 +78810,42 @@ define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.h, 8, v25.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v26.h, 8, v26.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v30.h, 8, v30.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v39.l, 8, v39.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v31.l, 8, v31.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v28.h, 8, v28.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v29.l, 8, v29.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v49.l, 8, v29.h
-; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.l, 8, v29.h
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(33)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.h, 8, v55.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(31)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.l, 8, v55.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(29)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.h, 8, v53.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v39.h, 8, v80.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v39.l, 8, v39.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(27)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.l, 8, v53.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(25)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.h, 8, v52.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(23)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.h, 8, v54.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(21)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v29.h, 8, v39.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(20)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v49.h, 8, v48.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v37.h, 8, v81.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v37.l, 8, v37.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(19)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v39.h, 8, v48.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v36.h, 8, v82.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(18)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v48.l, 8, v50.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v38.h, 8, v83.l
+; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v32
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v29.h, 8, v34.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v35.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v31.h, 8, v36.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v32.l, 8, v38.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(17)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v48.h, 8, v50.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v32.h, 8, v84.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(16)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.h, 8, v51.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v35.h, 8, v85.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(15)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.l, 8, v52.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v36.l, 8, v86.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(14)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.l, 8, v54.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.l, 8, v87.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(13)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.h, 8, v64.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v96.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(12)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.l, 8, v64.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v38.l, 8, v97.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(11)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.l, 8, v65.l
-; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v66
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v35.l, 8, v98.l
+; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB106_3
; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow
@@ -80768,22 +78873,22 @@ define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v28.l
; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v30.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v35.l
-; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v33.h
-; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v34.h
-; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v34.l
-; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v37.h
-; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v36.l
-; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v37.l
-; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v35.h
-; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v38.l
-; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v36.h
-; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v38.h
-; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v31.l
-; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v32.h
-; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v31.h
-; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v33.l
-; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v32.l
+; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v68.l
+; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v55.l
+; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v65.l
+; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v53.l
+; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v69.l
+; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v66.l
+; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v67.l
+; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v64.l
+; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v71.l
+; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v54.l
+; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v70.l
+; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v50.l
+; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v52.l
+; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v49.l
+; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v51.l
+; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v48.l
; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v23.l
; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v23.h
; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v20.l
@@ -80795,27 +78900,27 @@ define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v25.h
; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v26.h
; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v30.h
-; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v39.l
+; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v31.l
; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v28.h
; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v29.l
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v49.l
+; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v33.l
; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v29.h
-; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v49.h
-; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v39.h
-; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v48.l
-; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v48.h
-; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v51.h
-; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v52.l
-; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v11.l, v50.l
-; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v11.h, v50.h
-; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v54.l
-; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v51.l
-; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v54.h
-; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v13.h, v52.h
-; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v14.l, v53.l
-; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v53.h
-; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v15.l, v55.l
-; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v15.h, v55.h
+; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v31.h
+; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v32.l
+; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v32.h
+; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v35.h
+; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v36.l
+; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v11.l, v34.l
+; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v11.h, v34.h
+; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v38.l
+; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v35.l
+; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v38.h
+; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v13.h, v36.h
+; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v14.l, v37.l
+; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v37.h
+; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v15.l, v39.l
+; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v15.h, v39.h
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16
@@ -80832,22 +78937,22 @@ define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_lo16
@@ -80859,46 +78964,46 @@ define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_hi16
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB106_2
; GFX11-TRUE16-NEXT: .LBB106_4: ; %cmp.true
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v33.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v32.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v32.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v31.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v31.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v51.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v48.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v52.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v49.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v50.l, 3
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(8)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v38.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v71.l, 3
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v55.l, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v55.h, v0.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v38.h, 3
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v53.l, v1.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v53.h, v1.h
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v39.l, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v39.h, v0.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v70.l, 3
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v37.l, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v37.h, v1.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.l, 0x300, v0.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.h, 0x300, v0.h
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v2.h
@@ -80906,46 +79011,46 @@ define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, 0x300, v1.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, 0x300, v1.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v35.h, 3
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v52.h, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v54.l, v0.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v64.l, 3
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v36.h, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v38.l, v0.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(4)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v37.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v69.l, 3
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, 0x300, v0.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, 0x300, v0.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v36.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v37.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v54.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v67.l, 3
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v54.h, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v50.h, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v38.h, v2.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v34.h, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
-; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v51.h, v1.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v35.h, v1.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, 0x300, v2.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v36.l, 3
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v51.l, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v50.l, v0.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v66.l, 3
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v35.l, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v34.l, v0.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v1.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v1.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v35.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v68.l, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, 0x300, v0.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, 0x300, v0.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v34.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v34.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v33.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v65.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v53.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v55.l, 3
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v52.l, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v49.h, v1.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v48.l, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v48.h, v0.h
-; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v39.h, v1.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v36.l, v2.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v33.h, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v32.l, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v32.h, v0.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v31.h, v1.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, 0x300, v2.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v28.l, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v0.l
@@ -80961,7 +79066,7 @@ define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v49.l, v2.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v33.l, v2.l
; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v29.h, v0.l
; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v28.h, v0.h
; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v29.l, v1.l
@@ -80981,7 +79086,7 @@ define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v39.l, v2.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v31.l, v2.l
; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v25.h, v0.l
; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v26.h, v0.h
; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v27.h, v1.l
@@ -82684,695 +80789,350 @@ define inreg <32 x half> @bitcast_v64i8_to_v32f16_scalar(<64 x i8> inreg %a, i32
; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
; GFX9-NEXT: s_branch .LBB107_2
;
-; GFX11-TRUE16-LABEL: bitcast_v64i8_to_v32f16_scalar:
-; GFX11-TRUE16: ; %bb.0:
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v34, v14 :: v_dual_mov_b32 v31, v8
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v36, v12 :: v_dual_mov_b32 v33, v6
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v32, v10 :: v_dual_mov_b32 v35, v0
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v38, v4 :: v_dual_mov_b32 v37, v2
-; GFX11-TRUE16-NEXT: s_clause 0xf
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v0, off, s32 offset:56
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v64, off, s32 offset:52
-; GFX11-TRUE16-NEXT: scratch_load_b32 v2, off, s32 offset:60
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v4, off, s32
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v6, off, s32 offset:8
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v8, off, s32 offset:16
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v10, off, s32 offset:24
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v12, off, s32 offset:32
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v14, off, s32 offset:40
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v84, off, s32 offset:48
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v82, off, s32 offset:44
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v69, off, s32 offset:36
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v80, off, s32 offset:28
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v67, off, s32 offset:20
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v68, off, s32 offset:12
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v65, off, s32 offset:4
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v39, 8, v1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v49, 8, v3
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v48, 8, v5
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v51, 8, v7
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v50, 8, v9
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v53, 8, v11
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v52, 8, v13
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v54, 8, v15
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v17
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v55, 8, v19
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 8, v21
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 8, v23
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 8, v25
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v27, 8, v27
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 8, v29
-; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(15)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v85, 8, v0
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(13)
-; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(12)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v66, 8, v4
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(11)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v29, 8, v6
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(10)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v70, 8, v8
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(9)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v71, 8, v10
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(8)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v83, 8, v12
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v81, 8, v14
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(6)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v84, 8, v84
-; GFX11-TRUE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo
-; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB107_4
-; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-TRUE16-NEXT: s_and_b32 s5, s0, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s1, 8
-; GFX11-TRUE16-NEXT: s_and_b32 s7, s2, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s3, 8
-; GFX11-TRUE16-NEXT: s_or_b32 s5, s5, s6
-; GFX11-TRUE16-NEXT: s_or_b32 s6, s7, s8
-; GFX11-TRUE16-NEXT: s_and_b32 s7, s16, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s17, 8
-; GFX11-TRUE16-NEXT: s_and_b32 s9, s18, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s19, 8
-; GFX11-TRUE16-NEXT: s_or_b32 s7, s7, s8
-; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s10
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s5, s6
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s7, s8
-; GFX11-TRUE16-NEXT: s_and_b32 s7, s20, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s21, 8
-; GFX11-TRUE16-NEXT: s_and_b32 s9, s22, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s23, 8
-; GFX11-TRUE16-NEXT: s_or_b32 s7, s7, s8
-; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s10
-; GFX11-TRUE16-NEXT: s_and_b32 s9, s24, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s25, 8
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v35
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s7, s8
-; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s10
-; GFX11-TRUE16-NEXT: s_and_b32 s10, s28, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s11, s29, 8
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v39
-; GFX11-TRUE16-NEXT: s_or_b32 s10, s10, s11
-; GFX11-TRUE16-NEXT: s_and_b32 s9, s26, 0xff
-; GFX11-TRUE16-NEXT: v_and_b32_e64 v1, 0xffff, s10
-; GFX11-TRUE16-NEXT: s_lshl_b32 s12, s27, 8
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v38
-; GFX11-TRUE16-NEXT: s_or_b32 s9, s9, s12
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v31
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s8, s9
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v37
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v4, v0, 16, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v33
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v48
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v36
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v2, v49
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v32
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v51
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v5, v50
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v6, v52
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v53
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xff, v24
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v68
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v5, v3, 16, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v34
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v16
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v6, v7, 16, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v18
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v7, v8, 16, v2
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v54
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v17
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v20
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v22
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v55
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xff, v67
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v19
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v8, v23
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v8, v2, 16, v1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v9, v21
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v26
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v9, v3, 16, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v30
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xff, v80
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v82
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v28
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v27
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v10, v1, 16, v10
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v65
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v66
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v11, v70
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v12, v71
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xff, v69
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v13, v83
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v86, 0xff, v64
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v14, v84
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v25
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v29
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v87, 0xffff, v11
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v96, v12, v81
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v97, 0xffff, v13
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v86, v86, v85
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v98, 0xffff, v14
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v11, v0, 16, v2
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v12, v1, 16, v3
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v13, v15, 16, v87
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v14, v96, 16, v97
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v15, v86, 16, v98
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, s6
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s7 :: v_dual_mov_b32 v3, s8
-; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
-; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB107_3
-; GFX11-TRUE16-NEXT: .LBB107_2: ; %cmp.true
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v68
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v67
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v30
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v65
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v22
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v5
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v6
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v16
-; GFX11-TRUE16-NEXT: s_add_i32 s28, s28, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v70, v4
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v71, v5
-; GFX11-TRUE16-NEXT: s_and_b32 s4, s28, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s29, 8
-; GFX11-TRUE16-NEXT: s_add_i32 s24, s24, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 0x300, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v13, 0x300, v5
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v66, v6
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v26
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v7
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v28
-; GFX11-TRUE16-NEXT: s_or_b32 s4, s5, s4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 0x300, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v5
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v29, v6
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v7
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v8
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v24
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v27, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, 0x300, v5
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v25, v6
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v23, v7
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v8
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v15, 0x300, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v18
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v18, 0x300, v5
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v34
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v21, v7
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v20
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v4
-; GFX11-TRUE16-NEXT: s_and_b32 s5, s24, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s25, 8
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 0x300, v5
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v7
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v55, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v8
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v9
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v32
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v19, v5
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 0x300, v4
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v54, v7
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v17, v8
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v9
-; GFX11-TRUE16-NEXT: s_add_i32 s26, s26, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 0x300, v5
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v36
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v17, 0x300, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v19, 0x300, v7
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v53, v8
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v33
-; GFX11-TRUE16-NEXT: s_or_b32 s5, s6, s5
-; GFX11-TRUE16-NEXT: s_and_b32 s6, s26, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s27, 8
-; GFX11-TRUE16-NEXT: s_add_i32 s20, s20, 3
-; GFX11-TRUE16-NEXT: s_or_b32 s6, s7, s6
-; GFX11-TRUE16-NEXT: s_and_b32 s7, s20, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s21, 8
-; GFX11-TRUE16-NEXT: s_add_i32 s22, s22, 3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v5
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v31
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v20, 3, v37
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v21, 0x300, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v7
-; GFX11-TRUE16-NEXT: s_or_b32 s7, s8, s7
-; GFX11-TRUE16-NEXT: s_and_b32 s8, s22, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s23, 8
-; GFX11-TRUE16-NEXT: s_add_i32 s16, s16, 3
-; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s8
-; GFX11-TRUE16-NEXT: s_and_b32 s9, s16, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s17, 8
-; GFX11-TRUE16-NEXT: s_add_i32 s18, s18, 3
-; GFX11-TRUE16-NEXT: s_add_i32 s0, s0, 3
-; GFX11-TRUE16-NEXT: s_add_i32 s2, s2, 3
-; GFX11-TRUE16-NEXT: s_or_b32 s9, s10, s9
-; GFX11-TRUE16-NEXT: s_and_b32 s10, s18, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s11, s19, 8
-; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 8
-; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v52, v5
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v8
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v20
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v20, 3, v38
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v51, v4
-; GFX11-TRUE16-NEXT: s_or_b32 s10, s11, s10
-; GFX11-TRUE16-NEXT: s_or_b32 s0, s1, s0
-; GFX11-TRUE16-NEXT: s_or_b32 s1, s3, s2
-; GFX11-TRUE16-NEXT: s_addk_i32 s5, 0x300
-; GFX11-TRUE16-NEXT: s_addk_i32 s6, 0x300
-; GFX11-TRUE16-NEXT: s_addk_i32 s9, 0x300
-; GFX11-TRUE16-NEXT: s_addk_i32 s0, 0x300
-; GFX11-TRUE16-NEXT: s_addk_i32 s1, 0x300
-; GFX11-TRUE16-NEXT: s_addk_i32 s10, 0x300
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v22, 0x300, v5
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v50, v7
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v49, v8
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v20
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v20, 0x300, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v35
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s1
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s9, s10
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v82
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s5, s6
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v80
-; GFX11-TRUE16-NEXT: s_addk_i32 s7, 0x300
-; GFX11-TRUE16-NEXT: s_addk_i32 s8, 0x300
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v64
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v4
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s7, s8
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v69
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v84, v0
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v83, v2
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v23, 0x300, v5
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 0x300, v7
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v48, v8
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v39, v4
-; GFX11-TRUE16-NEXT: s_addk_i32 s4, 0x300
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v85, v1
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v81, v3
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x300, v6
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 0x300, v7
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x300, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e64 v8, 0xffff, s4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v16
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v4, v4, 16, v8
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v5, v7, 16, v5
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v20
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v21
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v17
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v6
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v9, v9, 16, v16
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v15
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v12
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v11
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v6, v23, 16, v7
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v7, v22, 16, v8
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v8, v19, 16, v17
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v10, v10, 16, v20
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v11, v18, 16, v15
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v12, v14, 16, v12
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v13, v13, 16, v16
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v14, v3, 16, v2
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v15, v1, 16, v0
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX11-TRUE16-NEXT: .LBB107_3: ; %end
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
-; GFX11-TRUE16-NEXT: .LBB107_4:
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
-; GFX11-TRUE16-NEXT: s_branch .LBB107_2
-;
-; GFX11-FAKE16-LABEL: bitcast_v64i8_to_v32f16_scalar:
-; GFX11-FAKE16: ; %bb.0:
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v34, v14 :: v_dual_mov_b32 v31, v8
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v36, v12 :: v_dual_mov_b32 v33, v6
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v32, v10 :: v_dual_mov_b32 v35, v0
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v38, v4 :: v_dual_mov_b32 v37, v2
-; GFX11-FAKE16-NEXT: s_clause 0xf
-; GFX11-FAKE16-NEXT: scratch_load_u16 v0, off, s32 offset:56
-; GFX11-FAKE16-NEXT: scratch_load_u16 v65, off, s32 offset:52
-; GFX11-FAKE16-NEXT: scratch_load_b32 v2, off, s32 offset:60
-; GFX11-FAKE16-NEXT: scratch_load_u16 v4, off, s32
-; GFX11-FAKE16-NEXT: scratch_load_u16 v6, off, s32 offset:8
-; GFX11-FAKE16-NEXT: scratch_load_u16 v8, off, s32 offset:16
-; GFX11-FAKE16-NEXT: scratch_load_u16 v10, off, s32 offset:24
-; GFX11-FAKE16-NEXT: scratch_load_u16 v12, off, s32 offset:32
-; GFX11-FAKE16-NEXT: scratch_load_u16 v14, off, s32 offset:40
-; GFX11-FAKE16-NEXT: scratch_load_u16 v84, off, s32 offset:48
-; GFX11-FAKE16-NEXT: scratch_load_u16 v82, off, s32 offset:44
-; GFX11-FAKE16-NEXT: scratch_load_u16 v69, off, s32 offset:36
-; GFX11-FAKE16-NEXT: scratch_load_u16 v80, off, s32 offset:28
-; GFX11-FAKE16-NEXT: scratch_load_u16 v67, off, s32 offset:20
-; GFX11-FAKE16-NEXT: scratch_load_u16 v68, off, s32 offset:12
-; GFX11-FAKE16-NEXT: scratch_load_u16 v64, off, s32 offset:4
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 8, v1
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v49, 8, v3
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v48, 8, v5
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v51, 8, v7
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v50, 8, v9
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v53, 8, v11
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v52, 8, v13
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v54, 8, v15
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v17, 8, v17
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v55, 8, v19
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v19, 8, v21
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v23, 8, v23
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v21, 8, v25
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v27, 8, v27
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v25, 8, v29
-; GFX11-FAKE16-NEXT: s_mov_b32 s4, 0
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(15)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v85, 8, v0
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(13)
-; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(12)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v66, 8, v4
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(11)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v29, 8, v6
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(10)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v70, 8, v8
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(9)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v71, 8, v10
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(8)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v83, 8, v12
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(7)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v81, 8, v14
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(6)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v84, 8, v84
-; GFX11-FAKE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo
-; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB107_4
-; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-FAKE16-NEXT: s_and_b32 s5, s0, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s1, 8
-; GFX11-FAKE16-NEXT: s_and_b32 s7, s2, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s3, 8
-; GFX11-FAKE16-NEXT: s_or_b32 s5, s5, s6
-; GFX11-FAKE16-NEXT: s_or_b32 s6, s7, s8
-; GFX11-FAKE16-NEXT: s_and_b32 s7, s16, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s17, 8
-; GFX11-FAKE16-NEXT: s_and_b32 s9, s18, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s19, 8
-; GFX11-FAKE16-NEXT: s_or_b32 s7, s7, s8
-; GFX11-FAKE16-NEXT: s_or_b32 s8, s9, s10
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s5, s6
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s7, s8
-; GFX11-FAKE16-NEXT: s_and_b32 s7, s20, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s21, 8
-; GFX11-FAKE16-NEXT: s_and_b32 s9, s22, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s23, 8
-; GFX11-FAKE16-NEXT: s_or_b32 s7, s7, s8
-; GFX11-FAKE16-NEXT: s_or_b32 s8, s9, s10
-; GFX11-FAKE16-NEXT: s_and_b32 s9, s24, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s25, 8
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v35
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s7, s8
-; GFX11-FAKE16-NEXT: s_or_b32 s8, s9, s10
-; GFX11-FAKE16-NEXT: s_and_b32 s10, s28, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s11, s29, 8
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v39
-; GFX11-FAKE16-NEXT: s_or_b32 s10, s10, s11
-; GFX11-FAKE16-NEXT: s_and_b32 s9, s26, 0xff
-; GFX11-FAKE16-NEXT: v_and_b32_e64 v1, 0xffff, s10
-; GFX11-FAKE16-NEXT: s_lshl_b32 s12, s27, 8
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v38
-; GFX11-FAKE16-NEXT: s_or_b32 s9, s9, s12
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v31
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s8, s9
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v37
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v4, v0, 16, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v33
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v48
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v36
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v2, v49
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v32
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v51
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v5, v50
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v6, v52
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v53
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v24
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1)
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v68
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v5, v3, 16, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v34
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v16
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v6, v7, 16, v0
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v18
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v7, v8, 16, v2
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v54
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v3, v17
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v20
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v22
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v55
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v67
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v19
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v8, v23
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v8, v2, 16, v1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v9, v21
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v26
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v10
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v9, v3, 16, v0
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v30
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v80
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v82
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v28
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v27
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v10, v1, 16, v10
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v64
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v66
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v11, v70
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v12, v71
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v69
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v13, v83
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v86, 0xff, v65
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v14, v84
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v25
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v29
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v87, 0xffff, v11
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v96, v12, v81
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v97, 0xffff, v13
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v86, v86, v85
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v98, 0xffff, v14
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v11, v0, 16, v2
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v12, v1, 16, v3
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v13, v15, 16, v87
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v14, v96, 16, v97
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v86, 16, v98
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, s6
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s7 :: v_dual_mov_b32 v3, s8
-; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
-; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB107_3
-; GFX11-FAKE16-NEXT: .LBB107_2: ; %cmp.true
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1)
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v68
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 3, v67
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 3, v30
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 3, v64
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 3, v22
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v5
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v6
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, 3, v16
-; GFX11-FAKE16-NEXT: s_add_i32 s28, s28, 3
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v70, v4
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v71, v5
-; GFX11-FAKE16-NEXT: s_and_b32 s4, s28, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s29, 8
-; GFX11-FAKE16-NEXT: s_add_i32 s24, s24, 3
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, 0x300, v4
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v13, 0x300, v5
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v66, v6
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 3, v26
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v7
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 3, v28
-; GFX11-FAKE16-NEXT: s_or_b32 s4, s5, s4
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, 0x300, v4
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v5
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v29, v6
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v7
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v8
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 3, v24
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v27, v4
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v14, 0x300, v5
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v25, v6
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v23, v7
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v8
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v15, 0x300, v4
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v18
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v18, 0x300, v5
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 3, v34
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v21, v7
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 3, v20
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4
-; GFX11-FAKE16-NEXT: s_and_b32 s5, s24, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s25, 8
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, 0x300, v5
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v7
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v55, v4
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v8
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v9
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, 3, v32
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v19, v5
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v16, 0x300, v4
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v54, v7
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v17, v8
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v9
-; GFX11-FAKE16-NEXT: s_add_i32 s26, s26, 3
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, 0x300, v5
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 3, v36
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 0x300, v4
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v19, 0x300, v7
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v53, v8
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 3, v33
-; GFX11-FAKE16-NEXT: s_or_b32 s5, s6, s5
-; GFX11-FAKE16-NEXT: s_and_b32 s6, s26, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s27, 8
-; GFX11-FAKE16-NEXT: s_add_i32 s20, s20, 3
-; GFX11-FAKE16-NEXT: s_or_b32 s6, s7, s6
-; GFX11-FAKE16-NEXT: s_and_b32 s7, s20, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s21, 8
-; GFX11-FAKE16-NEXT: s_add_i32 s22, s22, 3
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v5
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 3, v31
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v20, 3, v37
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v21, 0x300, v4
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v7
-; GFX11-FAKE16-NEXT: s_or_b32 s7, s8, s7
-; GFX11-FAKE16-NEXT: s_and_b32 s8, s22, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s9, s23, 8
-; GFX11-FAKE16-NEXT: s_add_i32 s16, s16, 3
-; GFX11-FAKE16-NEXT: s_or_b32 s8, s9, s8
-; GFX11-FAKE16-NEXT: s_and_b32 s9, s16, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s17, 8
-; GFX11-FAKE16-NEXT: s_add_i32 s18, s18, 3
-; GFX11-FAKE16-NEXT: s_add_i32 s0, s0, 3
-; GFX11-FAKE16-NEXT: s_add_i32 s2, s2, 3
-; GFX11-FAKE16-NEXT: s_or_b32 s9, s10, s9
-; GFX11-FAKE16-NEXT: s_and_b32 s10, s18, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s11, s19, 8
-; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 8
-; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 8
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v52, v5
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v8
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v20
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v20, 3, v38
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v51, v4
-; GFX11-FAKE16-NEXT: s_or_b32 s10, s11, s10
-; GFX11-FAKE16-NEXT: s_or_b32 s0, s1, s0
-; GFX11-FAKE16-NEXT: s_or_b32 s1, s3, s2
-; GFX11-FAKE16-NEXT: s_addk_i32 s5, 0x300
-; GFX11-FAKE16-NEXT: s_addk_i32 s6, 0x300
-; GFX11-FAKE16-NEXT: s_addk_i32 s9, 0x300
-; GFX11-FAKE16-NEXT: s_addk_i32 s0, 0x300
-; GFX11-FAKE16-NEXT: s_addk_i32 s1, 0x300
-; GFX11-FAKE16-NEXT: s_addk_i32 s10, 0x300
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v22, 0x300, v5
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v50, v7
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v49, v8
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v20
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v20, 0x300, v4
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v35
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s1
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s9, s10
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v82
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s5, s6
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v80
-; GFX11-FAKE16-NEXT: s_addk_i32 s7, 0x300
-; GFX11-FAKE16-NEXT: s_addk_i32 s8, 0x300
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v65
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s7, s8
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v69
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v84, v0
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v83, v2
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v23, 0x300, v5
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 0x300, v7
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v48, v8
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v39, v4
-; GFX11-FAKE16-NEXT: s_addk_i32 s4, 0x300
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v85, v1
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v81, v3
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 0x300, v6
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 0x300, v7
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x300, v4
-; GFX11-FAKE16-NEXT: v_and_b32_e64 v8, 0xffff, s4
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xffff, v16
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v4, v4, 16, v8
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v5, v7, 16, v5
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v20
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff, v21
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff, v17
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff, v6
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v9, v9, 16, v16
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff, v15
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff, v12
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xffff, v11
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v6, v23, 16, v7
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v7, v22, 16, v8
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v8, v19, 16, v17
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v10, v10, 16, v20
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v11, v18, 16, v15
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v12, v14, 16, v12
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v13, v13, 16, v16
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v14, v3, 16, v2
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v1, 16, v0
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX11-FAKE16-NEXT: .LBB107_3: ; %end
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
-; GFX11-FAKE16-NEXT: .LBB107_4:
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
-; GFX11-FAKE16-NEXT: s_branch .LBB107_2
+; GFX11-LABEL: bitcast_v64i8_to_v32f16_scalar:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_dual_mov_b32 v34, v14 :: v_dual_mov_b32 v31, v8
+; GFX11-NEXT: v_dual_mov_b32 v36, v12 :: v_dual_mov_b32 v33, v6
+; GFX11-NEXT: v_dual_mov_b32 v32, v10 :: v_dual_mov_b32 v35, v0
+; GFX11-NEXT: v_dual_mov_b32 v38, v4 :: v_dual_mov_b32 v37, v2
+; GFX11-NEXT: s_clause 0xf
+; GFX11-NEXT: scratch_load_u16 v0, off, s32 offset:56
+; GFX11-NEXT: scratch_load_u16 v65, off, s32 offset:52
+; GFX11-NEXT: scratch_load_b32 v2, off, s32 offset:60
+; GFX11-NEXT: scratch_load_u16 v4, off, s32
+; GFX11-NEXT: scratch_load_u16 v6, off, s32 offset:8
+; GFX11-NEXT: scratch_load_u16 v8, off, s32 offset:16
+; GFX11-NEXT: scratch_load_u16 v10, off, s32 offset:24
+; GFX11-NEXT: scratch_load_u16 v12, off, s32 offset:32
+; GFX11-NEXT: scratch_load_u16 v14, off, s32 offset:40
+; GFX11-NEXT: scratch_load_u16 v84, off, s32 offset:48
+; GFX11-NEXT: scratch_load_u16 v82, off, s32 offset:44
+; GFX11-NEXT: scratch_load_u16 v69, off, s32 offset:36
+; GFX11-NEXT: scratch_load_u16 v80, off, s32 offset:28
+; GFX11-NEXT: scratch_load_u16 v67, off, s32 offset:20
+; GFX11-NEXT: scratch_load_u16 v68, off, s32 offset:12
+; GFX11-NEXT: scratch_load_u16 v64, off, s32 offset:4
+; GFX11-NEXT: v_lshlrev_b32_e32 v39, 8, v1
+; GFX11-NEXT: v_lshlrev_b32_e32 v49, 8, v3
+; GFX11-NEXT: v_lshlrev_b32_e32 v48, 8, v5
+; GFX11-NEXT: v_lshlrev_b32_e32 v51, 8, v7
+; GFX11-NEXT: v_lshlrev_b32_e32 v50, 8, v9
+; GFX11-NEXT: v_lshlrev_b32_e32 v53, 8, v11
+; GFX11-NEXT: v_lshlrev_b32_e32 v52, 8, v13
+; GFX11-NEXT: v_lshlrev_b32_e32 v54, 8, v15
+; GFX11-NEXT: v_lshlrev_b32_e32 v17, 8, v17
+; GFX11-NEXT: v_lshlrev_b32_e32 v55, 8, v19
+; GFX11-NEXT: v_lshlrev_b32_e32 v19, 8, v21
+; GFX11-NEXT: v_lshlrev_b32_e32 v23, 8, v23
+; GFX11-NEXT: v_lshlrev_b32_e32 v21, 8, v25
+; GFX11-NEXT: v_lshlrev_b32_e32 v27, 8, v27
+; GFX11-NEXT: v_lshlrev_b32_e32 v25, 8, v29
+; GFX11-NEXT: s_mov_b32 s4, 0
+; GFX11-NEXT: s_waitcnt vmcnt(15)
+; GFX11-NEXT: v_lshlrev_b32_e32 v85, 8, v0
+; GFX11-NEXT: s_waitcnt vmcnt(13)
+; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2
+; GFX11-NEXT: s_waitcnt vmcnt(12)
+; GFX11-NEXT: v_lshlrev_b32_e32 v66, 8, v4
+; GFX11-NEXT: s_waitcnt vmcnt(11)
+; GFX11-NEXT: v_lshlrev_b32_e32 v29, 8, v6
+; GFX11-NEXT: s_waitcnt vmcnt(10)
+; GFX11-NEXT: v_lshlrev_b32_e32 v70, 8, v8
+; GFX11-NEXT: s_waitcnt vmcnt(9)
+; GFX11-NEXT: v_lshlrev_b32_e32 v71, 8, v10
+; GFX11-NEXT: s_waitcnt vmcnt(8)
+; GFX11-NEXT: v_lshlrev_b32_e32 v83, 8, v12
+; GFX11-NEXT: s_waitcnt vmcnt(7)
+; GFX11-NEXT: v_lshlrev_b32_e32 v81, 8, v14
+; GFX11-NEXT: s_waitcnt vmcnt(6)
+; GFX11-NEXT: v_lshlrev_b32_e32 v84, 8, v84
+; GFX11-NEXT: s_and_b32 s5, vcc_lo, exec_lo
+; GFX11-NEXT: s_cbranch_scc0 .LBB107_4
+; GFX11-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-NEXT: s_and_b32 s5, s0, 0xff
+; GFX11-NEXT: s_lshl_b32 s6, s1, 8
+; GFX11-NEXT: s_and_b32 s7, s2, 0xff
+; GFX11-NEXT: s_lshl_b32 s8, s3, 8
+; GFX11-NEXT: s_or_b32 s5, s5, s6
+; GFX11-NEXT: s_or_b32 s6, s7, s8
+; GFX11-NEXT: s_and_b32 s7, s16, 0xff
+; GFX11-NEXT: s_lshl_b32 s8, s17, 8
+; GFX11-NEXT: s_and_b32 s9, s18, 0xff
+; GFX11-NEXT: s_lshl_b32 s10, s19, 8
+; GFX11-NEXT: s_or_b32 s7, s7, s8
+; GFX11-NEXT: s_or_b32 s8, s9, s10
+; GFX11-NEXT: s_pack_ll_b32_b16 s5, s5, s6
+; GFX11-NEXT: s_pack_ll_b32_b16 s6, s7, s8
+; GFX11-NEXT: s_and_b32 s7, s20, 0xff
+; GFX11-NEXT: s_lshl_b32 s8, s21, 8
+; GFX11-NEXT: s_and_b32 s9, s22, 0xff
+; GFX11-NEXT: s_lshl_b32 s10, s23, 8
+; GFX11-NEXT: s_or_b32 s7, s7, s8
+; GFX11-NEXT: s_or_b32 s8, s9, s10
+; GFX11-NEXT: s_and_b32 s9, s24, 0xff
+; GFX11-NEXT: s_lshl_b32 s10, s25, 8
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v35
+; GFX11-NEXT: s_pack_ll_b32_b16 s7, s7, s8
+; GFX11-NEXT: s_or_b32 s8, s9, s10
+; GFX11-NEXT: s_and_b32 s10, s28, 0xff
+; GFX11-NEXT: s_lshl_b32 s11, s29, 8
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v39
+; GFX11-NEXT: s_or_b32 s10, s10, s11
+; GFX11-NEXT: s_and_b32 s9, s26, 0xff
+; GFX11-NEXT: v_and_b32_e64 v1, 0xffff, s10
+; GFX11-NEXT: s_lshl_b32 s12, s27, 8
+; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v38
+; GFX11-NEXT: s_or_b32 s9, s9, s12
+; GFX11-NEXT: v_and_b32_e32 v5, 0xff, v31
+; GFX11-NEXT: s_pack_ll_b32_b16 s8, s8, s9
+; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v37
+; GFX11-NEXT: v_lshl_or_b32 v4, v0, 16, v1
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v33
+; GFX11-NEXT: v_or_b32_e32 v3, v3, v48
+; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v36
+; GFX11-NEXT: v_or_b32_e32 v1, v2, v49
+; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v32
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v51
+; GFX11-NEXT: v_or_b32_e32 v7, v5, v50
+; GFX11-NEXT: v_or_b32_e32 v8, v6, v52
+; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX11-NEXT: v_or_b32_e32 v2, v2, v53
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT: v_and_b32_e32 v9, 0xff, v24
+; GFX11-NEXT: s_waitcnt vmcnt(1)
+; GFX11-NEXT: v_and_b32_e32 v11, 0xff, v68
+; GFX11-NEXT: v_lshl_or_b32 v5, v3, 16, v1
+; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v34
+; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v16
+; GFX11-NEXT: v_lshl_or_b32 v6, v7, 16, v0
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v18
+; GFX11-NEXT: v_lshl_or_b32 v7, v8, 16, v2
+; GFX11-NEXT: v_or_b32_e32 v1, v1, v54
+; GFX11-NEXT: v_or_b32_e32 v2, v3, v17
+; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v20
+; GFX11-NEXT: v_and_b32_e32 v8, 0xff, v22
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v55
+; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX11-NEXT: v_and_b32_e32 v12, 0xff, v67
+; GFX11-NEXT: v_or_b32_e32 v3, v3, v19
+; GFX11-NEXT: v_or_b32_e32 v10, v8, v23
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT: v_lshl_or_b32 v8, v2, 16, v1
+; GFX11-NEXT: v_or_b32_e32 v1, v9, v21
+; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v26
+; GFX11-NEXT: v_and_b32_e32 v10, 0xffff, v10
+; GFX11-NEXT: v_lshl_or_b32 v9, v3, 16, v0
+; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v30
+; GFX11-NEXT: v_and_b32_e32 v13, 0xff, v80
+; GFX11-NEXT: v_and_b32_e32 v14, 0xff, v82
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v28
+; GFX11-NEXT: v_or_b32_e32 v2, v2, v27
+; GFX11-NEXT: v_lshl_or_b32 v10, v1, 16, v10
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v64
+; GFX11-NEXT: v_or_b32_e32 v3, v3, v66
+; GFX11-NEXT: v_or_b32_e32 v11, v11, v70
+; GFX11-NEXT: v_or_b32_e32 v15, v12, v71
+; GFX11-NEXT: v_and_b32_e32 v12, 0xff, v69
+; GFX11-NEXT: v_or_b32_e32 v13, v13, v83
+; GFX11-NEXT: v_and_b32_e32 v86, 0xff, v65
+; GFX11-NEXT: v_or_b32_e32 v14, v14, v84
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v25
+; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX11-NEXT: v_or_b32_e32 v1, v1, v29
+; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX11-NEXT: v_and_b32_e32 v87, 0xffff, v11
+; GFX11-NEXT: v_or_b32_e32 v96, v12, v81
+; GFX11-NEXT: v_and_b32_e32 v97, 0xffff, v13
+; GFX11-NEXT: v_or_b32_e32 v86, v86, v85
+; GFX11-NEXT: v_and_b32_e32 v98, 0xffff, v14
+; GFX11-NEXT: v_lshl_or_b32 v11, v0, 16, v2
+; GFX11-NEXT: v_lshl_or_b32 v12, v1, 16, v3
+; GFX11-NEXT: v_lshl_or_b32 v13, v15, 16, v87
+; GFX11-NEXT: v_lshl_or_b32 v14, v96, 16, v97
+; GFX11-NEXT: v_lshl_or_b32 v15, v86, 16, v98
+; GFX11-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, s6
+; GFX11-NEXT: v_dual_mov_b32 v2, s7 :: v_dual_mov_b32 v3, s8
+; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
+; GFX11-NEXT: s_cbranch_vccnz .LBB107_3
+; GFX11-NEXT: .LBB107_2: ; %cmp.true
+; GFX11-NEXT: s_waitcnt vmcnt(1)
+; GFX11-NEXT: v_add_nc_u32_e32 v4, 3, v68
+; GFX11-NEXT: v_add_nc_u32_e32 v5, 3, v67
+; GFX11-NEXT: v_add_nc_u32_e32 v6, 3, v30
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_add_nc_u32_e32 v7, 3, v64
+; GFX11-NEXT: v_add_nc_u32_e32 v8, 3, v22
+; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v4
+; GFX11-NEXT: v_and_b32_e32 v5, 0xff, v5
+; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v6
+; GFX11-NEXT: v_add_nc_u32_e32 v9, 3, v16
+; GFX11-NEXT: s_add_i32 s28, s28, 3
+; GFX11-NEXT: v_or_b32_e32 v4, v70, v4
+; GFX11-NEXT: v_or_b32_e32 v5, v71, v5
+; GFX11-NEXT: s_and_b32 s4, s28, 0xff
+; GFX11-NEXT: s_lshl_b32 s5, s29, 8
+; GFX11-NEXT: s_add_i32 s24, s24, 3
+; GFX11-NEXT: v_add_nc_u32_e32 v11, 0x300, v4
+; GFX11-NEXT: v_add_nc_u32_e32 v13, 0x300, v5
+; GFX11-NEXT: v_or_b32_e32 v4, v66, v6
+; GFX11-NEXT: v_add_nc_u32_e32 v5, 3, v26
+; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v7
+; GFX11-NEXT: v_add_nc_u32_e32 v7, 3, v28
+; GFX11-NEXT: s_or_b32 s4, s5, s4
+; GFX11-NEXT: v_add_nc_u32_e32 v12, 0x300, v4
+; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v5
+; GFX11-NEXT: v_or_b32_e32 v5, v29, v6
+; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v7
+; GFX11-NEXT: v_and_b32_e32 v7, 0xff, v8
+; GFX11-NEXT: v_add_nc_u32_e32 v8, 3, v24
+; GFX11-NEXT: v_or_b32_e32 v4, v27, v4
+; GFX11-NEXT: v_add_nc_u32_e32 v14, 0x300, v5
+; GFX11-NEXT: v_or_b32_e32 v5, v25, v6
+; GFX11-NEXT: v_or_b32_e32 v6, v23, v7
+; GFX11-NEXT: v_and_b32_e32 v7, 0xff, v8
+; GFX11-NEXT: v_add_nc_u32_e32 v15, 0x300, v4
+; GFX11-NEXT: v_add_nc_u32_e32 v4, 3, v18
+; GFX11-NEXT: v_add_nc_u32_e32 v18, 0x300, v5
+; GFX11-NEXT: v_add_nc_u32_e32 v8, 3, v34
+; GFX11-NEXT: v_or_b32_e32 v5, v21, v7
+; GFX11-NEXT: v_add_nc_u32_e32 v7, 3, v20
+; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v4
+; GFX11-NEXT: s_and_b32 s5, s24, 0xff
+; GFX11-NEXT: s_lshl_b32 s6, s25, 8
+; GFX11-NEXT: v_add_nc_u32_e32 v10, 0x300, v5
+; GFX11-NEXT: v_and_b32_e32 v5, 0xff, v7
+; GFX11-NEXT: v_or_b32_e32 v4, v55, v4
+; GFX11-NEXT: v_and_b32_e32 v7, 0xff, v8
+; GFX11-NEXT: v_and_b32_e32 v8, 0xff, v9
+; GFX11-NEXT: v_add_nc_u32_e32 v9, 3, v32
+; GFX11-NEXT: v_or_b32_e32 v5, v19, v5
+; GFX11-NEXT: v_add_nc_u32_e32 v16, 0x300, v4
+; GFX11-NEXT: v_or_b32_e32 v4, v54, v7
+; GFX11-NEXT: v_or_b32_e32 v7, v17, v8
+; GFX11-NEXT: v_and_b32_e32 v8, 0xff, v9
+; GFX11-NEXT: s_add_i32 s26, s26, 3
+; GFX11-NEXT: v_add_nc_u32_e32 v9, 0x300, v5
+; GFX11-NEXT: v_add_nc_u32_e32 v5, 3, v36
+; GFX11-NEXT: v_add_nc_u32_e32 v17, 0x300, v4
+; GFX11-NEXT: v_add_nc_u32_e32 v19, 0x300, v7
+; GFX11-NEXT: v_or_b32_e32 v4, v53, v8
+; GFX11-NEXT: v_add_nc_u32_e32 v7, 3, v33
+; GFX11-NEXT: s_or_b32 s5, s6, s5
+; GFX11-NEXT: s_and_b32 s6, s26, 0xff
+; GFX11-NEXT: s_lshl_b32 s7, s27, 8
+; GFX11-NEXT: s_add_i32 s20, s20, 3
+; GFX11-NEXT: s_or_b32 s6, s7, s6
+; GFX11-NEXT: s_and_b32 s7, s20, 0xff
+; GFX11-NEXT: s_lshl_b32 s8, s21, 8
+; GFX11-NEXT: s_add_i32 s22, s22, 3
+; GFX11-NEXT: v_and_b32_e32 v5, 0xff, v5
+; GFX11-NEXT: v_add_nc_u32_e32 v8, 3, v31
+; GFX11-NEXT: v_add_nc_u32_e32 v20, 3, v37
+; GFX11-NEXT: v_add_nc_u32_e32 v21, 0x300, v4
+; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v7
+; GFX11-NEXT: s_or_b32 s7, s8, s7
+; GFX11-NEXT: s_and_b32 s8, s22, 0xff
+; GFX11-NEXT: s_lshl_b32 s9, s23, 8
+; GFX11-NEXT: s_add_i32 s16, s16, 3
+; GFX11-NEXT: s_or_b32 s8, s9, s8
+; GFX11-NEXT: s_and_b32 s9, s16, 0xff
+; GFX11-NEXT: s_lshl_b32 s10, s17, 8
+; GFX11-NEXT: s_add_i32 s18, s18, 3
+; GFX11-NEXT: s_add_i32 s0, s0, 3
+; GFX11-NEXT: s_add_i32 s2, s2, 3
+; GFX11-NEXT: s_or_b32 s9, s10, s9
+; GFX11-NEXT: s_and_b32 s10, s18, 0xff
+; GFX11-NEXT: s_lshl_b32 s11, s19, 8
+; GFX11-NEXT: s_and_b32 s0, s0, 0xff
+; GFX11-NEXT: s_lshl_b32 s1, s1, 8
+; GFX11-NEXT: s_and_b32 s2, s2, 0xff
+; GFX11-NEXT: s_lshl_b32 s3, s3, 8
+; GFX11-NEXT: v_or_b32_e32 v5, v52, v5
+; GFX11-NEXT: v_and_b32_e32 v7, 0xff, v8
+; GFX11-NEXT: v_and_b32_e32 v8, 0xff, v20
+; GFX11-NEXT: v_add_nc_u32_e32 v20, 3, v38
+; GFX11-NEXT: v_or_b32_e32 v4, v51, v4
+; GFX11-NEXT: s_or_b32 s10, s11, s10
+; GFX11-NEXT: s_or_b32 s0, s1, s0
+; GFX11-NEXT: s_or_b32 s1, s3, s2
+; GFX11-NEXT: s_addk_i32 s5, 0x300
+; GFX11-NEXT: s_addk_i32 s6, 0x300
+; GFX11-NEXT: s_addk_i32 s9, 0x300
+; GFX11-NEXT: s_addk_i32 s0, 0x300
+; GFX11-NEXT: s_addk_i32 s1, 0x300
+; GFX11-NEXT: s_addk_i32 s10, 0x300
+; GFX11-NEXT: v_add_nc_u32_e32 v22, 0x300, v5
+; GFX11-NEXT: v_or_b32_e32 v5, v50, v7
+; GFX11-NEXT: v_or_b32_e32 v7, v49, v8
+; GFX11-NEXT: v_and_b32_e32 v8, 0xff, v20
+; GFX11-NEXT: v_add_nc_u32_e32 v20, 0x300, v4
+; GFX11-NEXT: v_add_nc_u32_e32 v4, 3, v35
+; GFX11-NEXT: s_pack_ll_b32_b16 s0, s0, s1
+; GFX11-NEXT: s_pack_ll_b32_b16 s1, s9, s10
+; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v82
+; GFX11-NEXT: s_pack_ll_b32_b16 s3, s5, s6
+; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v80
+; GFX11-NEXT: s_addk_i32 s7, 0x300
+; GFX11-NEXT: s_addk_i32 s8, 0x300
+; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v65
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0
+; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v2
+; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v4
+; GFX11-NEXT: s_pack_ll_b32_b16 s2, s7, s8
+; GFX11-NEXT: v_add_nc_u32_e32 v3, 3, v69
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1
+; GFX11-NEXT: v_or_b32_e32 v0, v84, v0
+; GFX11-NEXT: v_or_b32_e32 v2, v83, v2
+; GFX11-NEXT: v_add_nc_u32_e32 v23, 0x300, v5
+; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v3
+; GFX11-NEXT: v_add_nc_u32_e32 v5, 0x300, v7
+; GFX11-NEXT: v_or_b32_e32 v7, v48, v8
+; GFX11-NEXT: v_or_b32_e32 v4, v39, v4
+; GFX11-NEXT: s_addk_i32 s4, 0x300
+; GFX11-NEXT: v_or_b32_e32 v1, v85, v1
+; GFX11-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
+; GFX11-NEXT: v_or_b32_e32 v3, v81, v3
+; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
+; GFX11-NEXT: v_add_nc_u32_e32 v6, 0x300, v6
+; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x300, v7
+; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x300, v4
+; GFX11-NEXT: v_and_b32_e64 v8, 0xffff, s4
+; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX11-NEXT: v_and_b32_e32 v16, 0xffff, v16
+; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
+; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x300, v3
+; GFX11-NEXT: v_lshl_or_b32 v4, v4, 16, v8
+; GFX11-NEXT: v_lshl_or_b32 v5, v7, 16, v5
+; GFX11-NEXT: v_and_b32_e32 v7, 0xffff, v20
+; GFX11-NEXT: v_and_b32_e32 v8, 0xffff, v21
+; GFX11-NEXT: v_and_b32_e32 v17, 0xffff, v17
+; GFX11-NEXT: v_and_b32_e32 v20, 0xffff, v6
+; GFX11-NEXT: v_lshl_or_b32 v9, v9, 16, v16
+; GFX11-NEXT: v_and_b32_e32 v15, 0xffff, v15
+; GFX11-NEXT: v_and_b32_e32 v12, 0xffff, v12
+; GFX11-NEXT: v_and_b32_e32 v16, 0xffff, v11
+; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT: v_lshl_or_b32 v6, v23, 16, v7
+; GFX11-NEXT: v_lshl_or_b32 v7, v22, 16, v8
+; GFX11-NEXT: v_lshl_or_b32 v8, v19, 16, v17
+; GFX11-NEXT: v_lshl_or_b32 v10, v10, 16, v20
+; GFX11-NEXT: v_lshl_or_b32 v11, v18, 16, v15
+; GFX11-NEXT: v_lshl_or_b32 v12, v14, 16, v12
+; GFX11-NEXT: v_lshl_or_b32 v13, v13, 16, v16
+; GFX11-NEXT: v_lshl_or_b32 v14, v3, 16, v2
+; GFX11-NEXT: v_lshl_or_b32 v15, v1, 16, v0
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-NEXT: .LBB107_3: ; %end
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-NEXT: .LBB107_4:
+; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+; GFX11-NEXT: s_branch .LBB107_2
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -91142,45 +88902,45 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) {
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: s_clause 0x1f
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v55, off, s32 offset:128
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:124
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v55, off, s32 offset:120
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:116
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v53, off, s32 offset:112
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:108
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v53, off, s32 offset:104
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:100
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v52, off, s32 offset:96
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:92
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v54, off, s32 offset:88
-; GFX11-TRUE16-NEXT: scratch_load_b32 v66, off, s32 offset:132
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v39, off, s32
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v48, off, s32 offset:8
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v48, off, s32 offset:16
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v50, off, s32 offset:24
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v50, off, s32 offset:32
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v51, off, s32 offset:40
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v52, off, s32 offset:48
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v54, off, s32 offset:56
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v64, off, s32 offset:64
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v64, off, s32 offset:72
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v65, off, s32 offset:80
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v38, off, s32 offset:84
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v36, off, s32 offset:76
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:68
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v35, off, s32 offset:60
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v37, off, s32 offset:52
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v36, off, s32 offset:44
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v37, off, s32 offset:36
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:28
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v34, off, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_load_u16 v80, off, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_load_u16 v48, off, s32 offset:124
+; GFX11-TRUE16-NEXT: scratch_load_u16 v39, off, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_load_u16 v51, off, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_load_u16 v81, off, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_load_u16 v49, off, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_load_u16 v37, off, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_load_u16 v52, off, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_load_u16 v82, off, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_load_u16 v50, off, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_load_u16 v83, off, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_load_b32 v32, off, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_load_u16 v34, off, s32
+; GFX11-TRUE16-NEXT: scratch_load_u16 v35, off, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_load_u16 v36, off, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_load_u16 v38, off, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_load_u16 v84, off, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_load_u16 v85, off, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_load_u16 v86, off, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_load_u16 v87, off, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_load_u16 v96, off, s32 offset:64
+; GFX11-TRUE16-NEXT: scratch_load_u16 v97, off, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_load_u16 v98, off, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_load_u16 v70, off, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_load_u16 v54, off, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_load_u16 v71, off, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_load_u16 v64, off, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_load_u16 v67, off, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_load_u16 v66, off, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_load_u16 v69, off, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_load_u16 v53, off, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_load_u16 v65, off, s32 offset:20
; GFX11-TRUE16-NEXT: s_clause 0x1
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v33, off, s32 offset:12
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_load_u16 v55, off, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_load_u16 v68, off, s32 offset:4
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.h, v29.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v27.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.h, v25.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v23.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, v23.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.h, v21.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.h, v19.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.h, v17.l
@@ -91206,47 +88966,42 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.h, 8, v25.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v26.h, 8, v26.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v30.h, 8, v30.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v39.l, 8, v39.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v31.l, 8, v31.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v28.h, 8, v28.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v29.l, 8, v29.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v49.l, 8, v29.h
-; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.l, 8, v29.h
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(33)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.h, 8, v55.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(31)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.l, 8, v55.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(29)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.h, 8, v53.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v39.h, 8, v80.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v39.l, 8, v39.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(27)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.l, 8, v53.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(25)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.h, 8, v52.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(23)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.h, 8, v54.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(21)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v29.h, 8, v39.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(20)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v49.h, 8, v48.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v37.h, 8, v81.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v37.l, 8, v37.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(19)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v39.h, 8, v48.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v36.h, 8, v82.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(18)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v48.l, 8, v50.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v38.h, 8, v83.l
+; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v32
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v29.h, 8, v34.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v35.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v31.h, 8, v36.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v32.l, 8, v38.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(17)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v48.h, 8, v50.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v32.h, 8, v84.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(16)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.h, 8, v51.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v35.h, 8, v85.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(15)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.l, 8, v52.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v36.l, 8, v86.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(14)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.l, 8, v54.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.l, 8, v87.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(13)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.h, 8, v64.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v96.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(12)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.l, 8, v64.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v38.l, 8, v97.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(11)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.l, 8, v65.l
-; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v66
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v35.l, 8, v98.l
+; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB110_3
; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow
@@ -91274,22 +89029,22 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v28.l
; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v30.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v35.l
-; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v33.h
-; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v34.h
-; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v34.l
-; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v37.h
-; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v36.l
-; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v37.l
-; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v35.h
-; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v38.l
-; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v36.h
-; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v38.h
-; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v31.l
-; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v32.h
-; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v31.h
-; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v33.l
-; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v32.l
+; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v68.l
+; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v55.l
+; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v65.l
+; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v53.l
+; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v69.l
+; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v66.l
+; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v67.l
+; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v64.l
+; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v71.l
+; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v54.l
+; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v70.l
+; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v50.l
+; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v52.l
+; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v49.l
+; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v51.l
+; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v48.l
; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v23.l
; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v23.h
; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v20.l
@@ -91301,27 +89056,27 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v25.h
; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v26.h
; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v30.h
-; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v39.l
+; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v31.l
; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v28.h
; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v29.l
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v49.l
+; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v33.l
; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v29.h
-; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v49.h
-; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v39.h
-; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v48.l
-; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v48.h
-; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v51.h
-; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v52.l
-; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v11.l, v50.l
-; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v11.h, v50.h
-; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v54.l
-; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v51.l
-; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v54.h
-; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v13.h, v52.h
-; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v14.l, v53.l
-; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v53.h
-; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v15.l, v55.l
-; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v15.h, v55.h
+; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v31.h
+; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v32.l
+; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v32.h
+; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v35.h
+; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v36.l
+; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v11.l, v34.l
+; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v11.h, v34.h
+; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v38.l
+; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v35.l
+; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v38.h
+; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v13.h, v36.h
+; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v14.l, v37.l
+; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v37.h
+; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v15.l, v39.l
+; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v15.h, v39.h
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16
@@ -91338,22 +89093,22 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_lo16
@@ -91365,46 +89120,46 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_hi16
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB110_2
; GFX11-TRUE16-NEXT: .LBB110_4: ; %cmp.true
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v33.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v32.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v32.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v31.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v31.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v51.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v48.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v52.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v49.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v50.l, 3
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(8)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v38.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v71.l, 3
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v55.l, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v55.h, v0.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v38.h, 3
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v53.l, v1.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v53.h, v1.h
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v39.l, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v39.h, v0.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v70.l, 3
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v37.l, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v37.h, v1.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.l, 0x300, v0.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.h, 0x300, v0.h
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v2.h
@@ -91412,46 +89167,46 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, 0x300, v1.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, 0x300, v1.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v35.h, 3
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v52.h, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v54.l, v0.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v64.l, 3
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v36.h, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v38.l, v0.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(4)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v37.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v69.l, 3
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, 0x300, v0.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, 0x300, v0.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v36.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v37.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v54.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v67.l, 3
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v54.h, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v50.h, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v38.h, v2.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v34.h, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
-; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v51.h, v1.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v35.h, v1.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, 0x300, v2.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v36.l, 3
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v51.l, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v50.l, v0.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v66.l, 3
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v35.l, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v34.l, v0.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v1.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v1.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v35.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v68.l, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, 0x300, v0.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, 0x300, v0.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v34.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v34.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v33.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v65.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v53.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v55.l, 3
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v52.l, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v49.h, v1.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v48.l, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v48.h, v0.h
-; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v39.h, v1.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v36.l, v2.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v33.h, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v32.l, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v32.h, v0.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v31.h, v1.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, 0x300, v2.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v28.l, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v0.l
@@ -91467,7 +89222,7 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v49.l, v2.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v33.l, v2.l
; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v29.h, v0.l
; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v28.h, v0.h
; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v29.l, v1.l
@@ -91487,7 +89242,7 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v39.l, v2.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v31.l, v2.l
; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v25.h, v0.l
; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v26.h, v0.h
; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v27.h, v1.l
@@ -93194,695 +90949,350 @@ define inreg <32 x bfloat> @bitcast_v64i8_to_v32bf16_scalar(<64 x i8> inreg %a,
; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
; GFX9-NEXT: s_branch .LBB111_2
;
-; GFX11-TRUE16-LABEL: bitcast_v64i8_to_v32bf16_scalar:
-; GFX11-TRUE16: ; %bb.0:
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v34, v14 :: v_dual_mov_b32 v31, v8
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v36, v12 :: v_dual_mov_b32 v33, v6
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v32, v10 :: v_dual_mov_b32 v35, v0
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v38, v4 :: v_dual_mov_b32 v37, v2
-; GFX11-TRUE16-NEXT: s_clause 0xf
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v0, off, s32 offset:56
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v64, off, s32 offset:52
-; GFX11-TRUE16-NEXT: scratch_load_b32 v2, off, s32 offset:60
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v4, off, s32
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v6, off, s32 offset:8
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v8, off, s32 offset:16
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v10, off, s32 offset:24
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v12, off, s32 offset:32
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v14, off, s32 offset:40
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v84, off, s32 offset:48
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v82, off, s32 offset:44
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v69, off, s32 offset:36
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v80, off, s32 offset:28
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v67, off, s32 offset:20
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v68, off, s32 offset:12
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v65, off, s32 offset:4
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v39, 8, v1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v49, 8, v3
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v48, 8, v5
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v51, 8, v7
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v50, 8, v9
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v53, 8, v11
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v52, 8, v13
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v54, 8, v15
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v17
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v55, 8, v19
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 8, v21
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 8, v23
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 8, v25
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v27, 8, v27
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 8, v29
-; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(15)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v85, 8, v0
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(13)
-; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(12)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v66, 8, v4
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(11)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v29, 8, v6
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(10)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v70, 8, v8
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(9)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v71, 8, v10
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(8)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v83, 8, v12
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v81, 8, v14
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(6)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v84, 8, v84
-; GFX11-TRUE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo
-; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB111_4
-; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-TRUE16-NEXT: s_and_b32 s5, s0, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s1, 8
-; GFX11-TRUE16-NEXT: s_and_b32 s7, s2, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s3, 8
-; GFX11-TRUE16-NEXT: s_or_b32 s5, s5, s6
-; GFX11-TRUE16-NEXT: s_or_b32 s6, s7, s8
-; GFX11-TRUE16-NEXT: s_and_b32 s7, s16, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s17, 8
-; GFX11-TRUE16-NEXT: s_and_b32 s9, s18, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s19, 8
-; GFX11-TRUE16-NEXT: s_or_b32 s7, s7, s8
-; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s10
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s5, s5, s6
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s7, s8
-; GFX11-TRUE16-NEXT: s_and_b32 s7, s20, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s21, 8
-; GFX11-TRUE16-NEXT: s_and_b32 s9, s22, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s23, 8
-; GFX11-TRUE16-NEXT: s_or_b32 s7, s7, s8
-; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s10
-; GFX11-TRUE16-NEXT: s_and_b32 s9, s24, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s25, 8
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v35
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s7, s8
-; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s10
-; GFX11-TRUE16-NEXT: s_and_b32 s10, s28, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s11, s29, 8
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v39
-; GFX11-TRUE16-NEXT: s_or_b32 s10, s10, s11
-; GFX11-TRUE16-NEXT: s_and_b32 s9, s26, 0xff
-; GFX11-TRUE16-NEXT: v_and_b32_e64 v1, 0xffff, s10
-; GFX11-TRUE16-NEXT: s_lshl_b32 s12, s27, 8
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v38
-; GFX11-TRUE16-NEXT: s_or_b32 s9, s9, s12
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v31
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s8, s9
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v37
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v4, v0, 16, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v33
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v48
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v36
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v2, v49
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v32
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v51
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v5, v50
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v6, v52
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v53
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xff, v24
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v68
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v5, v3, 16, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v34
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v16
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v6, v7, 16, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v18
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v7, v8, 16, v2
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v54
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v17
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v20
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v22
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v55
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xff, v67
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v19
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v8, v23
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v8, v2, 16, v1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v9, v21
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v26
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v9, v3, 16, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v30
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xff, v80
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v82
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v28
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v27
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v10, v1, 16, v10
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v65
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v66
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v11, v70
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v12, v71
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xff, v69
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v13, v83
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v86, 0xff, v64
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v14, v84
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v25
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v29
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v87, 0xffff, v11
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v96, v12, v81
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v97, 0xffff, v13
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v86, v86, v85
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v98, 0xffff, v14
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v11, v0, 16, v2
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v12, v1, 16, v3
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v13, v15, 16, v87
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v14, v96, 16, v97
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v15, v86, 16, v98
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, s6
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s7 :: v_dual_mov_b32 v3, s8
-; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
-; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB111_3
-; GFX11-TRUE16-NEXT: .LBB111_2: ; %cmp.true
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v68
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v67
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v30
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v65
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v22
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v5
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v6
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v16
-; GFX11-TRUE16-NEXT: s_add_i32 s28, s28, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v70, v4
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v71, v5
-; GFX11-TRUE16-NEXT: s_and_b32 s4, s28, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s29, 8
-; GFX11-TRUE16-NEXT: s_add_i32 s24, s24, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 0x300, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v13, 0x300, v5
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v66, v6
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v26
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v7
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v28
-; GFX11-TRUE16-NEXT: s_or_b32 s4, s5, s4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 0x300, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v5
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v29, v6
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v7
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v8
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v24
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v27, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, 0x300, v5
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v25, v6
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v23, v7
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v8
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v15, 0x300, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v18
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v18, 0x300, v5
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v34
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v21, v7
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v20
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v4
-; GFX11-TRUE16-NEXT: s_and_b32 s5, s24, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s25, 8
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 0x300, v5
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v7
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v55, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v8
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v9
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v32
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v19, v5
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 0x300, v4
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v54, v7
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v17, v8
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v9
-; GFX11-TRUE16-NEXT: s_add_i32 s26, s26, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 0x300, v5
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v36
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v17, 0x300, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v19, 0x300, v7
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v53, v8
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v33
-; GFX11-TRUE16-NEXT: s_or_b32 s5, s6, s5
-; GFX11-TRUE16-NEXT: s_and_b32 s6, s26, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s27, 8
-; GFX11-TRUE16-NEXT: s_add_i32 s20, s20, 3
-; GFX11-TRUE16-NEXT: s_or_b32 s6, s7, s6
-; GFX11-TRUE16-NEXT: s_and_b32 s7, s20, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s21, 8
-; GFX11-TRUE16-NEXT: s_add_i32 s22, s22, 3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v5
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v31
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v20, 3, v37
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v21, 0x300, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v7
-; GFX11-TRUE16-NEXT: s_or_b32 s7, s8, s7
-; GFX11-TRUE16-NEXT: s_and_b32 s8, s22, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s23, 8
-; GFX11-TRUE16-NEXT: s_add_i32 s16, s16, 3
-; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s8
-; GFX11-TRUE16-NEXT: s_and_b32 s9, s16, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s17, 8
-; GFX11-TRUE16-NEXT: s_add_i32 s18, s18, 3
-; GFX11-TRUE16-NEXT: s_add_i32 s0, s0, 3
-; GFX11-TRUE16-NEXT: s_add_i32 s2, s2, 3
-; GFX11-TRUE16-NEXT: s_or_b32 s9, s10, s9
-; GFX11-TRUE16-NEXT: s_and_b32 s10, s18, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s11, s19, 8
-; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 8
-; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0xff
-; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 8
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v52, v5
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v8
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v20
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v20, 3, v38
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v51, v4
-; GFX11-TRUE16-NEXT: s_or_b32 s10, s11, s10
-; GFX11-TRUE16-NEXT: s_or_b32 s0, s1, s0
-; GFX11-TRUE16-NEXT: s_or_b32 s1, s3, s2
-; GFX11-TRUE16-NEXT: s_addk_i32 s5, 0x300
-; GFX11-TRUE16-NEXT: s_addk_i32 s6, 0x300
-; GFX11-TRUE16-NEXT: s_addk_i32 s9, 0x300
-; GFX11-TRUE16-NEXT: s_addk_i32 s0, 0x300
-; GFX11-TRUE16-NEXT: s_addk_i32 s1, 0x300
-; GFX11-TRUE16-NEXT: s_addk_i32 s10, 0x300
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v22, 0x300, v5
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v50, v7
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v49, v8
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v20
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v20, 0x300, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v35
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s1
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s9, s10
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v82
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s5, s6
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v80
-; GFX11-TRUE16-NEXT: s_addk_i32 s7, 0x300
-; GFX11-TRUE16-NEXT: s_addk_i32 s8, 0x300
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v64
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v4
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s7, s8
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v69
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v84, v0
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v83, v2
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v23, 0x300, v5
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 0x300, v7
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v48, v8
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v39, v4
-; GFX11-TRUE16-NEXT: s_addk_i32 s4, 0x300
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v85, v1
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v81, v3
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x300, v6
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 0x300, v7
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x300, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e64 v8, 0xffff, s4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v16
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v4, v4, 16, v8
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v5, v7, 16, v5
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v20
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v21
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v17
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v6
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v9, v9, 16, v16
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v15
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v12
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v11
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v6, v23, 16, v7
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v7, v22, 16, v8
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v8, v19, 16, v17
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v10, v10, 16, v20
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v11, v18, 16, v15
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v12, v14, 16, v12
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v13, v13, 16, v16
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v14, v3, 16, v2
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v15, v1, 16, v0
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX11-TRUE16-NEXT: .LBB111_3: ; %end
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
-; GFX11-TRUE16-NEXT: .LBB111_4:
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
-; GFX11-TRUE16-NEXT: s_branch .LBB111_2
-;
-; GFX11-FAKE16-LABEL: bitcast_v64i8_to_v32bf16_scalar:
-; GFX11-FAKE16: ; %bb.0:
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v34, v14 :: v_dual_mov_b32 v31, v8
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v36, v12 :: v_dual_mov_b32 v33, v6
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v32, v10 :: v_dual_mov_b32 v35, v0
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v38, v4 :: v_dual_mov_b32 v37, v2
-; GFX11-FAKE16-NEXT: s_clause 0xf
-; GFX11-FAKE16-NEXT: scratch_load_u16 v0, off, s32 offset:56
-; GFX11-FAKE16-NEXT: scratch_load_u16 v65, off, s32 offset:52
-; GFX11-FAKE16-NEXT: scratch_load_b32 v2, off, s32 offset:60
-; GFX11-FAKE16-NEXT: scratch_load_u16 v4, off, s32
-; GFX11-FAKE16-NEXT: scratch_load_u16 v6, off, s32 offset:8
-; GFX11-FAKE16-NEXT: scratch_load_u16 v8, off, s32 offset:16
-; GFX11-FAKE16-NEXT: scratch_load_u16 v10, off, s32 offset:24
-; GFX11-FAKE16-NEXT: scratch_load_u16 v12, off, s32 offset:32
-; GFX11-FAKE16-NEXT: scratch_load_u16 v14, off, s32 offset:40
-; GFX11-FAKE16-NEXT: scratch_load_u16 v84, off, s32 offset:48
-; GFX11-FAKE16-NEXT: scratch_load_u16 v82, off, s32 offset:44
-; GFX11-FAKE16-NEXT: scratch_load_u16 v69, off, s32 offset:36
-; GFX11-FAKE16-NEXT: scratch_load_u16 v80, off, s32 offset:28
-; GFX11-FAKE16-NEXT: scratch_load_u16 v67, off, s32 offset:20
-; GFX11-FAKE16-NEXT: scratch_load_u16 v68, off, s32 offset:12
-; GFX11-FAKE16-NEXT: scratch_load_u16 v64, off, s32 offset:4
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v39, 8, v1
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v49, 8, v3
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v48, 8, v5
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v51, 8, v7
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v50, 8, v9
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v53, 8, v11
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v52, 8, v13
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v54, 8, v15
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v17, 8, v17
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v55, 8, v19
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v19, 8, v21
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v23, 8, v23
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v21, 8, v25
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v27, 8, v27
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v25, 8, v29
-; GFX11-FAKE16-NEXT: s_mov_b32 s4, 0
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(15)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v85, 8, v0
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(13)
-; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(12)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v66, 8, v4
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(11)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v29, 8, v6
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(10)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v70, 8, v8
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(9)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v71, 8, v10
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(8)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v83, 8, v12
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(7)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v81, 8, v14
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(6)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v84, 8, v84
-; GFX11-FAKE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo
-; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB111_4
-; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-FAKE16-NEXT: s_and_b32 s5, s0, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s1, 8
-; GFX11-FAKE16-NEXT: s_and_b32 s7, s2, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s3, 8
-; GFX11-FAKE16-NEXT: s_or_b32 s5, s5, s6
-; GFX11-FAKE16-NEXT: s_or_b32 s6, s7, s8
-; GFX11-FAKE16-NEXT: s_and_b32 s7, s16, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s17, 8
-; GFX11-FAKE16-NEXT: s_and_b32 s9, s18, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s19, 8
-; GFX11-FAKE16-NEXT: s_or_b32 s7, s7, s8
-; GFX11-FAKE16-NEXT: s_or_b32 s8, s9, s10
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s5, s5, s6
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s6, s7, s8
-; GFX11-FAKE16-NEXT: s_and_b32 s7, s20, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s21, 8
-; GFX11-FAKE16-NEXT: s_and_b32 s9, s22, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s23, 8
-; GFX11-FAKE16-NEXT: s_or_b32 s7, s7, s8
-; GFX11-FAKE16-NEXT: s_or_b32 s8, s9, s10
-; GFX11-FAKE16-NEXT: s_and_b32 s9, s24, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s25, 8
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v35
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s7, s7, s8
-; GFX11-FAKE16-NEXT: s_or_b32 s8, s9, s10
-; GFX11-FAKE16-NEXT: s_and_b32 s10, s28, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s11, s29, 8
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v39
-; GFX11-FAKE16-NEXT: s_or_b32 s10, s10, s11
-; GFX11-FAKE16-NEXT: s_and_b32 s9, s26, 0xff
-; GFX11-FAKE16-NEXT: v_and_b32_e64 v1, 0xffff, s10
-; GFX11-FAKE16-NEXT: s_lshl_b32 s12, s27, 8
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v38
-; GFX11-FAKE16-NEXT: s_or_b32 s9, s9, s12
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v31
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s8, s8, s9
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v37
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v4, v0, 16, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v33
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v48
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v36
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v2, v49
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v32
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v51
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v5, v50
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v6, v52
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v53
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xff, v24
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1)
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xff, v68
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v5, v3, 16, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v34
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v16
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v6, v7, 16, v0
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v18
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v7, v8, 16, v2
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v54
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v3, v17
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v20
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v22
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v55
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v67
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v19
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v8, v23
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v8, v2, 16, v1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v9, v21
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v26
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v10
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v9, v3, 16, v0
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v30
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xff, v80
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xff, v82
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v28
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v27
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v10, v1, 16, v10
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v64
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v3, v66
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v11, v70
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v12, v71
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xff, v69
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v13, v83
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v86, 0xff, v65
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v14, v84
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v25
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v29
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v87, 0xffff, v11
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v96, v12, v81
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v97, 0xffff, v13
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v86, v86, v85
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v98, 0xffff, v14
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v11, v0, 16, v2
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v12, v1, 16, v3
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v13, v15, 16, v87
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v14, v96, 16, v97
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v86, 16, v98
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, s6
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s7 :: v_dual_mov_b32 v3, s8
-; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
-; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB111_3
-; GFX11-FAKE16-NEXT: .LBB111_2: ; %cmp.true
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1)
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v68
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 3, v67
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 3, v30
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 3, v64
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 3, v22
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v5
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v6
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, 3, v16
-; GFX11-FAKE16-NEXT: s_add_i32 s28, s28, 3
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v70, v4
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v71, v5
-; GFX11-FAKE16-NEXT: s_and_b32 s4, s28, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s29, 8
-; GFX11-FAKE16-NEXT: s_add_i32 s24, s24, 3
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, 0x300, v4
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v13, 0x300, v5
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v66, v6
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 3, v26
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v7
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 3, v28
-; GFX11-FAKE16-NEXT: s_or_b32 s4, s5, s4
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, 0x300, v4
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v5
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v29, v6
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v7
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v8
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 3, v24
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v27, v4
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v14, 0x300, v5
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v25, v6
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, v23, v7
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v8
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v15, 0x300, v4
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v18
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v18, 0x300, v5
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 3, v34
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v21, v7
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 3, v20
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4
-; GFX11-FAKE16-NEXT: s_and_b32 s5, s24, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s25, 8
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, 0x300, v5
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v7
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v55, v4
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v8
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v9
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, 3, v32
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v19, v5
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v16, 0x300, v4
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v54, v7
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v17, v8
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v9
-; GFX11-FAKE16-NEXT: s_add_i32 s26, s26, 3
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, 0x300, v5
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 3, v36
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 0x300, v4
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v19, 0x300, v7
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v53, v8
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 3, v33
-; GFX11-FAKE16-NEXT: s_or_b32 s5, s6, s5
-; GFX11-FAKE16-NEXT: s_and_b32 s6, s26, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s7, s27, 8
-; GFX11-FAKE16-NEXT: s_add_i32 s20, s20, 3
-; GFX11-FAKE16-NEXT: s_or_b32 s6, s7, s6
-; GFX11-FAKE16-NEXT: s_and_b32 s7, s20, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s8, s21, 8
-; GFX11-FAKE16-NEXT: s_add_i32 s22, s22, 3
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xff, v5
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 3, v31
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v20, 3, v37
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v21, 0x300, v4
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v7
-; GFX11-FAKE16-NEXT: s_or_b32 s7, s8, s7
-; GFX11-FAKE16-NEXT: s_and_b32 s8, s22, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s9, s23, 8
-; GFX11-FAKE16-NEXT: s_add_i32 s16, s16, 3
-; GFX11-FAKE16-NEXT: s_or_b32 s8, s9, s8
-; GFX11-FAKE16-NEXT: s_and_b32 s9, s16, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s10, s17, 8
-; GFX11-FAKE16-NEXT: s_add_i32 s18, s18, 3
-; GFX11-FAKE16-NEXT: s_add_i32 s0, s0, 3
-; GFX11-FAKE16-NEXT: s_add_i32 s2, s2, 3
-; GFX11-FAKE16-NEXT: s_or_b32 s9, s10, s9
-; GFX11-FAKE16-NEXT: s_and_b32 s10, s18, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s11, s19, 8
-; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 8
-; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0xff
-; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 8
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v52, v5
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v8
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v20
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v20, 3, v38
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v51, v4
-; GFX11-FAKE16-NEXT: s_or_b32 s10, s11, s10
-; GFX11-FAKE16-NEXT: s_or_b32 s0, s1, s0
-; GFX11-FAKE16-NEXT: s_or_b32 s1, s3, s2
-; GFX11-FAKE16-NEXT: s_addk_i32 s5, 0x300
-; GFX11-FAKE16-NEXT: s_addk_i32 s6, 0x300
-; GFX11-FAKE16-NEXT: s_addk_i32 s9, 0x300
-; GFX11-FAKE16-NEXT: s_addk_i32 s0, 0x300
-; GFX11-FAKE16-NEXT: s_addk_i32 s1, 0x300
-; GFX11-FAKE16-NEXT: s_addk_i32 s10, 0x300
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v22, 0x300, v5
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v50, v7
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v49, v8
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xff, v20
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v20, 0x300, v4
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v35
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s1
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s9, s10
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v82
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s5, s6
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v80
-; GFX11-FAKE16-NEXT: s_addk_i32 s7, 0x300
-; GFX11-FAKE16-NEXT: s_addk_i32 s8, 0x300
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v65
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v4
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s7, s8
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v69
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v84, v0
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v83, v2
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v23, 0x300, v5
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 0x300, v7
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, v48, v8
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v39, v4
-; GFX11-FAKE16-NEXT: s_addk_i32 s4, 0x300
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v85, v1
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v81, v3
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 0x300, v6
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 0x300, v7
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x300, v4
-; GFX11-FAKE16-NEXT: v_and_b32_e64 v8, 0xffff, s4
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xffff, v16
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v4, v4, 16, v8
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v5, v7, 16, v5
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff, v20
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff, v21
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff, v17
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff, v6
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v9, v9, 16, v16
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff, v15
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff, v12
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xffff, v11
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v6, v23, 16, v7
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v7, v22, 16, v8
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v8, v19, 16, v17
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v10, v10, 16, v20
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v11, v18, 16, v15
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v12, v14, 16, v12
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v13, v13, 16, v16
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v14, v3, 16, v2
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v1, 16, v0
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX11-FAKE16-NEXT: .LBB111_3: ; %end
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
-; GFX11-FAKE16-NEXT: .LBB111_4:
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
-; GFX11-FAKE16-NEXT: s_branch .LBB111_2
+; GFX11-LABEL: bitcast_v64i8_to_v32bf16_scalar:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_dual_mov_b32 v34, v14 :: v_dual_mov_b32 v31, v8
+; GFX11-NEXT: v_dual_mov_b32 v36, v12 :: v_dual_mov_b32 v33, v6
+; GFX11-NEXT: v_dual_mov_b32 v32, v10 :: v_dual_mov_b32 v35, v0
+; GFX11-NEXT: v_dual_mov_b32 v38, v4 :: v_dual_mov_b32 v37, v2
+; GFX11-NEXT: s_clause 0xf
+; GFX11-NEXT: scratch_load_u16 v0, off, s32 offset:56
+; GFX11-NEXT: scratch_load_u16 v65, off, s32 offset:52
+; GFX11-NEXT: scratch_load_b32 v2, off, s32 offset:60
+; GFX11-NEXT: scratch_load_u16 v4, off, s32
+; GFX11-NEXT: scratch_load_u16 v6, off, s32 offset:8
+; GFX11-NEXT: scratch_load_u16 v8, off, s32 offset:16
+; GFX11-NEXT: scratch_load_u16 v10, off, s32 offset:24
+; GFX11-NEXT: scratch_load_u16 v12, off, s32 offset:32
+; GFX11-NEXT: scratch_load_u16 v14, off, s32 offset:40
+; GFX11-NEXT: scratch_load_u16 v84, off, s32 offset:48
+; GFX11-NEXT: scratch_load_u16 v82, off, s32 offset:44
+; GFX11-NEXT: scratch_load_u16 v69, off, s32 offset:36
+; GFX11-NEXT: scratch_load_u16 v80, off, s32 offset:28
+; GFX11-NEXT: scratch_load_u16 v67, off, s32 offset:20
+; GFX11-NEXT: scratch_load_u16 v68, off, s32 offset:12
+; GFX11-NEXT: scratch_load_u16 v64, off, s32 offset:4
+; GFX11-NEXT: v_lshlrev_b32_e32 v39, 8, v1
+; GFX11-NEXT: v_lshlrev_b32_e32 v49, 8, v3
+; GFX11-NEXT: v_lshlrev_b32_e32 v48, 8, v5
+; GFX11-NEXT: v_lshlrev_b32_e32 v51, 8, v7
+; GFX11-NEXT: v_lshlrev_b32_e32 v50, 8, v9
+; GFX11-NEXT: v_lshlrev_b32_e32 v53, 8, v11
+; GFX11-NEXT: v_lshlrev_b32_e32 v52, 8, v13
+; GFX11-NEXT: v_lshlrev_b32_e32 v54, 8, v15
+; GFX11-NEXT: v_lshlrev_b32_e32 v17, 8, v17
+; GFX11-NEXT: v_lshlrev_b32_e32 v55, 8, v19
+; GFX11-NEXT: v_lshlrev_b32_e32 v19, 8, v21
+; GFX11-NEXT: v_lshlrev_b32_e32 v23, 8, v23
+; GFX11-NEXT: v_lshlrev_b32_e32 v21, 8, v25
+; GFX11-NEXT: v_lshlrev_b32_e32 v27, 8, v27
+; GFX11-NEXT: v_lshlrev_b32_e32 v25, 8, v29
+; GFX11-NEXT: s_mov_b32 s4, 0
+; GFX11-NEXT: s_waitcnt vmcnt(15)
+; GFX11-NEXT: v_lshlrev_b32_e32 v85, 8, v0
+; GFX11-NEXT: s_waitcnt vmcnt(13)
+; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2
+; GFX11-NEXT: s_waitcnt vmcnt(12)
+; GFX11-NEXT: v_lshlrev_b32_e32 v66, 8, v4
+; GFX11-NEXT: s_waitcnt vmcnt(11)
+; GFX11-NEXT: v_lshlrev_b32_e32 v29, 8, v6
+; GFX11-NEXT: s_waitcnt vmcnt(10)
+; GFX11-NEXT: v_lshlrev_b32_e32 v70, 8, v8
+; GFX11-NEXT: s_waitcnt vmcnt(9)
+; GFX11-NEXT: v_lshlrev_b32_e32 v71, 8, v10
+; GFX11-NEXT: s_waitcnt vmcnt(8)
+; GFX11-NEXT: v_lshlrev_b32_e32 v83, 8, v12
+; GFX11-NEXT: s_waitcnt vmcnt(7)
+; GFX11-NEXT: v_lshlrev_b32_e32 v81, 8, v14
+; GFX11-NEXT: s_waitcnt vmcnt(6)
+; GFX11-NEXT: v_lshlrev_b32_e32 v84, 8, v84
+; GFX11-NEXT: s_and_b32 s5, vcc_lo, exec_lo
+; GFX11-NEXT: s_cbranch_scc0 .LBB111_4
+; GFX11-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-NEXT: s_and_b32 s5, s0, 0xff
+; GFX11-NEXT: s_lshl_b32 s6, s1, 8
+; GFX11-NEXT: s_and_b32 s7, s2, 0xff
+; GFX11-NEXT: s_lshl_b32 s8, s3, 8
+; GFX11-NEXT: s_or_b32 s5, s5, s6
+; GFX11-NEXT: s_or_b32 s6, s7, s8
+; GFX11-NEXT: s_and_b32 s7, s16, 0xff
+; GFX11-NEXT: s_lshl_b32 s8, s17, 8
+; GFX11-NEXT: s_and_b32 s9, s18, 0xff
+; GFX11-NEXT: s_lshl_b32 s10, s19, 8
+; GFX11-NEXT: s_or_b32 s7, s7, s8
+; GFX11-NEXT: s_or_b32 s8, s9, s10
+; GFX11-NEXT: s_pack_ll_b32_b16 s5, s5, s6
+; GFX11-NEXT: s_pack_ll_b32_b16 s6, s7, s8
+; GFX11-NEXT: s_and_b32 s7, s20, 0xff
+; GFX11-NEXT: s_lshl_b32 s8, s21, 8
+; GFX11-NEXT: s_and_b32 s9, s22, 0xff
+; GFX11-NEXT: s_lshl_b32 s10, s23, 8
+; GFX11-NEXT: s_or_b32 s7, s7, s8
+; GFX11-NEXT: s_or_b32 s8, s9, s10
+; GFX11-NEXT: s_and_b32 s9, s24, 0xff
+; GFX11-NEXT: s_lshl_b32 s10, s25, 8
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v35
+; GFX11-NEXT: s_pack_ll_b32_b16 s7, s7, s8
+; GFX11-NEXT: s_or_b32 s8, s9, s10
+; GFX11-NEXT: s_and_b32 s10, s28, 0xff
+; GFX11-NEXT: s_lshl_b32 s11, s29, 8
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v39
+; GFX11-NEXT: s_or_b32 s10, s10, s11
+; GFX11-NEXT: s_and_b32 s9, s26, 0xff
+; GFX11-NEXT: v_and_b32_e64 v1, 0xffff, s10
+; GFX11-NEXT: s_lshl_b32 s12, s27, 8
+; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v38
+; GFX11-NEXT: s_or_b32 s9, s9, s12
+; GFX11-NEXT: v_and_b32_e32 v5, 0xff, v31
+; GFX11-NEXT: s_pack_ll_b32_b16 s8, s8, s9
+; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v37
+; GFX11-NEXT: v_lshl_or_b32 v4, v0, 16, v1
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v33
+; GFX11-NEXT: v_or_b32_e32 v3, v3, v48
+; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v36
+; GFX11-NEXT: v_or_b32_e32 v1, v2, v49
+; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v32
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v51
+; GFX11-NEXT: v_or_b32_e32 v7, v5, v50
+; GFX11-NEXT: v_or_b32_e32 v8, v6, v52
+; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX11-NEXT: v_or_b32_e32 v2, v2, v53
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT: v_and_b32_e32 v9, 0xff, v24
+; GFX11-NEXT: s_waitcnt vmcnt(1)
+; GFX11-NEXT: v_and_b32_e32 v11, 0xff, v68
+; GFX11-NEXT: v_lshl_or_b32 v5, v3, 16, v1
+; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v34
+; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v16
+; GFX11-NEXT: v_lshl_or_b32 v6, v7, 16, v0
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v18
+; GFX11-NEXT: v_lshl_or_b32 v7, v8, 16, v2
+; GFX11-NEXT: v_or_b32_e32 v1, v1, v54
+; GFX11-NEXT: v_or_b32_e32 v2, v3, v17
+; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v20
+; GFX11-NEXT: v_and_b32_e32 v8, 0xff, v22
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v55
+; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX11-NEXT: v_and_b32_e32 v12, 0xff, v67
+; GFX11-NEXT: v_or_b32_e32 v3, v3, v19
+; GFX11-NEXT: v_or_b32_e32 v10, v8, v23
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT: v_lshl_or_b32 v8, v2, 16, v1
+; GFX11-NEXT: v_or_b32_e32 v1, v9, v21
+; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v26
+; GFX11-NEXT: v_and_b32_e32 v10, 0xffff, v10
+; GFX11-NEXT: v_lshl_or_b32 v9, v3, 16, v0
+; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v30
+; GFX11-NEXT: v_and_b32_e32 v13, 0xff, v80
+; GFX11-NEXT: v_and_b32_e32 v14, 0xff, v82
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v28
+; GFX11-NEXT: v_or_b32_e32 v2, v2, v27
+; GFX11-NEXT: v_lshl_or_b32 v10, v1, 16, v10
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v64
+; GFX11-NEXT: v_or_b32_e32 v3, v3, v66
+; GFX11-NEXT: v_or_b32_e32 v11, v11, v70
+; GFX11-NEXT: v_or_b32_e32 v15, v12, v71
+; GFX11-NEXT: v_and_b32_e32 v12, 0xff, v69
+; GFX11-NEXT: v_or_b32_e32 v13, v13, v83
+; GFX11-NEXT: v_and_b32_e32 v86, 0xff, v65
+; GFX11-NEXT: v_or_b32_e32 v14, v14, v84
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v25
+; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX11-NEXT: v_or_b32_e32 v1, v1, v29
+; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX11-NEXT: v_and_b32_e32 v87, 0xffff, v11
+; GFX11-NEXT: v_or_b32_e32 v96, v12, v81
+; GFX11-NEXT: v_and_b32_e32 v97, 0xffff, v13
+; GFX11-NEXT: v_or_b32_e32 v86, v86, v85
+; GFX11-NEXT: v_and_b32_e32 v98, 0xffff, v14
+; GFX11-NEXT: v_lshl_or_b32 v11, v0, 16, v2
+; GFX11-NEXT: v_lshl_or_b32 v12, v1, 16, v3
+; GFX11-NEXT: v_lshl_or_b32 v13, v15, 16, v87
+; GFX11-NEXT: v_lshl_or_b32 v14, v96, 16, v97
+; GFX11-NEXT: v_lshl_or_b32 v15, v86, 16, v98
+; GFX11-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, s6
+; GFX11-NEXT: v_dual_mov_b32 v2, s7 :: v_dual_mov_b32 v3, s8
+; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
+; GFX11-NEXT: s_cbranch_vccnz .LBB111_3
+; GFX11-NEXT: .LBB111_2: ; %cmp.true
+; GFX11-NEXT: s_waitcnt vmcnt(1)
+; GFX11-NEXT: v_add_nc_u32_e32 v4, 3, v68
+; GFX11-NEXT: v_add_nc_u32_e32 v5, 3, v67
+; GFX11-NEXT: v_add_nc_u32_e32 v6, 3, v30
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_add_nc_u32_e32 v7, 3, v64
+; GFX11-NEXT: v_add_nc_u32_e32 v8, 3, v22
+; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v4
+; GFX11-NEXT: v_and_b32_e32 v5, 0xff, v5
+; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v6
+; GFX11-NEXT: v_add_nc_u32_e32 v9, 3, v16
+; GFX11-NEXT: s_add_i32 s28, s28, 3
+; GFX11-NEXT: v_or_b32_e32 v4, v70, v4
+; GFX11-NEXT: v_or_b32_e32 v5, v71, v5
+; GFX11-NEXT: s_and_b32 s4, s28, 0xff
+; GFX11-NEXT: s_lshl_b32 s5, s29, 8
+; GFX11-NEXT: s_add_i32 s24, s24, 3
+; GFX11-NEXT: v_add_nc_u32_e32 v11, 0x300, v4
+; GFX11-NEXT: v_add_nc_u32_e32 v13, 0x300, v5
+; GFX11-NEXT: v_or_b32_e32 v4, v66, v6
+; GFX11-NEXT: v_add_nc_u32_e32 v5, 3, v26
+; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v7
+; GFX11-NEXT: v_add_nc_u32_e32 v7, 3, v28
+; GFX11-NEXT: s_or_b32 s4, s5, s4
+; GFX11-NEXT: v_add_nc_u32_e32 v12, 0x300, v4
+; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v5
+; GFX11-NEXT: v_or_b32_e32 v5, v29, v6
+; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v7
+; GFX11-NEXT: v_and_b32_e32 v7, 0xff, v8
+; GFX11-NEXT: v_add_nc_u32_e32 v8, 3, v24
+; GFX11-NEXT: v_or_b32_e32 v4, v27, v4
+; GFX11-NEXT: v_add_nc_u32_e32 v14, 0x300, v5
+; GFX11-NEXT: v_or_b32_e32 v5, v25, v6
+; GFX11-NEXT: v_or_b32_e32 v6, v23, v7
+; GFX11-NEXT: v_and_b32_e32 v7, 0xff, v8
+; GFX11-NEXT: v_add_nc_u32_e32 v15, 0x300, v4
+; GFX11-NEXT: v_add_nc_u32_e32 v4, 3, v18
+; GFX11-NEXT: v_add_nc_u32_e32 v18, 0x300, v5
+; GFX11-NEXT: v_add_nc_u32_e32 v8, 3, v34
+; GFX11-NEXT: v_or_b32_e32 v5, v21, v7
+; GFX11-NEXT: v_add_nc_u32_e32 v7, 3, v20
+; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v4
+; GFX11-NEXT: s_and_b32 s5, s24, 0xff
+; GFX11-NEXT: s_lshl_b32 s6, s25, 8
+; GFX11-NEXT: v_add_nc_u32_e32 v10, 0x300, v5
+; GFX11-NEXT: v_and_b32_e32 v5, 0xff, v7
+; GFX11-NEXT: v_or_b32_e32 v4, v55, v4
+; GFX11-NEXT: v_and_b32_e32 v7, 0xff, v8
+; GFX11-NEXT: v_and_b32_e32 v8, 0xff, v9
+; GFX11-NEXT: v_add_nc_u32_e32 v9, 3, v32
+; GFX11-NEXT: v_or_b32_e32 v5, v19, v5
+; GFX11-NEXT: v_add_nc_u32_e32 v16, 0x300, v4
+; GFX11-NEXT: v_or_b32_e32 v4, v54, v7
+; GFX11-NEXT: v_or_b32_e32 v7, v17, v8
+; GFX11-NEXT: v_and_b32_e32 v8, 0xff, v9
+; GFX11-NEXT: s_add_i32 s26, s26, 3
+; GFX11-NEXT: v_add_nc_u32_e32 v9, 0x300, v5
+; GFX11-NEXT: v_add_nc_u32_e32 v5, 3, v36
+; GFX11-NEXT: v_add_nc_u32_e32 v17, 0x300, v4
+; GFX11-NEXT: v_add_nc_u32_e32 v19, 0x300, v7
+; GFX11-NEXT: v_or_b32_e32 v4, v53, v8
+; GFX11-NEXT: v_add_nc_u32_e32 v7, 3, v33
+; GFX11-NEXT: s_or_b32 s5, s6, s5
+; GFX11-NEXT: s_and_b32 s6, s26, 0xff
+; GFX11-NEXT: s_lshl_b32 s7, s27, 8
+; GFX11-NEXT: s_add_i32 s20, s20, 3
+; GFX11-NEXT: s_or_b32 s6, s7, s6
+; GFX11-NEXT: s_and_b32 s7, s20, 0xff
+; GFX11-NEXT: s_lshl_b32 s8, s21, 8
+; GFX11-NEXT: s_add_i32 s22, s22, 3
+; GFX11-NEXT: v_and_b32_e32 v5, 0xff, v5
+; GFX11-NEXT: v_add_nc_u32_e32 v8, 3, v31
+; GFX11-NEXT: v_add_nc_u32_e32 v20, 3, v37
+; GFX11-NEXT: v_add_nc_u32_e32 v21, 0x300, v4
+; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v7
+; GFX11-NEXT: s_or_b32 s7, s8, s7
+; GFX11-NEXT: s_and_b32 s8, s22, 0xff
+; GFX11-NEXT: s_lshl_b32 s9, s23, 8
+; GFX11-NEXT: s_add_i32 s16, s16, 3
+; GFX11-NEXT: s_or_b32 s8, s9, s8
+; GFX11-NEXT: s_and_b32 s9, s16, 0xff
+; GFX11-NEXT: s_lshl_b32 s10, s17, 8
+; GFX11-NEXT: s_add_i32 s18, s18, 3
+; GFX11-NEXT: s_add_i32 s0, s0, 3
+; GFX11-NEXT: s_add_i32 s2, s2, 3
+; GFX11-NEXT: s_or_b32 s9, s10, s9
+; GFX11-NEXT: s_and_b32 s10, s18, 0xff
+; GFX11-NEXT: s_lshl_b32 s11, s19, 8
+; GFX11-NEXT: s_and_b32 s0, s0, 0xff
+; GFX11-NEXT: s_lshl_b32 s1, s1, 8
+; GFX11-NEXT: s_and_b32 s2, s2, 0xff
+; GFX11-NEXT: s_lshl_b32 s3, s3, 8
+; GFX11-NEXT: v_or_b32_e32 v5, v52, v5
+; GFX11-NEXT: v_and_b32_e32 v7, 0xff, v8
+; GFX11-NEXT: v_and_b32_e32 v8, 0xff, v20
+; GFX11-NEXT: v_add_nc_u32_e32 v20, 3, v38
+; GFX11-NEXT: v_or_b32_e32 v4, v51, v4
+; GFX11-NEXT: s_or_b32 s10, s11, s10
+; GFX11-NEXT: s_or_b32 s0, s1, s0
+; GFX11-NEXT: s_or_b32 s1, s3, s2
+; GFX11-NEXT: s_addk_i32 s5, 0x300
+; GFX11-NEXT: s_addk_i32 s6, 0x300
+; GFX11-NEXT: s_addk_i32 s9, 0x300
+; GFX11-NEXT: s_addk_i32 s0, 0x300
+; GFX11-NEXT: s_addk_i32 s1, 0x300
+; GFX11-NEXT: s_addk_i32 s10, 0x300
+; GFX11-NEXT: v_add_nc_u32_e32 v22, 0x300, v5
+; GFX11-NEXT: v_or_b32_e32 v5, v50, v7
+; GFX11-NEXT: v_or_b32_e32 v7, v49, v8
+; GFX11-NEXT: v_and_b32_e32 v8, 0xff, v20
+; GFX11-NEXT: v_add_nc_u32_e32 v20, 0x300, v4
+; GFX11-NEXT: v_add_nc_u32_e32 v4, 3, v35
+; GFX11-NEXT: s_pack_ll_b32_b16 s0, s0, s1
+; GFX11-NEXT: s_pack_ll_b32_b16 s1, s9, s10
+; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v82
+; GFX11-NEXT: s_pack_ll_b32_b16 s3, s5, s6
+; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v80
+; GFX11-NEXT: s_addk_i32 s7, 0x300
+; GFX11-NEXT: s_addk_i32 s8, 0x300
+; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v65
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0
+; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v2
+; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v4
+; GFX11-NEXT: s_pack_ll_b32_b16 s2, s7, s8
+; GFX11-NEXT: v_add_nc_u32_e32 v3, 3, v69
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1
+; GFX11-NEXT: v_or_b32_e32 v0, v84, v0
+; GFX11-NEXT: v_or_b32_e32 v2, v83, v2
+; GFX11-NEXT: v_add_nc_u32_e32 v23, 0x300, v5
+; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v3
+; GFX11-NEXT: v_add_nc_u32_e32 v5, 0x300, v7
+; GFX11-NEXT: v_or_b32_e32 v7, v48, v8
+; GFX11-NEXT: v_or_b32_e32 v4, v39, v4
+; GFX11-NEXT: s_addk_i32 s4, 0x300
+; GFX11-NEXT: v_or_b32_e32 v1, v85, v1
+; GFX11-NEXT: v_add_nc_u32_e32 v0, 0x300, v0
+; GFX11-NEXT: v_or_b32_e32 v3, v81, v3
+; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x300, v2
+; GFX11-NEXT: v_add_nc_u32_e32 v6, 0x300, v6
+; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x300, v7
+; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x300, v4
+; GFX11-NEXT: v_and_b32_e64 v8, 0xffff, s4
+; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX11-NEXT: v_and_b32_e32 v16, 0xffff, v16
+; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x300, v1
+; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x300, v3
+; GFX11-NEXT: v_lshl_or_b32 v4, v4, 16, v8
+; GFX11-NEXT: v_lshl_or_b32 v5, v7, 16, v5
+; GFX11-NEXT: v_and_b32_e32 v7, 0xffff, v20
+; GFX11-NEXT: v_and_b32_e32 v8, 0xffff, v21
+; GFX11-NEXT: v_and_b32_e32 v17, 0xffff, v17
+; GFX11-NEXT: v_and_b32_e32 v20, 0xffff, v6
+; GFX11-NEXT: v_lshl_or_b32 v9, v9, 16, v16
+; GFX11-NEXT: v_and_b32_e32 v15, 0xffff, v15
+; GFX11-NEXT: v_and_b32_e32 v12, 0xffff, v12
+; GFX11-NEXT: v_and_b32_e32 v16, 0xffff, v11
+; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT: v_lshl_or_b32 v6, v23, 16, v7
+; GFX11-NEXT: v_lshl_or_b32 v7, v22, 16, v8
+; GFX11-NEXT: v_lshl_or_b32 v8, v19, 16, v17
+; GFX11-NEXT: v_lshl_or_b32 v10, v10, 16, v20
+; GFX11-NEXT: v_lshl_or_b32 v11, v18, 16, v15
+; GFX11-NEXT: v_lshl_or_b32 v12, v14, 16, v12
+; GFX11-NEXT: v_lshl_or_b32 v13, v13, 16, v16
+; GFX11-NEXT: v_lshl_or_b32 v14, v3, 16, v2
+; GFX11-NEXT: v_lshl_or_b32 v15, v1, 16, v0
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-NEXT: .LBB111_3: ; %end
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-NEXT: .LBB111_4:
+; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+; GFX11-NEXT: s_branch .LBB111_2
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
diff --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll
index 10e523d1a0cf1..c81d847896476 100644
--- a/llvm/test/CodeGen/AMDGPU/bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/bf16.ll
@@ -61,21 +61,13 @@ define void @test_load_store(ptr addrspace(1) %in, ptr addrspace(1) %out) {
; GFX10-NEXT: global_store_short v[2:3], v0, off
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11TRUE16-LABEL: test_load_store:
-; GFX11TRUE16: ; %bb.0:
-; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11TRUE16-NEXT: global_load_d16_b16 v0, v[0:1], off
-; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11TRUE16-NEXT: global_store_b16 v[2:3], v0, off
-; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11FAKE16-LABEL: test_load_store:
-; GFX11FAKE16: ; %bb.0:
-; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11FAKE16-NEXT: global_load_u16 v0, v[0:1], off
-; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11FAKE16-NEXT: global_store_b16 v[2:3], v0, off
-; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-LABEL: test_load_store:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: global_load_u16 v0, v[0:1], off
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: global_store_b16 v[2:3], v0, off
+; GFX11-NEXT: s_setpc_b64 s[30:31]
%val = load bfloat, ptr addrspace(1) %in
store bfloat %val, ptr addrspace(1) %out
ret void
@@ -3652,21 +3644,13 @@ define void @test_bitcast_from_bfloat(ptr addrspace(1) %in, ptr addrspace(1) %ou
; GFX10-NEXT: global_store_short v[2:3], v0, off
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11TRUE16-LABEL: test_bitcast_from_bfloat:
-; GFX11TRUE16: ; %bb.0:
-; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11TRUE16-NEXT: global_load_d16_b16 v0, v[0:1], off
-; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11TRUE16-NEXT: global_store_b16 v[2:3], v0, off
-; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11FAKE16-LABEL: test_bitcast_from_bfloat:
-; GFX11FAKE16: ; %bb.0:
-; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11FAKE16-NEXT: global_load_u16 v0, v[0:1], off
-; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11FAKE16-NEXT: global_store_b16 v[2:3], v0, off
-; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-LABEL: test_bitcast_from_bfloat:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: global_load_u16 v0, v[0:1], off
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: global_store_b16 v[2:3], v0, off
+; GFX11-NEXT: s_setpc_b64 s[30:31]
%val = load bfloat, ptr addrspace(1) %in
%val_int = bitcast bfloat %val to i16
store i16 %val_int, ptr addrspace(1) %out
@@ -3726,21 +3710,13 @@ define void @test_bitcast_to_bfloat(ptr addrspace(1) %out, ptr addrspace(1) %in)
; GFX10-NEXT: global_store_short v[0:1], v2, off
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11TRUE16-LABEL: test_bitcast_to_bfloat:
-; GFX11TRUE16: ; %bb.0:
-; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11TRUE16-NEXT: global_load_d16_b16 v2, v[2:3], off
-; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11TRUE16-NEXT: global_store_b16 v[0:1], v2, off
-; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11FAKE16-LABEL: test_bitcast_to_bfloat:
-; GFX11FAKE16: ; %bb.0:
-; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11FAKE16-NEXT: global_load_u16 v2, v[2:3], off
-; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11FAKE16-NEXT: global_store_b16 v[0:1], v2, off
-; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-LABEL: test_bitcast_to_bfloat:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: global_load_u16 v2, v[2:3], off
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: global_store_b16 v[0:1], v2, off
+; GFX11-NEXT: s_setpc_b64 s[30:31]
%val = load i16, ptr addrspace(1) %in
%val_fp = bitcast i16 %val to bfloat
store bfloat %val_fp, ptr addrspace(1) %out
@@ -5676,23 +5652,14 @@ define bfloat @test_alloca_load_store_ret(bfloat %in) {
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11TRUE16-LABEL: test_alloca_load_store_ret:
-; GFX11TRUE16: ; %bb.0: ; %entry
-; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11TRUE16-NEXT: scratch_store_b16 off, v0, s32 dlc
-; GFX11TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11TRUE16-NEXT: scratch_load_d16_b16 v0, off, s32 glc dlc
-; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11FAKE16-LABEL: test_alloca_load_store_ret:
-; GFX11FAKE16: ; %bb.0: ; %entry
-; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11FAKE16-NEXT: scratch_store_b16 off, v0, s32 dlc
-; GFX11FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11FAKE16-NEXT: scratch_load_u16 v0, off, s32 glc dlc
-; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-LABEL: test_alloca_load_store_ret:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: scratch_store_b16 off, v0, s32 dlc
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: scratch_load_u16 v0, off, s32 glc dlc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_setpc_b64 s[30:31]
entry:
%in.addr = alloca bfloat, align 2, addrspace(5)
store volatile bfloat %in, ptr addrspace(5) %in.addr, align 2
@@ -45726,34 +45693,34 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x
; GFX11TRUE16: ; %bb.0:
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11TRUE16-NEXT: s_clause 0x1f
-; GFX11TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32
+; GFX11TRUE16-NEXT: scratch_load_u16 v31, off, s32
; GFX11TRUE16-NEXT: scratch_load_b32 v32, off, s32 offset:68
; GFX11TRUE16-NEXT: scratch_load_b32 v33, off, s32 offset:72
-; GFX11TRUE16-NEXT: scratch_load_b32 v34, off, s32 offset:124
-; GFX11TRUE16-NEXT: scratch_load_b32 v35, off, s32 offset:128
-; GFX11TRUE16-NEXT: scratch_load_b32 v36, off, s32 offset:64
-; GFX11TRUE16-NEXT: scratch_load_b32 v37, off, s32 offset:60
-; GFX11TRUE16-NEXT: scratch_load_b32 v38, off, s32 offset:120
-; GFX11TRUE16-NEXT: scratch_load_b32 v39, off, s32 offset:56
-; GFX11TRUE16-NEXT: scratch_load_b32 v48, off, s32 offset:116
-; GFX11TRUE16-NEXT: scratch_load_b32 v49, off, s32 offset:52
-; GFX11TRUE16-NEXT: scratch_load_b32 v50, off, s32 offset:112
-; GFX11TRUE16-NEXT: scratch_load_b32 v51, off, s32 offset:48
-; GFX11TRUE16-NEXT: scratch_load_b32 v52, off, s32 offset:108
-; GFX11TRUE16-NEXT: scratch_load_b32 v53, off, s32 offset:44
-; GFX11TRUE16-NEXT: scratch_load_b32 v54, off, s32 offset:104
-; GFX11TRUE16-NEXT: scratch_load_b32 v55, off, s32 offset:40
-; GFX11TRUE16-NEXT: scratch_load_b32 v64, off, s32 offset:100
-; GFX11TRUE16-NEXT: scratch_load_b32 v65, off, s32 offset:36
-; GFX11TRUE16-NEXT: scratch_load_b32 v66, off, s32 offset:96
-; GFX11TRUE16-NEXT: scratch_load_b32 v67, off, s32 offset:32
-; GFX11TRUE16-NEXT: scratch_load_b32 v68, off, s32 offset:92
-; GFX11TRUE16-NEXT: scratch_load_b32 v69, off, s32 offset:28
-; GFX11TRUE16-NEXT: scratch_load_b32 v70, off, s32 offset:88
-; GFX11TRUE16-NEXT: scratch_load_b32 v71, off, s32 offset:24
-; GFX11TRUE16-NEXT: scratch_load_b32 v80, off, s32 offset:84
-; GFX11TRUE16-NEXT: scratch_load_b32 v81, off, s32 offset:20
-; GFX11TRUE16-NEXT: scratch_load_b32 v82, off, s32 offset:76
+; GFX11TRUE16-NEXT: scratch_load_b32 v34, off, s32 offset:76
+; GFX11TRUE16-NEXT: scratch_load_b32 v35, off, s32 offset:124
+; GFX11TRUE16-NEXT: scratch_load_b32 v36, off, s32 offset:128
+; GFX11TRUE16-NEXT: scratch_load_b32 v37, off, s32 offset:64
+; GFX11TRUE16-NEXT: scratch_load_b32 v38, off, s32 offset:60
+; GFX11TRUE16-NEXT: scratch_load_b32 v39, off, s32 offset:120
+; GFX11TRUE16-NEXT: scratch_load_b32 v48, off, s32 offset:56
+; GFX11TRUE16-NEXT: scratch_load_b32 v49, off, s32 offset:116
+; GFX11TRUE16-NEXT: scratch_load_b32 v50, off, s32 offset:52
+; GFX11TRUE16-NEXT: scratch_load_b32 v51, off, s32 offset:112
+; GFX11TRUE16-NEXT: scratch_load_b32 v52, off, s32 offset:48
+; GFX11TRUE16-NEXT: scratch_load_b32 v53, off, s32 offset:108
+; GFX11TRUE16-NEXT: scratch_load_b32 v54, off, s32 offset:44
+; GFX11TRUE16-NEXT: scratch_load_b32 v55, off, s32 offset:104
+; GFX11TRUE16-NEXT: scratch_load_b32 v64, off, s32 offset:40
+; GFX11TRUE16-NEXT: scratch_load_b32 v65, off, s32 offset:100
+; GFX11TRUE16-NEXT: scratch_load_b32 v66, off, s32 offset:36
+; GFX11TRUE16-NEXT: scratch_load_b32 v67, off, s32 offset:96
+; GFX11TRUE16-NEXT: scratch_load_b32 v68, off, s32 offset:32
+; GFX11TRUE16-NEXT: scratch_load_b32 v69, off, s32 offset:92
+; GFX11TRUE16-NEXT: scratch_load_b32 v70, off, s32 offset:28
+; GFX11TRUE16-NEXT: scratch_load_b32 v71, off, s32 offset:88
+; GFX11TRUE16-NEXT: scratch_load_b32 v80, off, s32 offset:24
+; GFX11TRUE16-NEXT: scratch_load_b32 v81, off, s32 offset:84
+; GFX11TRUE16-NEXT: scratch_load_b32 v82, off, s32 offset:20
; GFX11TRUE16-NEXT: scratch_load_b32 v83, off, s32 offset:80
; GFX11TRUE16-NEXT: scratch_load_b32 v84, off, s32 offset:16
; GFX11TRUE16-NEXT: scratch_load_b32 v85, off, s32 offset:12
@@ -45823,45 +45790,45 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x
; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s29, 1, v26
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(32)
; GFX11TRUE16-NEXT: v_and_b32_e32 v16, 1, v31
-; GFX11TRUE16-NEXT: s_waitcnt vmcnt(27)
-; GFX11TRUE16-NEXT: v_cndmask_b16 v15.l, v35.l, v36.l, s26
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(26)
-; GFX11TRUE16-NEXT: v_cndmask_b16 v14.l, v34.l, v37.l, s27
-; GFX11TRUE16-NEXT: v_cndmask_b16 v14.h, v34.h, v37.h, s28
-; GFX11TRUE16-NEXT: s_waitcnt vmcnt(24)
-; GFX11TRUE16-NEXT: v_cndmask_b16 v13.l, v38.l, v39.l, s29
-; GFX11TRUE16-NEXT: v_cndmask_b16 v13.h, v38.h, v39.h, s25
-; GFX11TRUE16-NEXT: s_waitcnt vmcnt(22)
-; GFX11TRUE16-NEXT: v_cndmask_b16 v12.l, v48.l, v49.l, s24
-; GFX11TRUE16-NEXT: v_cndmask_b16 v12.h, v48.h, v49.h, s23
-; GFX11TRUE16-NEXT: s_waitcnt vmcnt(20)
-; GFX11TRUE16-NEXT: v_cndmask_b16 v11.l, v50.l, v51.l, s22
-; GFX11TRUE16-NEXT: v_cndmask_b16 v11.h, v50.h, v51.h, s21
-; GFX11TRUE16-NEXT: s_waitcnt vmcnt(18)
-; GFX11TRUE16-NEXT: v_cndmask_b16 v10.l, v52.l, v53.l, s20
-; GFX11TRUE16-NEXT: v_cndmask_b16 v10.h, v52.h, v53.h, s19
-; GFX11TRUE16-NEXT: s_waitcnt vmcnt(16)
-; GFX11TRUE16-NEXT: v_cndmask_b16 v9.l, v54.l, v55.l, s18
-; GFX11TRUE16-NEXT: v_cndmask_b16 v9.h, v54.h, v55.h, s17
-; GFX11TRUE16-NEXT: s_waitcnt vmcnt(14)
-; GFX11TRUE16-NEXT: v_cndmask_b16 v8.l, v64.l, v65.l, s16
-; GFX11TRUE16-NEXT: v_cndmask_b16 v8.h, v64.h, v65.h, s15
-; GFX11TRUE16-NEXT: s_waitcnt vmcnt(12)
-; GFX11TRUE16-NEXT: v_cndmask_b16 v7.l, v66.l, v67.l, s14
-; GFX11TRUE16-NEXT: v_cndmask_b16 v7.h, v66.h, v67.h, s13
-; GFX11TRUE16-NEXT: s_waitcnt vmcnt(10)
-; GFX11TRUE16-NEXT: v_cndmask_b16 v6.l, v68.l, v69.l, s12
-; GFX11TRUE16-NEXT: v_cndmask_b16 v6.h, v68.h, v69.h, s11
-; GFX11TRUE16-NEXT: s_waitcnt vmcnt(8)
-; GFX11TRUE16-NEXT: v_cndmask_b16 v5.l, v70.l, v71.l, s10
-; GFX11TRUE16-NEXT: v_cndmask_b16 v5.h, v70.h, v71.h, s9
-; GFX11TRUE16-NEXT: s_waitcnt vmcnt(6)
-; GFX11TRUE16-NEXT: v_cndmask_b16 v4.l, v80.l, v81.l, s8
-; GFX11TRUE16-NEXT: v_cndmask_b16 v4.h, v80.h, v81.h, s7
+; GFX11TRUE16-NEXT: v_cndmask_b16 v15.l, v36.l, v37.l, s26
+; GFX11TRUE16-NEXT: s_waitcnt vmcnt(25)
+; GFX11TRUE16-NEXT: v_cndmask_b16 v14.l, v35.l, v38.l, s27
+; GFX11TRUE16-NEXT: v_cndmask_b16 v14.h, v35.h, v38.h, s28
+; GFX11TRUE16-NEXT: s_waitcnt vmcnt(23)
+; GFX11TRUE16-NEXT: v_cndmask_b16 v13.l, v39.l, v48.l, s29
+; GFX11TRUE16-NEXT: v_cndmask_b16 v13.h, v39.h, v48.h, s25
+; GFX11TRUE16-NEXT: s_waitcnt vmcnt(21)
+; GFX11TRUE16-NEXT: v_cndmask_b16 v12.l, v49.l, v50.l, s24
+; GFX11TRUE16-NEXT: v_cndmask_b16 v12.h, v49.h, v50.h, s23
+; GFX11TRUE16-NEXT: s_waitcnt vmcnt(19)
+; GFX11TRUE16-NEXT: v_cndmask_b16 v11.l, v51.l, v52.l, s22
+; GFX11TRUE16-NEXT: v_cndmask_b16 v11.h, v51.h, v52.h, s21
+; GFX11TRUE16-NEXT: s_waitcnt vmcnt(17)
+; GFX11TRUE16-NEXT: v_cndmask_b16 v10.l, v53.l, v54.l, s20
+; GFX11TRUE16-NEXT: v_cndmask_b16 v10.h, v53.h, v54.h, s19
+; GFX11TRUE16-NEXT: s_waitcnt vmcnt(15)
+; GFX11TRUE16-NEXT: v_cndmask_b16 v9.l, v55.l, v64.l, s18
+; GFX11TRUE16-NEXT: v_cndmask_b16 v9.h, v55.h, v64.h, s17
+; GFX11TRUE16-NEXT: s_waitcnt vmcnt(13)
+; GFX11TRUE16-NEXT: v_cndmask_b16 v8.l, v65.l, v66.l, s16
+; GFX11TRUE16-NEXT: v_cndmask_b16 v8.h, v65.h, v66.h, s15
+; GFX11TRUE16-NEXT: s_waitcnt vmcnt(11)
+; GFX11TRUE16-NEXT: v_cndmask_b16 v7.l, v67.l, v68.l, s14
+; GFX11TRUE16-NEXT: v_cndmask_b16 v7.h, v67.h, v68.h, s13
+; GFX11TRUE16-NEXT: s_waitcnt vmcnt(9)
+; GFX11TRUE16-NEXT: v_cndmask_b16 v6.l, v69.l, v70.l, s12
+; GFX11TRUE16-NEXT: v_cndmask_b16 v6.h, v69.h, v70.h, s11
+; GFX11TRUE16-NEXT: s_waitcnt vmcnt(7)
+; GFX11TRUE16-NEXT: v_cndmask_b16 v5.l, v71.l, v80.l, s10
+; GFX11TRUE16-NEXT: v_cndmask_b16 v5.h, v71.h, v80.h, s9
+; GFX11TRUE16-NEXT: s_waitcnt vmcnt(5)
+; GFX11TRUE16-NEXT: v_cndmask_b16 v4.l, v81.l, v82.l, s8
+; GFX11TRUE16-NEXT: v_cndmask_b16 v4.h, v81.h, v82.h, s7
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(3)
; GFX11TRUE16-NEXT: v_cndmask_b16 v3.l, v83.l, v84.l, s6
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(2)
-; GFX11TRUE16-NEXT: v_cndmask_b16 v2.l, v82.l, v85.l, s4
+; GFX11TRUE16-NEXT: v_cndmask_b16 v2.l, v34.l, v85.l, s4
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(1)
; GFX11TRUE16-NEXT: v_cndmask_b16 v1.l, v33.l, v86.l, s2
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0)
@@ -45869,9 +45836,9 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x
; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s0, 1, v16
; GFX11TRUE16-NEXT: v_cndmask_b16 v0.h, v32.h, v87.h, vcc_lo
; GFX11TRUE16-NEXT: v_cndmask_b16 v1.h, v33.h, v86.h, s1
-; GFX11TRUE16-NEXT: v_cndmask_b16 v2.h, v82.h, v85.h, s3
+; GFX11TRUE16-NEXT: v_cndmask_b16 v2.h, v34.h, v85.h, s3
; GFX11TRUE16-NEXT: v_cndmask_b16 v3.h, v83.h, v84.h, s5
-; GFX11TRUE16-NEXT: v_cndmask_b16 v15.h, v35.h, v36.h, s0
+; GFX11TRUE16-NEXT: v_cndmask_b16 v15.h, v36.h, v37.h, s0
; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11FAKE16-LABEL: v_vselect_v32bf16:
diff --git a/llvm/test/CodeGen/AMDGPU/branch-relaxation-inst-size-gfx11.ll b/llvm/test/CodeGen/AMDGPU/branch-relaxation-inst-size-gfx11.ll
index dd389375b0d77..2b6d9cc349278 100644
--- a/llvm/test/CodeGen/AMDGPU/branch-relaxation-inst-size-gfx11.ll
+++ b/llvm/test/CodeGen/AMDGPU/branch-relaxation-inst-size-gfx11.ll
@@ -18,15 +18,15 @@ define amdgpu_kernel void @long_forward_branch_gfx11plus(ptr addrspace(1) %in, p
; GFX11-NEXT: s_setpc_b64 s[6:7]
; GFX11-NEXT: .LBB0_1: ; %bb2
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-NEXT: v_mov_b32_e32 v1, 0
+; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: global_load_d16_b16 v0, v1, s[0:1]
-; GFX11-NEXT: global_load_d16_hi_b16 v0, v1, s[0:1] offset:2
+; GFX11-NEXT: global_load_u16 v0, v2, s[0:1]
+; GFX11-NEXT: global_load_u16 v1, v2, s[0:1] offset:2
; GFX11-NEXT: s_waitcnt vmcnt(1)
-; GFX11-NEXT: global_store_b16 v1, v0, s[2:3]
+; GFX11-NEXT: global_store_b16 v2, v0, s[2:3]
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_store_d16_hi_b16 v1, v0, s[2:3] offset:2
+; GFX11-NEXT: global_store_b16 v2, v1, s[2:3] offset:2
; GFX11-NEXT: .LBB0_2: ; %bb3
; GFX11-NEXT: s_endpgm
bb0:
diff --git a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll
index b8dd377377dab..0eab82778c8db 100644
--- a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll
@@ -5091,7 +5091,7 @@ define amdgpu_kernel void @test_call_external_void_func_sret_struct_i8_i32_byval
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, 8
; GFX11-TRUE16-NEXT: s_swappc_b64 s[30:31], s[2:3]
; GFX11-TRUE16-NEXT: s_clause 0x1
-; GFX11-TRUE16-NEXT: scratch_load_d16_u8 v0, off, off offset:8
+; GFX11-TRUE16-NEXT: scratch_load_u8 v0, off, off offset:8
; GFX11-TRUE16-NEXT: scratch_load_b32 v1, off, off offset:12
; GFX11-TRUE16-NEXT: s_mov_b32 s3, 0x31016000
; GFX11-TRUE16-NEXT: s_mov_b32 s2, -1
diff --git a/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll b/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll
index d4581672dab39..d07bce4ad45d5 100644
--- a/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll
+++ b/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll
@@ -48,25 +48,15 @@ define <2 x half> @chain_hi_to_lo_private() {
; FLATSCR_GFX10-NEXT: s_waitcnt vmcnt(0)
; FLATSCR_GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-TRUE16-LABEL: chain_hi_to_lo_private:
-; GFX11-TRUE16: ; %bb.0: ; %bb
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: s_mov_b32 s0, 2
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v0, off, s0
-; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v0, off, s0
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-FAKE16-LABEL: chain_hi_to_lo_private:
-; GFX11-FAKE16: ; %bb.0: ; %bb
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-FAKE16-NEXT: s_mov_b32 s0, 2
-; GFX11-FAKE16-NEXT: scratch_load_u16 v0, off, s0
-; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
-; GFX11-FAKE16-NEXT: scratch_load_d16_hi_b16 v0, off, s0
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-LABEL: chain_hi_to_lo_private:
+; GFX11: ; %bb.0: ; %bb
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_mov_b32 s0, 2
+; GFX11-NEXT: scratch_load_u16 v0, off, s0
+; GFX11-NEXT: s_mov_b32 s0, 0
+; GFX11-NEXT: scratch_load_d16_hi_b16 v0, off, s0
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_setpc_b64 s[30:31]
bb:
%gep_lo = getelementptr inbounds half, ptr addrspace(5) null, i64 1
%load_lo = load half, ptr addrspace(5) %gep_lo
@@ -114,21 +104,13 @@ define <2 x half> @chain_hi_to_lo_private_different_bases(ptr addrspace(5) %base
; FLATSCR_GFX10-NEXT: s_waitcnt vmcnt(0)
; FLATSCR_GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-TRUE16-LABEL: chain_hi_to_lo_private_different_bases:
-; GFX11-TRUE16: ; %bb.0: ; %bb
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v0, v0, off
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v0, v1, off
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-FAKE16-LABEL: chain_hi_to_lo_private_different_bases:
-; GFX11-FAKE16: ; %bb.0: ; %bb
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-FAKE16-NEXT: scratch_load_u16 v0, v0, off
-; GFX11-FAKE16-NEXT: scratch_load_d16_hi_b16 v0, v1, off
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-LABEL: chain_hi_to_lo_private_different_bases:
+; GFX11: ; %bb.0: ; %bb
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: scratch_load_u16 v0, v0, off
+; GFX11-NEXT: scratch_load_d16_hi_b16 v0, v1, off
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_setpc_b64 s[30:31]
bb:
%load_lo = load half, ptr addrspace(5) %base_lo
%load_hi = load half, ptr addrspace(5) %base_hi
@@ -325,29 +307,17 @@ define <2 x half> @chain_hi_to_lo_global() {
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-TRUE16-LABEL: chain_hi_to_lo_global:
-; GFX11-TRUE16: ; %bb.0: ; %bb
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, 2
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0
-; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v[0:1], off
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, 0
-; GFX11-TRUE16-NEXT: global_load_d16_hi_b16 v0, v[1:2], off
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-FAKE16-LABEL: chain_hi_to_lo_global:
-; GFX11-FAKE16: ; %bb.0: ; %bb
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, 2
-; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, 0
-; GFX11-FAKE16-NEXT: global_load_u16 v0, v[0:1], off
-; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, 0
-; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, 0
-; GFX11-FAKE16-NEXT: global_load_d16_hi_b16 v0, v[1:2], off
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-LABEL: chain_hi_to_lo_global:
+; GFX11: ; %bb.0: ; %bb
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_mov_b32_e32 v0, 2
+; GFX11-NEXT: v_mov_b32_e32 v1, 0
+; GFX11-NEXT: global_load_u16 v0, v[0:1], off
+; GFX11-NEXT: v_mov_b32_e32 v1, 0
+; GFX11-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-NEXT: global_load_d16_hi_b16 v0, v[1:2], off
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_setpc_b64 s[30:31]
bb:
%gep_lo = getelementptr inbounds half, ptr addrspace(1) null, i64 1
%load_lo = load half, ptr addrspace(1) %gep_lo
@@ -377,21 +347,13 @@ define <2 x half> @chain_hi_to_lo_global_different_bases(ptr addrspace(1) %base_
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-TRUE16-LABEL: chain_hi_to_lo_global_different_bases:
-; GFX11-TRUE16: ; %bb.0: ; %bb
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v[0:1], off
-; GFX11-TRUE16-NEXT: global_load_d16_hi_b16 v0, v[2:3], off
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-FAKE16-LABEL: chain_hi_to_lo_global_different_bases:
-; GFX11-FAKE16: ; %bb.0: ; %bb
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-FAKE16-NEXT: global_load_u16 v0, v[0:1], off
-; GFX11-FAKE16-NEXT: global_load_d16_hi_b16 v0, v[2:3], off
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-LABEL: chain_hi_to_lo_global_different_bases:
+; GFX11: ; %bb.0: ; %bb
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: global_load_u16 v0, v[0:1], off
+; GFX11-NEXT: global_load_d16_hi_b16 v0, v[2:3], off
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_setpc_b64 s[30:31]
bb:
%load_lo = load half, ptr addrspace(1) %base_lo
%load_hi = load half, ptr addrspace(1) %base_hi
@@ -459,29 +421,17 @@ define <2 x half> @chain_hi_to_lo_flat(ptr inreg %ptr) {
; FLATSCR_GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; FLATSCR_GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-TRUE16-LABEL: chain_hi_to_lo_flat:
-; GFX11-TRUE16: ; %bb.0: ; %bb
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-TRUE16-NEXT: flat_load_d16_b16 v0, v[0:1] offset:2
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, 0
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: flat_load_d16_hi_b16 v0, v[1:2]
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-FAKE16-LABEL: chain_hi_to_lo_flat:
-; GFX11-FAKE16: ; %bb.0: ; %bb
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-FAKE16-NEXT: flat_load_u16 v0, v[0:1] offset:2
-; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, 0
-; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, 0
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-FAKE16-NEXT: flat_load_d16_hi_b16 v0, v[1:2]
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-LABEL: chain_hi_to_lo_flat:
+; GFX11: ; %bb.0: ; %bb
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: flat_load_u16 v0, v[0:1] offset:2
+; GFX11-NEXT: v_mov_b32_e32 v1, 0
+; GFX11-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: flat_load_d16_hi_b16 v0, v[1:2]
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_setpc_b64 s[30:31]
bb:
%gep_lo = getelementptr inbounds half, ptr %ptr, i64 1
%load_lo = load half, ptr %gep_lo
@@ -512,23 +462,14 @@ define <2 x half> @chain_hi_to_lo_flat_different_bases(ptr %base_lo, ptr %base_h
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-TRUE16-LABEL: chain_hi_to_lo_flat_different_bases:
-; GFX11-TRUE16: ; %bb.0: ; %bb
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: flat_load_d16_b16 v0, v[0:1]
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: flat_load_d16_hi_b16 v0, v[2:3]
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-FAKE16-LABEL: chain_hi_to_lo_flat_different_bases:
-; GFX11-FAKE16: ; %bb.0: ; %bb
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-FAKE16-NEXT: flat_load_u16 v0, v[0:1]
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-FAKE16-NEXT: flat_load_d16_hi_b16 v0, v[2:3]
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-LABEL: chain_hi_to_lo_flat_different_bases:
+; GFX11: ; %bb.0: ; %bb
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: flat_load_u16 v0, v[0:1]
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: flat_load_d16_hi_b16 v0, v[2:3]
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_setpc_b64 s[30:31]
bb:
%load_lo = load half, ptr %base_lo
%load_hi = load half, ptr %base_hi
@@ -677,23 +618,25 @@ define amdgpu_kernel void @vload2_private(ptr addrspace(1) nocapture readonly %i
; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, 0
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v2, s[0:1]
+; GFX11-TRUE16-NEXT: global_load_u16 v0, v2, s[0:1]
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: scratch_store_b16 off, v0, off dlc
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v2, s[0:1] offset:2
+; GFX11-TRUE16-NEXT: global_load_u16 v0, v2, s[0:1] offset:2
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: scratch_store_b16 off, v0, off offset:2 dlc
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v2, s[0:1] offset:4
+; GFX11-TRUE16-NEXT: global_load_u16 v0, v2, s[0:1] offset:4
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: scratch_store_b16 off, v0, off offset:4 dlc
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v0, off, off offset:2
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.h
; GFX11-TRUE16-NEXT: s_clause 0x1
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v0, off, off
+; GFX11-TRUE16-NEXT: scratch_load_u16 v3, off, off offset:2
+; GFX11-TRUE16-NEXT: scratch_load_u16 v0, off, off
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, v3
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v3.l
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v1, off, off offset:4
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: global_store_b64 v2, v[0:1], s[2:3]
@@ -962,13 +905,14 @@ define <2 x i16> @chain_hi_to_lo_global_other_dep(ptr addrspace(1) %ptr) {
; GFX11-TRUE16-LABEL: chain_hi_to_lo_global_other_dep:
; GFX11-TRUE16: ; %bb.0: ; %bb
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: global_load_d16_b16 v2, v[0:1], off offset:2 glc dlc
+; GFX11-TRUE16-NEXT: global_load_u16 v2, v[0:1], off offset:2 glc dlc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: global_load_d16_hi_b16 v0, v[0:1], off glc dlc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.l
; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, v0, 12 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v2, v0
+; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v1, v0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: chain_hi_to_lo_global_other_dep:
@@ -1035,14 +979,15 @@ define <2 x i16> @chain_hi_to_lo_flat_other_dep(ptr addrspace(0) %ptr) {
; GFX11-TRUE16-LABEL: chain_hi_to_lo_flat_other_dep:
; GFX11-TRUE16: ; %bb.0: ; %bb
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: flat_load_d16_b16 v2, v[0:1] offset:2 glc dlc
+; GFX11-TRUE16-NEXT: flat_load_u16 v2, v[0:1] offset:2 glc dlc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: flat_load_d16_hi_b16 v0, v[0:1] glc dlc
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.l
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, v0, 12 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v2, v0
+; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v1, v0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: chain_hi_to_lo_flat_other_dep:
diff --git a/llvm/test/CodeGen/AMDGPU/clamp-modifier.ll b/llvm/test/CodeGen/AMDGPU/clamp-modifier.ll
index ccdc0b1bf43c4..5d74fe3d3c470 100644
--- a/llvm/test/CodeGen/AMDGPU/clamp-modifier.ll
+++ b/llvm/test/CodeGen/AMDGPU/clamp-modifier.ll
@@ -490,7 +490,7 @@ define amdgpu_kernel void @v_clamp_add_src_f16_denorm(ptr addrspace(1) %out, ptr
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 1, v0
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[2:3]
+; GFX11-TRUE16-NEXT: global_load_u16 v0, v1, s[2:3]
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_add_f16_e64 v0.l, v0.l, 1.0 clamp
; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
@@ -573,7 +573,7 @@ define amdgpu_kernel void @v_clamp_add_src_f16_no_denormals(ptr addrspace(1) %ou
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 1, v0
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[2:3]
+; GFX11-TRUE16-NEXT: global_load_u16 v0, v1, s[2:3]
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_add_f16_e64 v0.l, v0.l, 1.0 clamp
; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
@@ -1555,18 +1555,18 @@ define amdgpu_kernel void @v_no_clamp_add_src_v2f16_f16_src(ptr addrspace(1) %ou
; GFX11-TRUE16-LABEL: v_no_clamp_add_src_v2f16_f16_src:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0x3ff, v0
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 1, v1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 2, v1
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 1, v0
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v0, s[2:3]
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0
+; GFX11-TRUE16-NEXT: global_load_u16 v1, v1, s[2:3]
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_add_f16_e32 v0.l, 1.0, v0.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0
+; GFX11-TRUE16-NEXT: v_add_f16_e32 v1.l, 1.0, v1.l
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_pk_max_f16 v0, v0, v0 clamp
-; GFX11-TRUE16-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-TRUE16-NEXT: v_pk_max_f16 v1, v1, v1 clamp
+; GFX11-TRUE16-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-TRUE16-NEXT: s_endpgm
;
; GFX11-FAKE16-LABEL: v_no_clamp_add_src_v2f16_f16_src:
@@ -1969,7 +1969,7 @@ define amdgpu_kernel void @v_clamp_add_src_f16_denorm_minimumnum_maximumnum(ptr
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 1, v0
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[2:3]
+; GFX11-TRUE16-NEXT: global_load_u16 v0, v1, s[2:3]
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_add_f16_e64 v0.l, v0.l, 1.0 clamp
; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
@@ -2052,7 +2052,7 @@ define amdgpu_kernel void @v_clamp_add_src_f16_no_denormals_minimumnum_maximumnu
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 1, v0
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[2:3]
+; GFX11-TRUE16-NEXT: global_load_u16 v0, v1, s[2:3]
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_add_f16_e64 v0.l, v0.l, 1.0 clamp
; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
diff --git a/llvm/test/CodeGen/AMDGPU/clamp.ll b/llvm/test/CodeGen/AMDGPU/clamp.ll
index 5eb6b2f58474d..a8ab47e44afd5 100644
--- a/llvm/test/CodeGen/AMDGPU/clamp.ll
+++ b/llvm/test/CodeGen/AMDGPU/clamp.ll
@@ -594,7 +594,7 @@ define amdgpu_kernel void @v_clamp_f16(ptr addrspace(1) %out, ptr addrspace(1) %
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 1, v0
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[2:3]
+; GFX11-TRUE16-NEXT: global_load_u16 v0, v1, s[2:3]
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_max_f16_e64 v0.l, v0.l, v0.l clamp
; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
@@ -620,7 +620,7 @@ define amdgpu_kernel void @v_clamp_f16(ptr addrspace(1) %out, ptr addrspace(1) %
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 1, v0
; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[2:3]
+; GFX12-TRUE16-NEXT: global_load_u16 v0, v1, s[2:3]
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
; GFX12-TRUE16-NEXT: v_max_num_f16_e64 v0.l, v0.l, v0.l clamp
; GFX12-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
@@ -702,7 +702,7 @@ define amdgpu_kernel void @v_clamp_neg_f16(ptr addrspace(1) %out, ptr addrspace(
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 1, v0
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[2:3]
+; GFX11-TRUE16-NEXT: global_load_u16 v0, v1, s[2:3]
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_max_f16_e64 v0.l, -v0.l, -v0.l clamp
; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
@@ -728,7 +728,7 @@ define amdgpu_kernel void @v_clamp_neg_f16(ptr addrspace(1) %out, ptr addrspace(
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 1, v0
; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[2:3]
+; GFX12-TRUE16-NEXT: global_load_u16 v0, v1, s[2:3]
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
; GFX12-TRUE16-NEXT: v_max_num_f16_e64 v0.l, -v0.l, -v0.l clamp
; GFX12-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
@@ -811,7 +811,7 @@ define amdgpu_kernel void @v_clamp_negabs_f16(ptr addrspace(1) %out, ptr addrspa
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 1, v0
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[2:3]
+; GFX11-TRUE16-NEXT: global_load_u16 v0, v1, s[2:3]
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_max_f16_e64 v0.l, -|v0.l|, -|v0.l| clamp
; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
@@ -837,7 +837,7 @@ define amdgpu_kernel void @v_clamp_negabs_f16(ptr addrspace(1) %out, ptr addrspa
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 1, v0
; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[2:3]
+; GFX12-TRUE16-NEXT: global_load_u16 v0, v1, s[2:3]
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
; GFX12-TRUE16-NEXT: v_max_num_f16_e64 v0.l, -|v0.l|, -|v0.l| clamp
; GFX12-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
diff --git a/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll b/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll
index 5fb50d0d89530..e46f94e642a50 100644
--- a/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll
@@ -3,6 +3,7 @@
; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
; RUN: llc -mtriple=amdgcn -mcpu=kaveri < %s | FileCheck -enable-var-scope -check-prefixes=CI %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11-TRUE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16,+real-true-d16 < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11-D16-TRUE16 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11-FAKE16 %s
declare half @llvm.fabs.f16(half) #0
@@ -57,6 +58,15 @@ define amdgpu_kernel void @test_fold_canonicalize_undef_value_f16(ptr addrspace(
; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
; GFX11-TRUE16-NEXT: s_endpgm
;
+; GFX11-D16-TRUE16-LABEL: test_fold_canonicalize_undef_value_f16:
+; GFX11-D16-TRUE16: ; %bb.0:
+; GFX11-D16-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-D16-TRUE16-NEXT: v_mov_b32_e32 v1, 0
+; GFX11-D16-TRUE16-NEXT: v_mov_b16_e32 v0.l, 0
+; GFX11-D16-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-D16-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
+; GFX11-D16-TRUE16-NEXT: s_endpgm
+;
; GFX11-FAKE16-LABEL: test_fold_canonicalize_undef_value_f16:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
@@ -111,12 +121,23 @@ define amdgpu_kernel void @v_test_canonicalize_var_f16(ptr addrspace(1) %out) #1
; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, 0
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v0, s[0:1]
+; GFX11-TRUE16-NEXT: global_load_u16 v0, v0, s[0:1]
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l
; GFX11-TRUE16-NEXT: global_store_b16 v[0:1], v0, off
; GFX11-TRUE16-NEXT: s_endpgm
;
+; GFX11-D16-TRUE16-LABEL: v_test_canonicalize_var_f16:
+; GFX11-D16-TRUE16: ; %bb.0:
+; GFX11-D16-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-D16-TRUE16-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-D16-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-D16-TRUE16-NEXT: global_load_d16_b16 v0, v0, s[0:1]
+; GFX11-D16-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-D16-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l
+; GFX11-D16-TRUE16-NEXT: global_store_b16 v[0:1], v0, off
+; GFX11-D16-TRUE16-NEXT: s_endpgm
+;
; GFX11-FAKE16-LABEL: v_test_canonicalize_var_f16:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
@@ -179,6 +200,17 @@ define amdgpu_kernel void @s_test_canonicalize_var_f16(ptr addrspace(1) %out, i1
; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
; GFX11-TRUE16-NEXT: s_endpgm
;
+; GFX11-D16-TRUE16-LABEL: s_test_canonicalize_var_f16:
+; GFX11-D16-TRUE16: ; %bb.0:
+; GFX11-D16-TRUE16-NEXT: s_clause 0x1
+; GFX11-D16-TRUE16-NEXT: s_load_b32 s2, s[4:5], 0x2c
+; GFX11-D16-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-D16-TRUE16-NEXT: v_mov_b32_e32 v1, 0
+; GFX11-D16-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-D16-TRUE16-NEXT: v_max_f16_e64 v0.l, s2, s2
+; GFX11-D16-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
+; GFX11-D16-TRUE16-NEXT: s_endpgm
+;
; GFX11-FAKE16-LABEL: s_test_canonicalize_var_f16:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_clause 0x1
@@ -229,6 +261,14 @@ define <2 x half> @v_test_canonicalize_build_vector_v2f16(half %lo, half %hi) #1
; GFX11-TRUE16-NEXT: v_pk_max_f16 v0, v0, v0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
+; GFX11-D16-TRUE16-LABEL: v_test_canonicalize_build_vector_v2f16:
+; GFX11-D16-TRUE16: ; %bb.0:
+; GFX11-D16-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-D16-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l
+; GFX11-D16-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-D16-TRUE16-NEXT: v_pk_max_f16 v0, v0, v0
+; GFX11-D16-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
; GFX11-FAKE16-LABEL: v_test_canonicalize_build_vector_v2f16:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -284,12 +324,23 @@ define amdgpu_kernel void @v_test_canonicalize_fabs_var_f16(ptr addrspace(1) %ou
; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[0:1]
+; GFX11-TRUE16-NEXT: global_load_u16 v0, v1, s[0:1]
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_max_f16_e64 v0.l, |v0.l|, |v0.l|
; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
; GFX11-TRUE16-NEXT: s_endpgm
;
+; GFX11-D16-TRUE16-LABEL: v_test_canonicalize_fabs_var_f16:
+; GFX11-D16-TRUE16: ; %bb.0:
+; GFX11-D16-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-D16-TRUE16-NEXT: v_mov_b32_e32 v1, 0
+; GFX11-D16-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-D16-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[0:1]
+; GFX11-D16-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-D16-TRUE16-NEXT: v_max_f16_e64 v0.l, |v0.l|, |v0.l|
+; GFX11-D16-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
+; GFX11-D16-TRUE16-NEXT: s_endpgm
+;
; GFX11-FAKE16-LABEL: v_test_canonicalize_fabs_var_f16:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
@@ -349,12 +400,23 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_f16(ptr addrspace(1
; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[0:1]
+; GFX11-TRUE16-NEXT: global_load_u16 v0, v1, s[0:1]
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_max_f16_e64 v0.l, -|v0.l|, -|v0.l|
; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
; GFX11-TRUE16-NEXT: s_endpgm
;
+; GFX11-D16-TRUE16-LABEL: v_test_canonicalize_fneg_fabs_var_f16:
+; GFX11-D16-TRUE16: ; %bb.0:
+; GFX11-D16-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-D16-TRUE16-NEXT: v_mov_b32_e32 v1, 0
+; GFX11-D16-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-D16-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[0:1]
+; GFX11-D16-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-D16-TRUE16-NEXT: v_max_f16_e64 v0.l, -|v0.l|, -|v0.l|
+; GFX11-D16-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
+; GFX11-D16-TRUE16-NEXT: s_endpgm
+;
; GFX11-FAKE16-LABEL: v_test_canonicalize_fneg_fabs_var_f16:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
@@ -415,12 +477,23 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_var_f16(ptr addrspace(1) %ou
; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[0:1]
+; GFX11-TRUE16-NEXT: global_load_u16 v0, v1, s[0:1]
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_max_f16_e64 v0.l, -v0.l, -v0.l
; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
; GFX11-TRUE16-NEXT: s_endpgm
;
+; GFX11-D16-TRUE16-LABEL: v_test_canonicalize_fneg_var_f16:
+; GFX11-D16-TRUE16: ; %bb.0:
+; GFX11-D16-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-D16-TRUE16-NEXT: v_mov_b32_e32 v1, 0
+; GFX11-D16-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-D16-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[0:1]
+; GFX11-D16-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-D16-TRUE16-NEXT: v_max_f16_e64 v0.l, -v0.l, -v0.l
+; GFX11-D16-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
+; GFX11-D16-TRUE16-NEXT: s_endpgm
+;
; GFX11-FAKE16-LABEL: v_test_canonicalize_fneg_var_f16:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
@@ -480,12 +553,23 @@ define amdgpu_kernel void @v_test_no_denormals_canonicalize_fneg_var_f16(ptr add
; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[0:1]
+; GFX11-TRUE16-NEXT: global_load_u16 v0, v1, s[0:1]
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_max_f16_e64 v0.l, -v0.l, -v0.l
; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
; GFX11-TRUE16-NEXT: s_endpgm
;
+; GFX11-D16-TRUE16-LABEL: v_test_no_denormals_canonicalize_fneg_var_f16:
+; GFX11-D16-TRUE16: ; %bb.0:
+; GFX11-D16-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-D16-TRUE16-NEXT: v_mov_b32_e32 v1, 0
+; GFX11-D16-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-D16-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[0:1]
+; GFX11-D16-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-D16-TRUE16-NEXT: v_max_f16_e64 v0.l, -v0.l, -v0.l
+; GFX11-D16-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
+; GFX11-D16-TRUE16-NEXT: s_endpgm
+;
; GFX11-FAKE16-LABEL: v_test_no_denormals_canonicalize_fneg_var_f16:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
@@ -545,12 +629,23 @@ define amdgpu_kernel void @v_test_no_denormals_canonicalize_fneg_fabs_var_f16(pt
; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[0:1]
+; GFX11-TRUE16-NEXT: global_load_u16 v0, v1, s[0:1]
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_max_f16_e64 v0.l, -|v0.l|, -|v0.l|
; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
; GFX11-TRUE16-NEXT: s_endpgm
;
+; GFX11-D16-TRUE16-LABEL: v_test_no_denormals_canonicalize_fneg_fabs_var_f16:
+; GFX11-D16-TRUE16: ; %bb.0:
+; GFX11-D16-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-D16-TRUE16-NEXT: v_mov_b32_e32 v1, 0
+; GFX11-D16-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-D16-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[0:1]
+; GFX11-D16-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-D16-TRUE16-NEXT: v_max_f16_e64 v0.l, -|v0.l|, -|v0.l|
+; GFX11-D16-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
+; GFX11-D16-TRUE16-NEXT: s_endpgm
+;
; GFX11-FAKE16-LABEL: v_test_no_denormals_canonicalize_fneg_fabs_var_f16:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
@@ -607,6 +702,15 @@ define amdgpu_kernel void @test_fold_canonicalize_p0_f16(ptr addrspace(1) %out)
; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
; GFX11-TRUE16-NEXT: s_endpgm
;
+; GFX11-D16-TRUE16-LABEL: test_fold_canonicalize_p0_f16:
+; GFX11-D16-TRUE16: ; %bb.0:
+; GFX11-D16-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-D16-TRUE16-NEXT: v_mov_b32_e32 v1, 0
+; GFX11-D16-TRUE16-NEXT: v_mov_b16_e32 v0.l, 0
+; GFX11-D16-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-D16-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
+; GFX11-D16-TRUE16-NEXT: s_endpgm
+;
; GFX11-FAKE16-LABEL: test_fold_canonicalize_p0_f16:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
@@ -658,6 +762,15 @@ define amdgpu_kernel void @test_fold_canonicalize_n0_f16(ptr addrspace(1) %out)
; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
; GFX11-TRUE16-NEXT: s_endpgm
;
+; GFX11-D16-TRUE16-LABEL: test_fold_canonicalize_n0_f16:
+; GFX11-D16-TRUE16: ; %bb.0:
+; GFX11-D16-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-D16-TRUE16-NEXT: v_mov_b32_e32 v1, 0
+; GFX11-D16-TRUE16-NEXT: v_mov_b16_e32 v0.l, 0x8000
+; GFX11-D16-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-D16-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
+; GFX11-D16-TRUE16-NEXT: s_endpgm
+;
; GFX11-FAKE16-LABEL: test_fold_canonicalize_n0_f16:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
@@ -709,6 +822,15 @@ define amdgpu_kernel void @test_fold_canonicalize_p1_f16(ptr addrspace(1) %out)
; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
; GFX11-TRUE16-NEXT: s_endpgm
;
+; GFX11-D16-TRUE16-LABEL: test_fold_canonicalize_p1_f16:
+; GFX11-D16-TRUE16: ; %bb.0:
+; GFX11-D16-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-D16-TRUE16-NEXT: v_mov_b32_e32 v1, 0
+; GFX11-D16-TRUE16-NEXT: v_mov_b16_e32 v0.l, 0x3c00
+; GFX11-D16-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-D16-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
+; GFX11-D16-TRUE16-NEXT: s_endpgm
+;
; GFX11-FAKE16-LABEL: test_fold_canonicalize_p1_f16:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
@@ -760,6 +882,15 @@ define amdgpu_kernel void @test_fold_canonicalize_n1_f16(ptr addrspace(1) %out)
; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
; GFX11-TRUE16-NEXT: s_endpgm
;
+; GFX11-D16-TRUE16-LABEL: test_fold_canonicalize_n1_f16:
+; GFX11-D16-TRUE16: ; %bb.0:
+; GFX11-D16-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-D16-TRUE16-NEXT: v_mov_b32_e32 v1, 0
+; GFX11-D16-TRUE16-NEXT: v_mov_b16_e32 v0.l, 0xbc00
+; GFX11-D16-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-D16-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
+; GFX11-D16-TRUE16-NEXT: s_endpgm
+;
; GFX11-FAKE16-LABEL: test_fold_canonicalize_n1_f16:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
@@ -811,6 +942,15 @@ define amdgpu_kernel void @test_fold_canonicalize_literal_f16(ptr addrspace(1) %
; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
; GFX11-TRUE16-NEXT: s_endpgm
;
+; GFX11-D16-TRUE16-LABEL: test_fold_canonicalize_literal_f16:
+; GFX11-D16-TRUE16: ; %bb.0:
+; GFX11-D16-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-D16-TRUE16-NEXT: v_mov_b32_e32 v1, 0
+; GFX11-D16-TRUE16-NEXT: v_mov_b16_e32 v0.l, 0x4c00
+; GFX11-D16-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-D16-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
+; GFX11-D16-TRUE16-NEXT: s_endpgm
+;
; GFX11-FAKE16-LABEL: test_fold_canonicalize_literal_f16:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
@@ -862,6 +1002,15 @@ define amdgpu_kernel void @test_default_denormals_fold_canonicalize_denormal0_f1
; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
; GFX11-TRUE16-NEXT: s_endpgm
;
+; GFX11-D16-TRUE16-LABEL: test_default_denormals_fold_canonicalize_denormal0_f16:
+; GFX11-D16-TRUE16: ; %bb.0:
+; GFX11-D16-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-D16-TRUE16-NEXT: v_mov_b32_e32 v1, 0
+; GFX11-D16-TRUE16-NEXT: v_mov_b16_e32 v0.l, 0x3ff
+; GFX11-D16-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-D16-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
+; GFX11-D16-TRUE16-NEXT: s_endpgm
+;
; GFX11-FAKE16-LABEL: test_default_denormals_fold_canonicalize_denormal0_f16:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
@@ -913,6 +1062,15 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_f16(ptr ad
; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
; GFX11-TRUE16-NEXT: s_endpgm
;
+; GFX11-D16-TRUE16-LABEL: test_denormals_fold_canonicalize_denormal0_f16:
+; GFX11-D16-TRUE16: ; %bb.0:
+; GFX11-D16-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-D16-TRUE16-NEXT: v_mov_b32_e32 v1, 0
+; GFX11-D16-TRUE16-NEXT: v_mov_b16_e32 v0.l, 0x3ff
+; GFX11-D16-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-D16-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
+; GFX11-D16-TRUE16-NEXT: s_endpgm
+;
; GFX11-FAKE16-LABEL: test_denormals_fold_canonicalize_denormal0_f16:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
@@ -964,6 +1122,15 @@ define amdgpu_kernel void @test_default_denormals_fold_canonicalize_denormal1_f1
; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
; GFX11-TRUE16-NEXT: s_endpgm
;
+; GFX11-D16-TRUE16-LABEL: test_default_denormals_fold_canonicalize_denormal1_f16:
+; GFX11-D16-TRUE16: ; %bb.0:
+; GFX11-D16-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-D16-TRUE16-NEXT: v_mov_b32_e32 v1, 0
+; GFX11-D16-TRUE16-NEXT: v_mov_b16_e32 v0.l, 0x83ff
+; GFX11-D16-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-D16-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
+; GFX11-D16-TRUE16-NEXT: s_endpgm
+;
; GFX11-FAKE16-LABEL: test_default_denormals_fold_canonicalize_denormal1_f16:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
@@ -1015,6 +1182,15 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_f16(ptr ad
; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
; GFX11-TRUE16-NEXT: s_endpgm
;
+; GFX11-D16-TRUE16-LABEL: test_denormals_fold_canonicalize_denormal1_f16:
+; GFX11-D16-TRUE16: ; %bb.0:
+; GFX11-D16-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-D16-TRUE16-NEXT: v_mov_b32_e32 v1, 0
+; GFX11-D16-TRUE16-NEXT: v_mov_b16_e32 v0.l, 0x83ff
+; GFX11-D16-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-D16-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
+; GFX11-D16-TRUE16-NEXT: s_endpgm
+;
; GFX11-FAKE16-LABEL: test_denormals_fold_canonicalize_denormal1_f16:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
@@ -1066,6 +1242,15 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_f16(ptr addrspace(1) %out
; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
; GFX11-TRUE16-NEXT: s_endpgm
;
+; GFX11-D16-TRUE16-LABEL: test_fold_canonicalize_qnan_f16:
+; GFX11-D16-TRUE16: ; %bb.0:
+; GFX11-D16-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-D16-TRUE16-NEXT: v_mov_b32_e32 v1, 0
+; GFX11-D16-TRUE16-NEXT: v_mov_b16_e32 v0.l, 0x7c00
+; GFX11-D16-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-D16-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
+; GFX11-D16-TRUE16-NEXT: s_endpgm
+;
; GFX11-FAKE16-LABEL: test_fold_canonicalize_qnan_f16:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
@@ -1117,6 +1302,15 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_f16(ptr addrsp
; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
; GFX11-TRUE16-NEXT: s_endpgm
;
+; GFX11-D16-TRUE16-LABEL: test_fold_canonicalize_qnan_value_neg1_f16:
+; GFX11-D16-TRUE16: ; %bb.0:
+; GFX11-D16-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-D16-TRUE16-NEXT: v_mov_b32_e32 v1, 0
+; GFX11-D16-TRUE16-NEXT: v_mov_b16_e32 v0.l, 0x7e00
+; GFX11-D16-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-D16-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
+; GFX11-D16-TRUE16-NEXT: s_endpgm
+;
; GFX11-FAKE16-LABEL: test_fold_canonicalize_qnan_value_neg1_f16:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
@@ -1168,6 +1362,15 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_f16(ptr addrsp
; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
; GFX11-TRUE16-NEXT: s_endpgm
;
+; GFX11-D16-TRUE16-LABEL: test_fold_canonicalize_qnan_value_neg2_f16:
+; GFX11-D16-TRUE16: ; %bb.0:
+; GFX11-D16-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-D16-TRUE16-NEXT: v_mov_b32_e32 v1, 0
+; GFX11-D16-TRUE16-NEXT: v_mov_b16_e32 v0.l, 0x7e00
+; GFX11-D16-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-D16-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
+; GFX11-D16-TRUE16-NEXT: s_endpgm
+;
; GFX11-FAKE16-LABEL: test_fold_canonicalize_qnan_value_neg2_f16:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
@@ -1219,6 +1422,15 @@ define amdgpu_kernel void @test_fold_canonicalize_snan0_value_f16(ptr addrspace(
; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
; GFX11-TRUE16-NEXT: s_endpgm
;
+; GFX11-D16-TRUE16-LABEL: test_fold_canonicalize_snan0_value_f16:
+; GFX11-D16-TRUE16: ; %bb.0:
+; GFX11-D16-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-D16-TRUE16-NEXT: v_mov_b32_e32 v1, 0
+; GFX11-D16-TRUE16-NEXT: v_mov_b16_e32 v0.l, 0x7e00
+; GFX11-D16-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-D16-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
+; GFX11-D16-TRUE16-NEXT: s_endpgm
+;
; GFX11-FAKE16-LABEL: test_fold_canonicalize_snan0_value_f16:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
@@ -1270,6 +1482,15 @@ define amdgpu_kernel void @test_fold_canonicalize_snan1_value_f16(ptr addrspace(
; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
; GFX11-TRUE16-NEXT: s_endpgm
;
+; GFX11-D16-TRUE16-LABEL: test_fold_canonicalize_snan1_value_f16:
+; GFX11-D16-TRUE16: ; %bb.0:
+; GFX11-D16-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-D16-TRUE16-NEXT: v_mov_b32_e32 v1, 0
+; GFX11-D16-TRUE16-NEXT: v_mov_b16_e32 v0.l, 0x7e00
+; GFX11-D16-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-D16-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
+; GFX11-D16-TRUE16-NEXT: s_endpgm
+;
; GFX11-FAKE16-LABEL: test_fold_canonicalize_snan1_value_f16:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
@@ -1321,6 +1542,15 @@ define amdgpu_kernel void @test_fold_canonicalize_snan2_value_f16(ptr addrspace(
; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
; GFX11-TRUE16-NEXT: s_endpgm
;
+; GFX11-D16-TRUE16-LABEL: test_fold_canonicalize_snan2_value_f16:
+; GFX11-D16-TRUE16: ; %bb.0:
+; GFX11-D16-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-D16-TRUE16-NEXT: v_mov_b32_e32 v1, 0
+; GFX11-D16-TRUE16-NEXT: v_mov_b16_e32 v0.l, 0x7e00
+; GFX11-D16-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-D16-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
+; GFX11-D16-TRUE16-NEXT: s_endpgm
+;
; GFX11-FAKE16-LABEL: test_fold_canonicalize_snan2_value_f16:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
@@ -1372,6 +1602,15 @@ define amdgpu_kernel void @test_fold_canonicalize_snan3_value_f16(ptr addrspace(
; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
; GFX11-TRUE16-NEXT: s_endpgm
;
+; GFX11-D16-TRUE16-LABEL: test_fold_canonicalize_snan3_value_f16:
+; GFX11-D16-TRUE16: ; %bb.0:
+; GFX11-D16-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-D16-TRUE16-NEXT: v_mov_b32_e32 v1, 0
+; GFX11-D16-TRUE16-NEXT: v_mov_b16_e32 v0.l, 0x7e00
+; GFX11-D16-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-D16-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
+; GFX11-D16-TRUE16-NEXT: s_endpgm
+;
; GFX11-FAKE16-LABEL: test_fold_canonicalize_snan3_value_f16:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
@@ -2572,6 +2811,14 @@ define <2 x half> @v_test_canonicalize_reg_undef_v2f16(half %val) #1 {
; GFX11-TRUE16-NEXT: v_pack_b32_f16 v0, v0.l, 0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
+; GFX11-D16-TRUE16-LABEL: v_test_canonicalize_reg_undef_v2f16:
+; GFX11-D16-TRUE16: ; %bb.0:
+; GFX11-D16-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-D16-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l
+; GFX11-D16-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-D16-TRUE16-NEXT: v_pack_b32_f16 v0, v0.l, 0
+; GFX11-D16-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
; GFX11-FAKE16-LABEL: v_test_canonicalize_reg_undef_v2f16:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2611,6 +2858,12 @@ define <2 x half> @v_test_canonicalize_undef_reg_v2f16(half %val) #1 {
; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v0.l, v0.l
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
+; GFX11-D16-TRUE16-LABEL: v_test_canonicalize_undef_reg_v2f16:
+; GFX11-D16-TRUE16: ; %bb.0:
+; GFX11-D16-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-D16-TRUE16-NEXT: v_max_f16_e32 v0.h, v0.l, v0.l
+; GFX11-D16-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
; GFX11-FAKE16-LABEL: v_test_canonicalize_undef_reg_v2f16:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2774,6 +3027,14 @@ define <2 x half> @v_test_canonicalize_reg_k_v2f16(half %val) #1 {
; GFX11-TRUE16-NEXT: v_pack_b32_f16 v0, v0.l, 2.0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
+; GFX11-D16-TRUE16-LABEL: v_test_canonicalize_reg_k_v2f16:
+; GFX11-D16-TRUE16: ; %bb.0:
+; GFX11-D16-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-D16-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l
+; GFX11-D16-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-D16-TRUE16-NEXT: v_pack_b32_f16 v0, v0.l, 2.0
+; GFX11-D16-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
; GFX11-FAKE16-LABEL: v_test_canonicalize_reg_k_v2f16:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2818,6 +3079,14 @@ define <2 x half> @v_test_canonicalize_k_reg_v2f16(half %val) #1 {
; GFX11-TRUE16-NEXT: v_pack_b32_f16 v0, 2.0, v0.l
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
+; GFX11-D16-TRUE16-LABEL: v_test_canonicalize_k_reg_v2f16:
+; GFX11-D16-TRUE16: ; %bb.0:
+; GFX11-D16-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-D16-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l
+; GFX11-D16-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-D16-TRUE16-NEXT: v_pack_b32_f16 v0, 2.0, v0.l
+; GFX11-D16-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
; GFX11-FAKE16-LABEL: v_test_canonicalize_k_reg_v2f16:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2913,6 +3182,15 @@ define <4 x half> @v_test_canonicalize_reg_undef_undef_undef_v4f16(half %val) #1
; GFX11-TRUE16-NEXT: v_pack_b32_f16 v0, v0.l, 0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
+; GFX11-D16-TRUE16-LABEL: v_test_canonicalize_reg_undef_undef_undef_v4f16:
+; GFX11-D16-TRUE16: ; %bb.0:
+; GFX11-D16-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-D16-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l
+; GFX11-D16-TRUE16-NEXT: v_mov_b32_e32 v1, 0x7e007e00
+; GFX11-D16-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-D16-TRUE16-NEXT: v_pack_b32_f16 v0, v0.l, 0
+; GFX11-D16-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
; GFX11-FAKE16-LABEL: v_test_canonicalize_reg_undef_undef_undef_v4f16:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2965,6 +3243,15 @@ define <4 x half> @v_test_canonicalize_reg_reg_undef_undef_v4f16(half %val0, hal
; GFX11-TRUE16-NEXT: v_pk_max_f16 v0, v0, v0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
+; GFX11-D16-TRUE16-LABEL: v_test_canonicalize_reg_reg_undef_undef_v4f16:
+; GFX11-D16-TRUE16: ; %bb.0:
+; GFX11-D16-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-D16-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l
+; GFX11-D16-TRUE16-NEXT: v_mov_b32_e32 v1, 0x7e007e00
+; GFX11-D16-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-D16-TRUE16-NEXT: v_pk_max_f16 v0, v0, v0
+; GFX11-D16-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
; GFX11-FAKE16-LABEL: v_test_canonicalize_reg_reg_undef_undef_v4f16:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3022,6 +3309,16 @@ define <4 x half> @v_test_canonicalize_reg_undef_reg_reg_v4f16(half %val0, half
; GFX11-TRUE16-NEXT: v_pk_max_f16 v1, v1, v1
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
+; GFX11-D16-TRUE16-LABEL: v_test_canonicalize_reg_undef_reg_reg_v4f16:
+; GFX11-D16-TRUE16: ; %bb.0:
+; GFX11-D16-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-D16-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l
+; GFX11-D16-TRUE16-NEXT: v_mov_b16_e32 v1.h, v2.l
+; GFX11-D16-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-D16-TRUE16-NEXT: v_pack_b32_f16 v0, v0.l, 0
+; GFX11-D16-TRUE16-NEXT: v_pk_max_f16 v1, v1, v1
+; GFX11-D16-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
; GFX11-FAKE16-LABEL: v_test_canonicalize_reg_undef_reg_reg_v4f16:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/fcanonicalize.ll b/llvm/test/CodeGen/AMDGPU/fcanonicalize.ll
index d32b528d13276..f79cc042ef4a4 100644
--- a/llvm/test/CodeGen/AMDGPU/fcanonicalize.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcanonicalize.ll
@@ -2410,7 +2410,7 @@ define amdgpu_kernel void @test_canonicalize_value_f16_flush(ptr addrspace(1) %a
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 1, v0
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[0:1]
+; GFX11-TRUE16-NEXT: global_load_u16 v0, v1, s[0:1]
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l
; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[2:3]
@@ -2436,7 +2436,7 @@ define amdgpu_kernel void @test_canonicalize_value_f16_flush(ptr addrspace(1) %a
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 1, v0
; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[0:1]
+; GFX12-TRUE16-NEXT: global_load_u16 v0, v1, s[0:1]
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.l, v0.l, v0.l
; GFX12-TRUE16-NEXT: global_store_b16 v1, v0, s[2:3]
@@ -2792,7 +2792,7 @@ define amdgpu_kernel void @test_canonicalize_value_f16_denorm(ptr addrspace(1) %
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 1, v0
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[0:1]
+; GFX11-TRUE16-NEXT: global_load_u16 v0, v1, s[0:1]
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l
; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[2:3]
@@ -2818,7 +2818,7 @@ define amdgpu_kernel void @test_canonicalize_value_f16_denorm(ptr addrspace(1) %
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 1, v0
; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[0:1]
+; GFX12-TRUE16-NEXT: global_load_u16 v0, v1, s[0:1]
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.l, v0.l, v0.l
; GFX12-TRUE16-NEXT: global_store_b16 v1, v0, s[2:3]
diff --git a/llvm/test/CodeGen/AMDGPU/fdiv.f16.ll b/llvm/test/CodeGen/AMDGPU/fdiv.f16.ll
index 210e09fd9169a..784363035e7de 100644
--- a/llvm/test/CodeGen/AMDGPU/fdiv.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fdiv.f16.ll
@@ -137,33 +137,33 @@ define amdgpu_kernel void @v_fdiv_f16(
; GFX11-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 1, v0
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 1, v0
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v2, s[2:3] glc dlc
+; GFX11-TRUE16-NEXT: global_load_u16 v2, v1, s[2:3] glc dlc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: global_load_d16_b16 v1, v2, s[4:5] glc dlc
+; GFX11-TRUE16-NEXT: global_load_u16 v3, v1, s[4:5] glc dlc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v4, v0.l
-; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v3, v1.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v1.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v0.l
+; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v4, v2.l
+; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v3.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v2.l
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_rcp_f32_e32 v3, v3
+; GFX11-TRUE16-NEXT: v_rcp_f32_e32 v0, v0
; GFX11-TRUE16-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-TRUE16-NEXT: v_mul_f32_e32 v4, v4, v3
+; GFX11-TRUE16-NEXT: v_mul_f32_e32 v4, v4, v0
; GFX11-TRUE16-NEXT: v_fma_mix_f32 v7, -v5, v4, v6 op_sel_hi:[1,0,1]
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_fmac_f32_e32 v4, v7, v3
+; GFX11-TRUE16-NEXT: v_fmac_f32_e32 v4, v7, v0
; GFX11-TRUE16-NEXT: v_fma_mix_f32 v5, -v5, v4, v6 op_sel_hi:[1,0,1]
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_mul_f32_e32 v3, v5, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff800000, v3
+; GFX11-TRUE16-NEXT: v_mul_f32_e32 v0, v5, v0
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff800000, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, v3, v4
-; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.h, v3
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, v0, v4
+; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_div_fixup_f16 v0.l, v0.h, v1.l, v0.l
-; GFX11-TRUE16-NEXT: global_store_b16 v2, v0, s[0:1]
+; GFX11-TRUE16-NEXT: v_div_fixup_f16 v0.l, v0.l, v3.l, v2.l
+; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
; GFX11-TRUE16-NEXT: s_endpgm
;
; GFX11-FAKE16-LABEL: v_fdiv_f16:
@@ -293,7 +293,7 @@ define amdgpu_kernel void @v_rcp_f16(ptr addrspace(1) %r, ptr addrspace(1) %b) #
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 1, v0
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[2:3] glc dlc
+; GFX11-TRUE16-NEXT: global_load_u16 v0, v1, s[2:3] glc dlc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_rcp_f16_e32 v0.l, v0.l
; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
@@ -400,7 +400,7 @@ define amdgpu_kernel void @v_rcp_f16_abs(ptr addrspace(1) %r, ptr addrspace(1) %
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 1, v0
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[2:3] glc dlc
+; GFX11-TRUE16-NEXT: global_load_u16 v0, v1, s[2:3] glc dlc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_rcp_f16_e64 v0.l, |v0.l|
; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
@@ -510,7 +510,7 @@ define amdgpu_kernel void @reciprocal_f16_rounded(ptr addrspace(1) %r, ptr addrs
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 1, v0
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[2:3] glc dlc
+; GFX11-TRUE16-NEXT: global_load_u16 v0, v1, s[2:3] glc dlc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_rcp_f16_e32 v0.l, v0.l
; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
@@ -604,7 +604,7 @@ define amdgpu_kernel void @v_rcp_f16_afn(ptr addrspace(1) %r, ptr addrspace(1) %
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 1, v0
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[2:3] glc dlc
+; GFX11-TRUE16-NEXT: global_load_u16 v0, v1, s[2:3] glc dlc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_rcp_f16_e32 v0.l, v0.l
; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
@@ -711,7 +711,7 @@ define amdgpu_kernel void @v_rcp_f16_neg(ptr addrspace(1) %r, ptr addrspace(1) %
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 1, v0
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[2:3] glc dlc
+; GFX11-TRUE16-NEXT: global_load_u16 v0, v1, s[2:3] glc dlc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_rcp_f16_e64 v0.l, -v0.l
; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
@@ -821,7 +821,7 @@ define amdgpu_kernel void @v_rsq_f16(ptr addrspace(1) %r, ptr addrspace(1) %b) #
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 1, v0
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[2:3] glc dlc
+; GFX11-TRUE16-NEXT: global_load_u16 v0, v1, s[2:3] glc dlc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_rsq_f16_e32 v0.l, v0.l
; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
@@ -935,7 +935,7 @@ define amdgpu_kernel void @v_rsq_f16_neg(ptr addrspace(1) %r, ptr addrspace(1) %
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 1, v0
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[2:3] glc dlc
+; GFX11-TRUE16-NEXT: global_load_u16 v0, v1, s[2:3] glc dlc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_rsq_f16_e32 v0.l, v0.l
; GFX11-TRUE16-NEXT: s_waitcnt_depctr 0xfff
@@ -1058,12 +1058,12 @@ define amdgpu_kernel void @v_rsq_f16_multi_use(ptr addrspace(1) %r, ptr addrspac
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 1, v0
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[2:3] glc dlc
+; GFX11-TRUE16-NEXT: global_load_u16 v2, v1, s[2:3] glc dlc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_rsq_f16_e32 v0.h, v0.l
-; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1] dlc
+; GFX11-TRUE16-NEXT: v_rsq_f16_e32 v0.l, v2.l
+; GFX11-TRUE16-NEXT: global_store_b16 v1, v2, s[0:1] dlc
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-TRUE16-NEXT: global_store_d16_hi_b16 v1, v0, s[0:1]
+; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
; GFX11-TRUE16-NEXT: s_endpgm
;
; GFX11-FAKE16-LABEL: v_rsq_f16_multi_use:
@@ -1177,7 +1177,7 @@ define amdgpu_kernel void @v_rsq_f16_missing_contract0(ptr addrspace(1) %r, ptr
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 1, v0
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[2:3] glc dlc
+; GFX11-TRUE16-NEXT: global_load_u16 v0, v1, s[2:3] glc dlc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_sqrt_f16_e32 v0.l, v0.l
; GFX11-TRUE16-NEXT: s_waitcnt_depctr 0xfff
@@ -1295,7 +1295,7 @@ define amdgpu_kernel void @v_rsq_f16_missing_contract1(ptr addrspace(1) %r, ptr
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 1, v0
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[2:3] glc dlc
+; GFX11-TRUE16-NEXT: global_load_u16 v0, v1, s[2:3] glc dlc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_sqrt_f16_e32 v0.l, v0.l
; GFX11-TRUE16-NEXT: s_waitcnt_depctr 0xfff
@@ -1413,7 +1413,7 @@ define amdgpu_kernel void @v_neg_rsq_f16_missing_contract1(ptr addrspace(1) %r,
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 1, v0
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[2:3] glc dlc
+; GFX11-TRUE16-NEXT: global_load_u16 v0, v1, s[2:3] glc dlc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_sqrt_f16_e32 v0.l, v0.l
; GFX11-TRUE16-NEXT: s_waitcnt_depctr 0xfff
@@ -1536,13 +1536,13 @@ define amdgpu_kernel void @v_fdiv_f16_afn(ptr addrspace(1) %r, ptr addrspace(1)
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 1, v0
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[2:3] glc dlc
+; GFX11-TRUE16-NEXT: global_load_u16 v2, v1, s[2:3] glc dlc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: global_load_d16_hi_b16 v0, v1, s[4:5] glc dlc
+; GFX11-TRUE16-NEXT: global_load_u16 v0, v1, s[4:5] glc dlc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_rcp_f16_e32 v0.h, v0.h
+; GFX11-TRUE16-NEXT: v_rcp_f16_e32 v0.l, v0.l
; GFX11-TRUE16-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-TRUE16-NEXT: v_mul_f16_e32 v0.l, v0.l, v0.h
+; GFX11-TRUE16-NEXT: v_mul_f16_e32 v0.l, v2.l, v0.l
; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
; GFX11-TRUE16-NEXT: s_endpgm
;
@@ -1666,13 +1666,13 @@ define amdgpu_kernel void @v_fdiv_f16_unsafe(ptr addrspace(1) %r, ptr addrspace(
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 1, v0
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[2:3] glc dlc
+; GFX11-TRUE16-NEXT: global_load_u16 v2, v1, s[2:3] glc dlc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: global_load_d16_hi_b16 v0, v1, s[4:5] glc dlc
+; GFX11-TRUE16-NEXT: global_load_u16 v0, v1, s[4:5] glc dlc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_rcp_f16_e32 v0.h, v0.h
+; GFX11-TRUE16-NEXT: v_rcp_f16_e32 v0.l, v0.l
; GFX11-TRUE16-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-TRUE16-NEXT: v_mul_f16_e32 v0.l, v0.l, v0.h
+; GFX11-TRUE16-NEXT: v_mul_f16_e32 v0.l, v2.l, v0.l
; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
; GFX11-TRUE16-NEXT: s_endpgm
;
@@ -1758,7 +1758,7 @@ define amdgpu_kernel void @div_afn_2_x_pat_f16(ptr addrspace(1) %out) #0 {
;
; GFX11-TRUE16-LABEL: div_afn_2_x_pat_f16:
; GFX11-TRUE16: ; %bb.0:
-; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v[0:1], off
+; GFX11-TRUE16-NEXT: global_load_u16 v0, v[0:1], off
; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
@@ -1834,7 +1834,7 @@ define amdgpu_kernel void @div_afn_k_x_pat_f16(ptr addrspace(1) %out) #0 {
;
; GFX11-TRUE16-LABEL: div_afn_k_x_pat_f16:
; GFX11-TRUE16: ; %bb.0:
-; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v[0:1], off
+; GFX11-TRUE16-NEXT: global_load_u16 v0, v[0:1], off
; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
@@ -1910,7 +1910,7 @@ define amdgpu_kernel void @div_afn_neg_k_x_pat_f16(ptr addrspace(1) %out) #0 {
;
; GFX11-TRUE16-LABEL: div_afn_neg_k_x_pat_f16:
; GFX11-TRUE16: ; %bb.0:
-; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v[0:1], off
+; GFX11-TRUE16-NEXT: global_load_u16 v0, v[0:1], off
; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/flat-address-space.ll b/llvm/test/CodeGen/AMDGPU/flat-address-space.ll
index 91f9aa1c5fe3b..909edc6dc0055 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-address-space.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-address-space.ll
@@ -3,7 +3,7 @@
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji < %s | FileCheck -check-prefixes=GCN,CIVI,CIVI-HSA %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GCN,GFX10,GFX10PLUS %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+real-true16 -amdgpu-enable-vopd=0 < %s | FileCheck -check-prefixes=GCN,GFX10PLUS,GFX11-TRUE16 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+real-true16,+real-true-d16 -amdgpu-enable-vopd=0 < %s | FileCheck -check-prefixes=GCN,GFX10PLUS,GFX11-TRUE16 %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=-real-true16 -amdgpu-enable-vopd=0 < %s | FileCheck -check-prefixes=GCN,GFX10PLUS,GFX11-FAKE16 %s
; GCN-LABEL: {{^}}store_flat_i32:
diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll
index fc8883924dfbc..0e28d5c91a84f 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll
@@ -4585,7 +4585,7 @@ define void @store_load_i32_negative_unaligned(ptr addrspace(5) nocapture %arg)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, 1
; GFX11-TRUE16-NEXT: scratch_store_b8 v0, v1, off offset:-1 dlc
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-TRUE16-NEXT: scratch_load_d16_u8 v0, v0, off offset:-1 glc dlc
+; GFX11-TRUE16-NEXT: scratch_load_u8 v0, v0, off offset:-1 glc dlc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -4610,7 +4610,7 @@ define void @store_load_i32_negative_unaligned(ptr addrspace(5) nocapture %arg)
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
; GFX12-TRUE16-NEXT: scratch_store_b8 v0, v1, off offset:-1 scope:SCOPE_SYS
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
-; GFX12-TRUE16-NEXT: scratch_load_d16_u8 v0, v0, off offset:-1 scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT: scratch_load_u8 v0, v0, off offset:-1 scope:SCOPE_SYS
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -4678,7 +4678,7 @@ define void @store_load_i32_negative_unaligned(ptr addrspace(5) nocapture %arg)
; GFX11-PAL-TRUE16-NEXT: v_mov_b16_e32 v1.l, 1
; GFX11-PAL-TRUE16-NEXT: scratch_store_b8 v0, v1, off offset:-1 dlc
; GFX11-PAL-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-PAL-TRUE16-NEXT: scratch_load_d16_u8 v0, v0, off offset:-1 glc dlc
+; GFX11-PAL-TRUE16-NEXT: scratch_load_u8 v0, v0, off offset:-1 glc dlc
; GFX11-PAL-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-PAL-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -4703,7 +4703,7 @@ define void @store_load_i32_negative_unaligned(ptr addrspace(5) nocapture %arg)
; GFX12-PAL-TRUE16-NEXT: s_wait_storecnt 0x0
; GFX12-PAL-TRUE16-NEXT: scratch_store_b8 v0, v1, off offset:-1 scope:SCOPE_SYS
; GFX12-PAL-TRUE16-NEXT: s_wait_storecnt 0x0
-; GFX12-PAL-TRUE16-NEXT: scratch_load_d16_u8 v0, v0, off offset:-1 scope:SCOPE_SYS
+; GFX12-PAL-TRUE16-NEXT: scratch_load_u8 v0, v0, off offset:-1 scope:SCOPE_SYS
; GFX12-PAL-TRUE16-NEXT: s_wait_loadcnt 0x0
; GFX12-PAL-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -4758,7 +4758,7 @@ define void @store_load_i32_large_negative_unaligned(ptr addrspace(5) nocapture
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, 1
; GFX11-TRUE16-NEXT: scratch_store_b8 v1, v0, off offset:-129 dlc
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-TRUE16-NEXT: scratch_load_d16_u8 v0, v1, off offset:-129 glc dlc
+; GFX11-TRUE16-NEXT: scratch_load_u8 v0, v1, off offset:-129 glc dlc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -4783,7 +4783,7 @@ define void @store_load_i32_large_negative_unaligned(ptr addrspace(5) nocapture
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
; GFX12-TRUE16-NEXT: scratch_store_b8 v0, v1, off offset:-4225 scope:SCOPE_SYS
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
-; GFX12-TRUE16-NEXT: scratch_load_d16_u8 v0, v0, off offset:-4225 scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT: scratch_load_u8 v0, v0, off offset:-4225 scope:SCOPE_SYS
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -4853,7 +4853,7 @@ define void @store_load_i32_large_negative_unaligned(ptr addrspace(5) nocapture
; GFX11-PAL-TRUE16-NEXT: v_mov_b16_e32 v0.l, 1
; GFX11-PAL-TRUE16-NEXT: scratch_store_b8 v1, v0, off offset:-129 dlc
; GFX11-PAL-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-PAL-TRUE16-NEXT: scratch_load_d16_u8 v0, v1, off offset:-129 glc dlc
+; GFX11-PAL-TRUE16-NEXT: scratch_load_u8 v0, v1, off offset:-129 glc dlc
; GFX11-PAL-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-PAL-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -4878,7 +4878,7 @@ define void @store_load_i32_large_negative_unaligned(ptr addrspace(5) nocapture
; GFX12-PAL-TRUE16-NEXT: s_wait_storecnt 0x0
; GFX12-PAL-TRUE16-NEXT: scratch_store_b8 v0, v1, off offset:-4225 scope:SCOPE_SYS
; GFX12-PAL-TRUE16-NEXT: s_wait_storecnt 0x0
-; GFX12-PAL-TRUE16-NEXT: scratch_load_d16_u8 v0, v0, off offset:-4225 scope:SCOPE_SYS
+; GFX12-PAL-TRUE16-NEXT: scratch_load_u8 v0, v0, off offset:-4225 scope:SCOPE_SYS
; GFX12-PAL-TRUE16-NEXT: s_wait_loadcnt 0x0
; GFX12-PAL-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
diff --git a/llvm/test/CodeGen/AMDGPU/flat_atomics.ll b/llvm/test/CodeGen/AMDGPU/flat_atomics.ll
index 57be2907da4a0..f2eb47370c94b 100644
--- a/llvm/test/CodeGen/AMDGPU/flat_atomics.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat_atomics.ll
@@ -7944,32 +7944,18 @@ define amdgpu_kernel void @atomic_load_i8_offset(ptr %in, ptr %out) {
; GCN3-NEXT: flat_store_byte v[0:1], v2
; GCN3-NEXT: s_endpgm
;
-; GFX11-TRUE16-LABEL: atomic_load_i8_offset:
-; GFX11-TRUE16: ; %bb.0: ; %entry
-; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s3
-; GFX11-TRUE16-NEXT: flat_load_d16_u8 v0, v[0:1] offset:16 glc
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: buffer_gl1_inv
-; GFX11-TRUE16-NEXT: buffer_gl0_inv
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, s2
-; GFX11-TRUE16-NEXT: flat_store_b8 v[1:2], v0
-; GFX11-TRUE16-NEXT: s_endpgm
-;
-; GFX11-FAKE16-LABEL: atomic_load_i8_offset:
-; GFX11-FAKE16: ; %bb.0: ; %entry
-; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-FAKE16-NEXT: flat_load_u8 v2, v[0:1] offset:16 glc
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-FAKE16-NEXT: buffer_gl1_inv
-; GFX11-FAKE16-NEXT: buffer_gl0_inv
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-FAKE16-NEXT: flat_store_b8 v[0:1], v2
-; GFX11-FAKE16-NEXT: s_endpgm
+; GFX11-LABEL: atomic_load_i8_offset:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: flat_load_u8 v2, v[0:1] offset:16 glc
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-NEXT: flat_store_b8 v[0:1], v2
+; GFX11-NEXT: s_endpgm
entry:
%gep = getelementptr i8, ptr %in, i64 16
%val = load atomic i8, ptr %gep seq_cst, align 1
@@ -8020,32 +8006,18 @@ define amdgpu_kernel void @atomic_load_i8(ptr %in, ptr %out) {
; GCN3-NEXT: flat_store_byte v[0:1], v2
; GCN3-NEXT: s_endpgm
;
-; GFX11-TRUE16-LABEL: atomic_load_i8:
-; GFX11-TRUE16: ; %bb.0: ; %entry
-; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s3
-; GFX11-TRUE16-NEXT: flat_load_d16_u8 v0, v[0:1] glc
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: buffer_gl1_inv
-; GFX11-TRUE16-NEXT: buffer_gl0_inv
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, s2
-; GFX11-TRUE16-NEXT: flat_store_b8 v[1:2], v0
-; GFX11-TRUE16-NEXT: s_endpgm
-;
-; GFX11-FAKE16-LABEL: atomic_load_i8:
-; GFX11-FAKE16: ; %bb.0: ; %entry
-; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-FAKE16-NEXT: flat_load_u8 v2, v[0:1] glc
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-FAKE16-NEXT: buffer_gl1_inv
-; GFX11-FAKE16-NEXT: buffer_gl0_inv
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-FAKE16-NEXT: flat_store_b8 v[0:1], v2
-; GFX11-FAKE16-NEXT: s_endpgm
+; GFX11-LABEL: atomic_load_i8:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: flat_load_u8 v2, v[0:1] glc
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-NEXT: flat_store_b8 v[0:1], v2
+; GFX11-NEXT: s_endpgm
entry:
%val = load atomic i8, ptr %in seq_cst, align 1
store i8 %val, ptr %out
@@ -8108,42 +8080,23 @@ define amdgpu_kernel void @atomic_load_i8_addr64_offset(ptr %in, ptr %out, i64 %
; GCN3-NEXT: flat_store_byte v[0:1], v2
; GCN3-NEXT: s_endpgm
;
-; GFX11-TRUE16-LABEL: atomic_load_i8_addr64_offset:
-; GFX11-TRUE16: ; %bb.0: ; %entry
-; GFX11-TRUE16-NEXT: s_clause 0x1
-; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
-; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s3
-; GFX11-TRUE16-NEXT: s_add_u32 s0, s0, s4
-; GFX11-TRUE16-NEXT: s_addc_u32 s1, s1, s5
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-TRUE16-NEXT: flat_load_d16_u8 v0, v[0:1] offset:16 glc
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: buffer_gl1_inv
-; GFX11-TRUE16-NEXT: buffer_gl0_inv
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, s2
-; GFX11-TRUE16-NEXT: flat_store_b8 v[1:2], v0
-; GFX11-TRUE16-NEXT: s_endpgm
-;
-; GFX11-FAKE16-LABEL: atomic_load_i8_addr64_offset:
-; GFX11-FAKE16: ; %bb.0: ; %entry
-; GFX11-FAKE16-NEXT: s_clause 0x1
-; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-FAKE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
-; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-FAKE16-NEXT: s_add_u32 s0, s0, s4
-; GFX11-FAKE16-NEXT: s_addc_u32 s1, s1, s5
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-FAKE16-NEXT: flat_load_u8 v2, v[0:1] offset:16 glc
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-FAKE16-NEXT: buffer_gl1_inv
-; GFX11-FAKE16-NEXT: buffer_gl0_inv
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-FAKE16-NEXT: flat_store_b8 v[0:1], v2
-; GFX11-FAKE16-NEXT: s_endpgm
+; GFX11-LABEL: atomic_load_i8_addr64_offset:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_add_u32 s0, s0, s4
+; GFX11-NEXT: s_addc_u32 s1, s1, s5
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: flat_load_u8 v2, v[0:1] offset:16 glc
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-NEXT: flat_store_b8 v[0:1], v2
+; GFX11-NEXT: s_endpgm
entry:
%ptr = getelementptr i8, ptr %in, i64 %index
%gep = getelementptr i8, ptr %ptr, i64 16
@@ -8401,32 +8354,18 @@ define amdgpu_kernel void @atomic_load_i16_offset(ptr %in, ptr %out) {
; GCN3-NEXT: flat_store_short v[0:1], v2
; GCN3-NEXT: s_endpgm
;
-; GFX11-TRUE16-LABEL: atomic_load_i16_offset:
-; GFX11-TRUE16: ; %bb.0: ; %entry
-; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s3
-; GFX11-TRUE16-NEXT: flat_load_d16_b16 v0, v[0:1] offset:16 glc
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: buffer_gl1_inv
-; GFX11-TRUE16-NEXT: buffer_gl0_inv
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, s2
-; GFX11-TRUE16-NEXT: flat_store_b16 v[1:2], v0
-; GFX11-TRUE16-NEXT: s_endpgm
-;
-; GFX11-FAKE16-LABEL: atomic_load_i16_offset:
-; GFX11-FAKE16: ; %bb.0: ; %entry
-; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-FAKE16-NEXT: flat_load_u16 v2, v[0:1] offset:16 glc
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-FAKE16-NEXT: buffer_gl1_inv
-; GFX11-FAKE16-NEXT: buffer_gl0_inv
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-FAKE16-NEXT: flat_store_b16 v[0:1], v2
-; GFX11-FAKE16-NEXT: s_endpgm
+; GFX11-LABEL: atomic_load_i16_offset:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: flat_load_u16 v2, v[0:1] offset:16 glc
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-NEXT: flat_store_b16 v[0:1], v2
+; GFX11-NEXT: s_endpgm
entry:
%gep = getelementptr i16, ptr %in, i64 8
%val = load atomic i16, ptr %gep seq_cst, align 2
@@ -8477,32 +8416,18 @@ define amdgpu_kernel void @atomic_load_i16(ptr %in, ptr %out) {
; GCN3-NEXT: flat_store_short v[0:1], v2
; GCN3-NEXT: s_endpgm
;
-; GFX11-TRUE16-LABEL: atomic_load_i16:
-; GFX11-TRUE16: ; %bb.0: ; %entry
-; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s3
-; GFX11-TRUE16-NEXT: flat_load_d16_b16 v0, v[0:1] glc
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: buffer_gl1_inv
-; GFX11-TRUE16-NEXT: buffer_gl0_inv
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, s2
-; GFX11-TRUE16-NEXT: flat_store_b16 v[1:2], v0
-; GFX11-TRUE16-NEXT: s_endpgm
-;
-; GFX11-FAKE16-LABEL: atomic_load_i16:
-; GFX11-FAKE16: ; %bb.0: ; %entry
-; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-FAKE16-NEXT: flat_load_u16 v2, v[0:1] glc
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-FAKE16-NEXT: buffer_gl1_inv
-; GFX11-FAKE16-NEXT: buffer_gl0_inv
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-FAKE16-NEXT: flat_store_b16 v[0:1], v2
-; GFX11-FAKE16-NEXT: s_endpgm
+; GFX11-LABEL: atomic_load_i16:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: flat_load_u16 v2, v[0:1] glc
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-NEXT: flat_store_b16 v[0:1], v2
+; GFX11-NEXT: s_endpgm
entry:
%val = load atomic i16, ptr %in seq_cst, align 2
store i16 %val, ptr %out
@@ -8568,44 +8493,24 @@ define amdgpu_kernel void @atomic_load_i16_addr64_offset(ptr %in, ptr %out, i64
; GCN3-NEXT: flat_store_short v[0:1], v2
; GCN3-NEXT: s_endpgm
;
-; GFX11-TRUE16-LABEL: atomic_load_i16_addr64_offset:
-; GFX11-TRUE16: ; %bb.0: ; %entry
-; GFX11-TRUE16-NEXT: s_clause 0x1
-; GFX11-TRUE16-NEXT: s_load_b64 s[6:7], s[4:5], 0x34
-; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT: s_lshl_b64 s[4:5], s[6:7], 1
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s3
-; GFX11-TRUE16-NEXT: s_add_u32 s0, s0, s4
-; GFX11-TRUE16-NEXT: s_addc_u32 s1, s1, s5
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-TRUE16-NEXT: flat_load_d16_b16 v0, v[0:1] offset:16 glc
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: buffer_gl1_inv
-; GFX11-TRUE16-NEXT: buffer_gl0_inv
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, s2
-; GFX11-TRUE16-NEXT: flat_store_b16 v[1:2], v0
-; GFX11-TRUE16-NEXT: s_endpgm
-;
-; GFX11-FAKE16-LABEL: atomic_load_i16_addr64_offset:
-; GFX11-FAKE16: ; %bb.0: ; %entry
-; GFX11-FAKE16-NEXT: s_clause 0x1
-; GFX11-FAKE16-NEXT: s_load_b64 s[6:7], s[4:5], 0x34
-; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-FAKE16-NEXT: s_lshl_b64 s[4:5], s[6:7], 1
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX11-FAKE16-NEXT: s_add_u32 s0, s0, s4
-; GFX11-FAKE16-NEXT: s_addc_u32 s1, s1, s5
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-FAKE16-NEXT: flat_load_u16 v2, v[0:1] offset:16 glc
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-FAKE16-NEXT: buffer_gl1_inv
-; GFX11-FAKE16-NEXT: buffer_gl0_inv
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-FAKE16-NEXT: flat_store_b16 v[0:1], v2
-; GFX11-FAKE16-NEXT: s_endpgm
+; GFX11-LABEL: atomic_load_i16_addr64_offset:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x34
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_lshl_b64 s[4:5], s[6:7], 1
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_add_u32 s0, s0, s4
+; GFX11-NEXT: s_addc_u32 s1, s1, s5
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: flat_load_u16 v2, v[0:1] offset:16 glc
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-NEXT: flat_store_b16 v[0:1], v2
+; GFX11-NEXT: s_endpgm
entry:
%ptr = getelementptr i16, ptr %in, i64 %index
%gep = getelementptr i16, ptr %ptr, i64 8
@@ -10593,32 +10498,18 @@ define amdgpu_kernel void @atomic_load_f16_offset(ptr %in, ptr %out) {
; GCN3-NEXT: flat_store_short v[0:1], v2
; GCN3-NEXT: s_endpgm
;
-; GFX11-TRUE16-LABEL: atomic_load_f16_offset:
-; GFX11-TRUE16: ; %bb.0:
-; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s3
-; GFX11-TRUE16-NEXT: flat_load_d16_b16 v0, v[0:1] offset:16 glc
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: buffer_gl1_inv
-; GFX11-TRUE16-NEXT: buffer_gl0_inv
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, s2
-; GFX11-TRUE16-NEXT: flat_store_b16 v[1:2], v0
-; GFX11-TRUE16-NEXT: s_endpgm
-;
-; GFX11-FAKE16-LABEL: atomic_load_f16_offset:
-; GFX11-FAKE16: ; %bb.0:
-; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-FAKE16-NEXT: flat_load_u16 v2, v[0:1] offset:16 glc
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-FAKE16-NEXT: buffer_gl1_inv
-; GFX11-FAKE16-NEXT: buffer_gl0_inv
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-FAKE16-NEXT: flat_store_b16 v[0:1], v2
-; GFX11-FAKE16-NEXT: s_endpgm
+; GFX11-LABEL: atomic_load_f16_offset:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: flat_load_u16 v2, v[0:1] offset:16 glc
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-NEXT: flat_store_b16 v[0:1], v2
+; GFX11-NEXT: s_endpgm
%gep = getelementptr half, ptr %in, i64 8
%val = load atomic half, ptr %gep seq_cst, align 2
store half %val, ptr %out
@@ -10668,32 +10559,18 @@ define amdgpu_kernel void @atomic_load_f16(ptr %in, ptr %out) {
; GCN3-NEXT: flat_store_short v[0:1], v2
; GCN3-NEXT: s_endpgm
;
-; GFX11-TRUE16-LABEL: atomic_load_f16:
-; GFX11-TRUE16: ; %bb.0:
-; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s3
-; GFX11-TRUE16-NEXT: flat_load_d16_b16 v0, v[0:1] glc
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: buffer_gl1_inv
-; GFX11-TRUE16-NEXT: buffer_gl0_inv
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, s2
-; GFX11-TRUE16-NEXT: flat_store_b16 v[1:2], v0
-; GFX11-TRUE16-NEXT: s_endpgm
-;
-; GFX11-FAKE16-LABEL: atomic_load_f16:
-; GFX11-FAKE16: ; %bb.0:
-; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-FAKE16-NEXT: flat_load_u16 v2, v[0:1] glc
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-FAKE16-NEXT: buffer_gl1_inv
-; GFX11-FAKE16-NEXT: buffer_gl0_inv
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-FAKE16-NEXT: flat_store_b16 v[0:1], v2
-; GFX11-FAKE16-NEXT: s_endpgm
+; GFX11-LABEL: atomic_load_f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: flat_load_u16 v2, v[0:1] glc
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-NEXT: flat_store_b16 v[0:1], v2
+; GFX11-NEXT: s_endpgm
%val = load atomic half, ptr %in seq_cst, align 2
store half %val, ptr %out
ret void
@@ -10746,32 +10623,18 @@ define amdgpu_kernel void @atomic_load_bf16_offset(ptr %in, ptr %out) {
; GCN3-NEXT: flat_store_short v[0:1], v2
; GCN3-NEXT: s_endpgm
;
-; GFX11-TRUE16-LABEL: atomic_load_bf16_offset:
-; GFX11-TRUE16: ; %bb.0:
-; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s3
-; GFX11-TRUE16-NEXT: flat_load_d16_b16 v0, v[0:1] offset:16 glc
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: buffer_gl1_inv
-; GFX11-TRUE16-NEXT: buffer_gl0_inv
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, s2
-; GFX11-TRUE16-NEXT: flat_store_b16 v[1:2], v0
-; GFX11-TRUE16-NEXT: s_endpgm
-;
-; GFX11-FAKE16-LABEL: atomic_load_bf16_offset:
-; GFX11-FAKE16: ; %bb.0:
-; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-FAKE16-NEXT: flat_load_u16 v2, v[0:1] offset:16 glc
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-FAKE16-NEXT: buffer_gl1_inv
-; GFX11-FAKE16-NEXT: buffer_gl0_inv
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-FAKE16-NEXT: flat_store_b16 v[0:1], v2
-; GFX11-FAKE16-NEXT: s_endpgm
+; GFX11-LABEL: atomic_load_bf16_offset:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: flat_load_u16 v2, v[0:1] offset:16 glc
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-NEXT: flat_store_b16 v[0:1], v2
+; GFX11-NEXT: s_endpgm
%gep = getelementptr bfloat, ptr %in, i64 8
%val = load atomic bfloat, ptr %gep seq_cst, align 2
store bfloat %val, ptr %out
@@ -10821,32 +10684,18 @@ define amdgpu_kernel void @atomic_load_bf16(ptr %in, ptr %out) {
; GCN3-NEXT: flat_store_short v[0:1], v2
; GCN3-NEXT: s_endpgm
;
-; GFX11-TRUE16-LABEL: atomic_load_bf16:
-; GFX11-TRUE16: ; %bb.0:
-; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s3
-; GFX11-TRUE16-NEXT: flat_load_d16_b16 v0, v[0:1] glc
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: buffer_gl1_inv
-; GFX11-TRUE16-NEXT: buffer_gl0_inv
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, s2
-; GFX11-TRUE16-NEXT: flat_store_b16 v[1:2], v0
-; GFX11-TRUE16-NEXT: s_endpgm
-;
-; GFX11-FAKE16-LABEL: atomic_load_bf16:
-; GFX11-FAKE16: ; %bb.0:
-; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-FAKE16-NEXT: flat_load_u16 v2, v[0:1] glc
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-FAKE16-NEXT: buffer_gl1_inv
-; GFX11-FAKE16-NEXT: buffer_gl0_inv
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-FAKE16-NEXT: flat_store_b16 v[0:1], v2
-; GFX11-FAKE16-NEXT: s_endpgm
+; GFX11-LABEL: atomic_load_bf16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: flat_load_u16 v2, v[0:1] glc
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-NEXT: flat_store_b16 v[0:1], v2
+; GFX11-NEXT: s_endpgm
%val = load atomic bfloat, ptr %in seq_cst, align 2
store bfloat %val, ptr %out
ret void
diff --git a/llvm/test/CodeGen/AMDGPU/fma.f16.gfx11plus.ll b/llvm/test/CodeGen/AMDGPU/fma.f16.gfx11plus.ll
index 5f86f2e48137d..38f85342aba69 100644
--- a/llvm/test/CodeGen/AMDGPU/fma.f16.gfx11plus.ll
+++ b/llvm/test/CodeGen/AMDGPU/fma.f16.gfx11plus.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -stop-after=amdgpu-isel -mattr=+real-true16 < %s | FileCheck %s -check-prefixes=GFX11
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -stop-after=amdgpu-isel -mattr=+real-true16,+real-true-d16 < %s | FileCheck %s -check-prefixes=GFX11
@const_half = internal constant half 1.0
diff --git a/llvm/test/CodeGen/AMDGPU/fmaximum.ll b/llvm/test/CodeGen/AMDGPU/fmaximum.ll
index e59fbada6793d..c42e7e5b5fc3a 100644
--- a/llvm/test/CodeGen/AMDGPU/fmaximum.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmaximum.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX12-SDAG,GFX12-SDAG-TRUE16 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX12-SDAG,GFX12-SDAG-FAKE16 %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX12-GISEL,GFX12-GISEL-TRUE16 %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16,+real-true-d16 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX12-GISEL,GFX12-GISEL-TRUE16 %s
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX12-GISEL,GFX12-GISEL-FAKE16 %s
define amdgpu_ps float @test_fmaximum_f32_vv(float %a, float %b) {
@@ -312,11 +312,11 @@ define amdgpu_kernel void @fmaximum_f16_move_to_valu(ptr addrspace(1) %out, ptr
; GFX12-SDAG-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
; GFX12-SDAG-TRUE16-NEXT: v_mov_b32_e32 v1, 0
; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[2:3] scope:SCOPE_SYS
+; GFX12-SDAG-TRUE16-NEXT: global_load_u16 v0, v1, s[2:3] scope:SCOPE_SYS
; GFX12-SDAG-TRUE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: global_load_d16_hi_b16 v0, v1, s[4:5] scope:SCOPE_SYS
+; GFX12-SDAG-TRUE16-NEXT: global_load_u16 v2, v1, s[4:5] scope:SCOPE_SYS
; GFX12-SDAG-TRUE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: v_maximum_f16 v0.l, v0.l, v0.h
+; GFX12-SDAG-TRUE16-NEXT: v_maximum_f16 v0.l, v0.l, v2.l
; GFX12-SDAG-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
; GFX12-SDAG-TRUE16-NEXT: s_endpgm
;
diff --git a/llvm/test/CodeGen/AMDGPU/fmed3.ll b/llvm/test/CodeGen/AMDGPU/fmed3.ll
index 9233f8059a202..a173c820c8c6d 100644
--- a/llvm/test/CodeGen/AMDGPU/fmed3.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmed3.ll
@@ -8,7 +8,7 @@
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 -mattr=-real-true16 < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11-SDAG,GFX11-SDAG-FAKE16 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 -mattr=-real-true16 < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11-GISEL,GFX11-GISEL-FAKE16 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 -mattr=+real-true16 < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11-SDAG,GFX11-SDAG-TRUE16 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 -mattr=+real-true16 < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11-GISEL,GFX11-GISEL-TRUE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 -mattr=+real-true16,+real-true-d16 < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11-GISEL,GFX11-GISEL-TRUE16 %s
define amdgpu_kernel void @v_test_nnan_input_fmed3_r_i_i_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #1 {
; SI-SDAG-LABEL: v_test_nnan_input_fmed3_r_i_i_f32:
@@ -7568,7 +7568,7 @@ define amdgpu_kernel void @v_test_nnan_input_fmed3_r_i_i_f16(ptr addrspace(1) %o
; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 1, v0
; GFX11-SDAG-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[2:3]
+; GFX11-SDAG-TRUE16-NEXT: global_load_u16 v0, v1, s[2:3]
; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-SDAG-TRUE16-NEXT: v_add_f16_e32 v0.l, 1.0, v0.l
; GFX11-SDAG-TRUE16-NEXT: v_med3_f16 v0.l, v0.l, 2.0, 4.0
@@ -7816,15 +7816,15 @@ define amdgpu_kernel void @v_nnan_inputs_med3_f16_pat0(ptr addrspace(1) %out, pt
; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 1, v0
; GFX11-SDAG-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: global_load_d16_b16 v0, v2, s[2:3] glc dlc
+; GFX11-SDAG-TRUE16-NEXT: global_load_u16 v0, v2, s[2:3] glc dlc
; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: global_load_d16_hi_b16 v0, v2, s[4:5] glc dlc
+; GFX11-SDAG-TRUE16-NEXT: global_load_u16 v1, v2, s[4:5] glc dlc
; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: global_load_d16_b16 v1, v2, s[6:7] glc dlc
+; GFX11-SDAG-TRUE16-NEXT: global_load_u16 v3, v2, s[6:7] glc dlc
; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-SDAG-TRUE16-NEXT: v_add_f16_e32 v0.l, 1.0, v0.l
-; GFX11-SDAG-TRUE16-NEXT: v_add_f16_e32 v0.h, 2.0, v0.h
-; GFX11-SDAG-TRUE16-NEXT: v_add_f16_e32 v1.l, 4.0, v1.l
+; GFX11-SDAG-TRUE16-NEXT: v_add_f16_e32 v0.h, 2.0, v1.l
+; GFX11-SDAG-TRUE16-NEXT: v_add_f16_e32 v1.l, 4.0, v3.l
; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-SDAG-TRUE16-NEXT: v_med3_f16 v0.l, v0.l, v0.h, v1.l
; GFX11-SDAG-TRUE16-NEXT: global_store_b16 v2, v0, s[0:1]
diff --git a/llvm/test/CodeGen/AMDGPU/fminimum.ll b/llvm/test/CodeGen/AMDGPU/fminimum.ll
index b25120f2ece6f..8e5dcc550cccb 100644
--- a/llvm/test/CodeGen/AMDGPU/fminimum.ll
+++ b/llvm/test/CodeGen/AMDGPU/fminimum.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX12-SDAG,GFX12-SDAG-TRUE16 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX12-SDAG,GFX12-SDAG-FAKE16 %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX12-GISEL,GFX12-GISEL-TRUE16 %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16,+real-true-d16 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX12-GISEL,GFX12-GISEL-TRUE16 %s
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX12-GISEL,GFX12-GISEL-FAKE16 %s
define amdgpu_ps float @test_fminimum_f32_vv(float %a, float %b) {
@@ -312,11 +312,11 @@ define amdgpu_kernel void @fminimum_f16_move_to_valu(ptr addrspace(1) %out, ptr
; GFX12-SDAG-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
; GFX12-SDAG-TRUE16-NEXT: v_mov_b32_e32 v1, 0
; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[2:3] scope:SCOPE_SYS
+; GFX12-SDAG-TRUE16-NEXT: global_load_u16 v0, v1, s[2:3] scope:SCOPE_SYS
; GFX12-SDAG-TRUE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: global_load_d16_hi_b16 v0, v1, s[4:5] scope:SCOPE_SYS
+; GFX12-SDAG-TRUE16-NEXT: global_load_u16 v2, v1, s[4:5] scope:SCOPE_SYS
; GFX12-SDAG-TRUE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: v_minimum_f16 v0.l, v0.l, v0.h
+; GFX12-SDAG-TRUE16-NEXT: v_minimum_f16 v0.l, v0.l, v2.l
; GFX12-SDAG-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
; GFX12-SDAG-TRUE16-NEXT: s_endpgm
;
diff --git a/llvm/test/CodeGen/AMDGPU/fmuladd.f16.ll b/llvm/test/CodeGen/AMDGPU/fmuladd.f16.ll
index 51b6d17312ed7..d258329128994 100644
--- a/llvm/test/CodeGen/AMDGPU/fmuladd.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmuladd.f16.ll
@@ -97,18 +97,18 @@ define amdgpu_kernel void @fmuladd_f16(ptr addrspace(1) %out, ptr addrspace(1) %
; GFX11-FLUSH-TRUE16-LABEL: fmuladd_f16:
; GFX11-FLUSH-TRUE16: ; %bb.0:
; GFX11-FLUSH-TRUE16-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
-; GFX11-FLUSH-TRUE16-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-FLUSH-TRUE16-NEXT: v_mov_b32_e32 v1, 0
; GFX11-FLUSH-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FLUSH-TRUE16-NEXT: s_clause 0x2
-; GFX11-FLUSH-TRUE16-NEXT: global_load_d16_b16 v0, v2, s[2:3]
-; GFX11-FLUSH-TRUE16-NEXT: global_load_d16_hi_b16 v0, v2, s[4:5]
-; GFX11-FLUSH-TRUE16-NEXT: global_load_d16_b16 v1, v2, s[6:7]
+; GFX11-FLUSH-TRUE16-NEXT: global_load_u16 v0, v1, s[2:3]
+; GFX11-FLUSH-TRUE16-NEXT: global_load_u16 v2, v1, s[4:5]
+; GFX11-FLUSH-TRUE16-NEXT: global_load_u16 v3, v1, s[6:7]
; GFX11-FLUSH-TRUE16-NEXT: s_waitcnt vmcnt(1)
-; GFX11-FLUSH-TRUE16-NEXT: v_mul_f16_e32 v0.l, v0.l, v0.h
+; GFX11-FLUSH-TRUE16-NEXT: v_mul_f16_e32 v0.l, v0.l, v2.l
; GFX11-FLUSH-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-FLUSH-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FLUSH-TRUE16-NEXT: v_add_f16_e32 v0.l, v0.l, v1.l
-; GFX11-FLUSH-TRUE16-NEXT: global_store_b16 v2, v0, s[0:1]
+; GFX11-FLUSH-TRUE16-NEXT: v_add_f16_e32 v0.l, v0.l, v3.l
+; GFX11-FLUSH-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
; GFX11-FLUSH-TRUE16-NEXT: s_endpgm
;
; GFX11-FLUSH-FAKE16-LABEL: fmuladd_f16:
@@ -131,15 +131,15 @@ define amdgpu_kernel void @fmuladd_f16(ptr addrspace(1) %out, ptr addrspace(1) %
; GFX11-DENORM-STRICT-TRUE16-LABEL: fmuladd_f16:
; GFX11-DENORM-STRICT-TRUE16: ; %bb.0:
; GFX11-DENORM-STRICT-TRUE16-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
-; GFX11-DENORM-STRICT-TRUE16-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-DENORM-STRICT-TRUE16-NEXT: v_mov_b32_e32 v1, 0
; GFX11-DENORM-STRICT-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DENORM-STRICT-TRUE16-NEXT: s_clause 0x2
-; GFX11-DENORM-STRICT-TRUE16-NEXT: global_load_d16_b16 v0, v2, s[2:3]
-; GFX11-DENORM-STRICT-TRUE16-NEXT: global_load_d16_hi_b16 v0, v2, s[4:5]
-; GFX11-DENORM-STRICT-TRUE16-NEXT: global_load_d16_b16 v1, v2, s[6:7]
+; GFX11-DENORM-STRICT-TRUE16-NEXT: global_load_u16 v0, v1, s[6:7]
+; GFX11-DENORM-STRICT-TRUE16-NEXT: global_load_u16 v2, v1, s[2:3]
+; GFX11-DENORM-STRICT-TRUE16-NEXT: global_load_u16 v3, v1, s[4:5]
; GFX11-DENORM-STRICT-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-DENORM-STRICT-TRUE16-NEXT: v_fmac_f16_e32 v1.l, v0.l, v0.h
-; GFX11-DENORM-STRICT-TRUE16-NEXT: global_store_b16 v2, v1, s[0:1]
+; GFX11-DENORM-STRICT-TRUE16-NEXT: v_fmac_f16_e32 v0.l, v2.l, v3.l
+; GFX11-DENORM-STRICT-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
; GFX11-DENORM-STRICT-TRUE16-NEXT: s_endpgm
;
; GFX11-DENORM-STRICT-FAKE16-LABEL: fmuladd_f16:
@@ -159,15 +159,15 @@ define amdgpu_kernel void @fmuladd_f16(ptr addrspace(1) %out, ptr addrspace(1) %
; GFX11-DENORM-CONTRACT-TRUE16-LABEL: fmuladd_f16:
; GFX11-DENORM-CONTRACT-TRUE16: ; %bb.0:
; GFX11-DENORM-CONTRACT-TRUE16-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
-; GFX11-DENORM-CONTRACT-TRUE16-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: v_mov_b32_e32 v1, 0
; GFX11-DENORM-CONTRACT-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DENORM-CONTRACT-TRUE16-NEXT: s_clause 0x2
-; GFX11-DENORM-CONTRACT-TRUE16-NEXT: global_load_d16_b16 v0, v2, s[2:3]
-; GFX11-DENORM-CONTRACT-TRUE16-NEXT: global_load_d16_hi_b16 v0, v2, s[4:5]
-; GFX11-DENORM-CONTRACT-TRUE16-NEXT: global_load_d16_b16 v1, v2, s[6:7]
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: global_load_u16 v0, v1, s[6:7]
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: global_load_u16 v2, v1, s[2:3]
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: global_load_u16 v3, v1, s[4:5]
; GFX11-DENORM-CONTRACT-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-DENORM-CONTRACT-TRUE16-NEXT: v_fmac_f16_e32 v1.l, v0.l, v0.h
-; GFX11-DENORM-CONTRACT-TRUE16-NEXT: global_store_b16 v2, v1, s[0:1]
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: v_fmac_f16_e32 v0.l, v2.l, v3.l
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
; GFX11-DENORM-CONTRACT-TRUE16-NEXT: s_endpgm
;
; GFX11-DENORM-CONTRACT-FAKE16-LABEL: fmuladd_f16:
@@ -282,18 +282,18 @@ define amdgpu_kernel void @fmul_fadd_f16(ptr addrspace(1) %out, ptr addrspace(1)
; GFX11-FLUSH-TRUE16-LABEL: fmul_fadd_f16:
; GFX11-FLUSH-TRUE16: ; %bb.0:
; GFX11-FLUSH-TRUE16-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
-; GFX11-FLUSH-TRUE16-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-FLUSH-TRUE16-NEXT: v_mov_b32_e32 v1, 0
; GFX11-FLUSH-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FLUSH-TRUE16-NEXT: s_clause 0x2
-; GFX11-FLUSH-TRUE16-NEXT: global_load_d16_b16 v0, v2, s[2:3]
-; GFX11-FLUSH-TRUE16-NEXT: global_load_d16_hi_b16 v0, v2, s[4:5]
-; GFX11-FLUSH-TRUE16-NEXT: global_load_d16_b16 v1, v2, s[6:7]
+; GFX11-FLUSH-TRUE16-NEXT: global_load_u16 v0, v1, s[2:3]
+; GFX11-FLUSH-TRUE16-NEXT: global_load_u16 v2, v1, s[4:5]
+; GFX11-FLUSH-TRUE16-NEXT: global_load_u16 v3, v1, s[6:7]
; GFX11-FLUSH-TRUE16-NEXT: s_waitcnt vmcnt(1)
-; GFX11-FLUSH-TRUE16-NEXT: v_mul_f16_e32 v0.l, v0.l, v0.h
+; GFX11-FLUSH-TRUE16-NEXT: v_mul_f16_e32 v0.l, v0.l, v2.l
; GFX11-FLUSH-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-FLUSH-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FLUSH-TRUE16-NEXT: v_add_f16_e32 v0.l, v0.l, v1.l
-; GFX11-FLUSH-TRUE16-NEXT: global_store_b16 v2, v0, s[0:1]
+; GFX11-FLUSH-TRUE16-NEXT: v_add_f16_e32 v0.l, v0.l, v3.l
+; GFX11-FLUSH-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
; GFX11-FLUSH-TRUE16-NEXT: s_endpgm
;
; GFX11-FLUSH-FAKE16-LABEL: fmul_fadd_f16:
@@ -316,18 +316,18 @@ define amdgpu_kernel void @fmul_fadd_f16(ptr addrspace(1) %out, ptr addrspace(1)
; GFX11-DENORM-STRICT-TRUE16-LABEL: fmul_fadd_f16:
; GFX11-DENORM-STRICT-TRUE16: ; %bb.0:
; GFX11-DENORM-STRICT-TRUE16-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
-; GFX11-DENORM-STRICT-TRUE16-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-DENORM-STRICT-TRUE16-NEXT: v_mov_b32_e32 v1, 0
; GFX11-DENORM-STRICT-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DENORM-STRICT-TRUE16-NEXT: s_clause 0x2
-; GFX11-DENORM-STRICT-TRUE16-NEXT: global_load_d16_b16 v0, v2, s[2:3]
-; GFX11-DENORM-STRICT-TRUE16-NEXT: global_load_d16_hi_b16 v0, v2, s[4:5]
-; GFX11-DENORM-STRICT-TRUE16-NEXT: global_load_d16_b16 v1, v2, s[6:7]
+; GFX11-DENORM-STRICT-TRUE16-NEXT: global_load_u16 v0, v1, s[2:3]
+; GFX11-DENORM-STRICT-TRUE16-NEXT: global_load_u16 v2, v1, s[4:5]
+; GFX11-DENORM-STRICT-TRUE16-NEXT: global_load_u16 v3, v1, s[6:7]
; GFX11-DENORM-STRICT-TRUE16-NEXT: s_waitcnt vmcnt(1)
-; GFX11-DENORM-STRICT-TRUE16-NEXT: v_mul_f16_e32 v0.l, v0.l, v0.h
+; GFX11-DENORM-STRICT-TRUE16-NEXT: v_mul_f16_e32 v0.l, v0.l, v2.l
; GFX11-DENORM-STRICT-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-DENORM-STRICT-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-DENORM-STRICT-TRUE16-NEXT: v_add_f16_e32 v0.l, v0.l, v1.l
-; GFX11-DENORM-STRICT-TRUE16-NEXT: global_store_b16 v2, v0, s[0:1]
+; GFX11-DENORM-STRICT-TRUE16-NEXT: v_add_f16_e32 v0.l, v0.l, v3.l
+; GFX11-DENORM-STRICT-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
; GFX11-DENORM-STRICT-TRUE16-NEXT: s_endpgm
;
; GFX11-DENORM-STRICT-FAKE16-LABEL: fmul_fadd_f16:
@@ -350,15 +350,15 @@ define amdgpu_kernel void @fmul_fadd_f16(ptr addrspace(1) %out, ptr addrspace(1)
; GFX11-DENORM-CONTRACT-TRUE16-LABEL: fmul_fadd_f16:
; GFX11-DENORM-CONTRACT-TRUE16: ; %bb.0:
; GFX11-DENORM-CONTRACT-TRUE16-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
-; GFX11-DENORM-CONTRACT-TRUE16-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: v_mov_b32_e32 v1, 0
; GFX11-DENORM-CONTRACT-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DENORM-CONTRACT-TRUE16-NEXT: s_clause 0x2
-; GFX11-DENORM-CONTRACT-TRUE16-NEXT: global_load_d16_b16 v0, v2, s[2:3]
-; GFX11-DENORM-CONTRACT-TRUE16-NEXT: global_load_d16_hi_b16 v0, v2, s[4:5]
-; GFX11-DENORM-CONTRACT-TRUE16-NEXT: global_load_d16_b16 v1, v2, s[6:7]
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: global_load_u16 v0, v1, s[6:7]
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: global_load_u16 v2, v1, s[2:3]
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: global_load_u16 v3, v1, s[4:5]
; GFX11-DENORM-CONTRACT-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-DENORM-CONTRACT-TRUE16-NEXT: v_fmac_f16_e32 v1.l, v0.l, v0.h
-; GFX11-DENORM-CONTRACT-TRUE16-NEXT: global_store_b16 v2, v1, s[0:1]
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: v_fmac_f16_e32 v0.l, v2.l, v3.l
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
; GFX11-DENORM-CONTRACT-TRUE16-NEXT: s_endpgm
;
; GFX11-DENORM-CONTRACT-FAKE16-LABEL: fmul_fadd_f16:
@@ -458,18 +458,18 @@ define amdgpu_kernel void @fmul_fadd_contract_f16(ptr addrspace(1) %out, ptr add
; GFX11-FLUSH-TRUE16-LABEL: fmul_fadd_contract_f16:
; GFX11-FLUSH-TRUE16: ; %bb.0:
; GFX11-FLUSH-TRUE16-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
-; GFX11-FLUSH-TRUE16-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-FLUSH-TRUE16-NEXT: v_mov_b32_e32 v1, 0
; GFX11-FLUSH-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FLUSH-TRUE16-NEXT: s_clause 0x2
-; GFX11-FLUSH-TRUE16-NEXT: global_load_d16_b16 v0, v2, s[2:3]
-; GFX11-FLUSH-TRUE16-NEXT: global_load_d16_hi_b16 v0, v2, s[4:5]
-; GFX11-FLUSH-TRUE16-NEXT: global_load_d16_b16 v1, v2, s[6:7]
+; GFX11-FLUSH-TRUE16-NEXT: global_load_u16 v0, v1, s[2:3]
+; GFX11-FLUSH-TRUE16-NEXT: global_load_u16 v2, v1, s[4:5]
+; GFX11-FLUSH-TRUE16-NEXT: global_load_u16 v3, v1, s[6:7]
; GFX11-FLUSH-TRUE16-NEXT: s_waitcnt vmcnt(1)
-; GFX11-FLUSH-TRUE16-NEXT: v_mul_f16_e32 v0.l, v0.l, v0.h
+; GFX11-FLUSH-TRUE16-NEXT: v_mul_f16_e32 v0.l, v0.l, v2.l
; GFX11-FLUSH-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-FLUSH-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FLUSH-TRUE16-NEXT: v_add_f16_e32 v0.l, v0.l, v1.l
-; GFX11-FLUSH-TRUE16-NEXT: global_store_b16 v2, v0, s[0:1]
+; GFX11-FLUSH-TRUE16-NEXT: v_add_f16_e32 v0.l, v0.l, v3.l
+; GFX11-FLUSH-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
; GFX11-FLUSH-TRUE16-NEXT: s_endpgm
;
; GFX11-FLUSH-FAKE16-LABEL: fmul_fadd_contract_f16:
@@ -492,15 +492,15 @@ define amdgpu_kernel void @fmul_fadd_contract_f16(ptr addrspace(1) %out, ptr add
; GFX11-DENORM-STRICT-TRUE16-LABEL: fmul_fadd_contract_f16:
; GFX11-DENORM-STRICT-TRUE16: ; %bb.0:
; GFX11-DENORM-STRICT-TRUE16-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
-; GFX11-DENORM-STRICT-TRUE16-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-DENORM-STRICT-TRUE16-NEXT: v_mov_b32_e32 v1, 0
; GFX11-DENORM-STRICT-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DENORM-STRICT-TRUE16-NEXT: s_clause 0x2
-; GFX11-DENORM-STRICT-TRUE16-NEXT: global_load_d16_b16 v0, v2, s[2:3]
-; GFX11-DENORM-STRICT-TRUE16-NEXT: global_load_d16_hi_b16 v0, v2, s[4:5]
-; GFX11-DENORM-STRICT-TRUE16-NEXT: global_load_d16_b16 v1, v2, s[6:7]
+; GFX11-DENORM-STRICT-TRUE16-NEXT: global_load_u16 v0, v1, s[6:7]
+; GFX11-DENORM-STRICT-TRUE16-NEXT: global_load_u16 v2, v1, s[2:3]
+; GFX11-DENORM-STRICT-TRUE16-NEXT: global_load_u16 v3, v1, s[4:5]
; GFX11-DENORM-STRICT-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-DENORM-STRICT-TRUE16-NEXT: v_fmac_f16_e32 v1.l, v0.l, v0.h
-; GFX11-DENORM-STRICT-TRUE16-NEXT: global_store_b16 v2, v1, s[0:1]
+; GFX11-DENORM-STRICT-TRUE16-NEXT: v_fmac_f16_e32 v0.l, v2.l, v3.l
+; GFX11-DENORM-STRICT-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
; GFX11-DENORM-STRICT-TRUE16-NEXT: s_endpgm
;
; GFX11-DENORM-STRICT-FAKE16-LABEL: fmul_fadd_contract_f16:
@@ -520,15 +520,15 @@ define amdgpu_kernel void @fmul_fadd_contract_f16(ptr addrspace(1) %out, ptr add
; GFX11-DENORM-CONTRACT-TRUE16-LABEL: fmul_fadd_contract_f16:
; GFX11-DENORM-CONTRACT-TRUE16: ; %bb.0:
; GFX11-DENORM-CONTRACT-TRUE16-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
-; GFX11-DENORM-CONTRACT-TRUE16-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: v_mov_b32_e32 v1, 0
; GFX11-DENORM-CONTRACT-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DENORM-CONTRACT-TRUE16-NEXT: s_clause 0x2
-; GFX11-DENORM-CONTRACT-TRUE16-NEXT: global_load_d16_b16 v0, v2, s[2:3]
-; GFX11-DENORM-CONTRACT-TRUE16-NEXT: global_load_d16_hi_b16 v0, v2, s[4:5]
-; GFX11-DENORM-CONTRACT-TRUE16-NEXT: global_load_d16_b16 v1, v2, s[6:7]
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: global_load_u16 v0, v1, s[6:7]
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: global_load_u16 v2, v1, s[2:3]
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: global_load_u16 v3, v1, s[4:5]
; GFX11-DENORM-CONTRACT-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-DENORM-CONTRACT-TRUE16-NEXT: v_fmac_f16_e32 v1.l, v0.l, v0.h
-; GFX11-DENORM-CONTRACT-TRUE16-NEXT: global_store_b16 v2, v1, s[0:1]
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: v_fmac_f16_e32 v0.l, v2.l, v3.l
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
; GFX11-DENORM-CONTRACT-TRUE16-NEXT: s_endpgm
;
; GFX11-DENORM-CONTRACT-FAKE16-LABEL: fmul_fadd_contract_f16:
@@ -625,13 +625,13 @@ define amdgpu_kernel void @fmuladd_2.0_a_b_f16(ptr addrspace(1) %out, ptr addrsp
; GFX11-FLUSH-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-FLUSH-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 1, v0
; GFX11-FLUSH-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-FLUSH-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[0:1] glc dlc
+; GFX11-FLUSH-TRUE16-NEXT: global_load_u16 v0, v1, s[0:1] glc dlc
; GFX11-FLUSH-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FLUSH-TRUE16-NEXT: global_load_d16_hi_b16 v0, v1, s[0:1] offset:2 glc dlc
+; GFX11-FLUSH-TRUE16-NEXT: global_load_u16 v2, v1, s[0:1] offset:2 glc dlc
; GFX11-FLUSH-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-FLUSH-TRUE16-NEXT: v_add_f16_e32 v0.l, v0.l, v0.l
; GFX11-FLUSH-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FLUSH-TRUE16-NEXT: v_add_f16_e32 v0.l, v0.l, v0.h
+; GFX11-FLUSH-TRUE16-NEXT: v_add_f16_e32 v0.l, v0.l, v2.l
; GFX11-FLUSH-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
; GFX11-FLUSH-TRUE16-NEXT: s_endpgm
;
@@ -657,14 +657,14 @@ define amdgpu_kernel void @fmuladd_2.0_a_b_f16(ptr addrspace(1) %out, ptr addrsp
; GFX11-DENORM-STRICT-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX11-DENORM-STRICT-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-DENORM-STRICT-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-DENORM-STRICT-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 1, v0
+; GFX11-DENORM-STRICT-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX11-DENORM-STRICT-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-DENORM-STRICT-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[0:1] glc dlc
+; GFX11-DENORM-STRICT-TRUE16-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc
; GFX11-DENORM-STRICT-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-DENORM-STRICT-TRUE16-NEXT: global_load_d16_hi_b16 v0, v1, s[0:1] offset:2 glc dlc
+; GFX11-DENORM-STRICT-TRUE16-NEXT: global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
; GFX11-DENORM-STRICT-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-DENORM-STRICT-TRUE16-NEXT: v_fmac_f16_e32 v0.h, 2.0, v0.l
-; GFX11-DENORM-STRICT-TRUE16-NEXT: global_store_d16_hi_b16 v1, v0, s[0:1]
+; GFX11-DENORM-STRICT-TRUE16-NEXT: v_fmac_f16_e32 v2.l, 2.0, v1.l
+; GFX11-DENORM-STRICT-TRUE16-NEXT: global_store_b16 v0, v2, s[0:1]
; GFX11-DENORM-STRICT-TRUE16-NEXT: s_endpgm
;
; GFX11-DENORM-STRICT-FAKE16-LABEL: fmuladd_2.0_a_b_f16:
@@ -687,14 +687,14 @@ define amdgpu_kernel void @fmuladd_2.0_a_b_f16(ptr addrspace(1) %out, ptr addrsp
; GFX11-DENORM-CONTRACT-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX11-DENORM-CONTRACT-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-DENORM-CONTRACT-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-DENORM-CONTRACT-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 1, v0
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX11-DENORM-CONTRACT-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-DENORM-CONTRACT-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[0:1] glc dlc
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc
; GFX11-DENORM-CONTRACT-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-DENORM-CONTRACT-TRUE16-NEXT: global_load_d16_hi_b16 v0, v1, s[0:1] offset:2 glc dlc
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
; GFX11-DENORM-CONTRACT-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-DENORM-CONTRACT-TRUE16-NEXT: v_fmac_f16_e32 v0.h, 2.0, v0.l
-; GFX11-DENORM-CONTRACT-TRUE16-NEXT: global_store_d16_hi_b16 v1, v0, s[0:1]
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: v_fmac_f16_e32 v2.l, 2.0, v1.l
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: global_store_b16 v0, v2, s[0:1]
; GFX11-DENORM-CONTRACT-TRUE16-NEXT: s_endpgm
;
; GFX11-DENORM-CONTRACT-FAKE16-LABEL: fmuladd_2.0_a_b_f16:
@@ -795,13 +795,13 @@ define amdgpu_kernel void @fmuladd_a_2.0_b_f16(ptr addrspace(1) %out, ptr addrsp
; GFX11-FLUSH-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-FLUSH-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 1, v0
; GFX11-FLUSH-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-FLUSH-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[0:1] glc dlc
+; GFX11-FLUSH-TRUE16-NEXT: global_load_u16 v0, v1, s[0:1] glc dlc
; GFX11-FLUSH-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FLUSH-TRUE16-NEXT: global_load_d16_hi_b16 v0, v1, s[0:1] offset:2 glc dlc
+; GFX11-FLUSH-TRUE16-NEXT: global_load_u16 v2, v1, s[0:1] offset:2 glc dlc
; GFX11-FLUSH-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-FLUSH-TRUE16-NEXT: v_add_f16_e32 v0.l, v0.l, v0.l
; GFX11-FLUSH-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FLUSH-TRUE16-NEXT: v_add_f16_e32 v0.l, v0.l, v0.h
+; GFX11-FLUSH-TRUE16-NEXT: v_add_f16_e32 v0.l, v0.l, v2.l
; GFX11-FLUSH-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
; GFX11-FLUSH-TRUE16-NEXT: s_endpgm
;
@@ -827,14 +827,14 @@ define amdgpu_kernel void @fmuladd_a_2.0_b_f16(ptr addrspace(1) %out, ptr addrsp
; GFX11-DENORM-STRICT-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX11-DENORM-STRICT-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-DENORM-STRICT-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-DENORM-STRICT-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 1, v0
+; GFX11-DENORM-STRICT-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX11-DENORM-STRICT-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-DENORM-STRICT-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[0:1] glc dlc
+; GFX11-DENORM-STRICT-TRUE16-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc
; GFX11-DENORM-STRICT-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-DENORM-STRICT-TRUE16-NEXT: global_load_d16_hi_b16 v0, v1, s[0:1] offset:2 glc dlc
+; GFX11-DENORM-STRICT-TRUE16-NEXT: global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
; GFX11-DENORM-STRICT-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-DENORM-STRICT-TRUE16-NEXT: v_fmac_f16_e32 v0.h, 2.0, v0.l
-; GFX11-DENORM-STRICT-TRUE16-NEXT: global_store_d16_hi_b16 v1, v0, s[0:1]
+; GFX11-DENORM-STRICT-TRUE16-NEXT: v_fmac_f16_e32 v2.l, 2.0, v1.l
+; GFX11-DENORM-STRICT-TRUE16-NEXT: global_store_b16 v0, v2, s[0:1]
; GFX11-DENORM-STRICT-TRUE16-NEXT: s_endpgm
;
; GFX11-DENORM-STRICT-FAKE16-LABEL: fmuladd_a_2.0_b_f16:
@@ -857,14 +857,14 @@ define amdgpu_kernel void @fmuladd_a_2.0_b_f16(ptr addrspace(1) %out, ptr addrsp
; GFX11-DENORM-CONTRACT-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX11-DENORM-CONTRACT-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-DENORM-CONTRACT-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-DENORM-CONTRACT-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 1, v0
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX11-DENORM-CONTRACT-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-DENORM-CONTRACT-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[0:1] glc dlc
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc
; GFX11-DENORM-CONTRACT-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-DENORM-CONTRACT-TRUE16-NEXT: global_load_d16_hi_b16 v0, v1, s[0:1] offset:2 glc dlc
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
; GFX11-DENORM-CONTRACT-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-DENORM-CONTRACT-TRUE16-NEXT: v_fmac_f16_e32 v0.h, 2.0, v0.l
-; GFX11-DENORM-CONTRACT-TRUE16-NEXT: global_store_d16_hi_b16 v1, v0, s[0:1]
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: v_fmac_f16_e32 v2.l, 2.0, v1.l
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: global_store_b16 v0, v2, s[0:1]
; GFX11-DENORM-CONTRACT-TRUE16-NEXT: s_endpgm
;
; GFX11-DENORM-CONTRACT-FAKE16-LABEL: fmuladd_a_2.0_b_f16:
@@ -979,13 +979,13 @@ define amdgpu_kernel void @fadd_a_a_b_f16(ptr addrspace(1) %out,
; GFX11-FLUSH-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-FLUSH-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 1, v0
; GFX11-FLUSH-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-FLUSH-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[0:1] glc dlc
+; GFX11-FLUSH-TRUE16-NEXT: global_load_u16 v0, v1, s[0:1] glc dlc
; GFX11-FLUSH-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FLUSH-TRUE16-NEXT: global_load_d16_hi_b16 v0, v1, s[0:1] offset:2 glc dlc
+; GFX11-FLUSH-TRUE16-NEXT: global_load_u16 v2, v1, s[0:1] offset:2 glc dlc
; GFX11-FLUSH-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-FLUSH-TRUE16-NEXT: v_add_f16_e32 v0.l, v0.l, v0.l
; GFX11-FLUSH-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FLUSH-TRUE16-NEXT: v_add_f16_e32 v0.l, v0.l, v0.h
+; GFX11-FLUSH-TRUE16-NEXT: v_add_f16_e32 v0.l, v0.l, v2.l
; GFX11-FLUSH-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
; GFX11-FLUSH-TRUE16-NEXT: s_endpgm
;
@@ -1013,13 +1013,13 @@ define amdgpu_kernel void @fadd_a_a_b_f16(ptr addrspace(1) %out,
; GFX11-DENORM-STRICT-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-DENORM-STRICT-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 1, v0
; GFX11-DENORM-STRICT-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-DENORM-STRICT-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[0:1] glc dlc
+; GFX11-DENORM-STRICT-TRUE16-NEXT: global_load_u16 v0, v1, s[0:1] glc dlc
; GFX11-DENORM-STRICT-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-DENORM-STRICT-TRUE16-NEXT: global_load_d16_hi_b16 v0, v1, s[0:1] offset:2 glc dlc
+; GFX11-DENORM-STRICT-TRUE16-NEXT: global_load_u16 v2, v1, s[0:1] offset:2 glc dlc
; GFX11-DENORM-STRICT-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-DENORM-STRICT-TRUE16-NEXT: v_add_f16_e32 v0.l, v0.l, v0.l
; GFX11-DENORM-STRICT-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-DENORM-STRICT-TRUE16-NEXT: v_add_f16_e32 v0.l, v0.l, v0.h
+; GFX11-DENORM-STRICT-TRUE16-NEXT: v_add_f16_e32 v0.l, v0.l, v2.l
; GFX11-DENORM-STRICT-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
; GFX11-DENORM-STRICT-TRUE16-NEXT: s_endpgm
;
@@ -1045,14 +1045,14 @@ define amdgpu_kernel void @fadd_a_a_b_f16(ptr addrspace(1) %out,
; GFX11-DENORM-CONTRACT-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX11-DENORM-CONTRACT-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-DENORM-CONTRACT-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-DENORM-CONTRACT-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 1, v0
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX11-DENORM-CONTRACT-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-DENORM-CONTRACT-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[0:1] glc dlc
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc
; GFX11-DENORM-CONTRACT-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-DENORM-CONTRACT-TRUE16-NEXT: global_load_d16_hi_b16 v0, v1, s[0:1] offset:2 glc dlc
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
; GFX11-DENORM-CONTRACT-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-DENORM-CONTRACT-TRUE16-NEXT: v_fmac_f16_e32 v0.h, 2.0, v0.l
-; GFX11-DENORM-CONTRACT-TRUE16-NEXT: global_store_d16_hi_b16 v1, v0, s[0:1]
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: v_fmac_f16_e32 v2.l, 2.0, v1.l
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: global_store_b16 v0, v2, s[0:1]
; GFX11-DENORM-CONTRACT-TRUE16-NEXT: s_endpgm
;
; GFX11-DENORM-CONTRACT-FAKE16-LABEL: fadd_a_a_b_f16:
@@ -1170,13 +1170,13 @@ define amdgpu_kernel void @fadd_b_a_a_f16(ptr addrspace(1) %out,
; GFX11-FLUSH-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-FLUSH-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 1, v0
; GFX11-FLUSH-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-FLUSH-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[0:1] glc dlc
+; GFX11-FLUSH-TRUE16-NEXT: global_load_u16 v0, v1, s[0:1] glc dlc
; GFX11-FLUSH-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FLUSH-TRUE16-NEXT: global_load_d16_hi_b16 v0, v1, s[0:1] offset:2 glc dlc
+; GFX11-FLUSH-TRUE16-NEXT: global_load_u16 v2, v1, s[0:1] offset:2 glc dlc
; GFX11-FLUSH-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-FLUSH-TRUE16-NEXT: v_add_f16_e32 v0.l, v0.l, v0.l
; GFX11-FLUSH-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FLUSH-TRUE16-NEXT: v_add_f16_e32 v0.l, v0.h, v0.l
+; GFX11-FLUSH-TRUE16-NEXT: v_add_f16_e32 v0.l, v2.l, v0.l
; GFX11-FLUSH-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
; GFX11-FLUSH-TRUE16-NEXT: s_endpgm
;
@@ -1204,13 +1204,13 @@ define amdgpu_kernel void @fadd_b_a_a_f16(ptr addrspace(1) %out,
; GFX11-DENORM-STRICT-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-DENORM-STRICT-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 1, v0
; GFX11-DENORM-STRICT-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-DENORM-STRICT-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[0:1] glc dlc
+; GFX11-DENORM-STRICT-TRUE16-NEXT: global_load_u16 v0, v1, s[0:1] glc dlc
; GFX11-DENORM-STRICT-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-DENORM-STRICT-TRUE16-NEXT: global_load_d16_hi_b16 v0, v1, s[0:1] offset:2 glc dlc
+; GFX11-DENORM-STRICT-TRUE16-NEXT: global_load_u16 v2, v1, s[0:1] offset:2 glc dlc
; GFX11-DENORM-STRICT-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-DENORM-STRICT-TRUE16-NEXT: v_add_f16_e32 v0.l, v0.l, v0.l
; GFX11-DENORM-STRICT-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-DENORM-STRICT-TRUE16-NEXT: v_add_f16_e32 v0.l, v0.h, v0.l
+; GFX11-DENORM-STRICT-TRUE16-NEXT: v_add_f16_e32 v0.l, v2.l, v0.l
; GFX11-DENORM-STRICT-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
; GFX11-DENORM-STRICT-TRUE16-NEXT: s_endpgm
;
@@ -1236,14 +1236,14 @@ define amdgpu_kernel void @fadd_b_a_a_f16(ptr addrspace(1) %out,
; GFX11-DENORM-CONTRACT-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX11-DENORM-CONTRACT-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-DENORM-CONTRACT-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-DENORM-CONTRACT-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 1, v0
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX11-DENORM-CONTRACT-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-DENORM-CONTRACT-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[0:1] glc dlc
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc
; GFX11-DENORM-CONTRACT-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-DENORM-CONTRACT-TRUE16-NEXT: global_load_d16_hi_b16 v0, v1, s[0:1] offset:2 glc dlc
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
; GFX11-DENORM-CONTRACT-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-DENORM-CONTRACT-TRUE16-NEXT: v_fmac_f16_e32 v0.h, 2.0, v0.l
-; GFX11-DENORM-CONTRACT-TRUE16-NEXT: global_store_d16_hi_b16 v1, v0, s[0:1]
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: v_fmac_f16_e32 v2.l, 2.0, v1.l
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: global_store_b16 v0, v2, s[0:1]
; GFX11-DENORM-CONTRACT-TRUE16-NEXT: s_endpgm
;
; GFX11-DENORM-CONTRACT-FAKE16-LABEL: fadd_b_a_a_f16:
@@ -1347,13 +1347,13 @@ define amdgpu_kernel void @fmuladd_neg_2.0_a_b_f16(ptr addrspace(1) %out, ptr ad
; GFX11-FLUSH-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-FLUSH-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 1, v0
; GFX11-FLUSH-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-FLUSH-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[0:1] glc dlc
+; GFX11-FLUSH-TRUE16-NEXT: global_load_u16 v0, v1, s[0:1] glc dlc
; GFX11-FLUSH-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FLUSH-TRUE16-NEXT: global_load_d16_hi_b16 v0, v1, s[0:1] offset:2 glc dlc
+; GFX11-FLUSH-TRUE16-NEXT: global_load_u16 v2, v1, s[0:1] offset:2 glc dlc
; GFX11-FLUSH-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-FLUSH-TRUE16-NEXT: v_add_f16_e32 v0.l, v0.l, v0.l
; GFX11-FLUSH-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FLUSH-TRUE16-NEXT: v_sub_f16_e32 v0.l, v0.h, v0.l
+; GFX11-FLUSH-TRUE16-NEXT: v_sub_f16_e32 v0.l, v2.l, v0.l
; GFX11-FLUSH-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
; GFX11-FLUSH-TRUE16-NEXT: s_endpgm
;
@@ -1379,14 +1379,14 @@ define amdgpu_kernel void @fmuladd_neg_2.0_a_b_f16(ptr addrspace(1) %out, ptr ad
; GFX11-DENORM-STRICT-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX11-DENORM-STRICT-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-DENORM-STRICT-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-DENORM-STRICT-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 1, v0
+; GFX11-DENORM-STRICT-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX11-DENORM-STRICT-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-DENORM-STRICT-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[0:1] glc dlc
+; GFX11-DENORM-STRICT-TRUE16-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc
; GFX11-DENORM-STRICT-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-DENORM-STRICT-TRUE16-NEXT: global_load_d16_hi_b16 v0, v1, s[0:1] offset:2 glc dlc
+; GFX11-DENORM-STRICT-TRUE16-NEXT: global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
; GFX11-DENORM-STRICT-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-DENORM-STRICT-TRUE16-NEXT: v_fmac_f16_e32 v0.h, -2.0, v0.l
-; GFX11-DENORM-STRICT-TRUE16-NEXT: global_store_d16_hi_b16 v1, v0, s[0:1]
+; GFX11-DENORM-STRICT-TRUE16-NEXT: v_fmac_f16_e32 v2.l, -2.0, v1.l
+; GFX11-DENORM-STRICT-TRUE16-NEXT: global_store_b16 v0, v2, s[0:1]
; GFX11-DENORM-STRICT-TRUE16-NEXT: s_endpgm
;
; GFX11-DENORM-STRICT-FAKE16-LABEL: fmuladd_neg_2.0_a_b_f16:
@@ -1409,14 +1409,14 @@ define amdgpu_kernel void @fmuladd_neg_2.0_a_b_f16(ptr addrspace(1) %out, ptr ad
; GFX11-DENORM-CONTRACT-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX11-DENORM-CONTRACT-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-DENORM-CONTRACT-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-DENORM-CONTRACT-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 1, v0
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX11-DENORM-CONTRACT-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-DENORM-CONTRACT-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[0:1] glc dlc
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc
; GFX11-DENORM-CONTRACT-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-DENORM-CONTRACT-TRUE16-NEXT: global_load_d16_hi_b16 v0, v1, s[0:1] offset:2 glc dlc
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
; GFX11-DENORM-CONTRACT-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-DENORM-CONTRACT-TRUE16-NEXT: v_fmac_f16_e32 v0.h, -2.0, v0.l
-; GFX11-DENORM-CONTRACT-TRUE16-NEXT: global_store_d16_hi_b16 v1, v0, s[0:1]
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: v_fmac_f16_e32 v2.l, -2.0, v1.l
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: global_store_b16 v0, v2, s[0:1]
; GFX11-DENORM-CONTRACT-TRUE16-NEXT: s_endpgm
;
; GFX11-DENORM-CONTRACT-FAKE16-LABEL: fmuladd_neg_2.0_a_b_f16:
@@ -1517,13 +1517,13 @@ define amdgpu_kernel void @fmuladd_neg_2.0_neg_a_b_f16(ptr addrspace(1) %out, pt
; GFX11-FLUSH-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-FLUSH-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 1, v0
; GFX11-FLUSH-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-FLUSH-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[0:1] glc dlc
+; GFX11-FLUSH-TRUE16-NEXT: global_load_u16 v0, v1, s[0:1] glc dlc
; GFX11-FLUSH-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FLUSH-TRUE16-NEXT: global_load_d16_hi_b16 v0, v1, s[0:1] offset:2 glc dlc
+; GFX11-FLUSH-TRUE16-NEXT: global_load_u16 v2, v1, s[0:1] offset:2 glc dlc
; GFX11-FLUSH-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-FLUSH-TRUE16-NEXT: v_add_f16_e32 v0.l, v0.l, v0.l
; GFX11-FLUSH-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FLUSH-TRUE16-NEXT: v_add_f16_e32 v0.l, v0.h, v0.l
+; GFX11-FLUSH-TRUE16-NEXT: v_add_f16_e32 v0.l, v2.l, v0.l
; GFX11-FLUSH-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
; GFX11-FLUSH-TRUE16-NEXT: s_endpgm
;
@@ -1549,14 +1549,14 @@ define amdgpu_kernel void @fmuladd_neg_2.0_neg_a_b_f16(ptr addrspace(1) %out, pt
; GFX11-DENORM-STRICT-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX11-DENORM-STRICT-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-DENORM-STRICT-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-DENORM-STRICT-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 1, v0
+; GFX11-DENORM-STRICT-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX11-DENORM-STRICT-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-DENORM-STRICT-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[0:1] glc dlc
+; GFX11-DENORM-STRICT-TRUE16-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc
; GFX11-DENORM-STRICT-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-DENORM-STRICT-TRUE16-NEXT: global_load_d16_hi_b16 v0, v1, s[0:1] offset:2 glc dlc
+; GFX11-DENORM-STRICT-TRUE16-NEXT: global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
; GFX11-DENORM-STRICT-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-DENORM-STRICT-TRUE16-NEXT: v_fmac_f16_e32 v0.h, 2.0, v0.l
-; GFX11-DENORM-STRICT-TRUE16-NEXT: global_store_d16_hi_b16 v1, v0, s[0:1]
+; GFX11-DENORM-STRICT-TRUE16-NEXT: v_fmac_f16_e32 v2.l, 2.0, v1.l
+; GFX11-DENORM-STRICT-TRUE16-NEXT: global_store_b16 v0, v2, s[0:1]
; GFX11-DENORM-STRICT-TRUE16-NEXT: s_endpgm
;
; GFX11-DENORM-STRICT-FAKE16-LABEL: fmuladd_neg_2.0_neg_a_b_f16:
@@ -1579,14 +1579,14 @@ define amdgpu_kernel void @fmuladd_neg_2.0_neg_a_b_f16(ptr addrspace(1) %out, pt
; GFX11-DENORM-CONTRACT-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX11-DENORM-CONTRACT-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-DENORM-CONTRACT-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-DENORM-CONTRACT-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 1, v0
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX11-DENORM-CONTRACT-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-DENORM-CONTRACT-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[0:1] glc dlc
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc
; GFX11-DENORM-CONTRACT-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-DENORM-CONTRACT-TRUE16-NEXT: global_load_d16_hi_b16 v0, v1, s[0:1] offset:2 glc dlc
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
; GFX11-DENORM-CONTRACT-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-DENORM-CONTRACT-TRUE16-NEXT: v_fmac_f16_e32 v0.h, 2.0, v0.l
-; GFX11-DENORM-CONTRACT-TRUE16-NEXT: global_store_d16_hi_b16 v1, v0, s[0:1]
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: v_fmac_f16_e32 v2.l, 2.0, v1.l
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: global_store_b16 v0, v2, s[0:1]
; GFX11-DENORM-CONTRACT-TRUE16-NEXT: s_endpgm
;
; GFX11-DENORM-CONTRACT-FAKE16-LABEL: fmuladd_neg_2.0_neg_a_b_f16:
@@ -1689,13 +1689,13 @@ define amdgpu_kernel void @fmuladd_2.0_neg_a_b_f16(ptr addrspace(1) %out, ptr ad
; GFX11-FLUSH-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-FLUSH-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 1, v0
; GFX11-FLUSH-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-FLUSH-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[0:1] glc dlc
+; GFX11-FLUSH-TRUE16-NEXT: global_load_u16 v0, v1, s[0:1] glc dlc
; GFX11-FLUSH-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FLUSH-TRUE16-NEXT: global_load_d16_hi_b16 v0, v1, s[0:1] offset:2 glc dlc
+; GFX11-FLUSH-TRUE16-NEXT: global_load_u16 v2, v1, s[0:1] offset:2 glc dlc
; GFX11-FLUSH-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-FLUSH-TRUE16-NEXT: v_add_f16_e32 v0.l, v0.l, v0.l
; GFX11-FLUSH-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FLUSH-TRUE16-NEXT: v_sub_f16_e32 v0.l, v0.h, v0.l
+; GFX11-FLUSH-TRUE16-NEXT: v_sub_f16_e32 v0.l, v2.l, v0.l
; GFX11-FLUSH-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
; GFX11-FLUSH-TRUE16-NEXT: s_endpgm
;
@@ -1721,14 +1721,14 @@ define amdgpu_kernel void @fmuladd_2.0_neg_a_b_f16(ptr addrspace(1) %out, ptr ad
; GFX11-DENORM-STRICT-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX11-DENORM-STRICT-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-DENORM-STRICT-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-DENORM-STRICT-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 1, v0
+; GFX11-DENORM-STRICT-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX11-DENORM-STRICT-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-DENORM-STRICT-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[0:1] glc dlc
+; GFX11-DENORM-STRICT-TRUE16-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc
; GFX11-DENORM-STRICT-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-DENORM-STRICT-TRUE16-NEXT: global_load_d16_hi_b16 v0, v1, s[0:1] offset:2 glc dlc
+; GFX11-DENORM-STRICT-TRUE16-NEXT: global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
; GFX11-DENORM-STRICT-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-DENORM-STRICT-TRUE16-NEXT: v_fmac_f16_e32 v0.h, -2.0, v0.l
-; GFX11-DENORM-STRICT-TRUE16-NEXT: global_store_d16_hi_b16 v1, v0, s[0:1]
+; GFX11-DENORM-STRICT-TRUE16-NEXT: v_fmac_f16_e32 v2.l, -2.0, v1.l
+; GFX11-DENORM-STRICT-TRUE16-NEXT: global_store_b16 v0, v2, s[0:1]
; GFX11-DENORM-STRICT-TRUE16-NEXT: s_endpgm
;
; GFX11-DENORM-STRICT-FAKE16-LABEL: fmuladd_2.0_neg_a_b_f16:
@@ -1751,14 +1751,14 @@ define amdgpu_kernel void @fmuladd_2.0_neg_a_b_f16(ptr addrspace(1) %out, ptr ad
; GFX11-DENORM-CONTRACT-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX11-DENORM-CONTRACT-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-DENORM-CONTRACT-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-DENORM-CONTRACT-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 1, v0
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX11-DENORM-CONTRACT-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-DENORM-CONTRACT-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[0:1] glc dlc
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc
; GFX11-DENORM-CONTRACT-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-DENORM-CONTRACT-TRUE16-NEXT: global_load_d16_hi_b16 v0, v1, s[0:1] offset:2 glc dlc
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
; GFX11-DENORM-CONTRACT-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-DENORM-CONTRACT-TRUE16-NEXT: v_fmac_f16_e32 v0.h, -2.0, v0.l
-; GFX11-DENORM-CONTRACT-TRUE16-NEXT: global_store_d16_hi_b16 v1, v0, s[0:1]
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: v_fmac_f16_e32 v2.l, -2.0, v1.l
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: global_store_b16 v0, v2, s[0:1]
; GFX11-DENORM-CONTRACT-TRUE16-NEXT: s_endpgm
;
; GFX11-DENORM-CONTRACT-FAKE16-LABEL: fmuladd_2.0_neg_a_b_f16:
@@ -1861,13 +1861,13 @@ define amdgpu_kernel void @fmuladd_2.0_a_neg_b_f16(ptr addrspace(1) %out, ptr ad
; GFX11-FLUSH-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-FLUSH-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 1, v0
; GFX11-FLUSH-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-FLUSH-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[0:1] glc dlc
+; GFX11-FLUSH-TRUE16-NEXT: global_load_u16 v0, v1, s[0:1] glc dlc
; GFX11-FLUSH-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FLUSH-TRUE16-NEXT: global_load_d16_hi_b16 v0, v1, s[0:1] offset:2 glc dlc
+; GFX11-FLUSH-TRUE16-NEXT: global_load_u16 v2, v1, s[0:1] offset:2 glc dlc
; GFX11-FLUSH-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-FLUSH-TRUE16-NEXT: v_add_f16_e32 v0.l, v0.l, v0.l
; GFX11-FLUSH-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FLUSH-TRUE16-NEXT: v_sub_f16_e32 v0.l, v0.l, v0.h
+; GFX11-FLUSH-TRUE16-NEXT: v_sub_f16_e32 v0.l, v0.l, v2.l
; GFX11-FLUSH-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
; GFX11-FLUSH-TRUE16-NEXT: s_endpgm
;
@@ -1895,11 +1895,11 @@ define amdgpu_kernel void @fmuladd_2.0_a_neg_b_f16(ptr addrspace(1) %out, ptr ad
; GFX11-DENORM-STRICT-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-DENORM-STRICT-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 1, v0
; GFX11-DENORM-STRICT-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-DENORM-STRICT-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[0:1] glc dlc
+; GFX11-DENORM-STRICT-TRUE16-NEXT: global_load_u16 v0, v1, s[0:1] glc dlc
; GFX11-DENORM-STRICT-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-DENORM-STRICT-TRUE16-NEXT: global_load_d16_hi_b16 v0, v1, s[0:1] offset:2 glc dlc
+; GFX11-DENORM-STRICT-TRUE16-NEXT: global_load_u16 v2, v1, s[0:1] offset:2 glc dlc
; GFX11-DENORM-STRICT-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-DENORM-STRICT-TRUE16-NEXT: v_fma_f16 v0.l, v0.l, 2.0, -v0.h
+; GFX11-DENORM-STRICT-TRUE16-NEXT: v_fma_f16 v0.l, v0.l, 2.0, -v2.l
; GFX11-DENORM-STRICT-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
; GFX11-DENORM-STRICT-TRUE16-NEXT: s_endpgm
;
@@ -1925,11 +1925,11 @@ define amdgpu_kernel void @fmuladd_2.0_a_neg_b_f16(ptr addrspace(1) %out, ptr ad
; GFX11-DENORM-CONTRACT-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-DENORM-CONTRACT-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 1, v0
; GFX11-DENORM-CONTRACT-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-DENORM-CONTRACT-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[0:1] glc dlc
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: global_load_u16 v0, v1, s[0:1] glc dlc
; GFX11-DENORM-CONTRACT-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-DENORM-CONTRACT-TRUE16-NEXT: global_load_d16_hi_b16 v0, v1, s[0:1] offset:2 glc dlc
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: global_load_u16 v2, v1, s[0:1] offset:2 glc dlc
; GFX11-DENORM-CONTRACT-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-DENORM-CONTRACT-TRUE16-NEXT: v_fma_f16 v0.l, v0.l, 2.0, -v0.h
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: v_fma_f16 v0.l, v0.l, 2.0, -v2.l
; GFX11-DENORM-CONTRACT-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
; GFX11-DENORM-CONTRACT-TRUE16-NEXT: s_endpgm
;
@@ -2065,18 +2065,18 @@ define amdgpu_kernel void @mad_sub_f16(ptr addrspace(1) noalias nocapture %out,
; GFX11-FLUSH-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-FLUSH-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-FLUSH-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FLUSH-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 1, v0
+; GFX11-FLUSH-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 1, v0
; GFX11-FLUSH-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-FLUSH-TRUE16-NEXT: global_load_d16_b16 v0, v2, s[2:3] glc dlc
+; GFX11-FLUSH-TRUE16-NEXT: global_load_u16 v0, v1, s[2:3] glc dlc
; GFX11-FLUSH-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FLUSH-TRUE16-NEXT: global_load_d16_hi_b16 v0, v2, s[2:3] offset:2 glc dlc
+; GFX11-FLUSH-TRUE16-NEXT: global_load_u16 v2, v1, s[2:3] offset:2 glc dlc
; GFX11-FLUSH-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FLUSH-TRUE16-NEXT: global_load_d16_b16 v1, v2, s[2:3] offset:4 glc dlc
+; GFX11-FLUSH-TRUE16-NEXT: global_load_u16 v3, v1, s[2:3] offset:4 glc dlc
; GFX11-FLUSH-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FLUSH-TRUE16-NEXT: v_mul_f16_e32 v0.l, v0.l, v0.h
+; GFX11-FLUSH-TRUE16-NEXT: v_mul_f16_e32 v0.l, v0.l, v2.l
; GFX11-FLUSH-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FLUSH-TRUE16-NEXT: v_sub_f16_e32 v0.l, v0.l, v1.l
-; GFX11-FLUSH-TRUE16-NEXT: global_store_b16 v2, v0, s[0:1]
+; GFX11-FLUSH-TRUE16-NEXT: v_sub_f16_e32 v0.l, v0.l, v3.l
+; GFX11-FLUSH-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
; GFX11-FLUSH-TRUE16-NEXT: s_endpgm
;
; GFX11-FLUSH-FAKE16-LABEL: mad_sub_f16:
@@ -2103,18 +2103,18 @@ define amdgpu_kernel void @mad_sub_f16(ptr addrspace(1) noalias nocapture %out,
; GFX11-DENORM-STRICT-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-DENORM-STRICT-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-DENORM-STRICT-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-DENORM-STRICT-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 1, v0
+; GFX11-DENORM-STRICT-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 1, v0
; GFX11-DENORM-STRICT-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-DENORM-STRICT-TRUE16-NEXT: global_load_d16_b16 v0, v2, s[2:3] glc dlc
+; GFX11-DENORM-STRICT-TRUE16-NEXT: global_load_u16 v0, v1, s[2:3] glc dlc
; GFX11-DENORM-STRICT-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-DENORM-STRICT-TRUE16-NEXT: global_load_d16_hi_b16 v0, v2, s[2:3] offset:2 glc dlc
+; GFX11-DENORM-STRICT-TRUE16-NEXT: global_load_u16 v2, v1, s[2:3] offset:2 glc dlc
; GFX11-DENORM-STRICT-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-DENORM-STRICT-TRUE16-NEXT: global_load_d16_b16 v1, v2, s[2:3] offset:4 glc dlc
+; GFX11-DENORM-STRICT-TRUE16-NEXT: global_load_u16 v3, v1, s[2:3] offset:4 glc dlc
; GFX11-DENORM-STRICT-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-DENORM-STRICT-TRUE16-NEXT: v_mul_f16_e32 v0.l, v0.l, v0.h
+; GFX11-DENORM-STRICT-TRUE16-NEXT: v_mul_f16_e32 v0.l, v0.l, v2.l
; GFX11-DENORM-STRICT-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-DENORM-STRICT-TRUE16-NEXT: v_sub_f16_e32 v0.l, v0.l, v1.l
-; GFX11-DENORM-STRICT-TRUE16-NEXT: global_store_b16 v2, v0, s[0:1]
+; GFX11-DENORM-STRICT-TRUE16-NEXT: v_sub_f16_e32 v0.l, v0.l, v3.l
+; GFX11-DENORM-STRICT-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
; GFX11-DENORM-STRICT-TRUE16-NEXT: s_endpgm
;
; GFX11-DENORM-STRICT-FAKE16-LABEL: mad_sub_f16:
@@ -2141,16 +2141,16 @@ define amdgpu_kernel void @mad_sub_f16(ptr addrspace(1) noalias nocapture %out,
; GFX11-DENORM-CONTRACT-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-DENORM-CONTRACT-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-DENORM-CONTRACT-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-DENORM-CONTRACT-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 1, v0
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 1, v0
; GFX11-DENORM-CONTRACT-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-DENORM-CONTRACT-TRUE16-NEXT: global_load_d16_b16 v0, v2, s[2:3] glc dlc
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: global_load_u16 v0, v1, s[2:3] glc dlc
; GFX11-DENORM-CONTRACT-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-DENORM-CONTRACT-TRUE16-NEXT: global_load_d16_hi_b16 v0, v2, s[2:3] offset:2 glc dlc
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: global_load_u16 v2, v1, s[2:3] offset:2 glc dlc
; GFX11-DENORM-CONTRACT-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-DENORM-CONTRACT-TRUE16-NEXT: global_load_d16_b16 v1, v2, s[2:3] offset:4 glc dlc
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: global_load_u16 v3, v1, s[2:3] offset:4 glc dlc
; GFX11-DENORM-CONTRACT-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-DENORM-CONTRACT-TRUE16-NEXT: v_fma_f16 v0.l, v0.l, v0.h, -v1.l
-; GFX11-DENORM-CONTRACT-TRUE16-NEXT: global_store_b16 v2, v0, s[0:1]
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: v_fma_f16 v0.l, v0.l, v2.l, -v3.l
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
; GFX11-DENORM-CONTRACT-TRUE16-NEXT: s_endpgm
;
; GFX11-DENORM-CONTRACT-FAKE16-LABEL: mad_sub_f16:
@@ -2289,18 +2289,18 @@ define amdgpu_kernel void @mad_sub_inv_f16(ptr addrspace(1) noalias nocapture %o
; GFX11-FLUSH-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-FLUSH-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-FLUSH-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FLUSH-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 1, v0
+; GFX11-FLUSH-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 1, v0
; GFX11-FLUSH-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-FLUSH-TRUE16-NEXT: global_load_d16_b16 v0, v2, s[2:3] glc dlc
+; GFX11-FLUSH-TRUE16-NEXT: global_load_u16 v0, v1, s[2:3] glc dlc
; GFX11-FLUSH-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FLUSH-TRUE16-NEXT: global_load_d16_hi_b16 v0, v2, s[2:3] offset:2 glc dlc
+; GFX11-FLUSH-TRUE16-NEXT: global_load_u16 v2, v1, s[2:3] offset:2 glc dlc
; GFX11-FLUSH-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FLUSH-TRUE16-NEXT: global_load_d16_b16 v1, v2, s[2:3] offset:4 glc dlc
+; GFX11-FLUSH-TRUE16-NEXT: global_load_u16 v3, v1, s[2:3] offset:4 glc dlc
; GFX11-FLUSH-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FLUSH-TRUE16-NEXT: v_mul_f16_e32 v0.l, v0.l, v0.h
+; GFX11-FLUSH-TRUE16-NEXT: v_mul_f16_e32 v0.l, v0.l, v2.l
; GFX11-FLUSH-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FLUSH-TRUE16-NEXT: v_sub_f16_e32 v0.l, v1.l, v0.l
-; GFX11-FLUSH-TRUE16-NEXT: global_store_b16 v2, v0, s[0:1]
+; GFX11-FLUSH-TRUE16-NEXT: v_sub_f16_e32 v0.l, v3.l, v0.l
+; GFX11-FLUSH-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
; GFX11-FLUSH-TRUE16-NEXT: s_endpgm
;
; GFX11-FLUSH-FAKE16-LABEL: mad_sub_inv_f16:
@@ -2327,18 +2327,18 @@ define amdgpu_kernel void @mad_sub_inv_f16(ptr addrspace(1) noalias nocapture %o
; GFX11-DENORM-STRICT-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-DENORM-STRICT-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-DENORM-STRICT-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-DENORM-STRICT-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 1, v0
+; GFX11-DENORM-STRICT-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 1, v0
; GFX11-DENORM-STRICT-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-DENORM-STRICT-TRUE16-NEXT: global_load_d16_b16 v0, v2, s[2:3] glc dlc
+; GFX11-DENORM-STRICT-TRUE16-NEXT: global_load_u16 v0, v1, s[2:3] glc dlc
; GFX11-DENORM-STRICT-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-DENORM-STRICT-TRUE16-NEXT: global_load_d16_hi_b16 v0, v2, s[2:3] offset:2 glc dlc
+; GFX11-DENORM-STRICT-TRUE16-NEXT: global_load_u16 v2, v1, s[2:3] offset:2 glc dlc
; GFX11-DENORM-STRICT-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-DENORM-STRICT-TRUE16-NEXT: global_load_d16_b16 v1, v2, s[2:3] offset:4 glc dlc
+; GFX11-DENORM-STRICT-TRUE16-NEXT: global_load_u16 v3, v1, s[2:3] offset:4 glc dlc
; GFX11-DENORM-STRICT-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-DENORM-STRICT-TRUE16-NEXT: v_mul_f16_e32 v0.l, v0.l, v0.h
+; GFX11-DENORM-STRICT-TRUE16-NEXT: v_mul_f16_e32 v0.l, v0.l, v2.l
; GFX11-DENORM-STRICT-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-DENORM-STRICT-TRUE16-NEXT: v_sub_f16_e32 v0.l, v1.l, v0.l
-; GFX11-DENORM-STRICT-TRUE16-NEXT: global_store_b16 v2, v0, s[0:1]
+; GFX11-DENORM-STRICT-TRUE16-NEXT: v_sub_f16_e32 v0.l, v3.l, v0.l
+; GFX11-DENORM-STRICT-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
; GFX11-DENORM-STRICT-TRUE16-NEXT: s_endpgm
;
; GFX11-DENORM-STRICT-FAKE16-LABEL: mad_sub_inv_f16:
@@ -2365,16 +2365,16 @@ define amdgpu_kernel void @mad_sub_inv_f16(ptr addrspace(1) noalias nocapture %o
; GFX11-DENORM-CONTRACT-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-DENORM-CONTRACT-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-DENORM-CONTRACT-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-DENORM-CONTRACT-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 1, v0
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 1, v0
; GFX11-DENORM-CONTRACT-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-DENORM-CONTRACT-TRUE16-NEXT: global_load_d16_b16 v0, v2, s[2:3] glc dlc
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: global_load_u16 v0, v1, s[2:3] glc dlc
; GFX11-DENORM-CONTRACT-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-DENORM-CONTRACT-TRUE16-NEXT: global_load_d16_hi_b16 v0, v2, s[2:3] offset:2 glc dlc
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: global_load_u16 v2, v1, s[2:3] offset:2 glc dlc
; GFX11-DENORM-CONTRACT-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-DENORM-CONTRACT-TRUE16-NEXT: global_load_d16_b16 v1, v2, s[2:3] offset:4 glc dlc
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: global_load_u16 v3, v1, s[2:3] offset:4 glc dlc
; GFX11-DENORM-CONTRACT-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-DENORM-CONTRACT-TRUE16-NEXT: v_fma_f16 v0.l, -v0.l, v0.h, v1.l
-; GFX11-DENORM-CONTRACT-TRUE16-NEXT: global_store_b16 v2, v0, s[0:1]
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: v_fma_f16 v0.l, -v0.l, v2.l, v3.l
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
; GFX11-DENORM-CONTRACT-TRUE16-NEXT: s_endpgm
;
; GFX11-DENORM-CONTRACT-FAKE16-LABEL: mad_sub_inv_f16:
@@ -2513,18 +2513,18 @@ define amdgpu_kernel void @mad_sub_fabs_f16(ptr addrspace(1) noalias nocapture %
; GFX11-FLUSH-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-FLUSH-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-FLUSH-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FLUSH-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 1, v0
+; GFX11-FLUSH-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 1, v0
; GFX11-FLUSH-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-FLUSH-TRUE16-NEXT: global_load_d16_b16 v0, v2, s[2:3] glc dlc
+; GFX11-FLUSH-TRUE16-NEXT: global_load_u16 v0, v1, s[2:3] glc dlc
; GFX11-FLUSH-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FLUSH-TRUE16-NEXT: global_load_d16_hi_b16 v0, v2, s[2:3] offset:2 glc dlc
+; GFX11-FLUSH-TRUE16-NEXT: global_load_u16 v2, v1, s[2:3] offset:2 glc dlc
; GFX11-FLUSH-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FLUSH-TRUE16-NEXT: global_load_d16_b16 v1, v2, s[2:3] offset:4 glc dlc
+; GFX11-FLUSH-TRUE16-NEXT: global_load_u16 v3, v1, s[2:3] offset:4 glc dlc
; GFX11-FLUSH-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FLUSH-TRUE16-NEXT: v_mul_f16_e32 v0.l, v0.l, v0.h
+; GFX11-FLUSH-TRUE16-NEXT: v_mul_f16_e32 v0.l, v0.l, v2.l
; GFX11-FLUSH-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FLUSH-TRUE16-NEXT: v_sub_f16_e64 v0.l, v0.l, |v1.l|
-; GFX11-FLUSH-TRUE16-NEXT: global_store_b16 v2, v0, s[0:1]
+; GFX11-FLUSH-TRUE16-NEXT: v_sub_f16_e64 v0.l, v0.l, |v3.l|
+; GFX11-FLUSH-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
; GFX11-FLUSH-TRUE16-NEXT: s_endpgm
;
; GFX11-FLUSH-FAKE16-LABEL: mad_sub_fabs_f16:
@@ -2551,18 +2551,18 @@ define amdgpu_kernel void @mad_sub_fabs_f16(ptr addrspace(1) noalias nocapture %
; GFX11-DENORM-STRICT-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-DENORM-STRICT-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-DENORM-STRICT-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-DENORM-STRICT-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 1, v0
+; GFX11-DENORM-STRICT-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 1, v0
; GFX11-DENORM-STRICT-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-DENORM-STRICT-TRUE16-NEXT: global_load_d16_b16 v0, v2, s[2:3] glc dlc
+; GFX11-DENORM-STRICT-TRUE16-NEXT: global_load_u16 v0, v1, s[2:3] glc dlc
; GFX11-DENORM-STRICT-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-DENORM-STRICT-TRUE16-NEXT: global_load_d16_hi_b16 v0, v2, s[2:3] offset:2 glc dlc
+; GFX11-DENORM-STRICT-TRUE16-NEXT: global_load_u16 v2, v1, s[2:3] offset:2 glc dlc
; GFX11-DENORM-STRICT-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-DENORM-STRICT-TRUE16-NEXT: global_load_d16_b16 v1, v2, s[2:3] offset:4 glc dlc
+; GFX11-DENORM-STRICT-TRUE16-NEXT: global_load_u16 v3, v1, s[2:3] offset:4 glc dlc
; GFX11-DENORM-STRICT-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-DENORM-STRICT-TRUE16-NEXT: v_mul_f16_e32 v0.l, v0.l, v0.h
+; GFX11-DENORM-STRICT-TRUE16-NEXT: v_mul_f16_e32 v0.l, v0.l, v2.l
; GFX11-DENORM-STRICT-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-DENORM-STRICT-TRUE16-NEXT: v_sub_f16_e64 v0.l, v0.l, |v1.l|
-; GFX11-DENORM-STRICT-TRUE16-NEXT: global_store_b16 v2, v0, s[0:1]
+; GFX11-DENORM-STRICT-TRUE16-NEXT: v_sub_f16_e64 v0.l, v0.l, |v3.l|
+; GFX11-DENORM-STRICT-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
; GFX11-DENORM-STRICT-TRUE16-NEXT: s_endpgm
;
; GFX11-DENORM-STRICT-FAKE16-LABEL: mad_sub_fabs_f16:
@@ -2589,16 +2589,16 @@ define amdgpu_kernel void @mad_sub_fabs_f16(ptr addrspace(1) noalias nocapture %
; GFX11-DENORM-CONTRACT-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-DENORM-CONTRACT-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-DENORM-CONTRACT-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-DENORM-CONTRACT-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 1, v0
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 1, v0
; GFX11-DENORM-CONTRACT-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-DENORM-CONTRACT-TRUE16-NEXT: global_load_d16_b16 v0, v2, s[2:3] glc dlc
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: global_load_u16 v0, v1, s[2:3] glc dlc
; GFX11-DENORM-CONTRACT-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-DENORM-CONTRACT-TRUE16-NEXT: global_load_d16_hi_b16 v0, v2, s[2:3] offset:2 glc dlc
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: global_load_u16 v2, v1, s[2:3] offset:2 glc dlc
; GFX11-DENORM-CONTRACT-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-DENORM-CONTRACT-TRUE16-NEXT: global_load_d16_b16 v1, v2, s[2:3] offset:4 glc dlc
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: global_load_u16 v3, v1, s[2:3] offset:4 glc dlc
; GFX11-DENORM-CONTRACT-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-DENORM-CONTRACT-TRUE16-NEXT: v_fma_f16 v0.l, v0.l, v0.h, -|v1.l|
-; GFX11-DENORM-CONTRACT-TRUE16-NEXT: global_store_b16 v2, v0, s[0:1]
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: v_fma_f16 v0.l, v0.l, v2.l, -|v3.l|
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
; GFX11-DENORM-CONTRACT-TRUE16-NEXT: s_endpgm
;
; GFX11-DENORM-CONTRACT-FAKE16-LABEL: mad_sub_fabs_f16:
@@ -2738,18 +2738,18 @@ define amdgpu_kernel void @mad_sub_fabs_inv_f16(ptr addrspace(1) noalias nocaptu
; GFX11-FLUSH-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-FLUSH-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-FLUSH-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FLUSH-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 1, v0
+; GFX11-FLUSH-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 1, v0
; GFX11-FLUSH-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-FLUSH-TRUE16-NEXT: global_load_d16_b16 v0, v2, s[2:3] glc dlc
+; GFX11-FLUSH-TRUE16-NEXT: global_load_u16 v0, v1, s[2:3] glc dlc
; GFX11-FLUSH-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FLUSH-TRUE16-NEXT: global_load_d16_hi_b16 v0, v2, s[2:3] offset:2 glc dlc
+; GFX11-FLUSH-TRUE16-NEXT: global_load_u16 v2, v1, s[2:3] offset:2 glc dlc
; GFX11-FLUSH-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FLUSH-TRUE16-NEXT: global_load_d16_b16 v1, v2, s[2:3] offset:4 glc dlc
+; GFX11-FLUSH-TRUE16-NEXT: global_load_u16 v3, v1, s[2:3] offset:4 glc dlc
; GFX11-FLUSH-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FLUSH-TRUE16-NEXT: v_mul_f16_e32 v0.l, v0.l, v0.h
+; GFX11-FLUSH-TRUE16-NEXT: v_mul_f16_e32 v0.l, v0.l, v2.l
; GFX11-FLUSH-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FLUSH-TRUE16-NEXT: v_sub_f16_e64 v0.l, |v1.l|, v0.l
-; GFX11-FLUSH-TRUE16-NEXT: global_store_b16 v2, v0, s[0:1]
+; GFX11-FLUSH-TRUE16-NEXT: v_sub_f16_e64 v0.l, |v3.l|, v0.l
+; GFX11-FLUSH-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
; GFX11-FLUSH-TRUE16-NEXT: s_endpgm
;
; GFX11-FLUSH-FAKE16-LABEL: mad_sub_fabs_inv_f16:
@@ -2776,18 +2776,18 @@ define amdgpu_kernel void @mad_sub_fabs_inv_f16(ptr addrspace(1) noalias nocaptu
; GFX11-DENORM-STRICT-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-DENORM-STRICT-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-DENORM-STRICT-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-DENORM-STRICT-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 1, v0
+; GFX11-DENORM-STRICT-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 1, v0
; GFX11-DENORM-STRICT-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-DENORM-STRICT-TRUE16-NEXT: global_load_d16_b16 v0, v2, s[2:3] glc dlc
+; GFX11-DENORM-STRICT-TRUE16-NEXT: global_load_u16 v0, v1, s[2:3] glc dlc
; GFX11-DENORM-STRICT-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-DENORM-STRICT-TRUE16-NEXT: global_load_d16_hi_b16 v0, v2, s[2:3] offset:2 glc dlc
+; GFX11-DENORM-STRICT-TRUE16-NEXT: global_load_u16 v2, v1, s[2:3] offset:2 glc dlc
; GFX11-DENORM-STRICT-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-DENORM-STRICT-TRUE16-NEXT: global_load_d16_b16 v1, v2, s[2:3] offset:4 glc dlc
+; GFX11-DENORM-STRICT-TRUE16-NEXT: global_load_u16 v3, v1, s[2:3] offset:4 glc dlc
; GFX11-DENORM-STRICT-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-DENORM-STRICT-TRUE16-NEXT: v_mul_f16_e32 v0.l, v0.l, v0.h
+; GFX11-DENORM-STRICT-TRUE16-NEXT: v_mul_f16_e32 v0.l, v0.l, v2.l
; GFX11-DENORM-STRICT-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-DENORM-STRICT-TRUE16-NEXT: v_sub_f16_e64 v0.l, |v1.l|, v0.l
-; GFX11-DENORM-STRICT-TRUE16-NEXT: global_store_b16 v2, v0, s[0:1]
+; GFX11-DENORM-STRICT-TRUE16-NEXT: v_sub_f16_e64 v0.l, |v3.l|, v0.l
+; GFX11-DENORM-STRICT-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
; GFX11-DENORM-STRICT-TRUE16-NEXT: s_endpgm
;
; GFX11-DENORM-STRICT-FAKE16-LABEL: mad_sub_fabs_inv_f16:
@@ -2814,16 +2814,16 @@ define amdgpu_kernel void @mad_sub_fabs_inv_f16(ptr addrspace(1) noalias nocaptu
; GFX11-DENORM-CONTRACT-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-DENORM-CONTRACT-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-DENORM-CONTRACT-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-DENORM-CONTRACT-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 1, v0
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 1, v0
; GFX11-DENORM-CONTRACT-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-DENORM-CONTRACT-TRUE16-NEXT: global_load_d16_b16 v0, v2, s[2:3] glc dlc
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: global_load_u16 v0, v1, s[2:3] glc dlc
; GFX11-DENORM-CONTRACT-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-DENORM-CONTRACT-TRUE16-NEXT: global_load_d16_hi_b16 v0, v2, s[2:3] offset:2 glc dlc
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: global_load_u16 v2, v1, s[2:3] offset:2 glc dlc
; GFX11-DENORM-CONTRACT-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-DENORM-CONTRACT-TRUE16-NEXT: global_load_d16_b16 v1, v2, s[2:3] offset:4 glc dlc
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: global_load_u16 v3, v1, s[2:3] offset:4 glc dlc
; GFX11-DENORM-CONTRACT-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-DENORM-CONTRACT-TRUE16-NEXT: v_fma_f16 v0.l, -v0.l, v0.h, |v1.l|
-; GFX11-DENORM-CONTRACT-TRUE16-NEXT: global_store_b16 v2, v0, s[0:1]
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: v_fma_f16 v0.l, -v0.l, v2.l, |v3.l|
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
; GFX11-DENORM-CONTRACT-TRUE16-NEXT: s_endpgm
;
; GFX11-DENORM-CONTRACT-FAKE16-LABEL: mad_sub_fabs_inv_f16:
@@ -2963,18 +2963,18 @@ define amdgpu_kernel void @neg_neg_mad_f16(ptr addrspace(1) noalias nocapture %o
; GFX11-FLUSH-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-FLUSH-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-FLUSH-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FLUSH-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 1, v0
+; GFX11-FLUSH-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 1, v0
; GFX11-FLUSH-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-FLUSH-TRUE16-NEXT: global_load_d16_b16 v0, v2, s[2:3] glc dlc
+; GFX11-FLUSH-TRUE16-NEXT: global_load_u16 v0, v1, s[2:3] glc dlc
; GFX11-FLUSH-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FLUSH-TRUE16-NEXT: global_load_d16_hi_b16 v0, v2, s[2:3] offset:2 glc dlc
+; GFX11-FLUSH-TRUE16-NEXT: global_load_u16 v2, v1, s[2:3] offset:2 glc dlc
; GFX11-FLUSH-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FLUSH-TRUE16-NEXT: global_load_d16_b16 v1, v2, s[2:3] offset:4 glc dlc
+; GFX11-FLUSH-TRUE16-NEXT: global_load_u16 v3, v1, s[2:3] offset:4 glc dlc
; GFX11-FLUSH-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FLUSH-TRUE16-NEXT: v_mul_f16_e32 v0.l, v0.l, v0.h
+; GFX11-FLUSH-TRUE16-NEXT: v_mul_f16_e32 v0.l, v0.l, v2.l
; GFX11-FLUSH-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FLUSH-TRUE16-NEXT: v_add_f16_e32 v0.l, v1.l, v0.l
-; GFX11-FLUSH-TRUE16-NEXT: global_store_b16 v2, v0, s[0:1]
+; GFX11-FLUSH-TRUE16-NEXT: v_add_f16_e32 v0.l, v3.l, v0.l
+; GFX11-FLUSH-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
; GFX11-FLUSH-TRUE16-NEXT: s_endpgm
;
; GFX11-FLUSH-FAKE16-LABEL: neg_neg_mad_f16:
@@ -3001,18 +3001,18 @@ define amdgpu_kernel void @neg_neg_mad_f16(ptr addrspace(1) noalias nocapture %o
; GFX11-DENORM-STRICT-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-DENORM-STRICT-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-DENORM-STRICT-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-DENORM-STRICT-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 1, v0
+; GFX11-DENORM-STRICT-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 1, v0
; GFX11-DENORM-STRICT-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-DENORM-STRICT-TRUE16-NEXT: global_load_d16_b16 v0, v2, s[2:3] glc dlc
+; GFX11-DENORM-STRICT-TRUE16-NEXT: global_load_u16 v0, v1, s[2:3] glc dlc
; GFX11-DENORM-STRICT-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-DENORM-STRICT-TRUE16-NEXT: global_load_d16_hi_b16 v0, v2, s[2:3] offset:2 glc dlc
+; GFX11-DENORM-STRICT-TRUE16-NEXT: global_load_u16 v2, v1, s[2:3] offset:2 glc dlc
; GFX11-DENORM-STRICT-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-DENORM-STRICT-TRUE16-NEXT: global_load_d16_b16 v1, v2, s[2:3] offset:4 glc dlc
+; GFX11-DENORM-STRICT-TRUE16-NEXT: global_load_u16 v3, v1, s[2:3] offset:4 glc dlc
; GFX11-DENORM-STRICT-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-DENORM-STRICT-TRUE16-NEXT: v_mul_f16_e32 v0.l, v0.l, v0.h
+; GFX11-DENORM-STRICT-TRUE16-NEXT: v_mul_f16_e32 v0.l, v0.l, v2.l
; GFX11-DENORM-STRICT-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-DENORM-STRICT-TRUE16-NEXT: v_add_f16_e32 v0.l, v1.l, v0.l
-; GFX11-DENORM-STRICT-TRUE16-NEXT: global_store_b16 v2, v0, s[0:1]
+; GFX11-DENORM-STRICT-TRUE16-NEXT: v_add_f16_e32 v0.l, v3.l, v0.l
+; GFX11-DENORM-STRICT-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
; GFX11-DENORM-STRICT-TRUE16-NEXT: s_endpgm
;
; GFX11-DENORM-STRICT-FAKE16-LABEL: neg_neg_mad_f16:
@@ -3039,16 +3039,16 @@ define amdgpu_kernel void @neg_neg_mad_f16(ptr addrspace(1) noalias nocapture %o
; GFX11-DENORM-CONTRACT-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-DENORM-CONTRACT-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-DENORM-CONTRACT-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-DENORM-CONTRACT-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 1, v0
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX11-DENORM-CONTRACT-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-DENORM-CONTRACT-TRUE16-NEXT: global_load_d16_b16 v0, v2, s[2:3] glc dlc
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc
; GFX11-DENORM-CONTRACT-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-DENORM-CONTRACT-TRUE16-NEXT: global_load_d16_hi_b16 v0, v2, s[2:3] offset:2 glc dlc
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: global_load_u16 v2, v0, s[2:3] offset:2 glc dlc
; GFX11-DENORM-CONTRACT-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-DENORM-CONTRACT-TRUE16-NEXT: global_load_d16_b16 v1, v2, s[2:3] offset:4 glc dlc
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: global_load_u16 v3, v0, s[2:3] offset:4 glc dlc
; GFX11-DENORM-CONTRACT-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-DENORM-CONTRACT-TRUE16-NEXT: v_fmac_f16_e32 v1.l, v0.l, v0.h
-; GFX11-DENORM-CONTRACT-TRUE16-NEXT: global_store_b16 v2, v1, s[0:1]
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: v_fmac_f16_e32 v3.l, v1.l, v2.l
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: global_store_b16 v0, v3, s[0:1]
; GFX11-DENORM-CONTRACT-TRUE16-NEXT: s_endpgm
;
; GFX11-DENORM-CONTRACT-FAKE16-LABEL: neg_neg_mad_f16:
@@ -3189,18 +3189,18 @@ define amdgpu_kernel void @mad_fabs_sub_f16(ptr addrspace(1) noalias nocapture %
; GFX11-FLUSH-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-FLUSH-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-FLUSH-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FLUSH-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 1, v0
+; GFX11-FLUSH-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 1, v0
; GFX11-FLUSH-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-FLUSH-TRUE16-NEXT: global_load_d16_b16 v0, v2, s[2:3] glc dlc
+; GFX11-FLUSH-TRUE16-NEXT: global_load_u16 v0, v1, s[2:3] glc dlc
; GFX11-FLUSH-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FLUSH-TRUE16-NEXT: global_load_d16_hi_b16 v0, v2, s[2:3] offset:2 glc dlc
+; GFX11-FLUSH-TRUE16-NEXT: global_load_u16 v2, v1, s[2:3] offset:2 glc dlc
; GFX11-FLUSH-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FLUSH-TRUE16-NEXT: global_load_d16_b16 v1, v2, s[2:3] offset:4 glc dlc
+; GFX11-FLUSH-TRUE16-NEXT: global_load_u16 v3, v1, s[2:3] offset:4 glc dlc
; GFX11-FLUSH-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FLUSH-TRUE16-NEXT: v_mul_f16_e64 v0.l, v0.l, |v0.h|
+; GFX11-FLUSH-TRUE16-NEXT: v_mul_f16_e64 v0.l, v0.l, |v2.l|
; GFX11-FLUSH-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FLUSH-TRUE16-NEXT: v_sub_f16_e32 v0.l, v0.l, v1.l
-; GFX11-FLUSH-TRUE16-NEXT: global_store_b16 v2, v0, s[0:1]
+; GFX11-FLUSH-TRUE16-NEXT: v_sub_f16_e32 v0.l, v0.l, v3.l
+; GFX11-FLUSH-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
; GFX11-FLUSH-TRUE16-NEXT: s_endpgm
;
; GFX11-FLUSH-FAKE16-LABEL: mad_fabs_sub_f16:
@@ -3227,18 +3227,18 @@ define amdgpu_kernel void @mad_fabs_sub_f16(ptr addrspace(1) noalias nocapture %
; GFX11-DENORM-STRICT-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-DENORM-STRICT-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-DENORM-STRICT-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-DENORM-STRICT-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 1, v0
+; GFX11-DENORM-STRICT-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 1, v0
; GFX11-DENORM-STRICT-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-DENORM-STRICT-TRUE16-NEXT: global_load_d16_b16 v0, v2, s[2:3] glc dlc
+; GFX11-DENORM-STRICT-TRUE16-NEXT: global_load_u16 v0, v1, s[2:3] glc dlc
; GFX11-DENORM-STRICT-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-DENORM-STRICT-TRUE16-NEXT: global_load_d16_hi_b16 v0, v2, s[2:3] offset:2 glc dlc
+; GFX11-DENORM-STRICT-TRUE16-NEXT: global_load_u16 v2, v1, s[2:3] offset:2 glc dlc
; GFX11-DENORM-STRICT-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-DENORM-STRICT-TRUE16-NEXT: global_load_d16_b16 v1, v2, s[2:3] offset:4 glc dlc
+; GFX11-DENORM-STRICT-TRUE16-NEXT: global_load_u16 v3, v1, s[2:3] offset:4 glc dlc
; GFX11-DENORM-STRICT-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-DENORM-STRICT-TRUE16-NEXT: v_mul_f16_e64 v0.l, v0.l, |v0.h|
+; GFX11-DENORM-STRICT-TRUE16-NEXT: v_mul_f16_e64 v0.l, v0.l, |v2.l|
; GFX11-DENORM-STRICT-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-DENORM-STRICT-TRUE16-NEXT: v_sub_f16_e32 v0.l, v0.l, v1.l
-; GFX11-DENORM-STRICT-TRUE16-NEXT: global_store_b16 v2, v0, s[0:1]
+; GFX11-DENORM-STRICT-TRUE16-NEXT: v_sub_f16_e32 v0.l, v0.l, v3.l
+; GFX11-DENORM-STRICT-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
; GFX11-DENORM-STRICT-TRUE16-NEXT: s_endpgm
;
; GFX11-DENORM-STRICT-FAKE16-LABEL: mad_fabs_sub_f16:
@@ -3265,16 +3265,16 @@ define amdgpu_kernel void @mad_fabs_sub_f16(ptr addrspace(1) noalias nocapture %
; GFX11-DENORM-CONTRACT-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-DENORM-CONTRACT-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-DENORM-CONTRACT-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-DENORM-CONTRACT-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 1, v0
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 1, v0
; GFX11-DENORM-CONTRACT-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-DENORM-CONTRACT-TRUE16-NEXT: global_load_d16_b16 v0, v2, s[2:3] glc dlc
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: global_load_u16 v0, v1, s[2:3] glc dlc
; GFX11-DENORM-CONTRACT-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-DENORM-CONTRACT-TRUE16-NEXT: global_load_d16_hi_b16 v0, v2, s[2:3] offset:2 glc dlc
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: global_load_u16 v2, v1, s[2:3] offset:2 glc dlc
; GFX11-DENORM-CONTRACT-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-DENORM-CONTRACT-TRUE16-NEXT: global_load_d16_b16 v1, v2, s[2:3] offset:4 glc dlc
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: global_load_u16 v3, v1, s[2:3] offset:4 glc dlc
; GFX11-DENORM-CONTRACT-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-DENORM-CONTRACT-TRUE16-NEXT: v_fma_f16 v0.l, v0.l, |v0.h|, -v1.l
-; GFX11-DENORM-CONTRACT-TRUE16-NEXT: global_store_b16 v2, v0, s[0:1]
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: v_fma_f16 v0.l, v0.l, |v2.l|, -v3.l
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
; GFX11-DENORM-CONTRACT-TRUE16-NEXT: s_endpgm
;
; GFX11-DENORM-CONTRACT-FAKE16-LABEL: mad_fabs_sub_f16:
@@ -3396,13 +3396,13 @@ define amdgpu_kernel void @fsub_c_fadd_a_a_f16(ptr addrspace(1) %out, ptr addrsp
; GFX11-FLUSH-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-FLUSH-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 1, v0
; GFX11-FLUSH-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-FLUSH-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[0:1] glc dlc
+; GFX11-FLUSH-TRUE16-NEXT: global_load_u16 v0, v1, s[0:1] glc dlc
; GFX11-FLUSH-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FLUSH-TRUE16-NEXT: global_load_d16_hi_b16 v0, v1, s[0:1] offset:2 glc dlc
+; GFX11-FLUSH-TRUE16-NEXT: global_load_u16 v2, v1, s[0:1] offset:2 glc dlc
; GFX11-FLUSH-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-FLUSH-TRUE16-NEXT: v_add_f16_e32 v0.l, v0.l, v0.l
; GFX11-FLUSH-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FLUSH-TRUE16-NEXT: v_sub_f16_e32 v0.l, v0.h, v0.l
+; GFX11-FLUSH-TRUE16-NEXT: v_sub_f16_e32 v0.l, v2.l, v0.l
; GFX11-FLUSH-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
; GFX11-FLUSH-TRUE16-NEXT: s_endpgm
;
@@ -3430,13 +3430,13 @@ define amdgpu_kernel void @fsub_c_fadd_a_a_f16(ptr addrspace(1) %out, ptr addrsp
; GFX11-DENORM-STRICT-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-DENORM-STRICT-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 1, v0
; GFX11-DENORM-STRICT-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-DENORM-STRICT-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[0:1] glc dlc
+; GFX11-DENORM-STRICT-TRUE16-NEXT: global_load_u16 v0, v1, s[0:1] glc dlc
; GFX11-DENORM-STRICT-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-DENORM-STRICT-TRUE16-NEXT: global_load_d16_hi_b16 v0, v1, s[0:1] offset:2 glc dlc
+; GFX11-DENORM-STRICT-TRUE16-NEXT: global_load_u16 v2, v1, s[0:1] offset:2 glc dlc
; GFX11-DENORM-STRICT-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-DENORM-STRICT-TRUE16-NEXT: v_add_f16_e32 v0.l, v0.l, v0.l
; GFX11-DENORM-STRICT-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-DENORM-STRICT-TRUE16-NEXT: v_sub_f16_e32 v0.l, v0.h, v0.l
+; GFX11-DENORM-STRICT-TRUE16-NEXT: v_sub_f16_e32 v0.l, v2.l, v0.l
; GFX11-DENORM-STRICT-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
; GFX11-DENORM-STRICT-TRUE16-NEXT: s_endpgm
;
@@ -3462,14 +3462,14 @@ define amdgpu_kernel void @fsub_c_fadd_a_a_f16(ptr addrspace(1) %out, ptr addrsp
; GFX11-DENORM-CONTRACT-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX11-DENORM-CONTRACT-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-DENORM-CONTRACT-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-DENORM-CONTRACT-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 1, v0
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX11-DENORM-CONTRACT-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-DENORM-CONTRACT-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[0:1] glc dlc
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc
; GFX11-DENORM-CONTRACT-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-DENORM-CONTRACT-TRUE16-NEXT: global_load_d16_hi_b16 v0, v1, s[0:1] offset:2 glc dlc
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
; GFX11-DENORM-CONTRACT-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-DENORM-CONTRACT-TRUE16-NEXT: v_fmac_f16_e32 v0.h, -2.0, v0.l
-; GFX11-DENORM-CONTRACT-TRUE16-NEXT: global_store_d16_hi_b16 v1, v0, s[0:1]
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: v_fmac_f16_e32 v2.l, -2.0, v1.l
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: global_store_b16 v0, v2, s[0:1]
; GFX11-DENORM-CONTRACT-TRUE16-NEXT: s_endpgm
;
; GFX11-DENORM-CONTRACT-FAKE16-LABEL: fsub_c_fadd_a_a_f16:
@@ -3586,13 +3586,13 @@ define amdgpu_kernel void @fsub_fadd_a_a_c_f16(ptr addrspace(1) %out, ptr addrsp
; GFX11-FLUSH-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-FLUSH-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 1, v0
; GFX11-FLUSH-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-FLUSH-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[0:1] glc dlc
+; GFX11-FLUSH-TRUE16-NEXT: global_load_u16 v0, v1, s[0:1] glc dlc
; GFX11-FLUSH-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FLUSH-TRUE16-NEXT: global_load_d16_hi_b16 v0, v1, s[0:1] offset:2 glc dlc
+; GFX11-FLUSH-TRUE16-NEXT: global_load_u16 v2, v1, s[0:1] offset:2 glc dlc
; GFX11-FLUSH-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-FLUSH-TRUE16-NEXT: v_add_f16_e32 v0.l, v0.l, v0.l
; GFX11-FLUSH-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FLUSH-TRUE16-NEXT: v_sub_f16_e32 v0.l, v0.l, v0.h
+; GFX11-FLUSH-TRUE16-NEXT: v_sub_f16_e32 v0.l, v0.l, v2.l
; GFX11-FLUSH-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
; GFX11-FLUSH-TRUE16-NEXT: s_endpgm
;
@@ -3620,13 +3620,13 @@ define amdgpu_kernel void @fsub_fadd_a_a_c_f16(ptr addrspace(1) %out, ptr addrsp
; GFX11-DENORM-STRICT-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-DENORM-STRICT-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 1, v0
; GFX11-DENORM-STRICT-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-DENORM-STRICT-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[0:1] glc dlc
+; GFX11-DENORM-STRICT-TRUE16-NEXT: global_load_u16 v0, v1, s[0:1] glc dlc
; GFX11-DENORM-STRICT-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-DENORM-STRICT-TRUE16-NEXT: global_load_d16_hi_b16 v0, v1, s[0:1] offset:2 glc dlc
+; GFX11-DENORM-STRICT-TRUE16-NEXT: global_load_u16 v2, v1, s[0:1] offset:2 glc dlc
; GFX11-DENORM-STRICT-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-DENORM-STRICT-TRUE16-NEXT: v_add_f16_e32 v0.l, v0.l, v0.l
; GFX11-DENORM-STRICT-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-DENORM-STRICT-TRUE16-NEXT: v_sub_f16_e32 v0.l, v0.l, v0.h
+; GFX11-DENORM-STRICT-TRUE16-NEXT: v_sub_f16_e32 v0.l, v0.l, v2.l
; GFX11-DENORM-STRICT-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
; GFX11-DENORM-STRICT-TRUE16-NEXT: s_endpgm
;
@@ -3654,11 +3654,11 @@ define amdgpu_kernel void @fsub_fadd_a_a_c_f16(ptr addrspace(1) %out, ptr addrsp
; GFX11-DENORM-CONTRACT-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-DENORM-CONTRACT-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 1, v0
; GFX11-DENORM-CONTRACT-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-DENORM-CONTRACT-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[0:1] glc dlc
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: global_load_u16 v0, v1, s[0:1] glc dlc
; GFX11-DENORM-CONTRACT-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-DENORM-CONTRACT-TRUE16-NEXT: global_load_d16_hi_b16 v0, v1, s[0:1] offset:2 glc dlc
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: global_load_u16 v2, v1, s[0:1] offset:2 glc dlc
; GFX11-DENORM-CONTRACT-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-DENORM-CONTRACT-TRUE16-NEXT: v_fma_f16 v0.l, v0.l, 2.0, -v0.h
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: v_fma_f16 v0.l, v0.l, 2.0, -v2.l
; GFX11-DENORM-CONTRACT-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
; GFX11-DENORM-CONTRACT-TRUE16-NEXT: s_endpgm
;
diff --git a/llvm/test/CodeGen/AMDGPU/fneg-fabs.bf16.ll b/llvm/test/CodeGen/AMDGPU/fneg-fabs.bf16.ll
index 64a9727330cfd..a3fa2f46538d2 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg-fabs.bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg-fabs.bf16.ll
@@ -399,7 +399,7 @@ define amdgpu_kernel void @v_fneg_fabs_bf16(ptr addrspace(1) %out, ptr addrspace
; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[2:3]
+; GFX11-TRUE16-NEXT: global_load_u16 v0, v1, s[2:3]
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, 0x8000, v0
; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
diff --git a/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll b/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll
index 9d9a851a5507e..46212d8312d90 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll
@@ -348,7 +348,7 @@ define amdgpu_kernel void @v_fneg_fabs_f16(ptr addrspace(1) %out, ptr addrspace(
; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[2:3]
+; GFX11-TRUE16-NEXT: global_load_u16 v0, v1, s[2:3]
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, 0x8000, v0
; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
diff --git a/llvm/test/CodeGen/AMDGPU/fneg.bf16.ll b/llvm/test/CodeGen/AMDGPU/fneg.bf16.ll
index d232693b46ad9..eada9d55a75b2 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg.bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg.bf16.ll
@@ -134,7 +134,7 @@ define amdgpu_kernel void @v_fneg_bf16(ptr addrspace(1) %out, ptr addrspace(1) %
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 1, v0
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[0:1]
+; GFX11-TRUE16-NEXT: global_load_u16 v0, v1, s[0:1]
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_xor_b16 v0.l, 0x8000, v0.l
; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
@@ -303,51 +303,28 @@ define amdgpu_kernel void @v_fneg_fold_bf16(ptr addrspace(1) %out, ptr addrspace
; GFX9-NEXT: global_store_short_d16_hi v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
-; GFX11-TRUE16-LABEL: v_fneg_fold_bf16:
-; GFX11-TRUE16: ; %bb.0:
-; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, 0
-; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT: global_load_d16_b16 v1, v0, s[2:3]
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_xor_b32_e32 v2, 0x8000, v1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-TRUE16-NEXT: v_mul_f32_e32 v1, v2, v1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, v2, v1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
-; GFX11-TRUE16-NEXT: global_store_d16_hi_b16 v0, v1, s[0:1]
-; GFX11-TRUE16-NEXT: s_endpgm
-;
-; GFX11-FAKE16-LABEL: v_fneg_fold_bf16:
-; GFX11-FAKE16: ; %bb.0:
-; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
-; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, 0
-; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-FAKE16-NEXT: global_load_u16 v1, v0, s[2:3]
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT: v_xor_b32_e32 v2, 0x8000, v1
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-FAKE16-NEXT: v_mul_f32_e32 v1, v2, v1
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_bfe_u32 v2, v1, 16, 1
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, v2, v1
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
-; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
-; GFX11-FAKE16-NEXT: global_store_d16_hi_b16 v0, v1, s[0:1]
-; GFX11-FAKE16-NEXT: s_endpgm
+; GFX11-LABEL: v_fneg_fold_bf16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: global_load_u16 v1, v0, s[2:3]
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_xor_b32_e32 v2, 0x8000, v1
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-NEXT: v_mul_f32_e32 v1, v2, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-NEXT: global_store_d16_hi_b16 v0, v1, s[0:1]
+; GFX11-NEXT: s_endpgm
%val = load bfloat, ptr addrspace(1) %in
%fsub = fsub bfloat -0.0, %val
%fmul = fmul bfloat %fsub, %val
diff --git a/llvm/test/CodeGen/AMDGPU/fneg.f16.ll b/llvm/test/CodeGen/AMDGPU/fneg.f16.ll
index cab27fca5ab0a..b0213dd33ee36 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg.f16.ll
@@ -132,7 +132,7 @@ define amdgpu_kernel void @v_fneg_f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 1, v0
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[0:1]
+; GFX11-TRUE16-NEXT: global_load_u16 v0, v1, s[0:1]
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_xor_b16 v0.l, 0x8000, v0.l
; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
@@ -285,7 +285,7 @@ define amdgpu_kernel void @v_fneg_fold_f16(ptr addrspace(1) %out, ptr addrspace(
; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[2:3]
+; GFX11-TRUE16-NEXT: global_load_u16 v0, v1, s[2:3]
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_mul_f16_e64 v0.l, -v0.l, v0.l
; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
diff --git a/llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll b/llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll
index f2fe61f5376e4..fadb6c0f4657a 100644
--- a/llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll
+++ b/llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll
@@ -1,7 +1,7 @@
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri -mattr=-promote-alloca < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CI,MUBUF %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=-promote-alloca < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,GFX9-MUBUF,MUBUF %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=-promote-alloca,+enable-flat-scratch < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,GFX9-FLATSCR %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck --check-prefixes=GFX11-TRUE16 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+real-true16,+real-true-d16 < %s | FileCheck --check-prefixes=GFX11-TRUE16 %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck --check-prefixes=GFX11-FAKE16 %s
; Test that non-entry function frame indices are expanded properly to
diff --git a/llvm/test/CodeGen/AMDGPU/freeze.ll b/llvm/test/CodeGen/AMDGPU/freeze.ll
index 308e86bbaf8fd..350d93e3e55e8 100644
--- a/llvm/test/CodeGen/AMDGPU/freeze.ll
+++ b/llvm/test/CodeGen/AMDGPU/freeze.ll
@@ -5563,29 +5563,13 @@ define void @freeze_i16(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) {
; GFX10-NEXT: global_store_short v[2:3], v0, off
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-SDAG-TRUE16-LABEL: freeze_i16:
-; GFX11-SDAG-TRUE16: ; %bb.0:
-; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: global_load_d16_b16 v0, v[0:1], off
-; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: global_store_b16 v[2:3], v0, off
-; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-SDAG-FAKE16-LABEL: freeze_i16:
-; GFX11-SDAG-FAKE16: ; %bb.0:
-; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: global_load_u16 v0, v[0:1], off
-; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: global_store_b16 v[2:3], v0, off
-; GFX11-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-GISEL-LABEL: freeze_i16:
-; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT: global_load_u16 v0, v[0:1], off
-; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX11-GISEL-NEXT: global_store_b16 v[2:3], v0, off
-; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
+; GFX11-LABEL: freeze_i16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: global_load_u16 v0, v[0:1], off
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: global_store_b16 v[2:3], v0, off
+; GFX11-NEXT: s_setpc_b64 s[30:31]
%a = load i16, ptr addrspace(1) %ptra
%freeze = freeze i16 %a
store i16 %freeze, ptr addrspace(1) %ptrb
@@ -6214,29 +6198,13 @@ define void @freeze_f16(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) {
; GFX10-NEXT: global_store_short v[2:3], v0, off
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-SDAG-TRUE16-LABEL: freeze_f16:
-; GFX11-SDAG-TRUE16: ; %bb.0:
-; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: global_load_d16_b16 v0, v[0:1], off
-; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: global_store_b16 v[2:3], v0, off
-; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-SDAG-FAKE16-LABEL: freeze_f16:
-; GFX11-SDAG-FAKE16: ; %bb.0:
-; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: global_load_u16 v0, v[0:1], off
-; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: global_store_b16 v[2:3], v0, off
-; GFX11-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-GISEL-LABEL: freeze_f16:
-; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT: global_load_u16 v0, v[0:1], off
-; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX11-GISEL-NEXT: global_store_b16 v[2:3], v0, off
-; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
+; GFX11-LABEL: freeze_f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: global_load_u16 v0, v[0:1], off
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: global_store_b16 v[2:3], v0, off
+; GFX11-NEXT: s_setpc_b64 s[30:31]
%a = load half, ptr addrspace(1) %ptra
%freeze = freeze half %a
store half %freeze, ptr addrspace(1) %ptrb
@@ -6871,29 +6839,13 @@ define void @freeze_bf16(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) {
; GFX10-NEXT: global_store_short v[2:3], v0, off
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-SDAG-TRUE16-LABEL: freeze_bf16:
-; GFX11-SDAG-TRUE16: ; %bb.0:
-; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: global_load_d16_b16 v0, v[0:1], off
-; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: global_store_b16 v[2:3], v0, off
-; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-SDAG-FAKE16-LABEL: freeze_bf16:
-; GFX11-SDAG-FAKE16: ; %bb.0:
-; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: global_load_u16 v0, v[0:1], off
-; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: global_store_b16 v[2:3], v0, off
-; GFX11-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-GISEL-LABEL: freeze_bf16:
-; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT: global_load_u16 v0, v[0:1], off
-; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX11-GISEL-NEXT: global_store_b16 v[2:3], v0, off
-; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
+; GFX11-LABEL: freeze_bf16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: global_load_u16 v0, v[0:1], off
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: global_store_b16 v[2:3], v0, off
+; GFX11-NEXT: s_setpc_b64 s[30:31]
%a = load bfloat, ptr addrspace(1) %ptra
%freeze = freeze bfloat %a
store bfloat %freeze, ptr addrspace(1) %ptrb
@@ -12151,29 +12103,13 @@ define void @freeze_i8(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) {
; GFX10-NEXT: global_store_byte v[2:3], v0, off
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-SDAG-TRUE16-LABEL: freeze_i8:
-; GFX11-SDAG-TRUE16: ; %bb.0:
-; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: global_load_d16_u8 v0, v[0:1], off
-; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: global_store_b8 v[2:3], v0, off
-; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-SDAG-FAKE16-LABEL: freeze_i8:
-; GFX11-SDAG-FAKE16: ; %bb.0:
-; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: global_load_u8 v0, v[0:1], off
-; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: global_store_b8 v[2:3], v0, off
-; GFX11-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-GISEL-LABEL: freeze_i8:
-; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT: global_load_u8 v0, v[0:1], off
-; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX11-GISEL-NEXT: global_store_b8 v[2:3], v0, off
-; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
+; GFX11-LABEL: freeze_i8:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: global_load_u8 v0, v[0:1], off
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: global_store_b8 v[2:3], v0, off
+; GFX11-NEXT: s_setpc_b64 s[30:31]
%a = load i8, ptr addrspace(1) %ptra
%freeze = freeze i8 %a
store i8 %freeze, ptr addrspace(1) %ptrb
@@ -12287,21 +12223,13 @@ define void @freeze_v2i8(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) {
; GFX10-GISEL-NEXT: global_store_short v[2:3], v0, off
; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-SDAG-TRUE16-LABEL: freeze_v2i8:
-; GFX11-SDAG-TRUE16: ; %bb.0:
-; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: global_load_d16_b16 v0, v[0:1], off
-; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: global_store_b16 v[2:3], v0, off
-; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-SDAG-FAKE16-LABEL: freeze_v2i8:
-; GFX11-SDAG-FAKE16: ; %bb.0:
-; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: global_load_u16 v0, v[0:1], off
-; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: global_store_b16 v[2:3], v0, off
-; GFX11-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-SDAG-LABEL: freeze_v2i8:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT: global_load_u16 v0, v[0:1], off
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX11-SDAG-NEXT: global_store_b16 v[2:3], v0, off
+; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-GISEL-LABEL: freeze_v2i8:
; GFX11-GISEL: ; %bb.0:
@@ -13451,7 +13379,7 @@ define void @freeze_v2i1(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) {
; GFX11-SDAG-TRUE16-LABEL: freeze_v2i1:
; GFX11-SDAG-TRUE16: ; %bb.0:
; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: global_load_d16_u8 v0, v[0:1], off
+; GFX11-SDAG-TRUE16-NEXT: global_load_u8 v0, v[0:1], off
; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-SDAG-TRUE16-NEXT: v_and_b16 v0.l, v0.l, 3
; GFX11-SDAG-TRUE16-NEXT: global_store_b8 v[2:3], v0, off
@@ -13626,7 +13554,7 @@ define void @freeze_v3i1(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) {
; GFX11-SDAG-TRUE16-LABEL: freeze_v3i1:
; GFX11-SDAG-TRUE16: ; %bb.0:
; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: global_load_d16_u8 v0, v[0:1], off
+; GFX11-SDAG-TRUE16-NEXT: global_load_u8 v0, v[0:1], off
; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-SDAG-TRUE16-NEXT: v_and_b16 v0.l, v0.l, 7
; GFX11-SDAG-TRUE16-NEXT: global_store_b8 v[2:3], v0, off
diff --git a/llvm/test/CodeGen/AMDGPU/frem.ll b/llvm/test/CodeGen/AMDGPU/frem.ll
index 35913b9a21d30..ffdd2cde515b5 100644
--- a/llvm/test/CodeGen/AMDGPU/frem.ll
+++ b/llvm/test/CodeGen/AMDGPU/frem.ll
@@ -189,37 +189,35 @@ define amdgpu_kernel void @frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1
; GFX11-TRUE16-NEXT: s_clause 0x1
; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: s_clause 0x1
-; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v2, s[2:3]
-; GFX11-TRUE16-NEXT: global_load_d16_b16 v1, v2, s[4:5] offset:8
+; GFX11-TRUE16-NEXT: global_load_u16 v2, v1, s[2:3]
+; GFX11-TRUE16-NEXT: global_load_u16 v3, v1, s[4:5] offset:8
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
-; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v3, v0.l
+; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v2.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v4, v1.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v1.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v0.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v4, v3.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_rcp_f32_e32 v4, v4
; GFX11-TRUE16-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-TRUE16-NEXT: v_mul_f32_e32 v3, v3, v4
-; GFX11-TRUE16-NEXT: v_fma_mix_f32 v7, -v5, v3, v6 op_sel_hi:[1,0,1]
+; GFX11-TRUE16-NEXT: v_mul_f32_e32 v0, v0, v4
+; GFX11-TRUE16-NEXT: v_fma_mix_f32 v5, -v3, v0, v2 op_sel_hi:[1,0,1]
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_fmac_f32_e32 v3, v7, v4
-; GFX11-TRUE16-NEXT: v_fma_mix_f32 v5, -v5, v3, v6 op_sel_hi:[1,0,1]
+; GFX11-TRUE16-NEXT: v_fmac_f32_e32 v0, v5, v4
+; GFX11-TRUE16-NEXT: v_fma_mix_f32 v5, -v3, v0, v2 op_sel_hi:[1,0,1]
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_mul_f32_e32 v4, v5, v4
; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff800000, v4
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, v4, v3
-; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.h, v3
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, v4, v0
+; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_div_fixup_f16 v0.h, v0.h, v1.l, v0.l
-; GFX11-TRUE16-NEXT: v_trunc_f16_e32 v0.h, v0.h
+; GFX11-TRUE16-NEXT: v_div_fixup_f16 v0.l, v0.l, v3.l, v2.l
+; GFX11-TRUE16-NEXT: v_trunc_f16_e32 v0.l, v0.l
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_fma_f16 v0.l, -v0.h, v1.l, v0.l
-; GFX11-TRUE16-NEXT: global_store_b16 v2, v0, s[0:1]
+; GFX11-TRUE16-NEXT: v_fma_f16 v0.l, -v0.l, v3.l, v2.l
+; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
; GFX11-TRUE16-NEXT: s_endpgm
;
; GFX11-FAKE16-LABEL: frem_f16:
@@ -266,35 +264,34 @@ define amdgpu_kernel void @frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1
; GFX1150-TRUE16-NEXT: v_mov_b32_e32 v2, 0
; GFX1150-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX1150-TRUE16-NEXT: s_clause 0x1
-; GFX1150-TRUE16-NEXT: global_load_d16_b16 v0, v2, s[2:3]
-; GFX1150-TRUE16-NEXT: global_load_d16_b16 v1, v2, s[4:5] offset:8
+; GFX1150-TRUE16-NEXT: global_load_u16 v3, v2, s[2:3]
+; GFX1150-TRUE16-NEXT: global_load_u16 v4, v2, s[4:5] offset:8
; GFX1150-TRUE16-NEXT: s_waitcnt vmcnt(1)
-; GFX1150-TRUE16-NEXT: v_cvt_f32_f16_e32 v3, v0.l
+; GFX1150-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v3.l
; GFX1150-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX1150-TRUE16-NEXT: v_cvt_f32_f16_e32 v4, v1.l
-; GFX1150-TRUE16-NEXT: v_mov_b16_e32 v5.l, v1.l
-; GFX1150-TRUE16-NEXT: v_mov_b16_e32 v6.l, v0.l
-; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(TRANS32_DEP_1)
-; GFX1150-TRUE16-NEXT: v_rcp_f32_e32 v4, v4
-; GFX1150-TRUE16-NEXT: v_mul_f32_e32 v3, v3, v4
-; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-TRUE16-NEXT: v_fma_mix_f32 v7, -v5, v3, v6 op_sel_hi:[1,0,1]
-; GFX1150-TRUE16-NEXT: v_fmac_f32_e32 v3, v7, v4
+; GFX1150-TRUE16-NEXT: v_cvt_f32_f16_e32 v1, v4.l
+; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1)
+; GFX1150-TRUE16-NEXT: v_rcp_f32_e32 v1, v1
+; GFX1150-TRUE16-NEXT: v_mul_f32_e32 v0, v0, v1
; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-TRUE16-NEXT: v_fma_mix_f32 v5, -v5, v3, v6 op_sel_hi:[1,0,1]
-; GFX1150-TRUE16-NEXT: v_mul_f32_e32 v4, v5, v4
+; GFX1150-TRUE16-NEXT: v_fma_mix_f32 v5, -v4, v0, v3 op_sel_hi:[1,0,1]
+; GFX1150-TRUE16-NEXT: v_fmac_f32_e32 v0, v5, v1
; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-TRUE16-NEXT: v_and_b32_e32 v4, 0xff800000, v4
-; GFX1150-TRUE16-NEXT: v_add_f32_e32 v3, v4, v3
+; GFX1150-TRUE16-NEXT: v_fma_mix_f32 v5, -v4, v0, v3 op_sel_hi:[1,0,1]
+; GFX1150-TRUE16-NEXT: v_mul_f32_e32 v1, v5, v1
; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.h, v3
-; GFX1150-TRUE16-NEXT: v_div_fixup_f16 v0.h, v0.h, v1.l, v0.l
+; GFX1150-TRUE16-NEXT: v_and_b32_e32 v1, 0xff800000, v1
+; GFX1150-TRUE16-NEXT: v_add_f32_e32 v0, v1, v0
+; GFX1150-TRUE16-NEXT: v_mov_b16_e32 v1.l, v3.l
+; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0
+; GFX1150-TRUE16-NEXT: v_div_fixup_f16 v0.l, v0.l, v4.l, v1.l
; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-TRUE16-NEXT: v_trunc_f16_e32 v3.l, v0.h
-; GFX1150-TRUE16-NEXT: v_xor_b32_e32 v3, 0x8000, v3
+; GFX1150-TRUE16-NEXT: v_trunc_f16_e32 v0.l, v0.l
+; GFX1150-TRUE16-NEXT: v_xor_b32_e32 v0, 0x8000, v0
; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1150-TRUE16-NEXT: v_fmac_f16_e32 v0.l, v3.l, v1.l
-; GFX1150-TRUE16-NEXT: global_store_b16 v2, v0, s[0:1]
+; GFX1150-TRUE16-NEXT: v_fmac_f16_e32 v3.l, v0.l, v4.l
+; GFX1150-TRUE16-NEXT: global_store_b16 v2, v3, s[0:1]
; GFX1150-TRUE16-NEXT: s_endpgm
;
; GFX1150-FAKE16-LABEL: frem_f16:
@@ -342,35 +339,34 @@ define amdgpu_kernel void @frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1
; GFX1200-TRUE16-NEXT: v_mov_b32_e32 v2, 0
; GFX1200-TRUE16-NEXT: s_wait_kmcnt 0x0
; GFX1200-TRUE16-NEXT: s_clause 0x1
-; GFX1200-TRUE16-NEXT: global_load_d16_b16 v0, v2, s[2:3]
-; GFX1200-TRUE16-NEXT: global_load_d16_b16 v1, v2, s[4:5] offset:8
+; GFX1200-TRUE16-NEXT: global_load_u16 v3, v2, s[2:3]
+; GFX1200-TRUE16-NEXT: global_load_u16 v4, v2, s[4:5] offset:8
; GFX1200-TRUE16-NEXT: s_wait_loadcnt 0x1
-; GFX1200-TRUE16-NEXT: v_cvt_f32_f16_e32 v3, v0.l
+; GFX1200-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v3.l
; GFX1200-TRUE16-NEXT: s_wait_loadcnt 0x0
-; GFX1200-TRUE16-NEXT: v_cvt_f32_f16_e32 v4, v1.l
-; GFX1200-TRUE16-NEXT: v_mov_b16_e32 v5.l, v1.l
-; GFX1200-TRUE16-NEXT: v_mov_b16_e32 v6.l, v0.l
-; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(TRANS32_DEP_1)
-; GFX1200-TRUE16-NEXT: v_rcp_f32_e32 v4, v4
-; GFX1200-TRUE16-NEXT: v_mul_f32_e32 v3, v3, v4
-; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1200-TRUE16-NEXT: v_fma_mix_f32 v7, -v5, v3, v6 op_sel_hi:[1,0,1]
-; GFX1200-TRUE16-NEXT: v_fmac_f32_e32 v3, v7, v4
+; GFX1200-TRUE16-NEXT: v_cvt_f32_f16_e32 v1, v4.l
+; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1)
+; GFX1200-TRUE16-NEXT: v_rcp_f32_e32 v1, v1
+; GFX1200-TRUE16-NEXT: v_mul_f32_e32 v0, v0, v1
; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1200-TRUE16-NEXT: v_fma_mix_f32 v5, -v5, v3, v6 op_sel_hi:[1,0,1]
-; GFX1200-TRUE16-NEXT: v_mul_f32_e32 v4, v5, v4
+; GFX1200-TRUE16-NEXT: v_fma_mix_f32 v5, -v4, v0, v3 op_sel_hi:[1,0,1]
+; GFX1200-TRUE16-NEXT: v_fmac_f32_e32 v0, v5, v1
; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1200-TRUE16-NEXT: v_and_b32_e32 v4, 0xff800000, v4
-; GFX1200-TRUE16-NEXT: v_add_f32_e32 v3, v4, v3
+; GFX1200-TRUE16-NEXT: v_fma_mix_f32 v5, -v4, v0, v3 op_sel_hi:[1,0,1]
+; GFX1200-TRUE16-NEXT: v_mul_f32_e32 v1, v5, v1
; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1200-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.h, v3
-; GFX1200-TRUE16-NEXT: v_div_fixup_f16 v0.h, v0.h, v1.l, v0.l
+; GFX1200-TRUE16-NEXT: v_and_b32_e32 v1, 0xff800000, v1
+; GFX1200-TRUE16-NEXT: v_add_f32_e32 v0, v1, v0
+; GFX1200-TRUE16-NEXT: v_mov_b16_e32 v1.l, v3.l
+; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0
+; GFX1200-TRUE16-NEXT: v_div_fixup_f16 v0.l, v0.l, v4.l, v1.l
; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1200-TRUE16-NEXT: v_trunc_f16_e32 v3.l, v0.h
-; GFX1200-TRUE16-NEXT: v_xor_b32_e32 v3, 0x8000, v3
+; GFX1200-TRUE16-NEXT: v_trunc_f16_e32 v0.l, v0.l
+; GFX1200-TRUE16-NEXT: v_xor_b32_e32 v0, 0x8000, v0
; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1200-TRUE16-NEXT: v_fmac_f16_e32 v0.l, v3.l, v1.l
-; GFX1200-TRUE16-NEXT: global_store_b16 v2, v0, s[0:1]
+; GFX1200-TRUE16-NEXT: v_fmac_f16_e32 v3.l, v0.l, v4.l
+; GFX1200-TRUE16-NEXT: global_store_b16 v2, v3, s[0:1]
; GFX1200-TRUE16-NEXT: s_endpgm
;
; GFX1200-FAKE16-LABEL: frem_f16:
@@ -539,19 +535,19 @@ define amdgpu_kernel void @fast_frem_f16(ptr addrspace(1) %out, ptr addrspace(1)
; GFX11-TRUE16-NEXT: s_clause 0x1
; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: s_clause 0x1
-; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v2, s[2:3]
-; GFX11-TRUE16-NEXT: global_load_d16_hi_b16 v0, v2, s[4:5] offset:8
+; GFX11-TRUE16-NEXT: global_load_u16 v2, v1, s[2:3]
+; GFX11-TRUE16-NEXT: global_load_u16 v3, v1, s[4:5] offset:8
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_rcp_f16_e32 v1.l, v0.h
+; GFX11-TRUE16-NEXT: v_rcp_f16_e32 v0.l, v3.l
; GFX11-TRUE16-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-TRUE16-NEXT: v_mul_f16_e32 v1.l, v0.l, v1.l
+; GFX11-TRUE16-NEXT: v_mul_f16_e32 v0.l, v2.l, v0.l
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_trunc_f16_e32 v1.l, v1.l
-; GFX11-TRUE16-NEXT: v_fma_f16 v0.l, -v1.l, v0.h, v0.l
-; GFX11-TRUE16-NEXT: global_store_b16 v2, v0, s[0:1]
+; GFX11-TRUE16-NEXT: v_trunc_f16_e32 v0.l, v0.l
+; GFX11-TRUE16-NEXT: v_fma_f16 v0.l, -v0.l, v3.l, v2.l
+; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
; GFX11-TRUE16-NEXT: s_endpgm
;
; GFX11-FAKE16-LABEL: fast_frem_f16:
@@ -579,20 +575,20 @@ define amdgpu_kernel void @fast_frem_f16(ptr addrspace(1) %out, ptr addrspace(1)
; GFX1150-TRUE16-NEXT: s_clause 0x1
; GFX1150-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX1150-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
-; GFX1150-TRUE16-NEXT: v_mov_b32_e32 v2, 0
+; GFX1150-TRUE16-NEXT: v_mov_b32_e32 v1, 0
; GFX1150-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX1150-TRUE16-NEXT: s_clause 0x1
-; GFX1150-TRUE16-NEXT: global_load_d16_b16 v0, v2, s[2:3]
-; GFX1150-TRUE16-NEXT: global_load_d16_hi_b16 v0, v2, s[4:5] offset:8
+; GFX1150-TRUE16-NEXT: global_load_u16 v2, v1, s[2:3]
+; GFX1150-TRUE16-NEXT: global_load_u16 v3, v1, s[4:5] offset:8
; GFX1150-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX1150-TRUE16-NEXT: v_rcp_f16_e32 v1.l, v0.h
+; GFX1150-TRUE16-NEXT: v_rcp_f16_e32 v0.l, v3.l
; GFX1150-TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-TRUE16-NEXT: v_mul_f16_e32 v1.l, v0.l, v1.l
-; GFX1150-TRUE16-NEXT: v_trunc_f16_e32 v1.l, v1.l
+; GFX1150-TRUE16-NEXT: v_mul_f16_e32 v0.l, v2.l, v0.l
+; GFX1150-TRUE16-NEXT: v_trunc_f16_e32 v0.l, v0.l
; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-TRUE16-NEXT: v_xor_b32_e32 v1, 0x8000, v1
-; GFX1150-TRUE16-NEXT: v_fmac_f16_e32 v0.l, v1.l, v0.h
-; GFX1150-TRUE16-NEXT: global_store_b16 v2, v0, s[0:1]
+; GFX1150-TRUE16-NEXT: v_xor_b32_e32 v0, 0x8000, v0
+; GFX1150-TRUE16-NEXT: v_fmac_f16_e32 v2.l, v0.l, v3.l
+; GFX1150-TRUE16-NEXT: global_store_b16 v1, v2, s[0:1]
; GFX1150-TRUE16-NEXT: s_endpgm
;
; GFX1150-FAKE16-LABEL: fast_frem_f16:
@@ -621,20 +617,20 @@ define amdgpu_kernel void @fast_frem_f16(ptr addrspace(1) %out, ptr addrspace(1)
; GFX1200-TRUE16-NEXT: s_clause 0x1
; GFX1200-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX1200-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
-; GFX1200-TRUE16-NEXT: v_mov_b32_e32 v2, 0
+; GFX1200-TRUE16-NEXT: v_mov_b32_e32 v1, 0
; GFX1200-TRUE16-NEXT: s_wait_kmcnt 0x0
; GFX1200-TRUE16-NEXT: s_clause 0x1
-; GFX1200-TRUE16-NEXT: global_load_d16_b16 v0, v2, s[2:3]
-; GFX1200-TRUE16-NEXT: global_load_d16_hi_b16 v0, v2, s[4:5] offset:8
+; GFX1200-TRUE16-NEXT: global_load_u16 v2, v1, s[2:3]
+; GFX1200-TRUE16-NEXT: global_load_u16 v3, v1, s[4:5] offset:8
; GFX1200-TRUE16-NEXT: s_wait_loadcnt 0x0
-; GFX1200-TRUE16-NEXT: v_rcp_f16_e32 v1.l, v0.h
+; GFX1200-TRUE16-NEXT: v_rcp_f16_e32 v0.l, v3.l
; GFX1200-TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1200-TRUE16-NEXT: v_mul_f16_e32 v1.l, v0.l, v1.l
-; GFX1200-TRUE16-NEXT: v_trunc_f16_e32 v1.l, v1.l
+; GFX1200-TRUE16-NEXT: v_mul_f16_e32 v0.l, v2.l, v0.l
+; GFX1200-TRUE16-NEXT: v_trunc_f16_e32 v0.l, v0.l
; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1200-TRUE16-NEXT: v_xor_b32_e32 v1, 0x8000, v1
-; GFX1200-TRUE16-NEXT: v_fmac_f16_e32 v0.l, v1.l, v0.h
-; GFX1200-TRUE16-NEXT: global_store_b16 v2, v0, s[0:1]
+; GFX1200-TRUE16-NEXT: v_xor_b32_e32 v0, 0x8000, v0
+; GFX1200-TRUE16-NEXT: v_fmac_f16_e32 v2.l, v0.l, v3.l
+; GFX1200-TRUE16-NEXT: global_store_b16 v1, v2, s[0:1]
; GFX1200-TRUE16-NEXT: s_endpgm
;
; GFX1200-FAKE16-LABEL: fast_frem_f16:
@@ -787,19 +783,19 @@ define amdgpu_kernel void @unsafe_frem_f16(ptr addrspace(1) %out, ptr addrspace(
; GFX11-TRUE16-NEXT: s_clause 0x1
; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: s_clause 0x1
-; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v2, s[2:3]
-; GFX11-TRUE16-NEXT: global_load_d16_hi_b16 v0, v2, s[4:5] offset:8
+; GFX11-TRUE16-NEXT: global_load_u16 v2, v1, s[2:3]
+; GFX11-TRUE16-NEXT: global_load_u16 v3, v1, s[4:5] offset:8
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_rcp_f16_e32 v1.l, v0.h
+; GFX11-TRUE16-NEXT: v_rcp_f16_e32 v0.l, v3.l
; GFX11-TRUE16-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-TRUE16-NEXT: v_mul_f16_e32 v1.l, v0.l, v1.l
+; GFX11-TRUE16-NEXT: v_mul_f16_e32 v0.l, v2.l, v0.l
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_trunc_f16_e32 v1.l, v1.l
-; GFX11-TRUE16-NEXT: v_fma_f16 v0.l, -v1.l, v0.h, v0.l
-; GFX11-TRUE16-NEXT: global_store_b16 v2, v0, s[0:1]
+; GFX11-TRUE16-NEXT: v_trunc_f16_e32 v0.l, v0.l
+; GFX11-TRUE16-NEXT: v_fma_f16 v0.l, -v0.l, v3.l, v2.l
+; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
; GFX11-TRUE16-NEXT: s_endpgm
;
; GFX11-FAKE16-LABEL: unsafe_frem_f16:
@@ -827,20 +823,20 @@ define amdgpu_kernel void @unsafe_frem_f16(ptr addrspace(1) %out, ptr addrspace(
; GFX1150-TRUE16-NEXT: s_clause 0x1
; GFX1150-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX1150-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
-; GFX1150-TRUE16-NEXT: v_mov_b32_e32 v2, 0
+; GFX1150-TRUE16-NEXT: v_mov_b32_e32 v1, 0
; GFX1150-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX1150-TRUE16-NEXT: s_clause 0x1
-; GFX1150-TRUE16-NEXT: global_load_d16_b16 v0, v2, s[2:3]
-; GFX1150-TRUE16-NEXT: global_load_d16_hi_b16 v0, v2, s[4:5] offset:8
+; GFX1150-TRUE16-NEXT: global_load_u16 v2, v1, s[2:3]
+; GFX1150-TRUE16-NEXT: global_load_u16 v3, v1, s[4:5] offset:8
; GFX1150-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX1150-TRUE16-NEXT: v_rcp_f16_e32 v1.l, v0.h
+; GFX1150-TRUE16-NEXT: v_rcp_f16_e32 v0.l, v3.l
; GFX1150-TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-TRUE16-NEXT: v_mul_f16_e32 v1.l, v0.l, v1.l
-; GFX1150-TRUE16-NEXT: v_trunc_f16_e32 v1.l, v1.l
+; GFX1150-TRUE16-NEXT: v_mul_f16_e32 v0.l, v2.l, v0.l
+; GFX1150-TRUE16-NEXT: v_trunc_f16_e32 v0.l, v0.l
; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-TRUE16-NEXT: v_xor_b32_e32 v1, 0x8000, v1
-; GFX1150-TRUE16-NEXT: v_fmac_f16_e32 v0.l, v1.l, v0.h
-; GFX1150-TRUE16-NEXT: global_store_b16 v2, v0, s[0:1]
+; GFX1150-TRUE16-NEXT: v_xor_b32_e32 v0, 0x8000, v0
+; GFX1150-TRUE16-NEXT: v_fmac_f16_e32 v2.l, v0.l, v3.l
+; GFX1150-TRUE16-NEXT: global_store_b16 v1, v2, s[0:1]
; GFX1150-TRUE16-NEXT: s_endpgm
;
; GFX1150-FAKE16-LABEL: unsafe_frem_f16:
@@ -869,20 +865,20 @@ define amdgpu_kernel void @unsafe_frem_f16(ptr addrspace(1) %out, ptr addrspace(
; GFX1200-TRUE16-NEXT: s_clause 0x1
; GFX1200-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX1200-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
-; GFX1200-TRUE16-NEXT: v_mov_b32_e32 v2, 0
+; GFX1200-TRUE16-NEXT: v_mov_b32_e32 v1, 0
; GFX1200-TRUE16-NEXT: s_wait_kmcnt 0x0
; GFX1200-TRUE16-NEXT: s_clause 0x1
-; GFX1200-TRUE16-NEXT: global_load_d16_b16 v0, v2, s[2:3]
-; GFX1200-TRUE16-NEXT: global_load_d16_hi_b16 v0, v2, s[4:5] offset:8
+; GFX1200-TRUE16-NEXT: global_load_u16 v2, v1, s[2:3]
+; GFX1200-TRUE16-NEXT: global_load_u16 v3, v1, s[4:5] offset:8
; GFX1200-TRUE16-NEXT: s_wait_loadcnt 0x0
-; GFX1200-TRUE16-NEXT: v_rcp_f16_e32 v1.l, v0.h
+; GFX1200-TRUE16-NEXT: v_rcp_f16_e32 v0.l, v3.l
; GFX1200-TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1200-TRUE16-NEXT: v_mul_f16_e32 v1.l, v0.l, v1.l
-; GFX1200-TRUE16-NEXT: v_trunc_f16_e32 v1.l, v1.l
+; GFX1200-TRUE16-NEXT: v_mul_f16_e32 v0.l, v2.l, v0.l
+; GFX1200-TRUE16-NEXT: v_trunc_f16_e32 v0.l, v0.l
; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1200-TRUE16-NEXT: v_xor_b32_e32 v1, 0x8000, v1
-; GFX1200-TRUE16-NEXT: v_fmac_f16_e32 v0.l, v1.l, v0.h
-; GFX1200-TRUE16-NEXT: global_store_b16 v2, v0, s[0:1]
+; GFX1200-TRUE16-NEXT: v_xor_b32_e32 v0, 0x8000, v0
+; GFX1200-TRUE16-NEXT: v_fmac_f16_e32 v2.l, v0.l, v3.l
+; GFX1200-TRUE16-NEXT: global_store_b16 v1, v2, s[0:1]
; GFX1200-TRUE16-NEXT: s_endpgm
;
; GFX1200-FAKE16-LABEL: unsafe_frem_f16:
diff --git a/llvm/test/CodeGen/AMDGPU/function-args.ll b/llvm/test/CodeGen/AMDGPU/function-args.ll
index 3c41cc43a089e..5f083fcd2b6d0 100644
--- a/llvm/test/CodeGen/AMDGPU/function-args.ll
+++ b/llvm/test/CodeGen/AMDGPU/function-args.ll
@@ -1648,81 +1648,86 @@ define void @void_func_v32i8(<32 x i8> %arg0) #0 {
; GFX11-TRUE16-LABEL: void_func_v32i8:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: scratch_load_d16_u8 v31, off, s32
+; GFX11-TRUE16-NEXT: scratch_load_u8 v31, off, s32
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, 0
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v15.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v14.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v13.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v1.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v0.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.l, 8, v27.l
+; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v26.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v25.l
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v24.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, v32.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v3.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v2.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v23.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v22.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v21.l
+; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v20.l
+; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v0.h, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v1.h, v1.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.h, v32.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v19.l
+; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v18.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v17.l
+; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v16.l
+; GFX11-TRUE16-NEXT: v_or_b16 v20.l, v2.h, v2.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v24, v32
+; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v3.h, v3.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.h, v32.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v29.l
+; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v28.l
+; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v30.l
+; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v11.h, v10.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v20, v32
+; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v9.h, v7.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.h, v32.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v15.l
+; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v13.l
; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v12.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v32.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v11.l
-; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v10.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v9.l
-; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v8.l
-; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v1.h, v0.h
-; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v3.h, v2.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v32.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.l, 8, v7.l
-; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l
+; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v0.l, v12.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v16, v32
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.h, v32.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v11.l
+; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l
+; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l
+; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v3.h, v3.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.h, v32.l
+; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v6.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v5.l
; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.l, 8, v3.l
-; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v7.h, v6.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v12, v32
-; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v5.h, v4.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, v32.l
-; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v1.l
-; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v4.l, v5.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v9, v32
-; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v6.l, v7.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, v32.l
-; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v30.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v29.l
-; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v28.l
-; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v0.l, v1.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v10, v32
-; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v0.h, v8.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.h, v32.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v27.l
-; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v26.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, v25.l
-; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v24.l
-; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v6.h, v5.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v11, v32
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.h, v32.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.l, 8, v23.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v21.l
-; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v20.l
-; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v7.h, v6.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.h, v32.l
-; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v16.l
-; GFX11-TRUE16-NEXT: s_mov_b64 s[0:1], 16
-; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v6.h, v5.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.h, v32.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v4.h
+; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h
+; GFX11-TRUE16-NEXT: s_mov_b64 s[0:1], 0
+; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v4.l, v5.l
+; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v5.h
; GFX11-TRUE16-NEXT: s_mov_b32 s3, 0x31016000
; GFX11-TRUE16-NEXT: s_mov_b32 s2, -1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.l, 8, v31.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v4.h, v7.l
-; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v22.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v13, v32
-; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v5.l, v4.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v19.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v17.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v14, v32
-; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v4.h, v8.l
-; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v18.l
-; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v8.h, v5.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v15, v32
-; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v4.h, v4.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v31.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v13.h, v7.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v9.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v7.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v17, v32
+; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v14.l, v14.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v8.l, v7.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.l, 8, v6.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v15, v32
+; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v10.l, v9.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v32.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v18, v32
+; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v10.h, v9.l
+; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v8.h, v8.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v19, v32
+; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v4.l, v4.h
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v9, v32
; GFX11-TRUE16-NEXT: buffer_store_b128 v[4:7], off, s[0:3], 0
-; GFX11-TRUE16-NEXT: s_mov_b64 s[0:1], 0
+; GFX11-TRUE16-NEXT: s_mov_b64 s[0:1], 16
; GFX11-TRUE16-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -2669,33 +2674,19 @@ define void @void_func_byval_struct_i8_i32(ptr addrspace(5) byval({ i8, i32 }) %
; CIGFX89-NEXT: s_waitcnt vmcnt(0)
; CIGFX89-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-TRUE16-LABEL: void_func_byval_struct_i8_i32:
-; GFX11-TRUE16: ; %bb.0:
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: s_clause 0x1
-; GFX11-TRUE16-NEXT: scratch_load_d16_u8 v0, off, s32
-; GFX11-TRUE16-NEXT: scratch_load_b32 v1, off, s32 offset:4
-; GFX11-TRUE16-NEXT: s_mov_b32 s3, 0x31016000
-; GFX11-TRUE16-NEXT: s_mov_b32 s2, -1
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: s_clause 0x1
-; GFX11-TRUE16-NEXT: buffer_store_b32 v1, off, s[0:3], 0
-; GFX11-TRUE16-NEXT: buffer_store_b8 v0, off, s[0:3], 0
-; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-FAKE16-LABEL: void_func_byval_struct_i8_i32:
-; GFX11-FAKE16: ; %bb.0:
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-FAKE16-NEXT: s_clause 0x1
-; GFX11-FAKE16-NEXT: scratch_load_b32 v0, off, s32 offset:4
-; GFX11-FAKE16-NEXT: scratch_load_u8 v1, off, s32
-; GFX11-FAKE16-NEXT: s_mov_b32 s3, 0x31016000
-; GFX11-FAKE16-NEXT: s_mov_b32 s2, -1
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1)
-; GFX11-FAKE16-NEXT: buffer_store_b32 v0, off, s[0:3], 0
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT: buffer_store_b8 v1, off, s[0:3], 0
-; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-LABEL: void_func_byval_struct_i8_i32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_load_b32 v0, off, s32 offset:4
+; GFX11-NEXT: scratch_load_u8 v1, off, s32
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_waitcnt vmcnt(1)
+; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: buffer_store_b8 v1, off, s[0:3], 0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
%arg0.load = load { i8, i32 }, ptr addrspace(5) %arg0
store { i8, i32 } %arg0.load, ptr addrspace(1) poison
ret void
@@ -2779,55 +2770,30 @@ define void @void_func_byval_struct_i8_i32_x2(ptr addrspace(5) byval({ i8, i32 }
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-TRUE16-LABEL: void_func_byval_struct_i8_i32_x2:
-; GFX11-TRUE16: ; %bb.0:
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: scratch_load_d16_u8 v1, off, s32 glc dlc
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: scratch_load_b32 v3, off, s32 offset:4 glc dlc
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: scratch_load_d16_u8 v2, off, s32 offset:8 glc dlc
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: scratch_load_b32 v4, off, s32 offset:12 glc dlc
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: s_mov_b32 s3, 0x31016000
-; GFX11-TRUE16-NEXT: s_mov_b32 s2, -1
-; GFX11-TRUE16-NEXT: buffer_store_b32 v3, off, s[0:3], 0 dlc
-; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-TRUE16-NEXT: buffer_store_b8 v1, off, s[0:3], 0 dlc
-; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-TRUE16-NEXT: buffer_store_b32 v4, off, s[0:3], 0 dlc
-; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-TRUE16-NEXT: buffer_store_b8 v2, off, s[0:3], 0 dlc
-; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-TRUE16-NEXT: ds_store_b32 v0, v0
-; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-FAKE16-LABEL: void_func_byval_struct_i8_i32_x2:
-; GFX11-FAKE16: ; %bb.0:
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-FAKE16-NEXT: scratch_load_u8 v1, off, s32 glc dlc
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT: scratch_load_b32 v2, off, s32 offset:4 glc dlc
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT: scratch_load_u8 v3, off, s32 offset:8 glc dlc
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT: scratch_load_b32 v4, off, s32 offset:12 glc dlc
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT: s_mov_b32 s3, 0x31016000
-; GFX11-FAKE16-NEXT: s_mov_b32 s2, -1
-; GFX11-FAKE16-NEXT: buffer_store_b32 v2, off, s[0:3], 0 dlc
-; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-FAKE16-NEXT: buffer_store_b8 v1, off, s[0:3], 0 dlc
-; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-FAKE16-NEXT: buffer_store_b32 v4, off, s[0:3], 0 dlc
-; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-FAKE16-NEXT: buffer_store_b8 v3, off, s[0:3], 0 dlc
-; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-FAKE16-NEXT: ds_store_b32 v0, v0
-; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-LABEL: void_func_byval_struct_i8_i32_x2:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: scratch_load_u8 v1, off, s32 glc dlc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: scratch_load_b32 v2, off, s32 offset:4 glc dlc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: scratch_load_u8 v3, off, s32 offset:8 glc dlc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: scratch_load_b32 v4, off, s32 offset:12 glc dlc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: buffer_store_b32 v2, off, s[0:3], 0 dlc
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: buffer_store_b8 v1, off, s[0:3], 0 dlc
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: buffer_store_b32 v4, off, s[0:3], 0 dlc
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: buffer_store_b8 v3, off, s[0:3], 0 dlc
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: ds_store_b32 v0, v0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_setpc_b64 s[30:31]
%arg0.load = load volatile { i8, i32 }, ptr addrspace(5) %arg0
%arg1.load = load volatile { i8, i32 }, ptr addrspace(5) %arg1
store volatile { i8, i32 } %arg0.load, ptr addrspace(1) poison
@@ -3032,99 +2998,52 @@ define void @void_func_v32i32_i1_i8_i16_bf16(<32 x i32> %arg0, i1 %arg1, i8 %arg
; GFX89-NEXT: s_waitcnt vmcnt(0)
; GFX89-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-TRUE16-LABEL: void_func_v32i32_i1_i8_i16_bf16:
-; GFX11-TRUE16: ; %bb.0:
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: s_clause 0x5
-; GFX11-TRUE16-NEXT: scratch_load_b32 v31, off, s32
-; GFX11-TRUE16-NEXT: scratch_load_u8 v36, off, s32 offset:4
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:8
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:12
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:16
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:20
-; GFX11-TRUE16-NEXT: s_mov_b32 s3, 0x31016000
-; GFX11-TRUE16-NEXT: s_mov_b32 s2, -1
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(5)
-; GFX11-TRUE16-NEXT: buffer_store_b128 v[28:31], off, s[0:3], 0 dlc
-; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-TRUE16-NEXT: buffer_store_b128 v[24:27], off, s[0:3], 0 dlc
-; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-TRUE16-NEXT: buffer_store_b128 v[20:23], off, s[0:3], 0 dlc
-; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-TRUE16-NEXT: buffer_store_b128 v[16:19], off, s[0:3], 0 dlc
-; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-TRUE16-NEXT: buffer_store_b128 v[12:15], off, s[0:3], 0 dlc
-; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-TRUE16-NEXT: buffer_store_b128 v[8:11], off, s[0:3], 0 dlc
-; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-TRUE16-NEXT: buffer_store_b128 v[4:7], off, s[0:3], 0 dlc
-; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-TRUE16-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0 dlc
-; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(4)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 1, v36
-; GFX11-TRUE16-NEXT: buffer_store_b8 v0, off, s[0:3], 0 dlc
-; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(3)
-; GFX11-TRUE16-NEXT: buffer_store_b8 v32, off, s[0:3], 0 dlc
-; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(2)
-; GFX11-TRUE16-NEXT: buffer_store_b16 v33, off, s[0:3], 0 dlc
-; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
-; GFX11-TRUE16-NEXT: buffer_store_b16 v34, off, s[0:3], 0 dlc
-; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: buffer_store_b16 v35, off, s[0:3], 0 dlc
-; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-FAKE16-LABEL: void_func_v32i32_i1_i8_i16_bf16:
-; GFX11-FAKE16: ; %bb.0:
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-FAKE16-NEXT: s_clause 0x5
-; GFX11-FAKE16-NEXT: scratch_load_b32 v31, off, s32
-; GFX11-FAKE16-NEXT: scratch_load_u8 v32, off, s32 offset:4
-; GFX11-FAKE16-NEXT: scratch_load_u16 v33, off, s32 offset:8
-; GFX11-FAKE16-NEXT: scratch_load_u16 v34, off, s32 offset:12
-; GFX11-FAKE16-NEXT: scratch_load_u16 v35, off, s32 offset:16
-; GFX11-FAKE16-NEXT: scratch_load_u16 v36, off, s32 offset:20
-; GFX11-FAKE16-NEXT: s_mov_b32 s3, 0x31016000
-; GFX11-FAKE16-NEXT: s_mov_b32 s2, -1
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(5)
-; GFX11-FAKE16-NEXT: buffer_store_b128 v[28:31], off, s[0:3], 0 dlc
-; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-FAKE16-NEXT: buffer_store_b128 v[24:27], off, s[0:3], 0 dlc
-; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-FAKE16-NEXT: buffer_store_b128 v[20:23], off, s[0:3], 0 dlc
-; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-FAKE16-NEXT: buffer_store_b128 v[16:19], off, s[0:3], 0 dlc
-; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(4)
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 1, v32
-; GFX11-FAKE16-NEXT: buffer_store_b128 v[12:15], off, s[0:3], 0 dlc
-; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-FAKE16-NEXT: buffer_store_b128 v[8:11], off, s[0:3], 0 dlc
-; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-FAKE16-NEXT: buffer_store_b128 v[4:7], off, s[0:3], 0 dlc
-; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-FAKE16-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0 dlc
-; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-FAKE16-NEXT: buffer_store_b8 v16, off, s[0:3], 0 dlc
-; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(3)
-; GFX11-FAKE16-NEXT: buffer_store_b8 v33, off, s[0:3], 0 dlc
-; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(2)
-; GFX11-FAKE16-NEXT: buffer_store_b16 v34, off, s[0:3], 0 dlc
-; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1)
-; GFX11-FAKE16-NEXT: buffer_store_b16 v35, off, s[0:3], 0 dlc
-; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT: buffer_store_b16 v36, off, s[0:3], 0 dlc
-; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-LABEL: void_func_v32i32_i1_i8_i16_bf16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_clause 0x5
+; GFX11-NEXT: scratch_load_b32 v31, off, s32
+; GFX11-NEXT: scratch_load_u8 v32, off, s32 offset:4
+; GFX11-NEXT: scratch_load_u16 v33, off, s32 offset:8
+; GFX11-NEXT: scratch_load_u16 v34, off, s32 offset:12
+; GFX11-NEXT: scratch_load_u16 v35, off, s32 offset:16
+; GFX11-NEXT: scratch_load_u16 v36, off, s32 offset:20
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_waitcnt vmcnt(5)
+; GFX11-NEXT: buffer_store_b128 v[28:31], off, s[0:3], 0 dlc
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: buffer_store_b128 v[24:27], off, s[0:3], 0 dlc
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: buffer_store_b128 v[20:23], off, s[0:3], 0 dlc
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: buffer_store_b128 v[16:19], off, s[0:3], 0 dlc
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: s_waitcnt vmcnt(4)
+; GFX11-NEXT: v_and_b32_e32 v16, 1, v32
+; GFX11-NEXT: buffer_store_b128 v[12:15], off, s[0:3], 0 dlc
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: buffer_store_b128 v[8:11], off, s[0:3], 0 dlc
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: buffer_store_b128 v[4:7], off, s[0:3], 0 dlc
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0 dlc
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: buffer_store_b8 v16, off, s[0:3], 0 dlc
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: s_waitcnt vmcnt(3)
+; GFX11-NEXT: buffer_store_b8 v33, off, s[0:3], 0 dlc
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: s_waitcnt vmcnt(2)
+; GFX11-NEXT: buffer_store_b16 v34, off, s[0:3], 0 dlc
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: s_waitcnt vmcnt(1)
+; GFX11-NEXT: buffer_store_b16 v35, off, s[0:3], 0 dlc
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: buffer_store_b16 v36, off, s[0:3], 0 dlc
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
store volatile <32 x i32> %arg0, ptr addrspace(1) poison
store volatile i1 %arg1, ptr addrspace(1) poison
store volatile i8 %arg2, ptr addrspace(1) poison
@@ -4536,185 +4455,95 @@ define void @void_func_v32i32_v16i8(<32 x i32> %arg0, <16 x i8> %arg1) #0 {
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-TRUE16-LABEL: void_func_v32i32_v16i8:
-; GFX11-TRUE16: ; %bb.0:
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: s_clause 0x10
-; GFX11-TRUE16-NEXT: scratch_load_b32 v31, off, s32
-; GFX11-TRUE16-NEXT: scratch_load_d16_u8 v32, off, s32 offset:64
-; GFX11-TRUE16-NEXT: scratch_load_d16_u8 v33, off, s32 offset:60
-; GFX11-TRUE16-NEXT: scratch_load_d16_u8 v34, off, s32 offset:56
-; GFX11-TRUE16-NEXT: scratch_load_d16_u8 v35, off, s32 offset:52
-; GFX11-TRUE16-NEXT: scratch_load_d16_u8 v36, off, s32 offset:48
-; GFX11-TRUE16-NEXT: scratch_load_d16_u8 v37, off, s32 offset:44
-; GFX11-TRUE16-NEXT: scratch_load_d16_u8 v38, off, s32 offset:40
-; GFX11-TRUE16-NEXT: scratch_load_d16_u8 v39, off, s32 offset:36
-; GFX11-TRUE16-NEXT: scratch_load_d16_u8 v48, off, s32 offset:32
-; GFX11-TRUE16-NEXT: scratch_load_d16_u8 v49, off, s32 offset:28
-; GFX11-TRUE16-NEXT: scratch_load_d16_u8 v50, off, s32 offset:24
-; GFX11-TRUE16-NEXT: scratch_load_d16_u8 v51, off, s32 offset:20
-; GFX11-TRUE16-NEXT: scratch_load_d16_u8 v52, off, s32 offset:16
-; GFX11-TRUE16-NEXT: scratch_load_d16_u8 v53, off, s32 offset:12
-; GFX11-TRUE16-NEXT: scratch_load_d16_u8 v54, off, s32 offset:8
-; GFX11-TRUE16-NEXT: scratch_load_d16_u8 v55, off, s32 offset:4
-; GFX11-TRUE16-NEXT: s_mov_b32 s3, 0x31016000
-; GFX11-TRUE16-NEXT: s_mov_b32 s2, -1
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(16)
-; GFX11-TRUE16-NEXT: buffer_store_b128 v[28:31], off, s[0:3], 0 dlc
-; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-TRUE16-NEXT: buffer_store_b128 v[24:27], off, s[0:3], 0 dlc
-; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-TRUE16-NEXT: buffer_store_b128 v[20:23], off, s[0:3], 0 dlc
-; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-TRUE16-NEXT: buffer_store_b128 v[16:19], off, s[0:3], 0 dlc
-; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-TRUE16-NEXT: buffer_store_b128 v[12:15], off, s[0:3], 0 dlc
-; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-TRUE16-NEXT: buffer_store_b128 v[8:11], off, s[0:3], 0 dlc
-; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-TRUE16-NEXT: buffer_store_b128 v[4:7], off, s[0:3], 0 dlc
-; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-TRUE16-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0 dlc
-; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(15)
-; GFX11-TRUE16-NEXT: buffer_store_b8 v32, off, s[0:3], 0 dlc
-; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(14)
-; GFX11-TRUE16-NEXT: buffer_store_b8 v33, off, s[0:3], 0 dlc
-; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(13)
-; GFX11-TRUE16-NEXT: buffer_store_b8 v34, off, s[0:3], 0 dlc
-; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(12)
-; GFX11-TRUE16-NEXT: buffer_store_b8 v35, off, s[0:3], 0 dlc
-; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(11)
-; GFX11-TRUE16-NEXT: buffer_store_b8 v36, off, s[0:3], 0 dlc
-; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(10)
-; GFX11-TRUE16-NEXT: buffer_store_b8 v37, off, s[0:3], 0 dlc
-; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(9)
-; GFX11-TRUE16-NEXT: buffer_store_b8 v38, off, s[0:3], 0 dlc
-; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(8)
-; GFX11-TRUE16-NEXT: buffer_store_b8 v39, off, s[0:3], 0 dlc
-; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7)
-; GFX11-TRUE16-NEXT: buffer_store_b8 v48, off, s[0:3], 0 dlc
-; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(6)
-; GFX11-TRUE16-NEXT: buffer_store_b8 v49, off, s[0:3], 0 dlc
-; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(5)
-; GFX11-TRUE16-NEXT: buffer_store_b8 v50, off, s[0:3], 0 dlc
-; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(4)
-; GFX11-TRUE16-NEXT: buffer_store_b8 v51, off, s[0:3], 0 dlc
-; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(3)
-; GFX11-TRUE16-NEXT: buffer_store_b8 v52, off, s[0:3], 0 dlc
-; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(2)
-; GFX11-TRUE16-NEXT: buffer_store_b8 v53, off, s[0:3], 0 dlc
-; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
-; GFX11-TRUE16-NEXT: buffer_store_b8 v54, off, s[0:3], 0 dlc
-; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: buffer_store_b8 v55, off, s[0:3], 0 dlc
-; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-FAKE16-LABEL: void_func_v32i32_v16i8:
-; GFX11-FAKE16: ; %bb.0:
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-FAKE16-NEXT: s_clause 0x10
-; GFX11-FAKE16-NEXT: scratch_load_b32 v31, off, s32
-; GFX11-FAKE16-NEXT: scratch_load_u8 v32, off, s32 offset:64
-; GFX11-FAKE16-NEXT: scratch_load_u8 v33, off, s32 offset:60
-; GFX11-FAKE16-NEXT: scratch_load_u8 v34, off, s32 offset:56
-; GFX11-FAKE16-NEXT: scratch_load_u8 v35, off, s32 offset:52
-; GFX11-FAKE16-NEXT: scratch_load_u8 v36, off, s32 offset:48
-; GFX11-FAKE16-NEXT: scratch_load_u8 v37, off, s32 offset:44
-; GFX11-FAKE16-NEXT: scratch_load_u8 v38, off, s32 offset:40
-; GFX11-FAKE16-NEXT: scratch_load_u8 v39, off, s32 offset:36
-; GFX11-FAKE16-NEXT: scratch_load_u8 v48, off, s32 offset:32
-; GFX11-FAKE16-NEXT: scratch_load_u8 v49, off, s32 offset:28
-; GFX11-FAKE16-NEXT: scratch_load_u8 v50, off, s32 offset:24
-; GFX11-FAKE16-NEXT: scratch_load_u8 v51, off, s32 offset:20
-; GFX11-FAKE16-NEXT: scratch_load_u8 v52, off, s32 offset:16
-; GFX11-FAKE16-NEXT: scratch_load_u8 v53, off, s32 offset:12
-; GFX11-FAKE16-NEXT: scratch_load_u8 v54, off, s32 offset:8
-; GFX11-FAKE16-NEXT: scratch_load_u8 v55, off, s32 offset:4
-; GFX11-FAKE16-NEXT: s_mov_b32 s3, 0x31016000
-; GFX11-FAKE16-NEXT: s_mov_b32 s2, -1
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(16)
-; GFX11-FAKE16-NEXT: buffer_store_b128 v[28:31], off, s[0:3], 0 dlc
-; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-FAKE16-NEXT: buffer_store_b128 v[24:27], off, s[0:3], 0 dlc
-; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-FAKE16-NEXT: buffer_store_b128 v[20:23], off, s[0:3], 0 dlc
-; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-FAKE16-NEXT: buffer_store_b128 v[16:19], off, s[0:3], 0 dlc
-; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-FAKE16-NEXT: buffer_store_b128 v[12:15], off, s[0:3], 0 dlc
-; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-FAKE16-NEXT: buffer_store_b128 v[8:11], off, s[0:3], 0 dlc
-; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-FAKE16-NEXT: buffer_store_b128 v[4:7], off, s[0:3], 0 dlc
-; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-FAKE16-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0 dlc
-; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(15)
-; GFX11-FAKE16-NEXT: buffer_store_b8 v32, off, s[0:3], 0 dlc
-; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(14)
-; GFX11-FAKE16-NEXT: buffer_store_b8 v33, off, s[0:3], 0 dlc
-; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(13)
-; GFX11-FAKE16-NEXT: buffer_store_b8 v34, off, s[0:3], 0 dlc
-; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(12)
-; GFX11-FAKE16-NEXT: buffer_store_b8 v35, off, s[0:3], 0 dlc
-; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(11)
-; GFX11-FAKE16-NEXT: buffer_store_b8 v36, off, s[0:3], 0 dlc
-; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(10)
-; GFX11-FAKE16-NEXT: buffer_store_b8 v37, off, s[0:3], 0 dlc
-; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(9)
-; GFX11-FAKE16-NEXT: buffer_store_b8 v38, off, s[0:3], 0 dlc
-; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(8)
-; GFX11-FAKE16-NEXT: buffer_store_b8 v39, off, s[0:3], 0 dlc
-; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(7)
-; GFX11-FAKE16-NEXT: buffer_store_b8 v48, off, s[0:3], 0 dlc
-; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(6)
-; GFX11-FAKE16-NEXT: buffer_store_b8 v49, off, s[0:3], 0 dlc
-; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(5)
-; GFX11-FAKE16-NEXT: buffer_store_b8 v50, off, s[0:3], 0 dlc
-; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(4)
-; GFX11-FAKE16-NEXT: buffer_store_b8 v51, off, s[0:3], 0 dlc
-; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(3)
-; GFX11-FAKE16-NEXT: buffer_store_b8 v52, off, s[0:3], 0 dlc
-; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(2)
-; GFX11-FAKE16-NEXT: buffer_store_b8 v53, off, s[0:3], 0 dlc
-; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1)
-; GFX11-FAKE16-NEXT: buffer_store_b8 v54, off, s[0:3], 0 dlc
-; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT: buffer_store_b8 v55, off, s[0:3], 0 dlc
-; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-LABEL: void_func_v32i32_v16i8:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_clause 0x10
+; GFX11-NEXT: scratch_load_b32 v31, off, s32
+; GFX11-NEXT: scratch_load_u8 v32, off, s32 offset:64
+; GFX11-NEXT: scratch_load_u8 v33, off, s32 offset:60
+; GFX11-NEXT: scratch_load_u8 v34, off, s32 offset:56
+; GFX11-NEXT: scratch_load_u8 v35, off, s32 offset:52
+; GFX11-NEXT: scratch_load_u8 v36, off, s32 offset:48
+; GFX11-NEXT: scratch_load_u8 v37, off, s32 offset:44
+; GFX11-NEXT: scratch_load_u8 v38, off, s32 offset:40
+; GFX11-NEXT: scratch_load_u8 v39, off, s32 offset:36
+; GFX11-NEXT: scratch_load_u8 v48, off, s32 offset:32
+; GFX11-NEXT: scratch_load_u8 v49, off, s32 offset:28
+; GFX11-NEXT: scratch_load_u8 v50, off, s32 offset:24
+; GFX11-NEXT: scratch_load_u8 v51, off, s32 offset:20
+; GFX11-NEXT: scratch_load_u8 v52, off, s32 offset:16
+; GFX11-NEXT: scratch_load_u8 v53, off, s32 offset:12
+; GFX11-NEXT: scratch_load_u8 v54, off, s32 offset:8
+; GFX11-NEXT: scratch_load_u8 v55, off, s32 offset:4
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_waitcnt vmcnt(16)
+; GFX11-NEXT: buffer_store_b128 v[28:31], off, s[0:3], 0 dlc
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: buffer_store_b128 v[24:27], off, s[0:3], 0 dlc
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: buffer_store_b128 v[20:23], off, s[0:3], 0 dlc
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: buffer_store_b128 v[16:19], off, s[0:3], 0 dlc
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: buffer_store_b128 v[12:15], off, s[0:3], 0 dlc
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: buffer_store_b128 v[8:11], off, s[0:3], 0 dlc
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: buffer_store_b128 v[4:7], off, s[0:3], 0 dlc
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0 dlc
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: s_waitcnt vmcnt(15)
+; GFX11-NEXT: buffer_store_b8 v32, off, s[0:3], 0 dlc
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: s_waitcnt vmcnt(14)
+; GFX11-NEXT: buffer_store_b8 v33, off, s[0:3], 0 dlc
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: s_waitcnt vmcnt(13)
+; GFX11-NEXT: buffer_store_b8 v34, off, s[0:3], 0 dlc
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: s_waitcnt vmcnt(12)
+; GFX11-NEXT: buffer_store_b8 v35, off, s[0:3], 0 dlc
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: s_waitcnt vmcnt(11)
+; GFX11-NEXT: buffer_store_b8 v36, off, s[0:3], 0 dlc
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: s_waitcnt vmcnt(10)
+; GFX11-NEXT: buffer_store_b8 v37, off, s[0:3], 0 dlc
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: s_waitcnt vmcnt(9)
+; GFX11-NEXT: buffer_store_b8 v38, off, s[0:3], 0 dlc
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: s_waitcnt vmcnt(8)
+; GFX11-NEXT: buffer_store_b8 v39, off, s[0:3], 0 dlc
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: s_waitcnt vmcnt(7)
+; GFX11-NEXT: buffer_store_b8 v48, off, s[0:3], 0 dlc
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: s_waitcnt vmcnt(6)
+; GFX11-NEXT: buffer_store_b8 v49, off, s[0:3], 0 dlc
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: s_waitcnt vmcnt(5)
+; GFX11-NEXT: buffer_store_b8 v50, off, s[0:3], 0 dlc
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: s_waitcnt vmcnt(4)
+; GFX11-NEXT: buffer_store_b8 v51, off, s[0:3], 0 dlc
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: s_waitcnt vmcnt(3)
+; GFX11-NEXT: buffer_store_b8 v52, off, s[0:3], 0 dlc
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: s_waitcnt vmcnt(2)
+; GFX11-NEXT: buffer_store_b8 v53, off, s[0:3], 0 dlc
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: s_waitcnt vmcnt(1)
+; GFX11-NEXT: buffer_store_b8 v54, off, s[0:3], 0 dlc
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: buffer_store_b8 v55, off, s[0:3], 0 dlc
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
store volatile <32 x i32> %arg0, ptr addrspace(1) poison
store volatile <16 x i8> %arg1, ptr addrspace(1) poison
ret void
diff --git a/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll b/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll
index f67ab18dd8ef1..5883e807964e4 100644
--- a/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll
+++ b/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll
@@ -706,63 +706,34 @@ define amdgpu_gfx void @test_call_external_void_func_i8_signext(i32) #0 {
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-TRUE16-LABEL: test_call_external_void_func_i8_signext:
-; GFX11-TRUE16: ; %bb.0:
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: s_mov_b32 s0, s33
-; GFX11-TRUE16-NEXT: s_mov_b32 s33, s32
-; GFX11-TRUE16-NEXT: s_or_saveexec_b32 s1, -1
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
-; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-TRUE16-NEXT: global_load_d16_i8 v0, v[0:1], off glc dlc
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s0, 2
-; GFX11-TRUE16-NEXT: s_mov_b32 s1, external_void_func_i8_signext@abs32@hi
-; GFX11-TRUE16-NEXT: s_mov_b32 s0, external_void_func_i8_signext@abs32@lo
-; GFX11-TRUE16-NEXT: s_add_i32 s32, s32, 16
-; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s30, 0
-; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s31, 1
-; GFX11-TRUE16-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_readlane_b32 s31, v40, 1
-; GFX11-TRUE16-NEXT: v_readlane_b32 s30, v40, 0
-; GFX11-TRUE16-NEXT: s_mov_b32 s32, s33
-; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v40, 2
-; GFX11-TRUE16-NEXT: s_or_saveexec_b32 s1, -1
-; GFX11-TRUE16-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
-; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-TRUE16-NEXT: s_mov_b32 s33, s0
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-FAKE16-LABEL: test_call_external_void_func_i8_signext:
-; GFX11-FAKE16: ; %bb.0:
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-FAKE16-NEXT: s_mov_b32 s0, s33
-; GFX11-FAKE16-NEXT: s_mov_b32 s33, s32
-; GFX11-FAKE16-NEXT: s_or_saveexec_b32 s1, -1
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
-; GFX11-FAKE16-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-FAKE16-NEXT: global_load_i8 v0, v[0:1], off glc dlc
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s0, 2
-; GFX11-FAKE16-NEXT: s_mov_b32 s1, external_void_func_i8_signext@abs32@hi
-; GFX11-FAKE16-NEXT: s_mov_b32 s0, external_void_func_i8_signext@abs32@lo
-; GFX11-FAKE16-NEXT: s_add_i32 s32, s32, 16
-; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s30, 0
-; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s31, 1
-; GFX11-FAKE16-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_readlane_b32 s31, v40, 1
-; GFX11-FAKE16-NEXT: v_readlane_b32 s30, v40, 0
-; GFX11-FAKE16-NEXT: s_mov_b32 s32, s33
-; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v40, 2
-; GFX11-FAKE16-NEXT: s_or_saveexec_b32 s1, -1
-; GFX11-FAKE16-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
-; GFX11-FAKE16-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-FAKE16-NEXT: s_mov_b32 s33, s0
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-LABEL: test_call_external_void_func_i8_signext:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_mov_b32 s0, s33
+; GFX11-NEXT: s_mov_b32 s33, s32
+; GFX11-NEXT: s_or_saveexec_b32 s1, -1
+; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
+; GFX11-NEXT: s_mov_b32 exec_lo, s1
+; GFX11-NEXT: global_load_i8 v0, v[0:1], off glc dlc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_writelane_b32 v40, s0, 2
+; GFX11-NEXT: s_mov_b32 s1, external_void_func_i8_signext@abs32@hi
+; GFX11-NEXT: s_mov_b32 s0, external_void_func_i8_signext@abs32@lo
+; GFX11-NEXT: s_add_i32 s32, s32, 16
+; GFX11-NEXT: v_writelane_b32 v40, s30, 0
+; GFX11-NEXT: v_writelane_b32 v40, s31, 1
+; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_readlane_b32 s31, v40, 1
+; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: s_mov_b32 s32, s33
+; GFX11-NEXT: v_readlane_b32 s0, v40, 2
+; GFX11-NEXT: s_or_saveexec_b32 s1, -1
+; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
+; GFX11-NEXT: s_mov_b32 exec_lo, s1
+; GFX11-NEXT: s_mov_b32 s33, s0
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-SCRATCH-LABEL: test_call_external_void_func_i8_signext:
; GFX10-SCRATCH: ; %bb.0:
@@ -857,63 +828,34 @@ define amdgpu_gfx void @test_call_external_void_func_i8_zeroext(i32) #0 {
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-TRUE16-LABEL: test_call_external_void_func_i8_zeroext:
-; GFX11-TRUE16: ; %bb.0:
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: s_mov_b32 s0, s33
-; GFX11-TRUE16-NEXT: s_mov_b32 s33, s32
-; GFX11-TRUE16-NEXT: s_or_saveexec_b32 s1, -1
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
-; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-TRUE16-NEXT: global_load_d16_u8 v0, v[0:1], off glc dlc
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s0, 2
-; GFX11-TRUE16-NEXT: s_mov_b32 s1, external_void_func_i8_zeroext@abs32@hi
-; GFX11-TRUE16-NEXT: s_mov_b32 s0, external_void_func_i8_zeroext@abs32@lo
-; GFX11-TRUE16-NEXT: s_add_i32 s32, s32, 16
-; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s30, 0
-; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s31, 1
-; GFX11-TRUE16-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_readlane_b32 s31, v40, 1
-; GFX11-TRUE16-NEXT: v_readlane_b32 s30, v40, 0
-; GFX11-TRUE16-NEXT: s_mov_b32 s32, s33
-; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v40, 2
-; GFX11-TRUE16-NEXT: s_or_saveexec_b32 s1, -1
-; GFX11-TRUE16-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
-; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-TRUE16-NEXT: s_mov_b32 s33, s0
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-FAKE16-LABEL: test_call_external_void_func_i8_zeroext:
-; GFX11-FAKE16: ; %bb.0:
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-FAKE16-NEXT: s_mov_b32 s0, s33
-; GFX11-FAKE16-NEXT: s_mov_b32 s33, s32
-; GFX11-FAKE16-NEXT: s_or_saveexec_b32 s1, -1
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
-; GFX11-FAKE16-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-FAKE16-NEXT: global_load_u8 v0, v[0:1], off glc dlc
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s0, 2
-; GFX11-FAKE16-NEXT: s_mov_b32 s1, external_void_func_i8_zeroext@abs32@hi
-; GFX11-FAKE16-NEXT: s_mov_b32 s0, external_void_func_i8_zeroext@abs32@lo
-; GFX11-FAKE16-NEXT: s_add_i32 s32, s32, 16
-; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s30, 0
-; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s31, 1
-; GFX11-FAKE16-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_readlane_b32 s31, v40, 1
-; GFX11-FAKE16-NEXT: v_readlane_b32 s30, v40, 0
-; GFX11-FAKE16-NEXT: s_mov_b32 s32, s33
-; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v40, 2
-; GFX11-FAKE16-NEXT: s_or_saveexec_b32 s1, -1
-; GFX11-FAKE16-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
-; GFX11-FAKE16-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-FAKE16-NEXT: s_mov_b32 s33, s0
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-LABEL: test_call_external_void_func_i8_zeroext:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_mov_b32 s0, s33
+; GFX11-NEXT: s_mov_b32 s33, s32
+; GFX11-NEXT: s_or_saveexec_b32 s1, -1
+; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
+; GFX11-NEXT: s_mov_b32 exec_lo, s1
+; GFX11-NEXT: global_load_u8 v0, v[0:1], off glc dlc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_writelane_b32 v40, s0, 2
+; GFX11-NEXT: s_mov_b32 s1, external_void_func_i8_zeroext@abs32@hi
+; GFX11-NEXT: s_mov_b32 s0, external_void_func_i8_zeroext@abs32@lo
+; GFX11-NEXT: s_add_i32 s32, s32, 16
+; GFX11-NEXT: v_writelane_b32 v40, s30, 0
+; GFX11-NEXT: v_writelane_b32 v40, s31, 1
+; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_readlane_b32 s31, v40, 1
+; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: s_mov_b32 s32, s33
+; GFX11-NEXT: v_readlane_b32 s0, v40, 2
+; GFX11-NEXT: s_or_saveexec_b32 s1, -1
+; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
+; GFX11-NEXT: s_mov_b32 exec_lo, s1
+; GFX11-NEXT: s_mov_b32 s33, s0
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-SCRATCH-LABEL: test_call_external_void_func_i8_zeroext:
; GFX10-SCRATCH: ; %bb.0:
@@ -1153,63 +1095,34 @@ define amdgpu_gfx void @test_call_external_void_func_i16_signext(i32) #0 {
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-TRUE16-LABEL: test_call_external_void_func_i16_signext:
-; GFX11-TRUE16: ; %bb.0:
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: s_mov_b32 s0, s33
-; GFX11-TRUE16-NEXT: s_mov_b32 s33, s32
-; GFX11-TRUE16-NEXT: s_or_saveexec_b32 s1, -1
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
-; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v[0:1], off glc dlc
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s0, 2
-; GFX11-TRUE16-NEXT: s_mov_b32 s1, external_void_func_i16_signext@abs32@hi
-; GFX11-TRUE16-NEXT: s_mov_b32 s0, external_void_func_i16_signext@abs32@lo
-; GFX11-TRUE16-NEXT: s_add_i32 s32, s32, 16
-; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s30, 0
-; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s31, 1
-; GFX11-TRUE16-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_readlane_b32 s31, v40, 1
-; GFX11-TRUE16-NEXT: v_readlane_b32 s30, v40, 0
-; GFX11-TRUE16-NEXT: s_mov_b32 s32, s33
-; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v40, 2
-; GFX11-TRUE16-NEXT: s_or_saveexec_b32 s1, -1
-; GFX11-TRUE16-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
-; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-TRUE16-NEXT: s_mov_b32 s33, s0
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-FAKE16-LABEL: test_call_external_void_func_i16_signext:
-; GFX11-FAKE16: ; %bb.0:
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-FAKE16-NEXT: s_mov_b32 s0, s33
-; GFX11-FAKE16-NEXT: s_mov_b32 s33, s32
-; GFX11-FAKE16-NEXT: s_or_saveexec_b32 s1, -1
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
-; GFX11-FAKE16-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-FAKE16-NEXT: global_load_u16 v0, v[0:1], off glc dlc
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s0, 2
-; GFX11-FAKE16-NEXT: s_mov_b32 s1, external_void_func_i16_signext@abs32@hi
-; GFX11-FAKE16-NEXT: s_mov_b32 s0, external_void_func_i16_signext@abs32@lo
-; GFX11-FAKE16-NEXT: s_add_i32 s32, s32, 16
-; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s30, 0
-; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s31, 1
-; GFX11-FAKE16-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_readlane_b32 s31, v40, 1
-; GFX11-FAKE16-NEXT: v_readlane_b32 s30, v40, 0
-; GFX11-FAKE16-NEXT: s_mov_b32 s32, s33
-; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v40, 2
-; GFX11-FAKE16-NEXT: s_or_saveexec_b32 s1, -1
-; GFX11-FAKE16-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
-; GFX11-FAKE16-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-FAKE16-NEXT: s_mov_b32 s33, s0
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-LABEL: test_call_external_void_func_i16_signext:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_mov_b32 s0, s33
+; GFX11-NEXT: s_mov_b32 s33, s32
+; GFX11-NEXT: s_or_saveexec_b32 s1, -1
+; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
+; GFX11-NEXT: s_mov_b32 exec_lo, s1
+; GFX11-NEXT: global_load_u16 v0, v[0:1], off glc dlc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_writelane_b32 v40, s0, 2
+; GFX11-NEXT: s_mov_b32 s1, external_void_func_i16_signext@abs32@hi
+; GFX11-NEXT: s_mov_b32 s0, external_void_func_i16_signext@abs32@lo
+; GFX11-NEXT: s_add_i32 s32, s32, 16
+; GFX11-NEXT: v_writelane_b32 v40, s30, 0
+; GFX11-NEXT: v_writelane_b32 v40, s31, 1
+; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_readlane_b32 s31, v40, 1
+; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: s_mov_b32 s32, s33
+; GFX11-NEXT: v_readlane_b32 s0, v40, 2
+; GFX11-NEXT: s_or_saveexec_b32 s1, -1
+; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
+; GFX11-NEXT: s_mov_b32 exec_lo, s1
+; GFX11-NEXT: s_mov_b32 s33, s0
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-SCRATCH-LABEL: test_call_external_void_func_i16_signext:
; GFX10-SCRATCH: ; %bb.0:
@@ -1304,63 +1217,34 @@ define amdgpu_gfx void @test_call_external_void_func_i16_zeroext(i32) #0 {
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-TRUE16-LABEL: test_call_external_void_func_i16_zeroext:
-; GFX11-TRUE16: ; %bb.0:
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: s_mov_b32 s0, s33
-; GFX11-TRUE16-NEXT: s_mov_b32 s33, s32
-; GFX11-TRUE16-NEXT: s_or_saveexec_b32 s1, -1
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
-; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v[0:1], off glc dlc
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s0, 2
-; GFX11-TRUE16-NEXT: s_mov_b32 s1, external_void_func_i16_zeroext@abs32@hi
-; GFX11-TRUE16-NEXT: s_mov_b32 s0, external_void_func_i16_zeroext@abs32@lo
-; GFX11-TRUE16-NEXT: s_add_i32 s32, s32, 16
-; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s30, 0
-; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s31, 1
-; GFX11-TRUE16-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_readlane_b32 s31, v40, 1
-; GFX11-TRUE16-NEXT: v_readlane_b32 s30, v40, 0
-; GFX11-TRUE16-NEXT: s_mov_b32 s32, s33
-; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v40, 2
-; GFX11-TRUE16-NEXT: s_or_saveexec_b32 s1, -1
-; GFX11-TRUE16-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
-; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-TRUE16-NEXT: s_mov_b32 s33, s0
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-FAKE16-LABEL: test_call_external_void_func_i16_zeroext:
-; GFX11-FAKE16: ; %bb.0:
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-FAKE16-NEXT: s_mov_b32 s0, s33
-; GFX11-FAKE16-NEXT: s_mov_b32 s33, s32
-; GFX11-FAKE16-NEXT: s_or_saveexec_b32 s1, -1
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
-; GFX11-FAKE16-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-FAKE16-NEXT: global_load_u16 v0, v[0:1], off glc dlc
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s0, 2
-; GFX11-FAKE16-NEXT: s_mov_b32 s1, external_void_func_i16_zeroext@abs32@hi
-; GFX11-FAKE16-NEXT: s_mov_b32 s0, external_void_func_i16_zeroext@abs32@lo
-; GFX11-FAKE16-NEXT: s_add_i32 s32, s32, 16
-; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s30, 0
-; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s31, 1
-; GFX11-FAKE16-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_readlane_b32 s31, v40, 1
-; GFX11-FAKE16-NEXT: v_readlane_b32 s30, v40, 0
-; GFX11-FAKE16-NEXT: s_mov_b32 s32, s33
-; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v40, 2
-; GFX11-FAKE16-NEXT: s_or_saveexec_b32 s1, -1
-; GFX11-FAKE16-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
-; GFX11-FAKE16-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-FAKE16-NEXT: s_mov_b32 s33, s0
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-LABEL: test_call_external_void_func_i16_zeroext:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_mov_b32 s0, s33
+; GFX11-NEXT: s_mov_b32 s33, s32
+; GFX11-NEXT: s_or_saveexec_b32 s1, -1
+; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
+; GFX11-NEXT: s_mov_b32 exec_lo, s1
+; GFX11-NEXT: global_load_u16 v0, v[0:1], off glc dlc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_writelane_b32 v40, s0, 2
+; GFX11-NEXT: s_mov_b32 s1, external_void_func_i16_zeroext@abs32@hi
+; GFX11-NEXT: s_mov_b32 s0, external_void_func_i16_zeroext@abs32@lo
+; GFX11-NEXT: s_add_i32 s32, s32, 16
+; GFX11-NEXT: v_writelane_b32 v40, s30, 0
+; GFX11-NEXT: v_writelane_b32 v40, s31, 1
+; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_readlane_b32 s31, v40, 1
+; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: s_mov_b32 s32, s33
+; GFX11-NEXT: v_readlane_b32 s0, v40, 2
+; GFX11-NEXT: s_or_saveexec_b32 s1, -1
+; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
+; GFX11-NEXT: s_mov_b32 exec_lo, s1
+; GFX11-NEXT: s_mov_b32 s33, s0
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-SCRATCH-LABEL: test_call_external_void_func_i16_zeroext:
; GFX10-SCRATCH: ; %bb.0:
@@ -3245,71 +3129,38 @@ define amdgpu_gfx void @test_call_external_void_func_v2i8() #0 {
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-TRUE16-LABEL: test_call_external_void_func_v2i8:
-; GFX11-TRUE16: ; %bb.0:
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: s_mov_b32 s0, s33
-; GFX11-TRUE16-NEXT: s_mov_b32 s33, s32
-; GFX11-TRUE16-NEXT: s_or_saveexec_b32 s1, -1
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
-; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, 0
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0
-; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s0, 2
-; GFX11-TRUE16-NEXT: s_mov_b32 s1, external_void_func_v2i8@abs32@hi
-; GFX11-TRUE16-NEXT: s_mov_b32 s0, external_void_func_v2i8@abs32@lo
-; GFX11-TRUE16-NEXT: s_add_i32 s32, s32, 16
-; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v[0:1], off
-; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s30, 0
-; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s31, 1
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v1
-; GFX11-TRUE16-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-TRUE16-NEXT: v_readlane_b32 s31, v40, 1
-; GFX11-TRUE16-NEXT: v_readlane_b32 s30, v40, 0
-; GFX11-TRUE16-NEXT: s_mov_b32 s32, s33
-; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v40, 2
-; GFX11-TRUE16-NEXT: s_or_saveexec_b32 s1, -1
-; GFX11-TRUE16-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
-; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-TRUE16-NEXT: s_mov_b32 s33, s0
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-FAKE16-LABEL: test_call_external_void_func_v2i8:
-; GFX11-FAKE16: ; %bb.0:
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-FAKE16-NEXT: s_mov_b32 s0, s33
-; GFX11-FAKE16-NEXT: s_mov_b32 s33, s32
-; GFX11-FAKE16-NEXT: s_or_saveexec_b32 s1, -1
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
-; GFX11-FAKE16-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, 0
-; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, 0
-; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s0, 2
-; GFX11-FAKE16-NEXT: s_mov_b32 s1, external_void_func_v2i8@abs32@hi
-; GFX11-FAKE16-NEXT: s_mov_b32 s0, external_void_func_v2i8@abs32@lo
-; GFX11-FAKE16-NEXT: s_add_i32 s32, s32, 16
-; GFX11-FAKE16-NEXT: global_load_u16 v0, v[0:1], off
-; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s30, 0
-; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s31, 1
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v0
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 8, v1
-; GFX11-FAKE16-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-FAKE16-NEXT: v_readlane_b32 s31, v40, 1
-; GFX11-FAKE16-NEXT: v_readlane_b32 s30, v40, 0
-; GFX11-FAKE16-NEXT: s_mov_b32 s32, s33
-; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v40, 2
-; GFX11-FAKE16-NEXT: s_or_saveexec_b32 s1, -1
-; GFX11-FAKE16-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
-; GFX11-FAKE16-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-FAKE16-NEXT: s_mov_b32 s33, s0
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-LABEL: test_call_external_void_func_v2i8:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_mov_b32 s0, s33
+; GFX11-NEXT: s_mov_b32 s33, s32
+; GFX11-NEXT: s_or_saveexec_b32 s1, -1
+; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
+; GFX11-NEXT: s_mov_b32 exec_lo, s1
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: v_mov_b32_e32 v1, 0
+; GFX11-NEXT: v_writelane_b32 v40, s0, 2
+; GFX11-NEXT: s_mov_b32 s1, external_void_func_v2i8@abs32@hi
+; GFX11-NEXT: s_mov_b32 s0, external_void_func_v2i8@abs32@lo
+; GFX11-NEXT: s_add_i32 s32, s32, 16
+; GFX11-NEXT: global_load_u16 v0, v[0:1], off
+; GFX11-NEXT: v_writelane_b32 v40, s30, 0
+; GFX11-NEXT: v_writelane_b32 v40, s31, 1
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_lshrrev_b32_e32 v1, 8, v1
+; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GFX11-NEXT: v_readlane_b32 s31, v40, 1
+; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: s_mov_b32 s32, s33
+; GFX11-NEXT: v_readlane_b32 s0, v40, 2
+; GFX11-NEXT: s_or_saveexec_b32 s1, -1
+; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
+; GFX11-NEXT: s_mov_b32 exec_lo, s1
+; GFX11-NEXT: s_mov_b32 s33, s0
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-SCRATCH-LABEL: test_call_external_void_func_v2i8:
; GFX10-SCRATCH: ; %bb.0:
@@ -4297,77 +4148,41 @@ define amdgpu_gfx void @test_call_external_void_func_i8_ret() #0 {
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-TRUE16-LABEL: test_call_external_void_func_i8_ret:
-; GFX11-TRUE16: ; %bb.0:
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: s_mov_b32 s0, s33
-; GFX11-TRUE16-NEXT: s_mov_b32 s33, s32
-; GFX11-TRUE16-NEXT: s_or_saveexec_b32 s1, -1
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s33 offset:8 ; 4-byte Folded Spill
-; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-TRUE16-NEXT: s_clause 0x1
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s33 offset:4
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s33
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v40, 0
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v41, 0
-; GFX11-TRUE16-NEXT: v_writelane_b32 v42, s0, 2
-; GFX11-TRUE16-NEXT: s_mov_b32 s1, external_void_func_i8_ret@abs32@hi
-; GFX11-TRUE16-NEXT: s_mov_b32 s0, external_void_func_i8_ret@abs32@lo
-; GFX11-TRUE16-NEXT: s_add_i32 s32, s32, 16
-; GFX11-TRUE16-NEXT: global_load_d16_u8 v0, v[40:41], off
-; GFX11-TRUE16-NEXT: v_writelane_b32 v42, s30, 0
-; GFX11-TRUE16-NEXT: v_writelane_b32 v42, s31, 1
-; GFX11-TRUE16-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-TRUE16-NEXT: global_store_b8 v[40:41], v0, off
-; GFX11-TRUE16-NEXT: s_clause 0x1
-; GFX11-TRUE16-NEXT: scratch_load_b32 v41, off, s33
-; GFX11-TRUE16-NEXT: scratch_load_b32 v40, off, s33 offset:4
-; GFX11-TRUE16-NEXT: v_readlane_b32 s31, v42, 1
-; GFX11-TRUE16-NEXT: v_readlane_b32 s30, v42, 0
-; GFX11-TRUE16-NEXT: s_mov_b32 s32, s33
-; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v42, 2
-; GFX11-TRUE16-NEXT: s_or_saveexec_b32 s1, -1
-; GFX11-TRUE16-NEXT: scratch_load_b32 v42, off, s33 offset:8 ; 4-byte Folded Reload
-; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-TRUE16-NEXT: s_mov_b32 s33, s0
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-FAKE16-LABEL: test_call_external_void_func_i8_ret:
-; GFX11-FAKE16: ; %bb.0:
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-FAKE16-NEXT: s_mov_b32 s0, s33
-; GFX11-FAKE16-NEXT: s_mov_b32 s33, s32
-; GFX11-FAKE16-NEXT: s_or_saveexec_b32 s1, -1
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s33 offset:8 ; 4-byte Folded Spill
-; GFX11-FAKE16-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-FAKE16-NEXT: s_clause 0x1
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s33 offset:4
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s33
-; GFX11-FAKE16-NEXT: v_mov_b32_e32 v40, 0
-; GFX11-FAKE16-NEXT: v_mov_b32_e32 v41, 0
-; GFX11-FAKE16-NEXT: v_writelane_b32 v42, s0, 2
-; GFX11-FAKE16-NEXT: s_mov_b32 s1, external_void_func_i8_ret@abs32@hi
-; GFX11-FAKE16-NEXT: s_mov_b32 s0, external_void_func_i8_ret@abs32@lo
-; GFX11-FAKE16-NEXT: s_add_i32 s32, s32, 16
-; GFX11-FAKE16-NEXT: global_load_u8 v0, v[40:41], off
-; GFX11-FAKE16-NEXT: v_writelane_b32 v42, s30, 0
-; GFX11-FAKE16-NEXT: v_writelane_b32 v42, s31, 1
-; GFX11-FAKE16-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-FAKE16-NEXT: global_store_b8 v[40:41], v0, off
-; GFX11-FAKE16-NEXT: s_clause 0x1
-; GFX11-FAKE16-NEXT: scratch_load_b32 v41, off, s33
-; GFX11-FAKE16-NEXT: scratch_load_b32 v40, off, s33 offset:4
-; GFX11-FAKE16-NEXT: v_readlane_b32 s31, v42, 1
-; GFX11-FAKE16-NEXT: v_readlane_b32 s30, v42, 0
-; GFX11-FAKE16-NEXT: s_mov_b32 s32, s33
-; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v42, 2
-; GFX11-FAKE16-NEXT: s_or_saveexec_b32 s1, -1
-; GFX11-FAKE16-NEXT: scratch_load_b32 v42, off, s33 offset:8 ; 4-byte Folded Reload
-; GFX11-FAKE16-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-FAKE16-NEXT: s_mov_b32 s33, s0
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-LABEL: test_call_external_void_func_i8_ret:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_mov_b32 s0, s33
+; GFX11-NEXT: s_mov_b32 s33, s32
+; GFX11-NEXT: s_or_saveexec_b32 s1, -1
+; GFX11-NEXT: scratch_store_b32 off, v42, s33 offset:8 ; 4-byte Folded Spill
+; GFX11-NEXT: s_mov_b32 exec_lo, s1
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_store_b32 off, v40, s33 offset:4
+; GFX11-NEXT: scratch_store_b32 off, v41, s33
+; GFX11-NEXT: v_mov_b32_e32 v40, 0
+; GFX11-NEXT: v_mov_b32_e32 v41, 0
+; GFX11-NEXT: v_writelane_b32 v42, s0, 2
+; GFX11-NEXT: s_mov_b32 s1, external_void_func_i8_ret@abs32@hi
+; GFX11-NEXT: s_mov_b32 s0, external_void_func_i8_ret@abs32@lo
+; GFX11-NEXT: s_add_i32 s32, s32, 16
+; GFX11-NEXT: global_load_u8 v0, v[40:41], off
+; GFX11-NEXT: v_writelane_b32 v42, s30, 0
+; GFX11-NEXT: v_writelane_b32 v42, s31, 1
+; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GFX11-NEXT: global_store_b8 v[40:41], v0, off
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_load_b32 v41, off, s33
+; GFX11-NEXT: scratch_load_b32 v40, off, s33 offset:4
+; GFX11-NEXT: v_readlane_b32 s31, v42, 1
+; GFX11-NEXT: v_readlane_b32 s30, v42, 0
+; GFX11-NEXT: s_mov_b32 s32, s33
+; GFX11-NEXT: v_readlane_b32 s0, v42, 2
+; GFX11-NEXT: s_or_saveexec_b32 s1, -1
+; GFX11-NEXT: scratch_load_b32 v42, off, s33 offset:8 ; 4-byte Folded Reload
+; GFX11-NEXT: s_mov_b32 exec_lo, s1
+; GFX11-NEXT: s_mov_b32 s33, s0
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-SCRATCH-LABEL: test_call_external_void_func_i8_ret:
; GFX10-SCRATCH: ; %bb.0:
@@ -4512,7 +4327,7 @@ define amdgpu_gfx void @test_call_external_void_func_v2i8_ret() #0 {
; GFX11-TRUE16-NEXT: s_mov_b32 s1, external_void_func_v2i8_ret@abs32@hi
; GFX11-TRUE16-NEXT: s_mov_b32 s0, external_void_func_v2i8_ret@abs32@lo
; GFX11-TRUE16-NEXT: s_add_i32 s32, s32, 16
-; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v[40:41], off
+; GFX11-TRUE16-NEXT: global_load_u16 v0, v[40:41], off
; GFX11-TRUE16-NEXT: v_writelane_b32 v42, s30, 0
; GFX11-TRUE16-NEXT: v_writelane_b32 v42, s31, 1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
@@ -9157,71 +8972,38 @@ define amdgpu_gfx void @test_call_external_void_func_struct_i8_i32() #0 {
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-TRUE16-LABEL: test_call_external_void_func_struct_i8_i32:
-; GFX11-TRUE16: ; %bb.0:
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: s_mov_b32 s0, s33
-; GFX11-TRUE16-NEXT: s_mov_b32 s33, s32
-; GFX11-TRUE16-NEXT: s_or_saveexec_b32 s1, -1
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
-; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s0, 2
-; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0
-; GFX11-TRUE16-NEXT: s_add_i32 s32, s32, 16
-; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s30, 0
-; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT: s_clause 0x1
-; GFX11-TRUE16-NEXT: global_load_d16_u8 v0, v1, s[0:1]
-; GFX11-TRUE16-NEXT: global_load_b32 v1, v1, s[0:1] offset:4
-; GFX11-TRUE16-NEXT: s_mov_b32 s1, external_void_func_struct_i8_i32@abs32@hi
-; GFX11-TRUE16-NEXT: s_mov_b32 s0, external_void_func_struct_i8_i32@abs32@lo
-; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s31, 1
-; GFX11-TRUE16-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_readlane_b32 s31, v40, 1
-; GFX11-TRUE16-NEXT: v_readlane_b32 s30, v40, 0
-; GFX11-TRUE16-NEXT: s_mov_b32 s32, s33
-; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v40, 2
-; GFX11-TRUE16-NEXT: s_or_saveexec_b32 s1, -1
-; GFX11-TRUE16-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
-; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-TRUE16-NEXT: s_mov_b32 s33, s0
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-FAKE16-LABEL: test_call_external_void_func_struct_i8_i32:
-; GFX11-FAKE16: ; %bb.0:
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-FAKE16-NEXT: s_mov_b32 s0, s33
-; GFX11-FAKE16-NEXT: s_mov_b32 s33, s32
-; GFX11-FAKE16-NEXT: s_or_saveexec_b32 s1, -1
-; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
-; GFX11-FAKE16-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s0, 2
-; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
-; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, 0
-; GFX11-FAKE16-NEXT: s_add_i32 s32, s32, 16
-; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-FAKE16-NEXT: s_clause 0x1
-; GFX11-FAKE16-NEXT: global_load_u8 v0, v1, s[0:1]
-; GFX11-FAKE16-NEXT: global_load_b32 v1, v1, s[0:1] offset:4
-; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s30, 0
-; GFX11-FAKE16-NEXT: s_mov_b32 s1, external_void_func_struct_i8_i32@abs32@hi
-; GFX11-FAKE16-NEXT: s_mov_b32 s0, external_void_func_struct_i8_i32@abs32@lo
-; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s31, 1
-; GFX11-FAKE16-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_readlane_b32 s31, v40, 1
-; GFX11-FAKE16-NEXT: v_readlane_b32 s30, v40, 0
-; GFX11-FAKE16-NEXT: s_mov_b32 s32, s33
-; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v40, 2
-; GFX11-FAKE16-NEXT: s_or_saveexec_b32 s1, -1
-; GFX11-FAKE16-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
-; GFX11-FAKE16-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-FAKE16-NEXT: s_mov_b32 s33, s0
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-LABEL: test_call_external_void_func_struct_i8_i32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_mov_b32 s0, s33
+; GFX11-NEXT: s_mov_b32 s33, s32
+; GFX11-NEXT: s_or_saveexec_b32 s1, -1
+; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
+; GFX11-NEXT: s_mov_b32 exec_lo, s1
+; GFX11-NEXT: v_writelane_b32 v40, s0, 2
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-NEXT: v_mov_b32_e32 v1, 0
+; GFX11-NEXT: s_add_i32 s32, s32, 16
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: global_load_u8 v0, v1, s[0:1]
+; GFX11-NEXT: global_load_b32 v1, v1, s[0:1] offset:4
+; GFX11-NEXT: v_writelane_b32 v40, s30, 0
+; GFX11-NEXT: s_mov_b32 s1, external_void_func_struct_i8_i32@abs32@hi
+; GFX11-NEXT: s_mov_b32 s0, external_void_func_struct_i8_i32@abs32@lo
+; GFX11-NEXT: v_writelane_b32 v40, s31, 1
+; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_readlane_b32 s31, v40, 1
+; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: s_mov_b32 s32, s33
+; GFX11-NEXT: v_readlane_b32 s0, v40, 2
+; GFX11-NEXT: s_or_saveexec_b32 s1, -1
+; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
+; GFX11-NEXT: s_mov_b32 exec_lo, s1
+; GFX11-NEXT: s_mov_b32 s33, s0
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-SCRATCH-LABEL: test_call_external_void_func_struct_i8_i32:
; GFX10-SCRATCH: ; %bb.0:
@@ -9540,7 +9322,7 @@ define amdgpu_gfx void @test_call_external_void_func_sret_struct_i8_i32_byval_st
; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-TRUE16-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-TRUE16-NEXT: s_clause 0x1
-; GFX11-TRUE16-NEXT: scratch_load_d16_u8 v0, off, s33 offset:8
+; GFX11-TRUE16-NEXT: scratch_load_u8 v0, off, s33 offset:8
; GFX11-TRUE16-NEXT: scratch_load_b32 v1, off, s33 offset:12
; GFX11-TRUE16-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-TRUE16-NEXT: v_readlane_b32 s30, v40, 0
diff --git a/llvm/test/CodeGen/AMDGPU/gfx11-user-sgpr-init16-bug.ll b/llvm/test/CodeGen/AMDGPU/gfx11-user-sgpr-init16-bug.ll
index 63376def3d7e1..1afac03ada631 100644
--- a/llvm/test/CodeGen/AMDGPU/gfx11-user-sgpr-init16-bug.ll
+++ b/llvm/test/CodeGen/AMDGPU/gfx11-user-sgpr-init16-bug.ll
@@ -1,12 +1,12 @@
-; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+real-true16 -amdgpu-enable-vopd=0 < %s | FileCheck -check-prefixes=GCN,WORKAROUND,WORKAROUND-TRUE16-SDAG %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+real-true16,+real-true-d16 -amdgpu-enable-vopd=0 < %s | FileCheck -check-prefixes=GCN,WORKAROUND,WORKAROUND-TRUE16-SDAG %s
; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=-real-true16 -amdgpu-enable-vopd=0 < %s | FileCheck -check-prefixes=GCN,WORKAROUND,WORKAROUND-FAKE16 %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+real-true16 -amdgpu-enable-vopd=0 < %s | FileCheck -check-prefixes=GCN,WORKAROUND %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+real-true16,+real-true-d16 -amdgpu-enable-vopd=0 < %s | FileCheck -check-prefixes=GCN,WORKAROUND %s
; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=-real-true16 -amdgpu-enable-vopd=0 < %s | FileCheck -check-prefixes=GCN,WORKAROUND,WORKAROUND-FAKE16 %s
; Does not apply to wave64
-; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+real-true16 -mattr=+wavefrontsize64 -amdgpu-enable-vopd=0 < %s | FileCheck -check-prefixes=GCN,NOWORKAROUND %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+real-true16,+real-true-d16 -mattr=+wavefrontsize64 -amdgpu-enable-vopd=0 < %s | FileCheck -check-prefixes=GCN,NOWORKAROUND %s
; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=-real-true16 -mattr=+wavefrontsize64 -amdgpu-enable-vopd=0 < %s | FileCheck -check-prefixes=GCN,NOWORKAROUND %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+real-true16 -mattr=+wavefrontsize64 -amdgpu-enable-vopd=0 < %s | FileCheck -check-prefixes=GCN,NOWORKAROUND %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+real-true16,+real-true-d16 -mattr=+wavefrontsize64 -amdgpu-enable-vopd=0 < %s | FileCheck -check-prefixes=GCN,NOWORKAROUND %s
; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=-real-true16 -mattr=+wavefrontsize64 -amdgpu-enable-vopd=0 < %s | FileCheck -check-prefixes=GCN,NOWORKAROUND %s
; Does not apply to gfx1101
diff --git a/llvm/test/CodeGen/AMDGPU/global-extload-gfx11plus.ll b/llvm/test/CodeGen/AMDGPU/global-extload-gfx11plus.ll
index f92ba7a8978b9..e36ee94ad7cd8 100644
--- a/llvm/test/CodeGen/AMDGPU/global-extload-gfx11plus.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-extload-gfx11plus.ll
@@ -8,7 +8,7 @@ define amdgpu_kernel void @zextload_global_i8_to_i16(ptr addrspace(1) %out, ptr
; GFX11-REAL16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-REAL16-NEXT: v_mov_b32_e32 v1, 0
; GFX11-REAL16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-REAL16-NEXT: global_load_d16_u8 v0, v1, s[2:3]
+; GFX11-REAL16-NEXT: global_load_u8 v0, v1, s[2:3]
; GFX11-REAL16-NEXT: s_waitcnt vmcnt(0)
; GFX11-REAL16-NEXT: global_store_b16 v1, v0, s[0:1]
; GFX11-REAL16-NEXT: s_endpgm
@@ -34,7 +34,7 @@ define amdgpu_kernel void @sextload_global_i8_to_i16(ptr addrspace(1) %out, ptr
; GFX11-REAL16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-REAL16-NEXT: v_mov_b32_e32 v1, 0
; GFX11-REAL16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-REAL16-NEXT: global_load_d16_i8 v0, v1, s[2:3]
+; GFX11-REAL16-NEXT: global_load_i8 v0, v1, s[2:3]
; GFX11-REAL16-NEXT: s_waitcnt vmcnt(0)
; GFX11-REAL16-NEXT: global_store_b16 v1, v0, s[0:1]
; GFX11-REAL16-NEXT: s_endpgm
@@ -55,27 +55,16 @@ define amdgpu_kernel void @sextload_global_i8_to_i16(ptr addrspace(1) %out, ptr
}
define amdgpu_kernel void @zextload_global_i8_to_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind {
-; GFX11-REAL16-LABEL: zextload_global_i8_to_i64:
-; GFX11-REAL16: ; %bb.0:
-; GFX11-REAL16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-REAL16-NEXT: v_mov_b32_e32 v1, 0
-; GFX11-REAL16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-REAL16-NEXT: global_load_d16_u8 v0, v1, s[2:3]
-; GFX11-REAL16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-REAL16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-REAL16-NEXT: global_store_b64 v1, v[0:1], s[0:1]
-; GFX11-REAL16-NEXT: s_endpgm
-;
-; GFX11-FAKE16-LABEL: zextload_global_i8_to_i64:
-; GFX11-FAKE16: ; %bb.0:
-; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, 0
-; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-FAKE16-NEXT: global_load_u8 v0, v1, s[2:3]
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT: global_store_b64 v1, v[0:1], s[0:1]
-; GFX11-FAKE16-NEXT: s_endpgm
+; GFX11-LABEL: zextload_global_i8_to_i64:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: v_mov_b32_e32 v1, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: global_load_u8 v0, v1, s[2:3]
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT: global_store_b64 v1, v[0:1], s[0:1]
+; GFX11-NEXT: s_endpgm
%a = load i8, ptr addrspace(1) %in
%ext = zext i8 %a to i64
store i64 %ext, ptr addrspace(1) %out
@@ -83,31 +72,18 @@ define amdgpu_kernel void @zextload_global_i8_to_i64(ptr addrspace(1) %out, ptr
}
define amdgpu_kernel void @sextload_global_i8_to_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind {
-; GFX11-REAL16-LABEL: sextload_global_i8_to_i64:
-; GFX11-REAL16: ; %bb.0:
-; GFX11-REAL16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-REAL16-NEXT: v_mov_b32_e32 v2, 0
-; GFX11-REAL16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-REAL16-NEXT: global_load_d16_i8 v0, v2, s[2:3]
-; GFX11-REAL16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-REAL16-NEXT: v_bfe_i32 v0, v0, 0, 16
-; GFX11-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-REAL16-NEXT: v_ashrrev_i32_e32 v1, 31, v0
-; GFX11-REAL16-NEXT: global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-REAL16-NEXT: s_endpgm
-;
-; GFX11-FAKE16-LABEL: sextload_global_i8_to_i64:
-; GFX11-FAKE16: ; %bb.0:
-; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, 0
-; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-FAKE16-NEXT: global_load_i8 v0, v2, s[2:3]
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT: v_bfe_i32 v0, v0, 0, 16
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_ashrrev_i32_e32 v1, 31, v0
-; GFX11-FAKE16-NEXT: global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-FAKE16-NEXT: s_endpgm
+; GFX11-LABEL: sextload_global_i8_to_i64:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: global_load_i8 v0, v2, s[2:3]
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_bfe_i32 v0, v0, 0, 16
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_ashrrev_i32_e32 v1, 31, v0
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: s_endpgm
%a = load i8, ptr addrspace(1) %in
%ext = sext i8 %a to i64
store i64 %ext, ptr addrspace(1) %out
@@ -147,27 +123,16 @@ define amdgpu_kernel void @sextload_global_i16_to_i32(ptr addrspace(1) %out, ptr
}
define amdgpu_kernel void @zextload_global_i16_to_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind {
-; GFX11-REAL16-LABEL: zextload_global_i16_to_i64:
-; GFX11-REAL16: ; %bb.0:
-; GFX11-REAL16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-REAL16-NEXT: v_mov_b32_e32 v1, 0
-; GFX11-REAL16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-REAL16-NEXT: global_load_d16_b16 v0, v1, s[2:3]
-; GFX11-REAL16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-REAL16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-REAL16-NEXT: global_store_b64 v1, v[0:1], s[0:1]
-; GFX11-REAL16-NEXT: s_endpgm
-;
-; GFX11-FAKE16-LABEL: zextload_global_i16_to_i64:
-; GFX11-FAKE16: ; %bb.0:
-; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, 0
-; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-FAKE16-NEXT: global_load_u16 v0, v1, s[2:3]
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT: global_store_b64 v1, v[0:1], s[0:1]
-; GFX11-FAKE16-NEXT: s_endpgm
+; GFX11-LABEL: zextload_global_i16_to_i64:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: v_mov_b32_e32 v1, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: global_load_u16 v0, v1, s[2:3]
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT: global_store_b64 v1, v[0:1], s[0:1]
+; GFX11-NEXT: s_endpgm
%a = load i16, ptr addrspace(1) %in
%ext = zext i16 %a to i64
store i64 %ext, ptr addrspace(1) %out
@@ -175,31 +140,18 @@ define amdgpu_kernel void @zextload_global_i16_to_i64(ptr addrspace(1) %out, ptr
}
define amdgpu_kernel void @sextload_global_i16_to_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind {
-; GFX11-REAL16-LABEL: sextload_global_i16_to_i64:
-; GFX11-REAL16: ; %bb.0:
-; GFX11-REAL16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-REAL16-NEXT: v_mov_b32_e32 v2, 0
-; GFX11-REAL16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-REAL16-NEXT: global_load_d16_b16 v0, v2, s[2:3]
-; GFX11-REAL16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-REAL16-NEXT: v_bfe_i32 v0, v0, 0, 16
-; GFX11-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-REAL16-NEXT: v_ashrrev_i32_e32 v1, 31, v0
-; GFX11-REAL16-NEXT: global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-REAL16-NEXT: s_endpgm
-;
-; GFX11-FAKE16-LABEL: sextload_global_i16_to_i64:
-; GFX11-FAKE16: ; %bb.0:
-; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, 0
-; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-FAKE16-NEXT: global_load_u16 v0, v2, s[2:3]
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT: v_bfe_i32 v0, v0, 0, 16
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_ashrrev_i32_e32 v1, 31, v0
-; GFX11-FAKE16-NEXT: global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-FAKE16-NEXT: s_endpgm
+; GFX11-LABEL: sextload_global_i16_to_i64:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: global_load_u16 v0, v2, s[2:3]
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_bfe_i32 v0, v0, 0, 16
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_ashrrev_i32_e32 v1, 31, v0
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: s_endpgm
%a = load i16, ptr addrspace(1) %in
%ext = sext i16 %a to i64
store i64 %ext, ptr addrspace(1) %out
diff --git a/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll b/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll
index 1602e31d6147c..e6ee27b4fb692 100644
--- a/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll
@@ -5,7 +5,7 @@
; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -mattr=-real-true16 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 -mattr=+real-true16 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX12,GFX12-SDAG,GFX12-SDAG-TRUE16 %s
; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 -mattr=-real-true16 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX12,GFX12-SDAG,GFX12-SDAG-FAKE16 %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 -mattr=+real-true16 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL,GFX12-GISEL-TRUE16 %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 -mattr=+real-true16,+real-true-d16 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL,GFX12-GISEL-TRUE16 %s
; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 -mattr=-real-true16 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL,GFX12-GISEL-FAKE16 %s
; Test using saddr addressing mode of global_*load_* flat instructions.
@@ -2290,35 +2290,17 @@ define amdgpu_ps half @global_load_saddr_i16(ptr addrspace(1) inreg %sbase, i32
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: ; return to shader part epilog
;
-; GFX11-TRUE16-LABEL: global_load_saddr_i16:
-; GFX11-TRUE16: ; %bb.0:
-; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v0, s[2:3]
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: ; return to shader part epilog
-;
-; GFX11-FAKE16-LABEL: global_load_saddr_i16:
-; GFX11-FAKE16: ; %bb.0:
-; GFX11-FAKE16-NEXT: global_load_u16 v0, v0, s[2:3]
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT: ; return to shader part epilog
-;
-; GFX12-SDAG-TRUE16-LABEL: global_load_saddr_i16:
-; GFX12-SDAG-TRUE16: ; %bb.0:
-; GFX12-SDAG-TRUE16-NEXT: global_load_d16_b16 v0, v0, s[2:3]
-; GFX12-SDAG-TRUE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: ; return to shader part epilog
-;
-; GFX12-SDAG-FAKE16-LABEL: global_load_saddr_i16:
-; GFX12-SDAG-FAKE16: ; %bb.0:
-; GFX12-SDAG-FAKE16-NEXT: global_load_u16 v0, v0, s[2:3]
-; GFX12-SDAG-FAKE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: ; return to shader part epilog
-;
-; GFX12-GISEL-LABEL: global_load_saddr_i16:
-; GFX12-GISEL: ; %bb.0:
-; GFX12-GISEL-NEXT: global_load_u16 v0, v0, s[2:3]
-; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
-; GFX12-GISEL-NEXT: ; return to shader part epilog
+; GFX11-LABEL: global_load_saddr_i16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: global_load_u16 v0, v0, s[2:3]
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: global_load_saddr_i16:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: global_load_u16 v0, v0, s[2:3]
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: ; return to shader part epilog
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
%load = load i16, ptr addrspace(1) %gep0
@@ -2333,35 +2315,17 @@ define amdgpu_ps half @global_load_saddr_i16_immneg128(ptr addrspace(1) inreg %s
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: ; return to shader part epilog
;
-; GFX11-TRUE16-LABEL: global_load_saddr_i16_immneg128:
-; GFX11-TRUE16: ; %bb.0:
-; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v0, s[2:3] offset:-128
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: ; return to shader part epilog
-;
-; GFX11-FAKE16-LABEL: global_load_saddr_i16_immneg128:
-; GFX11-FAKE16: ; %bb.0:
-; GFX11-FAKE16-NEXT: global_load_u16 v0, v0, s[2:3] offset:-128
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT: ; return to shader part epilog
-;
-; GFX12-SDAG-TRUE16-LABEL: global_load_saddr_i16_immneg128:
-; GFX12-SDAG-TRUE16: ; %bb.0:
-; GFX12-SDAG-TRUE16-NEXT: global_load_d16_b16 v0, v0, s[2:3] offset:-128
-; GFX12-SDAG-TRUE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: ; return to shader part epilog
-;
-; GFX12-SDAG-FAKE16-LABEL: global_load_saddr_i16_immneg128:
-; GFX12-SDAG-FAKE16: ; %bb.0:
-; GFX12-SDAG-FAKE16-NEXT: global_load_u16 v0, v0, s[2:3] offset:-128
-; GFX12-SDAG-FAKE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: ; return to shader part epilog
-;
-; GFX12-GISEL-LABEL: global_load_saddr_i16_immneg128:
-; GFX12-GISEL: ; %bb.0:
-; GFX12-GISEL-NEXT: global_load_u16 v0, v0, s[2:3] offset:-128
-; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
-; GFX12-GISEL-NEXT: ; return to shader part epilog
+; GFX11-LABEL: global_load_saddr_i16_immneg128:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: global_load_u16 v0, v0, s[2:3] offset:-128
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: global_load_saddr_i16_immneg128:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: global_load_u16 v0, v0, s[2:3] offset:-128
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: ; return to shader part epilog
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
%gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
@@ -2377,35 +2341,17 @@ define amdgpu_ps half @global_load_saddr_f16(ptr addrspace(1) inreg %sbase, i32
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: ; return to shader part epilog
;
-; GFX11-TRUE16-LABEL: global_load_saddr_f16:
-; GFX11-TRUE16: ; %bb.0:
-; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v0, s[2:3]
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: ; return to shader part epilog
-;
-; GFX11-FAKE16-LABEL: global_load_saddr_f16:
-; GFX11-FAKE16: ; %bb.0:
-; GFX11-FAKE16-NEXT: global_load_u16 v0, v0, s[2:3]
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT: ; return to shader part epilog
-;
-; GFX12-SDAG-TRUE16-LABEL: global_load_saddr_f16:
-; GFX12-SDAG-TRUE16: ; %bb.0:
-; GFX12-SDAG-TRUE16-NEXT: global_load_d16_b16 v0, v0, s[2:3]
-; GFX12-SDAG-TRUE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: ; return to shader part epilog
-;
-; GFX12-SDAG-FAKE16-LABEL: global_load_saddr_f16:
-; GFX12-SDAG-FAKE16: ; %bb.0:
-; GFX12-SDAG-FAKE16-NEXT: global_load_u16 v0, v0, s[2:3]
-; GFX12-SDAG-FAKE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: ; return to shader part epilog
-;
-; GFX12-GISEL-LABEL: global_load_saddr_f16:
-; GFX12-GISEL: ; %bb.0:
-; GFX12-GISEL-NEXT: global_load_u16 v0, v0, s[2:3]
-; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
-; GFX12-GISEL-NEXT: ; return to shader part epilog
+; GFX11-LABEL: global_load_saddr_f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: global_load_u16 v0, v0, s[2:3]
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: global_load_saddr_f16:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: global_load_u16 v0, v0, s[2:3]
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: ; return to shader part epilog
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
%load = load half, ptr addrspace(1) %gep0
@@ -2419,35 +2365,17 @@ define amdgpu_ps half @global_load_saddr_f16_immneg128(ptr addrspace(1) inreg %s
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: ; return to shader part epilog
;
-; GFX11-TRUE16-LABEL: global_load_saddr_f16_immneg128:
-; GFX11-TRUE16: ; %bb.0:
-; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v0, s[2:3] offset:-128
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: ; return to shader part epilog
-;
-; GFX11-FAKE16-LABEL: global_load_saddr_f16_immneg128:
-; GFX11-FAKE16: ; %bb.0:
-; GFX11-FAKE16-NEXT: global_load_u16 v0, v0, s[2:3] offset:-128
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT: ; return to shader part epilog
-;
-; GFX12-SDAG-TRUE16-LABEL: global_load_saddr_f16_immneg128:
-; GFX12-SDAG-TRUE16: ; %bb.0:
-; GFX12-SDAG-TRUE16-NEXT: global_load_d16_b16 v0, v0, s[2:3] offset:-128
-; GFX12-SDAG-TRUE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: ; return to shader part epilog
-;
-; GFX12-SDAG-FAKE16-LABEL: global_load_saddr_f16_immneg128:
-; GFX12-SDAG-FAKE16: ; %bb.0:
-; GFX12-SDAG-FAKE16-NEXT: global_load_u16 v0, v0, s[2:3] offset:-128
-; GFX12-SDAG-FAKE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: ; return to shader part epilog
-;
-; GFX12-GISEL-LABEL: global_load_saddr_f16_immneg128:
-; GFX12-GISEL: ; %bb.0:
-; GFX12-GISEL-NEXT: global_load_u16 v0, v0, s[2:3] offset:-128
-; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
-; GFX12-GISEL-NEXT: ; return to shader part epilog
+; GFX11-LABEL: global_load_saddr_f16_immneg128:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: global_load_u16 v0, v0, s[2:3] offset:-128
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: global_load_saddr_f16_immneg128:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: global_load_u16 v0, v0, s[2:3] offset:-128
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: ; return to shader part epilog
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
%gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
@@ -5039,3 +4967,8 @@ bb3: ; preds = %bb3, %bb
!0 = !{i32 0, i32 1073741824} ; (1 << 30)
!1 = !{i32 0, i32 1073741825} ; (1 << 30) + 1
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GFX11-FAKE16: {{.*}}
+; GFX11-TRUE16: {{.*}}
+; GFX12-SDAG-FAKE16: {{.*}}
+; GFX12-SDAG-TRUE16: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics.ll b/llvm/test/CodeGen/AMDGPU/global_atomics.ll
index da132d0269e6b..cc653a5b4bd97 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics.ll
@@ -7451,7 +7451,7 @@ define amdgpu_kernel void @atomic_load_i8_offset(ptr addrspace(1) %in, ptr addrs
; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT: global_load_d16_u8 v0, v1, s[0:1] offset:16 glc
+; GFX11-TRUE16-NEXT: global_load_u8 v0, v1, s[0:1] offset:16 glc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: buffer_gl1_inv
; GFX11-TRUE16-NEXT: buffer_gl0_inv
@@ -7529,7 +7529,7 @@ define amdgpu_kernel void @atomic_load_i8_negoffset(ptr addrspace(1) %in, ptr ad
; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT: global_load_d16_u8 v0, v1, s[0:1] offset:-512 glc
+; GFX11-TRUE16-NEXT: global_load_u8 v0, v1, s[0:1] offset:-512 glc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: buffer_gl1_inv
; GFX11-TRUE16-NEXT: buffer_gl0_inv
@@ -7722,7 +7722,7 @@ define amdgpu_kernel void @atomic_load_i16_offset(ptr addrspace(1) %in, ptr addr
; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[0:1] offset:16 glc
+; GFX11-TRUE16-NEXT: global_load_u16 v0, v1, s[0:1] offset:16 glc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: buffer_gl1_inv
; GFX11-TRUE16-NEXT: buffer_gl0_inv
@@ -7800,7 +7800,7 @@ define amdgpu_kernel void @atomic_load_i16_negoffset(ptr addrspace(1) %in, ptr a
; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[0:1] offset:-512 glc
+; GFX11-TRUE16-NEXT: global_load_u16 v0, v1, s[0:1] offset:-512 glc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: buffer_gl1_inv
; GFX11-TRUE16-NEXT: buffer_gl0_inv
@@ -9187,7 +9187,7 @@ define amdgpu_kernel void @atomic_load_f16_offset(ptr addrspace(1) %in, ptr addr
; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[0:1] offset:16 glc
+; GFX11-TRUE16-NEXT: global_load_u16 v0, v1, s[0:1] offset:16 glc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: buffer_gl1_inv
; GFX11-TRUE16-NEXT: buffer_gl0_inv
@@ -9264,7 +9264,7 @@ define amdgpu_kernel void @atomic_load_f16_negoffset(ptr addrspace(1) %in, ptr a
; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[0:1] offset:-512 glc
+; GFX11-TRUE16-NEXT: global_load_u16 v0, v1, s[0:1] offset:-512 glc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: buffer_gl1_inv
; GFX11-TRUE16-NEXT: buffer_gl0_inv
@@ -9337,7 +9337,7 @@ define amdgpu_kernel void @atomic_load_bf16_offset(ptr addrspace(1) %in, ptr add
; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[0:1] offset:16 glc
+; GFX11-TRUE16-NEXT: global_load_u16 v0, v1, s[0:1] offset:16 glc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: buffer_gl1_inv
; GFX11-TRUE16-NEXT: buffer_gl0_inv
@@ -9414,7 +9414,7 @@ define amdgpu_kernel void @atomic_load_bf16_negoffset(ptr addrspace(1) %in, ptr
; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[0:1] offset:-512 glc
+; GFX11-TRUE16-NEXT: global_load_u16 v0, v1, s[0:1] offset:-512 glc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: buffer_gl1_inv
; GFX11-TRUE16-NEXT: buffer_gl0_inv
@@ -9552,6 +9552,47 @@ define amdgpu_kernel void @atomic_sub_i16_soffset__amdgpu_no_remote_memory(ptr a
; GFX9-NEXT: s_cbranch_execnz .LBB136_1
; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX9-NEXT: s_endpgm
+;
+; GFX11-LABEL: atomic_sub_i16_soffset__amdgpu_no_remote_memory:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c
+; GFX11-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_add_u32 s3, s0, 0x4650
+; GFX11-NEXT: s_addc_u32 s1, s1, 0
+; GFX11-NEXT: s_and_b32 s0, s3, -4
+; GFX11-NEXT: s_and_b32 s3, s3, 3
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x0
+; GFX11-NEXT: s_lshl_b32 s5, s3, 3
+; GFX11-NEXT: s_and_b32 s6, s2, 0xffff
+; GFX11-NEXT: s_lshl_b32 s2, 0xffff, s5
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_not_b32 s3, s2
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_mov_b32_e32 v1, s4
+; GFX11-NEXT: s_lshl_b32 s4, s6, s5
+; GFX11-NEXT: s_mov_b32 s5, 0
+; GFX11-NEXT: .LBB136_1: ; %atomicrmw.start
+; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_subrev_nc_u32_e32 v0, s4, v1
+; GFX11-NEXT: v_and_b32_e32 v0, s2, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_and_or_b32 v0, v1, s3, v0
+; GFX11-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
+; GFX11-NEXT: v_mov_b32_e32 v1, v0
+; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
+; GFX11-NEXT: s_cbranch_execnz .LBB136_1
+; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-NEXT: s_endpgm
%gep = getelementptr i16, ptr addrspace(1) %out, i64 9000
%val = atomicrmw sub ptr addrspace(1) %gep, i16 %in syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
ret void
@@ -9671,6 +9712,47 @@ define amdgpu_kernel void @atomic_sub_i8_soffset__amdgpu_no_remote_memory(ptr ad
; GFX9-NEXT: s_cbranch_execnz .LBB137_1
; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX9-NEXT: s_endpgm
+;
+; GFX11-LABEL: atomic_sub_i8_soffset__amdgpu_no_remote_memory:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c
+; GFX11-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_add_u32 s3, s0, 0x2328
+; GFX11-NEXT: s_addc_u32 s1, s1, 0
+; GFX11-NEXT: s_and_b32 s0, s3, -4
+; GFX11-NEXT: s_and_b32 s3, s3, 3
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x0
+; GFX11-NEXT: s_lshl_b32 s5, s3, 3
+; GFX11-NEXT: s_and_b32 s6, s2, 0xff
+; GFX11-NEXT: s_lshl_b32 s2, 0xff, s5
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_not_b32 s3, s2
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_mov_b32_e32 v1, s4
+; GFX11-NEXT: s_lshl_b32 s4, s6, s5
+; GFX11-NEXT: s_mov_b32 s5, 0
+; GFX11-NEXT: .LBB137_1: ; %atomicrmw.start
+; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_subrev_nc_u32_e32 v0, s4, v1
+; GFX11-NEXT: v_and_b32_e32 v0, s2, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_and_or_b32 v0, v1, s3, v0
+; GFX11-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
+; GFX11-NEXT: v_mov_b32_e32 v1, v0
+; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
+; GFX11-NEXT: s_cbranch_execnz .LBB137_1
+; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-NEXT: s_endpgm
%gep = getelementptr i8, ptr addrspace(1) %out, i64 9000
%val = atomicrmw sub ptr addrspace(1) %gep, i8 %in syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
ret void
diff --git a/llvm/test/CodeGen/AMDGPU/half.ll b/llvm/test/CodeGen/AMDGPU/half.ll
index 8e427a6ef2023..1bb4fb30465f3 100644
--- a/llvm/test/CodeGen/AMDGPU/half.ll
+++ b/llvm/test/CodeGen/AMDGPU/half.ll
@@ -967,7 +967,7 @@ define amdgpu_kernel void @global_load_store_f16(ptr addrspace(1) %out, ptr addr
; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[2:3]
+; GFX11-TRUE16-NEXT: global_load_u16 v0, v1, s[2:3]
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
; GFX11-TRUE16-NEXT: s_endpgm
@@ -1100,12 +1100,12 @@ define amdgpu_kernel void @global_extload_f16_to_f32(ptr addrspace(1) %out, ptr
; GFX11-TRUE16-LABEL: global_extload_f16_to_f32:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, 0
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[2:3]
+; GFX11-TRUE16-NEXT: global_load_u16 v1, v0, s[2:3]
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v0.l
-; GFX11-TRUE16-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v1, v1.l
+; GFX11-TRUE16-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-TRUE16-NEXT: s_endpgm
;
; GFX11-FAKE16-LABEL: global_extload_f16_to_f32:
@@ -1694,7 +1694,7 @@ define amdgpu_kernel void @global_extload_f16_to_f64(ptr addrspace(1) %out, ptr
; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, 0
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v2, s[2:3]
+; GFX11-TRUE16-NEXT: global_load_u16 v0, v2, s[2:3]
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v0.l
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -3512,7 +3512,7 @@ define amdgpu_kernel void @test_bitcast_from_half(ptr addrspace(1) %in, ptr addr
; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[0:1]
+; GFX11-TRUE16-NEXT: global_load_u16 v0, v1, s[0:1]
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[2:3]
; GFX11-TRUE16-NEXT: s_endpgm
@@ -3554,7 +3554,7 @@ define amdgpu_kernel void @test_bitcast_to_half(ptr addrspace(1) %out, ptr addrs
; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[2:3]
+; GFX11-TRUE16-NEXT: global_load_u16 v0, v1, s[2:3]
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
; GFX11-TRUE16-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/icmp.i16.ll b/llvm/test/CodeGen/AMDGPU/icmp.i16.ll
index fc4cdcda99ae4..f7d90cbf45bcb 100644
--- a/llvm/test/CodeGen/AMDGPU/icmp.i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/icmp.i16.ll
@@ -77,18 +77,18 @@ define amdgpu_kernel void @i16_eq(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr
; GFX11-TRUE16-NEXT: s_clause 0x1
; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0x3ff, v0
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 1, v1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 2, v1
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 1, v0
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: s_clause 0x1
-; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v2, s[2:3]
-; GFX11-TRUE16-NEXT: global_load_d16_hi_b16 v0, v2, s[4:5]
+; GFX11-TRUE16-NEXT: global_load_u16 v2, v1, s[2:3]
+; GFX11-TRUE16-NEXT: global_load_u16 v1, v1, s[4:5]
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, v0.l, v0.h
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
-; GFX11-TRUE16-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, v2.l, v1.l
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo
+; GFX11-TRUE16-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-TRUE16-NEXT: s_endpgm
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -174,18 +174,18 @@ define amdgpu_kernel void @i16_ne(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr
; GFX11-TRUE16-NEXT: s_clause 0x1
; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0x3ff, v0
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 1, v1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 2, v1
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 1, v0
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: s_clause 0x1
-; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v2, s[2:3]
-; GFX11-TRUE16-NEXT: global_load_d16_hi_b16 v0, v2, s[4:5]
+; GFX11-TRUE16-NEXT: global_load_u16 v2, v1, s[2:3]
+; GFX11-TRUE16-NEXT: global_load_u16 v1, v1, s[4:5]
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_cmp_ne_u16_e32 vcc_lo, v0.l, v0.h
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
-; GFX11-TRUE16-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-TRUE16-NEXT: v_cmp_ne_u16_e32 vcc_lo, v2.l, v1.l
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo
+; GFX11-TRUE16-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-TRUE16-NEXT: s_endpgm
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -271,18 +271,18 @@ define amdgpu_kernel void @i16_ugt(ptr addrspace(1) %out, ptr addrspace(1) %a.pt
; GFX11-TRUE16-NEXT: s_clause 0x1
; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0x3ff, v0
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 1, v1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 2, v1
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 1, v0
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: s_clause 0x1
-; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v2, s[2:3]
-; GFX11-TRUE16-NEXT: global_load_d16_hi_b16 v0, v2, s[4:5]
+; GFX11-TRUE16-NEXT: global_load_u16 v2, v1, s[2:3]
+; GFX11-TRUE16-NEXT: global_load_u16 v1, v1, s[4:5]
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_cmp_gt_u16_e32 vcc_lo, v0.l, v0.h
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
-; GFX11-TRUE16-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-TRUE16-NEXT: v_cmp_gt_u16_e32 vcc_lo, v2.l, v1.l
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo
+; GFX11-TRUE16-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-TRUE16-NEXT: s_endpgm
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -368,18 +368,18 @@ define amdgpu_kernel void @i16_uge(ptr addrspace(1) %out, ptr addrspace(1) %a.pt
; GFX11-TRUE16-NEXT: s_clause 0x1
; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0x3ff, v0
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 1, v1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 2, v1
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 1, v0
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: s_clause 0x1
-; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v2, s[2:3]
-; GFX11-TRUE16-NEXT: global_load_d16_hi_b16 v0, v2, s[4:5]
+; GFX11-TRUE16-NEXT: global_load_u16 v2, v1, s[2:3]
+; GFX11-TRUE16-NEXT: global_load_u16 v1, v1, s[4:5]
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_cmp_ge_u16_e32 vcc_lo, v0.l, v0.h
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
-; GFX11-TRUE16-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-TRUE16-NEXT: v_cmp_ge_u16_e32 vcc_lo, v2.l, v1.l
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo
+; GFX11-TRUE16-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-TRUE16-NEXT: s_endpgm
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -465,18 +465,18 @@ define amdgpu_kernel void @i16_ult(ptr addrspace(1) %out, ptr addrspace(1) %a.pt
; GFX11-TRUE16-NEXT: s_clause 0x1
; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0x3ff, v0
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 1, v1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 2, v1
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 1, v0
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: s_clause 0x1
-; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v2, s[2:3]
-; GFX11-TRUE16-NEXT: global_load_d16_hi_b16 v0, v2, s[4:5]
+; GFX11-TRUE16-NEXT: global_load_u16 v2, v1, s[2:3]
+; GFX11-TRUE16-NEXT: global_load_u16 v1, v1, s[4:5]
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_cmp_lt_u16_e32 vcc_lo, v0.l, v0.h
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
-; GFX11-TRUE16-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-TRUE16-NEXT: v_cmp_lt_u16_e32 vcc_lo, v2.l, v1.l
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo
+; GFX11-TRUE16-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-TRUE16-NEXT: s_endpgm
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -562,18 +562,18 @@ define amdgpu_kernel void @i16_ule(ptr addrspace(1) %out, ptr addrspace(1) %a.pt
; GFX11-TRUE16-NEXT: s_clause 0x1
; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0x3ff, v0
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 1, v1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 2, v1
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 1, v0
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: s_clause 0x1
-; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v2, s[2:3]
-; GFX11-TRUE16-NEXT: global_load_d16_hi_b16 v0, v2, s[4:5]
+; GFX11-TRUE16-NEXT: global_load_u16 v2, v1, s[2:3]
+; GFX11-TRUE16-NEXT: global_load_u16 v1, v1, s[4:5]
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_cmp_le_u16_e32 vcc_lo, v0.l, v0.h
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
-; GFX11-TRUE16-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-TRUE16-NEXT: v_cmp_le_u16_e32 vcc_lo, v2.l, v1.l
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo
+; GFX11-TRUE16-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-TRUE16-NEXT: s_endpgm
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -660,18 +660,18 @@ define amdgpu_kernel void @i16_sgt(ptr addrspace(1) %out, ptr addrspace(1) %a.pt
; GFX11-TRUE16-NEXT: s_clause 0x1
; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0x3ff, v0
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 1, v1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 2, v1
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 1, v0
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: s_clause 0x1
-; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v2, s[2:3]
-; GFX11-TRUE16-NEXT: global_load_d16_hi_b16 v0, v2, s[4:5]
+; GFX11-TRUE16-NEXT: global_load_u16 v2, v1, s[2:3]
+; GFX11-TRUE16-NEXT: global_load_u16 v1, v1, s[4:5]
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_cmp_gt_i16_e32 vcc_lo, v0.l, v0.h
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
-; GFX11-TRUE16-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-TRUE16-NEXT: v_cmp_gt_i16_e32 vcc_lo, v2.l, v1.l
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo
+; GFX11-TRUE16-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-TRUE16-NEXT: s_endpgm
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -757,18 +757,18 @@ define amdgpu_kernel void @i16_sge(ptr addrspace(1) %out, ptr addrspace(1) %a.pt
; GFX11-TRUE16-NEXT: s_clause 0x1
; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0x3ff, v0
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 1, v1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 2, v1
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 1, v0
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: s_clause 0x1
-; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v2, s[2:3]
-; GFX11-TRUE16-NEXT: global_load_d16_hi_b16 v0, v2, s[4:5]
+; GFX11-TRUE16-NEXT: global_load_u16 v2, v1, s[2:3]
+; GFX11-TRUE16-NEXT: global_load_u16 v1, v1, s[4:5]
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_cmp_ge_i16_e32 vcc_lo, v0.l, v0.h
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
-; GFX11-TRUE16-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-TRUE16-NEXT: v_cmp_ge_i16_e32 vcc_lo, v2.l, v1.l
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo
+; GFX11-TRUE16-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-TRUE16-NEXT: s_endpgm
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -854,18 +854,18 @@ define amdgpu_kernel void @i16_slt(ptr addrspace(1) %out, ptr addrspace(1) %a.pt
; GFX11-TRUE16-NEXT: s_clause 0x1
; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0x3ff, v0
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 1, v1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 2, v1
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 1, v0
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: s_clause 0x1
-; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v2, s[2:3]
-; GFX11-TRUE16-NEXT: global_load_d16_hi_b16 v0, v2, s[4:5]
+; GFX11-TRUE16-NEXT: global_load_u16 v2, v1, s[2:3]
+; GFX11-TRUE16-NEXT: global_load_u16 v1, v1, s[4:5]
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_cmp_lt_i16_e32 vcc_lo, v0.l, v0.h
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
-; GFX11-TRUE16-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-TRUE16-NEXT: v_cmp_lt_i16_e32 vcc_lo, v2.l, v1.l
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo
+; GFX11-TRUE16-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-TRUE16-NEXT: s_endpgm
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -951,18 +951,18 @@ define amdgpu_kernel void @i16_sle(ptr addrspace(1) %out, ptr addrspace(1) %a.pt
; GFX11-TRUE16-NEXT: s_clause 0x1
; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0x3ff, v0
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 1, v1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 2, v1
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 1, v0
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: s_clause 0x1
-; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v2, s[2:3]
-; GFX11-TRUE16-NEXT: global_load_d16_hi_b16 v0, v2, s[4:5]
+; GFX11-TRUE16-NEXT: global_load_u16 v2, v1, s[2:3]
+; GFX11-TRUE16-NEXT: global_load_u16 v1, v1, s[4:5]
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_cmp_le_i16_e32 vcc_lo, v0.l, v0.h
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
-; GFX11-TRUE16-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-TRUE16-NEXT: v_cmp_le_i16_e32 vcc_lo, v2.l, v1.l
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo
+; GFX11-TRUE16-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-TRUE16-NEXT: s_endpgm
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -1039,17 +1039,17 @@ define amdgpu_kernel void @i16_eq_v_s(ptr addrspace(1) %out, ptr addrspace(1) %a
; GFX11-TRUE16-LABEL: i16_eq_v_s:
; GFX11-TRUE16: ; %bb.0: ; %entry
; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0x3ff, v0
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-TRUE16-NEXT: s_load_b32 s4, s[4:5], 0x34
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 1, v1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 2, v1
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 1, v0
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v0, s[2:3]
+; GFX11-TRUE16-NEXT: global_load_u16 v1, v1, s[2:3]
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, s4, v0.l
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
-; GFX11-TRUE16-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, s4, v1.l
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo
+; GFX11-TRUE16-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-TRUE16-NEXT: s_endpgm
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -1123,17 +1123,17 @@ define amdgpu_kernel void @i16_ne_v_s(ptr addrspace(1) %out, ptr addrspace(1) %a
; GFX11-TRUE16-LABEL: i16_ne_v_s:
; GFX11-TRUE16: ; %bb.0: ; %entry
; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0x3ff, v0
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-TRUE16-NEXT: s_load_b32 s4, s[4:5], 0x34
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 1, v1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 2, v1
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 1, v0
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v0, s[2:3]
+; GFX11-TRUE16-NEXT: global_load_u16 v1, v1, s[2:3]
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_cmp_ne_u16_e32 vcc_lo, s4, v0.l
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
-; GFX11-TRUE16-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-TRUE16-NEXT: v_cmp_ne_u16_e32 vcc_lo, s4, v1.l
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo
+; GFX11-TRUE16-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-TRUE16-NEXT: s_endpgm
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -1207,17 +1207,17 @@ define amdgpu_kernel void @i16_ugt_v_s(ptr addrspace(1) %out, ptr addrspace(1) %
; GFX11-TRUE16-LABEL: i16_ugt_v_s:
; GFX11-TRUE16: ; %bb.0: ; %entry
; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0x3ff, v0
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-TRUE16-NEXT: s_load_b32 s4, s[4:5], 0x34
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 1, v1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 2, v1
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 1, v0
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v0, s[2:3]
+; GFX11-TRUE16-NEXT: global_load_u16 v1, v1, s[2:3]
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_cmp_lt_u16_e32 vcc_lo, s4, v0.l
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
-; GFX11-TRUE16-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-TRUE16-NEXT: v_cmp_lt_u16_e32 vcc_lo, s4, v1.l
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo
+; GFX11-TRUE16-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-TRUE16-NEXT: s_endpgm
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -1291,17 +1291,17 @@ define amdgpu_kernel void @i16_uge_v_s(ptr addrspace(1) %out, ptr addrspace(1) %
; GFX11-TRUE16-LABEL: i16_uge_v_s:
; GFX11-TRUE16: ; %bb.0: ; %entry
; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0x3ff, v0
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-TRUE16-NEXT: s_load_b32 s4, s[4:5], 0x34
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 1, v1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 2, v1
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 1, v0
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v0, s[2:3]
+; GFX11-TRUE16-NEXT: global_load_u16 v1, v1, s[2:3]
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_cmp_le_u16_e32 vcc_lo, s4, v0.l
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
-; GFX11-TRUE16-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-TRUE16-NEXT: v_cmp_le_u16_e32 vcc_lo, s4, v1.l
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo
+; GFX11-TRUE16-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-TRUE16-NEXT: s_endpgm
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -1375,17 +1375,17 @@ define amdgpu_kernel void @i16_ult_v_s(ptr addrspace(1) %out, ptr addrspace(1) %
; GFX11-TRUE16-LABEL: i16_ult_v_s:
; GFX11-TRUE16: ; %bb.0: ; %entry
; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0x3ff, v0
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-TRUE16-NEXT: s_load_b32 s4, s[4:5], 0x34
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 1, v1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 2, v1
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 1, v0
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v0, s[2:3]
+; GFX11-TRUE16-NEXT: global_load_u16 v1, v1, s[2:3]
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_cmp_gt_u16_e32 vcc_lo, s4, v0.l
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
-; GFX11-TRUE16-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-TRUE16-NEXT: v_cmp_gt_u16_e32 vcc_lo, s4, v1.l
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo
+; GFX11-TRUE16-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-TRUE16-NEXT: s_endpgm
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -1459,17 +1459,17 @@ define amdgpu_kernel void @i16_ule_v_s(ptr addrspace(1) %out, ptr addrspace(1) %
; GFX11-TRUE16-LABEL: i16_ule_v_s:
; GFX11-TRUE16: ; %bb.0: ; %entry
; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0x3ff, v0
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-TRUE16-NEXT: s_load_b32 s4, s[4:5], 0x34
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 1, v1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 2, v1
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 1, v0
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v0, s[2:3]
+; GFX11-TRUE16-NEXT: global_load_u16 v1, v1, s[2:3]
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_cmp_ge_u16_e32 vcc_lo, s4, v0.l
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
-; GFX11-TRUE16-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-TRUE16-NEXT: v_cmp_ge_u16_e32 vcc_lo, s4, v1.l
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo
+; GFX11-TRUE16-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-TRUE16-NEXT: s_endpgm
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -1543,17 +1543,17 @@ define amdgpu_kernel void @i16_sgt_v_s(ptr addrspace(1) %out, ptr addrspace(1) %
; GFX11-TRUE16-LABEL: i16_sgt_v_s:
; GFX11-TRUE16: ; %bb.0: ; %entry
; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0x3ff, v0
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-TRUE16-NEXT: s_load_b32 s4, s[4:5], 0x34
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 1, v1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 2, v1
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 1, v0
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v0, s[2:3]
+; GFX11-TRUE16-NEXT: global_load_u16 v1, v1, s[2:3]
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_cmp_lt_i16_e32 vcc_lo, s4, v0.l
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
-; GFX11-TRUE16-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-TRUE16-NEXT: v_cmp_lt_i16_e32 vcc_lo, s4, v1.l
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo
+; GFX11-TRUE16-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-TRUE16-NEXT: s_endpgm
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -1627,17 +1627,17 @@ define amdgpu_kernel void @i16_sge_v_s(ptr addrspace(1) %out, ptr addrspace(1) %
; GFX11-TRUE16-LABEL: i16_sge_v_s:
; GFX11-TRUE16: ; %bb.0: ; %entry
; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0x3ff, v0
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-TRUE16-NEXT: s_load_b32 s4, s[4:5], 0x34
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 1, v1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 2, v1
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 1, v0
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v0, s[2:3]
+; GFX11-TRUE16-NEXT: global_load_u16 v1, v1, s[2:3]
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_cmp_le_i16_e32 vcc_lo, s4, v0.l
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
-; GFX11-TRUE16-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-TRUE16-NEXT: v_cmp_le_i16_e32 vcc_lo, s4, v1.l
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo
+; GFX11-TRUE16-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-TRUE16-NEXT: s_endpgm
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -1711,17 +1711,17 @@ define amdgpu_kernel void @i16_slt_v_s(ptr addrspace(1) %out, ptr addrspace(1) %
; GFX11-TRUE16-LABEL: i16_slt_v_s:
; GFX11-TRUE16: ; %bb.0: ; %entry
; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0x3ff, v0
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-TRUE16-NEXT: s_load_b32 s4, s[4:5], 0x34
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 1, v1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 2, v1
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 1, v0
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v0, s[2:3]
+; GFX11-TRUE16-NEXT: global_load_u16 v1, v1, s[2:3]
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_cmp_gt_i16_e32 vcc_lo, s4, v0.l
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
-; GFX11-TRUE16-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-TRUE16-NEXT: v_cmp_gt_i16_e32 vcc_lo, s4, v1.l
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo
+; GFX11-TRUE16-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-TRUE16-NEXT: s_endpgm
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -1795,17 +1795,17 @@ define amdgpu_kernel void @i16_sle_v_s(ptr addrspace(1) %out, ptr addrspace(1) %
; GFX11-TRUE16-LABEL: i16_sle_v_s:
; GFX11-TRUE16: ; %bb.0: ; %entry
; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0x3ff, v0
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-TRUE16-NEXT: s_load_b32 s4, s[4:5], 0x34
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 1, v1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 2, v1
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 1, v0
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v0, s[2:3]
+; GFX11-TRUE16-NEXT: global_load_u16 v1, v1, s[2:3]
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_cmp_ge_i16_e32 vcc_lo, s4, v0.l
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
-; GFX11-TRUE16-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-TRUE16-NEXT: v_cmp_ge_i16_e32 vcc_lo, s4, v1.l
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo
+; GFX11-TRUE16-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-TRUE16-NEXT: s_endpgm
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
diff --git a/llvm/test/CodeGen/AMDGPU/idot4s.ll b/llvm/test/CodeGen/AMDGPU/idot4s.ll
index ab38bd21994ec..46e803245433a 100644
--- a/llvm/test/CodeGen/AMDGPU/idot4s.ll
+++ b/llvm/test/CodeGen/AMDGPU/idot4s.ll
@@ -1159,42 +1159,43 @@ define amdgpu_kernel void @idot4_acc16_vecMul(ptr addrspace(1) %src1,
; GFX11-DL-TRUE16-LABEL: idot4_acc16_vecMul:
; GFX11-DL-TRUE16: ; %bb.0: ; %entry
; GFX11-DL-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-DL-TRUE16-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v0, 0x3ff, v0
+; GFX11-DL-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-DL-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
-; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-DL-TRUE16-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-DL-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-DL-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DL-TRUE16-NEXT: s_clause 0x1
; GFX11-DL-TRUE16-NEXT: global_load_b32 v1, v0, s[0:1]
-; GFX11-DL-TRUE16-NEXT: global_load_b32 v2, v0, s[2:3]
-; GFX11-DL-TRUE16-NEXT: global_load_d16_b16 v0, v3, s[4:5]
+; GFX11-DL-TRUE16-NEXT: global_load_b32 v0, v0, s[2:3]
+; GFX11-DL-TRUE16-NEXT: global_load_u16 v3, v2, s[4:5]
; GFX11-DL-TRUE16-NEXT: s_waitcnt vmcnt(2)
; GFX11-DL-TRUE16-NEXT: v_bfe_i32 v6, v1, 0, 8
; GFX11-DL-TRUE16-NEXT: s_waitcnt vmcnt(1)
-; GFX11-DL-TRUE16-NEXT: v_bfe_i32 v5, v2, 0, 8
+; GFX11-DL-TRUE16-NEXT: v_bfe_i32 v5, v0, 0, 8
; GFX11-DL-TRUE16-NEXT: v_ashrrev_i16 v4.h, 8, v1.l
; GFX11-DL-TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h
-; GFX11-DL-TRUE16-NEXT: v_ashrrev_i16 v7.h, 8, v2.l
-; GFX11-DL-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h
+; GFX11-DL-TRUE16-NEXT: v_ashrrev_i16 v7.h, 8, v0.l
+; GFX11-DL-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
; GFX11-DL-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.l
; GFX11-DL-TRUE16-NEXT: v_mov_b16_e32 v4.l, v6.l
; GFX11-DL-TRUE16-NEXT: v_bfe_i32 v6, v1, 0, 8
; GFX11-DL-TRUE16-NEXT: v_ashrrev_i16 v1.h, 8, v1.h
-; GFX11-DL-TRUE16-NEXT: v_bfe_i32 v5, v2, 0, 8
-; GFX11-DL-TRUE16-NEXT: v_ashrrev_i16 v2.h, 8, v2.h
+; GFX11-DL-TRUE16-NEXT: v_bfe_i32 v5, v0, 0, 8
+; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
; GFX11-DL-TRUE16-NEXT: v_pk_mul_lo_u16 v4, v4, v7
+; GFX11-DL-TRUE16-NEXT: v_ashrrev_i16 v7.h, 8, v0.h
; GFX11-DL-TRUE16-NEXT: v_mov_b16_e32 v1.l, v6.l
-; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-DL-TRUE16-NEXT: v_mov_b16_e32 v2.l, v5.l
+; GFX11-DL-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.l
; GFX11-DL-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-DL-TRUE16-NEXT: v_add_nc_u16 v0.l, v4.l, v0.l
+; GFX11-DL-TRUE16-NEXT: v_add_nc_u16 v0.l, v4.l, v3.l
; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-DL-TRUE16-NEXT: v_pk_mul_lo_u16 v1, v1, v2
+; GFX11-DL-TRUE16-NEXT: v_pk_mul_lo_u16 v1, v1, v7
; GFX11-DL-TRUE16-NEXT: v_add_nc_u16 v0.l, v0.l, v4.h
; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-DL-TRUE16-NEXT: v_add_nc_u16 v0.l, v0.l, v1.l
; GFX11-DL-TRUE16-NEXT: v_add_nc_u16 v0.l, v0.l, v1.h
-; GFX11-DL-TRUE16-NEXT: global_store_b16 v3, v0, s[4:5]
+; GFX11-DL-TRUE16-NEXT: global_store_b16 v2, v0, s[4:5]
; GFX11-DL-TRUE16-NEXT: s_endpgm
;
; GFX11-DL-FAKE16-LABEL: idot4_acc16_vecMul:
diff --git a/llvm/test/CodeGen/AMDGPU/idot4u.ll b/llvm/test/CodeGen/AMDGPU/idot4u.ll
index 305461ed6b208..22060a2d63749 100644
--- a/llvm/test/CodeGen/AMDGPU/idot4u.ll
+++ b/llvm/test/CodeGen/AMDGPU/idot4u.ll
@@ -1669,37 +1669,35 @@ define amdgpu_kernel void @notdot4_mixedtypes(ptr addrspace(1) %src1,
; GFX11-DL-TRUE16-LABEL: notdot4_mixedtypes:
; GFX11-DL-TRUE16: ; %bb.0: ; %entry
; GFX11-DL-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-DL-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-DL-TRUE16-NEXT: v_dual_mov_b32 v5, 0 :: v_dual_and_b32 v0, 0x3ff, v0
; GFX11-DL-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
-; GFX11-DL-TRUE16-NEXT: v_mov_b32_e32 v6, 0
-; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-DL-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-DL-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DL-TRUE16-NEXT: s_clause 0x1
-; GFX11-DL-TRUE16-NEXT: global_load_b32 v4, v0, s[0:1]
-; GFX11-DL-TRUE16-NEXT: global_load_b32 v5, v0, s[2:3]
-; GFX11-DL-TRUE16-NEXT: global_load_d16_b16 v0, v6, s[4:5]
+; GFX11-DL-TRUE16-NEXT: global_load_b32 v3, v0, s[0:1]
+; GFX11-DL-TRUE16-NEXT: global_load_b32 v4, v0, s[2:3]
+; GFX11-DL-TRUE16-NEXT: global_load_u16 v6, v5, s[4:5]
; GFX11-DL-TRUE16-NEXT: s_waitcnt vmcnt(2)
-; GFX11-DL-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v4
+; GFX11-DL-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 8, v3
; GFX11-DL-TRUE16-NEXT: s_waitcnt vmcnt(1)
-; GFX11-DL-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 8, v5
-; GFX11-DL-TRUE16-NEXT: v_bfe_i32 v3, v4, 0, 8
-; GFX11-DL-TRUE16-NEXT: v_bfe_i32 v7, v5, 0, 8
+; GFX11-DL-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v4
+; GFX11-DL-TRUE16-NEXT: v_bfe_i32 v2, v3, 0, 8
+; GFX11-DL-TRUE16-NEXT: v_bfe_i32 v7, v4, 0, 8
+; GFX11-DL-TRUE16-NEXT: v_perm_b32 v4, v4, v4, 0xc0c0302
+; GFX11-DL-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
; GFX11-DL-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v1.l
-; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-DL-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v2.l
-; GFX11-DL-TRUE16-NEXT: v_mov_b16_e32 v2.l, v3.l
-; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-DL-TRUE16-NEXT: v_mov_b16_e32 v3.l, v7.l
+; GFX11-DL-TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.l
+; GFX11-DL-TRUE16-NEXT: v_mov_b16_e32 v2.l, v7.l
; GFX11-DL-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-DL-TRUE16-NEXT: v_mad_u16 v0.l, v0.h, v1.l, v0.l
+; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-DL-TRUE16-NEXT: v_mad_u16 v0.l, v0.l, v0.h, v6.l
; GFX11-DL-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0
-; GFX11-DL-TRUE16-NEXT: v_perm_b32 v1, v5, v5, 0xc0c0302
-; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-DL-TRUE16-NEXT: v_mad_u16 v0.l, v2.l, v3.l, v0.l
-; GFX11-DL-TRUE16-NEXT: v_perm_b32 v2, v4, v4, 0xc0c0302
-; GFX11-DL-TRUE16-NEXT: v_dot4_u32_u8 v0, v2, v1, v0
-; GFX11-DL-TRUE16-NEXT: global_store_b16 v6, v0, s[4:5]
+; GFX11-DL-TRUE16-NEXT: v_mad_u16 v0.l, v1.l, v2.l, v0.l
+; GFX11-DL-TRUE16-NEXT: v_perm_b32 v1, v3, v3, 0xc0c0302
+; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-DL-TRUE16-NEXT: v_dot4_u32_u8 v0, v1, v4, v0
+; GFX11-DL-TRUE16-NEXT: global_store_b16 v5, v0, s[4:5]
; GFX11-DL-TRUE16-NEXT: s_endpgm
;
; GFX11-DL-FAKE16-LABEL: notdot4_mixedtypes:
@@ -1969,32 +1967,34 @@ define amdgpu_kernel void @notdot4_mixedtypes2(ptr addrspace(1) %src1,
; GFX11-DL-TRUE16-NEXT: s_clause 0x1
; GFX11-DL-TRUE16-NEXT: global_load_b32 v3, v0, s[2:3]
; GFX11-DL-TRUE16-NEXT: global_load_b32 v4, v0, s[0:1]
-; GFX11-DL-TRUE16-NEXT: global_load_d16_b16 v0, v5, s[4:5]
+; GFX11-DL-TRUE16-NEXT: global_load_u16 v6, v5, s[4:5]
; GFX11-DL-TRUE16-NEXT: s_waitcnt vmcnt(2)
-; GFX11-DL-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v3
+; GFX11-DL-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 8, v3
; GFX11-DL-TRUE16-NEXT: s_waitcnt vmcnt(1)
-; GFX11-DL-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 8, v4
-; GFX11-DL-TRUE16-NEXT: v_bfe_i32 v6, v4, 0, 8
-; GFX11-DL-TRUE16-NEXT: v_mov_b16_e32 v7.l, v4.h
-; GFX11-DL-TRUE16-NEXT: v_bfe_i32 v1, v1, 0, 8
-; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-DL-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v2.l
-; GFX11-DL-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v3.l
-; GFX11-DL-TRUE16-NEXT: v_bfe_i32 v7, v7, 0, 8
-; GFX11-DL-TRUE16-NEXT: v_mov_b16_e32 v2.l, v6.l
-; GFX11-DL-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 24, v3
+; GFX11-DL-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v4
+; GFX11-DL-TRUE16-NEXT: v_bfe_i32 v7, v4, 0, 8
+; GFX11-DL-TRUE16-NEXT: v_mov_b16_e32 v8.l, v4.h
+; GFX11-DL-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 24, v4
+; GFX11-DL-TRUE16-NEXT: v_bfe_i32 v2, v0, 0, 8
+; GFX11-DL-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v1.l
+; GFX11-DL-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v3.l
+; GFX11-DL-TRUE16-NEXT: v_bfe_i32 v8, v8, 0, 8
+; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-DL-TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.l
+; GFX11-DL-TRUE16-NEXT: v_mov_b16_e32 v2.l, v7.l
; GFX11-DL-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-DL-TRUE16-NEXT: v_mad_u16 v0.l, v0.h, v1.l, v0.l
-; GFX11-DL-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v3.h
-; GFX11-DL-TRUE16-NEXT: v_mov_b16_e32 v1.l, v7.l
-; GFX11-DL-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 24, v4
-; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-DL-TRUE16-NEXT: v_mad_u16 v0.l, v2.l, v1.h, v0.l
+; GFX11-DL-TRUE16-NEXT: v_mad_u16 v0.l, v0.l, v1.l, v6.l
+; GFX11-DL-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 24, v3
+; GFX11-DL-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v3.h
+; GFX11-DL-TRUE16-NEXT: v_mov_b16_e32 v3.l, v8.l
+; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-DL-TRUE16-NEXT: v_mad_u16 v0.l, v2.l, v0.h, v0.l
; GFX11-DL-TRUE16-NEXT: v_bfe_i32 v2, v6, 0, 8
-; GFX11-DL-TRUE16-NEXT: v_mad_u16 v0.l, v1.l, v0.h, v0.l
-; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-DL-TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.l
+; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-DL-TRUE16-NEXT: v_mad_u16 v0.l, v3.l, v1.l, v0.l
+; GFX11-DL-TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.l
+; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-DL-TRUE16-NEXT: v_mad_u16 v0.l, v4.l, v1.l, v0.l
; GFX11-DL-TRUE16-NEXT: global_store_b16 v5, v0, s[4:5]
; GFX11-DL-TRUE16-NEXT: s_endpgm
;
@@ -2438,38 +2438,39 @@ define amdgpu_kernel void @udot4_acc16_vecMul(ptr addrspace(1) %src1,
; GFX11-DL-TRUE16-LABEL: udot4_acc16_vecMul:
; GFX11-DL-TRUE16: ; %bb.0: ; %entry
; GFX11-DL-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-DL-TRUE16-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v0, 0x3ff, v0
+; GFX11-DL-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-DL-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
-; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-DL-TRUE16-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-DL-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-DL-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DL-TRUE16-NEXT: s_clause 0x1
; GFX11-DL-TRUE16-NEXT: global_load_b32 v1, v0, s[0:1]
-; GFX11-DL-TRUE16-NEXT: global_load_b32 v2, v0, s[2:3]
-; GFX11-DL-TRUE16-NEXT: global_load_d16_b16 v0, v3, s[4:5]
+; GFX11-DL-TRUE16-NEXT: global_load_b32 v0, v0, s[2:3]
+; GFX11-DL-TRUE16-NEXT: global_load_u16 v3, v2, s[4:5]
; GFX11-DL-TRUE16-NEXT: s_waitcnt vmcnt(2)
; GFX11-DL-TRUE16-NEXT: v_lshrrev_b16 v4.h, 8, v1.l
; GFX11-DL-TRUE16-NEXT: s_waitcnt vmcnt(1)
-; GFX11-DL-TRUE16-NEXT: v_lshrrev_b16 v5.h, 8, v2.l
+; GFX11-DL-TRUE16-NEXT: v_lshrrev_b16 v5.h, 8, v0.l
; GFX11-DL-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v1.l
-; GFX11-DL-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v2.l
-; GFX11-DL-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 24, v2
+; GFX11-DL-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v0.l
+; GFX11-DL-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 24, v0
; GFX11-DL-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 24, v1
; GFX11-DL-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.h
-; GFX11-DL-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.h
+; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GFX11-DL-TRUE16-NEXT: v_pk_mul_lo_u16 v4, v4, v5
-; GFX11-DL-TRUE16-NEXT: v_mov_b16_e32 v2.h, v6.l
+; GFX11-DL-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v0.h
+; GFX11-DL-TRUE16-NEXT: v_mov_b16_e32 v5.h, v6.l
; GFX11-DL-TRUE16-NEXT: v_mov_b16_e32 v1.h, v7.l
; GFX11-DL-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-DL-TRUE16-NEXT: v_add_nc_u16 v0.l, v4.l, v0.l
-; GFX11-DL-TRUE16-NEXT: v_pk_mul_lo_u16 v1, v1, v2
-; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-DL-TRUE16-NEXT: v_add_nc_u16 v0.l, v4.l, v3.l
+; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-DL-TRUE16-NEXT: v_pk_mul_lo_u16 v1, v1, v5
; GFX11-DL-TRUE16-NEXT: v_add_nc_u16 v0.l, v0.l, v4.h
+; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-DL-TRUE16-NEXT: v_add_nc_u16 v0.l, v0.l, v1.l
-; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-DL-TRUE16-NEXT: v_add_nc_u16 v0.l, v0.l, v1.h
-; GFX11-DL-TRUE16-NEXT: global_store_b16 v3, v0, s[4:5]
+; GFX11-DL-TRUE16-NEXT: global_store_b16 v2, v0, s[4:5]
; GFX11-DL-TRUE16-NEXT: s_endpgm
;
; GFX11-DL-FAKE16-LABEL: udot4_acc16_vecMul:
@@ -2713,44 +2714,46 @@ define amdgpu_kernel void @udot4_acc8_vecMul(ptr addrspace(1) %src1,
; GFX11-DL-TRUE16-LABEL: udot4_acc8_vecMul:
; GFX11-DL-TRUE16: ; %bb.0: ; %entry
; GFX11-DL-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-DL-TRUE16-NEXT: v_dual_mov_b32 v5, 0 :: v_dual_and_b32 v0, 0x3ff, v0
+; GFX11-DL-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-DL-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
-; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-DL-TRUE16-NEXT: v_mov_b32_e32 v4, 0
+; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-DL-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-DL-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DL-TRUE16-NEXT: s_clause 0x1
-; GFX11-DL-TRUE16-NEXT: global_load_b32 v3, v0, s[0:1]
-; GFX11-DL-TRUE16-NEXT: global_load_b32 v4, v0, s[2:3]
-; GFX11-DL-TRUE16-NEXT: global_load_d16_u8 v0, v5, s[4:5]
+; GFX11-DL-TRUE16-NEXT: global_load_b32 v2, v0, s[0:1]
+; GFX11-DL-TRUE16-NEXT: global_load_b32 v3, v0, s[2:3]
+; GFX11-DL-TRUE16-NEXT: global_load_u8 v5, v4, s[4:5]
; GFX11-DL-TRUE16-NEXT: s_waitcnt vmcnt(2)
-; GFX11-DL-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 24, v3
+; GFX11-DL-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 24, v2
; GFX11-DL-TRUE16-NEXT: s_waitcnt vmcnt(1)
-; GFX11-DL-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 24, v4
-; GFX11-DL-TRUE16-NEXT: v_lshrrev_b16 v0.h, 8, v3.l
-; GFX11-DL-TRUE16-NEXT: v_mul_lo_u16 v1.l, v3.h, v4.h
-; GFX11-DL-TRUE16-NEXT: v_lshrrev_b16 v1.h, 8, v4.l
-; GFX11-DL-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-DL-TRUE16-NEXT: v_mad_u16 v0.l, v3.l, v4.l, v0.l
-; GFX11-DL-TRUE16-NEXT: v_mul_lo_u16 v2.l, v2.l, v6.l
+; GFX11-DL-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 24, v3
+; GFX11-DL-TRUE16-NEXT: v_lshrrev_b16 v0.l, 8, v2.l
+; GFX11-DL-TRUE16-NEXT: v_mul_lo_u16 v0.h, v2.h, v3.h
+; GFX11-DL-TRUE16-NEXT: v_lshrrev_b16 v1.l, 8, v3.l
+; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-DL-TRUE16-NEXT: v_mul_lo_u16 v1.h, v6.l, v7.l
; GFX11-DL-TRUE16-NEXT: v_mov_b16_e32 v6.l, 0
-; GFX11-DL-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
-; GFX11-DL-TRUE16-NEXT: v_mul_lo_u16 v0.h, v0.h, v1.h
+; GFX11-DL-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-DL-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v2.l
+; GFX11-DL-TRUE16-NEXT: v_mul_lo_u16 v0.l, v0.l, v1.l
+; GFX11-DL-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v1.h
+; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-DL-TRUE16-NEXT: v_mov_b16_e32 v7.h, v6.l
-; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-DL-TRUE16-NEXT: v_lshlrev_b16 v7.l, 8, v0.h
-; GFX11-DL-TRUE16-NEXT: v_or_b16 v6.h, v1.l, v2.l
-; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-DL-TRUE16-NEXT: v_or_b32_e32 v1, v7, v6
+; GFX11-DL-TRUE16-NEXT: v_lshlrev_b16 v7.l, 8, v0.l
+; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-DL-TRUE16-NEXT: v_or_b16 v6.h, v0.h, v1.l
; GFX11-DL-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v1
-; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-DL-TRUE16-NEXT: v_add_nc_u16 v0.l, v0.l, v1.l
-; GFX11-DL-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v2
-; GFX11-DL-TRUE16-NEXT: v_mad_u16 v0.l, v3.h, v4.h, v0.l
-; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-DL-TRUE16-NEXT: v_or_b32_e32 v0, v7, v6
+; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-DL-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 8, v0
+; GFX11-DL-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-DL-TRUE16-NEXT: v_mad_u16 v0.l, v2.l, v3.l, v5.l
+; GFX11-DL-TRUE16-NEXT: v_add_nc_u16 v0.l, v0.l, v6.l
+; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-DL-TRUE16-NEXT: v_mad_u16 v0.l, v2.h, v3.h, v0.l
; GFX11-DL-TRUE16-NEXT: v_add_nc_u16 v0.l, v0.l, v1.l
-; GFX11-DL-TRUE16-NEXT: global_store_b8 v5, v0, s[4:5]
+; GFX11-DL-TRUE16-NEXT: global_store_b8 v4, v0, s[4:5]
; GFX11-DL-TRUE16-NEXT: s_endpgm
;
; GFX11-DL-FAKE16-LABEL: udot4_acc8_vecMul:
diff --git a/llvm/test/CodeGen/AMDGPU/isel-amdgpu-cs-chain-preserve-cc.ll b/llvm/test/CodeGen/AMDGPU/isel-amdgpu-cs-chain-preserve-cc.ll
index fdcb033f4f4d6..3880acf2970ac 100644
--- a/llvm/test/CodeGen/AMDGPU/isel-amdgpu-cs-chain-preserve-cc.ll
+++ b/llvm/test/CodeGen/AMDGPU/isel-amdgpu-cs-chain-preserve-cc.ll
@@ -752,8 +752,9 @@ define amdgpu_cs_chain_preserve void @amdgpu_cs_chain_preserve_cc_half(half inre
; GISEL-GFX11-TRUE16-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
; GISEL-GFX11-TRUE16-NEXT: [[COPY3:%[0-9]+]]:vgpr_16 = COPY [[COPY]]
; GISEL-GFX11-TRUE16-NEXT: [[V_ADD_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_ADD_F16_t16_e64 0, [[COPY3]], 0, [[COPY2]], 0, 0, 0, implicit $mode, implicit $exec
- ; GISEL-GFX11-TRUE16-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY [[DEF]]
- ; GISEL-GFX11-TRUE16-NEXT: FLAT_STORE_SHORT_t16 [[COPY4]], [[V_ADD_F16_t16_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s16) into `ptr poison`)
+ ; GISEL-GFX11-TRUE16-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_F16_t16_e64_]]
+ ; GISEL-GFX11-TRUE16-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[DEF]]
+ ; GISEL-GFX11-TRUE16-NEXT: FLAT_STORE_SHORT [[COPY5]], [[COPY4]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s16) into `ptr poison`)
; GISEL-GFX11-TRUE16-NEXT: S_ENDPGM 0
;
; GISEL-GFX11-FAKE16-LABEL: name: amdgpu_cs_chain_preserve_cc_half
@@ -792,7 +793,8 @@ define amdgpu_cs_chain_preserve void @amdgpu_cs_chain_preserve_cc_half(half inre
; DAGISEL-GFX11-WF32-TRUE16-NEXT: [[V_ADD_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_ADD_F16_t16_e64 0, [[COPY1]], 0, [[COPY2]], 0, 0, 0, implicit $mode, implicit $exec
; DAGISEL-GFX11-WF32-TRUE16-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
; DAGISEL-GFX11-WF32-TRUE16-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[DEF]]
- ; DAGISEL-GFX11-WF32-TRUE16-NEXT: FLAT_STORE_SHORT_t16 killed [[COPY3]], killed [[V_ADD_F16_t16_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s16) into `ptr poison`)
+ ; DAGISEL-GFX11-WF32-TRUE16-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_F16_t16_e64_]]
+ ; DAGISEL-GFX11-WF32-TRUE16-NEXT: FLAT_STORE_SHORT killed [[COPY3]], killed [[COPY4]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s16) into `ptr poison`)
; DAGISEL-GFX11-WF32-TRUE16-NEXT: S_ENDPGM 0
;
; DAGISEL-GFX11-WF32-FAKE16-LABEL: name: amdgpu_cs_chain_preserve_cc_half
@@ -817,7 +819,8 @@ define amdgpu_cs_chain_preserve void @amdgpu_cs_chain_preserve_cc_half(half inre
; DAGISEL-GFX11-WF64-TRUE16-NEXT: [[V_ADD_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_ADD_F16_t16_e64 0, [[COPY1]], 0, [[COPY2]], 0, 0, 0, implicit $mode, implicit $exec
; DAGISEL-GFX11-WF64-TRUE16-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
; DAGISEL-GFX11-WF64-TRUE16-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[DEF]]
- ; DAGISEL-GFX11-WF64-TRUE16-NEXT: FLAT_STORE_SHORT_t16 killed [[COPY3]], killed [[V_ADD_F16_t16_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s16) into `ptr poison`)
+ ; DAGISEL-GFX11-WF64-TRUE16-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_F16_t16_e64_]]
+ ; DAGISEL-GFX11-WF64-TRUE16-NEXT: FLAT_STORE_SHORT killed [[COPY3]], killed [[COPY4]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s16) into `ptr poison`)
; DAGISEL-GFX11-WF64-TRUE16-NEXT: S_ENDPGM 0
;
; DAGISEL-GFX11-WF64-FAKE16-LABEL: name: amdgpu_cs_chain_preserve_cc_half
@@ -1004,8 +1007,9 @@ define amdgpu_cs_chain_preserve void @amdgpu_cs_chain_preserve_cc_i16(i16 inreg
; GISEL-GFX11-TRUE16-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
; GISEL-GFX11-TRUE16-NEXT: [[COPY3:%[0-9]+]]:vgpr_16 = COPY [[COPY]]
; GISEL-GFX11-TRUE16-NEXT: [[V_ADD_NC_U16_t16_e64_:%[0-9]+]]:vgpr_16 = V_ADD_NC_U16_t16_e64 0, [[COPY3]], 0, [[COPY2]], 0, 0, implicit $exec
- ; GISEL-GFX11-TRUE16-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY [[DEF]]
- ; GISEL-GFX11-TRUE16-NEXT: FLAT_STORE_SHORT_t16 [[COPY4]], [[V_ADD_NC_U16_t16_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s16) into `ptr poison`)
+ ; GISEL-GFX11-TRUE16-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_NC_U16_t16_e64_]]
+ ; GISEL-GFX11-TRUE16-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[DEF]]
+ ; GISEL-GFX11-TRUE16-NEXT: FLAT_STORE_SHORT [[COPY5]], [[COPY4]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s16) into `ptr poison`)
; GISEL-GFX11-TRUE16-NEXT: S_ENDPGM 0
;
; GISEL-GFX11-FAKE16-LABEL: name: amdgpu_cs_chain_preserve_cc_i16
@@ -1044,7 +1048,8 @@ define amdgpu_cs_chain_preserve void @amdgpu_cs_chain_preserve_cc_i16(i16 inreg
; DAGISEL-GFX11-WF32-TRUE16-NEXT: [[V_ADD_NC_U16_t16_e64_:%[0-9]+]]:vgpr_16 = V_ADD_NC_U16_t16_e64 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $exec
; DAGISEL-GFX11-WF32-TRUE16-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
; DAGISEL-GFX11-WF32-TRUE16-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[DEF]]
- ; DAGISEL-GFX11-WF32-TRUE16-NEXT: FLAT_STORE_SHORT_t16 killed [[COPY3]], killed [[V_ADD_NC_U16_t16_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s16) into `ptr poison`)
+ ; DAGISEL-GFX11-WF32-TRUE16-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_NC_U16_t16_e64_]]
+ ; DAGISEL-GFX11-WF32-TRUE16-NEXT: FLAT_STORE_SHORT killed [[COPY3]], killed [[COPY4]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s16) into `ptr poison`)
; DAGISEL-GFX11-WF32-TRUE16-NEXT: S_ENDPGM 0
;
; DAGISEL-GFX11-WF32-FAKE16-LABEL: name: amdgpu_cs_chain_preserve_cc_i16
@@ -1069,7 +1074,8 @@ define amdgpu_cs_chain_preserve void @amdgpu_cs_chain_preserve_cc_i16(i16 inreg
; DAGISEL-GFX11-WF64-TRUE16-NEXT: [[V_ADD_NC_U16_t16_e64_:%[0-9]+]]:vgpr_16 = V_ADD_NC_U16_t16_e64 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $exec
; DAGISEL-GFX11-WF64-TRUE16-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
; DAGISEL-GFX11-WF64-TRUE16-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[DEF]]
- ; DAGISEL-GFX11-WF64-TRUE16-NEXT: FLAT_STORE_SHORT_t16 killed [[COPY3]], killed [[V_ADD_NC_U16_t16_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s16) into `ptr poison`)
+ ; DAGISEL-GFX11-WF64-TRUE16-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_NC_U16_t16_e64_]]
+ ; DAGISEL-GFX11-WF64-TRUE16-NEXT: FLAT_STORE_SHORT killed [[COPY3]], killed [[COPY4]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s16) into `ptr poison`)
; DAGISEL-GFX11-WF64-TRUE16-NEXT: S_ENDPGM 0
;
; DAGISEL-GFX11-WF64-FAKE16-LABEL: name: amdgpu_cs_chain_preserve_cc_i16
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.bf16.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.bf16.bf16.ll
index 4419b8c6f9862..57db2c94ce908 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.bf16.bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.bf16.bf16.ll
@@ -13,7 +13,7 @@ define amdgpu_kernel void @test_llvm_amdgcn_fdot2_bf16_bf16(
; SDAG-GFX11-TRUE16-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
; SDAG-GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0
; SDAG-GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[6:7]
+; SDAG-GFX11-TRUE16-NEXT: global_load_u16 v0, v1, s[6:7]
; SDAG-GFX11-TRUE16-NEXT: s_load_b32 s2, s[2:3], 0x0
; SDAG-GFX11-TRUE16-NEXT: s_load_b32 s3, s[4:5], 0x0
; SDAG-GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -51,14 +51,14 @@ define amdgpu_kernel void @test_llvm_amdgcn_fdot2_bf16_bf16_dpp(
; SDAG-GFX11-TRUE16: ; %bb.0: ; %entry
; SDAG-GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; SDAG-GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-TRUE16-NEXT: scratch_load_b32 v1, off, s1
-; SDAG-GFX11-TRUE16-NEXT: scratch_load_b32 v2, off, s2
-; SDAG-GFX11-TRUE16-NEXT: scratch_load_d16_b16 v0, off, s3
+; SDAG-GFX11-TRUE16-NEXT: scratch_load_b32 v0, off, s1
+; SDAG-GFX11-TRUE16-NEXT: scratch_load_b32 v1, off, s2
+; SDAG-GFX11-TRUE16-NEXT: scratch_load_u16 v2, off, s3
; SDAG-GFX11-TRUE16-NEXT: s_waitcnt vmcnt(2)
-; SDAG-GFX11-TRUE16-NEXT: v_mov_b32_dpp v1, v1 quad_perm:[1,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1
+; SDAG-GFX11-TRUE16-NEXT: v_mov_b32_dpp v0, v0 quad_perm:[1,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1
; SDAG-GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; SDAG-GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX11-TRUE16-NEXT: v_dot2_bf16_bf16 v0.l, v1, v2, v0.l
+; SDAG-GFX11-TRUE16-NEXT: v_dot2_bf16_bf16 v0.l, v0, v1, v2.l
; SDAG-GFX11-TRUE16-NEXT: scratch_store_b16 off, v0, s0
; SDAG-GFX11-TRUE16-NEXT: s_endpgm
;
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f16.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f16.f16.ll
index 0194d25a99cdc..a62849feb9edc 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f16.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f16.f16.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck %s --check-prefixes=GFX11,SDAG-GFX11,SDAG-GFX11-TRUE16
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck %s --check-prefixes=GFX11,SDAG-GFX11,SDAG-GFX11-FAKE16
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck %s --check-prefixes=GFX11,GISEL-GFX11,GISEL-GFX11-TRUE16
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16,+real-true-d16 < %s | FileCheck %s --check-prefixes=GFX11,GISEL-GFX11,GISEL-GFX11-TRUE16
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck %s --check-prefixes=GFX11,GISEL-GFX11,GISEL-GFX11-FAKE16
declare half @llvm.amdgcn.fdot2.f16.f16(<2 x half> %a, <2 x half> %b, half %c)
@@ -12,7 +12,7 @@ define amdgpu_kernel void @test_llvm_amdgcn_fdot2_f16_f16(
; SDAG-GFX11-TRUE16-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
; SDAG-GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0
; SDAG-GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[6:7]
+; SDAG-GFX11-TRUE16-NEXT: global_load_u16 v0, v1, s[6:7]
; SDAG-GFX11-TRUE16-NEXT: s_load_b32 s2, s[2:3], 0x0
; SDAG-GFX11-TRUE16-NEXT: s_load_b32 s3, s[4:5], 0x0
; SDAG-GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -76,14 +76,14 @@ define amdgpu_kernel void @test_llvm_amdgcn_fdot2_f16_f16_dpp(
; SDAG-GFX11-TRUE16: ; %bb.0: ; %entry
; SDAG-GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; SDAG-GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-TRUE16-NEXT: scratch_load_b32 v1, off, s1
-; SDAG-GFX11-TRUE16-NEXT: scratch_load_b32 v2, off, s2
-; SDAG-GFX11-TRUE16-NEXT: scratch_load_d16_b16 v0, off, s3
+; SDAG-GFX11-TRUE16-NEXT: scratch_load_b32 v0, off, s1
+; SDAG-GFX11-TRUE16-NEXT: scratch_load_b32 v1, off, s2
+; SDAG-GFX11-TRUE16-NEXT: scratch_load_u16 v2, off, s3
; SDAG-GFX11-TRUE16-NEXT: s_waitcnt vmcnt(2)
-; SDAG-GFX11-TRUE16-NEXT: v_mov_b32_dpp v1, v1 quad_perm:[1,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1
+; SDAG-GFX11-TRUE16-NEXT: v_mov_b32_dpp v0, v0 quad_perm:[1,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1
; SDAG-GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; SDAG-GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX11-TRUE16-NEXT: v_dot2_f16_f16 v0.l, v1, v2, v0.l
+; SDAG-GFX11-TRUE16-NEXT: v_dot2_f16_f16 v0.l, v0, v1, v2.l
; SDAG-GFX11-TRUE16-NEXT: scratch_store_b16 off, v0, s0
; SDAG-GFX11-TRUE16-NEXT: s_endpgm
;
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll
index 7d63e22d84b72..add30e944ed48 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll
@@ -76,7 +76,7 @@ define amdgpu_kernel void @cos_f16(ptr addrspace(1) %r, ptr addrspace(1) %a) {
; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[2:3]
+; GFX11-TRUE16-NEXT: global_load_u16 v0, v1, s[2:3]
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_mul_f16_e32 v0.l, 0.15915494, v0.l
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -102,7 +102,7 @@ define amdgpu_kernel void @cos_f16(ptr addrspace(1) %r, ptr addrspace(1) %a) {
; GFX12-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-TRUE16-NEXT: v_mov_b32_e32 v1, 0
; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[2:3]
+; GFX12-TRUE16-NEXT: global_load_u16 v0, v1, s[2:3]
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
; GFX12-TRUE16-NEXT: v_mul_f16_e32 v0.l, 0.15915494, v0.l
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll
index ba03115c51536..4e60b70f17abe 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll
@@ -76,7 +76,7 @@ define amdgpu_kernel void @sin_f16(ptr addrspace(1) %r, ptr addrspace(1) %a) {
; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[2:3]
+; GFX11-TRUE16-NEXT: global_load_u16 v0, v1, s[2:3]
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_mul_f16_e32 v0.l, 0.15915494, v0.l
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -102,7 +102,7 @@ define amdgpu_kernel void @sin_f16(ptr addrspace(1) %r, ptr addrspace(1) %a) {
; GFX12-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-TRUE16-NEXT: v_mov_b32_e32 v1, 0
; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[2:3]
+; GFX12-TRUE16-NEXT: global_load_u16 v0, v1, s[2:3]
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
; GFX12-TRUE16-NEXT: v_mul_f16_e32 v0.l, 0.15915494, v0.l
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll
index a135b43bad0fe..3e431057c15bf 100644
--- a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll
@@ -84,7 +84,7 @@ define amdgpu_kernel void @constant_load_i16(ptr addrspace(1) %out, ptr addrspac
; GFX12-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-TRUE16-NEXT: v_mov_b32_e32 v1, 0
; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[2:3]
+; GFX12-TRUE16-NEXT: global_load_u16 v0, v1, s[2:3]
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
; GFX12-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
; GFX12-TRUE16-NEXT: s_endpgm
@@ -760,77 +760,41 @@ define amdgpu_kernel void @constant_load_v16i16_align2(ptr addrspace(4) %ptr0) #
; EG-NEXT: MOV * T2.X, literal.x,
; EG-NEXT: 0(0.000000e+00), 0(0.000000e+00)
;
-; GFX12-TRUE16-LABEL: constant_load_v16i16_align2:
-; GFX12-TRUE16: ; %bb.0: ; %entry
-; GFX12-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX12-TRUE16-NEXT: v_mov_b32_e32 v8, 0
-; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-TRUE16-NEXT: s_clause 0x7
-; GFX12-TRUE16-NEXT: global_load_d16_b16 v3, v8, s[0:1] offset:12
-; GFX12-TRUE16-NEXT: global_load_d16_b16 v7, v8, s[0:1] offset:28
-; GFX12-TRUE16-NEXT: global_load_d16_b16 v6, v8, s[0:1] offset:24
-; GFX12-TRUE16-NEXT: global_load_d16_b16 v5, v8, s[0:1] offset:20
-; GFX12-TRUE16-NEXT: global_load_d16_b16 v4, v8, s[0:1] offset:16
-; GFX12-TRUE16-NEXT: global_load_d16_b16 v2, v8, s[0:1] offset:8
-; GFX12-TRUE16-NEXT: global_load_d16_b16 v1, v8, s[0:1] offset:4
-; GFX12-TRUE16-NEXT: global_load_d16_b16 v0, v8, s[0:1]
-; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x7
-; GFX12-TRUE16-NEXT: global_load_d16_hi_b16 v3, v8, s[0:1] offset:14
-; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x7
-; GFX12-TRUE16-NEXT: global_load_d16_hi_b16 v7, v8, s[0:1] offset:30
-; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x7
-; GFX12-TRUE16-NEXT: global_load_d16_hi_b16 v6, v8, s[0:1] offset:26
-; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x7
-; GFX12-TRUE16-NEXT: global_load_d16_hi_b16 v5, v8, s[0:1] offset:22
-; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x7
-; GFX12-TRUE16-NEXT: global_load_d16_hi_b16 v4, v8, s[0:1] offset:18
-; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x7
-; GFX12-TRUE16-NEXT: global_load_d16_hi_b16 v2, v8, s[0:1] offset:10
-; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x7
-; GFX12-TRUE16-NEXT: global_load_d16_hi_b16 v1, v8, s[0:1] offset:6
-; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x7
-; GFX12-TRUE16-NEXT: global_load_d16_hi_b16 v0, v8, s[0:1] offset:2
-; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-TRUE16-NEXT: s_clause 0x1
-; GFX12-TRUE16-NEXT: global_store_b128 v[0:1], v[4:7], off
-; GFX12-TRUE16-NEXT: global_store_b128 v[0:1], v[0:3], off
-; GFX12-TRUE16-NEXT: s_endpgm
-;
-; GFX12-FAKE16-LABEL: constant_load_v16i16_align2:
-; GFX12-FAKE16: ; %bb.0: ; %entry
-; GFX12-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX12-FAKE16-NEXT: v_mov_b32_e32 v8, 0
-; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-FAKE16-NEXT: s_clause 0x7
-; GFX12-FAKE16-NEXT: global_load_u16 v3, v8, s[0:1] offset:12
-; GFX12-FAKE16-NEXT: global_load_u16 v7, v8, s[0:1] offset:28
-; GFX12-FAKE16-NEXT: global_load_u16 v6, v8, s[0:1] offset:24
-; GFX12-FAKE16-NEXT: global_load_u16 v5, v8, s[0:1] offset:20
-; GFX12-FAKE16-NEXT: global_load_u16 v4, v8, s[0:1] offset:16
-; GFX12-FAKE16-NEXT: global_load_u16 v2, v8, s[0:1] offset:8
-; GFX12-FAKE16-NEXT: global_load_u16 v1, v8, s[0:1] offset:4
-; GFX12-FAKE16-NEXT: global_load_u16 v0, v8, s[0:1]
-; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x7
-; GFX12-FAKE16-NEXT: global_load_d16_hi_b16 v3, v8, s[0:1] offset:14
-; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x7
-; GFX12-FAKE16-NEXT: global_load_d16_hi_b16 v7, v8, s[0:1] offset:30
-; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x7
-; GFX12-FAKE16-NEXT: global_load_d16_hi_b16 v6, v8, s[0:1] offset:26
-; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x7
-; GFX12-FAKE16-NEXT: global_load_d16_hi_b16 v5, v8, s[0:1] offset:22
-; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x7
-; GFX12-FAKE16-NEXT: global_load_d16_hi_b16 v4, v8, s[0:1] offset:18
-; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x7
-; GFX12-FAKE16-NEXT: global_load_d16_hi_b16 v2, v8, s[0:1] offset:10
-; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x7
-; GFX12-FAKE16-NEXT: global_load_d16_hi_b16 v1, v8, s[0:1] offset:6
-; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x7
-; GFX12-FAKE16-NEXT: global_load_d16_hi_b16 v0, v8, s[0:1] offset:2
-; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-FAKE16-NEXT: s_clause 0x1
-; GFX12-FAKE16-NEXT: global_store_b128 v[0:1], v[4:7], off
-; GFX12-FAKE16-NEXT: global_store_b128 v[0:1], v[0:3], off
-; GFX12-FAKE16-NEXT: s_endpgm
+; GFX12-LABEL: constant_load_v16i16_align2:
+; GFX12: ; %bb.0: ; %entry
+; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX12-NEXT: v_mov_b32_e32 v8, 0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_clause 0x7
+; GFX12-NEXT: global_load_u16 v3, v8, s[0:1] offset:12
+; GFX12-NEXT: global_load_u16 v7, v8, s[0:1] offset:28
+; GFX12-NEXT: global_load_u16 v6, v8, s[0:1] offset:24
+; GFX12-NEXT: global_load_u16 v5, v8, s[0:1] offset:20
+; GFX12-NEXT: global_load_u16 v4, v8, s[0:1] offset:16
+; GFX12-NEXT: global_load_u16 v2, v8, s[0:1] offset:8
+; GFX12-NEXT: global_load_u16 v1, v8, s[0:1] offset:4
+; GFX12-NEXT: global_load_u16 v0, v8, s[0:1]
+; GFX12-NEXT: s_wait_loadcnt 0x7
+; GFX12-NEXT: global_load_d16_hi_b16 v3, v8, s[0:1] offset:14
+; GFX12-NEXT: s_wait_loadcnt 0x7
+; GFX12-NEXT: global_load_d16_hi_b16 v7, v8, s[0:1] offset:30
+; GFX12-NEXT: s_wait_loadcnt 0x7
+; GFX12-NEXT: global_load_d16_hi_b16 v6, v8, s[0:1] offset:26
+; GFX12-NEXT: s_wait_loadcnt 0x7
+; GFX12-NEXT: global_load_d16_hi_b16 v5, v8, s[0:1] offset:22
+; GFX12-NEXT: s_wait_loadcnt 0x7
+; GFX12-NEXT: global_load_d16_hi_b16 v4, v8, s[0:1] offset:18
+; GFX12-NEXT: s_wait_loadcnt 0x7
+; GFX12-NEXT: global_load_d16_hi_b16 v2, v8, s[0:1] offset:10
+; GFX12-NEXT: s_wait_loadcnt 0x7
+; GFX12-NEXT: global_load_d16_hi_b16 v1, v8, s[0:1] offset:6
+; GFX12-NEXT: s_wait_loadcnt 0x7
+; GFX12-NEXT: global_load_d16_hi_b16 v0, v8, s[0:1] offset:2
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[0:1], v[4:7], off
+; GFX12-NEXT: global_store_b128 v[0:1], v[0:3], off
+; GFX12-NEXT: s_endpgm
entry:
%ld = load <16 x i16>, ptr addrspace(4) %ptr0, align 2
store <16 x i16> %ld, ptr addrspace(1) poison, align 32
@@ -5510,27 +5474,16 @@ define amdgpu_kernel void @constant_zextload_i16_to_i64(ptr addrspace(1) %out, p
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
-; GFX12-TRUE16-LABEL: constant_zextload_i16_to_i64:
-; GFX12-TRUE16: ; %bb.0:
-; GFX12-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX12-TRUE16-NEXT: v_mov_b32_e32 v1, 0
-; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[2:3]
-; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX12-TRUE16-NEXT: global_store_b64 v1, v[0:1], s[0:1]
-; GFX12-TRUE16-NEXT: s_endpgm
-;
-; GFX12-FAKE16-LABEL: constant_zextload_i16_to_i64:
-; GFX12-FAKE16: ; %bb.0:
-; GFX12-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX12-FAKE16-NEXT: v_mov_b32_e32 v1, 0
-; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-FAKE16-NEXT: global_load_u16 v0, v1, s[2:3]
-; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX12-FAKE16-NEXT: global_store_b64 v1, v[0:1], s[0:1]
-; GFX12-FAKE16-NEXT: s_endpgm
+; GFX12-LABEL: constant_zextload_i16_to_i64:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-NEXT: v_mov_b32_e32 v1, 0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: global_load_u16 v0, v1, s[2:3]
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX12-NEXT: global_store_b64 v1, v[0:1], s[0:1]
+; GFX12-NEXT: s_endpgm
%a = load i16, ptr addrspace(4) %in
%ext = zext i16 %a to i64
store i64 %ext, ptr addrspace(1) %out
@@ -5612,31 +5565,18 @@ define amdgpu_kernel void @constant_sextload_i16_to_i64(ptr addrspace(1) %out, p
; EG-NEXT: ASHR * T0.Y, PV.X, literal.x,
; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
;
-; GFX12-TRUE16-LABEL: constant_sextload_i16_to_i64:
-; GFX12-TRUE16: ; %bb.0:
-; GFX12-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, 0
-; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-TRUE16-NEXT: global_load_d16_b16 v0, v2, s[2:3]
-; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-TRUE16-NEXT: v_bfe_i32 v0, v0, 0, 16
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_ashrrev_i32_e32 v1, 31, v0
-; GFX12-TRUE16-NEXT: global_store_b64 v2, v[0:1], s[0:1]
-; GFX12-TRUE16-NEXT: s_endpgm
-;
-; GFX12-FAKE16-LABEL: constant_sextload_i16_to_i64:
-; GFX12-FAKE16: ; %bb.0:
-; GFX12-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX12-FAKE16-NEXT: v_mov_b32_e32 v2, 0
-; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-FAKE16-NEXT: global_load_u16 v0, v2, s[2:3]
-; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-FAKE16-NEXT: v_bfe_i32 v0, v0, 0, 16
-; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-FAKE16-NEXT: v_ashrrev_i32_e32 v1, 31, v0
-; GFX12-FAKE16-NEXT: global_store_b64 v2, v[0:1], s[0:1]
-; GFX12-FAKE16-NEXT: s_endpgm
+; GFX12-LABEL: constant_sextload_i16_to_i64:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-NEXT: v_mov_b32_e32 v2, 0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: global_load_u16 v0, v2, s[2:3]
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_bfe_i32 v0, v0, 0, 16
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_ashrrev_i32_e32 v1, 31, v0
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX12-NEXT: s_endpgm
%a = load i16, ptr addrspace(4) %in
%ext = sext i16 %a to i64
store i64 %ext, ptr addrspace(1) %out
@@ -5711,27 +5651,16 @@ define amdgpu_kernel void @constant_zextload_v1i16_to_v1i64(ptr addrspace(1) %ou
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
-; GFX12-TRUE16-LABEL: constant_zextload_v1i16_to_v1i64:
-; GFX12-TRUE16: ; %bb.0:
-; GFX12-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX12-TRUE16-NEXT: v_mov_b32_e32 v1, 0
-; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[2:3]
-; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX12-TRUE16-NEXT: global_store_b64 v1, v[0:1], s[0:1]
-; GFX12-TRUE16-NEXT: s_endpgm
-;
-; GFX12-FAKE16-LABEL: constant_zextload_v1i16_to_v1i64:
-; GFX12-FAKE16: ; %bb.0:
-; GFX12-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX12-FAKE16-NEXT: v_mov_b32_e32 v1, 0
-; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-FAKE16-NEXT: global_load_u16 v0, v1, s[2:3]
-; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX12-FAKE16-NEXT: global_store_b64 v1, v[0:1], s[0:1]
-; GFX12-FAKE16-NEXT: s_endpgm
+; GFX12-LABEL: constant_zextload_v1i16_to_v1i64:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-NEXT: v_mov_b32_e32 v1, 0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: global_load_u16 v0, v1, s[2:3]
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX12-NEXT: global_store_b64 v1, v[0:1], s[0:1]
+; GFX12-NEXT: s_endpgm
%load = load <1 x i16>, ptr addrspace(4) %in
%ext = zext <1 x i16> %load to <1 x i64>
store <1 x i64> %ext, ptr addrspace(1) %out
@@ -5808,31 +5737,18 @@ define amdgpu_kernel void @constant_sextload_v1i16_to_v1i64(ptr addrspace(1) %ou
; EG-NEXT: ASHR * T0.Y, PV.X, literal.x,
; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
;
-; GFX12-TRUE16-LABEL: constant_sextload_v1i16_to_v1i64:
-; GFX12-TRUE16: ; %bb.0:
-; GFX12-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, 0
-; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-TRUE16-NEXT: global_load_d16_b16 v0, v2, s[2:3]
-; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-TRUE16-NEXT: v_bfe_i32 v0, v0, 0, 16
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_ashrrev_i32_e32 v1, 31, v0
-; GFX12-TRUE16-NEXT: global_store_b64 v2, v[0:1], s[0:1]
-; GFX12-TRUE16-NEXT: s_endpgm
-;
-; GFX12-FAKE16-LABEL: constant_sextload_v1i16_to_v1i64:
-; GFX12-FAKE16: ; %bb.0:
-; GFX12-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX12-FAKE16-NEXT: v_mov_b32_e32 v2, 0
-; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-FAKE16-NEXT: global_load_u16 v0, v2, s[2:3]
-; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-FAKE16-NEXT: v_bfe_i32 v0, v0, 0, 16
-; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-FAKE16-NEXT: v_ashrrev_i32_e32 v1, 31, v0
-; GFX12-FAKE16-NEXT: global_store_b64 v2, v[0:1], s[0:1]
-; GFX12-FAKE16-NEXT: s_endpgm
+; GFX12-LABEL: constant_sextload_v1i16_to_v1i64:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-NEXT: v_mov_b32_e32 v2, 0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: global_load_u16 v0, v2, s[2:3]
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_bfe_i32 v0, v0, 0, 16
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_ashrrev_i32_e32 v1, 31, v0
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX12-NEXT: s_endpgm
%load = load <1 x i16>, ptr addrspace(4) %in
%ext = sext <1 x i16> %load to <1 x i64>
store <1 x i64> %ext, ptr addrspace(1) %out
diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll
index b534c2c267fad..6f87decb099ab 100644
--- a/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll
@@ -85,7 +85,7 @@ define amdgpu_kernel void @constant_load_i8(ptr addrspace(1) %out, ptr addrspace
; GFX12-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-TRUE16-NEXT: v_mov_b32_e32 v1, 0
; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-TRUE16-NEXT: global_load_d16_u8 v0, v1, s[2:3]
+; GFX12-TRUE16-NEXT: global_load_u8 v0, v1, s[2:3]
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
; GFX12-TRUE16-NEXT: global_store_b8 v1, v0, s[0:1]
; GFX12-TRUE16-NEXT: s_endpgm
@@ -183,7 +183,7 @@ define amdgpu_kernel void @constant_load_v2i8(ptr addrspace(1) %out, ptr addrspa
; GFX12-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-TRUE16-NEXT: v_mov_b32_e32 v1, 0
; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[2:3]
+; GFX12-TRUE16-NEXT: global_load_u16 v0, v1, s[2:3]
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
; GFX12-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
; GFX12-TRUE16-NEXT: s_endpgm
@@ -944,33 +944,19 @@ define amdgpu_kernel void @constant_zextload_v2i8_to_v2i32(ptr addrspace(1) %out
; EG-NEXT: LSHR * T5.X, KC0[2].Y, literal.y,
; EG-NEXT: 255(3.573311e-43), 2(2.802597e-45)
;
-; GFX12-TRUE16-LABEL: constant_zextload_v2i8_to_v2i32:
-; GFX12-TRUE16: ; %bb.0:
-; GFX12-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, 0
-; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-TRUE16-NEXT: global_load_d16_b16 v0, v2, s[2:3]
-; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v0
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v1
-; GFX12-TRUE16-NEXT: global_store_b64 v2, v[0:1], s[0:1]
-; GFX12-TRUE16-NEXT: s_endpgm
-;
-; GFX12-FAKE16-LABEL: constant_zextload_v2i8_to_v2i32:
-; GFX12-FAKE16: ; %bb.0:
-; GFX12-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX12-FAKE16-NEXT: v_mov_b32_e32 v2, 0
-; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-FAKE16-NEXT: global_load_u16 v0, v2, s[2:3]
-; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v0
-; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 8, v1
-; GFX12-FAKE16-NEXT: global_store_b64 v2, v[0:1], s[0:1]
-; GFX12-FAKE16-NEXT: s_endpgm
+; GFX12-LABEL: constant_zextload_v2i8_to_v2i32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-NEXT: v_mov_b32_e32 v2, 0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: global_load_u16 v0, v2, s[2:3]
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_and_b32_e32 v1, 0xffff, v0
+; GFX12-NEXT: v_and_b32_e32 v0, 0xff, v0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-NEXT: v_lshrrev_b32_e32 v1, 8, v1
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX12-NEXT: s_endpgm
%load = load <2 x i8>, ptr addrspace(4) %in
%ext = zext <2 x i8> %load to <2 x i32>
store <2 x i32> %ext, ptr addrspace(1) %out
@@ -5314,27 +5300,16 @@ define amdgpu_kernel void @constant_zextload_i8_to_i64(ptr addrspace(1) %out, pt
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
-; GFX12-TRUE16-LABEL: constant_zextload_i8_to_i64:
-; GFX12-TRUE16: ; %bb.0:
-; GFX12-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX12-TRUE16-NEXT: v_mov_b32_e32 v1, 0
-; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-TRUE16-NEXT: global_load_d16_u8 v0, v1, s[2:3]
-; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX12-TRUE16-NEXT: global_store_b64 v1, v[0:1], s[0:1]
-; GFX12-TRUE16-NEXT: s_endpgm
-;
-; GFX12-FAKE16-LABEL: constant_zextload_i8_to_i64:
-; GFX12-FAKE16: ; %bb.0:
-; GFX12-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX12-FAKE16-NEXT: v_mov_b32_e32 v1, 0
-; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-FAKE16-NEXT: global_load_u8 v0, v1, s[2:3]
-; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX12-FAKE16-NEXT: global_store_b64 v1, v[0:1], s[0:1]
-; GFX12-FAKE16-NEXT: s_endpgm
+; GFX12-LABEL: constant_zextload_i8_to_i64:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-NEXT: v_mov_b32_e32 v1, 0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: global_load_u8 v0, v1, s[2:3]
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX12-NEXT: global_store_b64 v1, v[0:1], s[0:1]
+; GFX12-NEXT: s_endpgm
%a = load i8, ptr addrspace(4) %in
%ext = zext i8 %a to i64
store i64 %ext, ptr addrspace(1) %out
@@ -5412,31 +5387,18 @@ define amdgpu_kernel void @constant_sextload_i8_to_i64(ptr addrspace(1) %out, pt
; EG-NEXT: ASHR * T0.Y, PV.X, literal.x,
; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
;
-; GFX12-TRUE16-LABEL: constant_sextload_i8_to_i64:
-; GFX12-TRUE16: ; %bb.0:
-; GFX12-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, 0
-; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-TRUE16-NEXT: global_load_d16_i8 v0, v2, s[2:3]
-; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-TRUE16-NEXT: v_bfe_i32 v0, v0, 0, 16
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_ashrrev_i32_e32 v1, 31, v0
-; GFX12-TRUE16-NEXT: global_store_b64 v2, v[0:1], s[0:1]
-; GFX12-TRUE16-NEXT: s_endpgm
-;
-; GFX12-FAKE16-LABEL: constant_sextload_i8_to_i64:
-; GFX12-FAKE16: ; %bb.0:
-; GFX12-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX12-FAKE16-NEXT: v_mov_b32_e32 v2, 0
-; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-FAKE16-NEXT: global_load_i8 v0, v2, s[2:3]
-; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-FAKE16-NEXT: v_bfe_i32 v0, v0, 0, 16
-; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-FAKE16-NEXT: v_ashrrev_i32_e32 v1, 31, v0
-; GFX12-FAKE16-NEXT: global_store_b64 v2, v[0:1], s[0:1]
-; GFX12-FAKE16-NEXT: s_endpgm
+; GFX12-LABEL: constant_sextload_i8_to_i64:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-NEXT: v_mov_b32_e32 v2, 0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: global_load_i8 v0, v2, s[2:3]
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_bfe_i32 v0, v0, 0, 16
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_ashrrev_i32_e32 v1, 31, v0
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX12-NEXT: s_endpgm
%a = load i8, ptr addrspace(4) %in
%ext = sext i8 %a to i64
store i64 %ext, ptr addrspace(1) %out
@@ -5596,31 +5558,18 @@ define amdgpu_kernel void @constant_sextload_v1i8_to_v1i64(ptr addrspace(1) %out
; EG-NEXT: ASHR * T0.Y, PV.X, literal.x,
; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
;
-; GFX12-TRUE16-LABEL: constant_sextload_v1i8_to_v1i64:
-; GFX12-TRUE16: ; %bb.0:
-; GFX12-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, 0
-; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-TRUE16-NEXT: global_load_d16_i8 v0, v2, s[2:3]
-; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-TRUE16-NEXT: v_bfe_i32 v0, v0, 0, 16
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_ashrrev_i32_e32 v1, 31, v0
-; GFX12-TRUE16-NEXT: global_store_b64 v2, v[0:1], s[0:1]
-; GFX12-TRUE16-NEXT: s_endpgm
-;
-; GFX12-FAKE16-LABEL: constant_sextload_v1i8_to_v1i64:
-; GFX12-FAKE16: ; %bb.0:
-; GFX12-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX12-FAKE16-NEXT: v_mov_b32_e32 v2, 0
-; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-FAKE16-NEXT: global_load_i8 v0, v2, s[2:3]
-; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-FAKE16-NEXT: v_bfe_i32 v0, v0, 0, 16
-; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-FAKE16-NEXT: v_ashrrev_i32_e32 v1, 31, v0
-; GFX12-FAKE16-NEXT: global_store_b64 v2, v[0:1], s[0:1]
-; GFX12-FAKE16-NEXT: s_endpgm
+; GFX12-LABEL: constant_sextload_v1i8_to_v1i64:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-NEXT: v_mov_b32_e32 v2, 0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: global_load_i8 v0, v2, s[2:3]
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_bfe_i32 v0, v0, 0, 16
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_ashrrev_i32_e32 v1, 31, v0
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX12-NEXT: s_endpgm
%load = load <1 x i8>, ptr addrspace(4) %in
%ext = sext <1 x i8> %load to <1 x i64>
store <1 x i64> %ext, ptr addrspace(1) %out
@@ -5717,33 +5666,19 @@ define amdgpu_kernel void @constant_zextload_v2i8_to_v2i64(ptr addrspace(1) %out
; EG-NEXT: LSHR * T5.X, KC0[2].Y, literal.y,
; EG-NEXT: 255(3.573311e-43), 2(2.802597e-45)
;
-; GFX12-TRUE16-LABEL: constant_zextload_v2i8_to_v2i64:
-; GFX12-TRUE16: ; %bb.0:
-; GFX12-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX12-TRUE16-NEXT: v_mov_b32_e32 v1, 0
-; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[2:3]
-; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v0
-; GFX12-TRUE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_and_b32 v0, 0xff, v0
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 8, v2
-; GFX12-TRUE16-NEXT: global_store_b128 v1, v[0:3], s[0:1]
-; GFX12-TRUE16-NEXT: s_endpgm
-;
-; GFX12-FAKE16-LABEL: constant_zextload_v2i8_to_v2i64:
-; GFX12-FAKE16: ; %bb.0:
-; GFX12-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX12-FAKE16-NEXT: v_mov_b32_e32 v1, 0
-; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-FAKE16-NEXT: global_load_u16 v0, v1, s[2:3]
-; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v0
-; GFX12-FAKE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_and_b32 v0, 0xff, v0
-; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 8, v2
-; GFX12-FAKE16-NEXT: global_store_b128 v1, v[0:3], s[0:1]
-; GFX12-FAKE16-NEXT: s_endpgm
+; GFX12-LABEL: constant_zextload_v2i8_to_v2i64:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-NEXT: v_mov_b32_e32 v1, 0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: global_load_u16 v0, v1, s[2:3]
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v0
+; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_and_b32 v0, 0xff, v0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-NEXT: v_lshrrev_b32_e32 v2, 8, v2
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1]
+; GFX12-NEXT: s_endpgm
%load = load <2 x i8>, ptr addrspace(4) %in
%ext = zext <2 x i8> %load to <2 x i64>
store <2 x i64> %ext, ptr addrspace(1) %out
@@ -5843,39 +5778,22 @@ define amdgpu_kernel void @constant_sextload_v2i8_to_v2i64(ptr addrspace(1) %out
; EG-NEXT: ASHR * T4.W, PV.Z, literal.y,
; EG-NEXT: 2(2.802597e-45), 31(4.344025e-44)
;
-; GFX12-TRUE16-LABEL: constant_sextload_v2i8_to_v2i64:
-; GFX12-TRUE16: ; %bb.0:
-; GFX12-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, 0
-; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-TRUE16-NEXT: global_load_d16_b16 v0, v4, s[2:3]
-; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v0
-; GFX12-TRUE16-NEXT: v_bfe_i32 v0, v0, 0, 8
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT: v_bfe_i32 v2, v1, 0, 8
-; GFX12-TRUE16-NEXT: v_ashrrev_i32_e32 v1, 31, v0
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12-TRUE16-NEXT: v_ashrrev_i32_e32 v3, 31, v2
-; GFX12-TRUE16-NEXT: global_store_b128 v4, v[0:3], s[0:1]
-; GFX12-TRUE16-NEXT: s_endpgm
-;
-; GFX12-FAKE16-LABEL: constant_sextload_v2i8_to_v2i64:
-; GFX12-FAKE16: ; %bb.0:
-; GFX12-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, 0
-; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-FAKE16-NEXT: global_load_u16 v0, v4, s[2:3]
-; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 8, v0
-; GFX12-FAKE16-NEXT: v_bfe_i32 v0, v0, 0, 8
-; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-FAKE16-NEXT: v_bfe_i32 v2, v1, 0, 8
-; GFX12-FAKE16-NEXT: v_ashrrev_i32_e32 v1, 31, v0
-; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12-FAKE16-NEXT: v_ashrrev_i32_e32 v3, 31, v2
-; GFX12-FAKE16-NEXT: global_store_b128 v4, v[0:3], s[0:1]
-; GFX12-FAKE16-NEXT: s_endpgm
+; GFX12-LABEL: constant_sextload_v2i8_to_v2i64:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-NEXT: v_mov_b32_e32 v4, 0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: global_load_u16 v0, v4, s[2:3]
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_lshrrev_b32_e32 v1, 8, v0
+; GFX12-NEXT: v_bfe_i32 v0, v0, 0, 8
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_bfe_i32 v2, v1, 0, 8
+; GFX12-NEXT: v_ashrrev_i32_e32 v1, 31, v0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-NEXT: v_ashrrev_i32_e32 v3, 31, v2
+; GFX12-NEXT: global_store_b128 v4, v[0:3], s[0:1]
+; GFX12-NEXT: s_endpgm
%load = load <2 x i8>, ptr addrspace(4) %in
%ext = sext <2 x i8> %load to <2 x i64>
store <2 x i64> %ext, ptr addrspace(1) %out
@@ -9202,7 +9120,7 @@ define amdgpu_kernel void @constant_zextload_i8_to_i16(ptr addrspace(1) %out, pt
; GFX12-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-TRUE16-NEXT: v_mov_b32_e32 v1, 0
; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-TRUE16-NEXT: global_load_d16_u8 v0, v1, s[2:3]
+; GFX12-TRUE16-NEXT: global_load_u8 v0, v1, s[2:3]
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
; GFX12-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
; GFX12-TRUE16-NEXT: s_endpgm
@@ -9301,7 +9219,7 @@ define amdgpu_kernel void @constant_sextload_i8_to_i16(ptr addrspace(1) %out, pt
; GFX12-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-TRUE16-NEXT: v_mov_b32_e32 v1, 0
; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-TRUE16-NEXT: global_load_d16_i8 v0, v1, s[2:3]
+; GFX12-TRUE16-NEXT: global_load_i8 v0, v1, s[2:3]
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
; GFX12-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
; GFX12-TRUE16-NEXT: s_endpgm
@@ -9398,7 +9316,7 @@ define amdgpu_kernel void @constant_zextload_v1i8_to_v1i16(ptr addrspace(1) %out
; GFX12-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-TRUE16-NEXT: v_mov_b32_e32 v1, 0
; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-TRUE16-NEXT: global_load_d16_u8 v0, v1, s[2:3]
+; GFX12-TRUE16-NEXT: global_load_u8 v0, v1, s[2:3]
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
; GFX12-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
; GFX12-TRUE16-NEXT: s_endpgm
@@ -9497,7 +9415,7 @@ define amdgpu_kernel void @constant_sextload_v1i8_to_v1i16(ptr addrspace(1) %out
; GFX12-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-TRUE16-NEXT: v_mov_b32_e32 v1, 0
; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-TRUE16-NEXT: global_load_d16_i8 v0, v1, s[2:3]
+; GFX12-TRUE16-NEXT: global_load_i8 v0, v1, s[2:3]
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
; GFX12-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
; GFX12-TRUE16-NEXT: s_endpgm
@@ -9595,39 +9513,22 @@ define amdgpu_kernel void @constant_zextload_v2i8_to_v2i16(ptr addrspace(1) %out
; EG-NEXT: LSHR * T6.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
-; GFX12-TRUE16-LABEL: constant_zextload_v2i8_to_v2i16:
-; GFX12-TRUE16: ; %bb.0:
-; GFX12-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX12-TRUE16-NEXT: v_mov_b32_e32 v0, 0
-; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-TRUE16-NEXT: global_load_d16_b16 v1, v0, s[2:3]
-; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v1
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 8, v2
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_lshl_or_b32 v1, v2, 16, v1
-; GFX12-TRUE16-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX12-TRUE16-NEXT: s_endpgm
-;
-; GFX12-FAKE16-LABEL: constant_zextload_v2i8_to_v2i16:
-; GFX12-FAKE16: ; %bb.0:
-; GFX12-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX12-FAKE16-NEXT: v_mov_b32_e32 v0, 0
-; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-FAKE16-NEXT: global_load_u16 v1, v0, s[2:3]
-; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v1
-; GFX12-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 8, v2
-; GFX12-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-FAKE16-NEXT: v_lshl_or_b32 v1, v2, 16, v1
-; GFX12-FAKE16-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX12-FAKE16-NEXT: s_endpgm
+; GFX12-LABEL: constant_zextload_v2i8_to_v2i16:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: global_load_u16 v1, v0, s[2:3]
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v1
+; GFX12-NEXT: v_and_b32_e32 v1, 0xff, v1
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_lshrrev_b32_e32 v2, 8, v2
+; GFX12-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_lshl_or_b32 v1, v2, 16, v1
+; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: s_endpgm
%load = load <2 x i8>, ptr addrspace(4) %in
%ext = zext <2 x i8> %load to <2 x i16>
store <2 x i16> %ext, ptr addrspace(1) %out
@@ -9729,39 +9630,22 @@ define amdgpu_kernel void @constant_sextload_v2i8_to_v2i16(ptr addrspace(1) %out
; EG-NEXT: LSHR * T6.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
-; GFX12-TRUE16-LABEL: constant_sextload_v2i8_to_v2i16:
-; GFX12-TRUE16: ; %bb.0:
-; GFX12-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX12-TRUE16-NEXT: v_mov_b32_e32 v0, 0
-; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-TRUE16-NEXT: global_load_d16_b16 v1, v0, s[2:3]
-; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-TRUE16-NEXT: v_bfe_i32 v2, v1, 0, 16
-; GFX12-TRUE16-NEXT: v_bfe_i32 v1, v1, 0, 8
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 8, v2
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_lshl_or_b32 v1, v2, 16, v1
-; GFX12-TRUE16-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX12-TRUE16-NEXT: s_endpgm
-;
-; GFX12-FAKE16-LABEL: constant_sextload_v2i8_to_v2i16:
-; GFX12-FAKE16: ; %bb.0:
-; GFX12-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX12-FAKE16-NEXT: v_mov_b32_e32 v0, 0
-; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-FAKE16-NEXT: global_load_u16 v1, v0, s[2:3]
-; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-FAKE16-NEXT: v_bfe_i32 v2, v1, 0, 16
-; GFX12-FAKE16-NEXT: v_bfe_i32 v1, v1, 0, 8
-; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 8, v2
-; GFX12-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-FAKE16-NEXT: v_lshl_or_b32 v1, v2, 16, v1
-; GFX12-FAKE16-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX12-FAKE16-NEXT: s_endpgm
+; GFX12-LABEL: constant_sextload_v2i8_to_v2i16:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: global_load_u16 v1, v0, s[2:3]
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_bfe_i32 v2, v1, 0, 16
+; GFX12-NEXT: v_bfe_i32 v1, v1, 0, 8
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_lshrrev_b32_e32 v2, 8, v2
+; GFX12-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_lshl_or_b32 v1, v2, 16, v1
+; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: s_endpgm
%load = load <2 x i8>, ptr addrspace(4) %in
%ext = sext <2 x i8> %load to <2 x i16>
store <2 x i16> %ext, ptr addrspace(1) %out
diff --git a/llvm/test/CodeGen/AMDGPU/mad.u16.ll b/llvm/test/CodeGen/AMDGPU/mad.u16.ll
index fbf8011fd40c9..3915ece69f366 100644
--- a/llvm/test/CodeGen/AMDGPU/mad.u16.ll
+++ b/llvm/test/CodeGen/AMDGPU/mad.u16.ll
@@ -69,18 +69,18 @@ define amdgpu_kernel void @mad_u16(
; GFX11-TRUE16-LABEL: mad_u16:
; GFX11-TRUE16: ; %bb.0: ; %entry
; GFX11-TRUE16-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v0, 0x3ff, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v1, 1, v0
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[2:3] glc dlc
+; GFX11-TRUE16-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: global_load_d16_hi_b16 v0, v1, s[4:5] glc dlc
+; GFX11-TRUE16-NEXT: global_load_u16 v2, v0, s[4:5] glc dlc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: global_load_d16_b16 v1, v1, s[6:7] glc dlc
+; GFX11-TRUE16-NEXT: global_load_u16 v0, v0, s[6:7] glc dlc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_mad_u16 v0.l, v0.l, v0.h, v1.l
-; GFX11-TRUE16-NEXT: global_store_b16 v2, v0, s[0:1]
+; GFX11-TRUE16-NEXT: v_mad_u16 v0.l, v1.l, v2.l, v0.l
+; GFX11-TRUE16-NEXT: global_store_b16 v3, v0, s[0:1]
; GFX11-TRUE16-NEXT: s_endpgm
;
; GFX11-FAKE16-LABEL: mad_u16:
diff --git a/llvm/test/CodeGen/AMDGPU/min.ll b/llvm/test/CodeGen/AMDGPU/min.ll
index 311527d5d04cc..39d73a293647a 100644
--- a/llvm/test/CodeGen/AMDGPU/min.ll
+++ b/llvm/test/CodeGen/AMDGPU/min.ll
@@ -1458,10 +1458,10 @@ define amdgpu_kernel void @v_test_imin_slt_i16(ptr addrspace(1) %out, ptr addrsp
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 1, v0
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: s_clause 0x1
-; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[2:3]
-; GFX11-TRUE16-NEXT: global_load_d16_hi_b16 v0, v1, s[4:5]
+; GFX11-TRUE16-NEXT: global_load_u16 v0, v1, s[2:3]
+; GFX11-TRUE16-NEXT: global_load_u16 v2, v1, s[4:5]
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_min_i16 v0.l, v0.l, v0.h
+; GFX11-TRUE16-NEXT: v_min_i16 v0.l, v0.l, v2.l
; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
; GFX11-TRUE16-NEXT: s_endpgm
;
@@ -2747,10 +2747,10 @@ define amdgpu_kernel void @v_test_umin_ult_i8(ptr addrspace(1) %out, ptr addrspa
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0x3ff, v0
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: s_clause 0x1
-; GFX11-TRUE16-NEXT: global_load_d16_u8 v0, v1, s[2:3]
-; GFX11-TRUE16-NEXT: global_load_d16_hi_u8 v0, v1, s[4:5]
+; GFX11-TRUE16-NEXT: global_load_u8 v0, v1, s[2:3]
+; GFX11-TRUE16-NEXT: global_load_u8 v2, v1, s[4:5]
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_min_u16 v0.l, v0.l, v0.h
+; GFX11-TRUE16-NEXT: v_min_u16 v0.l, v0.l, v2.l
; GFX11-TRUE16-NEXT: global_store_b8 v1, v0, s[0:1]
; GFX11-TRUE16-NEXT: s_endpgm
;
@@ -3176,47 +3176,26 @@ define amdgpu_kernel void @v_test_umin_ult_i16_multi_use(ptr addrspace(1) %out0,
; GFX10-NEXT: global_store_byte v0, v2, s[2:3]
; GFX10-NEXT: s_endpgm
;
-; GFX11-TRUE16-LABEL: v_test_umin_ult_i16_multi_use:
-; GFX11-TRUE16: ; %bb.0:
-; GFX11-TRUE16-NEXT: s_load_b256 s[0:7], s[4:5], 0x0
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, 0
-; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT: s_clause 0x1
-; GFX11-TRUE16-NEXT: global_load_d16_b16 v1, v0, s[6:7]
-; GFX11-TRUE16-NEXT: global_load_d16_b16 v2, v0, s[4:5]
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v1
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v2
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_cmp_lt_u32_e32 vcc_lo, v4, v3
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo
-; GFX11-TRUE16-NEXT: s_clause 0x1
-; GFX11-TRUE16-NEXT: global_store_b16 v0, v1, s[0:1]
-; GFX11-TRUE16-NEXT: global_store_b8 v0, v2, s[2:3]
-; GFX11-TRUE16-NEXT: s_endpgm
-;
-; GFX11-FAKE16-LABEL: v_test_umin_ult_i16_multi_use:
-; GFX11-FAKE16: ; %bb.0:
-; GFX11-FAKE16-NEXT: s_load_b256 s[0:7], s[4:5], 0x0
-; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, 0
-; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-FAKE16-NEXT: s_clause 0x1
-; GFX11-FAKE16-NEXT: global_load_u16 v1, v0, s[6:7]
-; GFX11-FAKE16-NEXT: global_load_u16 v2, v0, s[4:5]
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1)
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v1
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v2
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_cmp_lt_u32_e32 vcc_lo, v4, v3
-; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo
-; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo
-; GFX11-FAKE16-NEXT: s_clause 0x1
-; GFX11-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1]
-; GFX11-FAKE16-NEXT: global_store_b8 v0, v2, s[2:3]
-; GFX11-FAKE16-NEXT: s_endpgm
+; GFX11-LABEL: v_test_umin_ult_i16_multi_use:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x0
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: global_load_u16 v1, v0, s[6:7]
+; GFX11-NEXT: global_load_u16 v2, v0, s[4:5]
+; GFX11-NEXT: s_waitcnt vmcnt(1)
+; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v1
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_cmp_lt_u32_e32 vcc_lo, v4, v3
+; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo
+; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b8 v0, v2, s[2:3]
+; GFX11-NEXT: s_endpgm
;
; GFX1250-LABEL: v_test_umin_ult_i16_multi_use:
; GFX1250: ; %bb.0:
diff --git a/llvm/test/CodeGen/AMDGPU/move-to-valu-pseudo-scalar-trans-f16-true16.ll b/llvm/test/CodeGen/AMDGPU/move-to-valu-pseudo-scalar-trans-f16-true16.ll
index 3768634c1691c..86f9f305a30d1 100644
--- a/llvm/test/CodeGen/AMDGPU/move-to-valu-pseudo-scalar-trans-f16-true16.ll
+++ b/llvm/test/CodeGen/AMDGPU/move-to-valu-pseudo-scalar-trans-f16-true16.ll
@@ -9,13 +9,11 @@ define amdgpu_kernel void @exp_f16(ptr addrspace(1) %ptr) {
; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5
; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec_xnull = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.ptr.kernarg.offset, align 4, addrspace 4)
; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
- ; CHECK-NEXT: [[GLOBAL_LOAD_SHORT_D16_SADDR_t16_:%[0-9]+]]:vgpr_16 = GLOBAL_LOAD_SHORT_D16_SADDR_t16 [[S_LOAD_DWORDX2_IMM]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s16) from %ir.ptr.load, addrspace 1)
+ ; CHECK-NEXT: [[GLOBAL_LOAD_USHORT_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_USHORT_SADDR [[S_LOAD_DWORDX2_IMM]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s16) from %ir.ptr.load, addrspace 1)
+ ; CHECK-NEXT: [[V_EXP_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_EXP_F16_t16_e64 0, [[GLOBAL_LOAD_USHORT_SADDR]].lo16, 0, 0, 0, implicit $mode, implicit $exec
; CHECK-NEXT: [[DEF:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF
- ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vgpr_32 = REG_SEQUENCE [[GLOBAL_LOAD_SHORT_D16_SADDR_t16_]], %subreg.lo16, [[DEF]], %subreg.hi16
- ; CHECK-NEXT: [[V_EXP_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_EXP_F16_t16_e64 0, killed [[REG_SEQUENCE]].lo16, 0, 0, 0, implicit $mode, implicit $exec
- ; CHECK-NEXT: [[DEF1:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF
- ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vgpr_32 = REG_SEQUENCE [[V_EXP_F16_t16_e64_]], %subreg.lo16, [[DEF1]], %subreg.hi16
- ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_16 = COPY [[REG_SEQUENCE1]]
+ ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vgpr_32 = REG_SEQUENCE [[V_EXP_F16_t16_e64_]], %subreg.lo16, [[DEF]], %subreg.hi16
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_16 = COPY [[REG_SEQUENCE]]
; CHECK-NEXT: GLOBAL_STORE_SHORT_SADDR_t16 [[V_MOV_B32_e32_]], killed [[COPY1]], [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec :: (store (s16) into %ir.ptr.load, addrspace 1)
; CHECK-NEXT: S_ENDPGM 0
%val = load volatile half, ptr addrspace(1) %ptr
@@ -32,13 +30,11 @@ define amdgpu_kernel void @log_f16(ptr addrspace(1) %ptr) {
; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5
; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec_xnull = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.ptr.kernarg.offset, align 4, addrspace 4)
; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
- ; CHECK-NEXT: [[GLOBAL_LOAD_SHORT_D16_SADDR_t16_:%[0-9]+]]:vgpr_16 = GLOBAL_LOAD_SHORT_D16_SADDR_t16 [[S_LOAD_DWORDX2_IMM]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s16) from %ir.ptr.load, addrspace 1)
+ ; CHECK-NEXT: [[GLOBAL_LOAD_USHORT_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_USHORT_SADDR [[S_LOAD_DWORDX2_IMM]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s16) from %ir.ptr.load, addrspace 1)
+ ; CHECK-NEXT: [[V_LOG_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_LOG_F16_t16_e64 0, [[GLOBAL_LOAD_USHORT_SADDR]].lo16, 0, 0, 0, implicit $mode, implicit $exec
; CHECK-NEXT: [[DEF:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF
- ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vgpr_32 = REG_SEQUENCE [[GLOBAL_LOAD_SHORT_D16_SADDR_t16_]], %subreg.lo16, [[DEF]], %subreg.hi16
- ; CHECK-NEXT: [[V_LOG_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_LOG_F16_t16_e64 0, killed [[REG_SEQUENCE]].lo16, 0, 0, 0, implicit $mode, implicit $exec
- ; CHECK-NEXT: [[DEF1:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF
- ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vgpr_32 = REG_SEQUENCE [[V_LOG_F16_t16_e64_]], %subreg.lo16, [[DEF1]], %subreg.hi16
- ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_16 = COPY [[REG_SEQUENCE1]]
+ ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vgpr_32 = REG_SEQUENCE [[V_LOG_F16_t16_e64_]], %subreg.lo16, [[DEF]], %subreg.hi16
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_16 = COPY [[REG_SEQUENCE]]
; CHECK-NEXT: GLOBAL_STORE_SHORT_SADDR_t16 [[V_MOV_B32_e32_]], killed [[COPY1]], [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec :: (store (s16) into %ir.ptr.load, addrspace 1)
; CHECK-NEXT: S_ENDPGM 0
%val = load volatile half, ptr addrspace(1) %ptr
@@ -55,13 +51,11 @@ define amdgpu_kernel void @rcp_f16(ptr addrspace(1) %ptr) {
; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5
; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec_xnull = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.ptr.kernarg.offset, align 4, addrspace 4)
; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
- ; CHECK-NEXT: [[GLOBAL_LOAD_SHORT_D16_SADDR_t16_:%[0-9]+]]:vgpr_16 = GLOBAL_LOAD_SHORT_D16_SADDR_t16 [[S_LOAD_DWORDX2_IMM]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s16) from %ir.ptr.load, addrspace 1)
+ ; CHECK-NEXT: [[GLOBAL_LOAD_USHORT_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_USHORT_SADDR [[S_LOAD_DWORDX2_IMM]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s16) from %ir.ptr.load, addrspace 1)
+ ; CHECK-NEXT: [[V_RCP_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_RCP_F16_t16_e64 0, [[GLOBAL_LOAD_USHORT_SADDR]].lo16, 0, 0, 0, implicit $mode, implicit $exec
; CHECK-NEXT: [[DEF:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF
- ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vgpr_32 = REG_SEQUENCE [[GLOBAL_LOAD_SHORT_D16_SADDR_t16_]], %subreg.lo16, [[DEF]], %subreg.hi16
- ; CHECK-NEXT: [[V_RCP_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_RCP_F16_t16_e64 0, killed [[REG_SEQUENCE]].lo16, 0, 0, 0, implicit $mode, implicit $exec
- ; CHECK-NEXT: [[DEF1:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF
- ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vgpr_32 = REG_SEQUENCE [[V_RCP_F16_t16_e64_]], %subreg.lo16, [[DEF1]], %subreg.hi16
- ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_16 = COPY [[REG_SEQUENCE1]]
+ ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vgpr_32 = REG_SEQUENCE [[V_RCP_F16_t16_e64_]], %subreg.lo16, [[DEF]], %subreg.hi16
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_16 = COPY [[REG_SEQUENCE]]
; CHECK-NEXT: GLOBAL_STORE_SHORT_SADDR_t16 [[V_MOV_B32_e32_]], killed [[COPY1]], [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec :: (store (s16) into %ir.ptr.load, addrspace 1)
; CHECK-NEXT: S_ENDPGM 0
%val = load volatile half, ptr addrspace(1) %ptr
@@ -78,13 +72,11 @@ define amdgpu_kernel void @rsq_f16(ptr addrspace(1) %ptr) {
; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5
; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec_xnull = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.ptr.kernarg.offset, align 4, addrspace 4)
; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
- ; CHECK-NEXT: [[GLOBAL_LOAD_SHORT_D16_SADDR_t16_:%[0-9]+]]:vgpr_16 = GLOBAL_LOAD_SHORT_D16_SADDR_t16 [[S_LOAD_DWORDX2_IMM]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s16) from %ir.ptr.load, addrspace 1)
+ ; CHECK-NEXT: [[GLOBAL_LOAD_USHORT_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_USHORT_SADDR [[S_LOAD_DWORDX2_IMM]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s16) from %ir.ptr.load, addrspace 1)
+ ; CHECK-NEXT: [[V_RSQ_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_RSQ_F16_t16_e64 0, [[GLOBAL_LOAD_USHORT_SADDR]].lo16, 0, 0, 0, implicit $mode, implicit $exec
; CHECK-NEXT: [[DEF:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF
- ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vgpr_32 = REG_SEQUENCE [[GLOBAL_LOAD_SHORT_D16_SADDR_t16_]], %subreg.lo16, [[DEF]], %subreg.hi16
- ; CHECK-NEXT: [[V_RSQ_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_RSQ_F16_t16_e64 0, killed [[REG_SEQUENCE]].lo16, 0, 0, 0, implicit $mode, implicit $exec
- ; CHECK-NEXT: [[DEF1:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF
- ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vgpr_32 = REG_SEQUENCE [[V_RSQ_F16_t16_e64_]], %subreg.lo16, [[DEF1]], %subreg.hi16
- ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_16 = COPY [[REG_SEQUENCE1]]
+ ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vgpr_32 = REG_SEQUENCE [[V_RSQ_F16_t16_e64_]], %subreg.lo16, [[DEF]], %subreg.hi16
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_16 = COPY [[REG_SEQUENCE]]
; CHECK-NEXT: GLOBAL_STORE_SHORT_SADDR_t16 [[V_MOV_B32_e32_]], killed [[COPY1]], [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec :: (store (s16) into %ir.ptr.load, addrspace 1)
; CHECK-NEXT: S_ENDPGM 0
%val = load volatile half, ptr addrspace(1) %ptr
@@ -101,13 +93,11 @@ define amdgpu_kernel void @sqrt_f16(ptr addrspace(1) %ptr) {
; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5
; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec_xnull = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.ptr.kernarg.offset, align 4, addrspace 4)
; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
- ; CHECK-NEXT: [[GLOBAL_LOAD_SHORT_D16_SADDR_t16_:%[0-9]+]]:vgpr_16 = GLOBAL_LOAD_SHORT_D16_SADDR_t16 [[S_LOAD_DWORDX2_IMM]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s16) from %ir.ptr.load, addrspace 1)
+ ; CHECK-NEXT: [[GLOBAL_LOAD_USHORT_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_USHORT_SADDR [[S_LOAD_DWORDX2_IMM]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s16) from %ir.ptr.load, addrspace 1)
+ ; CHECK-NEXT: [[V_SQRT_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_SQRT_F16_t16_e64 0, [[GLOBAL_LOAD_USHORT_SADDR]].lo16, 0, 0, 0, implicit $mode, implicit $exec
; CHECK-NEXT: [[DEF:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF
- ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vgpr_32 = REG_SEQUENCE [[GLOBAL_LOAD_SHORT_D16_SADDR_t16_]], %subreg.lo16, [[DEF]], %subreg.hi16
- ; CHECK-NEXT: [[V_SQRT_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_SQRT_F16_t16_e64 0, killed [[REG_SEQUENCE]].lo16, 0, 0, 0, implicit $mode, implicit $exec
- ; CHECK-NEXT: [[DEF1:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF
- ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vgpr_32 = REG_SEQUENCE [[V_SQRT_F16_t16_e64_]], %subreg.lo16, [[DEF1]], %subreg.hi16
- ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_16 = COPY [[REG_SEQUENCE1]]
+ ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vgpr_32 = REG_SEQUENCE [[V_SQRT_F16_t16_e64_]], %subreg.lo16, [[DEF]], %subreg.hi16
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_16 = COPY [[REG_SEQUENCE]]
; CHECK-NEXT: GLOBAL_STORE_SHORT_SADDR_t16 [[V_MOV_B32_e32_]], killed [[COPY1]], [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec :: (store (s16) into %ir.ptr.load, addrspace 1)
; CHECK-NEXT: S_ENDPGM 0
%val = load volatile half, ptr addrspace(1) %ptr
diff --git a/llvm/test/CodeGen/AMDGPU/offset-split-flat.ll b/llvm/test/CodeGen/AMDGPU/offset-split-flat.ll
index a3c38b17abf00..f55245906c381 100644
--- a/llvm/test/CodeGen/AMDGPU/offset-split-flat.ll
+++ b/llvm/test/CodeGen/AMDGPU/offset-split-flat.ll
@@ -32,59 +32,23 @@ define i8 @flat_inst_valu_offset_1(ptr %p) {
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-SDAG-TRUE16-LABEL: flat_inst_valu_offset_1:
-; GFX11-SDAG-TRUE16: ; %bb.0:
-; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: flat_load_d16_u8 v0, v[0:1] offset:1
-; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-SDAG-FAKE16-LABEL: flat_inst_valu_offset_1:
-; GFX11-SDAG-FAKE16: ; %bb.0:
-; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: flat_load_u8 v0, v[0:1] offset:1
-; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX12-SDAG-TRUE16-LABEL: flat_inst_valu_offset_1:
-; GFX12-SDAG-TRUE16: ; %bb.0:
-; GFX12-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_expcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_samplecnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_bvhcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: flat_load_d16_u8 v0, v[0:1] offset:1
-; GFX12-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX12-SDAG-FAKE16-LABEL: flat_inst_valu_offset_1:
-; GFX12-SDAG-FAKE16: ; %bb.0:
-; GFX12-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_expcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_samplecnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_bvhcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: flat_load_u8 v0, v[0:1] offset:1
-; GFX12-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-GISEL-LABEL: flat_inst_valu_offset_1:
-; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT: flat_load_u8 v0, v[0:1] offset:1
-; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX12-GISEL-LABEL: flat_inst_valu_offset_1:
-; GFX12-GISEL: ; %bb.0:
-; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-GISEL-NEXT: s_wait_expcnt 0x0
-; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0
-; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0
-; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1] offset:1
-; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
+; GFX11-LABEL: flat_inst_valu_offset_1:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: flat_load_u8 v0, v[0:1] offset:1
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: flat_inst_valu_offset_1:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: flat_load_u8 v0, v[0:1] offset:1
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr inbounds i8, ptr %p, i64 1
%load = load i8, ptr %gep, align 4
ret i8 %load
@@ -107,59 +71,23 @@ define i8 @flat_inst_valu_offset_11bit_max(ptr %p) {
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-SDAG-TRUE16-LABEL: flat_inst_valu_offset_11bit_max:
-; GFX11-SDAG-TRUE16: ; %bb.0:
-; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: flat_load_d16_u8 v0, v[0:1] offset:2047
-; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-SDAG-FAKE16-LABEL: flat_inst_valu_offset_11bit_max:
-; GFX11-SDAG-FAKE16: ; %bb.0:
-; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: flat_load_u8 v0, v[0:1] offset:2047
-; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX12-SDAG-TRUE16-LABEL: flat_inst_valu_offset_11bit_max:
-; GFX12-SDAG-TRUE16: ; %bb.0:
-; GFX12-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_expcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_samplecnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_bvhcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: flat_load_d16_u8 v0, v[0:1] offset:2047
-; GFX12-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX12-SDAG-FAKE16-LABEL: flat_inst_valu_offset_11bit_max:
-; GFX12-SDAG-FAKE16: ; %bb.0:
-; GFX12-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_expcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_samplecnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_bvhcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: flat_load_u8 v0, v[0:1] offset:2047
-; GFX12-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-GISEL-LABEL: flat_inst_valu_offset_11bit_max:
-; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT: flat_load_u8 v0, v[0:1] offset:2047
-; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX12-GISEL-LABEL: flat_inst_valu_offset_11bit_max:
-; GFX12-GISEL: ; %bb.0:
-; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-GISEL-NEXT: s_wait_expcnt 0x0
-; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0
-; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0
-; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1] offset:2047
-; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
+; GFX11-LABEL: flat_inst_valu_offset_11bit_max:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: flat_load_u8 v0, v[0:1] offset:2047
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: flat_inst_valu_offset_11bit_max:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: flat_load_u8 v0, v[0:1] offset:2047
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr inbounds i8, ptr %p, i64 2047
%load = load i8, ptr %gep, align 4
ret i8 %load
@@ -182,59 +110,23 @@ define i8 @flat_inst_valu_offset_12bit_max(ptr %p) {
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-SDAG-TRUE16-LABEL: flat_inst_valu_offset_12bit_max:
-; GFX11-SDAG-TRUE16: ; %bb.0:
-; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: flat_load_d16_u8 v0, v[0:1] offset:4095
-; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-SDAG-FAKE16-LABEL: flat_inst_valu_offset_12bit_max:
-; GFX11-SDAG-FAKE16: ; %bb.0:
-; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: flat_load_u8 v0, v[0:1] offset:4095
-; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX12-SDAG-TRUE16-LABEL: flat_inst_valu_offset_12bit_max:
-; GFX12-SDAG-TRUE16: ; %bb.0:
-; GFX12-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_expcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_samplecnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_bvhcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: flat_load_d16_u8 v0, v[0:1] offset:4095
-; GFX12-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX12-SDAG-FAKE16-LABEL: flat_inst_valu_offset_12bit_max:
-; GFX12-SDAG-FAKE16: ; %bb.0:
-; GFX12-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_expcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_samplecnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_bvhcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: flat_load_u8 v0, v[0:1] offset:4095
-; GFX12-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-GISEL-LABEL: flat_inst_valu_offset_12bit_max:
-; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT: flat_load_u8 v0, v[0:1] offset:4095
-; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX12-GISEL-LABEL: flat_inst_valu_offset_12bit_max:
-; GFX12-GISEL: ; %bb.0:
-; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-GISEL-NEXT: s_wait_expcnt 0x0
-; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0
-; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0
-; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1] offset:4095
-; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
+; GFX11-LABEL: flat_inst_valu_offset_12bit_max:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: flat_load_u8 v0, v[0:1] offset:4095
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: flat_inst_valu_offset_12bit_max:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: flat_load_u8 v0, v[0:1] offset:4095
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr inbounds i8, ptr %p, i64 4095
%load = load i8, ptr %gep, align 4
ret i8 %load
@@ -259,47 +151,26 @@ define i8 @flat_inst_valu_offset_13bit_max(ptr %p) {
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-SDAG-TRUE16-LABEL: flat_inst_valu_offset_13bit_max:
-; GFX11-SDAG-TRUE16: ; %bb.0:
-; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x1000, v0
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT: flat_load_d16_u8 v0, v[0:1] offset:4095
-; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-SDAG-FAKE16-LABEL: flat_inst_valu_offset_13bit_max:
-; GFX11-SDAG-FAKE16: ; %bb.0:
-; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x1000, v0
-; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX11-SDAG-FAKE16-NEXT: flat_load_u8 v0, v[0:1] offset:4095
-; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX12-SDAG-TRUE16-LABEL: flat_inst_valu_offset_13bit_max:
-; GFX12-SDAG-TRUE16: ; %bb.0:
-; GFX12-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_expcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_samplecnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_bvhcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: flat_load_d16_u8 v0, v[0:1] offset:8191
-; GFX12-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX12-SDAG-FAKE16-LABEL: flat_inst_valu_offset_13bit_max:
-; GFX12-SDAG-FAKE16: ; %bb.0:
-; GFX12-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_expcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_samplecnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_bvhcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: flat_load_u8 v0, v[0:1] offset:8191
-; GFX12-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-SDAG-LABEL: flat_inst_valu_offset_13bit_max:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x1000, v0
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX11-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:4095
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: flat_inst_valu_offset_13bit_max:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: flat_load_u8 v0, v[0:1] offset:8191
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-GISEL-LABEL: flat_inst_valu_offset_13bit_max:
; GFX9-GISEL: ; %bb.0:
@@ -319,17 +190,6 @@ define i8 @flat_inst_valu_offset_13bit_max(ptr %p) {
; GFX11-GISEL-NEXT: flat_load_u8 v0, v[0:1]
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX12-GISEL-LABEL: flat_inst_valu_offset_13bit_max:
-; GFX12-GISEL: ; %bb.0:
-; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-GISEL-NEXT: s_wait_expcnt 0x0
-; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0
-; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0
-; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1] offset:8191
-; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr inbounds i8, ptr %p, i64 8191
%load = load i8, ptr %gep, align 4
ret i8 %load
@@ -354,47 +214,26 @@ define i8 @flat_inst_valu_offset_24bit_max(ptr %p) {
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-SDAG-TRUE16-LABEL: flat_inst_valu_offset_24bit_max:
-; GFX11-SDAG-TRUE16: ; %bb.0:
-; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x7ff000, v0
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT: flat_load_d16_u8 v0, v[0:1] offset:4095
-; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-SDAG-FAKE16-LABEL: flat_inst_valu_offset_24bit_max:
-; GFX11-SDAG-FAKE16: ; %bb.0:
-; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x7ff000, v0
-; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX11-SDAG-FAKE16-NEXT: flat_load_u8 v0, v[0:1] offset:4095
-; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX12-SDAG-TRUE16-LABEL: flat_inst_valu_offset_24bit_max:
-; GFX12-SDAG-TRUE16: ; %bb.0:
-; GFX12-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_expcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_samplecnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_bvhcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: flat_load_d16_u8 v0, v[0:1] offset:8388607
-; GFX12-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX12-SDAG-FAKE16-LABEL: flat_inst_valu_offset_24bit_max:
-; GFX12-SDAG-FAKE16: ; %bb.0:
-; GFX12-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_expcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_samplecnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_bvhcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: flat_load_u8 v0, v[0:1] offset:8388607
-; GFX12-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-SDAG-LABEL: flat_inst_valu_offset_24bit_max:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x7ff000, v0
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX11-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:4095
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: flat_inst_valu_offset_24bit_max:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: flat_load_u8 v0, v[0:1] offset:8388607
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-GISEL-LABEL: flat_inst_valu_offset_24bit_max:
; GFX9-GISEL: ; %bb.0:
@@ -414,17 +253,6 @@ define i8 @flat_inst_valu_offset_24bit_max(ptr %p) {
; GFX11-GISEL-NEXT: flat_load_u8 v0, v[0:1]
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX12-GISEL-LABEL: flat_inst_valu_offset_24bit_max:
-; GFX12-GISEL: ; %bb.0:
-; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-GISEL-NEXT: s_wait_expcnt 0x0
-; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0
-; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0
-; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1] offset:8388607
-; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr inbounds i8, ptr %p, i64 8388607
%load = load i8, ptr %gep, align 4
ret i8 %load
@@ -449,68 +277,26 @@ define i8 @flat_inst_valu_offset_neg_11bit_max(ptr %p) {
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-SDAG-TRUE16-LABEL: flat_inst_valu_offset_neg_11bit_max:
-; GFX11-SDAG-TRUE16: ; %bb.0:
-; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT: flat_load_d16_u8 v0, v[0:1]
-; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-SDAG-FAKE16-LABEL: flat_inst_valu_offset_neg_11bit_max:
-; GFX11-SDAG-FAKE16: ; %bb.0:
-; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0
-; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
-; GFX11-SDAG-FAKE16-NEXT: flat_load_u8 v0, v[0:1]
-; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX12-SDAG-TRUE16-LABEL: flat_inst_valu_offset_neg_11bit_max:
-; GFX12-SDAG-TRUE16: ; %bb.0:
-; GFX12-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_expcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_samplecnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_bvhcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: flat_load_d16_u8 v0, v[0:1] offset:-2048
-; GFX12-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX12-SDAG-FAKE16-LABEL: flat_inst_valu_offset_neg_11bit_max:
-; GFX12-SDAG-FAKE16: ; %bb.0:
-; GFX12-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_expcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_samplecnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_bvhcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: flat_load_u8 v0, v[0:1] offset:-2048
-; GFX12-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-GISEL-LABEL: flat_inst_valu_offset_neg_11bit_max:
-; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0
-; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
-; GFX11-GISEL-NEXT: flat_load_u8 v0, v[0:1]
-; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX12-GISEL-LABEL: flat_inst_valu_offset_neg_11bit_max:
-; GFX12-GISEL: ; %bb.0:
-; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-GISEL-NEXT: s_wait_expcnt 0x0
-; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0
-; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0
-; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1] offset:-2048
-; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
+; GFX11-LABEL: flat_inst_valu_offset_neg_11bit_max:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
+; GFX11-NEXT: flat_load_u8 v0, v[0:1]
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: flat_inst_valu_offset_neg_11bit_max:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: flat_load_u8 v0, v[0:1] offset:-2048
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr inbounds i8, ptr %p, i64 -2048
%load = load i8, ptr %gep, align 4
ret i8 %load
@@ -535,68 +321,26 @@ define i8 @flat_inst_valu_offset_neg_12bit_max(ptr %p) {
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-SDAG-TRUE16-LABEL: flat_inst_valu_offset_neg_12bit_max:
-; GFX11-SDAG-TRUE16: ; %bb.0:
-; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff000, v0
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT: flat_load_d16_u8 v0, v[0:1]
-; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-SDAG-FAKE16-LABEL: flat_inst_valu_offset_neg_12bit_max:
-; GFX11-SDAG-FAKE16: ; %bb.0:
-; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff000, v0
-; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
-; GFX11-SDAG-FAKE16-NEXT: flat_load_u8 v0, v[0:1]
-; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX12-SDAG-TRUE16-LABEL: flat_inst_valu_offset_neg_12bit_max:
-; GFX12-SDAG-TRUE16: ; %bb.0:
-; GFX12-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_expcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_samplecnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_bvhcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: flat_load_d16_u8 v0, v[0:1] offset:-4096
-; GFX12-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX12-SDAG-FAKE16-LABEL: flat_inst_valu_offset_neg_12bit_max:
-; GFX12-SDAG-FAKE16: ; %bb.0:
-; GFX12-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_expcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_samplecnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_bvhcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: flat_load_u8 v0, v[0:1] offset:-4096
-; GFX12-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-GISEL-LABEL: flat_inst_valu_offset_neg_12bit_max:
-; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff000, v0
-; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
-; GFX11-GISEL-NEXT: flat_load_u8 v0, v[0:1]
-; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX12-GISEL-LABEL: flat_inst_valu_offset_neg_12bit_max:
-; GFX12-GISEL: ; %bb.0:
-; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-GISEL-NEXT: s_wait_expcnt 0x0
-; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0
-; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0
-; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1] offset:-4096
-; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
+; GFX11-LABEL: flat_inst_valu_offset_neg_12bit_max:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff000, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
+; GFX11-NEXT: flat_load_u8 v0, v[0:1]
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: flat_inst_valu_offset_neg_12bit_max:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: flat_load_u8 v0, v[0:1] offset:-4096
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr inbounds i8, ptr %p, i64 -4096
%load = load i8, ptr %gep, align 4
ret i8 %load
@@ -621,68 +365,26 @@ define i8 @flat_inst_valu_offset_neg_13bit_max(ptr %p) {
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-SDAG-TRUE16-LABEL: flat_inst_valu_offset_neg_13bit_max:
-; GFX11-SDAG-TRUE16: ; %bb.0:
-; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0xffffe000, v0
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT: flat_load_d16_u8 v0, v[0:1]
-; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-SDAG-FAKE16-LABEL: flat_inst_valu_offset_neg_13bit_max:
-; GFX11-SDAG-FAKE16: ; %bb.0:
-; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: v_add_co_u32 v0, vcc_lo, 0xffffe000, v0
-; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
-; GFX11-SDAG-FAKE16-NEXT: flat_load_u8 v0, v[0:1]
-; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX12-SDAG-TRUE16-LABEL: flat_inst_valu_offset_neg_13bit_max:
-; GFX12-SDAG-TRUE16: ; %bb.0:
-; GFX12-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_expcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_samplecnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_bvhcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: flat_load_d16_u8 v0, v[0:1] offset:-8192
-; GFX12-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX12-SDAG-FAKE16-LABEL: flat_inst_valu_offset_neg_13bit_max:
-; GFX12-SDAG-FAKE16: ; %bb.0:
-; GFX12-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_expcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_samplecnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_bvhcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: flat_load_u8 v0, v[0:1] offset:-8192
-; GFX12-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-GISEL-LABEL: flat_inst_valu_offset_neg_13bit_max:
-; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, 0xffffe000, v0
-; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
-; GFX11-GISEL-NEXT: flat_load_u8 v0, v[0:1]
-; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX12-GISEL-LABEL: flat_inst_valu_offset_neg_13bit_max:
-; GFX12-GISEL: ; %bb.0:
-; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-GISEL-NEXT: s_wait_expcnt 0x0
-; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0
-; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0
-; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1] offset:-8192
-; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
+; GFX11-LABEL: flat_inst_valu_offset_neg_13bit_max:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0xffffe000, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
+; GFX11-NEXT: flat_load_u8 v0, v[0:1]
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: flat_inst_valu_offset_neg_13bit_max:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: flat_load_u8 v0, v[0:1] offset:-8192
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr inbounds i8, ptr %p, i64 -8192
%load = load i8, ptr %gep, align 4
ret i8 %load
@@ -707,68 +409,26 @@ define i8 @flat_inst_valu_offset_neg_24bit_max(ptr %p) {
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-SDAG-TRUE16-LABEL: flat_inst_valu_offset_neg_24bit_max:
-; GFX11-SDAG-TRUE16: ; %bb.0:
-; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0xff800000, v0
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT: flat_load_d16_u8 v0, v[0:1]
-; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-SDAG-FAKE16-LABEL: flat_inst_valu_offset_neg_24bit_max:
-; GFX11-SDAG-FAKE16: ; %bb.0:
-; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: v_add_co_u32 v0, vcc_lo, 0xff800000, v0
-; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
-; GFX11-SDAG-FAKE16-NEXT: flat_load_u8 v0, v[0:1]
-; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX12-SDAG-TRUE16-LABEL: flat_inst_valu_offset_neg_24bit_max:
-; GFX12-SDAG-TRUE16: ; %bb.0:
-; GFX12-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_expcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_samplecnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_bvhcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: flat_load_d16_u8 v0, v[0:1] offset:-8388608
-; GFX12-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX12-SDAG-FAKE16-LABEL: flat_inst_valu_offset_neg_24bit_max:
-; GFX12-SDAG-FAKE16: ; %bb.0:
-; GFX12-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_expcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_samplecnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_bvhcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: flat_load_u8 v0, v[0:1] offset:-8388608
-; GFX12-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-GISEL-LABEL: flat_inst_valu_offset_neg_24bit_max:
-; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, 0xff800000, v0
-; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
-; GFX11-GISEL-NEXT: flat_load_u8 v0, v[0:1]
-; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX12-GISEL-LABEL: flat_inst_valu_offset_neg_24bit_max:
-; GFX12-GISEL: ; %bb.0:
-; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-GISEL-NEXT: s_wait_expcnt 0x0
-; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0
-; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0
-; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1] offset:-8388608
-; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
+; GFX11-LABEL: flat_inst_valu_offset_neg_24bit_max:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0xff800000, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
+; GFX11-NEXT: flat_load_u8 v0, v[0:1]
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: flat_inst_valu_offset_neg_24bit_max:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: flat_load_u8 v0, v[0:1] offset:-8388608
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr inbounds i8, ptr %p, i64 -8388608
%load = load i8, ptr %gep, align 4
ret i8 %load
@@ -792,59 +452,23 @@ define i8 @flat_inst_valu_offset_2x_11bit_max(ptr %p) {
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-SDAG-TRUE16-LABEL: flat_inst_valu_offset_2x_11bit_max:
-; GFX11-SDAG-TRUE16: ; %bb.0:
-; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: flat_load_d16_u8 v0, v[0:1] offset:4095
-; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-SDAG-FAKE16-LABEL: flat_inst_valu_offset_2x_11bit_max:
-; GFX11-SDAG-FAKE16: ; %bb.0:
-; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: flat_load_u8 v0, v[0:1] offset:4095
-; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX12-SDAG-TRUE16-LABEL: flat_inst_valu_offset_2x_11bit_max:
-; GFX12-SDAG-TRUE16: ; %bb.0:
-; GFX12-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_expcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_samplecnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_bvhcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: flat_load_d16_u8 v0, v[0:1] offset:4095
-; GFX12-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX12-SDAG-FAKE16-LABEL: flat_inst_valu_offset_2x_11bit_max:
-; GFX12-SDAG-FAKE16: ; %bb.0:
-; GFX12-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_expcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_samplecnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_bvhcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: flat_load_u8 v0, v[0:1] offset:4095
-; GFX12-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-GISEL-LABEL: flat_inst_valu_offset_2x_11bit_max:
-; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT: flat_load_u8 v0, v[0:1] offset:4095
-; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX12-GISEL-LABEL: flat_inst_valu_offset_2x_11bit_max:
-; GFX12-GISEL: ; %bb.0:
-; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-GISEL-NEXT: s_wait_expcnt 0x0
-; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0
-; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0
-; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1] offset:4095
-; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
+; GFX11-LABEL: flat_inst_valu_offset_2x_11bit_max:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: flat_load_u8 v0, v[0:1] offset:4095
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: flat_inst_valu_offset_2x_11bit_max:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: flat_load_u8 v0, v[0:1] offset:4095
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr inbounds i8, ptr %p, i64 4095
%load = load i8, ptr %gep, align 4
ret i8 %load
@@ -869,47 +493,26 @@ define i8 @flat_inst_valu_offset_2x_12bit_max(ptr %p) {
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-SDAG-TRUE16-LABEL: flat_inst_valu_offset_2x_12bit_max:
-; GFX11-SDAG-TRUE16: ; %bb.0:
-; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x1000, v0
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT: flat_load_d16_u8 v0, v[0:1] offset:4095
-; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-SDAG-FAKE16-LABEL: flat_inst_valu_offset_2x_12bit_max:
-; GFX11-SDAG-FAKE16: ; %bb.0:
-; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x1000, v0
-; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX11-SDAG-FAKE16-NEXT: flat_load_u8 v0, v[0:1] offset:4095
-; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX12-SDAG-TRUE16-LABEL: flat_inst_valu_offset_2x_12bit_max:
-; GFX12-SDAG-TRUE16: ; %bb.0:
-; GFX12-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_expcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_samplecnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_bvhcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: flat_load_d16_u8 v0, v[0:1] offset:8191
-; GFX12-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX12-SDAG-FAKE16-LABEL: flat_inst_valu_offset_2x_12bit_max:
-; GFX12-SDAG-FAKE16: ; %bb.0:
-; GFX12-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_expcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_samplecnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_bvhcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: flat_load_u8 v0, v[0:1] offset:8191
-; GFX12-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-SDAG-LABEL: flat_inst_valu_offset_2x_12bit_max:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x1000, v0
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX11-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:4095
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: flat_inst_valu_offset_2x_12bit_max:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: flat_load_u8 v0, v[0:1] offset:8191
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-GISEL-LABEL: flat_inst_valu_offset_2x_12bit_max:
; GFX9-GISEL: ; %bb.0:
@@ -929,17 +532,6 @@ define i8 @flat_inst_valu_offset_2x_12bit_max(ptr %p) {
; GFX11-GISEL-NEXT: flat_load_u8 v0, v[0:1]
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX12-GISEL-LABEL: flat_inst_valu_offset_2x_12bit_max:
-; GFX12-GISEL: ; %bb.0:
-; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-GISEL-NEXT: s_wait_expcnt 0x0
-; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0
-; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0
-; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1] offset:8191
-; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr inbounds i8, ptr %p, i64 8191
%load = load i8, ptr %gep, align 4
ret i8 %load
@@ -964,47 +556,26 @@ define i8 @flat_inst_valu_offset_2x_13bit_max(ptr %p) {
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-SDAG-TRUE16-LABEL: flat_inst_valu_offset_2x_13bit_max:
-; GFX11-SDAG-TRUE16: ; %bb.0:
-; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x3000, v0
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT: flat_load_d16_u8 v0, v[0:1] offset:4095
-; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-SDAG-FAKE16-LABEL: flat_inst_valu_offset_2x_13bit_max:
-; GFX11-SDAG-FAKE16: ; %bb.0:
-; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x3000, v0
-; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX11-SDAG-FAKE16-NEXT: flat_load_u8 v0, v[0:1] offset:4095
-; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX12-SDAG-TRUE16-LABEL: flat_inst_valu_offset_2x_13bit_max:
-; GFX12-SDAG-TRUE16: ; %bb.0:
-; GFX12-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_expcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_samplecnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_bvhcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: flat_load_d16_u8 v0, v[0:1] offset:16383
-; GFX12-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX12-SDAG-FAKE16-LABEL: flat_inst_valu_offset_2x_13bit_max:
-; GFX12-SDAG-FAKE16: ; %bb.0:
-; GFX12-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_expcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_samplecnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_bvhcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: flat_load_u8 v0, v[0:1] offset:16383
-; GFX12-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-SDAG-LABEL: flat_inst_valu_offset_2x_13bit_max:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x3000, v0
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX11-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:4095
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: flat_inst_valu_offset_2x_13bit_max:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: flat_load_u8 v0, v[0:1] offset:16383
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-GISEL-LABEL: flat_inst_valu_offset_2x_13bit_max:
; GFX9-GISEL: ; %bb.0:
@@ -1024,17 +595,6 @@ define i8 @flat_inst_valu_offset_2x_13bit_max(ptr %p) {
; GFX11-GISEL-NEXT: flat_load_u8 v0, v[0:1]
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX12-GISEL-LABEL: flat_inst_valu_offset_2x_13bit_max:
-; GFX12-GISEL: ; %bb.0:
-; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-GISEL-NEXT: s_wait_expcnt 0x0
-; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0
-; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0
-; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1] offset:16383
-; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr inbounds i8, ptr %p, i64 16383
%load = load i8, ptr %gep, align 4
ret i8 %load
@@ -1059,53 +619,29 @@ define i8 @flat_inst_valu_offset_2x_24bit_max(ptr %p) {
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-SDAG-TRUE16-LABEL: flat_inst_valu_offset_2x_24bit_max:
-; GFX11-SDAG-TRUE16: ; %bb.0:
-; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0xfff000, v0
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT: flat_load_d16_u8 v0, v[0:1] offset:4094
-; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-SDAG-FAKE16-LABEL: flat_inst_valu_offset_2x_24bit_max:
-; GFX11-SDAG-FAKE16: ; %bb.0:
-; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: v_add_co_u32 v0, vcc_lo, 0xfff000, v0
-; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX11-SDAG-FAKE16-NEXT: flat_load_u8 v0, v[0:1] offset:4094
-; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX12-SDAG-TRUE16-LABEL: flat_inst_valu_offset_2x_24bit_max:
-; GFX12-SDAG-TRUE16: ; %bb.0:
-; GFX12-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_expcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_samplecnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_bvhcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x800000, v0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-SDAG-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX12-SDAG-TRUE16-NEXT: flat_load_d16_u8 v0, v[0:1] offset:8388606
-; GFX12-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX12-SDAG-FAKE16-LABEL: flat_inst_valu_offset_2x_24bit_max:
-; GFX12-SDAG-FAKE16: ; %bb.0:
-; GFX12-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_expcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_samplecnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_bvhcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x800000, v0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_alu 0xfffd
-; GFX12-SDAG-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX12-SDAG-FAKE16-NEXT: flat_load_u8 v0, v[0:1] offset:8388606
-; GFX12-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-SDAG-LABEL: flat_inst_valu_offset_2x_24bit_max:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0xfff000, v0
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX11-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:4094
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-LABEL: flat_inst_valu_offset_2x_24bit_max:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-NEXT: s_wait_expcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0
+; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x800000, v0
+; GFX12-SDAG-NEXT: s_wait_alu 0xfffd
+; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX12-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:8388606
+; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-GISEL-LABEL: flat_inst_valu_offset_2x_24bit_max:
; GFX9-GISEL: ; %bb.0:
@@ -1163,68 +699,26 @@ define i8 @flat_inst_valu_offset_2x_neg_11bit_max(ptr %p) {
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-SDAG-TRUE16-LABEL: flat_inst_valu_offset_2x_neg_11bit_max:
-; GFX11-SDAG-TRUE16: ; %bb.0:
-; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff000, v0
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT: flat_load_d16_u8 v0, v[0:1]
-; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-SDAG-FAKE16-LABEL: flat_inst_valu_offset_2x_neg_11bit_max:
-; GFX11-SDAG-FAKE16: ; %bb.0:
-; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff000, v0
-; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
-; GFX11-SDAG-FAKE16-NEXT: flat_load_u8 v0, v[0:1]
-; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX12-SDAG-TRUE16-LABEL: flat_inst_valu_offset_2x_neg_11bit_max:
-; GFX12-SDAG-TRUE16: ; %bb.0:
-; GFX12-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_expcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_samplecnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_bvhcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: flat_load_d16_u8 v0, v[0:1] offset:-4096
-; GFX12-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX12-SDAG-FAKE16-LABEL: flat_inst_valu_offset_2x_neg_11bit_max:
-; GFX12-SDAG-FAKE16: ; %bb.0:
-; GFX12-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_expcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_samplecnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_bvhcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: flat_load_u8 v0, v[0:1] offset:-4096
-; GFX12-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-GISEL-LABEL: flat_inst_valu_offset_2x_neg_11bit_max:
-; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff000, v0
-; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
-; GFX11-GISEL-NEXT: flat_load_u8 v0, v[0:1]
-; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX12-GISEL-LABEL: flat_inst_valu_offset_2x_neg_11bit_max:
-; GFX12-GISEL: ; %bb.0:
-; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-GISEL-NEXT: s_wait_expcnt 0x0
-; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0
-; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0
-; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1] offset:-4096
-; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
+; GFX11-LABEL: flat_inst_valu_offset_2x_neg_11bit_max:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff000, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
+; GFX11-NEXT: flat_load_u8 v0, v[0:1]
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: flat_inst_valu_offset_2x_neg_11bit_max:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: flat_load_u8 v0, v[0:1] offset:-4096
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr inbounds i8, ptr %p, i64 -4096
%load = load i8, ptr %gep, align 4
ret i8 %load
@@ -1249,68 +743,26 @@ define i8 @flat_inst_valu_offset_2x_neg_12bit_max(ptr %p) {
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-SDAG-TRUE16-LABEL: flat_inst_valu_offset_2x_neg_12bit_max:
-; GFX11-SDAG-TRUE16: ; %bb.0:
-; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0xffffe000, v0
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT: flat_load_d16_u8 v0, v[0:1]
-; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-SDAG-FAKE16-LABEL: flat_inst_valu_offset_2x_neg_12bit_max:
-; GFX11-SDAG-FAKE16: ; %bb.0:
-; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: v_add_co_u32 v0, vcc_lo, 0xffffe000, v0
-; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
-; GFX11-SDAG-FAKE16-NEXT: flat_load_u8 v0, v[0:1]
-; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX12-SDAG-TRUE16-LABEL: flat_inst_valu_offset_2x_neg_12bit_max:
-; GFX12-SDAG-TRUE16: ; %bb.0:
-; GFX12-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_expcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_samplecnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_bvhcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: flat_load_d16_u8 v0, v[0:1] offset:-8192
-; GFX12-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX12-SDAG-FAKE16-LABEL: flat_inst_valu_offset_2x_neg_12bit_max:
-; GFX12-SDAG-FAKE16: ; %bb.0:
-; GFX12-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_expcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_samplecnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_bvhcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: flat_load_u8 v0, v[0:1] offset:-8192
-; GFX12-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-GISEL-LABEL: flat_inst_valu_offset_2x_neg_12bit_max:
-; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, 0xffffe000, v0
-; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
-; GFX11-GISEL-NEXT: flat_load_u8 v0, v[0:1]
-; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX12-GISEL-LABEL: flat_inst_valu_offset_2x_neg_12bit_max:
-; GFX12-GISEL: ; %bb.0:
-; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-GISEL-NEXT: s_wait_expcnt 0x0
-; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0
-; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0
-; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1] offset:-8192
-; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
+; GFX11-LABEL: flat_inst_valu_offset_2x_neg_12bit_max:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0xffffe000, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
+; GFX11-NEXT: flat_load_u8 v0, v[0:1]
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: flat_inst_valu_offset_2x_neg_12bit_max:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: flat_load_u8 v0, v[0:1] offset:-8192
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr inbounds i8, ptr %p, i64 -8192
%load = load i8, ptr %gep, align 4
ret i8 %load
@@ -1335,68 +787,26 @@ define i8 @flat_inst_valu_offset_2x_neg_13bit_max(ptr %p) {
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-SDAG-TRUE16-LABEL: flat_inst_valu_offset_2x_neg_13bit_max:
-; GFX11-SDAG-TRUE16: ; %bb.0:
-; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0xffffc000, v0
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT: flat_load_d16_u8 v0, v[0:1]
-; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-SDAG-FAKE16-LABEL: flat_inst_valu_offset_2x_neg_13bit_max:
-; GFX11-SDAG-FAKE16: ; %bb.0:
-; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: v_add_co_u32 v0, vcc_lo, 0xffffc000, v0
-; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
-; GFX11-SDAG-FAKE16-NEXT: flat_load_u8 v0, v[0:1]
-; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX12-SDAG-TRUE16-LABEL: flat_inst_valu_offset_2x_neg_13bit_max:
-; GFX12-SDAG-TRUE16: ; %bb.0:
-; GFX12-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_expcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_samplecnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_bvhcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: flat_load_d16_u8 v0, v[0:1] offset:-16384
-; GFX12-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX12-SDAG-FAKE16-LABEL: flat_inst_valu_offset_2x_neg_13bit_max:
-; GFX12-SDAG-FAKE16: ; %bb.0:
-; GFX12-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_expcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_samplecnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_bvhcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: flat_load_u8 v0, v[0:1] offset:-16384
-; GFX12-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-GISEL-LABEL: flat_inst_valu_offset_2x_neg_13bit_max:
-; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, 0xffffc000, v0
-; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
-; GFX11-GISEL-NEXT: flat_load_u8 v0, v[0:1]
-; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX12-GISEL-LABEL: flat_inst_valu_offset_2x_neg_13bit_max:
-; GFX12-GISEL: ; %bb.0:
-; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-GISEL-NEXT: s_wait_expcnt 0x0
-; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0
-; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0
-; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1] offset:-16384
-; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
+; GFX11-LABEL: flat_inst_valu_offset_2x_neg_13bit_max:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0xffffc000, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
+; GFX11-NEXT: flat_load_u8 v0, v[0:1]
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: flat_inst_valu_offset_2x_neg_13bit_max:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: flat_load_u8 v0, v[0:1] offset:-16384
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr inbounds i8, ptr %p, i64 -16384
%load = load i8, ptr %gep, align 4
ret i8 %load
@@ -1421,63 +831,29 @@ define i8 @flat_inst_valu_offset_2x_neg_24bit_max(ptr %p) {
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-SDAG-TRUE16-LABEL: flat_inst_valu_offset_2x_neg_24bit_max:
-; GFX11-SDAG-TRUE16: ; %bb.0:
-; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0xff000001, v0
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT: flat_load_d16_u8 v0, v[0:1]
-; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-SDAG-FAKE16-LABEL: flat_inst_valu_offset_2x_neg_24bit_max:
-; GFX11-SDAG-FAKE16: ; %bb.0:
-; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: v_add_co_u32 v0, vcc_lo, 0xff000001, v0
-; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
-; GFX11-SDAG-FAKE16-NEXT: flat_load_u8 v0, v[0:1]
-; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX12-SDAG-TRUE16-LABEL: flat_inst_valu_offset_2x_neg_24bit_max:
-; GFX12-SDAG-TRUE16: ; %bb.0:
-; GFX12-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_expcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_samplecnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_bvhcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0xff800000, v0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-SDAG-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
-; GFX12-SDAG-TRUE16-NEXT: flat_load_d16_u8 v0, v[0:1] offset:-8388607
-; GFX12-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX12-SDAG-FAKE16-LABEL: flat_inst_valu_offset_2x_neg_24bit_max:
-; GFX12-SDAG-FAKE16: ; %bb.0:
-; GFX12-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_expcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_samplecnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_bvhcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: v_add_co_u32 v0, vcc_lo, 0xff800000, v0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_alu 0xfffd
-; GFX12-SDAG-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
-; GFX12-SDAG-FAKE16-NEXT: flat_load_u8 v0, v[0:1] offset:-8388607
-; GFX12-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-GISEL-LABEL: flat_inst_valu_offset_2x_neg_24bit_max:
-; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, 0xff000001, v0
-; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
-; GFX11-GISEL-NEXT: flat_load_u8 v0, v[0:1]
-; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
+; GFX11-LABEL: flat_inst_valu_offset_2x_neg_24bit_max:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0xff000001, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
+; GFX11-NEXT: flat_load_u8 v0, v[0:1]
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-LABEL: flat_inst_valu_offset_2x_neg_24bit_max:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-NEXT: s_wait_expcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0
+; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0xff800000, v0
+; GFX12-SDAG-NEXT: s_wait_alu 0xfffd
+; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
+; GFX12-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:-8388607
+; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-GISEL-LABEL: flat_inst_valu_offset_2x_neg_24bit_max:
; GFX12-GISEL: ; %bb.0:
@@ -1517,53 +893,29 @@ define i8 @flat_inst_valu_offset_64bit_11bit_split0(ptr %p) {
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-SDAG-TRUE16-LABEL: flat_inst_valu_offset_64bit_11bit_split0:
-; GFX11-SDAG-TRUE16: ; %bb.0:
-; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0, v0
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 2, v1, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT: flat_load_d16_u8 v0, v[0:1] offset:2047
-; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-SDAG-FAKE16-LABEL: flat_inst_valu_offset_64bit_11bit_split0:
-; GFX11-SDAG-FAKE16: ; %bb.0:
-; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: v_add_co_u32 v0, vcc_lo, 0, v0
-; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 2, v1, vcc_lo
-; GFX11-SDAG-FAKE16-NEXT: flat_load_u8 v0, v[0:1] offset:2047
-; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX12-SDAG-TRUE16-LABEL: flat_inst_valu_offset_64bit_11bit_split0:
-; GFX12-SDAG-TRUE16: ; %bb.0:
-; GFX12-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_expcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_samplecnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_bvhcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0, v0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-SDAG-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 2, v1, vcc_lo
-; GFX12-SDAG-TRUE16-NEXT: flat_load_d16_u8 v0, v[0:1] offset:2047
-; GFX12-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX12-SDAG-FAKE16-LABEL: flat_inst_valu_offset_64bit_11bit_split0:
-; GFX12-SDAG-FAKE16: ; %bb.0:
-; GFX12-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_expcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_samplecnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_bvhcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: v_add_co_u32 v0, vcc_lo, 0, v0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_alu 0xfffd
-; GFX12-SDAG-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 2, v1, vcc_lo
-; GFX12-SDAG-FAKE16-NEXT: flat_load_u8 v0, v[0:1] offset:2047
-; GFX12-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-SDAG-LABEL: flat_inst_valu_offset_64bit_11bit_split0:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0, v0
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, v1, vcc_lo
+; GFX11-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:2047
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-LABEL: flat_inst_valu_offset_64bit_11bit_split0:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-NEXT: s_wait_expcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0
+; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0, v0
+; GFX12-SDAG-NEXT: s_wait_alu 0xfffd
+; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, v1, vcc_lo
+; GFX12-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:2047
+; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-GISEL-LABEL: flat_inst_valu_offset_64bit_11bit_split0:
; GFX9-GISEL: ; %bb.0:
@@ -1622,53 +974,29 @@ define i8 @flat_inst_valu_offset_64bit_11bit_split1(ptr %p) {
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-SDAG-TRUE16-LABEL: flat_inst_valu_offset_64bit_11bit_split1:
-; GFX11-SDAG-TRUE16: ; %bb.0:
-; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0, v0
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 2, v1, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT: flat_load_d16_u8 v0, v[0:1] offset:2048
-; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-SDAG-FAKE16-LABEL: flat_inst_valu_offset_64bit_11bit_split1:
-; GFX11-SDAG-FAKE16: ; %bb.0:
-; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: v_add_co_u32 v0, vcc_lo, 0, v0
-; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 2, v1, vcc_lo
-; GFX11-SDAG-FAKE16-NEXT: flat_load_u8 v0, v[0:1] offset:2048
-; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX12-SDAG-TRUE16-LABEL: flat_inst_valu_offset_64bit_11bit_split1:
-; GFX12-SDAG-TRUE16: ; %bb.0:
-; GFX12-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_expcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_samplecnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_bvhcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0, v0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-SDAG-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 2, v1, vcc_lo
-; GFX12-SDAG-TRUE16-NEXT: flat_load_d16_u8 v0, v[0:1] offset:2048
-; GFX12-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX12-SDAG-FAKE16-LABEL: flat_inst_valu_offset_64bit_11bit_split1:
-; GFX12-SDAG-FAKE16: ; %bb.0:
-; GFX12-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_expcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_samplecnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_bvhcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: v_add_co_u32 v0, vcc_lo, 0, v0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_alu 0xfffd
-; GFX12-SDAG-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 2, v1, vcc_lo
-; GFX12-SDAG-FAKE16-NEXT: flat_load_u8 v0, v[0:1] offset:2048
-; GFX12-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-SDAG-LABEL: flat_inst_valu_offset_64bit_11bit_split1:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0, v0
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, v1, vcc_lo
+; GFX11-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:2048
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-LABEL: flat_inst_valu_offset_64bit_11bit_split1:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-NEXT: s_wait_expcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0
+; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0, v0
+; GFX12-SDAG-NEXT: s_wait_alu 0xfffd
+; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, v1, vcc_lo
+; GFX12-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:2048
+; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-GISEL-LABEL: flat_inst_valu_offset_64bit_11bit_split1:
; GFX9-GISEL: ; %bb.0:
@@ -1727,53 +1055,29 @@ define i8 @flat_inst_valu_offset_64bit_12bit_split0(ptr %p) {
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-SDAG-TRUE16-LABEL: flat_inst_valu_offset_64bit_12bit_split0:
-; GFX11-SDAG-TRUE16: ; %bb.0:
-; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0, v0
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 2, v1, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT: flat_load_d16_u8 v0, v[0:1] offset:4095
-; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-SDAG-FAKE16-LABEL: flat_inst_valu_offset_64bit_12bit_split0:
-; GFX11-SDAG-FAKE16: ; %bb.0:
-; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: v_add_co_u32 v0, vcc_lo, 0, v0
-; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 2, v1, vcc_lo
-; GFX11-SDAG-FAKE16-NEXT: flat_load_u8 v0, v[0:1] offset:4095
-; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX12-SDAG-TRUE16-LABEL: flat_inst_valu_offset_64bit_12bit_split0:
-; GFX12-SDAG-TRUE16: ; %bb.0:
-; GFX12-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_expcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_samplecnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_bvhcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0, v0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-SDAG-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 2, v1, vcc_lo
-; GFX12-SDAG-TRUE16-NEXT: flat_load_d16_u8 v0, v[0:1] offset:4095
-; GFX12-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX12-SDAG-FAKE16-LABEL: flat_inst_valu_offset_64bit_12bit_split0:
-; GFX12-SDAG-FAKE16: ; %bb.0:
-; GFX12-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_expcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_samplecnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_bvhcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: v_add_co_u32 v0, vcc_lo, 0, v0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_alu 0xfffd
-; GFX12-SDAG-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 2, v1, vcc_lo
-; GFX12-SDAG-FAKE16-NEXT: flat_load_u8 v0, v[0:1] offset:4095
-; GFX12-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-SDAG-LABEL: flat_inst_valu_offset_64bit_12bit_split0:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0, v0
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, v1, vcc_lo
+; GFX11-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:4095
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-LABEL: flat_inst_valu_offset_64bit_12bit_split0:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-NEXT: s_wait_expcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0
+; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0, v0
+; GFX12-SDAG-NEXT: s_wait_alu 0xfffd
+; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, v1, vcc_lo
+; GFX12-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:4095
+; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-GISEL-LABEL: flat_inst_valu_offset_64bit_12bit_split0:
; GFX9-GISEL: ; %bb.0:
@@ -1832,63 +1136,29 @@ define i8 @flat_inst_valu_offset_64bit_12bit_split1(ptr %p) {
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-SDAG-TRUE16-LABEL: flat_inst_valu_offset_64bit_12bit_split1:
-; GFX11-SDAG-TRUE16: ; %bb.0:
-; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x1000, v0
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 2, v1, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT: flat_load_d16_u8 v0, v[0:1]
-; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-SDAG-FAKE16-LABEL: flat_inst_valu_offset_64bit_12bit_split1:
-; GFX11-SDAG-FAKE16: ; %bb.0:
-; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x1000, v0
-; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 2, v1, vcc_lo
-; GFX11-SDAG-FAKE16-NEXT: flat_load_u8 v0, v[0:1]
-; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX12-SDAG-TRUE16-LABEL: flat_inst_valu_offset_64bit_12bit_split1:
-; GFX12-SDAG-TRUE16: ; %bb.0:
-; GFX12-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_expcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_samplecnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_bvhcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0, v0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-SDAG-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 2, v1, vcc_lo
-; GFX12-SDAG-TRUE16-NEXT: flat_load_d16_u8 v0, v[0:1] offset:4096
-; GFX12-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX12-SDAG-FAKE16-LABEL: flat_inst_valu_offset_64bit_12bit_split1:
-; GFX12-SDAG-FAKE16: ; %bb.0:
-; GFX12-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_expcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_samplecnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_bvhcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: v_add_co_u32 v0, vcc_lo, 0, v0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_alu 0xfffd
-; GFX12-SDAG-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 2, v1, vcc_lo
-; GFX12-SDAG-FAKE16-NEXT: flat_load_u8 v0, v[0:1] offset:4096
-; GFX12-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-GISEL-LABEL: flat_inst_valu_offset_64bit_12bit_split1:
-; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, 0x1000, v0
-; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 2, v1, vcc_lo
-; GFX11-GISEL-NEXT: flat_load_u8 v0, v[0:1]
-; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
+; GFX11-LABEL: flat_inst_valu_offset_64bit_12bit_split1:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0x1000, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 2, v1, vcc_lo
+; GFX11-NEXT: flat_load_u8 v0, v[0:1]
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-LABEL: flat_inst_valu_offset_64bit_12bit_split1:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-NEXT: s_wait_expcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0
+; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0, v0
+; GFX12-SDAG-NEXT: s_wait_alu 0xfffd
+; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, v1, vcc_lo
+; GFX12-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:4096
+; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-GISEL-LABEL: flat_inst_valu_offset_64bit_12bit_split1:
; GFX12-GISEL: ; %bb.0:
@@ -1928,53 +1198,29 @@ define i8 @flat_inst_valu_offset_64bit_13bit_split0(ptr %p) {
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-SDAG-TRUE16-LABEL: flat_inst_valu_offset_64bit_13bit_split0:
-; GFX11-SDAG-TRUE16: ; %bb.0:
-; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x1000, v0
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 2, v1, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT: flat_load_d16_u8 v0, v[0:1] offset:4095
-; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-SDAG-FAKE16-LABEL: flat_inst_valu_offset_64bit_13bit_split0:
-; GFX11-SDAG-FAKE16: ; %bb.0:
-; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x1000, v0
-; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 2, v1, vcc_lo
-; GFX11-SDAG-FAKE16-NEXT: flat_load_u8 v0, v[0:1] offset:4095
-; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX12-SDAG-TRUE16-LABEL: flat_inst_valu_offset_64bit_13bit_split0:
-; GFX12-SDAG-TRUE16: ; %bb.0:
-; GFX12-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_expcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_samplecnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_bvhcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0, v0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-SDAG-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 2, v1, vcc_lo
-; GFX12-SDAG-TRUE16-NEXT: flat_load_d16_u8 v0, v[0:1] offset:8191
-; GFX12-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX12-SDAG-FAKE16-LABEL: flat_inst_valu_offset_64bit_13bit_split0:
-; GFX12-SDAG-FAKE16: ; %bb.0:
-; GFX12-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_expcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_samplecnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_bvhcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: v_add_co_u32 v0, vcc_lo, 0, v0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_alu 0xfffd
-; GFX12-SDAG-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 2, v1, vcc_lo
-; GFX12-SDAG-FAKE16-NEXT: flat_load_u8 v0, v[0:1] offset:8191
-; GFX12-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-SDAG-LABEL: flat_inst_valu_offset_64bit_13bit_split0:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x1000, v0
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, v1, vcc_lo
+; GFX11-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:4095
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-LABEL: flat_inst_valu_offset_64bit_13bit_split0:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-NEXT: s_wait_expcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0
+; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0, v0
+; GFX12-SDAG-NEXT: s_wait_alu 0xfffd
+; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, v1, vcc_lo
+; GFX12-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:8191
+; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-GISEL-LABEL: flat_inst_valu_offset_64bit_13bit_split0:
; GFX9-GISEL: ; %bb.0:
@@ -2033,63 +1279,29 @@ define i8 @flat_inst_valu_offset_64bit_13bit_split1(ptr %p) {
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-SDAG-TRUE16-LABEL: flat_inst_valu_offset_64bit_13bit_split1:
-; GFX11-SDAG-TRUE16: ; %bb.0:
-; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x2000, v0
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 2, v1, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT: flat_load_d16_u8 v0, v[0:1]
-; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-SDAG-FAKE16-LABEL: flat_inst_valu_offset_64bit_13bit_split1:
-; GFX11-SDAG-FAKE16: ; %bb.0:
-; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x2000, v0
-; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 2, v1, vcc_lo
-; GFX11-SDAG-FAKE16-NEXT: flat_load_u8 v0, v[0:1]
-; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX12-SDAG-TRUE16-LABEL: flat_inst_valu_offset_64bit_13bit_split1:
-; GFX12-SDAG-TRUE16: ; %bb.0:
-; GFX12-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_expcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_samplecnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_bvhcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0, v0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-SDAG-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 2, v1, vcc_lo
-; GFX12-SDAG-TRUE16-NEXT: flat_load_d16_u8 v0, v[0:1] offset:8192
-; GFX12-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX12-SDAG-FAKE16-LABEL: flat_inst_valu_offset_64bit_13bit_split1:
-; GFX12-SDAG-FAKE16: ; %bb.0:
-; GFX12-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_expcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_samplecnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_bvhcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: v_add_co_u32 v0, vcc_lo, 0, v0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_alu 0xfffd
-; GFX12-SDAG-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 2, v1, vcc_lo
-; GFX12-SDAG-FAKE16-NEXT: flat_load_u8 v0, v[0:1] offset:8192
-; GFX12-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-GISEL-LABEL: flat_inst_valu_offset_64bit_13bit_split1:
-; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, 0x2000, v0
-; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 2, v1, vcc_lo
-; GFX11-GISEL-NEXT: flat_load_u8 v0, v[0:1]
-; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
+; GFX11-LABEL: flat_inst_valu_offset_64bit_13bit_split1:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0x2000, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 2, v1, vcc_lo
+; GFX11-NEXT: flat_load_u8 v0, v[0:1]
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-LABEL: flat_inst_valu_offset_64bit_13bit_split1:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-NEXT: s_wait_expcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0
+; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0, v0
+; GFX12-SDAG-NEXT: s_wait_alu 0xfffd
+; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, v1, vcc_lo
+; GFX12-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:8192
+; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-GISEL-LABEL: flat_inst_valu_offset_64bit_13bit_split1:
; GFX12-GISEL: ; %bb.0:
@@ -2130,53 +1342,29 @@ define i8 @flat_inst_valu_offset_64bit_11bit_neg_high_split0(ptr %p) {
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-SDAG-TRUE16-LABEL: flat_inst_valu_offset_64bit_11bit_neg_high_split0:
-; GFX11-SDAG-TRUE16: ; %bb.0:
-; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x7ff, v0
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT: flat_load_d16_u8 v0, v[0:1]
-; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-SDAG-FAKE16-LABEL: flat_inst_valu_offset_64bit_11bit_neg_high_split0:
-; GFX11-SDAG-FAKE16: ; %bb.0:
-; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x7ff, v0
-; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
-; GFX11-SDAG-FAKE16-NEXT: flat_load_u8 v0, v[0:1]
-; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX12-SDAG-TRUE16-LABEL: flat_inst_valu_offset_64bit_11bit_neg_high_split0:
-; GFX12-SDAG-TRUE16: ; %bb.0:
-; GFX12-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_expcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_samplecnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_bvhcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x800000, v0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-SDAG-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
-; GFX12-SDAG-TRUE16-NEXT: flat_load_d16_u8 v0, v[0:1] offset:-8386561
-; GFX12-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX12-SDAG-FAKE16-LABEL: flat_inst_valu_offset_64bit_11bit_neg_high_split0:
-; GFX12-SDAG-FAKE16: ; %bb.0:
-; GFX12-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_expcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_samplecnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_bvhcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x800000, v0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_alu 0xfffd
-; GFX12-SDAG-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
-; GFX12-SDAG-FAKE16-NEXT: flat_load_u8 v0, v[0:1] offset:-8386561
-; GFX12-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-LABEL: flat_inst_valu_offset_64bit_11bit_neg_high_split0:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0x7ff, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
+; GFX11-NEXT: flat_load_u8 v0, v[0:1]
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-LABEL: flat_inst_valu_offset_64bit_11bit_neg_high_split0:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-NEXT: s_wait_expcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0
+; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x800000, v0
+; GFX12-SDAG-NEXT: s_wait_alu 0xfffd
+; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
+; GFX12-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:-8386561
+; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-GISEL-LABEL: flat_inst_valu_offset_64bit_11bit_neg_high_split0:
; GFX9-GISEL: ; %bb.0:
@@ -2188,16 +1376,6 @@ define i8 @flat_inst_valu_offset_64bit_11bit_neg_high_split0(ptr %p) {
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-GISEL-LABEL: flat_inst_valu_offset_64bit_11bit_neg_high_split0:
-; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, 0x7ff, v0
-; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
-; GFX11-GISEL-NEXT: flat_load_u8 v0, v[0:1]
-; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
-;
; GFX12-GISEL-LABEL: flat_inst_valu_offset_64bit_11bit_neg_high_split0:
; GFX12-GISEL: ; %bb.0:
; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -2237,53 +1415,29 @@ define i8 @flat_inst_valu_offset_64bit_11bit_neg_high_split1(ptr %p) {
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-SDAG-TRUE16-LABEL: flat_inst_valu_offset_64bit_11bit_neg_high_split1:
-; GFX11-SDAG-TRUE16: ; %bb.0:
-; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x800, v0
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT: flat_load_d16_u8 v0, v[0:1]
-; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-SDAG-FAKE16-LABEL: flat_inst_valu_offset_64bit_11bit_neg_high_split1:
-; GFX11-SDAG-FAKE16: ; %bb.0:
-; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x800, v0
-; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
-; GFX11-SDAG-FAKE16-NEXT: flat_load_u8 v0, v[0:1]
-; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX12-SDAG-TRUE16-LABEL: flat_inst_valu_offset_64bit_11bit_neg_high_split1:
-; GFX12-SDAG-TRUE16: ; %bb.0:
-; GFX12-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_expcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_samplecnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_bvhcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x800000, v0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-SDAG-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
-; GFX12-SDAG-TRUE16-NEXT: flat_load_d16_u8 v0, v[0:1] offset:-8386560
-; GFX12-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX12-SDAG-FAKE16-LABEL: flat_inst_valu_offset_64bit_11bit_neg_high_split1:
-; GFX12-SDAG-FAKE16: ; %bb.0:
-; GFX12-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_expcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_samplecnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_bvhcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x800000, v0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_alu 0xfffd
-; GFX12-SDAG-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
-; GFX12-SDAG-FAKE16-NEXT: flat_load_u8 v0, v[0:1] offset:-8386560
-; GFX12-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-LABEL: flat_inst_valu_offset_64bit_11bit_neg_high_split1:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0x800, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
+; GFX11-NEXT: flat_load_u8 v0, v[0:1]
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-LABEL: flat_inst_valu_offset_64bit_11bit_neg_high_split1:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-NEXT: s_wait_expcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0
+; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x800000, v0
+; GFX12-SDAG-NEXT: s_wait_alu 0xfffd
+; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
+; GFX12-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:-8386560
+; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-GISEL-LABEL: flat_inst_valu_offset_64bit_11bit_neg_high_split1:
; GFX9-GISEL: ; %bb.0:
@@ -2295,16 +1449,6 @@ define i8 @flat_inst_valu_offset_64bit_11bit_neg_high_split1(ptr %p) {
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-GISEL-LABEL: flat_inst_valu_offset_64bit_11bit_neg_high_split1:
-; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, 0x800, v0
-; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
-; GFX11-GISEL-NEXT: flat_load_u8 v0, v[0:1]
-; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
-;
; GFX12-GISEL-LABEL: flat_inst_valu_offset_64bit_11bit_neg_high_split1:
; GFX12-GISEL: ; %bb.0:
; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -2344,53 +1488,29 @@ define i8 @flat_inst_valu_offset_64bit_12bit_neg_high_split0(ptr %p) {
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-SDAG-TRUE16-LABEL: flat_inst_valu_offset_64bit_12bit_neg_high_split0:
-; GFX11-SDAG-TRUE16: ; %bb.0:
-; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0xfff, v0
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT: flat_load_d16_u8 v0, v[0:1]
-; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-SDAG-FAKE16-LABEL: flat_inst_valu_offset_64bit_12bit_neg_high_split0:
-; GFX11-SDAG-FAKE16: ; %bb.0:
-; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: v_add_co_u32 v0, vcc_lo, 0xfff, v0
-; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
-; GFX11-SDAG-FAKE16-NEXT: flat_load_u8 v0, v[0:1]
-; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX12-SDAG-TRUE16-LABEL: flat_inst_valu_offset_64bit_12bit_neg_high_split0:
-; GFX12-SDAG-TRUE16: ; %bb.0:
-; GFX12-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_expcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_samplecnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_bvhcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x800000, v0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-SDAG-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
-; GFX12-SDAG-TRUE16-NEXT: flat_load_d16_u8 v0, v[0:1] offset:-8384513
-; GFX12-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX12-SDAG-FAKE16-LABEL: flat_inst_valu_offset_64bit_12bit_neg_high_split0:
-; GFX12-SDAG-FAKE16: ; %bb.0:
-; GFX12-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_expcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_samplecnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_bvhcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x800000, v0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_alu 0xfffd
-; GFX12-SDAG-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
-; GFX12-SDAG-FAKE16-NEXT: flat_load_u8 v0, v[0:1] offset:-8384513
-; GFX12-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-LABEL: flat_inst_valu_offset_64bit_12bit_neg_high_split0:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0xfff, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
+; GFX11-NEXT: flat_load_u8 v0, v[0:1]
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-LABEL: flat_inst_valu_offset_64bit_12bit_neg_high_split0:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-NEXT: s_wait_expcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0
+; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x800000, v0
+; GFX12-SDAG-NEXT: s_wait_alu 0xfffd
+; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
+; GFX12-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:-8384513
+; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-GISEL-LABEL: flat_inst_valu_offset_64bit_12bit_neg_high_split0:
; GFX9-GISEL: ; %bb.0:
@@ -2402,16 +1522,6 @@ define i8 @flat_inst_valu_offset_64bit_12bit_neg_high_split0(ptr %p) {
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-GISEL-LABEL: flat_inst_valu_offset_64bit_12bit_neg_high_split0:
-; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, 0xfff, v0
-; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
-; GFX11-GISEL-NEXT: flat_load_u8 v0, v[0:1]
-; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
-;
; GFX12-GISEL-LABEL: flat_inst_valu_offset_64bit_12bit_neg_high_split0:
; GFX12-GISEL: ; %bb.0:
; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -2451,53 +1561,29 @@ define i8 @flat_inst_valu_offset_64bit_12bit_neg_high_split1(ptr %p) {
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-SDAG-TRUE16-LABEL: flat_inst_valu_offset_64bit_12bit_neg_high_split1:
-; GFX11-SDAG-TRUE16: ; %bb.0:
-; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x1000, v0
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT: flat_load_d16_u8 v0, v[0:1]
-; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-SDAG-FAKE16-LABEL: flat_inst_valu_offset_64bit_12bit_neg_high_split1:
-; GFX11-SDAG-FAKE16: ; %bb.0:
-; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x1000, v0
-; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
-; GFX11-SDAG-FAKE16-NEXT: flat_load_u8 v0, v[0:1]
-; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX12-SDAG-TRUE16-LABEL: flat_inst_valu_offset_64bit_12bit_neg_high_split1:
-; GFX12-SDAG-TRUE16: ; %bb.0:
-; GFX12-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_expcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_samplecnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_bvhcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x800000, v0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-SDAG-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
-; GFX12-SDAG-TRUE16-NEXT: flat_load_d16_u8 v0, v[0:1] offset:-8384512
-; GFX12-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX12-SDAG-FAKE16-LABEL: flat_inst_valu_offset_64bit_12bit_neg_high_split1:
-; GFX12-SDAG-FAKE16: ; %bb.0:
-; GFX12-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_expcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_samplecnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_bvhcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x800000, v0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_alu 0xfffd
-; GFX12-SDAG-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
-; GFX12-SDAG-FAKE16-NEXT: flat_load_u8 v0, v[0:1] offset:-8384512
-; GFX12-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-LABEL: flat_inst_valu_offset_64bit_12bit_neg_high_split1:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0x1000, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
+; GFX11-NEXT: flat_load_u8 v0, v[0:1]
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-LABEL: flat_inst_valu_offset_64bit_12bit_neg_high_split1:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-NEXT: s_wait_expcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0
+; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x800000, v0
+; GFX12-SDAG-NEXT: s_wait_alu 0xfffd
+; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
+; GFX12-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:-8384512
+; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-GISEL-LABEL: flat_inst_valu_offset_64bit_12bit_neg_high_split1:
; GFX9-GISEL: ; %bb.0:
@@ -2509,16 +1595,6 @@ define i8 @flat_inst_valu_offset_64bit_12bit_neg_high_split1(ptr %p) {
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-GISEL-LABEL: flat_inst_valu_offset_64bit_12bit_neg_high_split1:
-; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, 0x1000, v0
-; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
-; GFX11-GISEL-NEXT: flat_load_u8 v0, v[0:1]
-; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
-;
; GFX12-GISEL-LABEL: flat_inst_valu_offset_64bit_12bit_neg_high_split1:
; GFX12-GISEL: ; %bb.0:
; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -2558,53 +1634,29 @@ define i8 @flat_inst_valu_offset_64bit_13bit_neg_high_split0(ptr %p) {
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-SDAG-TRUE16-LABEL: flat_inst_valu_offset_64bit_13bit_neg_high_split0:
-; GFX11-SDAG-TRUE16: ; %bb.0:
-; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x1fff, v0
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT: flat_load_d16_u8 v0, v[0:1]
-; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-SDAG-FAKE16-LABEL: flat_inst_valu_offset_64bit_13bit_neg_high_split0:
-; GFX11-SDAG-FAKE16: ; %bb.0:
-; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x1fff, v0
-; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
-; GFX11-SDAG-FAKE16-NEXT: flat_load_u8 v0, v[0:1]
-; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX12-SDAG-TRUE16-LABEL: flat_inst_valu_offset_64bit_13bit_neg_high_split0:
-; GFX12-SDAG-TRUE16: ; %bb.0:
-; GFX12-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_expcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_samplecnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_bvhcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x800000, v0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-SDAG-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
-; GFX12-SDAG-TRUE16-NEXT: flat_load_d16_u8 v0, v[0:1] offset:-8380417
-; GFX12-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX12-SDAG-FAKE16-LABEL: flat_inst_valu_offset_64bit_13bit_neg_high_split0:
-; GFX12-SDAG-FAKE16: ; %bb.0:
-; GFX12-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_expcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_samplecnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_bvhcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x800000, v0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_alu 0xfffd
-; GFX12-SDAG-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
-; GFX12-SDAG-FAKE16-NEXT: flat_load_u8 v0, v[0:1] offset:-8380417
-; GFX12-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-LABEL: flat_inst_valu_offset_64bit_13bit_neg_high_split0:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0x1fff, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
+; GFX11-NEXT: flat_load_u8 v0, v[0:1]
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-LABEL: flat_inst_valu_offset_64bit_13bit_neg_high_split0:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-NEXT: s_wait_expcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0
+; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x800000, v0
+; GFX12-SDAG-NEXT: s_wait_alu 0xfffd
+; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
+; GFX12-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:-8380417
+; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-GISEL-LABEL: flat_inst_valu_offset_64bit_13bit_neg_high_split0:
; GFX9-GISEL: ; %bb.0:
@@ -2616,16 +1668,6 @@ define i8 @flat_inst_valu_offset_64bit_13bit_neg_high_split0(ptr %p) {
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-GISEL-LABEL: flat_inst_valu_offset_64bit_13bit_neg_high_split0:
-; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, 0x1fff, v0
-; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
-; GFX11-GISEL-NEXT: flat_load_u8 v0, v[0:1]
-; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
-;
; GFX12-GISEL-LABEL: flat_inst_valu_offset_64bit_13bit_neg_high_split0:
; GFX12-GISEL: ; %bb.0:
; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -2665,53 +1707,29 @@ define i8 @flat_inst_valu_offset_64bit_13bit_neg_high_split1(ptr %p) {
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-SDAG-TRUE16-LABEL: flat_inst_valu_offset_64bit_13bit_neg_high_split1:
-; GFX11-SDAG-TRUE16: ; %bb.0:
-; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x2000, v0
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT: flat_load_d16_u8 v0, v[0:1]
-; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-SDAG-FAKE16-LABEL: flat_inst_valu_offset_64bit_13bit_neg_high_split1:
-; GFX11-SDAG-FAKE16: ; %bb.0:
-; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x2000, v0
-; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
-; GFX11-SDAG-FAKE16-NEXT: flat_load_u8 v0, v[0:1]
-; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX12-SDAG-TRUE16-LABEL: flat_inst_valu_offset_64bit_13bit_neg_high_split1:
-; GFX12-SDAG-TRUE16: ; %bb.0:
-; GFX12-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_expcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_samplecnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_bvhcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x800000, v0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-SDAG-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
-; GFX12-SDAG-TRUE16-NEXT: flat_load_d16_u8 v0, v[0:1] offset:-8380416
-; GFX12-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX12-SDAG-FAKE16-LABEL: flat_inst_valu_offset_64bit_13bit_neg_high_split1:
-; GFX12-SDAG-FAKE16: ; %bb.0:
-; GFX12-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_expcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_samplecnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_bvhcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x800000, v0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_alu 0xfffd
-; GFX12-SDAG-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
-; GFX12-SDAG-FAKE16-NEXT: flat_load_u8 v0, v[0:1] offset:-8380416
-; GFX12-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-LABEL: flat_inst_valu_offset_64bit_13bit_neg_high_split1:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0x2000, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
+; GFX11-NEXT: flat_load_u8 v0, v[0:1]
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-LABEL: flat_inst_valu_offset_64bit_13bit_neg_high_split1:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-NEXT: s_wait_expcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0
+; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x800000, v0
+; GFX12-SDAG-NEXT: s_wait_alu 0xfffd
+; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
+; GFX12-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:-8380416
+; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-GISEL-LABEL: flat_inst_valu_offset_64bit_13bit_neg_high_split1:
; GFX9-GISEL: ; %bb.0:
@@ -2723,16 +1741,6 @@ define i8 @flat_inst_valu_offset_64bit_13bit_neg_high_split1(ptr %p) {
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-GISEL-LABEL: flat_inst_valu_offset_64bit_13bit_neg_high_split1:
-; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, 0x2000, v0
-; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
-; GFX11-GISEL-NEXT: flat_load_u8 v0, v[0:1]
-; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
-;
; GFX12-GISEL-LABEL: flat_inst_valu_offset_64bit_13bit_neg_high_split1:
; GFX12-GISEL: ; %bb.0:
; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -2776,65 +1784,25 @@ define amdgpu_kernel void @flat_inst_salu_offset_1(ptr %p) {
; GFX10-NEXT: flat_store_byte v[0:1], v0
; GFX10-NEXT: s_endpgm
;
-; GFX11-SDAG-TRUE16-LABEL: flat_inst_salu_offset_1:
-; GFX11-SDAG-TRUE16: ; %bb.0:
-; GFX11-SDAG-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-SDAG-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-SDAG-TRUE16-NEXT: flat_load_d16_u8 v0, v[0:1] offset:1 glc dlc
-; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: flat_store_b8 v[0:1], v0
-; GFX11-SDAG-TRUE16-NEXT: s_endpgm
-;
-; GFX11-SDAG-FAKE16-LABEL: flat_inst_salu_offset_1:
-; GFX11-SDAG-FAKE16: ; %bb.0:
-; GFX11-SDAG-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-SDAG-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-SDAG-FAKE16-NEXT: flat_load_u8 v0, v[0:1] offset:1 glc dlc
-; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: flat_store_b8 v[0:1], v0
-; GFX11-SDAG-FAKE16-NEXT: s_endpgm
-;
-; GFX12-SDAG-TRUE16-LABEL: flat_inst_salu_offset_1:
-; GFX12-SDAG-TRUE16: ; %bb.0:
-; GFX12-SDAG-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-SDAG-TRUE16-NEXT: flat_load_d16_u8 v0, v[0:1] offset:1 scope:SCOPE_SYS
-; GFX12-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: flat_store_b8 v[0:1], v0
-; GFX12-SDAG-TRUE16-NEXT: s_endpgm
-;
-; GFX12-SDAG-FAKE16-LABEL: flat_inst_salu_offset_1:
-; GFX12-SDAG-FAKE16: ; %bb.0:
-; GFX12-SDAG-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX12-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-SDAG-FAKE16-NEXT: flat_load_u8 v0, v[0:1] offset:1 scope:SCOPE_SYS
-; GFX12-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: flat_store_b8 v[0:1], v0
-; GFX12-SDAG-FAKE16-NEXT: s_endpgm
-;
-; GFX11-GISEL-LABEL: flat_inst_salu_offset_1:
-; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-GISEL-NEXT: flat_load_u8 v0, v[0:1] offset:1 glc dlc
-; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT: flat_store_b8 v[0:1], v0
-; GFX11-GISEL-NEXT: s_endpgm
-;
-; GFX12-GISEL-LABEL: flat_inst_salu_offset_1:
-; GFX12-GISEL: ; %bb.0:
-; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1] offset:1 scope:SCOPE_SYS
-; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-GISEL-NEXT: flat_store_b8 v[0:1], v0
-; GFX12-GISEL-NEXT: s_endpgm
+; GFX11-LABEL: flat_inst_salu_offset_1:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: flat_load_u8 v0, v[0:1] offset:1 glc dlc
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: flat_store_b8 v[0:1], v0
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: flat_inst_salu_offset_1:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT: flat_load_u8 v0, v[0:1] offset:1 scope:SCOPE_SYS
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: flat_store_b8 v[0:1], v0
+; GFX12-NEXT: s_endpgm
%gep = getelementptr inbounds i8, ptr %p, i64 1
%load = load volatile i8, ptr %gep, align 1
store i8 %load, ptr poison
@@ -2866,65 +1834,25 @@ define amdgpu_kernel void @flat_inst_salu_offset_11bit_max(ptr %p) {
; GFX10-NEXT: flat_store_byte v[0:1], v0
; GFX10-NEXT: s_endpgm
;
-; GFX11-SDAG-TRUE16-LABEL: flat_inst_salu_offset_11bit_max:
-; GFX11-SDAG-TRUE16: ; %bb.0:
-; GFX11-SDAG-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-SDAG-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-SDAG-TRUE16-NEXT: flat_load_d16_u8 v0, v[0:1] offset:2047 glc dlc
-; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: flat_store_b8 v[0:1], v0
-; GFX11-SDAG-TRUE16-NEXT: s_endpgm
-;
-; GFX11-SDAG-FAKE16-LABEL: flat_inst_salu_offset_11bit_max:
-; GFX11-SDAG-FAKE16: ; %bb.0:
-; GFX11-SDAG-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-SDAG-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-SDAG-FAKE16-NEXT: flat_load_u8 v0, v[0:1] offset:2047 glc dlc
-; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: flat_store_b8 v[0:1], v0
-; GFX11-SDAG-FAKE16-NEXT: s_endpgm
-;
-; GFX12-SDAG-TRUE16-LABEL: flat_inst_salu_offset_11bit_max:
-; GFX12-SDAG-TRUE16: ; %bb.0:
-; GFX12-SDAG-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-SDAG-TRUE16-NEXT: flat_load_d16_u8 v0, v[0:1] offset:2047 scope:SCOPE_SYS
-; GFX12-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: flat_store_b8 v[0:1], v0
-; GFX12-SDAG-TRUE16-NEXT: s_endpgm
-;
-; GFX12-SDAG-FAKE16-LABEL: flat_inst_salu_offset_11bit_max:
-; GFX12-SDAG-FAKE16: ; %bb.0:
-; GFX12-SDAG-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX12-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-SDAG-FAKE16-NEXT: flat_load_u8 v0, v[0:1] offset:2047 scope:SCOPE_SYS
-; GFX12-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: flat_store_b8 v[0:1], v0
-; GFX12-SDAG-FAKE16-NEXT: s_endpgm
-;
-; GFX11-GISEL-LABEL: flat_inst_salu_offset_11bit_max:
-; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-GISEL-NEXT: flat_load_u8 v0, v[0:1] offset:2047 glc dlc
-; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT: flat_store_b8 v[0:1], v0
-; GFX11-GISEL-NEXT: s_endpgm
-;
-; GFX12-GISEL-LABEL: flat_inst_salu_offset_11bit_max:
-; GFX12-GISEL: ; %bb.0:
-; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1] offset:2047 scope:SCOPE_SYS
-; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-GISEL-NEXT: flat_store_b8 v[0:1], v0
-; GFX12-GISEL-NEXT: s_endpgm
+; GFX11-LABEL: flat_inst_salu_offset_11bit_max:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: flat_load_u8 v0, v[0:1] offset:2047 glc dlc
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: flat_store_b8 v[0:1], v0
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: flat_inst_salu_offset_11bit_max:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT: flat_load_u8 v0, v[0:1] offset:2047 scope:SCOPE_SYS
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: flat_store_b8 v[0:1], v0
+; GFX12-NEXT: s_endpgm
%gep = getelementptr inbounds i8, ptr %p, i64 2047
%load = load volatile i8, ptr %gep, align 1
store i8 %load, ptr poison
@@ -2956,65 +1884,25 @@ define amdgpu_kernel void @flat_inst_salu_offset_12bit_max(ptr %p) {
; GFX10-NEXT: flat_store_byte v[0:1], v0
; GFX10-NEXT: s_endpgm
;
-; GFX11-SDAG-TRUE16-LABEL: flat_inst_salu_offset_12bit_max:
-; GFX11-SDAG-TRUE16: ; %bb.0:
-; GFX11-SDAG-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-SDAG-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-SDAG-TRUE16-NEXT: flat_load_d16_u8 v0, v[0:1] offset:4095 glc dlc
-; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: flat_store_b8 v[0:1], v0
-; GFX11-SDAG-TRUE16-NEXT: s_endpgm
-;
-; GFX11-SDAG-FAKE16-LABEL: flat_inst_salu_offset_12bit_max:
-; GFX11-SDAG-FAKE16: ; %bb.0:
-; GFX11-SDAG-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-SDAG-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-SDAG-FAKE16-NEXT: flat_load_u8 v0, v[0:1] offset:4095 glc dlc
-; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: flat_store_b8 v[0:1], v0
-; GFX11-SDAG-FAKE16-NEXT: s_endpgm
-;
-; GFX12-SDAG-TRUE16-LABEL: flat_inst_salu_offset_12bit_max:
-; GFX12-SDAG-TRUE16: ; %bb.0:
-; GFX12-SDAG-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-SDAG-TRUE16-NEXT: flat_load_d16_u8 v0, v[0:1] offset:4095 scope:SCOPE_SYS
-; GFX12-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: flat_store_b8 v[0:1], v0
-; GFX12-SDAG-TRUE16-NEXT: s_endpgm
-;
-; GFX12-SDAG-FAKE16-LABEL: flat_inst_salu_offset_12bit_max:
-; GFX12-SDAG-FAKE16: ; %bb.0:
-; GFX12-SDAG-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX12-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-SDAG-FAKE16-NEXT: flat_load_u8 v0, v[0:1] offset:4095 scope:SCOPE_SYS
-; GFX12-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: flat_store_b8 v[0:1], v0
-; GFX12-SDAG-FAKE16-NEXT: s_endpgm
-;
-; GFX11-GISEL-LABEL: flat_inst_salu_offset_12bit_max:
-; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-GISEL-NEXT: flat_load_u8 v0, v[0:1] offset:4095 glc dlc
-; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT: flat_store_b8 v[0:1], v0
-; GFX11-GISEL-NEXT: s_endpgm
-;
-; GFX12-GISEL-LABEL: flat_inst_salu_offset_12bit_max:
-; GFX12-GISEL: ; %bb.0:
-; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1] offset:4095 scope:SCOPE_SYS
-; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-GISEL-NEXT: flat_store_b8 v[0:1], v0
-; GFX12-GISEL-NEXT: s_endpgm
+; GFX11-LABEL: flat_inst_salu_offset_12bit_max:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: flat_load_u8 v0, v[0:1] offset:4095 glc dlc
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: flat_store_b8 v[0:1], v0
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: flat_inst_salu_offset_12bit_max:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT: flat_load_u8 v0, v[0:1] offset:4095 scope:SCOPE_SYS
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: flat_store_b8 v[0:1], v0
+; GFX12-NEXT: s_endpgm
%gep = getelementptr inbounds i8, ptr %p, i64 4095
%load = load volatile i8, ptr %gep, align 1
store i8 %load, ptr poison
@@ -3048,49 +1936,27 @@ define amdgpu_kernel void @flat_inst_salu_offset_13bit_max(ptr %p) {
; GFX10-NEXT: flat_store_byte v[0:1], v0
; GFX10-NEXT: s_endpgm
;
-; GFX11-SDAG-TRUE16-LABEL: flat_inst_salu_offset_13bit_max:
-; GFX11-SDAG-TRUE16: ; %bb.0:
-; GFX11-SDAG-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-SDAG-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: v_add_co_u32 v0, s0, 0x1000, s0
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, s1, s0
-; GFX11-SDAG-TRUE16-NEXT: flat_load_d16_u8 v0, v[0:1] offset:4095 glc dlc
-; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: flat_store_b8 v[0:1], v0
-; GFX11-SDAG-TRUE16-NEXT: s_endpgm
-;
-; GFX11-SDAG-FAKE16-LABEL: flat_inst_salu_offset_13bit_max:
-; GFX11-SDAG-FAKE16: ; %bb.0:
-; GFX11-SDAG-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-SDAG-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: v_add_co_u32 v0, s0, 0x1000, s0
-; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, s1, s0
-; GFX11-SDAG-FAKE16-NEXT: flat_load_u8 v0, v[0:1] offset:4095 glc dlc
-; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: flat_store_b8 v[0:1], v0
-; GFX11-SDAG-FAKE16-NEXT: s_endpgm
-;
-; GFX12-SDAG-TRUE16-LABEL: flat_inst_salu_offset_13bit_max:
-; GFX12-SDAG-TRUE16: ; %bb.0:
-; GFX12-SDAG-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-SDAG-TRUE16-NEXT: flat_load_d16_u8 v0, v[0:1] offset:8191 scope:SCOPE_SYS
-; GFX12-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: flat_store_b8 v[0:1], v0
-; GFX12-SDAG-TRUE16-NEXT: s_endpgm
-;
-; GFX12-SDAG-FAKE16-LABEL: flat_inst_salu_offset_13bit_max:
-; GFX12-SDAG-FAKE16: ; %bb.0:
-; GFX12-SDAG-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX12-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-SDAG-FAKE16-NEXT: flat_load_u8 v0, v[0:1] offset:8191 scope:SCOPE_SYS
-; GFX12-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: flat_store_b8 v[0:1], v0
-; GFX12-SDAG-FAKE16-NEXT: s_endpgm
+; GFX11-SDAG-LABEL: flat_inst_salu_offset_13bit_max:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0x1000, s0
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 0, s1, s0
+; GFX11-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:4095 glc dlc
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT: flat_store_b8 v[0:1], v0
+; GFX11-SDAG-NEXT: s_endpgm
+;
+; GFX12-LABEL: flat_inst_salu_offset_13bit_max:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT: flat_load_u8 v0, v[0:1] offset:8191 scope:SCOPE_SYS
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: flat_store_b8 v[0:1], v0
+; GFX12-NEXT: s_endpgm
;
; GFX9-GISEL-LABEL: flat_inst_salu_offset_13bit_max:
; GFX9-GISEL: ; %bb.0:
@@ -3117,16 +1983,6 @@ define amdgpu_kernel void @flat_inst_salu_offset_13bit_max(ptr %p) {
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-GISEL-NEXT: flat_store_b8 v[0:1], v0
; GFX11-GISEL-NEXT: s_endpgm
-;
-; GFX12-GISEL-LABEL: flat_inst_salu_offset_13bit_max:
-; GFX12-GISEL: ; %bb.0:
-; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1] offset:8191 scope:SCOPE_SYS
-; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-GISEL-NEXT: flat_store_b8 v[0:1], v0
-; GFX12-GISEL-NEXT: s_endpgm
%gep = getelementptr inbounds i8, ptr %p, i64 8191
%load = load volatile i8, ptr %gep, align 1
store i8 %load, ptr poison
@@ -3160,49 +2016,27 @@ define amdgpu_kernel void @flat_inst_salu_offset_neg_11bit_max(ptr %p) {
; GFX10-NEXT: flat_store_byte v[0:1], v0
; GFX10-NEXT: s_endpgm
;
-; GFX11-SDAG-TRUE16-LABEL: flat_inst_salu_offset_neg_11bit_max:
-; GFX11-SDAG-TRUE16: ; %bb.0:
-; GFX11-SDAG-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-SDAG-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: v_add_co_u32 v0, s0, 0xfffff800, s0
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, s1, s0
-; GFX11-SDAG-TRUE16-NEXT: flat_load_d16_u8 v0, v[0:1] glc dlc
-; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: flat_store_b8 v[0:1], v0
-; GFX11-SDAG-TRUE16-NEXT: s_endpgm
-;
-; GFX11-SDAG-FAKE16-LABEL: flat_inst_salu_offset_neg_11bit_max:
-; GFX11-SDAG-FAKE16: ; %bb.0:
-; GFX11-SDAG-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-SDAG-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: v_add_co_u32 v0, s0, 0xfffff800, s0
-; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, s1, s0
-; GFX11-SDAG-FAKE16-NEXT: flat_load_u8 v0, v[0:1] glc dlc
-; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: flat_store_b8 v[0:1], v0
-; GFX11-SDAG-FAKE16-NEXT: s_endpgm
-;
-; GFX12-SDAG-TRUE16-LABEL: flat_inst_salu_offset_neg_11bit_max:
-; GFX12-SDAG-TRUE16: ; %bb.0:
-; GFX12-SDAG-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-SDAG-TRUE16-NEXT: flat_load_d16_u8 v0, v[0:1] offset:-2048 scope:SCOPE_SYS
-; GFX12-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: flat_store_b8 v[0:1], v0
-; GFX12-SDAG-TRUE16-NEXT: s_endpgm
-;
-; GFX12-SDAG-FAKE16-LABEL: flat_inst_salu_offset_neg_11bit_max:
-; GFX12-SDAG-FAKE16: ; %bb.0:
-; GFX12-SDAG-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX12-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-SDAG-FAKE16-NEXT: flat_load_u8 v0, v[0:1] offset:-2048 scope:SCOPE_SYS
-; GFX12-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: flat_store_b8 v[0:1], v0
-; GFX12-SDAG-FAKE16-NEXT: s_endpgm
+; GFX11-SDAG-LABEL: flat_inst_salu_offset_neg_11bit_max:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0xfffff800, s0
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, -1, s1, s0
+; GFX11-SDAG-NEXT: flat_load_u8 v0, v[0:1] glc dlc
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT: flat_store_b8 v[0:1], v0
+; GFX11-SDAG-NEXT: s_endpgm
+;
+; GFX12-LABEL: flat_inst_salu_offset_neg_11bit_max:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT: flat_load_u8 v0, v[0:1] offset:-2048 scope:SCOPE_SYS
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: flat_store_b8 v[0:1], v0
+; GFX12-NEXT: s_endpgm
;
; GFX9-GISEL-LABEL: flat_inst_salu_offset_neg_11bit_max:
; GFX9-GISEL: ; %bb.0:
@@ -3229,16 +2063,6 @@ define amdgpu_kernel void @flat_inst_salu_offset_neg_11bit_max(ptr %p) {
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-GISEL-NEXT: flat_store_b8 v[0:1], v0
; GFX11-GISEL-NEXT: s_endpgm
-;
-; GFX12-GISEL-LABEL: flat_inst_salu_offset_neg_11bit_max:
-; GFX12-GISEL: ; %bb.0:
-; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1] offset:-2048 scope:SCOPE_SYS
-; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-GISEL-NEXT: flat_store_b8 v[0:1], v0
-; GFX12-GISEL-NEXT: s_endpgm
%gep = getelementptr inbounds i8, ptr %p, i64 -2048
%load = load volatile i8, ptr %gep, align 1
store i8 %load, ptr poison
@@ -3272,49 +2096,27 @@ define amdgpu_kernel void @flat_inst_salu_offset_neg_12bit_max(ptr %p) {
; GFX10-NEXT: flat_store_byte v[0:1], v0
; GFX10-NEXT: s_endpgm
;
-; GFX11-SDAG-TRUE16-LABEL: flat_inst_salu_offset_neg_12bit_max:
-; GFX11-SDAG-TRUE16: ; %bb.0:
-; GFX11-SDAG-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-SDAG-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: v_add_co_u32 v0, s0, 0xfffff000, s0
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, s1, s0
-; GFX11-SDAG-TRUE16-NEXT: flat_load_d16_u8 v0, v[0:1] glc dlc
-; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: flat_store_b8 v[0:1], v0
-; GFX11-SDAG-TRUE16-NEXT: s_endpgm
-;
-; GFX11-SDAG-FAKE16-LABEL: flat_inst_salu_offset_neg_12bit_max:
-; GFX11-SDAG-FAKE16: ; %bb.0:
-; GFX11-SDAG-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-SDAG-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: v_add_co_u32 v0, s0, 0xfffff000, s0
-; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, s1, s0
-; GFX11-SDAG-FAKE16-NEXT: flat_load_u8 v0, v[0:1] glc dlc
-; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: flat_store_b8 v[0:1], v0
-; GFX11-SDAG-FAKE16-NEXT: s_endpgm
-;
-; GFX12-SDAG-TRUE16-LABEL: flat_inst_salu_offset_neg_12bit_max:
-; GFX12-SDAG-TRUE16: ; %bb.0:
-; GFX12-SDAG-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-SDAG-TRUE16-NEXT: flat_load_d16_u8 v0, v[0:1] offset:-4096 scope:SCOPE_SYS
-; GFX12-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: flat_store_b8 v[0:1], v0
-; GFX12-SDAG-TRUE16-NEXT: s_endpgm
-;
-; GFX12-SDAG-FAKE16-LABEL: flat_inst_salu_offset_neg_12bit_max:
-; GFX12-SDAG-FAKE16: ; %bb.0:
-; GFX12-SDAG-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX12-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-SDAG-FAKE16-NEXT: flat_load_u8 v0, v[0:1] offset:-4096 scope:SCOPE_SYS
-; GFX12-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: flat_store_b8 v[0:1], v0
-; GFX12-SDAG-FAKE16-NEXT: s_endpgm
+; GFX11-SDAG-LABEL: flat_inst_salu_offset_neg_12bit_max:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0xfffff000, s0
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, -1, s1, s0
+; GFX11-SDAG-NEXT: flat_load_u8 v0, v[0:1] glc dlc
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT: flat_store_b8 v[0:1], v0
+; GFX11-SDAG-NEXT: s_endpgm
+;
+; GFX12-LABEL: flat_inst_salu_offset_neg_12bit_max:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT: flat_load_u8 v0, v[0:1] offset:-4096 scope:SCOPE_SYS
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: flat_store_b8 v[0:1], v0
+; GFX12-NEXT: s_endpgm
;
; GFX9-GISEL-LABEL: flat_inst_salu_offset_neg_12bit_max:
; GFX9-GISEL: ; %bb.0:
@@ -3341,16 +2143,6 @@ define amdgpu_kernel void @flat_inst_salu_offset_neg_12bit_max(ptr %p) {
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-GISEL-NEXT: flat_store_b8 v[0:1], v0
; GFX11-GISEL-NEXT: s_endpgm
-;
-; GFX12-GISEL-LABEL: flat_inst_salu_offset_neg_12bit_max:
-; GFX12-GISEL: ; %bb.0:
-; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1] offset:-4096 scope:SCOPE_SYS
-; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-GISEL-NEXT: flat_store_b8 v[0:1], v0
-; GFX12-GISEL-NEXT: s_endpgm
%gep = getelementptr inbounds i8, ptr %p, i64 -4096
%load = load volatile i8, ptr %gep, align 1
store i8 %load, ptr poison
@@ -3384,49 +2176,27 @@ define amdgpu_kernel void @flat_inst_salu_offset_neg_13bit_max(ptr %p) {
; GFX10-NEXT: flat_store_byte v[0:1], v0
; GFX10-NEXT: s_endpgm
;
-; GFX11-SDAG-TRUE16-LABEL: flat_inst_salu_offset_neg_13bit_max:
-; GFX11-SDAG-TRUE16: ; %bb.0:
-; GFX11-SDAG-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-SDAG-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: v_add_co_u32 v0, s0, 0xffffe000, s0
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, s1, s0
-; GFX11-SDAG-TRUE16-NEXT: flat_load_d16_u8 v0, v[0:1] glc dlc
-; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: flat_store_b8 v[0:1], v0
-; GFX11-SDAG-TRUE16-NEXT: s_endpgm
-;
-; GFX11-SDAG-FAKE16-LABEL: flat_inst_salu_offset_neg_13bit_max:
-; GFX11-SDAG-FAKE16: ; %bb.0:
-; GFX11-SDAG-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-SDAG-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: v_add_co_u32 v0, s0, 0xffffe000, s0
-; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, s1, s0
-; GFX11-SDAG-FAKE16-NEXT: flat_load_u8 v0, v[0:1] glc dlc
-; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: flat_store_b8 v[0:1], v0
-; GFX11-SDAG-FAKE16-NEXT: s_endpgm
-;
-; GFX12-SDAG-TRUE16-LABEL: flat_inst_salu_offset_neg_13bit_max:
-; GFX12-SDAG-TRUE16: ; %bb.0:
-; GFX12-SDAG-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-SDAG-TRUE16-NEXT: flat_load_d16_u8 v0, v[0:1] offset:-8192 scope:SCOPE_SYS
-; GFX12-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: flat_store_b8 v[0:1], v0
-; GFX12-SDAG-TRUE16-NEXT: s_endpgm
-;
-; GFX12-SDAG-FAKE16-LABEL: flat_inst_salu_offset_neg_13bit_max:
-; GFX12-SDAG-FAKE16: ; %bb.0:
-; GFX12-SDAG-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX12-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-SDAG-FAKE16-NEXT: flat_load_u8 v0, v[0:1] offset:-8192 scope:SCOPE_SYS
-; GFX12-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: flat_store_b8 v[0:1], v0
-; GFX12-SDAG-FAKE16-NEXT: s_endpgm
+; GFX11-SDAG-LABEL: flat_inst_salu_offset_neg_13bit_max:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0xffffe000, s0
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, -1, s1, s0
+; GFX11-SDAG-NEXT: flat_load_u8 v0, v[0:1] glc dlc
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT: flat_store_b8 v[0:1], v0
+; GFX11-SDAG-NEXT: s_endpgm
+;
+; GFX12-LABEL: flat_inst_salu_offset_neg_13bit_max:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT: flat_load_u8 v0, v[0:1] offset:-8192 scope:SCOPE_SYS
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: flat_store_b8 v[0:1], v0
+; GFX12-NEXT: s_endpgm
;
; GFX9-GISEL-LABEL: flat_inst_salu_offset_neg_13bit_max:
; GFX9-GISEL: ; %bb.0:
@@ -3453,16 +2223,6 @@ define amdgpu_kernel void @flat_inst_salu_offset_neg_13bit_max(ptr %p) {
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-GISEL-NEXT: flat_store_b8 v[0:1], v0
; GFX11-GISEL-NEXT: s_endpgm
-;
-; GFX12-GISEL-LABEL: flat_inst_salu_offset_neg_13bit_max:
-; GFX12-GISEL: ; %bb.0:
-; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1] offset:-8192 scope:SCOPE_SYS
-; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-GISEL-NEXT: flat_store_b8 v[0:1], v0
-; GFX12-GISEL-NEXT: s_endpgm
%gep = getelementptr inbounds i8, ptr %p, i64 -8192
%load = load volatile i8, ptr %gep, align 1
store i8 %load, ptr poison
@@ -3494,65 +2254,25 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_11bit_max(ptr %p) {
; GFX10-NEXT: flat_store_byte v[0:1], v0
; GFX10-NEXT: s_endpgm
;
-; GFX11-SDAG-TRUE16-LABEL: flat_inst_salu_offset_2x_11bit_max:
-; GFX11-SDAG-TRUE16: ; %bb.0:
-; GFX11-SDAG-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-SDAG-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-SDAG-TRUE16-NEXT: flat_load_d16_u8 v0, v[0:1] offset:4095 glc dlc
-; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: flat_store_b8 v[0:1], v0
-; GFX11-SDAG-TRUE16-NEXT: s_endpgm
-;
-; GFX11-SDAG-FAKE16-LABEL: flat_inst_salu_offset_2x_11bit_max:
-; GFX11-SDAG-FAKE16: ; %bb.0:
-; GFX11-SDAG-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-SDAG-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-SDAG-FAKE16-NEXT: flat_load_u8 v0, v[0:1] offset:4095 glc dlc
-; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: flat_store_b8 v[0:1], v0
-; GFX11-SDAG-FAKE16-NEXT: s_endpgm
-;
-; GFX12-SDAG-TRUE16-LABEL: flat_inst_salu_offset_2x_11bit_max:
-; GFX12-SDAG-TRUE16: ; %bb.0:
-; GFX12-SDAG-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-SDAG-TRUE16-NEXT: flat_load_d16_u8 v0, v[0:1] offset:4095 scope:SCOPE_SYS
-; GFX12-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: flat_store_b8 v[0:1], v0
-; GFX12-SDAG-TRUE16-NEXT: s_endpgm
-;
-; GFX12-SDAG-FAKE16-LABEL: flat_inst_salu_offset_2x_11bit_max:
-; GFX12-SDAG-FAKE16: ; %bb.0:
-; GFX12-SDAG-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX12-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-SDAG-FAKE16-NEXT: flat_load_u8 v0, v[0:1] offset:4095 scope:SCOPE_SYS
-; GFX12-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: flat_store_b8 v[0:1], v0
-; GFX12-SDAG-FAKE16-NEXT: s_endpgm
-;
-; GFX11-GISEL-LABEL: flat_inst_salu_offset_2x_11bit_max:
-; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-GISEL-NEXT: flat_load_u8 v0, v[0:1] offset:4095 glc dlc
-; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT: flat_store_b8 v[0:1], v0
-; GFX11-GISEL-NEXT: s_endpgm
-;
-; GFX12-GISEL-LABEL: flat_inst_salu_offset_2x_11bit_max:
-; GFX12-GISEL: ; %bb.0:
-; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1] offset:4095 scope:SCOPE_SYS
-; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-GISEL-NEXT: flat_store_b8 v[0:1], v0
-; GFX12-GISEL-NEXT: s_endpgm
+; GFX11-LABEL: flat_inst_salu_offset_2x_11bit_max:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: flat_load_u8 v0, v[0:1] offset:4095 glc dlc
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: flat_store_b8 v[0:1], v0
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: flat_inst_salu_offset_2x_11bit_max:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT: flat_load_u8 v0, v[0:1] offset:4095 scope:SCOPE_SYS
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: flat_store_b8 v[0:1], v0
+; GFX12-NEXT: s_endpgm
%gep = getelementptr inbounds i8, ptr %p, i64 4095
%load = load volatile i8, ptr %gep, align 1
store i8 %load, ptr poison
@@ -3586,49 +2306,27 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_12bit_max(ptr %p) {
; GFX10-NEXT: flat_store_byte v[0:1], v0
; GFX10-NEXT: s_endpgm
;
-; GFX11-SDAG-TRUE16-LABEL: flat_inst_salu_offset_2x_12bit_max:
-; GFX11-SDAG-TRUE16: ; %bb.0:
-; GFX11-SDAG-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-SDAG-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: v_add_co_u32 v0, s0, 0x1000, s0
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, s1, s0
-; GFX11-SDAG-TRUE16-NEXT: flat_load_d16_u8 v0, v[0:1] offset:4095 glc dlc
-; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: flat_store_b8 v[0:1], v0
-; GFX11-SDAG-TRUE16-NEXT: s_endpgm
-;
-; GFX11-SDAG-FAKE16-LABEL: flat_inst_salu_offset_2x_12bit_max:
-; GFX11-SDAG-FAKE16: ; %bb.0:
-; GFX11-SDAG-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-SDAG-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: v_add_co_u32 v0, s0, 0x1000, s0
-; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, s1, s0
-; GFX11-SDAG-FAKE16-NEXT: flat_load_u8 v0, v[0:1] offset:4095 glc dlc
-; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: flat_store_b8 v[0:1], v0
-; GFX11-SDAG-FAKE16-NEXT: s_endpgm
-;
-; GFX12-SDAG-TRUE16-LABEL: flat_inst_salu_offset_2x_12bit_max:
-; GFX12-SDAG-TRUE16: ; %bb.0:
-; GFX12-SDAG-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-SDAG-TRUE16-NEXT: flat_load_d16_u8 v0, v[0:1] offset:8191 scope:SCOPE_SYS
-; GFX12-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: flat_store_b8 v[0:1], v0
-; GFX12-SDAG-TRUE16-NEXT: s_endpgm
-;
-; GFX12-SDAG-FAKE16-LABEL: flat_inst_salu_offset_2x_12bit_max:
-; GFX12-SDAG-FAKE16: ; %bb.0:
-; GFX12-SDAG-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX12-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-SDAG-FAKE16-NEXT: flat_load_u8 v0, v[0:1] offset:8191 scope:SCOPE_SYS
-; GFX12-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: flat_store_b8 v[0:1], v0
-; GFX12-SDAG-FAKE16-NEXT: s_endpgm
+; GFX11-SDAG-LABEL: flat_inst_salu_offset_2x_12bit_max:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0x1000, s0
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 0, s1, s0
+; GFX11-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:4095 glc dlc
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT: flat_store_b8 v[0:1], v0
+; GFX11-SDAG-NEXT: s_endpgm
+;
+; GFX12-LABEL: flat_inst_salu_offset_2x_12bit_max:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT: flat_load_u8 v0, v[0:1] offset:8191 scope:SCOPE_SYS
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: flat_store_b8 v[0:1], v0
+; GFX12-NEXT: s_endpgm
;
; GFX9-GISEL-LABEL: flat_inst_salu_offset_2x_12bit_max:
; GFX9-GISEL: ; %bb.0:
@@ -3655,16 +2353,6 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_12bit_max(ptr %p) {
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-GISEL-NEXT: flat_store_b8 v[0:1], v0
; GFX11-GISEL-NEXT: s_endpgm
-;
-; GFX12-GISEL-LABEL: flat_inst_salu_offset_2x_12bit_max:
-; GFX12-GISEL: ; %bb.0:
-; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1] offset:8191 scope:SCOPE_SYS
-; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-GISEL-NEXT: flat_store_b8 v[0:1], v0
-; GFX12-GISEL-NEXT: s_endpgm
%gep = getelementptr inbounds i8, ptr %p, i64 8191
%load = load volatile i8, ptr %gep, align 1
store i8 %load, ptr poison
@@ -3698,49 +2386,27 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_13bit_max(ptr %p) {
; GFX10-NEXT: flat_store_byte v[0:1], v0
; GFX10-NEXT: s_endpgm
;
-; GFX11-SDAG-TRUE16-LABEL: flat_inst_salu_offset_2x_13bit_max:
-; GFX11-SDAG-TRUE16: ; %bb.0:
-; GFX11-SDAG-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-SDAG-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: v_add_co_u32 v0, s0, 0x3000, s0
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, s1, s0
-; GFX11-SDAG-TRUE16-NEXT: flat_load_d16_u8 v0, v[0:1] offset:4095 glc dlc
-; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: flat_store_b8 v[0:1], v0
-; GFX11-SDAG-TRUE16-NEXT: s_endpgm
-;
-; GFX11-SDAG-FAKE16-LABEL: flat_inst_salu_offset_2x_13bit_max:
-; GFX11-SDAG-FAKE16: ; %bb.0:
-; GFX11-SDAG-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-SDAG-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: v_add_co_u32 v0, s0, 0x3000, s0
-; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, s1, s0
-; GFX11-SDAG-FAKE16-NEXT: flat_load_u8 v0, v[0:1] offset:4095 glc dlc
-; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: flat_store_b8 v[0:1], v0
-; GFX11-SDAG-FAKE16-NEXT: s_endpgm
-;
-; GFX12-SDAG-TRUE16-LABEL: flat_inst_salu_offset_2x_13bit_max:
-; GFX12-SDAG-TRUE16: ; %bb.0:
-; GFX12-SDAG-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-SDAG-TRUE16-NEXT: flat_load_d16_u8 v0, v[0:1] offset:16383 scope:SCOPE_SYS
-; GFX12-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: flat_store_b8 v[0:1], v0
-; GFX12-SDAG-TRUE16-NEXT: s_endpgm
-;
-; GFX12-SDAG-FAKE16-LABEL: flat_inst_salu_offset_2x_13bit_max:
-; GFX12-SDAG-FAKE16: ; %bb.0:
-; GFX12-SDAG-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX12-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-SDAG-FAKE16-NEXT: flat_load_u8 v0, v[0:1] offset:16383 scope:SCOPE_SYS
-; GFX12-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: flat_store_b8 v[0:1], v0
-; GFX12-SDAG-FAKE16-NEXT: s_endpgm
+; GFX11-SDAG-LABEL: flat_inst_salu_offset_2x_13bit_max:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0x3000, s0
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 0, s1, s0
+; GFX11-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:4095 glc dlc
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT: flat_store_b8 v[0:1], v0
+; GFX11-SDAG-NEXT: s_endpgm
+;
+; GFX12-LABEL: flat_inst_salu_offset_2x_13bit_max:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT: flat_load_u8 v0, v[0:1] offset:16383 scope:SCOPE_SYS
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: flat_store_b8 v[0:1], v0
+; GFX12-NEXT: s_endpgm
;
; GFX9-GISEL-LABEL: flat_inst_salu_offset_2x_13bit_max:
; GFX9-GISEL: ; %bb.0:
@@ -3767,16 +2433,6 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_13bit_max(ptr %p) {
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-GISEL-NEXT: flat_store_b8 v[0:1], v0
; GFX11-GISEL-NEXT: s_endpgm
-;
-; GFX12-GISEL-LABEL: flat_inst_salu_offset_2x_13bit_max:
-; GFX12-GISEL: ; %bb.0:
-; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1] offset:16383 scope:SCOPE_SYS
-; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-GISEL-NEXT: flat_store_b8 v[0:1], v0
-; GFX12-GISEL-NEXT: s_endpgm
%gep = getelementptr inbounds i8, ptr %p, i64 16383
%load = load volatile i8, ptr %gep, align 1
store i8 %load, ptr poison
@@ -3810,49 +2466,27 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_neg_11bit_max(ptr %p) {
; GFX10-NEXT: flat_store_byte v[0:1], v0
; GFX10-NEXT: s_endpgm
;
-; GFX11-SDAG-TRUE16-LABEL: flat_inst_salu_offset_2x_neg_11bit_max:
-; GFX11-SDAG-TRUE16: ; %bb.0:
-; GFX11-SDAG-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-SDAG-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: v_add_co_u32 v0, s0, 0xfffff000, s0
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, s1, s0
-; GFX11-SDAG-TRUE16-NEXT: flat_load_d16_u8 v0, v[0:1] glc dlc
-; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: flat_store_b8 v[0:1], v0
-; GFX11-SDAG-TRUE16-NEXT: s_endpgm
-;
-; GFX11-SDAG-FAKE16-LABEL: flat_inst_salu_offset_2x_neg_11bit_max:
-; GFX11-SDAG-FAKE16: ; %bb.0:
-; GFX11-SDAG-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-SDAG-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: v_add_co_u32 v0, s0, 0xfffff000, s0
-; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, s1, s0
-; GFX11-SDAG-FAKE16-NEXT: flat_load_u8 v0, v[0:1] glc dlc
-; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: flat_store_b8 v[0:1], v0
-; GFX11-SDAG-FAKE16-NEXT: s_endpgm
-;
-; GFX12-SDAG-TRUE16-LABEL: flat_inst_salu_offset_2x_neg_11bit_max:
-; GFX12-SDAG-TRUE16: ; %bb.0:
-; GFX12-SDAG-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-SDAG-TRUE16-NEXT: flat_load_d16_u8 v0, v[0:1] offset:-4096 scope:SCOPE_SYS
-; GFX12-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: flat_store_b8 v[0:1], v0
-; GFX12-SDAG-TRUE16-NEXT: s_endpgm
-;
-; GFX12-SDAG-FAKE16-LABEL: flat_inst_salu_offset_2x_neg_11bit_max:
-; GFX12-SDAG-FAKE16: ; %bb.0:
-; GFX12-SDAG-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX12-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-SDAG-FAKE16-NEXT: flat_load_u8 v0, v[0:1] offset:-4096 scope:SCOPE_SYS
-; GFX12-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: flat_store_b8 v[0:1], v0
-; GFX12-SDAG-FAKE16-NEXT: s_endpgm
+; GFX11-SDAG-LABEL: flat_inst_salu_offset_2x_neg_11bit_max:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0xfffff000, s0
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, -1, s1, s0
+; GFX11-SDAG-NEXT: flat_load_u8 v0, v[0:1] glc dlc
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT: flat_store_b8 v[0:1], v0
+; GFX11-SDAG-NEXT: s_endpgm
+;
+; GFX12-LABEL: flat_inst_salu_offset_2x_neg_11bit_max:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT: flat_load_u8 v0, v[0:1] offset:-4096 scope:SCOPE_SYS
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: flat_store_b8 v[0:1], v0
+; GFX12-NEXT: s_endpgm
;
; GFX9-GISEL-LABEL: flat_inst_salu_offset_2x_neg_11bit_max:
; GFX9-GISEL: ; %bb.0:
@@ -3879,16 +2513,6 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_neg_11bit_max(ptr %p) {
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-GISEL-NEXT: flat_store_b8 v[0:1], v0
; GFX11-GISEL-NEXT: s_endpgm
-;
-; GFX12-GISEL-LABEL: flat_inst_salu_offset_2x_neg_11bit_max:
-; GFX12-GISEL: ; %bb.0:
-; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1] offset:-4096 scope:SCOPE_SYS
-; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-GISEL-NEXT: flat_store_b8 v[0:1], v0
-; GFX12-GISEL-NEXT: s_endpgm
%gep = getelementptr inbounds i8, ptr %p, i64 -4096
%load = load volatile i8, ptr %gep, align 1
store i8 %load, ptr poison
@@ -3922,49 +2546,27 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_neg_12bit_max(ptr %p) {
; GFX10-NEXT: flat_store_byte v[0:1], v0
; GFX10-NEXT: s_endpgm
;
-; GFX11-SDAG-TRUE16-LABEL: flat_inst_salu_offset_2x_neg_12bit_max:
-; GFX11-SDAG-TRUE16: ; %bb.0:
-; GFX11-SDAG-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-SDAG-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: v_add_co_u32 v0, s0, 0xffffe000, s0
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, s1, s0
-; GFX11-SDAG-TRUE16-NEXT: flat_load_d16_u8 v0, v[0:1] glc dlc
-; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: flat_store_b8 v[0:1], v0
-; GFX11-SDAG-TRUE16-NEXT: s_endpgm
-;
-; GFX11-SDAG-FAKE16-LABEL: flat_inst_salu_offset_2x_neg_12bit_max:
-; GFX11-SDAG-FAKE16: ; %bb.0:
-; GFX11-SDAG-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-SDAG-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: v_add_co_u32 v0, s0, 0xffffe000, s0
-; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, s1, s0
-; GFX11-SDAG-FAKE16-NEXT: flat_load_u8 v0, v[0:1] glc dlc
-; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: flat_store_b8 v[0:1], v0
-; GFX11-SDAG-FAKE16-NEXT: s_endpgm
-;
-; GFX12-SDAG-TRUE16-LABEL: flat_inst_salu_offset_2x_neg_12bit_max:
-; GFX12-SDAG-TRUE16: ; %bb.0:
-; GFX12-SDAG-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-SDAG-TRUE16-NEXT: flat_load_d16_u8 v0, v[0:1] offset:-8192 scope:SCOPE_SYS
-; GFX12-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: flat_store_b8 v[0:1], v0
-; GFX12-SDAG-TRUE16-NEXT: s_endpgm
-;
-; GFX12-SDAG-FAKE16-LABEL: flat_inst_salu_offset_2x_neg_12bit_max:
-; GFX12-SDAG-FAKE16: ; %bb.0:
-; GFX12-SDAG-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX12-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-SDAG-FAKE16-NEXT: flat_load_u8 v0, v[0:1] offset:-8192 scope:SCOPE_SYS
-; GFX12-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: flat_store_b8 v[0:1], v0
-; GFX12-SDAG-FAKE16-NEXT: s_endpgm
+; GFX11-SDAG-LABEL: flat_inst_salu_offset_2x_neg_12bit_max:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0xffffe000, s0
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, -1, s1, s0
+; GFX11-SDAG-NEXT: flat_load_u8 v0, v[0:1] glc dlc
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT: flat_store_b8 v[0:1], v0
+; GFX11-SDAG-NEXT: s_endpgm
+;
+; GFX12-LABEL: flat_inst_salu_offset_2x_neg_12bit_max:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT: flat_load_u8 v0, v[0:1] offset:-8192 scope:SCOPE_SYS
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: flat_store_b8 v[0:1], v0
+; GFX12-NEXT: s_endpgm
;
; GFX9-GISEL-LABEL: flat_inst_salu_offset_2x_neg_12bit_max:
; GFX9-GISEL: ; %bb.0:
@@ -3991,16 +2593,6 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_neg_12bit_max(ptr %p) {
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-GISEL-NEXT: flat_store_b8 v[0:1], v0
; GFX11-GISEL-NEXT: s_endpgm
-;
-; GFX12-GISEL-LABEL: flat_inst_salu_offset_2x_neg_12bit_max:
-; GFX12-GISEL: ; %bb.0:
-; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1] offset:-8192 scope:SCOPE_SYS
-; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-GISEL-NEXT: flat_store_b8 v[0:1], v0
-; GFX12-GISEL-NEXT: s_endpgm
%gep = getelementptr inbounds i8, ptr %p, i64 -8192
%load = load volatile i8, ptr %gep, align 1
store i8 %load, ptr poison
@@ -4034,49 +2626,27 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_neg_13bit_max(ptr %p) {
; GFX10-NEXT: flat_store_byte v[0:1], v0
; GFX10-NEXT: s_endpgm
;
-; GFX11-SDAG-TRUE16-LABEL: flat_inst_salu_offset_2x_neg_13bit_max:
-; GFX11-SDAG-TRUE16: ; %bb.0:
-; GFX11-SDAG-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-SDAG-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: v_add_co_u32 v0, s0, 0xffffc000, s0
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, s1, s0
-; GFX11-SDAG-TRUE16-NEXT: flat_load_d16_u8 v0, v[0:1] glc dlc
-; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: flat_store_b8 v[0:1], v0
-; GFX11-SDAG-TRUE16-NEXT: s_endpgm
-;
-; GFX11-SDAG-FAKE16-LABEL: flat_inst_salu_offset_2x_neg_13bit_max:
-; GFX11-SDAG-FAKE16: ; %bb.0:
-; GFX11-SDAG-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-SDAG-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: v_add_co_u32 v0, s0, 0xffffc000, s0
-; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, s1, s0
-; GFX11-SDAG-FAKE16-NEXT: flat_load_u8 v0, v[0:1] glc dlc
-; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: flat_store_b8 v[0:1], v0
-; GFX11-SDAG-FAKE16-NEXT: s_endpgm
-;
-; GFX12-SDAG-TRUE16-LABEL: flat_inst_salu_offset_2x_neg_13bit_max:
-; GFX12-SDAG-TRUE16: ; %bb.0:
-; GFX12-SDAG-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-SDAG-TRUE16-NEXT: flat_load_d16_u8 v0, v[0:1] offset:-16384 scope:SCOPE_SYS
-; GFX12-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: flat_store_b8 v[0:1], v0
-; GFX12-SDAG-TRUE16-NEXT: s_endpgm
-;
-; GFX12-SDAG-FAKE16-LABEL: flat_inst_salu_offset_2x_neg_13bit_max:
-; GFX12-SDAG-FAKE16: ; %bb.0:
-; GFX12-SDAG-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX12-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-SDAG-FAKE16-NEXT: flat_load_u8 v0, v[0:1] offset:-16384 scope:SCOPE_SYS
-; GFX12-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: flat_store_b8 v[0:1], v0
-; GFX12-SDAG-FAKE16-NEXT: s_endpgm
+; GFX11-SDAG-LABEL: flat_inst_salu_offset_2x_neg_13bit_max:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0xffffc000, s0
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, -1, s1, s0
+; GFX11-SDAG-NEXT: flat_load_u8 v0, v[0:1] glc dlc
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT: flat_store_b8 v[0:1], v0
+; GFX11-SDAG-NEXT: s_endpgm
+;
+; GFX12-LABEL: flat_inst_salu_offset_2x_neg_13bit_max:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT: flat_load_u8 v0, v[0:1] offset:-16384 scope:SCOPE_SYS
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: flat_store_b8 v[0:1], v0
+; GFX12-NEXT: s_endpgm
;
; GFX9-GISEL-LABEL: flat_inst_salu_offset_2x_neg_13bit_max:
; GFX9-GISEL: ; %bb.0:
@@ -4103,16 +2673,6 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_neg_13bit_max(ptr %p) {
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-GISEL-NEXT: flat_store_b8 v[0:1], v0
; GFX11-GISEL-NEXT: s_endpgm
-;
-; GFX12-GISEL-LABEL: flat_inst_salu_offset_2x_neg_13bit_max:
-; GFX12-GISEL: ; %bb.0:
-; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1] offset:-16384 scope:SCOPE_SYS
-; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-GISEL-NEXT: flat_store_b8 v[0:1], v0
-; GFX12-GISEL-NEXT: s_endpgm
%gep = getelementptr inbounds i8, ptr %p, i64 -16384
%load = load volatile i8, ptr %gep, align 1
store i8 %load, ptr poison
@@ -4146,53 +2706,29 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_split0(ptr %p) {
; GFX10-NEXT: flat_store_byte v[0:1], v0
; GFX10-NEXT: s_endpgm
;
-; GFX11-SDAG-TRUE16-LABEL: flat_inst_salu_offset_64bit_11bit_split0:
-; GFX11-SDAG-TRUE16: ; %bb.0:
-; GFX11-SDAG-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-SDAG-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: v_add_co_u32 v0, s0, 0, s0
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s1, s0
-; GFX11-SDAG-TRUE16-NEXT: flat_load_d16_u8 v0, v[0:1] offset:2047 glc dlc
-; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: flat_store_b8 v[0:1], v0
-; GFX11-SDAG-TRUE16-NEXT: s_endpgm
-;
-; GFX11-SDAG-FAKE16-LABEL: flat_inst_salu_offset_64bit_11bit_split0:
-; GFX11-SDAG-FAKE16: ; %bb.0:
-; GFX11-SDAG-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-SDAG-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: v_add_co_u32 v0, s0, 0, s0
-; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s1, s0
-; GFX11-SDAG-FAKE16-NEXT: flat_load_u8 v0, v[0:1] offset:2047 glc dlc
-; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: flat_store_b8 v[0:1], v0
-; GFX11-SDAG-FAKE16-NEXT: s_endpgm
-;
-; GFX12-SDAG-TRUE16-LABEL: flat_inst_salu_offset_64bit_11bit_split0:
-; GFX12-SDAG-TRUE16: ; %bb.0:
-; GFX12-SDAG-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: v_add_co_u32 v0, s0, 0, s0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-SDAG-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s1, s0
-; GFX12-SDAG-TRUE16-NEXT: flat_load_d16_u8 v0, v[0:1] offset:2047 scope:SCOPE_SYS
-; GFX12-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: flat_store_b8 v[0:1], v0
-; GFX12-SDAG-TRUE16-NEXT: s_endpgm
-;
-; GFX12-SDAG-FAKE16-LABEL: flat_inst_salu_offset_64bit_11bit_split0:
-; GFX12-SDAG-FAKE16: ; %bb.0:
-; GFX12-SDAG-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX12-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: v_add_co_u32 v0, s0, 0, s0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-SDAG-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s1, s0
-; GFX12-SDAG-FAKE16-NEXT: flat_load_u8 v0, v[0:1] offset:2047 scope:SCOPE_SYS
-; GFX12-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: flat_store_b8 v[0:1], v0
-; GFX12-SDAG-FAKE16-NEXT: s_endpgm
+; GFX11-SDAG-LABEL: flat_inst_salu_offset_64bit_11bit_split0:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s0
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s1, s0
+; GFX11-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:2047 glc dlc
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT: flat_store_b8 v[0:1], v0
+; GFX11-SDAG-NEXT: s_endpgm
+;
+; GFX12-SDAG-LABEL: flat_inst_salu_offset_64bit_11bit_split0:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s0
+; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff
+; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s1, s0
+; GFX12-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:2047 scope:SCOPE_SYS
+; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-NEXT: flat_store_b8 v[0:1], v0
+; GFX12-SDAG-NEXT: s_endpgm
;
; GFX9-GISEL-LABEL: flat_inst_salu_offset_64bit_11bit_split0:
; GFX9-GISEL: ; %bb.0:
@@ -4265,53 +2801,29 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_split1(ptr %p) {
; GFX10-NEXT: flat_store_byte v[0:1], v0
; GFX10-NEXT: s_endpgm
;
-; GFX11-SDAG-TRUE16-LABEL: flat_inst_salu_offset_64bit_11bit_split1:
-; GFX11-SDAG-TRUE16: ; %bb.0:
-; GFX11-SDAG-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-SDAG-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: v_add_co_u32 v0, s0, 0, s0
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s1, s0
-; GFX11-SDAG-TRUE16-NEXT: flat_load_d16_u8 v0, v[0:1] offset:2048 glc dlc
-; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: flat_store_b8 v[0:1], v0
-; GFX11-SDAG-TRUE16-NEXT: s_endpgm
-;
-; GFX11-SDAG-FAKE16-LABEL: flat_inst_salu_offset_64bit_11bit_split1:
-; GFX11-SDAG-FAKE16: ; %bb.0:
-; GFX11-SDAG-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-SDAG-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: v_add_co_u32 v0, s0, 0, s0
-; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s1, s0
-; GFX11-SDAG-FAKE16-NEXT: flat_load_u8 v0, v[0:1] offset:2048 glc dlc
-; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: flat_store_b8 v[0:1], v0
-; GFX11-SDAG-FAKE16-NEXT: s_endpgm
-;
-; GFX12-SDAG-TRUE16-LABEL: flat_inst_salu_offset_64bit_11bit_split1:
-; GFX12-SDAG-TRUE16: ; %bb.0:
-; GFX12-SDAG-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: v_add_co_u32 v0, s0, 0, s0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-SDAG-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s1, s0
-; GFX12-SDAG-TRUE16-NEXT: flat_load_d16_u8 v0, v[0:1] offset:2048 scope:SCOPE_SYS
-; GFX12-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: flat_store_b8 v[0:1], v0
-; GFX12-SDAG-TRUE16-NEXT: s_endpgm
-;
-; GFX12-SDAG-FAKE16-LABEL: flat_inst_salu_offset_64bit_11bit_split1:
-; GFX12-SDAG-FAKE16: ; %bb.0:
-; GFX12-SDAG-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX12-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: v_add_co_u32 v0, s0, 0, s0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-SDAG-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s1, s0
-; GFX12-SDAG-FAKE16-NEXT: flat_load_u8 v0, v[0:1] offset:2048 scope:SCOPE_SYS
-; GFX12-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: flat_store_b8 v[0:1], v0
-; GFX12-SDAG-FAKE16-NEXT: s_endpgm
+; GFX11-SDAG-LABEL: flat_inst_salu_offset_64bit_11bit_split1:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s0
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s1, s0
+; GFX11-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:2048 glc dlc
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT: flat_store_b8 v[0:1], v0
+; GFX11-SDAG-NEXT: s_endpgm
+;
+; GFX12-SDAG-LABEL: flat_inst_salu_offset_64bit_11bit_split1:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s0
+; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff
+; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s1, s0
+; GFX12-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:2048 scope:SCOPE_SYS
+; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-NEXT: flat_store_b8 v[0:1], v0
+; GFX12-SDAG-NEXT: s_endpgm
;
; GFX9-GISEL-LABEL: flat_inst_salu_offset_64bit_11bit_split1:
; GFX9-GISEL: ; %bb.0:
@@ -4384,53 +2896,29 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_split0(ptr %p) {
; GFX10-NEXT: flat_store_byte v[0:1], v0
; GFX10-NEXT: s_endpgm
;
-; GFX11-SDAG-TRUE16-LABEL: flat_inst_salu_offset_64bit_12bit_split0:
-; GFX11-SDAG-TRUE16: ; %bb.0:
-; GFX11-SDAG-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-SDAG-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: v_add_co_u32 v0, s0, 0, s0
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s1, s0
-; GFX11-SDAG-TRUE16-NEXT: flat_load_d16_u8 v0, v[0:1] offset:4095 glc dlc
-; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: flat_store_b8 v[0:1], v0
-; GFX11-SDAG-TRUE16-NEXT: s_endpgm
-;
-; GFX11-SDAG-FAKE16-LABEL: flat_inst_salu_offset_64bit_12bit_split0:
-; GFX11-SDAG-FAKE16: ; %bb.0:
-; GFX11-SDAG-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-SDAG-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: v_add_co_u32 v0, s0, 0, s0
-; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s1, s0
-; GFX11-SDAG-FAKE16-NEXT: flat_load_u8 v0, v[0:1] offset:4095 glc dlc
-; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: flat_store_b8 v[0:1], v0
-; GFX11-SDAG-FAKE16-NEXT: s_endpgm
-;
-; GFX12-SDAG-TRUE16-LABEL: flat_inst_salu_offset_64bit_12bit_split0:
-; GFX12-SDAG-TRUE16: ; %bb.0:
-; GFX12-SDAG-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: v_add_co_u32 v0, s0, 0, s0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-SDAG-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s1, s0
-; GFX12-SDAG-TRUE16-NEXT: flat_load_d16_u8 v0, v[0:1] offset:4095 scope:SCOPE_SYS
-; GFX12-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: flat_store_b8 v[0:1], v0
-; GFX12-SDAG-TRUE16-NEXT: s_endpgm
-;
-; GFX12-SDAG-FAKE16-LABEL: flat_inst_salu_offset_64bit_12bit_split0:
-; GFX12-SDAG-FAKE16: ; %bb.0:
-; GFX12-SDAG-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX12-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: v_add_co_u32 v0, s0, 0, s0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-SDAG-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s1, s0
-; GFX12-SDAG-FAKE16-NEXT: flat_load_u8 v0, v[0:1] offset:4095 scope:SCOPE_SYS
-; GFX12-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: flat_store_b8 v[0:1], v0
-; GFX12-SDAG-FAKE16-NEXT: s_endpgm
+; GFX11-SDAG-LABEL: flat_inst_salu_offset_64bit_12bit_split0:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s0
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s1, s0
+; GFX11-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:4095 glc dlc
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT: flat_store_b8 v[0:1], v0
+; GFX11-SDAG-NEXT: s_endpgm
+;
+; GFX12-SDAG-LABEL: flat_inst_salu_offset_64bit_12bit_split0:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s0
+; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff
+; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s1, s0
+; GFX12-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:4095 scope:SCOPE_SYS
+; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-NEXT: flat_store_b8 v[0:1], v0
+; GFX12-SDAG-NEXT: s_endpgm
;
; GFX9-GISEL-LABEL: flat_inst_salu_offset_64bit_12bit_split0:
; GFX9-GISEL: ; %bb.0:
@@ -4504,53 +2992,29 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_split1(ptr %p) {
; GFX10-NEXT: flat_store_byte v[0:1], v0
; GFX10-NEXT: s_endpgm
;
-; GFX11-SDAG-TRUE16-LABEL: flat_inst_salu_offset_64bit_12bit_split1:
-; GFX11-SDAG-TRUE16: ; %bb.0:
-; GFX11-SDAG-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-SDAG-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: v_add_co_u32 v0, s0, 0x1000, s0
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s1, s0
-; GFX11-SDAG-TRUE16-NEXT: flat_load_d16_u8 v0, v[0:1] glc dlc
-; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: flat_store_b8 v[0:1], v0
-; GFX11-SDAG-TRUE16-NEXT: s_endpgm
-;
-; GFX11-SDAG-FAKE16-LABEL: flat_inst_salu_offset_64bit_12bit_split1:
-; GFX11-SDAG-FAKE16: ; %bb.0:
-; GFX11-SDAG-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-SDAG-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: v_add_co_u32 v0, s0, 0x1000, s0
-; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s1, s0
-; GFX11-SDAG-FAKE16-NEXT: flat_load_u8 v0, v[0:1] glc dlc
-; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: flat_store_b8 v[0:1], v0
-; GFX11-SDAG-FAKE16-NEXT: s_endpgm
-;
-; GFX12-SDAG-TRUE16-LABEL: flat_inst_salu_offset_64bit_12bit_split1:
-; GFX12-SDAG-TRUE16: ; %bb.0:
-; GFX12-SDAG-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: v_add_co_u32 v0, s0, 0, s0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-SDAG-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s1, s0
-; GFX12-SDAG-TRUE16-NEXT: flat_load_d16_u8 v0, v[0:1] offset:4096 scope:SCOPE_SYS
-; GFX12-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: flat_store_b8 v[0:1], v0
-; GFX12-SDAG-TRUE16-NEXT: s_endpgm
-;
-; GFX12-SDAG-FAKE16-LABEL: flat_inst_salu_offset_64bit_12bit_split1:
-; GFX12-SDAG-FAKE16: ; %bb.0:
-; GFX12-SDAG-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX12-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: v_add_co_u32 v0, s0, 0, s0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-SDAG-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s1, s0
-; GFX12-SDAG-FAKE16-NEXT: flat_load_u8 v0, v[0:1] offset:4096 scope:SCOPE_SYS
-; GFX12-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: flat_store_b8 v[0:1], v0
-; GFX12-SDAG-FAKE16-NEXT: s_endpgm
+; GFX11-SDAG-LABEL: flat_inst_salu_offset_64bit_12bit_split1:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0x1000, s0
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s1, s0
+; GFX11-SDAG-NEXT: flat_load_u8 v0, v[0:1] glc dlc
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT: flat_store_b8 v[0:1], v0
+; GFX11-SDAG-NEXT: s_endpgm
+;
+; GFX12-SDAG-LABEL: flat_inst_salu_offset_64bit_12bit_split1:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s0
+; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff
+; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s1, s0
+; GFX12-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:4096 scope:SCOPE_SYS
+; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-NEXT: flat_store_b8 v[0:1], v0
+; GFX12-SDAG-NEXT: s_endpgm
;
; GFX9-GISEL-LABEL: flat_inst_salu_offset_64bit_12bit_split1:
; GFX9-GISEL: ; %bb.0:
@@ -4624,53 +3088,29 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_split0(ptr %p) {
; GFX10-NEXT: flat_store_byte v[0:1], v0
; GFX10-NEXT: s_endpgm
;
-; GFX11-SDAG-TRUE16-LABEL: flat_inst_salu_offset_64bit_13bit_split0:
-; GFX11-SDAG-TRUE16: ; %bb.0:
-; GFX11-SDAG-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-SDAG-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: v_add_co_u32 v0, s0, 0x1000, s0
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s1, s0
-; GFX11-SDAG-TRUE16-NEXT: flat_load_d16_u8 v0, v[0:1] offset:4095 glc dlc
-; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: flat_store_b8 v[0:1], v0
-; GFX11-SDAG-TRUE16-NEXT: s_endpgm
-;
-; GFX11-SDAG-FAKE16-LABEL: flat_inst_salu_offset_64bit_13bit_split0:
-; GFX11-SDAG-FAKE16: ; %bb.0:
-; GFX11-SDAG-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-SDAG-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: v_add_co_u32 v0, s0, 0x1000, s0
-; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s1, s0
-; GFX11-SDAG-FAKE16-NEXT: flat_load_u8 v0, v[0:1] offset:4095 glc dlc
-; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: flat_store_b8 v[0:1], v0
-; GFX11-SDAG-FAKE16-NEXT: s_endpgm
-;
-; GFX12-SDAG-TRUE16-LABEL: flat_inst_salu_offset_64bit_13bit_split0:
-; GFX12-SDAG-TRUE16: ; %bb.0:
-; GFX12-SDAG-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: v_add_co_u32 v0, s0, 0, s0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-SDAG-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s1, s0
-; GFX12-SDAG-TRUE16-NEXT: flat_load_d16_u8 v0, v[0:1] offset:8191 scope:SCOPE_SYS
-; GFX12-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: flat_store_b8 v[0:1], v0
-; GFX12-SDAG-TRUE16-NEXT: s_endpgm
-;
-; GFX12-SDAG-FAKE16-LABEL: flat_inst_salu_offset_64bit_13bit_split0:
-; GFX12-SDAG-FAKE16: ; %bb.0:
-; GFX12-SDAG-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX12-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: v_add_co_u32 v0, s0, 0, s0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-SDAG-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s1, s0
-; GFX12-SDAG-FAKE16-NEXT: flat_load_u8 v0, v[0:1] offset:8191 scope:SCOPE_SYS
-; GFX12-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: flat_store_b8 v[0:1], v0
-; GFX12-SDAG-FAKE16-NEXT: s_endpgm
+; GFX11-SDAG-LABEL: flat_inst_salu_offset_64bit_13bit_split0:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0x1000, s0
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s1, s0
+; GFX11-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:4095 glc dlc
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT: flat_store_b8 v[0:1], v0
+; GFX11-SDAG-NEXT: s_endpgm
+;
+; GFX12-SDAG-LABEL: flat_inst_salu_offset_64bit_13bit_split0:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s0
+; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff
+; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s1, s0
+; GFX12-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:8191 scope:SCOPE_SYS
+; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-NEXT: flat_store_b8 v[0:1], v0
+; GFX12-SDAG-NEXT: s_endpgm
;
; GFX9-GISEL-LABEL: flat_inst_salu_offset_64bit_13bit_split0:
; GFX9-GISEL: ; %bb.0:
@@ -4744,53 +3184,29 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_split1(ptr %p) {
; GFX10-NEXT: flat_store_byte v[0:1], v0
; GFX10-NEXT: s_endpgm
;
-; GFX11-SDAG-TRUE16-LABEL: flat_inst_salu_offset_64bit_13bit_split1:
-; GFX11-SDAG-TRUE16: ; %bb.0:
-; GFX11-SDAG-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-SDAG-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: v_add_co_u32 v0, s0, 0x2000, s0
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s1, s0
-; GFX11-SDAG-TRUE16-NEXT: flat_load_d16_u8 v0, v[0:1] glc dlc
-; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: flat_store_b8 v[0:1], v0
-; GFX11-SDAG-TRUE16-NEXT: s_endpgm
-;
-; GFX11-SDAG-FAKE16-LABEL: flat_inst_salu_offset_64bit_13bit_split1:
-; GFX11-SDAG-FAKE16: ; %bb.0:
-; GFX11-SDAG-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-SDAG-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: v_add_co_u32 v0, s0, 0x2000, s0
-; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s1, s0
-; GFX11-SDAG-FAKE16-NEXT: flat_load_u8 v0, v[0:1] glc dlc
-; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: flat_store_b8 v[0:1], v0
-; GFX11-SDAG-FAKE16-NEXT: s_endpgm
-;
-; GFX12-SDAG-TRUE16-LABEL: flat_inst_salu_offset_64bit_13bit_split1:
-; GFX12-SDAG-TRUE16: ; %bb.0:
-; GFX12-SDAG-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: v_add_co_u32 v0, s0, 0, s0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-SDAG-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s1, s0
-; GFX12-SDAG-TRUE16-NEXT: flat_load_d16_u8 v0, v[0:1] offset:8192 scope:SCOPE_SYS
-; GFX12-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: flat_store_b8 v[0:1], v0
-; GFX12-SDAG-TRUE16-NEXT: s_endpgm
-;
-; GFX12-SDAG-FAKE16-LABEL: flat_inst_salu_offset_64bit_13bit_split1:
-; GFX12-SDAG-FAKE16: ; %bb.0:
-; GFX12-SDAG-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX12-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: v_add_co_u32 v0, s0, 0, s0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-SDAG-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s1, s0
-; GFX12-SDAG-FAKE16-NEXT: flat_load_u8 v0, v[0:1] offset:8192 scope:SCOPE_SYS
-; GFX12-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: flat_store_b8 v[0:1], v0
-; GFX12-SDAG-FAKE16-NEXT: s_endpgm
+; GFX11-SDAG-LABEL: flat_inst_salu_offset_64bit_13bit_split1:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0x2000, s0
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s1, s0
+; GFX11-SDAG-NEXT: flat_load_u8 v0, v[0:1] glc dlc
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT: flat_store_b8 v[0:1], v0
+; GFX11-SDAG-NEXT: s_endpgm
+;
+; GFX12-SDAG-LABEL: flat_inst_salu_offset_64bit_13bit_split1:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s0
+; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff
+; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s1, s0
+; GFX12-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:8192 scope:SCOPE_SYS
+; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-NEXT: flat_store_b8 v[0:1], v0
+; GFX12-SDAG-NEXT: s_endpgm
;
; GFX9-GISEL-LABEL: flat_inst_salu_offset_64bit_13bit_split1:
; GFX9-GISEL: ; %bb.0:
@@ -4865,57 +3281,31 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_neg_high_split0(ptr
; GFX10-NEXT: flat_store_byte v[0:1], v0
; GFX10-NEXT: s_endpgm
;
-; GFX11-SDAG-TRUE16-LABEL: flat_inst_salu_offset_64bit_11bit_neg_high_split0:
-; GFX11-SDAG-TRUE16: ; %bb.0:
-; GFX11-SDAG-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-SDAG-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: v_mov_b32_e32 v1, s1
-; GFX11-SDAG-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x7ff, s0
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT: flat_load_d16_u8 v0, v[0:1] glc dlc
-; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: flat_store_b8 v[0:1], v0
-; GFX11-SDAG-TRUE16-NEXT: s_endpgm
-;
-; GFX11-SDAG-FAKE16-LABEL: flat_inst_salu_offset_64bit_11bit_neg_high_split0:
-; GFX11-SDAG-FAKE16: ; %bb.0:
-; GFX11-SDAG-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-SDAG-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: v_mov_b32_e32 v1, s1
-; GFX11-SDAG-FAKE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x7ff, s0
-; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
-; GFX11-SDAG-FAKE16-NEXT: flat_load_u8 v0, v[0:1] glc dlc
-; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: flat_store_b8 v[0:1], v0
-; GFX11-SDAG-FAKE16-NEXT: s_endpgm
-;
-; GFX12-SDAG-TRUE16-LABEL: flat_inst_salu_offset_64bit_11bit_neg_high_split0:
-; GFX12-SDAG-TRUE16: ; %bb.0:
-; GFX12-SDAG-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-SDAG-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x800000, s0
-; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-SDAG-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
-; GFX12-SDAG-TRUE16-NEXT: flat_load_d16_u8 v0, v[0:1] offset:-8386561 scope:SCOPE_SYS
-; GFX12-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: flat_store_b8 v[0:1], v0
-; GFX12-SDAG-TRUE16-NEXT: s_endpgm
-;
-; GFX12-SDAG-FAKE16-LABEL: flat_inst_salu_offset_64bit_11bit_neg_high_split0:
-; GFX12-SDAG-FAKE16: ; %bb.0:
-; GFX12-SDAG-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX12-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-SDAG-FAKE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x800000, s0
-; GFX12-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-SDAG-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
-; GFX12-SDAG-FAKE16-NEXT: flat_load_u8 v0, v[0:1] offset:-8386561 scope:SCOPE_SYS
-; GFX12-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: flat_store_b8 v[0:1], v0
-; GFX12-SDAG-FAKE16-NEXT: s_endpgm
+; GFX11-SDAG-LABEL: flat_inst_salu_offset_64bit_11bit_neg_high_split0:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x7ff, s0
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
+; GFX11-SDAG-NEXT: flat_load_u8 v0, v[0:1] glc dlc
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT: flat_store_b8 v[0:1], v0
+; GFX11-SDAG-NEXT: s_endpgm
+;
+; GFX12-SDAG-LABEL: flat_inst_salu_offset_64bit_11bit_neg_high_split0:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s1
+; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x800000, s0
+; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
+; GFX12-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:-8386561 scope:SCOPE_SYS
+; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-NEXT: flat_store_b8 v[0:1], v0
+; GFX12-SDAG-NEXT: s_endpgm
;
; GFX9-GISEL-LABEL: flat_inst_salu_offset_64bit_11bit_neg_high_split0:
; GFX9-GISEL: ; %bb.0:
@@ -4990,57 +3380,31 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_neg_high_split1(ptr
; GFX10-NEXT: flat_store_byte v[0:1], v0
; GFX10-NEXT: s_endpgm
;
-; GFX11-SDAG-TRUE16-LABEL: flat_inst_salu_offset_64bit_11bit_neg_high_split1:
-; GFX11-SDAG-TRUE16: ; %bb.0:
-; GFX11-SDAG-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-SDAG-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: v_mov_b32_e32 v1, s1
-; GFX11-SDAG-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x800, s0
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT: flat_load_d16_u8 v0, v[0:1] glc dlc
-; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: flat_store_b8 v[0:1], v0
-; GFX11-SDAG-TRUE16-NEXT: s_endpgm
-;
-; GFX11-SDAG-FAKE16-LABEL: flat_inst_salu_offset_64bit_11bit_neg_high_split1:
-; GFX11-SDAG-FAKE16: ; %bb.0:
-; GFX11-SDAG-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-SDAG-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: v_mov_b32_e32 v1, s1
-; GFX11-SDAG-FAKE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x800, s0
-; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
-; GFX11-SDAG-FAKE16-NEXT: flat_load_u8 v0, v[0:1] glc dlc
-; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: flat_store_b8 v[0:1], v0
-; GFX11-SDAG-FAKE16-NEXT: s_endpgm
-;
-; GFX12-SDAG-TRUE16-LABEL: flat_inst_salu_offset_64bit_11bit_neg_high_split1:
-; GFX12-SDAG-TRUE16: ; %bb.0:
-; GFX12-SDAG-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-SDAG-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x800000, s0
-; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-SDAG-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
-; GFX12-SDAG-TRUE16-NEXT: flat_load_d16_u8 v0, v[0:1] offset:-8386560 scope:SCOPE_SYS
-; GFX12-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: flat_store_b8 v[0:1], v0
-; GFX12-SDAG-TRUE16-NEXT: s_endpgm
-;
-; GFX12-SDAG-FAKE16-LABEL: flat_inst_salu_offset_64bit_11bit_neg_high_split1:
-; GFX12-SDAG-FAKE16: ; %bb.0:
-; GFX12-SDAG-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX12-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-SDAG-FAKE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x800000, s0
-; GFX12-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-SDAG-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
-; GFX12-SDAG-FAKE16-NEXT: flat_load_u8 v0, v[0:1] offset:-8386560 scope:SCOPE_SYS
-; GFX12-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: flat_store_b8 v[0:1], v0
-; GFX12-SDAG-FAKE16-NEXT: s_endpgm
+; GFX11-SDAG-LABEL: flat_inst_salu_offset_64bit_11bit_neg_high_split1:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x800, s0
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
+; GFX11-SDAG-NEXT: flat_load_u8 v0, v[0:1] glc dlc
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT: flat_store_b8 v[0:1], v0
+; GFX11-SDAG-NEXT: s_endpgm
+;
+; GFX12-SDAG-LABEL: flat_inst_salu_offset_64bit_11bit_neg_high_split1:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s1
+; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x800000, s0
+; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
+; GFX12-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:-8386560 scope:SCOPE_SYS
+; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-NEXT: flat_store_b8 v[0:1], v0
+; GFX12-SDAG-NEXT: s_endpgm
;
; GFX9-GISEL-LABEL: flat_inst_salu_offset_64bit_11bit_neg_high_split1:
; GFX9-GISEL: ; %bb.0:
@@ -5115,57 +3479,31 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_neg_high_split0(ptr
; GFX10-NEXT: flat_store_byte v[0:1], v0
; GFX10-NEXT: s_endpgm
;
-; GFX11-SDAG-TRUE16-LABEL: flat_inst_salu_offset_64bit_12bit_neg_high_split0:
-; GFX11-SDAG-TRUE16: ; %bb.0:
-; GFX11-SDAG-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-SDAG-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: v_mov_b32_e32 v1, s1
-; GFX11-SDAG-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0xfff, s0
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT: flat_load_d16_u8 v0, v[0:1] glc dlc
-; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: flat_store_b8 v[0:1], v0
-; GFX11-SDAG-TRUE16-NEXT: s_endpgm
-;
-; GFX11-SDAG-FAKE16-LABEL: flat_inst_salu_offset_64bit_12bit_neg_high_split0:
-; GFX11-SDAG-FAKE16: ; %bb.0:
-; GFX11-SDAG-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-SDAG-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: v_mov_b32_e32 v1, s1
-; GFX11-SDAG-FAKE16-NEXT: v_add_co_u32 v0, vcc_lo, 0xfff, s0
-; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
-; GFX11-SDAG-FAKE16-NEXT: flat_load_u8 v0, v[0:1] glc dlc
-; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: flat_store_b8 v[0:1], v0
-; GFX11-SDAG-FAKE16-NEXT: s_endpgm
-;
-; GFX12-SDAG-TRUE16-LABEL: flat_inst_salu_offset_64bit_12bit_neg_high_split0:
-; GFX12-SDAG-TRUE16: ; %bb.0:
-; GFX12-SDAG-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-SDAG-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x800000, s0
-; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-SDAG-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
-; GFX12-SDAG-TRUE16-NEXT: flat_load_d16_u8 v0, v[0:1] offset:-8384513 scope:SCOPE_SYS
-; GFX12-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: flat_store_b8 v[0:1], v0
-; GFX12-SDAG-TRUE16-NEXT: s_endpgm
-;
-; GFX12-SDAG-FAKE16-LABEL: flat_inst_salu_offset_64bit_12bit_neg_high_split0:
-; GFX12-SDAG-FAKE16: ; %bb.0:
-; GFX12-SDAG-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX12-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-SDAG-FAKE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x800000, s0
-; GFX12-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-SDAG-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
-; GFX12-SDAG-FAKE16-NEXT: flat_load_u8 v0, v[0:1] offset:-8384513 scope:SCOPE_SYS
-; GFX12-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: flat_store_b8 v[0:1], v0
-; GFX12-SDAG-FAKE16-NEXT: s_endpgm
+; GFX11-SDAG-LABEL: flat_inst_salu_offset_64bit_12bit_neg_high_split0:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0xfff, s0
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
+; GFX11-SDAG-NEXT: flat_load_u8 v0, v[0:1] glc dlc
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT: flat_store_b8 v[0:1], v0
+; GFX11-SDAG-NEXT: s_endpgm
+;
+; GFX12-SDAG-LABEL: flat_inst_salu_offset_64bit_12bit_neg_high_split0:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s1
+; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x800000, s0
+; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
+; GFX12-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:-8384513 scope:SCOPE_SYS
+; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-NEXT: flat_store_b8 v[0:1], v0
+; GFX12-SDAG-NEXT: s_endpgm
;
; GFX9-GISEL-LABEL: flat_inst_salu_offset_64bit_12bit_neg_high_split0:
; GFX9-GISEL: ; %bb.0:
@@ -5240,57 +3578,31 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_neg_high_split1(ptr
; GFX10-NEXT: flat_store_byte v[0:1], v0
; GFX10-NEXT: s_endpgm
;
-; GFX11-SDAG-TRUE16-LABEL: flat_inst_salu_offset_64bit_12bit_neg_high_split1:
-; GFX11-SDAG-TRUE16: ; %bb.0:
-; GFX11-SDAG-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-SDAG-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: v_mov_b32_e32 v1, s1
-; GFX11-SDAG-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x1000, s0
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT: flat_load_d16_u8 v0, v[0:1] glc dlc
-; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: flat_store_b8 v[0:1], v0
-; GFX11-SDAG-TRUE16-NEXT: s_endpgm
-;
-; GFX11-SDAG-FAKE16-LABEL: flat_inst_salu_offset_64bit_12bit_neg_high_split1:
-; GFX11-SDAG-FAKE16: ; %bb.0:
-; GFX11-SDAG-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-SDAG-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: v_mov_b32_e32 v1, s1
-; GFX11-SDAG-FAKE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x1000, s0
-; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
-; GFX11-SDAG-FAKE16-NEXT: flat_load_u8 v0, v[0:1] glc dlc
-; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: flat_store_b8 v[0:1], v0
-; GFX11-SDAG-FAKE16-NEXT: s_endpgm
-;
-; GFX12-SDAG-TRUE16-LABEL: flat_inst_salu_offset_64bit_12bit_neg_high_split1:
-; GFX12-SDAG-TRUE16: ; %bb.0:
-; GFX12-SDAG-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-SDAG-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x800000, s0
-; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-SDAG-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
-; GFX12-SDAG-TRUE16-NEXT: flat_load_d16_u8 v0, v[0:1] offset:-8384512 scope:SCOPE_SYS
-; GFX12-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: flat_store_b8 v[0:1], v0
-; GFX12-SDAG-TRUE16-NEXT: s_endpgm
-;
-; GFX12-SDAG-FAKE16-LABEL: flat_inst_salu_offset_64bit_12bit_neg_high_split1:
-; GFX12-SDAG-FAKE16: ; %bb.0:
-; GFX12-SDAG-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX12-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-SDAG-FAKE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x800000, s0
-; GFX12-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-SDAG-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
-; GFX12-SDAG-FAKE16-NEXT: flat_load_u8 v0, v[0:1] offset:-8384512 scope:SCOPE_SYS
-; GFX12-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: flat_store_b8 v[0:1], v0
-; GFX12-SDAG-FAKE16-NEXT: s_endpgm
+; GFX11-SDAG-LABEL: flat_inst_salu_offset_64bit_12bit_neg_high_split1:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x1000, s0
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
+; GFX11-SDAG-NEXT: flat_load_u8 v0, v[0:1] glc dlc
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT: flat_store_b8 v[0:1], v0
+; GFX11-SDAG-NEXT: s_endpgm
+;
+; GFX12-SDAG-LABEL: flat_inst_salu_offset_64bit_12bit_neg_high_split1:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s1
+; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x800000, s0
+; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
+; GFX12-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:-8384512 scope:SCOPE_SYS
+; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-NEXT: flat_store_b8 v[0:1], v0
+; GFX12-SDAG-NEXT: s_endpgm
;
; GFX9-GISEL-LABEL: flat_inst_salu_offset_64bit_12bit_neg_high_split1:
; GFX9-GISEL: ; %bb.0:
@@ -5365,57 +3677,31 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_neg_high_split0(ptr
; GFX10-NEXT: flat_store_byte v[0:1], v0
; GFX10-NEXT: s_endpgm
;
-; GFX11-SDAG-TRUE16-LABEL: flat_inst_salu_offset_64bit_13bit_neg_high_split0:
-; GFX11-SDAG-TRUE16: ; %bb.0:
-; GFX11-SDAG-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-SDAG-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: v_mov_b32_e32 v1, s1
-; GFX11-SDAG-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x1fff, s0
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT: flat_load_d16_u8 v0, v[0:1] glc dlc
-; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: flat_store_b8 v[0:1], v0
-; GFX11-SDAG-TRUE16-NEXT: s_endpgm
-;
-; GFX11-SDAG-FAKE16-LABEL: flat_inst_salu_offset_64bit_13bit_neg_high_split0:
-; GFX11-SDAG-FAKE16: ; %bb.0:
-; GFX11-SDAG-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-SDAG-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: v_mov_b32_e32 v1, s1
-; GFX11-SDAG-FAKE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x1fff, s0
-; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
-; GFX11-SDAG-FAKE16-NEXT: flat_load_u8 v0, v[0:1] glc dlc
-; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: flat_store_b8 v[0:1], v0
-; GFX11-SDAG-FAKE16-NEXT: s_endpgm
-;
-; GFX12-SDAG-TRUE16-LABEL: flat_inst_salu_offset_64bit_13bit_neg_high_split0:
-; GFX12-SDAG-TRUE16: ; %bb.0:
-; GFX12-SDAG-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-SDAG-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x800000, s0
-; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-SDAG-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
-; GFX12-SDAG-TRUE16-NEXT: flat_load_d16_u8 v0, v[0:1] offset:-8380417 scope:SCOPE_SYS
-; GFX12-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: flat_store_b8 v[0:1], v0
-; GFX12-SDAG-TRUE16-NEXT: s_endpgm
-;
-; GFX12-SDAG-FAKE16-LABEL: flat_inst_salu_offset_64bit_13bit_neg_high_split0:
-; GFX12-SDAG-FAKE16: ; %bb.0:
-; GFX12-SDAG-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX12-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-SDAG-FAKE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x800000, s0
-; GFX12-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-SDAG-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
-; GFX12-SDAG-FAKE16-NEXT: flat_load_u8 v0, v[0:1] offset:-8380417 scope:SCOPE_SYS
-; GFX12-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: flat_store_b8 v[0:1], v0
-; GFX12-SDAG-FAKE16-NEXT: s_endpgm
+; GFX11-SDAG-LABEL: flat_inst_salu_offset_64bit_13bit_neg_high_split0:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x1fff, s0
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
+; GFX11-SDAG-NEXT: flat_load_u8 v0, v[0:1] glc dlc
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT: flat_store_b8 v[0:1], v0
+; GFX11-SDAG-NEXT: s_endpgm
+;
+; GFX12-SDAG-LABEL: flat_inst_salu_offset_64bit_13bit_neg_high_split0:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s1
+; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x800000, s0
+; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
+; GFX12-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:-8380417 scope:SCOPE_SYS
+; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-NEXT: flat_store_b8 v[0:1], v0
+; GFX12-SDAG-NEXT: s_endpgm
;
; GFX9-GISEL-LABEL: flat_inst_salu_offset_64bit_13bit_neg_high_split0:
; GFX9-GISEL: ; %bb.0:
@@ -5490,57 +3776,31 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_neg_high_split1(ptr
; GFX10-NEXT: flat_store_byte v[0:1], v0
; GFX10-NEXT: s_endpgm
;
-; GFX11-SDAG-TRUE16-LABEL: flat_inst_salu_offset_64bit_13bit_neg_high_split1:
-; GFX11-SDAG-TRUE16: ; %bb.0:
-; GFX11-SDAG-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-SDAG-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: v_mov_b32_e32 v1, s1
-; GFX11-SDAG-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x2000, s0
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT: flat_load_d16_u8 v0, v[0:1] glc dlc
-; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: flat_store_b8 v[0:1], v0
-; GFX11-SDAG-TRUE16-NEXT: s_endpgm
-;
-; GFX11-SDAG-FAKE16-LABEL: flat_inst_salu_offset_64bit_13bit_neg_high_split1:
-; GFX11-SDAG-FAKE16: ; %bb.0:
-; GFX11-SDAG-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-SDAG-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: v_mov_b32_e32 v1, s1
-; GFX11-SDAG-FAKE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x2000, s0
-; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
-; GFX11-SDAG-FAKE16-NEXT: flat_load_u8 v0, v[0:1] glc dlc
-; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: flat_store_b8 v[0:1], v0
-; GFX11-SDAG-FAKE16-NEXT: s_endpgm
-;
-; GFX12-SDAG-TRUE16-LABEL: flat_inst_salu_offset_64bit_13bit_neg_high_split1:
-; GFX12-SDAG-TRUE16: ; %bb.0:
-; GFX12-SDAG-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-SDAG-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x800000, s0
-; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-SDAG-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
-; GFX12-SDAG-TRUE16-NEXT: flat_load_d16_u8 v0, v[0:1] offset:-8380416 scope:SCOPE_SYS
-; GFX12-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: flat_store_b8 v[0:1], v0
-; GFX12-SDAG-TRUE16-NEXT: s_endpgm
-;
-; GFX12-SDAG-FAKE16-LABEL: flat_inst_salu_offset_64bit_13bit_neg_high_split1:
-; GFX12-SDAG-FAKE16: ; %bb.0:
-; GFX12-SDAG-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX12-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-SDAG-FAKE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x800000, s0
-; GFX12-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-SDAG-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
-; GFX12-SDAG-FAKE16-NEXT: flat_load_u8 v0, v[0:1] offset:-8380416 scope:SCOPE_SYS
-; GFX12-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: flat_store_b8 v[0:1], v0
-; GFX12-SDAG-FAKE16-NEXT: s_endpgm
+; GFX11-SDAG-LABEL: flat_inst_salu_offset_64bit_13bit_neg_high_split1:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x2000, s0
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
+; GFX11-SDAG-NEXT: flat_load_u8 v0, v[0:1] glc dlc
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT: flat_store_b8 v[0:1], v0
+; GFX11-SDAG-NEXT: s_endpgm
+;
+; GFX12-SDAG-LABEL: flat_inst_salu_offset_64bit_13bit_neg_high_split1:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s1
+; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x800000, s0
+; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
+; GFX12-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:-8380416 scope:SCOPE_SYS
+; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-NEXT: flat_store_b8 v[0:1], v0
+; GFX12-SDAG-NEXT: s_endpgm
;
; GFX9-GISEL-LABEL: flat_inst_salu_offset_64bit_13bit_neg_high_split1:
; GFX9-GISEL: ; %bb.0:
@@ -5588,11 +3848,11 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_neg_high_split1(ptr
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; GFX10-GISEL: {{.*}}
; GFX10-SDAG: {{.*}}
-; GFX11: {{.*}}
; GFX11-GISEL-FAKE16: {{.*}}
; GFX11-GISEL-TRUE16: {{.*}}
-; GFX11-SDAG: {{.*}}
-; GFX12: {{.*}}
+; GFX11-SDAG-FAKE16: {{.*}}
+; GFX11-SDAG-TRUE16: {{.*}}
; GFX12-GISEL-FAKE16: {{.*}}
; GFX12-GISEL-TRUE16: {{.*}}
-; GFX12-SDAG: {{.*}}
+; GFX12-SDAG-FAKE16: {{.*}}
+; GFX12-SDAG-TRUE16: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/offset-split-global.ll b/llvm/test/CodeGen/AMDGPU/offset-split-global.ll
index 20916a9a51d9e..3c9cc1c9fc8c1 100644
--- a/llvm/test/CodeGen/AMDGPU/offset-split-global.ll
+++ b/llvm/test/CodeGen/AMDGPU/offset-split-global.ll
@@ -30,59 +30,23 @@ define i8 @global_inst_valu_offset_1(ptr addrspace(1) %p) {
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-GISEL-LABEL: global_inst_valu_offset_1:
-; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT: global_load_u8 v0, v[0:1], off offset:1
-; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX12-GISEL-LABEL: global_inst_valu_offset_1:
-; GFX12-GISEL: ; %bb.0:
-; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-GISEL-NEXT: s_wait_expcnt 0x0
-; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0
-; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0
-; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: global_load_u8 v0, v[0:1], off offset:1
-; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
-; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-SDAG-TRUE16-LABEL: global_inst_valu_offset_1:
-; GFX11-SDAG-TRUE16: ; %bb.0:
-; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: global_load_d16_u8 v0, v[0:1], off offset:1
-; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-SDAG-FAKE16-LABEL: global_inst_valu_offset_1:
-; GFX11-SDAG-FAKE16: ; %bb.0:
-; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: global_load_u8 v0, v[0:1], off offset:1
-; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX12-SDAG-TRUE16-LABEL: global_inst_valu_offset_1:
-; GFX12-SDAG-TRUE16: ; %bb.0:
-; GFX12-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_expcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_samplecnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_bvhcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: global_load_d16_u8 v0, v[0:1], off offset:1
-; GFX12-SDAG-TRUE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX12-SDAG-FAKE16-LABEL: global_inst_valu_offset_1:
-; GFX12-SDAG-FAKE16: ; %bb.0:
-; GFX12-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_expcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_samplecnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_bvhcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: global_load_u8 v0, v[0:1], off offset:1
-; GFX12-SDAG-FAKE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-LABEL: global_inst_valu_offset_1:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: global_load_u8 v0, v[0:1], off offset:1
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: global_inst_valu_offset_1:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: global_load_u8 v0, v[0:1], off offset:1
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i8, ptr addrspace(1) %p, i64 1
%load = load i8, ptr addrspace(1) %gep, align 4
ret i8 %load
@@ -103,59 +67,23 @@ define i8 @global_inst_valu_offset_11bit_max(ptr addrspace(1) %p) {
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-GISEL-LABEL: global_inst_valu_offset_11bit_max:
-; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT: global_load_u8 v0, v[0:1], off offset:2047
-; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX12-GISEL-LABEL: global_inst_valu_offset_11bit_max:
-; GFX12-GISEL: ; %bb.0:
-; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-GISEL-NEXT: s_wait_expcnt 0x0
-; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0
-; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0
-; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: global_load_u8 v0, v[0:1], off offset:2047
-; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
-; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-SDAG-TRUE16-LABEL: global_inst_valu_offset_11bit_max:
-; GFX11-SDAG-TRUE16: ; %bb.0:
-; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: global_load_d16_u8 v0, v[0:1], off offset:2047
-; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-SDAG-FAKE16-LABEL: global_inst_valu_offset_11bit_max:
-; GFX11-SDAG-FAKE16: ; %bb.0:
-; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: global_load_u8 v0, v[0:1], off offset:2047
-; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX12-SDAG-TRUE16-LABEL: global_inst_valu_offset_11bit_max:
-; GFX12-SDAG-TRUE16: ; %bb.0:
-; GFX12-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_expcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_samplecnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_bvhcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: global_load_d16_u8 v0, v[0:1], off offset:2047
-; GFX12-SDAG-TRUE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX12-SDAG-FAKE16-LABEL: global_inst_valu_offset_11bit_max:
-; GFX12-SDAG-FAKE16: ; %bb.0:
-; GFX12-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_expcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_samplecnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_bvhcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: global_load_u8 v0, v[0:1], off offset:2047
-; GFX12-SDAG-FAKE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-LABEL: global_inst_valu_offset_11bit_max:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: global_load_u8 v0, v[0:1], off offset:2047
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: global_inst_valu_offset_11bit_max:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: global_load_u8 v0, v[0:1], off offset:2047
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i8, ptr addrspace(1) %p, i64 2047
%load = load i8, ptr addrspace(1) %gep, align 4
ret i8 %load
@@ -178,23 +106,23 @@ define i8 @global_inst_valu_offset_12bit_max(ptr addrspace(1) %p) {
; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-GISEL-LABEL: global_inst_valu_offset_12bit_max:
-; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT: global_load_u8 v0, v[0:1], off offset:4095
-; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX12-GISEL-LABEL: global_inst_valu_offset_12bit_max:
-; GFX12-GISEL: ; %bb.0:
-; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-GISEL-NEXT: s_wait_expcnt 0x0
-; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0
-; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0
-; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: global_load_u8 v0, v[0:1], off offset:4095
-; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
-; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
+; GFX11-LABEL: global_inst_valu_offset_12bit_max:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: global_load_u8 v0, v[0:1], off offset:4095
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: global_inst_valu_offset_12bit_max:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: global_load_u8 v0, v[0:1], off offset:4095
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-SDAG-LABEL: global_inst_valu_offset_12bit_max:
; GFX10-SDAG: ; %bb.0:
@@ -204,42 +132,6 @@ define i8 @global_inst_valu_offset_12bit_max(ptr addrspace(1) %p) {
; GFX10-SDAG-NEXT: global_load_ubyte v0, v[0:1], off offset:2047
; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-SDAG-TRUE16-LABEL: global_inst_valu_offset_12bit_max:
-; GFX11-SDAG-TRUE16: ; %bb.0:
-; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: global_load_d16_u8 v0, v[0:1], off offset:4095
-; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-SDAG-FAKE16-LABEL: global_inst_valu_offset_12bit_max:
-; GFX11-SDAG-FAKE16: ; %bb.0:
-; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: global_load_u8 v0, v[0:1], off offset:4095
-; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX12-SDAG-TRUE16-LABEL: global_inst_valu_offset_12bit_max:
-; GFX12-SDAG-TRUE16: ; %bb.0:
-; GFX12-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_expcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_samplecnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_bvhcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: global_load_d16_u8 v0, v[0:1], off offset:4095
-; GFX12-SDAG-TRUE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX12-SDAG-FAKE16-LABEL: global_inst_valu_offset_12bit_max:
-; GFX12-SDAG-FAKE16: ; %bb.0:
-; GFX12-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_expcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_samplecnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_bvhcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: global_load_u8 v0, v[0:1], off offset:4095
-; GFX12-SDAG-FAKE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i8, ptr addrspace(1) %p, i64 4095
%load = load i8, ptr addrspace(1) %gep, align 4
ret i8 %load
@@ -274,16 +166,16 @@ define i8 @global_inst_valu_offset_13bit_max(ptr addrspace(1) %p) {
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
;
-; GFX12-GISEL-LABEL: global_inst_valu_offset_13bit_max:
-; GFX12-GISEL: ; %bb.0:
-; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-GISEL-NEXT: s_wait_expcnt 0x0
-; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0
-; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0
-; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: global_load_u8 v0, v[0:1], off offset:8191
-; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
-; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
+; GFX12-LABEL: global_inst_valu_offset_13bit_max:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: global_load_u8 v0, v[0:1], off offset:8191
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-SDAG-LABEL: global_inst_valu_offset_13bit_max:
; GFX9-SDAG: ; %bb.0:
@@ -303,47 +195,15 @@ define i8 @global_inst_valu_offset_13bit_max(ptr addrspace(1) %p) {
; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-SDAG-TRUE16-LABEL: global_inst_valu_offset_13bit_max:
-; GFX11-SDAG-TRUE16: ; %bb.0:
-; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x1000, v0
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT: global_load_d16_u8 v0, v[0:1], off offset:4095
-; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-SDAG-FAKE16-LABEL: global_inst_valu_offset_13bit_max:
-; GFX11-SDAG-FAKE16: ; %bb.0:
-; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x1000, v0
-; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX11-SDAG-FAKE16-NEXT: global_load_u8 v0, v[0:1], off offset:4095
-; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX12-SDAG-TRUE16-LABEL: global_inst_valu_offset_13bit_max:
-; GFX12-SDAG-TRUE16: ; %bb.0:
-; GFX12-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_expcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_samplecnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_bvhcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: global_load_d16_u8 v0, v[0:1], off offset:8191
-; GFX12-SDAG-TRUE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX12-SDAG-FAKE16-LABEL: global_inst_valu_offset_13bit_max:
-; GFX12-SDAG-FAKE16: ; %bb.0:
-; GFX12-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_expcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_samplecnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_bvhcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: global_load_u8 v0, v[0:1], off offset:8191
-; GFX12-SDAG-FAKE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-SDAG-LABEL: global_inst_valu_offset_13bit_max:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x1000, v0
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX11-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:4095
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i8, ptr addrspace(1) %p, i64 8191
%load = load i8, ptr addrspace(1) %gep, align 4
ret i8 %load
@@ -378,16 +238,16 @@ define i8 @global_inst_valu_offset_24bit_max(ptr addrspace(1) %p) {
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
;
-; GFX12-GISEL-LABEL: global_inst_valu_offset_24bit_max:
-; GFX12-GISEL: ; %bb.0:
-; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-GISEL-NEXT: s_wait_expcnt 0x0
-; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0
-; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0
-; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: global_load_u8 v0, v[0:1], off offset:8388607
-; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
-; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
+; GFX12-LABEL: global_inst_valu_offset_24bit_max:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: global_load_u8 v0, v[0:1], off offset:8388607
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-SDAG-LABEL: global_inst_valu_offset_24bit_max:
; GFX9-SDAG: ; %bb.0:
@@ -407,47 +267,15 @@ define i8 @global_inst_valu_offset_24bit_max(ptr addrspace(1) %p) {
; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-SDAG-TRUE16-LABEL: global_inst_valu_offset_24bit_max:
-; GFX11-SDAG-TRUE16: ; %bb.0:
-; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x7ff000, v0
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT: global_load_d16_u8 v0, v[0:1], off offset:4095
-; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-SDAG-FAKE16-LABEL: global_inst_valu_offset_24bit_max:
-; GFX11-SDAG-FAKE16: ; %bb.0:
-; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x7ff000, v0
-; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX11-SDAG-FAKE16-NEXT: global_load_u8 v0, v[0:1], off offset:4095
-; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX12-SDAG-TRUE16-LABEL: global_inst_valu_offset_24bit_max:
-; GFX12-SDAG-TRUE16: ; %bb.0:
-; GFX12-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_expcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_samplecnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_bvhcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: global_load_d16_u8 v0, v[0:1], off offset:8388607
-; GFX12-SDAG-TRUE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX12-SDAG-FAKE16-LABEL: global_inst_valu_offset_24bit_max:
-; GFX12-SDAG-FAKE16: ; %bb.0:
-; GFX12-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_expcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_samplecnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_bvhcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: global_load_u8 v0, v[0:1], off offset:8388607
-; GFX12-SDAG-FAKE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-SDAG-LABEL: global_inst_valu_offset_24bit_max:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x7ff000, v0
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX11-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:4095
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i8, ptr addrspace(1) %p, i64 8388607
%load = load i8, ptr addrspace(1) %gep, align 4
ret i8 %load
@@ -468,59 +296,23 @@ define i8 @global_inst_valu_offset_neg_11bit_max(ptr addrspace(1) %p) {
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-GISEL-LABEL: global_inst_valu_offset_neg_11bit_max:
-; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT: global_load_u8 v0, v[0:1], off offset:-2048
-; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX12-GISEL-LABEL: global_inst_valu_offset_neg_11bit_max:
-; GFX12-GISEL: ; %bb.0:
-; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-GISEL-NEXT: s_wait_expcnt 0x0
-; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0
-; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0
-; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: global_load_u8 v0, v[0:1], off offset:-2048
-; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
-; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-SDAG-TRUE16-LABEL: global_inst_valu_offset_neg_11bit_max:
-; GFX11-SDAG-TRUE16: ; %bb.0:
-; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: global_load_d16_u8 v0, v[0:1], off offset:-2048
-; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-SDAG-FAKE16-LABEL: global_inst_valu_offset_neg_11bit_max:
-; GFX11-SDAG-FAKE16: ; %bb.0:
-; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: global_load_u8 v0, v[0:1], off offset:-2048
-; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX12-SDAG-TRUE16-LABEL: global_inst_valu_offset_neg_11bit_max:
-; GFX12-SDAG-TRUE16: ; %bb.0:
-; GFX12-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_expcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_samplecnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_bvhcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: global_load_d16_u8 v0, v[0:1], off offset:-2048
-; GFX12-SDAG-TRUE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX12-SDAG-FAKE16-LABEL: global_inst_valu_offset_neg_11bit_max:
-; GFX12-SDAG-FAKE16: ; %bb.0:
-; GFX12-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_expcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_samplecnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_bvhcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: global_load_u8 v0, v[0:1], off offset:-2048
-; GFX12-SDAG-FAKE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-LABEL: global_inst_valu_offset_neg_11bit_max:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: global_load_u8 v0, v[0:1], off offset:-2048
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: global_inst_valu_offset_neg_11bit_max:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: global_load_u8 v0, v[0:1], off offset:-2048
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i8, ptr addrspace(1) %p, i64 -2048
%load = load i8, ptr addrspace(1) %gep, align 4
ret i8 %load
@@ -543,59 +335,23 @@ define i8 @global_inst_valu_offset_neg_12bit_max(ptr addrspace(1) %p) {
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-GISEL-LABEL: global_inst_valu_offset_neg_12bit_max:
-; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT: global_load_u8 v0, v[0:1], off offset:-4096
-; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX12-GISEL-LABEL: global_inst_valu_offset_neg_12bit_max:
-; GFX12-GISEL: ; %bb.0:
-; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-GISEL-NEXT: s_wait_expcnt 0x0
-; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0
-; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0
-; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: global_load_u8 v0, v[0:1], off offset:-4096
-; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
-; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-SDAG-TRUE16-LABEL: global_inst_valu_offset_neg_12bit_max:
-; GFX11-SDAG-TRUE16: ; %bb.0:
-; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: global_load_d16_u8 v0, v[0:1], off offset:-4096
-; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-SDAG-FAKE16-LABEL: global_inst_valu_offset_neg_12bit_max:
-; GFX11-SDAG-FAKE16: ; %bb.0:
-; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: global_load_u8 v0, v[0:1], off offset:-4096
-; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX12-SDAG-TRUE16-LABEL: global_inst_valu_offset_neg_12bit_max:
-; GFX12-SDAG-TRUE16: ; %bb.0:
-; GFX12-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_expcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_samplecnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_bvhcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: global_load_d16_u8 v0, v[0:1], off offset:-4096
-; GFX12-SDAG-TRUE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX12-SDAG-FAKE16-LABEL: global_inst_valu_offset_neg_12bit_max:
-; GFX12-SDAG-FAKE16: ; %bb.0:
-; GFX12-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_expcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_samplecnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_bvhcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: global_load_u8 v0, v[0:1], off offset:-4096
-; GFX12-SDAG-FAKE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-LABEL: global_inst_valu_offset_neg_12bit_max:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: global_load_u8 v0, v[0:1], off offset:-4096
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: global_inst_valu_offset_neg_12bit_max:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: global_load_u8 v0, v[0:1], off offset:-4096
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i8, ptr addrspace(1) %p, i64 -4096
%load = load i8, ptr addrspace(1) %gep, align 4
ret i8 %load
@@ -620,68 +376,26 @@ define i8 @global_inst_valu_offset_neg_13bit_max(ptr addrspace(1) %p) {
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-GISEL-LABEL: global_inst_valu_offset_neg_13bit_max:
-; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, 0xffffe000, v0
-; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
-; GFX11-GISEL-NEXT: global_load_u8 v0, v[0:1], off
-; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX12-GISEL-LABEL: global_inst_valu_offset_neg_13bit_max:
-; GFX12-GISEL: ; %bb.0:
-; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-GISEL-NEXT: s_wait_expcnt 0x0
-; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0
-; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0
-; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: global_load_u8 v0, v[0:1], off offset:-8192
-; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
-; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-SDAG-TRUE16-LABEL: global_inst_valu_offset_neg_13bit_max:
-; GFX11-SDAG-TRUE16: ; %bb.0:
-; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0xffffe000, v0
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT: global_load_d16_u8 v0, v[0:1], off
-; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-SDAG-FAKE16-LABEL: global_inst_valu_offset_neg_13bit_max:
-; GFX11-SDAG-FAKE16: ; %bb.0:
-; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: v_add_co_u32 v0, vcc_lo, 0xffffe000, v0
-; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
-; GFX11-SDAG-FAKE16-NEXT: global_load_u8 v0, v[0:1], off
-; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX12-SDAG-TRUE16-LABEL: global_inst_valu_offset_neg_13bit_max:
-; GFX12-SDAG-TRUE16: ; %bb.0:
-; GFX12-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_expcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_samplecnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_bvhcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: global_load_d16_u8 v0, v[0:1], off offset:-8192
-; GFX12-SDAG-TRUE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX12-SDAG-FAKE16-LABEL: global_inst_valu_offset_neg_13bit_max:
-; GFX12-SDAG-FAKE16: ; %bb.0:
-; GFX12-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_expcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_samplecnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_bvhcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: global_load_u8 v0, v[0:1], off offset:-8192
-; GFX12-SDAG-FAKE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-LABEL: global_inst_valu_offset_neg_13bit_max:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0xffffe000, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
+; GFX11-NEXT: global_load_u8 v0, v[0:1], off
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: global_inst_valu_offset_neg_13bit_max:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: global_load_u8 v0, v[0:1], off offset:-8192
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i8, ptr addrspace(1) %p, i64 -8192
%load = load i8, ptr addrspace(1) %gep, align 4
ret i8 %load
@@ -706,68 +420,26 @@ define i8 @global_inst_valu_offset_neg_24bit_max(ptr addrspace(1) %p) {
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-GISEL-LABEL: global_inst_valu_offset_neg_24bit_max:
-; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, 0xff800000, v0
-; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
-; GFX11-GISEL-NEXT: global_load_u8 v0, v[0:1], off
-; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX12-GISEL-LABEL: global_inst_valu_offset_neg_24bit_max:
-; GFX12-GISEL: ; %bb.0:
-; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-GISEL-NEXT: s_wait_expcnt 0x0
-; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0
-; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0
-; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: global_load_u8 v0, v[0:1], off offset:-8388608
-; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
-; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-SDAG-TRUE16-LABEL: global_inst_valu_offset_neg_24bit_max:
-; GFX11-SDAG-TRUE16: ; %bb.0:
-; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0xff800000, v0
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT: global_load_d16_u8 v0, v[0:1], off
-; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-SDAG-FAKE16-LABEL: global_inst_valu_offset_neg_24bit_max:
-; GFX11-SDAG-FAKE16: ; %bb.0:
-; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: v_add_co_u32 v0, vcc_lo, 0xff800000, v0
-; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
-; GFX11-SDAG-FAKE16-NEXT: global_load_u8 v0, v[0:1], off
-; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX12-SDAG-TRUE16-LABEL: global_inst_valu_offset_neg_24bit_max:
-; GFX12-SDAG-TRUE16: ; %bb.0:
-; GFX12-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_expcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_samplecnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_bvhcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: global_load_d16_u8 v0, v[0:1], off offset:-8388608
-; GFX12-SDAG-TRUE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX12-SDAG-FAKE16-LABEL: global_inst_valu_offset_neg_24bit_max:
-; GFX12-SDAG-FAKE16: ; %bb.0:
-; GFX12-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_expcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_samplecnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_bvhcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: global_load_u8 v0, v[0:1], off offset:-8388608
-; GFX12-SDAG-FAKE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-LABEL: global_inst_valu_offset_neg_24bit_max:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0xff800000, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
+; GFX11-NEXT: global_load_u8 v0, v[0:1], off
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: global_inst_valu_offset_neg_24bit_max:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: global_load_u8 v0, v[0:1], off offset:-8388608
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i8, ptr addrspace(1) %p, i64 -8388608
%load = load i8, ptr addrspace(1) %gep, align 4
ret i8 %load
@@ -790,23 +462,23 @@ define i8 @global_inst_valu_offset_2x_11bit_max(ptr addrspace(1) %p) {
; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-GISEL-LABEL: global_inst_valu_offset_2x_11bit_max:
-; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT: global_load_u8 v0, v[0:1], off offset:4095
-; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX12-GISEL-LABEL: global_inst_valu_offset_2x_11bit_max:
-; GFX12-GISEL: ; %bb.0:
-; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-GISEL-NEXT: s_wait_expcnt 0x0
-; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0
-; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0
-; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: global_load_u8 v0, v[0:1], off offset:4095
-; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
-; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
+; GFX11-LABEL: global_inst_valu_offset_2x_11bit_max:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: global_load_u8 v0, v[0:1], off offset:4095
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: global_inst_valu_offset_2x_11bit_max:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: global_load_u8 v0, v[0:1], off offset:4095
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-SDAG-LABEL: global_inst_valu_offset_2x_11bit_max:
; GFX10-SDAG: ; %bb.0:
@@ -816,42 +488,6 @@ define i8 @global_inst_valu_offset_2x_11bit_max(ptr addrspace(1) %p) {
; GFX10-SDAG-NEXT: global_load_ubyte v0, v[0:1], off offset:2047
; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-SDAG-TRUE16-LABEL: global_inst_valu_offset_2x_11bit_max:
-; GFX11-SDAG-TRUE16: ; %bb.0:
-; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: global_load_d16_u8 v0, v[0:1], off offset:4095
-; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-SDAG-FAKE16-LABEL: global_inst_valu_offset_2x_11bit_max:
-; GFX11-SDAG-FAKE16: ; %bb.0:
-; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: global_load_u8 v0, v[0:1], off offset:4095
-; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX12-SDAG-TRUE16-LABEL: global_inst_valu_offset_2x_11bit_max:
-; GFX12-SDAG-TRUE16: ; %bb.0:
-; GFX12-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_expcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_samplecnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_bvhcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: global_load_d16_u8 v0, v[0:1], off offset:4095
-; GFX12-SDAG-TRUE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX12-SDAG-FAKE16-LABEL: global_inst_valu_offset_2x_11bit_max:
-; GFX12-SDAG-FAKE16: ; %bb.0:
-; GFX12-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_expcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_samplecnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_bvhcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: global_load_u8 v0, v[0:1], off offset:4095
-; GFX12-SDAG-FAKE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i8, ptr addrspace(1) %p, i64 4095
%load = load i8, ptr addrspace(1) %gep, align 4
ret i8 %load
@@ -886,16 +522,16 @@ define i8 @global_inst_valu_offset_2x_12bit_max(ptr addrspace(1) %p) {
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
;
-; GFX12-GISEL-LABEL: global_inst_valu_offset_2x_12bit_max:
-; GFX12-GISEL: ; %bb.0:
-; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-GISEL-NEXT: s_wait_expcnt 0x0
-; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0
-; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0
-; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: global_load_u8 v0, v[0:1], off offset:8191
-; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
-; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
+; GFX12-LABEL: global_inst_valu_offset_2x_12bit_max:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: global_load_u8 v0, v[0:1], off offset:8191
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-SDAG-LABEL: global_inst_valu_offset_2x_12bit_max:
; GFX9-SDAG: ; %bb.0:
@@ -915,47 +551,15 @@ define i8 @global_inst_valu_offset_2x_12bit_max(ptr addrspace(1) %p) {
; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-SDAG-TRUE16-LABEL: global_inst_valu_offset_2x_12bit_max:
-; GFX11-SDAG-TRUE16: ; %bb.0:
-; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x1000, v0
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT: global_load_d16_u8 v0, v[0:1], off offset:4095
-; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-SDAG-FAKE16-LABEL: global_inst_valu_offset_2x_12bit_max:
-; GFX11-SDAG-FAKE16: ; %bb.0:
-; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x1000, v0
-; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX11-SDAG-FAKE16-NEXT: global_load_u8 v0, v[0:1], off offset:4095
-; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX12-SDAG-TRUE16-LABEL: global_inst_valu_offset_2x_12bit_max:
-; GFX12-SDAG-TRUE16: ; %bb.0:
-; GFX12-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_expcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_samplecnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_bvhcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: global_load_d16_u8 v0, v[0:1], off offset:8191
-; GFX12-SDAG-TRUE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX12-SDAG-FAKE16-LABEL: global_inst_valu_offset_2x_12bit_max:
-; GFX12-SDAG-FAKE16: ; %bb.0:
-; GFX12-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_expcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_samplecnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_bvhcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: global_load_u8 v0, v[0:1], off offset:8191
-; GFX12-SDAG-FAKE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-SDAG-LABEL: global_inst_valu_offset_2x_12bit_max:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x1000, v0
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX11-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:4095
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i8, ptr addrspace(1) %p, i64 8191
%load = load i8, ptr addrspace(1) %gep, align 4
ret i8 %load
@@ -990,16 +594,16 @@ define i8 @global_inst_valu_offset_2x_13bit_max(ptr addrspace(1) %p) {
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
;
-; GFX12-GISEL-LABEL: global_inst_valu_offset_2x_13bit_max:
-; GFX12-GISEL: ; %bb.0:
-; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-GISEL-NEXT: s_wait_expcnt 0x0
-; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0
-; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0
-; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: global_load_u8 v0, v[0:1], off offset:16383
-; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
-; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
+; GFX12-LABEL: global_inst_valu_offset_2x_13bit_max:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: global_load_u8 v0, v[0:1], off offset:16383
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-SDAG-LABEL: global_inst_valu_offset_2x_13bit_max:
; GFX9-SDAG: ; %bb.0:
@@ -1019,47 +623,15 @@ define i8 @global_inst_valu_offset_2x_13bit_max(ptr addrspace(1) %p) {
; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-SDAG-TRUE16-LABEL: global_inst_valu_offset_2x_13bit_max:
-; GFX11-SDAG-TRUE16: ; %bb.0:
-; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x3000, v0
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT: global_load_d16_u8 v0, v[0:1], off offset:4095
-; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-SDAG-FAKE16-LABEL: global_inst_valu_offset_2x_13bit_max:
-; GFX11-SDAG-FAKE16: ; %bb.0:
-; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x3000, v0
-; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX11-SDAG-FAKE16-NEXT: global_load_u8 v0, v[0:1], off offset:4095
-; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX12-SDAG-TRUE16-LABEL: global_inst_valu_offset_2x_13bit_max:
-; GFX12-SDAG-TRUE16: ; %bb.0:
-; GFX12-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_expcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_samplecnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_bvhcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: global_load_d16_u8 v0, v[0:1], off offset:16383
-; GFX12-SDAG-TRUE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX12-SDAG-FAKE16-LABEL: global_inst_valu_offset_2x_13bit_max:
-; GFX12-SDAG-FAKE16: ; %bb.0:
-; GFX12-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_expcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_samplecnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_bvhcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: global_load_u8 v0, v[0:1], off offset:16383
-; GFX12-SDAG-FAKE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-SDAG-LABEL: global_inst_valu_offset_2x_13bit_max:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x3000, v0
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX11-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:4095
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i8, ptr addrspace(1) %p, i64 16383
%load = load i8, ptr addrspace(1) %gep, align 4
ret i8 %load
@@ -1126,53 +698,29 @@ define i8 @global_inst_valu_offset_2x_24bit_max(ptr addrspace(1) %p) {
; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-SDAG-TRUE16-LABEL: global_inst_valu_offset_2x_24bit_max:
-; GFX11-SDAG-TRUE16: ; %bb.0:
-; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0xfff000, v0
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT: global_load_d16_u8 v0, v[0:1], off offset:4094
-; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-SDAG-FAKE16-LABEL: global_inst_valu_offset_2x_24bit_max:
-; GFX11-SDAG-FAKE16: ; %bb.0:
-; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: v_add_co_u32 v0, vcc_lo, 0xfff000, v0
-; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX11-SDAG-FAKE16-NEXT: global_load_u8 v0, v[0:1], off offset:4094
-; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX12-SDAG-TRUE16-LABEL: global_inst_valu_offset_2x_24bit_max:
-; GFX12-SDAG-TRUE16: ; %bb.0:
-; GFX12-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_expcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_samplecnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_bvhcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x800000, v0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-SDAG-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX12-SDAG-TRUE16-NEXT: global_load_d16_u8 v0, v[0:1], off offset:8388606
-; GFX12-SDAG-TRUE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX12-SDAG-FAKE16-LABEL: global_inst_valu_offset_2x_24bit_max:
-; GFX12-SDAG-FAKE16: ; %bb.0:
-; GFX12-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_expcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_samplecnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_bvhcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x800000, v0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_alu 0xfffd
-; GFX12-SDAG-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX12-SDAG-FAKE16-NEXT: global_load_u8 v0, v[0:1], off offset:8388606
-; GFX12-SDAG-FAKE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-SDAG-LABEL: global_inst_valu_offset_2x_24bit_max:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0xfff000, v0
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX11-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:4094
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-LABEL: global_inst_valu_offset_2x_24bit_max:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-NEXT: s_wait_expcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0
+; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x800000, v0
+; GFX12-SDAG-NEXT: s_wait_alu 0xfffd
+; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX12-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:8388606
+; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
+; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i8, ptr addrspace(1) %p, i64 16777214
%load = load i8, ptr addrspace(1) %gep, align 4
ret i8 %load
@@ -1195,59 +743,23 @@ define i8 @global_inst_valu_offset_2x_neg_11bit_max(ptr addrspace(1) %p) {
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-GISEL-LABEL: global_inst_valu_offset_2x_neg_11bit_max:
-; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT: global_load_u8 v0, v[0:1], off offset:-4096
-; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX12-GISEL-LABEL: global_inst_valu_offset_2x_neg_11bit_max:
-; GFX12-GISEL: ; %bb.0:
-; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-GISEL-NEXT: s_wait_expcnt 0x0
-; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0
-; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0
-; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: global_load_u8 v0, v[0:1], off offset:-4096
-; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
-; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-SDAG-TRUE16-LABEL: global_inst_valu_offset_2x_neg_11bit_max:
-; GFX11-SDAG-TRUE16: ; %bb.0:
-; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: global_load_d16_u8 v0, v[0:1], off offset:-4096
-; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-SDAG-FAKE16-LABEL: global_inst_valu_offset_2x_neg_11bit_max:
-; GFX11-SDAG-FAKE16: ; %bb.0:
-; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: global_load_u8 v0, v[0:1], off offset:-4096
-; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX12-SDAG-TRUE16-LABEL: global_inst_valu_offset_2x_neg_11bit_max:
-; GFX12-SDAG-TRUE16: ; %bb.0:
-; GFX12-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_expcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_samplecnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_bvhcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: global_load_d16_u8 v0, v[0:1], off offset:-4096
-; GFX12-SDAG-TRUE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX12-SDAG-FAKE16-LABEL: global_inst_valu_offset_2x_neg_11bit_max:
-; GFX12-SDAG-FAKE16: ; %bb.0:
-; GFX12-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_expcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_samplecnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_bvhcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: global_load_u8 v0, v[0:1], off offset:-4096
-; GFX12-SDAG-FAKE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-LABEL: global_inst_valu_offset_2x_neg_11bit_max:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: global_load_u8 v0, v[0:1], off offset:-4096
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: global_inst_valu_offset_2x_neg_11bit_max:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: global_load_u8 v0, v[0:1], off offset:-4096
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i8, ptr addrspace(1) %p, i64 -4096
%load = load i8, ptr addrspace(1) %gep, align 4
ret i8 %load
@@ -1272,68 +784,26 @@ define i8 @global_inst_valu_offset_2x_neg_12bit_max(ptr addrspace(1) %p) {
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-GISEL-LABEL: global_inst_valu_offset_2x_neg_12bit_max:
-; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, 0xffffe000, v0
-; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
-; GFX11-GISEL-NEXT: global_load_u8 v0, v[0:1], off
-; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX12-GISEL-LABEL: global_inst_valu_offset_2x_neg_12bit_max:
-; GFX12-GISEL: ; %bb.0:
-; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-GISEL-NEXT: s_wait_expcnt 0x0
-; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0
-; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0
-; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: global_load_u8 v0, v[0:1], off offset:-8192
-; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
-; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-SDAG-TRUE16-LABEL: global_inst_valu_offset_2x_neg_12bit_max:
-; GFX11-SDAG-TRUE16: ; %bb.0:
-; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0xffffe000, v0
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT: global_load_d16_u8 v0, v[0:1], off
-; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-SDAG-FAKE16-LABEL: global_inst_valu_offset_2x_neg_12bit_max:
-; GFX11-SDAG-FAKE16: ; %bb.0:
-; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: v_add_co_u32 v0, vcc_lo, 0xffffe000, v0
-; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
-; GFX11-SDAG-FAKE16-NEXT: global_load_u8 v0, v[0:1], off
-; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX12-SDAG-TRUE16-LABEL: global_inst_valu_offset_2x_neg_12bit_max:
-; GFX12-SDAG-TRUE16: ; %bb.0:
-; GFX12-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_expcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_samplecnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_bvhcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: global_load_d16_u8 v0, v[0:1], off offset:-8192
-; GFX12-SDAG-TRUE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX12-SDAG-FAKE16-LABEL: global_inst_valu_offset_2x_neg_12bit_max:
-; GFX12-SDAG-FAKE16: ; %bb.0:
-; GFX12-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_expcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_samplecnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_bvhcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: global_load_u8 v0, v[0:1], off offset:-8192
-; GFX12-SDAG-FAKE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-LABEL: global_inst_valu_offset_2x_neg_12bit_max:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0xffffe000, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
+; GFX11-NEXT: global_load_u8 v0, v[0:1], off
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: global_inst_valu_offset_2x_neg_12bit_max:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: global_load_u8 v0, v[0:1], off offset:-8192
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i8, ptr addrspace(1) %p, i64 -8192
%load = load i8, ptr addrspace(1) %gep, align 4
ret i8 %load
@@ -1358,68 +828,26 @@ define i8 @global_inst_valu_offset_2x_neg_13bit_max(ptr addrspace(1) %p) {
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-GISEL-LABEL: global_inst_valu_offset_2x_neg_13bit_max:
-; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, 0xffffc000, v0
-; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
-; GFX11-GISEL-NEXT: global_load_u8 v0, v[0:1], off
-; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX12-GISEL-LABEL: global_inst_valu_offset_2x_neg_13bit_max:
-; GFX12-GISEL: ; %bb.0:
-; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-GISEL-NEXT: s_wait_expcnt 0x0
-; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0
-; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0
-; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: global_load_u8 v0, v[0:1], off offset:-16384
-; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
-; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-SDAG-TRUE16-LABEL: global_inst_valu_offset_2x_neg_13bit_max:
-; GFX11-SDAG-TRUE16: ; %bb.0:
-; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0xffffc000, v0
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT: global_load_d16_u8 v0, v[0:1], off
-; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-SDAG-FAKE16-LABEL: global_inst_valu_offset_2x_neg_13bit_max:
-; GFX11-SDAG-FAKE16: ; %bb.0:
-; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: v_add_co_u32 v0, vcc_lo, 0xffffc000, v0
-; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
-; GFX11-SDAG-FAKE16-NEXT: global_load_u8 v0, v[0:1], off
-; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX12-SDAG-TRUE16-LABEL: global_inst_valu_offset_2x_neg_13bit_max:
-; GFX12-SDAG-TRUE16: ; %bb.0:
-; GFX12-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_expcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_samplecnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_bvhcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: global_load_d16_u8 v0, v[0:1], off offset:-16384
-; GFX12-SDAG-TRUE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX12-SDAG-FAKE16-LABEL: global_inst_valu_offset_2x_neg_13bit_max:
-; GFX12-SDAG-FAKE16: ; %bb.0:
-; GFX12-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_expcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_samplecnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_bvhcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: global_load_u8 v0, v[0:1], off offset:-16384
-; GFX12-SDAG-FAKE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-LABEL: global_inst_valu_offset_2x_neg_13bit_max:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0xffffc000, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
+; GFX11-NEXT: global_load_u8 v0, v[0:1], off
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: global_inst_valu_offset_2x_neg_13bit_max:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: global_load_u8 v0, v[0:1], off offset:-16384
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i8, ptr addrspace(1) %p, i64 -16384
%load = load i8, ptr addrspace(1) %gep, align 4
ret i8 %load
@@ -1486,53 +914,29 @@ define i8 @global_inst_valu_offset_2x_neg_24bit_max(ptr addrspace(1) %p) {
; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-SDAG-TRUE16-LABEL: global_inst_valu_offset_2x_neg_24bit_max:
-; GFX11-SDAG-TRUE16: ; %bb.0:
-; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0xff001000, v0
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT: global_load_d16_u8 v0, v[0:1], off offset:-4095
-; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-SDAG-FAKE16-LABEL: global_inst_valu_offset_2x_neg_24bit_max:
-; GFX11-SDAG-FAKE16: ; %bb.0:
-; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: v_add_co_u32 v0, vcc_lo, 0xff001000, v0
-; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
-; GFX11-SDAG-FAKE16-NEXT: global_load_u8 v0, v[0:1], off offset:-4095
-; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX12-SDAG-TRUE16-LABEL: global_inst_valu_offset_2x_neg_24bit_max:
-; GFX12-SDAG-TRUE16: ; %bb.0:
-; GFX12-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_expcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_samplecnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_bvhcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0xff800000, v0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-SDAG-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
-; GFX12-SDAG-TRUE16-NEXT: global_load_d16_u8 v0, v[0:1], off offset:-8388607
-; GFX12-SDAG-TRUE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX12-SDAG-FAKE16-LABEL: global_inst_valu_offset_2x_neg_24bit_max:
-; GFX12-SDAG-FAKE16: ; %bb.0:
-; GFX12-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_expcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_samplecnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_bvhcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: v_add_co_u32 v0, vcc_lo, 0xff800000, v0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_alu 0xfffd
-; GFX12-SDAG-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
-; GFX12-SDAG-FAKE16-NEXT: global_load_u8 v0, v[0:1], off offset:-8388607
-; GFX12-SDAG-FAKE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-SDAG-LABEL: global_inst_valu_offset_2x_neg_24bit_max:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0xff001000, v0
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
+; GFX11-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:-4095
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-LABEL: global_inst_valu_offset_2x_neg_24bit_max:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-NEXT: s_wait_expcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0
+; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0xff800000, v0
+; GFX12-SDAG-NEXT: s_wait_alu 0xfffd
+; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
+; GFX12-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:-8388607
+; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
+; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i8, ptr addrspace(1) %p, i64 -16777215
%load = load i8, ptr addrspace(1) %gep, align 4
ret i8 %load
@@ -1601,53 +1005,29 @@ define i8 @global_inst_valu_offset_64bit_11bit_split0(ptr addrspace(1) %p) {
; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-SDAG-TRUE16-LABEL: global_inst_valu_offset_64bit_11bit_split0:
-; GFX11-SDAG-TRUE16: ; %bb.0:
-; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0, v0
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 2, v1, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT: global_load_d16_u8 v0, v[0:1], off offset:2047
-; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-SDAG-FAKE16-LABEL: global_inst_valu_offset_64bit_11bit_split0:
-; GFX11-SDAG-FAKE16: ; %bb.0:
-; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: v_add_co_u32 v0, vcc_lo, 0, v0
-; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 2, v1, vcc_lo
-; GFX11-SDAG-FAKE16-NEXT: global_load_u8 v0, v[0:1], off offset:2047
-; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX12-SDAG-TRUE16-LABEL: global_inst_valu_offset_64bit_11bit_split0:
-; GFX12-SDAG-TRUE16: ; %bb.0:
-; GFX12-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_expcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_samplecnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_bvhcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0, v0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-SDAG-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 2, v1, vcc_lo
-; GFX12-SDAG-TRUE16-NEXT: global_load_d16_u8 v0, v[0:1], off offset:2047
-; GFX12-SDAG-TRUE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX12-SDAG-FAKE16-LABEL: global_inst_valu_offset_64bit_11bit_split0:
-; GFX12-SDAG-FAKE16: ; %bb.0:
-; GFX12-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_expcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_samplecnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_bvhcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: v_add_co_u32 v0, vcc_lo, 0, v0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_alu 0xfffd
-; GFX12-SDAG-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 2, v1, vcc_lo
-; GFX12-SDAG-FAKE16-NEXT: global_load_u8 v0, v[0:1], off offset:2047
-; GFX12-SDAG-FAKE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-SDAG-LABEL: global_inst_valu_offset_64bit_11bit_split0:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0, v0
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, v1, vcc_lo
+; GFX11-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:2047
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-LABEL: global_inst_valu_offset_64bit_11bit_split0:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-NEXT: s_wait_expcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0
+; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0, v0
+; GFX12-SDAG-NEXT: s_wait_alu 0xfffd
+; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, v1, vcc_lo
+; GFX12-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:2047
+; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
+; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i8, ptr addrspace(1) %p, i64 8589936639
%load = load i8, ptr addrspace(1) %gep, align 4
ret i8 %load
@@ -1706,53 +1086,29 @@ define i8 @global_inst_valu_offset_64bit_11bit_split1(ptr addrspace(1) %p) {
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-SDAG-TRUE16-LABEL: global_inst_valu_offset_64bit_11bit_split1:
-; GFX11-SDAG-TRUE16: ; %bb.0:
-; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0, v0
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 2, v1, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT: global_load_d16_u8 v0, v[0:1], off offset:2048
-; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-SDAG-FAKE16-LABEL: global_inst_valu_offset_64bit_11bit_split1:
-; GFX11-SDAG-FAKE16: ; %bb.0:
-; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: v_add_co_u32 v0, vcc_lo, 0, v0
-; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 2, v1, vcc_lo
-; GFX11-SDAG-FAKE16-NEXT: global_load_u8 v0, v[0:1], off offset:2048
-; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX12-SDAG-TRUE16-LABEL: global_inst_valu_offset_64bit_11bit_split1:
-; GFX12-SDAG-TRUE16: ; %bb.0:
-; GFX12-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_expcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_samplecnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_bvhcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0, v0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-SDAG-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 2, v1, vcc_lo
-; GFX12-SDAG-TRUE16-NEXT: global_load_d16_u8 v0, v[0:1], off offset:2048
-; GFX12-SDAG-TRUE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX12-SDAG-FAKE16-LABEL: global_inst_valu_offset_64bit_11bit_split1:
-; GFX12-SDAG-FAKE16: ; %bb.0:
-; GFX12-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_expcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_samplecnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_bvhcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: v_add_co_u32 v0, vcc_lo, 0, v0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_alu 0xfffd
-; GFX12-SDAG-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 2, v1, vcc_lo
-; GFX12-SDAG-FAKE16-NEXT: global_load_u8 v0, v[0:1], off offset:2048
-; GFX12-SDAG-FAKE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-SDAG-LABEL: global_inst_valu_offset_64bit_11bit_split1:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0, v0
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, v1, vcc_lo
+; GFX11-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:2048
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-LABEL: global_inst_valu_offset_64bit_11bit_split1:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-NEXT: s_wait_expcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0
+; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0, v0
+; GFX12-SDAG-NEXT: s_wait_alu 0xfffd
+; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, v1, vcc_lo
+; GFX12-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:2048
+; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
+; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i8, ptr addrspace(1) %p, i64 8589936640
%load = load i8, ptr addrspace(1) %gep, align 4
ret i8 %load
@@ -1820,53 +1176,29 @@ define i8 @global_inst_valu_offset_64bit_12bit_split0(ptr addrspace(1) %p) {
; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-SDAG-TRUE16-LABEL: global_inst_valu_offset_64bit_12bit_split0:
-; GFX11-SDAG-TRUE16: ; %bb.0:
-; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0, v0
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 2, v1, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT: global_load_d16_u8 v0, v[0:1], off offset:4095
-; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-SDAG-FAKE16-LABEL: global_inst_valu_offset_64bit_12bit_split0:
-; GFX11-SDAG-FAKE16: ; %bb.0:
-; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: v_add_co_u32 v0, vcc_lo, 0, v0
-; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 2, v1, vcc_lo
-; GFX11-SDAG-FAKE16-NEXT: global_load_u8 v0, v[0:1], off offset:4095
-; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX12-SDAG-TRUE16-LABEL: global_inst_valu_offset_64bit_12bit_split0:
-; GFX12-SDAG-TRUE16: ; %bb.0:
-; GFX12-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_expcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_samplecnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_bvhcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0, v0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-SDAG-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 2, v1, vcc_lo
-; GFX12-SDAG-TRUE16-NEXT: global_load_d16_u8 v0, v[0:1], off offset:4095
-; GFX12-SDAG-TRUE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX12-SDAG-FAKE16-LABEL: global_inst_valu_offset_64bit_12bit_split0:
-; GFX12-SDAG-FAKE16: ; %bb.0:
-; GFX12-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_expcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_samplecnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_bvhcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: v_add_co_u32 v0, vcc_lo, 0, v0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_alu 0xfffd
-; GFX12-SDAG-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 2, v1, vcc_lo
-; GFX12-SDAG-FAKE16-NEXT: global_load_u8 v0, v[0:1], off offset:4095
-; GFX12-SDAG-FAKE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-SDAG-LABEL: global_inst_valu_offset_64bit_12bit_split0:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0, v0
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, v1, vcc_lo
+; GFX11-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:4095
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-LABEL: global_inst_valu_offset_64bit_12bit_split0:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-NEXT: s_wait_expcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0
+; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0, v0
+; GFX12-SDAG-NEXT: s_wait_alu 0xfffd
+; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, v1, vcc_lo
+; GFX12-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:4095
+; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
+; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i8, ptr addrspace(1) %p, i64 8589938687
%load = load i8, ptr addrspace(1) %gep, align 4
ret i8 %load
@@ -1892,15 +1224,15 @@ define i8 @global_inst_valu_offset_64bit_12bit_split1(ptr addrspace(1) %p) {
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-GISEL-LABEL: global_inst_valu_offset_64bit_12bit_split1:
-; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, 0x1000, v0
-; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 2, v1, vcc_lo
-; GFX11-GISEL-NEXT: global_load_u8 v0, v[0:1], off
-; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
+; GFX11-LABEL: global_inst_valu_offset_64bit_12bit_split1:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0x1000, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 2, v1, vcc_lo
+; GFX11-NEXT: global_load_u8 v0, v[0:1], off
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-GISEL-LABEL: global_inst_valu_offset_64bit_12bit_split1:
; GFX12-GISEL: ; %bb.0:
@@ -1916,53 +1248,19 @@ define i8 @global_inst_valu_offset_64bit_12bit_split1(ptr addrspace(1) %p) {
; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-SDAG-TRUE16-LABEL: global_inst_valu_offset_64bit_12bit_split1:
-; GFX11-SDAG-TRUE16: ; %bb.0:
-; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x1000, v0
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 2, v1, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT: global_load_d16_u8 v0, v[0:1], off
-; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-SDAG-FAKE16-LABEL: global_inst_valu_offset_64bit_12bit_split1:
-; GFX11-SDAG-FAKE16: ; %bb.0:
-; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x1000, v0
-; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 2, v1, vcc_lo
-; GFX11-SDAG-FAKE16-NEXT: global_load_u8 v0, v[0:1], off
-; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX12-SDAG-TRUE16-LABEL: global_inst_valu_offset_64bit_12bit_split1:
-; GFX12-SDAG-TRUE16: ; %bb.0:
-; GFX12-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_expcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_samplecnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_bvhcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0, v0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-SDAG-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 2, v1, vcc_lo
-; GFX12-SDAG-TRUE16-NEXT: global_load_d16_u8 v0, v[0:1], off offset:4096
-; GFX12-SDAG-TRUE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX12-SDAG-FAKE16-LABEL: global_inst_valu_offset_64bit_12bit_split1:
-; GFX12-SDAG-FAKE16: ; %bb.0:
-; GFX12-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_expcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_samplecnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_bvhcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: v_add_co_u32 v0, vcc_lo, 0, v0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_alu 0xfffd
-; GFX12-SDAG-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 2, v1, vcc_lo
-; GFX12-SDAG-FAKE16-NEXT: global_load_u8 v0, v[0:1], off offset:4096
-; GFX12-SDAG-FAKE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GFX12-SDAG-LABEL: global_inst_valu_offset_64bit_12bit_split1:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-NEXT: s_wait_expcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0
+; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0, v0
+; GFX12-SDAG-NEXT: s_wait_alu 0xfffd
+; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, v1, vcc_lo
+; GFX12-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:4096
+; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
+; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i8, ptr addrspace(1) %p, i64 8589938688
%load = load i8, ptr addrspace(1) %gep, align 4
ret i8 %load
@@ -2030,53 +1328,29 @@ define i8 @global_inst_valu_offset_64bit_13bit_split0(ptr addrspace(1) %p) {
; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-SDAG-TRUE16-LABEL: global_inst_valu_offset_64bit_13bit_split0:
-; GFX11-SDAG-TRUE16: ; %bb.0:
-; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x1000, v0
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 2, v1, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT: global_load_d16_u8 v0, v[0:1], off offset:4095
-; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-SDAG-FAKE16-LABEL: global_inst_valu_offset_64bit_13bit_split0:
-; GFX11-SDAG-FAKE16: ; %bb.0:
-; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x1000, v0
-; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 2, v1, vcc_lo
-; GFX11-SDAG-FAKE16-NEXT: global_load_u8 v0, v[0:1], off offset:4095
-; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX12-SDAG-TRUE16-LABEL: global_inst_valu_offset_64bit_13bit_split0:
-; GFX12-SDAG-TRUE16: ; %bb.0:
-; GFX12-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_expcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_samplecnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_bvhcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0, v0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-SDAG-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 2, v1, vcc_lo
-; GFX12-SDAG-TRUE16-NEXT: global_load_d16_u8 v0, v[0:1], off offset:8191
-; GFX12-SDAG-TRUE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX12-SDAG-FAKE16-LABEL: global_inst_valu_offset_64bit_13bit_split0:
-; GFX12-SDAG-FAKE16: ; %bb.0:
-; GFX12-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_expcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_samplecnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_bvhcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: v_add_co_u32 v0, vcc_lo, 0, v0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_alu 0xfffd
-; GFX12-SDAG-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 2, v1, vcc_lo
-; GFX12-SDAG-FAKE16-NEXT: global_load_u8 v0, v[0:1], off offset:8191
-; GFX12-SDAG-FAKE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-SDAG-LABEL: global_inst_valu_offset_64bit_13bit_split0:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x1000, v0
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, v1, vcc_lo
+; GFX11-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:4095
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-LABEL: global_inst_valu_offset_64bit_13bit_split0:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-NEXT: s_wait_expcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0
+; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0, v0
+; GFX12-SDAG-NEXT: s_wait_alu 0xfffd
+; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, v1, vcc_lo
+; GFX12-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:8191
+; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
+; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i8, ptr addrspace(1) %p, i64 8589942783
%load = load i8, ptr addrspace(1) %gep, align 4
ret i8 %load
@@ -2102,15 +1376,15 @@ define i8 @global_inst_valu_offset_64bit_13bit_split1(ptr addrspace(1) %p) {
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-GISEL-LABEL: global_inst_valu_offset_64bit_13bit_split1:
-; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, 0x2000, v0
-; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 2, v1, vcc_lo
-; GFX11-GISEL-NEXT: global_load_u8 v0, v[0:1], off
-; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
+; GFX11-LABEL: global_inst_valu_offset_64bit_13bit_split1:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0x2000, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 2, v1, vcc_lo
+; GFX11-NEXT: global_load_u8 v0, v[0:1], off
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-GISEL-LABEL: global_inst_valu_offset_64bit_13bit_split1:
; GFX12-GISEL: ; %bb.0:
@@ -2126,53 +1400,19 @@ define i8 @global_inst_valu_offset_64bit_13bit_split1(ptr addrspace(1) %p) {
; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-SDAG-TRUE16-LABEL: global_inst_valu_offset_64bit_13bit_split1:
-; GFX11-SDAG-TRUE16: ; %bb.0:
-; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x2000, v0
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 2, v1, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT: global_load_d16_u8 v0, v[0:1], off
-; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-SDAG-FAKE16-LABEL: global_inst_valu_offset_64bit_13bit_split1:
-; GFX11-SDAG-FAKE16: ; %bb.0:
-; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x2000, v0
-; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 2, v1, vcc_lo
-; GFX11-SDAG-FAKE16-NEXT: global_load_u8 v0, v[0:1], off
-; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX12-SDAG-TRUE16-LABEL: global_inst_valu_offset_64bit_13bit_split1:
-; GFX12-SDAG-TRUE16: ; %bb.0:
-; GFX12-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_expcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_samplecnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_bvhcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0, v0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-SDAG-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 2, v1, vcc_lo
-; GFX12-SDAG-TRUE16-NEXT: global_load_d16_u8 v0, v[0:1], off offset:8192
-; GFX12-SDAG-TRUE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX12-SDAG-FAKE16-LABEL: global_inst_valu_offset_64bit_13bit_split1:
-; GFX12-SDAG-FAKE16: ; %bb.0:
-; GFX12-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_expcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_samplecnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_bvhcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: v_add_co_u32 v0, vcc_lo, 0, v0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_alu 0xfffd
-; GFX12-SDAG-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 2, v1, vcc_lo
-; GFX12-SDAG-FAKE16-NEXT: global_load_u8 v0, v[0:1], off offset:8192
-; GFX12-SDAG-FAKE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GFX12-SDAG-LABEL: global_inst_valu_offset_64bit_13bit_split1:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-NEXT: s_wait_expcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0
+; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0, v0
+; GFX12-SDAG-NEXT: s_wait_alu 0xfffd
+; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, v1, vcc_lo
+; GFX12-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:8192
+; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
+; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i8, ptr addrspace(1) %p, i64 8589942784
%load = load i8, ptr addrspace(1) %gep, align 4
ret i8 %load
@@ -2242,53 +1482,29 @@ define i8 @global_inst_valu_offset_64bit_11bit_neg_high_split0(ptr addrspace(1)
; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-SDAG-TRUE16-LABEL: global_inst_valu_offset_64bit_11bit_neg_high_split0:
-; GFX11-SDAG-TRUE16: ; %bb.0:
-; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x1000, v0
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT: global_load_d16_u8 v0, v[0:1], off offset:-2049
-; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-SDAG-FAKE16-LABEL: global_inst_valu_offset_64bit_11bit_neg_high_split0:
-; GFX11-SDAG-FAKE16: ; %bb.0:
-; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x1000, v0
-; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
-; GFX11-SDAG-FAKE16-NEXT: global_load_u8 v0, v[0:1], off offset:-2049
-; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX12-SDAG-TRUE16-LABEL: global_inst_valu_offset_64bit_11bit_neg_high_split0:
-; GFX12-SDAG-TRUE16: ; %bb.0:
-; GFX12-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_expcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_samplecnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_bvhcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x800000, v0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-SDAG-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
-; GFX12-SDAG-TRUE16-NEXT: global_load_d16_u8 v0, v[0:1], off offset:-8386561
-; GFX12-SDAG-TRUE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX12-SDAG-FAKE16-LABEL: global_inst_valu_offset_64bit_11bit_neg_high_split0:
-; GFX12-SDAG-FAKE16: ; %bb.0:
-; GFX12-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_expcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_samplecnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_bvhcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x800000, v0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_alu 0xfffd
-; GFX12-SDAG-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
-; GFX12-SDAG-FAKE16-NEXT: global_load_u8 v0, v[0:1], off offset:-8386561
-; GFX12-SDAG-FAKE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-SDAG-LABEL: global_inst_valu_offset_64bit_11bit_neg_high_split0:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x1000, v0
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
+; GFX11-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:-2049
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-LABEL: global_inst_valu_offset_64bit_11bit_neg_high_split0:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-NEXT: s_wait_expcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0
+; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x800000, v0
+; GFX12-SDAG-NEXT: s_wait_alu 0xfffd
+; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
+; GFX12-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:-8386561
+; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
+; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i8, ptr addrspace(1) %p, i64 -9223372036854773761
%load = load i8, ptr addrspace(1) %gep, align 4
ret i8 %load
@@ -2349,53 +1565,29 @@ define i8 @global_inst_valu_offset_64bit_11bit_neg_high_split1(ptr addrspace(1)
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-SDAG-TRUE16-LABEL: global_inst_valu_offset_64bit_11bit_neg_high_split1:
-; GFX11-SDAG-TRUE16: ; %bb.0:
-; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x1000, v0
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT: global_load_d16_u8 v0, v[0:1], off offset:-2048
-; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-SDAG-FAKE16-LABEL: global_inst_valu_offset_64bit_11bit_neg_high_split1:
-; GFX11-SDAG-FAKE16: ; %bb.0:
-; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x1000, v0
-; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
-; GFX11-SDAG-FAKE16-NEXT: global_load_u8 v0, v[0:1], off offset:-2048
-; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX12-SDAG-TRUE16-LABEL: global_inst_valu_offset_64bit_11bit_neg_high_split1:
-; GFX12-SDAG-TRUE16: ; %bb.0:
-; GFX12-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_expcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_samplecnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_bvhcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x800000, v0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-SDAG-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
-; GFX12-SDAG-TRUE16-NEXT: global_load_d16_u8 v0, v[0:1], off offset:-8386560
-; GFX12-SDAG-TRUE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX12-SDAG-FAKE16-LABEL: global_inst_valu_offset_64bit_11bit_neg_high_split1:
-; GFX12-SDAG-FAKE16: ; %bb.0:
-; GFX12-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_expcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_samplecnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_bvhcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x800000, v0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_alu 0xfffd
-; GFX12-SDAG-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
-; GFX12-SDAG-FAKE16-NEXT: global_load_u8 v0, v[0:1], off offset:-8386560
-; GFX12-SDAG-FAKE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-SDAG-LABEL: global_inst_valu_offset_64bit_11bit_neg_high_split1:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x1000, v0
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
+; GFX11-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:-2048
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-LABEL: global_inst_valu_offset_64bit_11bit_neg_high_split1:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-NEXT: s_wait_expcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0
+; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x800000, v0
+; GFX12-SDAG-NEXT: s_wait_alu 0xfffd
+; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
+; GFX12-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:-8386560
+; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
+; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i8, ptr addrspace(1) %p, i64 -9223372036854773760
%load = load i8, ptr addrspace(1) %gep, align 4
ret i8 %load
@@ -2465,53 +1657,29 @@ define i8 @global_inst_valu_offset_64bit_12bit_neg_high_split0(ptr addrspace(1)
; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-SDAG-TRUE16-LABEL: global_inst_valu_offset_64bit_12bit_neg_high_split0:
-; GFX11-SDAG-TRUE16: ; %bb.0:
-; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x1000, v0
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT: global_load_d16_u8 v0, v[0:1], off offset:-1
-; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-SDAG-FAKE16-LABEL: global_inst_valu_offset_64bit_12bit_neg_high_split0:
-; GFX11-SDAG-FAKE16: ; %bb.0:
-; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x1000, v0
-; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
-; GFX11-SDAG-FAKE16-NEXT: global_load_u8 v0, v[0:1], off offset:-1
-; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX12-SDAG-TRUE16-LABEL: global_inst_valu_offset_64bit_12bit_neg_high_split0:
-; GFX12-SDAG-TRUE16: ; %bb.0:
-; GFX12-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_expcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_samplecnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_bvhcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x800000, v0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-SDAG-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
-; GFX12-SDAG-TRUE16-NEXT: global_load_d16_u8 v0, v[0:1], off offset:-8384513
-; GFX12-SDAG-TRUE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX12-SDAG-FAKE16-LABEL: global_inst_valu_offset_64bit_12bit_neg_high_split0:
-; GFX12-SDAG-FAKE16: ; %bb.0:
-; GFX12-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_expcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_samplecnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_bvhcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x800000, v0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_alu 0xfffd
-; GFX12-SDAG-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
-; GFX12-SDAG-FAKE16-NEXT: global_load_u8 v0, v[0:1], off offset:-8384513
-; GFX12-SDAG-FAKE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-SDAG-LABEL: global_inst_valu_offset_64bit_12bit_neg_high_split0:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x1000, v0
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
+; GFX11-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:-1
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-LABEL: global_inst_valu_offset_64bit_12bit_neg_high_split0:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-NEXT: s_wait_expcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0
+; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x800000, v0
+; GFX12-SDAG-NEXT: s_wait_alu 0xfffd
+; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
+; GFX12-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:-8384513
+; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
+; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i8, ptr addrspace(1) %p, i64 -9223372036854771713
%load = load i8, ptr addrspace(1) %gep, align 4
ret i8 %load
@@ -2538,15 +1706,15 @@ define i8 @global_inst_valu_offset_64bit_12bit_neg_high_split1(ptr addrspace(1)
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-GISEL-LABEL: global_inst_valu_offset_64bit_12bit_neg_high_split1:
-; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, 0x1000, v0
-; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
-; GFX11-GISEL-NEXT: global_load_u8 v0, v[0:1], off
-; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
+; GFX11-LABEL: global_inst_valu_offset_64bit_12bit_neg_high_split1:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0x1000, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
+; GFX11-NEXT: global_load_u8 v0, v[0:1], off
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-GISEL-LABEL: global_inst_valu_offset_64bit_12bit_neg_high_split1:
; GFX12-GISEL: ; %bb.0:
@@ -2572,53 +1740,19 @@ define i8 @global_inst_valu_offset_64bit_12bit_neg_high_split1(ptr addrspace(1)
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-SDAG-TRUE16-LABEL: global_inst_valu_offset_64bit_12bit_neg_high_split1:
-; GFX11-SDAG-TRUE16: ; %bb.0:
-; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x1000, v0
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT: global_load_d16_u8 v0, v[0:1], off
-; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-SDAG-FAKE16-LABEL: global_inst_valu_offset_64bit_12bit_neg_high_split1:
-; GFX11-SDAG-FAKE16: ; %bb.0:
-; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x1000, v0
-; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
-; GFX11-SDAG-FAKE16-NEXT: global_load_u8 v0, v[0:1], off
-; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX12-SDAG-TRUE16-LABEL: global_inst_valu_offset_64bit_12bit_neg_high_split1:
-; GFX12-SDAG-TRUE16: ; %bb.0:
-; GFX12-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_expcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_samplecnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_bvhcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x800000, v0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-SDAG-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
-; GFX12-SDAG-TRUE16-NEXT: global_load_d16_u8 v0, v[0:1], off offset:-8384512
-; GFX12-SDAG-TRUE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX12-SDAG-FAKE16-LABEL: global_inst_valu_offset_64bit_12bit_neg_high_split1:
-; GFX12-SDAG-FAKE16: ; %bb.0:
-; GFX12-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_expcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_samplecnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_bvhcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x800000, v0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_alu 0xfffd
-; GFX12-SDAG-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
-; GFX12-SDAG-FAKE16-NEXT: global_load_u8 v0, v[0:1], off offset:-8384512
-; GFX12-SDAG-FAKE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GFX12-SDAG-LABEL: global_inst_valu_offset_64bit_12bit_neg_high_split1:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-NEXT: s_wait_expcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0
+; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x800000, v0
+; GFX12-SDAG-NEXT: s_wait_alu 0xfffd
+; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
+; GFX12-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:-8384512
+; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
+; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i8, ptr addrspace(1) %p, i64 -9223372036854771712
%load = load i8, ptr addrspace(1) %gep, align 4
ret i8 %load
@@ -2688,53 +1822,29 @@ define i8 @global_inst_valu_offset_64bit_13bit_neg_high_split0(ptr addrspace(1)
; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-SDAG-TRUE16-LABEL: global_inst_valu_offset_64bit_13bit_neg_high_split0:
-; GFX11-SDAG-TRUE16: ; %bb.0:
-; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x2000, v0
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT: global_load_d16_u8 v0, v[0:1], off offset:-1
-; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-SDAG-FAKE16-LABEL: global_inst_valu_offset_64bit_13bit_neg_high_split0:
-; GFX11-SDAG-FAKE16: ; %bb.0:
-; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x2000, v0
-; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
-; GFX11-SDAG-FAKE16-NEXT: global_load_u8 v0, v[0:1], off offset:-1
-; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX12-SDAG-TRUE16-LABEL: global_inst_valu_offset_64bit_13bit_neg_high_split0:
-; GFX12-SDAG-TRUE16: ; %bb.0:
-; GFX12-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_expcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_samplecnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_bvhcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x800000, v0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-SDAG-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
-; GFX12-SDAG-TRUE16-NEXT: global_load_d16_u8 v0, v[0:1], off offset:-8380417
-; GFX12-SDAG-TRUE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX12-SDAG-FAKE16-LABEL: global_inst_valu_offset_64bit_13bit_neg_high_split0:
-; GFX12-SDAG-FAKE16: ; %bb.0:
-; GFX12-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_expcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_samplecnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_bvhcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x800000, v0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_alu 0xfffd
-; GFX12-SDAG-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
-; GFX12-SDAG-FAKE16-NEXT: global_load_u8 v0, v[0:1], off offset:-8380417
-; GFX12-SDAG-FAKE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-SDAG-LABEL: global_inst_valu_offset_64bit_13bit_neg_high_split0:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x2000, v0
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
+; GFX11-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:-1
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-LABEL: global_inst_valu_offset_64bit_13bit_neg_high_split0:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-NEXT: s_wait_expcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0
+; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x800000, v0
+; GFX12-SDAG-NEXT: s_wait_alu 0xfffd
+; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
+; GFX12-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:-8380417
+; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
+; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i8, ptr addrspace(1) %p, i64 -9223372036854767617
%load = load i8, ptr addrspace(1) %gep, align 4
ret i8 %load
@@ -2761,15 +1871,15 @@ define i8 @global_inst_valu_offset_64bit_13bit_neg_high_split1(ptr addrspace(1)
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-GISEL-LABEL: global_inst_valu_offset_64bit_13bit_neg_high_split1:
-; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, 0x2000, v0
-; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
-; GFX11-GISEL-NEXT: global_load_u8 v0, v[0:1], off
-; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
+; GFX11-LABEL: global_inst_valu_offset_64bit_13bit_neg_high_split1:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0x2000, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
+; GFX11-NEXT: global_load_u8 v0, v[0:1], off
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-GISEL-LABEL: global_inst_valu_offset_64bit_13bit_neg_high_split1:
; GFX12-GISEL: ; %bb.0:
@@ -2795,53 +1905,19 @@ define i8 @global_inst_valu_offset_64bit_13bit_neg_high_split1(ptr addrspace(1)
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-SDAG-TRUE16-LABEL: global_inst_valu_offset_64bit_13bit_neg_high_split1:
-; GFX11-SDAG-TRUE16: ; %bb.0:
-; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x2000, v0
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT: global_load_d16_u8 v0, v[0:1], off
-; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-SDAG-FAKE16-LABEL: global_inst_valu_offset_64bit_13bit_neg_high_split1:
-; GFX11-SDAG-FAKE16: ; %bb.0:
-; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x2000, v0
-; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
-; GFX11-SDAG-FAKE16-NEXT: global_load_u8 v0, v[0:1], off
-; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX12-SDAG-TRUE16-LABEL: global_inst_valu_offset_64bit_13bit_neg_high_split1:
-; GFX12-SDAG-TRUE16: ; %bb.0:
-; GFX12-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_expcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_samplecnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_bvhcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x800000, v0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-SDAG-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
-; GFX12-SDAG-TRUE16-NEXT: global_load_d16_u8 v0, v[0:1], off offset:-8380416
-; GFX12-SDAG-TRUE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX12-SDAG-FAKE16-LABEL: global_inst_valu_offset_64bit_13bit_neg_high_split1:
-; GFX12-SDAG-FAKE16: ; %bb.0:
-; GFX12-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_expcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_samplecnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_bvhcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x800000, v0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_alu 0xfffd
-; GFX12-SDAG-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
-; GFX12-SDAG-FAKE16-NEXT: global_load_u8 v0, v[0:1], off offset:-8380416
-; GFX12-SDAG-FAKE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GFX12-SDAG-LABEL: global_inst_valu_offset_64bit_13bit_neg_high_split1:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-NEXT: s_wait_expcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0
+; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x800000, v0
+; GFX12-SDAG-NEXT: s_wait_alu 0xfffd
+; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
+; GFX12-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:-8380416
+; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
+; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i8, ptr addrspace(1) %p, i64 -9223372036854767616
%load = load i8, ptr addrspace(1) %gep, align 4
ret i8 %load
@@ -2868,65 +1944,25 @@ define amdgpu_kernel void @global_inst_salu_offset_1(ptr addrspace(1) %p) {
; GFX10-NEXT: global_store_byte v[0:1], v0, off
; GFX10-NEXT: s_endpgm
;
-; GFX11-GISEL-LABEL: global_inst_salu_offset_1:
-; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, 0
-; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: global_load_u8 v0, v0, s[0:1] offset:1 glc dlc
-; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX11-GISEL-NEXT: global_store_b8 v[0:1], v0, off
-; GFX11-GISEL-NEXT: s_endpgm
-;
-; GFX12-GISEL-LABEL: global_inst_salu_offset_1:
-; GFX12-GISEL: ; %bb.0:
-; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, 0
-; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: global_load_u8 v0, v0, s[0:1] offset:1 scope:SCOPE_SYS
-; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
-; GFX12-GISEL-NEXT: global_store_b8 v[0:1], v0, off
-; GFX12-GISEL-NEXT: s_endpgm
-;
-; GFX11-SDAG-TRUE16-LABEL: global_inst_salu_offset_1:
-; GFX11-SDAG-TRUE16: ; %bb.0:
-; GFX11-SDAG-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-SDAG-TRUE16-NEXT: v_mov_b32_e32 v0, 0
-; GFX11-SDAG-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: global_load_d16_u8 v0, v0, s[0:1] offset:1 glc dlc
-; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: global_store_b8 v[0:1], v0, off
-; GFX11-SDAG-TRUE16-NEXT: s_endpgm
-;
-; GFX11-SDAG-FAKE16-LABEL: global_inst_salu_offset_1:
-; GFX11-SDAG-FAKE16: ; %bb.0:
-; GFX11-SDAG-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-SDAG-FAKE16-NEXT: v_mov_b32_e32 v0, 0
-; GFX11-SDAG-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: global_load_u8 v0, v0, s[0:1] offset:1 glc dlc
-; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: global_store_b8 v[0:1], v0, off
-; GFX11-SDAG-FAKE16-NEXT: s_endpgm
-;
-; GFX12-SDAG-TRUE16-LABEL: global_inst_salu_offset_1:
-; GFX12-SDAG-TRUE16: ; %bb.0:
-; GFX12-SDAG-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX12-SDAG-TRUE16-NEXT: v_mov_b32_e32 v0, 0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: global_load_d16_u8 v0, v0, s[0:1] offset:1 scope:SCOPE_SYS
-; GFX12-SDAG-TRUE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: global_store_b8 v[0:1], v0, off
-; GFX12-SDAG-TRUE16-NEXT: s_endpgm
-;
-; GFX12-SDAG-FAKE16-LABEL: global_inst_salu_offset_1:
-; GFX12-SDAG-FAKE16: ; %bb.0:
-; GFX12-SDAG-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX12-SDAG-FAKE16-NEXT: v_mov_b32_e32 v0, 0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: global_load_u8 v0, v0, s[0:1] offset:1 scope:SCOPE_SYS
-; GFX12-SDAG-FAKE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: global_store_b8 v[0:1], v0, off
-; GFX12-SDAG-FAKE16-NEXT: s_endpgm
+; GFX11-LABEL: global_inst_salu_offset_1:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: global_load_u8 v0, v0, s[0:1] offset:1 glc dlc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: global_store_b8 v[0:1], v0, off
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: global_inst_salu_offset_1:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX12-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: global_load_u8 v0, v0, s[0:1] offset:1 scope:SCOPE_SYS
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: global_store_b8 v[0:1], v0, off
+; GFX12-NEXT: s_endpgm
%gep = getelementptr i8, ptr addrspace(1) %p, i64 1
%load = load volatile i8, ptr addrspace(1) %gep, align 1
store i8 %load, ptr addrspace(1) poison
@@ -2954,65 +1990,25 @@ define amdgpu_kernel void @global_inst_salu_offset_11bit_max(ptr addrspace(1) %p
; GFX10-NEXT: global_store_byte v[0:1], v0, off
; GFX10-NEXT: s_endpgm
;
-; GFX11-GISEL-LABEL: global_inst_salu_offset_11bit_max:
-; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, 0
-; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: global_load_u8 v0, v0, s[0:1] offset:2047 glc dlc
-; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX11-GISEL-NEXT: global_store_b8 v[0:1], v0, off
-; GFX11-GISEL-NEXT: s_endpgm
-;
-; GFX12-GISEL-LABEL: global_inst_salu_offset_11bit_max:
-; GFX12-GISEL: ; %bb.0:
-; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, 0
-; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: global_load_u8 v0, v0, s[0:1] offset:2047 scope:SCOPE_SYS
-; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
-; GFX12-GISEL-NEXT: global_store_b8 v[0:1], v0, off
-; GFX12-GISEL-NEXT: s_endpgm
-;
-; GFX11-SDAG-TRUE16-LABEL: global_inst_salu_offset_11bit_max:
-; GFX11-SDAG-TRUE16: ; %bb.0:
-; GFX11-SDAG-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-SDAG-TRUE16-NEXT: v_mov_b32_e32 v0, 0
-; GFX11-SDAG-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: global_load_d16_u8 v0, v0, s[0:1] offset:2047 glc dlc
-; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: global_store_b8 v[0:1], v0, off
-; GFX11-SDAG-TRUE16-NEXT: s_endpgm
-;
-; GFX11-SDAG-FAKE16-LABEL: global_inst_salu_offset_11bit_max:
-; GFX11-SDAG-FAKE16: ; %bb.0:
-; GFX11-SDAG-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-SDAG-FAKE16-NEXT: v_mov_b32_e32 v0, 0
-; GFX11-SDAG-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: global_load_u8 v0, v0, s[0:1] offset:2047 glc dlc
-; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: global_store_b8 v[0:1], v0, off
-; GFX11-SDAG-FAKE16-NEXT: s_endpgm
-;
-; GFX12-SDAG-TRUE16-LABEL: global_inst_salu_offset_11bit_max:
-; GFX12-SDAG-TRUE16: ; %bb.0:
-; GFX12-SDAG-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX12-SDAG-TRUE16-NEXT: v_mov_b32_e32 v0, 0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: global_load_d16_u8 v0, v0, s[0:1] offset:2047 scope:SCOPE_SYS
-; GFX12-SDAG-TRUE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: global_store_b8 v[0:1], v0, off
-; GFX12-SDAG-TRUE16-NEXT: s_endpgm
-;
-; GFX12-SDAG-FAKE16-LABEL: global_inst_salu_offset_11bit_max:
-; GFX12-SDAG-FAKE16: ; %bb.0:
-; GFX12-SDAG-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX12-SDAG-FAKE16-NEXT: v_mov_b32_e32 v0, 0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: global_load_u8 v0, v0, s[0:1] offset:2047 scope:SCOPE_SYS
-; GFX12-SDAG-FAKE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: global_store_b8 v[0:1], v0, off
-; GFX12-SDAG-FAKE16-NEXT: s_endpgm
+; GFX11-LABEL: global_inst_salu_offset_11bit_max:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: global_load_u8 v0, v0, s[0:1] offset:2047 glc dlc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: global_store_b8 v[0:1], v0, off
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: global_inst_salu_offset_11bit_max:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX12-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: global_load_u8 v0, v0, s[0:1] offset:2047 scope:SCOPE_SYS
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: global_store_b8 v[0:1], v0, off
+; GFX12-NEXT: s_endpgm
%gep = getelementptr i8, ptr addrspace(1) %p, i64 2047
%load = load volatile i8, ptr addrspace(1) %gep, align 1
store i8 %load, ptr addrspace(1) poison
@@ -3040,65 +2036,25 @@ define amdgpu_kernel void @global_inst_salu_offset_12bit_max(ptr addrspace(1) %p
; GFX10-NEXT: global_store_byte v[0:1], v0, off
; GFX10-NEXT: s_endpgm
;
-; GFX11-GISEL-LABEL: global_inst_salu_offset_12bit_max:
-; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, 0
-; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: global_load_u8 v0, v0, s[0:1] offset:4095 glc dlc
-; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX11-GISEL-NEXT: global_store_b8 v[0:1], v0, off
-; GFX11-GISEL-NEXT: s_endpgm
-;
-; GFX12-GISEL-LABEL: global_inst_salu_offset_12bit_max:
-; GFX12-GISEL: ; %bb.0:
-; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, 0
-; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: global_load_u8 v0, v0, s[0:1] offset:4095 scope:SCOPE_SYS
-; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
-; GFX12-GISEL-NEXT: global_store_b8 v[0:1], v0, off
-; GFX12-GISEL-NEXT: s_endpgm
-;
-; GFX11-SDAG-TRUE16-LABEL: global_inst_salu_offset_12bit_max:
-; GFX11-SDAG-TRUE16: ; %bb.0:
-; GFX11-SDAG-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-SDAG-TRUE16-NEXT: v_mov_b32_e32 v0, 0
-; GFX11-SDAG-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: global_load_d16_u8 v0, v0, s[0:1] offset:4095 glc dlc
-; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: global_store_b8 v[0:1], v0, off
-; GFX11-SDAG-TRUE16-NEXT: s_endpgm
-;
-; GFX11-SDAG-FAKE16-LABEL: global_inst_salu_offset_12bit_max:
-; GFX11-SDAG-FAKE16: ; %bb.0:
-; GFX11-SDAG-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-SDAG-FAKE16-NEXT: v_mov_b32_e32 v0, 0
-; GFX11-SDAG-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: global_load_u8 v0, v0, s[0:1] offset:4095 glc dlc
-; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: global_store_b8 v[0:1], v0, off
-; GFX11-SDAG-FAKE16-NEXT: s_endpgm
-;
-; GFX12-SDAG-TRUE16-LABEL: global_inst_salu_offset_12bit_max:
-; GFX12-SDAG-TRUE16: ; %bb.0:
-; GFX12-SDAG-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX12-SDAG-TRUE16-NEXT: v_mov_b32_e32 v0, 0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: global_load_d16_u8 v0, v0, s[0:1] offset:4095 scope:SCOPE_SYS
-; GFX12-SDAG-TRUE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: global_store_b8 v[0:1], v0, off
-; GFX12-SDAG-TRUE16-NEXT: s_endpgm
-;
-; GFX12-SDAG-FAKE16-LABEL: global_inst_salu_offset_12bit_max:
-; GFX12-SDAG-FAKE16: ; %bb.0:
-; GFX12-SDAG-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX12-SDAG-FAKE16-NEXT: v_mov_b32_e32 v0, 0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: global_load_u8 v0, v0, s[0:1] offset:4095 scope:SCOPE_SYS
-; GFX12-SDAG-FAKE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: global_store_b8 v[0:1], v0, off
-; GFX12-SDAG-FAKE16-NEXT: s_endpgm
+; GFX11-LABEL: global_inst_salu_offset_12bit_max:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: global_load_u8 v0, v0, s[0:1] offset:4095 glc dlc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: global_store_b8 v[0:1], v0, off
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: global_inst_salu_offset_12bit_max:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX12-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: global_load_u8 v0, v0, s[0:1] offset:4095 scope:SCOPE_SYS
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: global_store_b8 v[0:1], v0, off
+; GFX12-NEXT: s_endpgm
%gep = getelementptr i8, ptr addrspace(1) %p, i64 4095
%load = load volatile i8, ptr addrspace(1) %gep, align 1
store i8 %load, ptr addrspace(1) poison
@@ -3126,65 +2082,25 @@ define amdgpu_kernel void @global_inst_salu_offset_13bit_max(ptr addrspace(1) %p
; GFX10-NEXT: global_store_byte v[0:1], v0, off
; GFX10-NEXT: s_endpgm
;
-; GFX11-GISEL-LABEL: global_inst_salu_offset_13bit_max:
-; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, 0x1000
-; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: global_load_u8 v0, v0, s[0:1] offset:4095 glc dlc
-; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX11-GISEL-NEXT: global_store_b8 v[0:1], v0, off
-; GFX11-GISEL-NEXT: s_endpgm
-;
-; GFX12-GISEL-LABEL: global_inst_salu_offset_13bit_max:
-; GFX12-GISEL: ; %bb.0:
-; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, 0
-; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: global_load_u8 v0, v0, s[0:1] offset:8191 scope:SCOPE_SYS
-; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
-; GFX12-GISEL-NEXT: global_store_b8 v[0:1], v0, off
-; GFX12-GISEL-NEXT: s_endpgm
-;
-; GFX11-SDAG-TRUE16-LABEL: global_inst_salu_offset_13bit_max:
-; GFX11-SDAG-TRUE16: ; %bb.0:
-; GFX11-SDAG-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-SDAG-TRUE16-NEXT: v_mov_b32_e32 v0, 0x1000
-; GFX11-SDAG-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: global_load_d16_u8 v0, v0, s[0:1] offset:4095 glc dlc
-; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: global_store_b8 v[0:1], v0, off
-; GFX11-SDAG-TRUE16-NEXT: s_endpgm
-;
-; GFX11-SDAG-FAKE16-LABEL: global_inst_salu_offset_13bit_max:
-; GFX11-SDAG-FAKE16: ; %bb.0:
-; GFX11-SDAG-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-SDAG-FAKE16-NEXT: v_mov_b32_e32 v0, 0x1000
-; GFX11-SDAG-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: global_load_u8 v0, v0, s[0:1] offset:4095 glc dlc
-; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: global_store_b8 v[0:1], v0, off
-; GFX11-SDAG-FAKE16-NEXT: s_endpgm
-;
-; GFX12-SDAG-TRUE16-LABEL: global_inst_salu_offset_13bit_max:
-; GFX12-SDAG-TRUE16: ; %bb.0:
-; GFX12-SDAG-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX12-SDAG-TRUE16-NEXT: v_mov_b32_e32 v0, 0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: global_load_d16_u8 v0, v0, s[0:1] offset:8191 scope:SCOPE_SYS
-; GFX12-SDAG-TRUE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: global_store_b8 v[0:1], v0, off
-; GFX12-SDAG-TRUE16-NEXT: s_endpgm
-;
-; GFX12-SDAG-FAKE16-LABEL: global_inst_salu_offset_13bit_max:
-; GFX12-SDAG-FAKE16: ; %bb.0:
-; GFX12-SDAG-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX12-SDAG-FAKE16-NEXT: v_mov_b32_e32 v0, 0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: global_load_u8 v0, v0, s[0:1] offset:8191 scope:SCOPE_SYS
-; GFX12-SDAG-FAKE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: global_store_b8 v[0:1], v0, off
-; GFX12-SDAG-FAKE16-NEXT: s_endpgm
+; GFX11-LABEL: global_inst_salu_offset_13bit_max:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-NEXT: v_mov_b32_e32 v0, 0x1000
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: global_load_u8 v0, v0, s[0:1] offset:4095 glc dlc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: global_store_b8 v[0:1], v0, off
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: global_inst_salu_offset_13bit_max:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX12-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: global_load_u8 v0, v0, s[0:1] offset:8191 scope:SCOPE_SYS
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: global_store_b8 v[0:1], v0, off
+; GFX12-NEXT: s_endpgm
%gep = getelementptr i8, ptr addrspace(1) %p, i64 8191
%load = load volatile i8, ptr addrspace(1) %gep, align 1
store i8 %load, ptr addrspace(1) poison
@@ -3212,65 +2128,25 @@ define amdgpu_kernel void @global_inst_salu_offset_neg_11bit_max(ptr addrspace(1
; GFX10-NEXT: global_store_byte v[0:1], v0, off
; GFX10-NEXT: s_endpgm
;
-; GFX11-GISEL-LABEL: global_inst_salu_offset_neg_11bit_max:
-; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, 0
-; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: global_load_u8 v0, v0, s[0:1] offset:-2048 glc dlc
-; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX11-GISEL-NEXT: global_store_b8 v[0:1], v0, off
-; GFX11-GISEL-NEXT: s_endpgm
-;
-; GFX12-GISEL-LABEL: global_inst_salu_offset_neg_11bit_max:
-; GFX12-GISEL: ; %bb.0:
-; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, 0
-; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: global_load_u8 v0, v0, s[0:1] offset:-2048 scope:SCOPE_SYS
-; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
-; GFX12-GISEL-NEXT: global_store_b8 v[0:1], v0, off
-; GFX12-GISEL-NEXT: s_endpgm
-;
-; GFX11-SDAG-TRUE16-LABEL: global_inst_salu_offset_neg_11bit_max:
-; GFX11-SDAG-TRUE16: ; %bb.0:
-; GFX11-SDAG-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-SDAG-TRUE16-NEXT: v_mov_b32_e32 v0, 0
-; GFX11-SDAG-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: global_load_d16_u8 v0, v0, s[0:1] offset:-2048 glc dlc
-; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: global_store_b8 v[0:1], v0, off
-; GFX11-SDAG-TRUE16-NEXT: s_endpgm
-;
-; GFX11-SDAG-FAKE16-LABEL: global_inst_salu_offset_neg_11bit_max:
-; GFX11-SDAG-FAKE16: ; %bb.0:
-; GFX11-SDAG-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-SDAG-FAKE16-NEXT: v_mov_b32_e32 v0, 0
-; GFX11-SDAG-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: global_load_u8 v0, v0, s[0:1] offset:-2048 glc dlc
-; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: global_store_b8 v[0:1], v0, off
-; GFX11-SDAG-FAKE16-NEXT: s_endpgm
-;
-; GFX12-SDAG-TRUE16-LABEL: global_inst_salu_offset_neg_11bit_max:
-; GFX12-SDAG-TRUE16: ; %bb.0:
-; GFX12-SDAG-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX12-SDAG-TRUE16-NEXT: v_mov_b32_e32 v0, 0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: global_load_d16_u8 v0, v0, s[0:1] offset:-2048 scope:SCOPE_SYS
-; GFX12-SDAG-TRUE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: global_store_b8 v[0:1], v0, off
-; GFX12-SDAG-TRUE16-NEXT: s_endpgm
-;
-; GFX12-SDAG-FAKE16-LABEL: global_inst_salu_offset_neg_11bit_max:
-; GFX12-SDAG-FAKE16: ; %bb.0:
-; GFX12-SDAG-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX12-SDAG-FAKE16-NEXT: v_mov_b32_e32 v0, 0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: global_load_u8 v0, v0, s[0:1] offset:-2048 scope:SCOPE_SYS
-; GFX12-SDAG-FAKE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: global_store_b8 v[0:1], v0, off
-; GFX12-SDAG-FAKE16-NEXT: s_endpgm
+; GFX11-LABEL: global_inst_salu_offset_neg_11bit_max:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: global_load_u8 v0, v0, s[0:1] offset:-2048 glc dlc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: global_store_b8 v[0:1], v0, off
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: global_inst_salu_offset_neg_11bit_max:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX12-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: global_load_u8 v0, v0, s[0:1] offset:-2048 scope:SCOPE_SYS
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: global_store_b8 v[0:1], v0, off
+; GFX12-NEXT: s_endpgm
%gep = getelementptr i8, ptr addrspace(1) %p, i64 -2048
%load = load volatile i8, ptr addrspace(1) %gep, align 1
store i8 %load, ptr addrspace(1) poison
@@ -3301,25 +2177,25 @@ define amdgpu_kernel void @global_inst_salu_offset_neg_12bit_max(ptr addrspace(1
; GFX10-GISEL-NEXT: global_store_byte v[0:1], v0, off
; GFX10-GISEL-NEXT: s_endpgm
;
-; GFX11-GISEL-LABEL: global_inst_salu_offset_neg_12bit_max:
-; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, 0
-; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: global_load_u8 v0, v0, s[0:1] offset:-4096 glc dlc
-; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX11-GISEL-NEXT: global_store_b8 v[0:1], v0, off
-; GFX11-GISEL-NEXT: s_endpgm
-;
-; GFX12-GISEL-LABEL: global_inst_salu_offset_neg_12bit_max:
-; GFX12-GISEL: ; %bb.0:
-; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, 0
-; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: global_load_u8 v0, v0, s[0:1] offset:-4096 scope:SCOPE_SYS
-; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
-; GFX12-GISEL-NEXT: global_store_b8 v[0:1], v0, off
-; GFX12-GISEL-NEXT: s_endpgm
+; GFX11-LABEL: global_inst_salu_offset_neg_12bit_max:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: global_load_u8 v0, v0, s[0:1] offset:-4096 glc dlc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: global_store_b8 v[0:1], v0, off
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: global_inst_salu_offset_neg_12bit_max:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX12-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: global_load_u8 v0, v0, s[0:1] offset:-4096 scope:SCOPE_SYS
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: global_store_b8 v[0:1], v0, off
+; GFX12-NEXT: s_endpgm
;
; GFX10-SDAG-LABEL: global_inst_salu_offset_neg_12bit_max:
; GFX10-SDAG: ; %bb.0:
@@ -3331,46 +2207,6 @@ define amdgpu_kernel void @global_inst_salu_offset_neg_12bit_max(ptr addrspace(1
; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX10-SDAG-NEXT: global_store_byte v[0:1], v0, off
; GFX10-SDAG-NEXT: s_endpgm
-;
-; GFX11-SDAG-TRUE16-LABEL: global_inst_salu_offset_neg_12bit_max:
-; GFX11-SDAG-TRUE16: ; %bb.0:
-; GFX11-SDAG-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-SDAG-TRUE16-NEXT: v_mov_b32_e32 v0, 0
-; GFX11-SDAG-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: global_load_d16_u8 v0, v0, s[0:1] offset:-4096 glc dlc
-; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: global_store_b8 v[0:1], v0, off
-; GFX11-SDAG-TRUE16-NEXT: s_endpgm
-;
-; GFX11-SDAG-FAKE16-LABEL: global_inst_salu_offset_neg_12bit_max:
-; GFX11-SDAG-FAKE16: ; %bb.0:
-; GFX11-SDAG-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-SDAG-FAKE16-NEXT: v_mov_b32_e32 v0, 0
-; GFX11-SDAG-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: global_load_u8 v0, v0, s[0:1] offset:-4096 glc dlc
-; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: global_store_b8 v[0:1], v0, off
-; GFX11-SDAG-FAKE16-NEXT: s_endpgm
-;
-; GFX12-SDAG-TRUE16-LABEL: global_inst_salu_offset_neg_12bit_max:
-; GFX12-SDAG-TRUE16: ; %bb.0:
-; GFX12-SDAG-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX12-SDAG-TRUE16-NEXT: v_mov_b32_e32 v0, 0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: global_load_d16_u8 v0, v0, s[0:1] offset:-4096 scope:SCOPE_SYS
-; GFX12-SDAG-TRUE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: global_store_b8 v[0:1], v0, off
-; GFX12-SDAG-TRUE16-NEXT: s_endpgm
-;
-; GFX12-SDAG-FAKE16-LABEL: global_inst_salu_offset_neg_12bit_max:
-; GFX12-SDAG-FAKE16: ; %bb.0:
-; GFX12-SDAG-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX12-SDAG-FAKE16-NEXT: v_mov_b32_e32 v0, 0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: global_load_u8 v0, v0, s[0:1] offset:-4096 scope:SCOPE_SYS
-; GFX12-SDAG-FAKE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: global_store_b8 v[0:1], v0, off
-; GFX12-SDAG-FAKE16-NEXT: s_endpgm
%gep = getelementptr i8, ptr addrspace(1) %p, i64 -4096
%load = load volatile i8, ptr addrspace(1) %gep, align 1
store i8 %load, ptr addrspace(1) poison
@@ -3416,15 +2252,15 @@ define amdgpu_kernel void @global_inst_salu_offset_neg_13bit_max(ptr addrspace(1
; GFX11-GISEL-NEXT: global_store_b8 v[0:1], v0, off
; GFX11-GISEL-NEXT: s_endpgm
;
-; GFX12-GISEL-LABEL: global_inst_salu_offset_neg_13bit_max:
-; GFX12-GISEL: ; %bb.0:
-; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, 0
-; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: global_load_u8 v0, v0, s[0:1] offset:-8192 scope:SCOPE_SYS
-; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
-; GFX12-GISEL-NEXT: global_store_b8 v[0:1], v0, off
-; GFX12-GISEL-NEXT: s_endpgm
+; GFX12-LABEL: global_inst_salu_offset_neg_13bit_max:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX12-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: global_load_u8 v0, v0, s[0:1] offset:-8192 scope:SCOPE_SYS
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: global_store_b8 v[0:1], v0, off
+; GFX12-NEXT: s_endpgm
;
; GFX10-SDAG-LABEL: global_inst_salu_offset_neg_13bit_max:
; GFX10-SDAG: ; %bb.0:
@@ -3437,49 +2273,17 @@ define amdgpu_kernel void @global_inst_salu_offset_neg_13bit_max(ptr addrspace(1
; GFX10-SDAG-NEXT: global_store_byte v[0:1], v0, off
; GFX10-SDAG-NEXT: s_endpgm
;
-; GFX11-SDAG-TRUE16-LABEL: global_inst_salu_offset_neg_13bit_max:
-; GFX11-SDAG-TRUE16: ; %bb.0:
-; GFX11-SDAG-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-SDAG-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: v_add_co_u32 v0, s0, 0xffffe000, s0
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, s1, s0
-; GFX11-SDAG-TRUE16-NEXT: global_load_d16_u8 v0, v[0:1], off glc dlc
-; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: global_store_b8 v[0:1], v0, off
-; GFX11-SDAG-TRUE16-NEXT: s_endpgm
-;
-; GFX11-SDAG-FAKE16-LABEL: global_inst_salu_offset_neg_13bit_max:
-; GFX11-SDAG-FAKE16: ; %bb.0:
-; GFX11-SDAG-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-SDAG-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: v_add_co_u32 v0, s0, 0xffffe000, s0
-; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, s1, s0
-; GFX11-SDAG-FAKE16-NEXT: global_load_u8 v0, v[0:1], off glc dlc
-; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: global_store_b8 v[0:1], v0, off
-; GFX11-SDAG-FAKE16-NEXT: s_endpgm
-;
-; GFX12-SDAG-TRUE16-LABEL: global_inst_salu_offset_neg_13bit_max:
-; GFX12-SDAG-TRUE16: ; %bb.0:
-; GFX12-SDAG-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX12-SDAG-TRUE16-NEXT: v_mov_b32_e32 v0, 0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: global_load_d16_u8 v0, v0, s[0:1] offset:-8192 scope:SCOPE_SYS
-; GFX12-SDAG-TRUE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: global_store_b8 v[0:1], v0, off
-; GFX12-SDAG-TRUE16-NEXT: s_endpgm
-;
-; GFX12-SDAG-FAKE16-LABEL: global_inst_salu_offset_neg_13bit_max:
-; GFX12-SDAG-FAKE16: ; %bb.0:
-; GFX12-SDAG-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX12-SDAG-FAKE16-NEXT: v_mov_b32_e32 v0, 0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: global_load_u8 v0, v0, s[0:1] offset:-8192 scope:SCOPE_SYS
-; GFX12-SDAG-FAKE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: global_store_b8 v[0:1], v0, off
-; GFX12-SDAG-FAKE16-NEXT: s_endpgm
+; GFX11-SDAG-LABEL: global_inst_salu_offset_neg_13bit_max:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0xffffe000, s0
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, -1, s1, s0
+; GFX11-SDAG-NEXT: global_load_u8 v0, v[0:1], off glc dlc
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX11-SDAG-NEXT: global_store_b8 v[0:1], v0, off
+; GFX11-SDAG-NEXT: s_endpgm
%gep = getelementptr i8, ptr addrspace(1) %p, i64 -8192
%load = load volatile i8, ptr addrspace(1) %gep, align 1
store i8 %load, ptr addrspace(1) poison
@@ -3507,65 +2311,25 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_11bit_max(ptr addrspace(1)
; GFX10-NEXT: global_store_byte v[0:1], v0, off
; GFX10-NEXT: s_endpgm
;
-; GFX11-GISEL-LABEL: global_inst_salu_offset_2x_11bit_max:
-; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, 0
-; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: global_load_u8 v0, v0, s[0:1] offset:4095 glc dlc
-; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX11-GISEL-NEXT: global_store_b8 v[0:1], v0, off
-; GFX11-GISEL-NEXT: s_endpgm
-;
-; GFX12-GISEL-LABEL: global_inst_salu_offset_2x_11bit_max:
-; GFX12-GISEL: ; %bb.0:
-; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, 0
-; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: global_load_u8 v0, v0, s[0:1] offset:4095 scope:SCOPE_SYS
-; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
-; GFX12-GISEL-NEXT: global_store_b8 v[0:1], v0, off
-; GFX12-GISEL-NEXT: s_endpgm
-;
-; GFX11-SDAG-TRUE16-LABEL: global_inst_salu_offset_2x_11bit_max:
-; GFX11-SDAG-TRUE16: ; %bb.0:
-; GFX11-SDAG-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-SDAG-TRUE16-NEXT: v_mov_b32_e32 v0, 0
-; GFX11-SDAG-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: global_load_d16_u8 v0, v0, s[0:1] offset:4095 glc dlc
-; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: global_store_b8 v[0:1], v0, off
-; GFX11-SDAG-TRUE16-NEXT: s_endpgm
-;
-; GFX11-SDAG-FAKE16-LABEL: global_inst_salu_offset_2x_11bit_max:
-; GFX11-SDAG-FAKE16: ; %bb.0:
-; GFX11-SDAG-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-SDAG-FAKE16-NEXT: v_mov_b32_e32 v0, 0
-; GFX11-SDAG-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: global_load_u8 v0, v0, s[0:1] offset:4095 glc dlc
-; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: global_store_b8 v[0:1], v0, off
-; GFX11-SDAG-FAKE16-NEXT: s_endpgm
-;
-; GFX12-SDAG-TRUE16-LABEL: global_inst_salu_offset_2x_11bit_max:
-; GFX12-SDAG-TRUE16: ; %bb.0:
-; GFX12-SDAG-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX12-SDAG-TRUE16-NEXT: v_mov_b32_e32 v0, 0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: global_load_d16_u8 v0, v0, s[0:1] offset:4095 scope:SCOPE_SYS
-; GFX12-SDAG-TRUE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: global_store_b8 v[0:1], v0, off
-; GFX12-SDAG-TRUE16-NEXT: s_endpgm
-;
-; GFX12-SDAG-FAKE16-LABEL: global_inst_salu_offset_2x_11bit_max:
-; GFX12-SDAG-FAKE16: ; %bb.0:
-; GFX12-SDAG-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX12-SDAG-FAKE16-NEXT: v_mov_b32_e32 v0, 0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: global_load_u8 v0, v0, s[0:1] offset:4095 scope:SCOPE_SYS
-; GFX12-SDAG-FAKE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: global_store_b8 v[0:1], v0, off
-; GFX12-SDAG-FAKE16-NEXT: s_endpgm
+; GFX11-LABEL: global_inst_salu_offset_2x_11bit_max:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: global_load_u8 v0, v0, s[0:1] offset:4095 glc dlc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: global_store_b8 v[0:1], v0, off
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: global_inst_salu_offset_2x_11bit_max:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX12-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: global_load_u8 v0, v0, s[0:1] offset:4095 scope:SCOPE_SYS
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: global_store_b8 v[0:1], v0, off
+; GFX12-NEXT: s_endpgm
%gep = getelementptr i8, ptr addrspace(1) %p, i64 4095
%load = load volatile i8, ptr addrspace(1) %gep, align 1
store i8 %load, ptr addrspace(1) poison
@@ -3593,65 +2357,25 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_12bit_max(ptr addrspace(1)
; GFX10-NEXT: global_store_byte v[0:1], v0, off
; GFX10-NEXT: s_endpgm
;
-; GFX11-GISEL-LABEL: global_inst_salu_offset_2x_12bit_max:
-; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, 0x1000
-; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: global_load_u8 v0, v0, s[0:1] offset:4095 glc dlc
-; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX11-GISEL-NEXT: global_store_b8 v[0:1], v0, off
-; GFX11-GISEL-NEXT: s_endpgm
-;
-; GFX12-GISEL-LABEL: global_inst_salu_offset_2x_12bit_max:
-; GFX12-GISEL: ; %bb.0:
-; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, 0
-; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: global_load_u8 v0, v0, s[0:1] offset:8191 scope:SCOPE_SYS
-; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
-; GFX12-GISEL-NEXT: global_store_b8 v[0:1], v0, off
-; GFX12-GISEL-NEXT: s_endpgm
-;
-; GFX11-SDAG-TRUE16-LABEL: global_inst_salu_offset_2x_12bit_max:
-; GFX11-SDAG-TRUE16: ; %bb.0:
-; GFX11-SDAG-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-SDAG-TRUE16-NEXT: v_mov_b32_e32 v0, 0x1000
-; GFX11-SDAG-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: global_load_d16_u8 v0, v0, s[0:1] offset:4095 glc dlc
-; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: global_store_b8 v[0:1], v0, off
-; GFX11-SDAG-TRUE16-NEXT: s_endpgm
-;
-; GFX11-SDAG-FAKE16-LABEL: global_inst_salu_offset_2x_12bit_max:
-; GFX11-SDAG-FAKE16: ; %bb.0:
-; GFX11-SDAG-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-SDAG-FAKE16-NEXT: v_mov_b32_e32 v0, 0x1000
-; GFX11-SDAG-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: global_load_u8 v0, v0, s[0:1] offset:4095 glc dlc
-; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: global_store_b8 v[0:1], v0, off
-; GFX11-SDAG-FAKE16-NEXT: s_endpgm
-;
-; GFX12-SDAG-TRUE16-LABEL: global_inst_salu_offset_2x_12bit_max:
-; GFX12-SDAG-TRUE16: ; %bb.0:
-; GFX12-SDAG-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX12-SDAG-TRUE16-NEXT: v_mov_b32_e32 v0, 0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: global_load_d16_u8 v0, v0, s[0:1] offset:8191 scope:SCOPE_SYS
-; GFX12-SDAG-TRUE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: global_store_b8 v[0:1], v0, off
-; GFX12-SDAG-TRUE16-NEXT: s_endpgm
-;
-; GFX12-SDAG-FAKE16-LABEL: global_inst_salu_offset_2x_12bit_max:
-; GFX12-SDAG-FAKE16: ; %bb.0:
-; GFX12-SDAG-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX12-SDAG-FAKE16-NEXT: v_mov_b32_e32 v0, 0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: global_load_u8 v0, v0, s[0:1] offset:8191 scope:SCOPE_SYS
-; GFX12-SDAG-FAKE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: global_store_b8 v[0:1], v0, off
-; GFX12-SDAG-FAKE16-NEXT: s_endpgm
+; GFX11-LABEL: global_inst_salu_offset_2x_12bit_max:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-NEXT: v_mov_b32_e32 v0, 0x1000
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: global_load_u8 v0, v0, s[0:1] offset:4095 glc dlc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: global_store_b8 v[0:1], v0, off
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: global_inst_salu_offset_2x_12bit_max:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX12-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: global_load_u8 v0, v0, s[0:1] offset:8191 scope:SCOPE_SYS
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: global_store_b8 v[0:1], v0, off
+; GFX12-NEXT: s_endpgm
%gep = getelementptr i8, ptr addrspace(1) %p, i64 8191
%load = load volatile i8, ptr addrspace(1) %gep, align 1
store i8 %load, ptr addrspace(1) poison
@@ -3679,65 +2403,25 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_13bit_max(ptr addrspace(1)
; GFX10-NEXT: global_store_byte v[0:1], v0, off
; GFX10-NEXT: s_endpgm
;
-; GFX11-GISEL-LABEL: global_inst_salu_offset_2x_13bit_max:
-; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, 0x3000
-; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: global_load_u8 v0, v0, s[0:1] offset:4095 glc dlc
-; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX11-GISEL-NEXT: global_store_b8 v[0:1], v0, off
-; GFX11-GISEL-NEXT: s_endpgm
-;
-; GFX12-GISEL-LABEL: global_inst_salu_offset_2x_13bit_max:
-; GFX12-GISEL: ; %bb.0:
-; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, 0
-; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: global_load_u8 v0, v0, s[0:1] offset:16383 scope:SCOPE_SYS
-; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
-; GFX12-GISEL-NEXT: global_store_b8 v[0:1], v0, off
-; GFX12-GISEL-NEXT: s_endpgm
-;
-; GFX11-SDAG-TRUE16-LABEL: global_inst_salu_offset_2x_13bit_max:
-; GFX11-SDAG-TRUE16: ; %bb.0:
-; GFX11-SDAG-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-SDAG-TRUE16-NEXT: v_mov_b32_e32 v0, 0x3000
-; GFX11-SDAG-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: global_load_d16_u8 v0, v0, s[0:1] offset:4095 glc dlc
-; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: global_store_b8 v[0:1], v0, off
-; GFX11-SDAG-TRUE16-NEXT: s_endpgm
-;
-; GFX11-SDAG-FAKE16-LABEL: global_inst_salu_offset_2x_13bit_max:
-; GFX11-SDAG-FAKE16: ; %bb.0:
-; GFX11-SDAG-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-SDAG-FAKE16-NEXT: v_mov_b32_e32 v0, 0x3000
-; GFX11-SDAG-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: global_load_u8 v0, v0, s[0:1] offset:4095 glc dlc
-; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: global_store_b8 v[0:1], v0, off
-; GFX11-SDAG-FAKE16-NEXT: s_endpgm
-;
-; GFX12-SDAG-TRUE16-LABEL: global_inst_salu_offset_2x_13bit_max:
-; GFX12-SDAG-TRUE16: ; %bb.0:
-; GFX12-SDAG-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX12-SDAG-TRUE16-NEXT: v_mov_b32_e32 v0, 0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: global_load_d16_u8 v0, v0, s[0:1] offset:16383 scope:SCOPE_SYS
-; GFX12-SDAG-TRUE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: global_store_b8 v[0:1], v0, off
-; GFX12-SDAG-TRUE16-NEXT: s_endpgm
-;
-; GFX12-SDAG-FAKE16-LABEL: global_inst_salu_offset_2x_13bit_max:
-; GFX12-SDAG-FAKE16: ; %bb.0:
-; GFX12-SDAG-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX12-SDAG-FAKE16-NEXT: v_mov_b32_e32 v0, 0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: global_load_u8 v0, v0, s[0:1] offset:16383 scope:SCOPE_SYS
-; GFX12-SDAG-FAKE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: global_store_b8 v[0:1], v0, off
-; GFX12-SDAG-FAKE16-NEXT: s_endpgm
+; GFX11-LABEL: global_inst_salu_offset_2x_13bit_max:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-NEXT: v_mov_b32_e32 v0, 0x3000
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: global_load_u8 v0, v0, s[0:1] offset:4095 glc dlc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: global_store_b8 v[0:1], v0, off
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: global_inst_salu_offset_2x_13bit_max:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX12-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: global_load_u8 v0, v0, s[0:1] offset:16383 scope:SCOPE_SYS
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: global_store_b8 v[0:1], v0, off
+; GFX12-NEXT: s_endpgm
%gep = getelementptr i8, ptr addrspace(1) %p, i64 16383
%load = load volatile i8, ptr addrspace(1) %gep, align 1
store i8 %load, ptr addrspace(1) poison
@@ -3768,25 +2452,25 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_neg_11bit_max(ptr addrspac
; GFX10-GISEL-NEXT: global_store_byte v[0:1], v0, off
; GFX10-GISEL-NEXT: s_endpgm
;
-; GFX11-GISEL-LABEL: global_inst_salu_offset_2x_neg_11bit_max:
-; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, 0
-; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: global_load_u8 v0, v0, s[0:1] offset:-4096 glc dlc
-; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX11-GISEL-NEXT: global_store_b8 v[0:1], v0, off
-; GFX11-GISEL-NEXT: s_endpgm
-;
-; GFX12-GISEL-LABEL: global_inst_salu_offset_2x_neg_11bit_max:
-; GFX12-GISEL: ; %bb.0:
-; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, 0
-; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: global_load_u8 v0, v0, s[0:1] offset:-4096 scope:SCOPE_SYS
-; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
-; GFX12-GISEL-NEXT: global_store_b8 v[0:1], v0, off
-; GFX12-GISEL-NEXT: s_endpgm
+; GFX11-LABEL: global_inst_salu_offset_2x_neg_11bit_max:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: global_load_u8 v0, v0, s[0:1] offset:-4096 glc dlc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: global_store_b8 v[0:1], v0, off
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: global_inst_salu_offset_2x_neg_11bit_max:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX12-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: global_load_u8 v0, v0, s[0:1] offset:-4096 scope:SCOPE_SYS
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: global_store_b8 v[0:1], v0, off
+; GFX12-NEXT: s_endpgm
;
; GFX10-SDAG-LABEL: global_inst_salu_offset_2x_neg_11bit_max:
; GFX10-SDAG: ; %bb.0:
@@ -3798,46 +2482,6 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_neg_11bit_max(ptr addrspac
; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX10-SDAG-NEXT: global_store_byte v[0:1], v0, off
; GFX10-SDAG-NEXT: s_endpgm
-;
-; GFX11-SDAG-TRUE16-LABEL: global_inst_salu_offset_2x_neg_11bit_max:
-; GFX11-SDAG-TRUE16: ; %bb.0:
-; GFX11-SDAG-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-SDAG-TRUE16-NEXT: v_mov_b32_e32 v0, 0
-; GFX11-SDAG-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: global_load_d16_u8 v0, v0, s[0:1] offset:-4096 glc dlc
-; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: global_store_b8 v[0:1], v0, off
-; GFX11-SDAG-TRUE16-NEXT: s_endpgm
-;
-; GFX11-SDAG-FAKE16-LABEL: global_inst_salu_offset_2x_neg_11bit_max:
-; GFX11-SDAG-FAKE16: ; %bb.0:
-; GFX11-SDAG-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-SDAG-FAKE16-NEXT: v_mov_b32_e32 v0, 0
-; GFX11-SDAG-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: global_load_u8 v0, v0, s[0:1] offset:-4096 glc dlc
-; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: global_store_b8 v[0:1], v0, off
-; GFX11-SDAG-FAKE16-NEXT: s_endpgm
-;
-; GFX12-SDAG-TRUE16-LABEL: global_inst_salu_offset_2x_neg_11bit_max:
-; GFX12-SDAG-TRUE16: ; %bb.0:
-; GFX12-SDAG-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX12-SDAG-TRUE16-NEXT: v_mov_b32_e32 v0, 0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: global_load_d16_u8 v0, v0, s[0:1] offset:-4096 scope:SCOPE_SYS
-; GFX12-SDAG-TRUE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: global_store_b8 v[0:1], v0, off
-; GFX12-SDAG-TRUE16-NEXT: s_endpgm
-;
-; GFX12-SDAG-FAKE16-LABEL: global_inst_salu_offset_2x_neg_11bit_max:
-; GFX12-SDAG-FAKE16: ; %bb.0:
-; GFX12-SDAG-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX12-SDAG-FAKE16-NEXT: v_mov_b32_e32 v0, 0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: global_load_u8 v0, v0, s[0:1] offset:-4096 scope:SCOPE_SYS
-; GFX12-SDAG-FAKE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: global_store_b8 v[0:1], v0, off
-; GFX12-SDAG-FAKE16-NEXT: s_endpgm
%gep = getelementptr i8, ptr addrspace(1) %p, i64 -4096
%load = load volatile i8, ptr addrspace(1) %gep, align 1
store i8 %load, ptr addrspace(1) poison
@@ -3883,15 +2527,15 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_neg_12bit_max(ptr addrspac
; GFX11-GISEL-NEXT: global_store_b8 v[0:1], v0, off
; GFX11-GISEL-NEXT: s_endpgm
;
-; GFX12-GISEL-LABEL: global_inst_salu_offset_2x_neg_12bit_max:
-; GFX12-GISEL: ; %bb.0:
-; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, 0
-; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: global_load_u8 v0, v0, s[0:1] offset:-8192 scope:SCOPE_SYS
-; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
-; GFX12-GISEL-NEXT: global_store_b8 v[0:1], v0, off
-; GFX12-GISEL-NEXT: s_endpgm
+; GFX12-LABEL: global_inst_salu_offset_2x_neg_12bit_max:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX12-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: global_load_u8 v0, v0, s[0:1] offset:-8192 scope:SCOPE_SYS
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: global_store_b8 v[0:1], v0, off
+; GFX12-NEXT: s_endpgm
;
; GFX10-SDAG-LABEL: global_inst_salu_offset_2x_neg_12bit_max:
; GFX10-SDAG: ; %bb.0:
@@ -3904,49 +2548,17 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_neg_12bit_max(ptr addrspac
; GFX10-SDAG-NEXT: global_store_byte v[0:1], v0, off
; GFX10-SDAG-NEXT: s_endpgm
;
-; GFX11-SDAG-TRUE16-LABEL: global_inst_salu_offset_2x_neg_12bit_max:
-; GFX11-SDAG-TRUE16: ; %bb.0:
-; GFX11-SDAG-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-SDAG-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: v_add_co_u32 v0, s0, 0xffffe000, s0
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, s1, s0
-; GFX11-SDAG-TRUE16-NEXT: global_load_d16_u8 v0, v[0:1], off glc dlc
-; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: global_store_b8 v[0:1], v0, off
-; GFX11-SDAG-TRUE16-NEXT: s_endpgm
-;
-; GFX11-SDAG-FAKE16-LABEL: global_inst_salu_offset_2x_neg_12bit_max:
-; GFX11-SDAG-FAKE16: ; %bb.0:
-; GFX11-SDAG-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-SDAG-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: v_add_co_u32 v0, s0, 0xffffe000, s0
-; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, s1, s0
-; GFX11-SDAG-FAKE16-NEXT: global_load_u8 v0, v[0:1], off glc dlc
-; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: global_store_b8 v[0:1], v0, off
-; GFX11-SDAG-FAKE16-NEXT: s_endpgm
-;
-; GFX12-SDAG-TRUE16-LABEL: global_inst_salu_offset_2x_neg_12bit_max:
-; GFX12-SDAG-TRUE16: ; %bb.0:
-; GFX12-SDAG-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX12-SDAG-TRUE16-NEXT: v_mov_b32_e32 v0, 0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: global_load_d16_u8 v0, v0, s[0:1] offset:-8192 scope:SCOPE_SYS
-; GFX12-SDAG-TRUE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: global_store_b8 v[0:1], v0, off
-; GFX12-SDAG-TRUE16-NEXT: s_endpgm
-;
-; GFX12-SDAG-FAKE16-LABEL: global_inst_salu_offset_2x_neg_12bit_max:
-; GFX12-SDAG-FAKE16: ; %bb.0:
-; GFX12-SDAG-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX12-SDAG-FAKE16-NEXT: v_mov_b32_e32 v0, 0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: global_load_u8 v0, v0, s[0:1] offset:-8192 scope:SCOPE_SYS
-; GFX12-SDAG-FAKE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: global_store_b8 v[0:1], v0, off
-; GFX12-SDAG-FAKE16-NEXT: s_endpgm
+; GFX11-SDAG-LABEL: global_inst_salu_offset_2x_neg_12bit_max:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0xffffe000, s0
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, -1, s1, s0
+; GFX11-SDAG-NEXT: global_load_u8 v0, v[0:1], off glc dlc
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX11-SDAG-NEXT: global_store_b8 v[0:1], v0, off
+; GFX11-SDAG-NEXT: s_endpgm
%gep = getelementptr i8, ptr addrspace(1) %p, i64 -8192
%load = load volatile i8, ptr addrspace(1) %gep, align 1
store i8 %load, ptr addrspace(1) poison
@@ -3992,15 +2604,15 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_neg_13bit_max(ptr addrspac
; GFX11-GISEL-NEXT: global_store_b8 v[0:1], v0, off
; GFX11-GISEL-NEXT: s_endpgm
;
-; GFX12-GISEL-LABEL: global_inst_salu_offset_2x_neg_13bit_max:
-; GFX12-GISEL: ; %bb.0:
-; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, 0
-; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: global_load_u8 v0, v0, s[0:1] offset:-16384 scope:SCOPE_SYS
-; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
-; GFX12-GISEL-NEXT: global_store_b8 v[0:1], v0, off
-; GFX12-GISEL-NEXT: s_endpgm
+; GFX12-LABEL: global_inst_salu_offset_2x_neg_13bit_max:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX12-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: global_load_u8 v0, v0, s[0:1] offset:-16384 scope:SCOPE_SYS
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: global_store_b8 v[0:1], v0, off
+; GFX12-NEXT: s_endpgm
;
; GFX10-SDAG-LABEL: global_inst_salu_offset_2x_neg_13bit_max:
; GFX10-SDAG: ; %bb.0:
@@ -4013,49 +2625,17 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_neg_13bit_max(ptr addrspac
; GFX10-SDAG-NEXT: global_store_byte v[0:1], v0, off
; GFX10-SDAG-NEXT: s_endpgm
;
-; GFX11-SDAG-TRUE16-LABEL: global_inst_salu_offset_2x_neg_13bit_max:
-; GFX11-SDAG-TRUE16: ; %bb.0:
-; GFX11-SDAG-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-SDAG-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: v_add_co_u32 v0, s0, 0xffffc000, s0
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, s1, s0
-; GFX11-SDAG-TRUE16-NEXT: global_load_d16_u8 v0, v[0:1], off glc dlc
-; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: global_store_b8 v[0:1], v0, off
-; GFX11-SDAG-TRUE16-NEXT: s_endpgm
-;
-; GFX11-SDAG-FAKE16-LABEL: global_inst_salu_offset_2x_neg_13bit_max:
-; GFX11-SDAG-FAKE16: ; %bb.0:
-; GFX11-SDAG-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-SDAG-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: v_add_co_u32 v0, s0, 0xffffc000, s0
-; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, s1, s0
-; GFX11-SDAG-FAKE16-NEXT: global_load_u8 v0, v[0:1], off glc dlc
-; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: global_store_b8 v[0:1], v0, off
-; GFX11-SDAG-FAKE16-NEXT: s_endpgm
-;
-; GFX12-SDAG-TRUE16-LABEL: global_inst_salu_offset_2x_neg_13bit_max:
-; GFX12-SDAG-TRUE16: ; %bb.0:
-; GFX12-SDAG-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX12-SDAG-TRUE16-NEXT: v_mov_b32_e32 v0, 0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: global_load_d16_u8 v0, v0, s[0:1] offset:-16384 scope:SCOPE_SYS
-; GFX12-SDAG-TRUE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: global_store_b8 v[0:1], v0, off
-; GFX12-SDAG-TRUE16-NEXT: s_endpgm
-;
-; GFX12-SDAG-FAKE16-LABEL: global_inst_salu_offset_2x_neg_13bit_max:
-; GFX12-SDAG-FAKE16: ; %bb.0:
-; GFX12-SDAG-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX12-SDAG-FAKE16-NEXT: v_mov_b32_e32 v0, 0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: global_load_u8 v0, v0, s[0:1] offset:-16384 scope:SCOPE_SYS
-; GFX12-SDAG-FAKE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: global_store_b8 v[0:1], v0, off
-; GFX12-SDAG-FAKE16-NEXT: s_endpgm
+; GFX11-SDAG-LABEL: global_inst_salu_offset_2x_neg_13bit_max:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0xffffc000, s0
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, -1, s1, s0
+; GFX11-SDAG-NEXT: global_load_u8 v0, v[0:1], off glc dlc
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX11-SDAG-NEXT: global_store_b8 v[0:1], v0, off
+; GFX11-SDAG-NEXT: s_endpgm
%gep = getelementptr i8, ptr addrspace(1) %p, i64 -16384
%load = load volatile i8, ptr addrspace(1) %gep, align 1
store i8 %load, ptr addrspace(1) poison
@@ -4126,53 +2706,29 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_split0(ptr addrsp
; GFX10-SDAG-NEXT: global_store_byte v[0:1], v0, off
; GFX10-SDAG-NEXT: s_endpgm
;
-; GFX11-SDAG-TRUE16-LABEL: global_inst_salu_offset_64bit_11bit_split0:
-; GFX11-SDAG-TRUE16: ; %bb.0:
-; GFX11-SDAG-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-SDAG-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: v_add_co_u32 v0, s0, 0, s0
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s1, s0
-; GFX11-SDAG-TRUE16-NEXT: global_load_d16_u8 v0, v[0:1], off offset:2047 glc dlc
-; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: global_store_b8 v[0:1], v0, off
-; GFX11-SDAG-TRUE16-NEXT: s_endpgm
-;
-; GFX11-SDAG-FAKE16-LABEL: global_inst_salu_offset_64bit_11bit_split0:
-; GFX11-SDAG-FAKE16: ; %bb.0:
-; GFX11-SDAG-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-SDAG-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: v_add_co_u32 v0, s0, 0, s0
-; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s1, s0
-; GFX11-SDAG-FAKE16-NEXT: global_load_u8 v0, v[0:1], off offset:2047 glc dlc
-; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: global_store_b8 v[0:1], v0, off
-; GFX11-SDAG-FAKE16-NEXT: s_endpgm
-;
-; GFX12-SDAG-TRUE16-LABEL: global_inst_salu_offset_64bit_11bit_split0:
-; GFX12-SDAG-TRUE16: ; %bb.0:
-; GFX12-SDAG-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: v_add_co_u32 v0, s0, 0, s0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-SDAG-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s1, s0
-; GFX12-SDAG-TRUE16-NEXT: global_load_d16_u8 v0, v[0:1], off offset:2047 scope:SCOPE_SYS
-; GFX12-SDAG-TRUE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: global_store_b8 v[0:1], v0, off
-; GFX12-SDAG-TRUE16-NEXT: s_endpgm
-;
-; GFX12-SDAG-FAKE16-LABEL: global_inst_salu_offset_64bit_11bit_split0:
-; GFX12-SDAG-FAKE16: ; %bb.0:
-; GFX12-SDAG-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX12-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: v_add_co_u32 v0, s0, 0, s0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-SDAG-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s1, s0
-; GFX12-SDAG-FAKE16-NEXT: global_load_u8 v0, v[0:1], off offset:2047 scope:SCOPE_SYS
-; GFX12-SDAG-FAKE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: global_store_b8 v[0:1], v0, off
-; GFX12-SDAG-FAKE16-NEXT: s_endpgm
+; GFX11-SDAG-LABEL: global_inst_salu_offset_64bit_11bit_split0:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s0
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s1, s0
+; GFX11-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:2047 glc dlc
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX11-SDAG-NEXT: global_store_b8 v[0:1], v0, off
+; GFX11-SDAG-NEXT: s_endpgm
+;
+; GFX12-SDAG-LABEL: global_inst_salu_offset_64bit_11bit_split0:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s0
+; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff
+; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s1, s0
+; GFX12-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:2047 scope:SCOPE_SYS
+; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
+; GFX12-SDAG-NEXT: global_store_b8 v[0:1], v0, off
+; GFX12-SDAG-NEXT: s_endpgm
%gep = getelementptr i8, ptr addrspace(1) %p, i64 8589936639
%load = load volatile i8, ptr addrspace(1) %gep, align 1
store i8 %load, ptr addrspace(1) poison
@@ -4243,53 +2799,29 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_split1(ptr addrsp
; GFX10-SDAG-NEXT: global_store_byte v[0:1], v0, off
; GFX10-SDAG-NEXT: s_endpgm
;
-; GFX11-SDAG-TRUE16-LABEL: global_inst_salu_offset_64bit_11bit_split1:
-; GFX11-SDAG-TRUE16: ; %bb.0:
-; GFX11-SDAG-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-SDAG-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: v_add_co_u32 v0, s0, 0, s0
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s1, s0
-; GFX11-SDAG-TRUE16-NEXT: global_load_d16_u8 v0, v[0:1], off offset:2048 glc dlc
-; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: global_store_b8 v[0:1], v0, off
-; GFX11-SDAG-TRUE16-NEXT: s_endpgm
-;
-; GFX11-SDAG-FAKE16-LABEL: global_inst_salu_offset_64bit_11bit_split1:
-; GFX11-SDAG-FAKE16: ; %bb.0:
-; GFX11-SDAG-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-SDAG-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: v_add_co_u32 v0, s0, 0, s0
-; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s1, s0
-; GFX11-SDAG-FAKE16-NEXT: global_load_u8 v0, v[0:1], off offset:2048 glc dlc
-; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: global_store_b8 v[0:1], v0, off
-; GFX11-SDAG-FAKE16-NEXT: s_endpgm
-;
-; GFX12-SDAG-TRUE16-LABEL: global_inst_salu_offset_64bit_11bit_split1:
-; GFX12-SDAG-TRUE16: ; %bb.0:
-; GFX12-SDAG-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: v_add_co_u32 v0, s0, 0, s0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-SDAG-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s1, s0
-; GFX12-SDAG-TRUE16-NEXT: global_load_d16_u8 v0, v[0:1], off offset:2048 scope:SCOPE_SYS
-; GFX12-SDAG-TRUE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: global_store_b8 v[0:1], v0, off
-; GFX12-SDAG-TRUE16-NEXT: s_endpgm
-;
-; GFX12-SDAG-FAKE16-LABEL: global_inst_salu_offset_64bit_11bit_split1:
-; GFX12-SDAG-FAKE16: ; %bb.0:
-; GFX12-SDAG-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX12-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: v_add_co_u32 v0, s0, 0, s0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-SDAG-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s1, s0
-; GFX12-SDAG-FAKE16-NEXT: global_load_u8 v0, v[0:1], off offset:2048 scope:SCOPE_SYS
-; GFX12-SDAG-FAKE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: global_store_b8 v[0:1], v0, off
-; GFX12-SDAG-FAKE16-NEXT: s_endpgm
+; GFX11-SDAG-LABEL: global_inst_salu_offset_64bit_11bit_split1:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s0
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s1, s0
+; GFX11-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:2048 glc dlc
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX11-SDAG-NEXT: global_store_b8 v[0:1], v0, off
+; GFX11-SDAG-NEXT: s_endpgm
+;
+; GFX12-SDAG-LABEL: global_inst_salu_offset_64bit_11bit_split1:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s0
+; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff
+; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s1, s0
+; GFX12-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:2048 scope:SCOPE_SYS
+; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
+; GFX12-SDAG-NEXT: global_store_b8 v[0:1], v0, off
+; GFX12-SDAG-NEXT: s_endpgm
%gep = getelementptr i8, ptr addrspace(1) %p, i64 8589936640
%load = load volatile i8, ptr addrspace(1) %gep, align 1
store i8 %load, ptr addrspace(1) poison
@@ -4360,53 +2892,29 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_split0(ptr addrsp
; GFX10-SDAG-NEXT: global_store_byte v[0:1], v0, off
; GFX10-SDAG-NEXT: s_endpgm
;
-; GFX11-SDAG-TRUE16-LABEL: global_inst_salu_offset_64bit_12bit_split0:
-; GFX11-SDAG-TRUE16: ; %bb.0:
-; GFX11-SDAG-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-SDAG-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: v_add_co_u32 v0, s0, 0, s0
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s1, s0
-; GFX11-SDAG-TRUE16-NEXT: global_load_d16_u8 v0, v[0:1], off offset:4095 glc dlc
-; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: global_store_b8 v[0:1], v0, off
-; GFX11-SDAG-TRUE16-NEXT: s_endpgm
-;
-; GFX11-SDAG-FAKE16-LABEL: global_inst_salu_offset_64bit_12bit_split0:
-; GFX11-SDAG-FAKE16: ; %bb.0:
-; GFX11-SDAG-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-SDAG-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: v_add_co_u32 v0, s0, 0, s0
-; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s1, s0
-; GFX11-SDAG-FAKE16-NEXT: global_load_u8 v0, v[0:1], off offset:4095 glc dlc
-; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: global_store_b8 v[0:1], v0, off
-; GFX11-SDAG-FAKE16-NEXT: s_endpgm
-;
-; GFX12-SDAG-TRUE16-LABEL: global_inst_salu_offset_64bit_12bit_split0:
-; GFX12-SDAG-TRUE16: ; %bb.0:
-; GFX12-SDAG-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: v_add_co_u32 v0, s0, 0, s0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-SDAG-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s1, s0
-; GFX12-SDAG-TRUE16-NEXT: global_load_d16_u8 v0, v[0:1], off offset:4095 scope:SCOPE_SYS
-; GFX12-SDAG-TRUE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: global_store_b8 v[0:1], v0, off
-; GFX12-SDAG-TRUE16-NEXT: s_endpgm
-;
-; GFX12-SDAG-FAKE16-LABEL: global_inst_salu_offset_64bit_12bit_split0:
-; GFX12-SDAG-FAKE16: ; %bb.0:
-; GFX12-SDAG-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX12-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: v_add_co_u32 v0, s0, 0, s0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-SDAG-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s1, s0
-; GFX12-SDAG-FAKE16-NEXT: global_load_u8 v0, v[0:1], off offset:4095 scope:SCOPE_SYS
-; GFX12-SDAG-FAKE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: global_store_b8 v[0:1], v0, off
-; GFX12-SDAG-FAKE16-NEXT: s_endpgm
+; GFX11-SDAG-LABEL: global_inst_salu_offset_64bit_12bit_split0:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s0
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s1, s0
+; GFX11-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:4095 glc dlc
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX11-SDAG-NEXT: global_store_b8 v[0:1], v0, off
+; GFX11-SDAG-NEXT: s_endpgm
+;
+; GFX12-SDAG-LABEL: global_inst_salu_offset_64bit_12bit_split0:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s0
+; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff
+; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s1, s0
+; GFX12-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:4095 scope:SCOPE_SYS
+; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
+; GFX12-SDAG-NEXT: global_store_b8 v[0:1], v0, off
+; GFX12-SDAG-NEXT: s_endpgm
%gep = getelementptr i8, ptr addrspace(1) %p, i64 8589938687
%load = load volatile i8, ptr addrspace(1) %gep, align 1
store i8 %load, ptr addrspace(1) poison
@@ -4477,53 +2985,29 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_split1(ptr addrsp
; GFX10-SDAG-NEXT: global_store_byte v[0:1], v0, off
; GFX10-SDAG-NEXT: s_endpgm
;
-; GFX11-SDAG-TRUE16-LABEL: global_inst_salu_offset_64bit_12bit_split1:
-; GFX11-SDAG-TRUE16: ; %bb.0:
-; GFX11-SDAG-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-SDAG-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: v_add_co_u32 v0, s0, 0x1000, s0
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s1, s0
-; GFX11-SDAG-TRUE16-NEXT: global_load_d16_u8 v0, v[0:1], off glc dlc
-; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: global_store_b8 v[0:1], v0, off
-; GFX11-SDAG-TRUE16-NEXT: s_endpgm
-;
-; GFX11-SDAG-FAKE16-LABEL: global_inst_salu_offset_64bit_12bit_split1:
-; GFX11-SDAG-FAKE16: ; %bb.0:
-; GFX11-SDAG-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-SDAG-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: v_add_co_u32 v0, s0, 0x1000, s0
-; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s1, s0
-; GFX11-SDAG-FAKE16-NEXT: global_load_u8 v0, v[0:1], off glc dlc
-; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: global_store_b8 v[0:1], v0, off
-; GFX11-SDAG-FAKE16-NEXT: s_endpgm
-;
-; GFX12-SDAG-TRUE16-LABEL: global_inst_salu_offset_64bit_12bit_split1:
-; GFX12-SDAG-TRUE16: ; %bb.0:
-; GFX12-SDAG-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: v_add_co_u32 v0, s0, 0, s0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-SDAG-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s1, s0
-; GFX12-SDAG-TRUE16-NEXT: global_load_d16_u8 v0, v[0:1], off offset:4096 scope:SCOPE_SYS
-; GFX12-SDAG-TRUE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: global_store_b8 v[0:1], v0, off
-; GFX12-SDAG-TRUE16-NEXT: s_endpgm
-;
-; GFX12-SDAG-FAKE16-LABEL: global_inst_salu_offset_64bit_12bit_split1:
-; GFX12-SDAG-FAKE16: ; %bb.0:
-; GFX12-SDAG-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX12-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: v_add_co_u32 v0, s0, 0, s0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-SDAG-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s1, s0
-; GFX12-SDAG-FAKE16-NEXT: global_load_u8 v0, v[0:1], off offset:4096 scope:SCOPE_SYS
-; GFX12-SDAG-FAKE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: global_store_b8 v[0:1], v0, off
-; GFX12-SDAG-FAKE16-NEXT: s_endpgm
+; GFX11-SDAG-LABEL: global_inst_salu_offset_64bit_12bit_split1:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0x1000, s0
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s1, s0
+; GFX11-SDAG-NEXT: global_load_u8 v0, v[0:1], off glc dlc
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX11-SDAG-NEXT: global_store_b8 v[0:1], v0, off
+; GFX11-SDAG-NEXT: s_endpgm
+;
+; GFX12-SDAG-LABEL: global_inst_salu_offset_64bit_12bit_split1:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s0
+; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff
+; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s1, s0
+; GFX12-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:4096 scope:SCOPE_SYS
+; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
+; GFX12-SDAG-NEXT: global_store_b8 v[0:1], v0, off
+; GFX12-SDAG-NEXT: s_endpgm
%gep = getelementptr i8, ptr addrspace(1) %p, i64 8589938688
%load = load volatile i8, ptr addrspace(1) %gep, align 1
store i8 %load, ptr addrspace(1) poison
@@ -4594,53 +3078,29 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_split0(ptr addrsp
; GFX10-SDAG-NEXT: global_store_byte v[0:1], v0, off
; GFX10-SDAG-NEXT: s_endpgm
;
-; GFX11-SDAG-TRUE16-LABEL: global_inst_salu_offset_64bit_13bit_split0:
-; GFX11-SDAG-TRUE16: ; %bb.0:
-; GFX11-SDAG-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-SDAG-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: v_add_co_u32 v0, s0, 0x1000, s0
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s1, s0
-; GFX11-SDAG-TRUE16-NEXT: global_load_d16_u8 v0, v[0:1], off offset:4095 glc dlc
-; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: global_store_b8 v[0:1], v0, off
-; GFX11-SDAG-TRUE16-NEXT: s_endpgm
-;
-; GFX11-SDAG-FAKE16-LABEL: global_inst_salu_offset_64bit_13bit_split0:
-; GFX11-SDAG-FAKE16: ; %bb.0:
-; GFX11-SDAG-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-SDAG-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: v_add_co_u32 v0, s0, 0x1000, s0
-; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s1, s0
-; GFX11-SDAG-FAKE16-NEXT: global_load_u8 v0, v[0:1], off offset:4095 glc dlc
-; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: global_store_b8 v[0:1], v0, off
-; GFX11-SDAG-FAKE16-NEXT: s_endpgm
-;
-; GFX12-SDAG-TRUE16-LABEL: global_inst_salu_offset_64bit_13bit_split0:
-; GFX12-SDAG-TRUE16: ; %bb.0:
-; GFX12-SDAG-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: v_add_co_u32 v0, s0, 0, s0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-SDAG-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s1, s0
-; GFX12-SDAG-TRUE16-NEXT: global_load_d16_u8 v0, v[0:1], off offset:8191 scope:SCOPE_SYS
-; GFX12-SDAG-TRUE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: global_store_b8 v[0:1], v0, off
-; GFX12-SDAG-TRUE16-NEXT: s_endpgm
-;
-; GFX12-SDAG-FAKE16-LABEL: global_inst_salu_offset_64bit_13bit_split0:
-; GFX12-SDAG-FAKE16: ; %bb.0:
-; GFX12-SDAG-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX12-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: v_add_co_u32 v0, s0, 0, s0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-SDAG-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s1, s0
-; GFX12-SDAG-FAKE16-NEXT: global_load_u8 v0, v[0:1], off offset:8191 scope:SCOPE_SYS
-; GFX12-SDAG-FAKE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: global_store_b8 v[0:1], v0, off
-; GFX12-SDAG-FAKE16-NEXT: s_endpgm
+; GFX11-SDAG-LABEL: global_inst_salu_offset_64bit_13bit_split0:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0x1000, s0
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s1, s0
+; GFX11-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:4095 glc dlc
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX11-SDAG-NEXT: global_store_b8 v[0:1], v0, off
+; GFX11-SDAG-NEXT: s_endpgm
+;
+; GFX12-SDAG-LABEL: global_inst_salu_offset_64bit_13bit_split0:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s0
+; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff
+; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s1, s0
+; GFX12-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:8191 scope:SCOPE_SYS
+; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
+; GFX12-SDAG-NEXT: global_store_b8 v[0:1], v0, off
+; GFX12-SDAG-NEXT: s_endpgm
%gep = getelementptr i8, ptr addrspace(1) %p, i64 8589942783
%load = load volatile i8, ptr addrspace(1) %gep, align 1
store i8 %load, ptr addrspace(1) poison
@@ -4711,53 +3171,29 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_split1(ptr addrsp
; GFX10-SDAG-NEXT: global_store_byte v[0:1], v0, off
; GFX10-SDAG-NEXT: s_endpgm
;
-; GFX11-SDAG-TRUE16-LABEL: global_inst_salu_offset_64bit_13bit_split1:
-; GFX11-SDAG-TRUE16: ; %bb.0:
-; GFX11-SDAG-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-SDAG-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: v_add_co_u32 v0, s0, 0x2000, s0
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s1, s0
-; GFX11-SDAG-TRUE16-NEXT: global_load_d16_u8 v0, v[0:1], off glc dlc
-; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: global_store_b8 v[0:1], v0, off
-; GFX11-SDAG-TRUE16-NEXT: s_endpgm
-;
-; GFX11-SDAG-FAKE16-LABEL: global_inst_salu_offset_64bit_13bit_split1:
-; GFX11-SDAG-FAKE16: ; %bb.0:
-; GFX11-SDAG-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-SDAG-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: v_add_co_u32 v0, s0, 0x2000, s0
-; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s1, s0
-; GFX11-SDAG-FAKE16-NEXT: global_load_u8 v0, v[0:1], off glc dlc
-; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: global_store_b8 v[0:1], v0, off
-; GFX11-SDAG-FAKE16-NEXT: s_endpgm
-;
-; GFX12-SDAG-TRUE16-LABEL: global_inst_salu_offset_64bit_13bit_split1:
-; GFX12-SDAG-TRUE16: ; %bb.0:
-; GFX12-SDAG-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: v_add_co_u32 v0, s0, 0, s0
-; GFX12-SDAG-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-SDAG-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s1, s0
-; GFX12-SDAG-TRUE16-NEXT: global_load_d16_u8 v0, v[0:1], off offset:8192 scope:SCOPE_SYS
-; GFX12-SDAG-TRUE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: global_store_b8 v[0:1], v0, off
-; GFX12-SDAG-TRUE16-NEXT: s_endpgm
-;
-; GFX12-SDAG-FAKE16-LABEL: global_inst_salu_offset_64bit_13bit_split1:
-; GFX12-SDAG-FAKE16: ; %bb.0:
-; GFX12-SDAG-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX12-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: v_add_co_u32 v0, s0, 0, s0
-; GFX12-SDAG-FAKE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-SDAG-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s1, s0
-; GFX12-SDAG-FAKE16-NEXT: global_load_u8 v0, v[0:1], off offset:8192 scope:SCOPE_SYS
-; GFX12-SDAG-FAKE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: global_store_b8 v[0:1], v0, off
-; GFX12-SDAG-FAKE16-NEXT: s_endpgm
+; GFX11-SDAG-LABEL: global_inst_salu_offset_64bit_13bit_split1:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0x2000, s0
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s1, s0
+; GFX11-SDAG-NEXT: global_load_u8 v0, v[0:1], off glc dlc
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX11-SDAG-NEXT: global_store_b8 v[0:1], v0, off
+; GFX11-SDAG-NEXT: s_endpgm
+;
+; GFX12-SDAG-LABEL: global_inst_salu_offset_64bit_13bit_split1:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s0
+; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff
+; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s1, s0
+; GFX12-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:8192 scope:SCOPE_SYS
+; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
+; GFX12-SDAG-NEXT: global_store_b8 v[0:1], v0, off
+; GFX12-SDAG-NEXT: s_endpgm
%gep = getelementptr i8, ptr addrspace(1) %p, i64 8589942784
%load = load volatile i8, ptr addrspace(1) %gep, align 1
store i8 %load, ptr addrspace(1) poison
@@ -4790,17 +3226,17 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_neg_high_split0(p
; GFX10-NEXT: global_store_byte v[0:1], v0, off
; GFX10-NEXT: s_endpgm
;
-; GFX11-GISEL-LABEL: global_inst_salu_offset_64bit_11bit_neg_high_split0:
-; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, 0
-; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: s_add_u32 s0, s0, 0x7ff
-; GFX11-GISEL-NEXT: s_addc_u32 s1, s1, 0x80000000
-; GFX11-GISEL-NEXT: global_load_u8 v0, v0, s[0:1] glc dlc
-; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX11-GISEL-NEXT: global_store_b8 v[0:1], v0, off
-; GFX11-GISEL-NEXT: s_endpgm
+; GFX11-LABEL: global_inst_salu_offset_64bit_11bit_neg_high_split0:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_add_u32 s0, s0, 0x7ff
+; GFX11-NEXT: s_addc_u32 s1, s1, 0x80000000
+; GFX11-NEXT: global_load_u8 v0, v0, s[0:1] glc dlc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: global_store_b8 v[0:1], v0, off
+; GFX11-NEXT: s_endpgm
;
; GFX12-GISEL-LABEL: global_inst_salu_offset_64bit_11bit_neg_high_split0:
; GFX12-GISEL: ; %bb.0:
@@ -4814,55 +3250,18 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_neg_high_split0(p
; GFX12-GISEL-NEXT: global_store_b8 v[0:1], v0, off
; GFX12-GISEL-NEXT: s_endpgm
;
-; GFX11-SDAG-TRUE16-LABEL: global_inst_salu_offset_64bit_11bit_neg_high_split0:
-; GFX11-SDAG-TRUE16: ; %bb.0:
-; GFX11-SDAG-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-SDAG-TRUE16-NEXT: v_mov_b32_e32 v0, 0
-; GFX11-SDAG-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: s_add_u32 s0, s0, 0x7ff
-; GFX11-SDAG-TRUE16-NEXT: s_addc_u32 s1, s1, 0x80000000
-; GFX11-SDAG-TRUE16-NEXT: global_load_d16_u8 v0, v0, s[0:1] glc dlc
-; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: global_store_b8 v[0:1], v0, off
-; GFX11-SDAG-TRUE16-NEXT: s_endpgm
-;
-; GFX11-SDAG-FAKE16-LABEL: global_inst_salu_offset_64bit_11bit_neg_high_split0:
-; GFX11-SDAG-FAKE16: ; %bb.0:
-; GFX11-SDAG-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-SDAG-FAKE16-NEXT: v_mov_b32_e32 v0, 0
-; GFX11-SDAG-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: s_add_u32 s0, s0, 0x7ff
-; GFX11-SDAG-FAKE16-NEXT: s_addc_u32 s1, s1, 0x80000000
-; GFX11-SDAG-FAKE16-NEXT: global_load_u8 v0, v0, s[0:1] glc dlc
-; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: global_store_b8 v[0:1], v0, off
-; GFX11-SDAG-FAKE16-NEXT: s_endpgm
-;
-; GFX12-SDAG-TRUE16-LABEL: global_inst_salu_offset_64bit_11bit_neg_high_split0:
-; GFX12-SDAG-TRUE16: ; %bb.0:
-; GFX12-SDAG-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX12-SDAG-TRUE16-NEXT: s_movk_i32 s2, 0x7ff
-; GFX12-SDAG-TRUE16-NEXT: v_mov_b32_e32 v0, 0
-; GFX12-SDAG-TRUE16-NEXT: s_brev_b32 s3, 1
-; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[2:3]
-; GFX12-SDAG-TRUE16-NEXT: global_load_d16_u8 v0, v0, s[0:1] scope:SCOPE_SYS
-; GFX12-SDAG-TRUE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: global_store_b8 v[0:1], v0, off
-; GFX12-SDAG-TRUE16-NEXT: s_endpgm
-;
-; GFX12-SDAG-FAKE16-LABEL: global_inst_salu_offset_64bit_11bit_neg_high_split0:
-; GFX12-SDAG-FAKE16: ; %bb.0:
-; GFX12-SDAG-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX12-SDAG-FAKE16-NEXT: s_movk_i32 s2, 0x7ff
-; GFX12-SDAG-FAKE16-NEXT: v_mov_b32_e32 v0, 0
-; GFX12-SDAG-FAKE16-NEXT: s_brev_b32 s3, 1
-; GFX12-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[2:3]
-; GFX12-SDAG-FAKE16-NEXT: global_load_u8 v0, v0, s[0:1] scope:SCOPE_SYS
-; GFX12-SDAG-FAKE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: global_store_b8 v[0:1], v0, off
-; GFX12-SDAG-FAKE16-NEXT: s_endpgm
+; GFX12-SDAG-LABEL: global_inst_salu_offset_64bit_11bit_neg_high_split0:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX12-SDAG-NEXT: s_movk_i32 s2, 0x7ff
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-SDAG-NEXT: s_brev_b32 s3, 1
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[2:3]
+; GFX12-SDAG-NEXT: global_load_u8 v0, v0, s[0:1] scope:SCOPE_SYS
+; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
+; GFX12-SDAG-NEXT: global_store_b8 v[0:1], v0, off
+; GFX12-SDAG-NEXT: s_endpgm
%gep = getelementptr i8, ptr addrspace(1) %p, i64 -9223372036854773761
%load = load volatile i8, ptr addrspace(1) %gep, align 1
store i8 %load, ptr addrspace(1) poison
@@ -4895,17 +3294,17 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_neg_high_split1(p
; GFX10-NEXT: global_store_byte v[0:1], v0, off
; GFX10-NEXT: s_endpgm
;
-; GFX11-GISEL-LABEL: global_inst_salu_offset_64bit_11bit_neg_high_split1:
-; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, 0
-; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: s_add_u32 s0, s0, 0x800
-; GFX11-GISEL-NEXT: s_addc_u32 s1, s1, 0x80000000
-; GFX11-GISEL-NEXT: global_load_u8 v0, v0, s[0:1] glc dlc
-; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX11-GISEL-NEXT: global_store_b8 v[0:1], v0, off
-; GFX11-GISEL-NEXT: s_endpgm
+; GFX11-LABEL: global_inst_salu_offset_64bit_11bit_neg_high_split1:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_add_u32 s0, s0, 0x800
+; GFX11-NEXT: s_addc_u32 s1, s1, 0x80000000
+; GFX11-NEXT: global_load_u8 v0, v0, s[0:1] glc dlc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: global_store_b8 v[0:1], v0, off
+; GFX11-NEXT: s_endpgm
;
; GFX12-GISEL-LABEL: global_inst_salu_offset_64bit_11bit_neg_high_split1:
; GFX12-GISEL: ; %bb.0:
@@ -4919,55 +3318,18 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_neg_high_split1(p
; GFX12-GISEL-NEXT: global_store_b8 v[0:1], v0, off
; GFX12-GISEL-NEXT: s_endpgm
;
-; GFX11-SDAG-TRUE16-LABEL: global_inst_salu_offset_64bit_11bit_neg_high_split1:
-; GFX11-SDAG-TRUE16: ; %bb.0:
-; GFX11-SDAG-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-SDAG-TRUE16-NEXT: v_mov_b32_e32 v0, 0
-; GFX11-SDAG-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: s_add_u32 s0, s0, 0x800
-; GFX11-SDAG-TRUE16-NEXT: s_addc_u32 s1, s1, 0x80000000
-; GFX11-SDAG-TRUE16-NEXT: global_load_d16_u8 v0, v0, s[0:1] glc dlc
-; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: global_store_b8 v[0:1], v0, off
-; GFX11-SDAG-TRUE16-NEXT: s_endpgm
-;
-; GFX11-SDAG-FAKE16-LABEL: global_inst_salu_offset_64bit_11bit_neg_high_split1:
-; GFX11-SDAG-FAKE16: ; %bb.0:
-; GFX11-SDAG-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-SDAG-FAKE16-NEXT: v_mov_b32_e32 v0, 0
-; GFX11-SDAG-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: s_add_u32 s0, s0, 0x800
-; GFX11-SDAG-FAKE16-NEXT: s_addc_u32 s1, s1, 0x80000000
-; GFX11-SDAG-FAKE16-NEXT: global_load_u8 v0, v0, s[0:1] glc dlc
-; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: global_store_b8 v[0:1], v0, off
-; GFX11-SDAG-FAKE16-NEXT: s_endpgm
-;
-; GFX12-SDAG-TRUE16-LABEL: global_inst_salu_offset_64bit_11bit_neg_high_split1:
-; GFX12-SDAG-TRUE16: ; %bb.0:
-; GFX12-SDAG-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX12-SDAG-TRUE16-NEXT: s_movk_i32 s2, 0x800
-; GFX12-SDAG-TRUE16-NEXT: v_mov_b32_e32 v0, 0
-; GFX12-SDAG-TRUE16-NEXT: s_brev_b32 s3, 1
-; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[2:3]
-; GFX12-SDAG-TRUE16-NEXT: global_load_d16_u8 v0, v0, s[0:1] scope:SCOPE_SYS
-; GFX12-SDAG-TRUE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: global_store_b8 v[0:1], v0, off
-; GFX12-SDAG-TRUE16-NEXT: s_endpgm
-;
-; GFX12-SDAG-FAKE16-LABEL: global_inst_salu_offset_64bit_11bit_neg_high_split1:
-; GFX12-SDAG-FAKE16: ; %bb.0:
-; GFX12-SDAG-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX12-SDAG-FAKE16-NEXT: s_movk_i32 s2, 0x800
-; GFX12-SDAG-FAKE16-NEXT: v_mov_b32_e32 v0, 0
-; GFX12-SDAG-FAKE16-NEXT: s_brev_b32 s3, 1
-; GFX12-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[2:3]
-; GFX12-SDAG-FAKE16-NEXT: global_load_u8 v0, v0, s[0:1] scope:SCOPE_SYS
-; GFX12-SDAG-FAKE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: global_store_b8 v[0:1], v0, off
-; GFX12-SDAG-FAKE16-NEXT: s_endpgm
+; GFX12-SDAG-LABEL: global_inst_salu_offset_64bit_11bit_neg_high_split1:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX12-SDAG-NEXT: s_movk_i32 s2, 0x800
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-SDAG-NEXT: s_brev_b32 s3, 1
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[2:3]
+; GFX12-SDAG-NEXT: global_load_u8 v0, v0, s[0:1] scope:SCOPE_SYS
+; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
+; GFX12-SDAG-NEXT: global_store_b8 v[0:1], v0, off
+; GFX12-SDAG-NEXT: s_endpgm
%gep = getelementptr i8, ptr addrspace(1) %p, i64 -9223372036854773760
%load = load volatile i8, ptr addrspace(1) %gep, align 1
store i8 %load, ptr addrspace(1) poison
@@ -5000,17 +3362,17 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_neg_high_split0(p
; GFX10-NEXT: global_store_byte v[0:1], v0, off
; GFX10-NEXT: s_endpgm
;
-; GFX11-GISEL-LABEL: global_inst_salu_offset_64bit_12bit_neg_high_split0:
-; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, 0
-; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: s_add_u32 s0, s0, 0xfff
-; GFX11-GISEL-NEXT: s_addc_u32 s1, s1, 0x80000000
-; GFX11-GISEL-NEXT: global_load_u8 v0, v0, s[0:1] glc dlc
-; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX11-GISEL-NEXT: global_store_b8 v[0:1], v0, off
-; GFX11-GISEL-NEXT: s_endpgm
+; GFX11-LABEL: global_inst_salu_offset_64bit_12bit_neg_high_split0:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_add_u32 s0, s0, 0xfff
+; GFX11-NEXT: s_addc_u32 s1, s1, 0x80000000
+; GFX11-NEXT: global_load_u8 v0, v0, s[0:1] glc dlc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: global_store_b8 v[0:1], v0, off
+; GFX11-NEXT: s_endpgm
;
; GFX12-GISEL-LABEL: global_inst_salu_offset_64bit_12bit_neg_high_split0:
; GFX12-GISEL: ; %bb.0:
@@ -5024,55 +3386,18 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_neg_high_split0(p
; GFX12-GISEL-NEXT: global_store_b8 v[0:1], v0, off
; GFX12-GISEL-NEXT: s_endpgm
;
-; GFX11-SDAG-TRUE16-LABEL: global_inst_salu_offset_64bit_12bit_neg_high_split0:
-; GFX11-SDAG-TRUE16: ; %bb.0:
-; GFX11-SDAG-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-SDAG-TRUE16-NEXT: v_mov_b32_e32 v0, 0
-; GFX11-SDAG-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: s_add_u32 s0, s0, 0xfff
-; GFX11-SDAG-TRUE16-NEXT: s_addc_u32 s1, s1, 0x80000000
-; GFX11-SDAG-TRUE16-NEXT: global_load_d16_u8 v0, v0, s[0:1] glc dlc
-; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: global_store_b8 v[0:1], v0, off
-; GFX11-SDAG-TRUE16-NEXT: s_endpgm
-;
-; GFX11-SDAG-FAKE16-LABEL: global_inst_salu_offset_64bit_12bit_neg_high_split0:
-; GFX11-SDAG-FAKE16: ; %bb.0:
-; GFX11-SDAG-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-SDAG-FAKE16-NEXT: v_mov_b32_e32 v0, 0
-; GFX11-SDAG-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: s_add_u32 s0, s0, 0xfff
-; GFX11-SDAG-FAKE16-NEXT: s_addc_u32 s1, s1, 0x80000000
-; GFX11-SDAG-FAKE16-NEXT: global_load_u8 v0, v0, s[0:1] glc dlc
-; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: global_store_b8 v[0:1], v0, off
-; GFX11-SDAG-FAKE16-NEXT: s_endpgm
-;
-; GFX12-SDAG-TRUE16-LABEL: global_inst_salu_offset_64bit_12bit_neg_high_split0:
-; GFX12-SDAG-TRUE16: ; %bb.0:
-; GFX12-SDAG-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX12-SDAG-TRUE16-NEXT: s_movk_i32 s2, 0xfff
-; GFX12-SDAG-TRUE16-NEXT: v_mov_b32_e32 v0, 0
-; GFX12-SDAG-TRUE16-NEXT: s_brev_b32 s3, 1
-; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[2:3]
-; GFX12-SDAG-TRUE16-NEXT: global_load_d16_u8 v0, v0, s[0:1] scope:SCOPE_SYS
-; GFX12-SDAG-TRUE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: global_store_b8 v[0:1], v0, off
-; GFX12-SDAG-TRUE16-NEXT: s_endpgm
-;
-; GFX12-SDAG-FAKE16-LABEL: global_inst_salu_offset_64bit_12bit_neg_high_split0:
-; GFX12-SDAG-FAKE16: ; %bb.0:
-; GFX12-SDAG-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX12-SDAG-FAKE16-NEXT: s_movk_i32 s2, 0xfff
-; GFX12-SDAG-FAKE16-NEXT: v_mov_b32_e32 v0, 0
-; GFX12-SDAG-FAKE16-NEXT: s_brev_b32 s3, 1
-; GFX12-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[2:3]
-; GFX12-SDAG-FAKE16-NEXT: global_load_u8 v0, v0, s[0:1] scope:SCOPE_SYS
-; GFX12-SDAG-FAKE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: global_store_b8 v[0:1], v0, off
-; GFX12-SDAG-FAKE16-NEXT: s_endpgm
+; GFX12-SDAG-LABEL: global_inst_salu_offset_64bit_12bit_neg_high_split0:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX12-SDAG-NEXT: s_movk_i32 s2, 0xfff
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-SDAG-NEXT: s_brev_b32 s3, 1
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[2:3]
+; GFX12-SDAG-NEXT: global_load_u8 v0, v0, s[0:1] scope:SCOPE_SYS
+; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
+; GFX12-SDAG-NEXT: global_store_b8 v[0:1], v0, off
+; GFX12-SDAG-NEXT: s_endpgm
%gep = getelementptr i8, ptr addrspace(1) %p, i64 -9223372036854771713
%load = load volatile i8, ptr addrspace(1) %gep, align 1
store i8 %load, ptr addrspace(1) poison
@@ -5105,17 +3430,17 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_neg_high_split1(p
; GFX10-NEXT: global_store_byte v[0:1], v0, off
; GFX10-NEXT: s_endpgm
;
-; GFX11-GISEL-LABEL: global_inst_salu_offset_64bit_12bit_neg_high_split1:
-; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, 0
-; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: s_add_u32 s0, s0, 0x1000
-; GFX11-GISEL-NEXT: s_addc_u32 s1, s1, 0x80000000
-; GFX11-GISEL-NEXT: global_load_u8 v0, v0, s[0:1] glc dlc
-; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX11-GISEL-NEXT: global_store_b8 v[0:1], v0, off
-; GFX11-GISEL-NEXT: s_endpgm
+; GFX11-LABEL: global_inst_salu_offset_64bit_12bit_neg_high_split1:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_add_u32 s0, s0, 0x1000
+; GFX11-NEXT: s_addc_u32 s1, s1, 0x80000000
+; GFX11-NEXT: global_load_u8 v0, v0, s[0:1] glc dlc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: global_store_b8 v[0:1], v0, off
+; GFX11-NEXT: s_endpgm
;
; GFX12-GISEL-LABEL: global_inst_salu_offset_64bit_12bit_neg_high_split1:
; GFX12-GISEL: ; %bb.0:
@@ -5129,55 +3454,18 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_neg_high_split1(p
; GFX12-GISEL-NEXT: global_store_b8 v[0:1], v0, off
; GFX12-GISEL-NEXT: s_endpgm
;
-; GFX11-SDAG-TRUE16-LABEL: global_inst_salu_offset_64bit_12bit_neg_high_split1:
-; GFX11-SDAG-TRUE16: ; %bb.0:
-; GFX11-SDAG-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-SDAG-TRUE16-NEXT: v_mov_b32_e32 v0, 0
-; GFX11-SDAG-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: s_add_u32 s0, s0, 0x1000
-; GFX11-SDAG-TRUE16-NEXT: s_addc_u32 s1, s1, 0x80000000
-; GFX11-SDAG-TRUE16-NEXT: global_load_d16_u8 v0, v0, s[0:1] glc dlc
-; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: global_store_b8 v[0:1], v0, off
-; GFX11-SDAG-TRUE16-NEXT: s_endpgm
-;
-; GFX11-SDAG-FAKE16-LABEL: global_inst_salu_offset_64bit_12bit_neg_high_split1:
-; GFX11-SDAG-FAKE16: ; %bb.0:
-; GFX11-SDAG-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-SDAG-FAKE16-NEXT: v_mov_b32_e32 v0, 0
-; GFX11-SDAG-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: s_add_u32 s0, s0, 0x1000
-; GFX11-SDAG-FAKE16-NEXT: s_addc_u32 s1, s1, 0x80000000
-; GFX11-SDAG-FAKE16-NEXT: global_load_u8 v0, v0, s[0:1] glc dlc
-; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: global_store_b8 v[0:1], v0, off
-; GFX11-SDAG-FAKE16-NEXT: s_endpgm
-;
-; GFX12-SDAG-TRUE16-LABEL: global_inst_salu_offset_64bit_12bit_neg_high_split1:
-; GFX12-SDAG-TRUE16: ; %bb.0:
-; GFX12-SDAG-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX12-SDAG-TRUE16-NEXT: s_movk_i32 s2, 0x1000
-; GFX12-SDAG-TRUE16-NEXT: v_mov_b32_e32 v0, 0
-; GFX12-SDAG-TRUE16-NEXT: s_brev_b32 s3, 1
-; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[2:3]
-; GFX12-SDAG-TRUE16-NEXT: global_load_d16_u8 v0, v0, s[0:1] scope:SCOPE_SYS
-; GFX12-SDAG-TRUE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: global_store_b8 v[0:1], v0, off
-; GFX12-SDAG-TRUE16-NEXT: s_endpgm
-;
-; GFX12-SDAG-FAKE16-LABEL: global_inst_salu_offset_64bit_12bit_neg_high_split1:
-; GFX12-SDAG-FAKE16: ; %bb.0:
-; GFX12-SDAG-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX12-SDAG-FAKE16-NEXT: s_movk_i32 s2, 0x1000
-; GFX12-SDAG-FAKE16-NEXT: v_mov_b32_e32 v0, 0
-; GFX12-SDAG-FAKE16-NEXT: s_brev_b32 s3, 1
-; GFX12-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[2:3]
-; GFX12-SDAG-FAKE16-NEXT: global_load_u8 v0, v0, s[0:1] scope:SCOPE_SYS
-; GFX12-SDAG-FAKE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: global_store_b8 v[0:1], v0, off
-; GFX12-SDAG-FAKE16-NEXT: s_endpgm
+; GFX12-SDAG-LABEL: global_inst_salu_offset_64bit_12bit_neg_high_split1:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX12-SDAG-NEXT: s_movk_i32 s2, 0x1000
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-SDAG-NEXT: s_brev_b32 s3, 1
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[2:3]
+; GFX12-SDAG-NEXT: global_load_u8 v0, v0, s[0:1] scope:SCOPE_SYS
+; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
+; GFX12-SDAG-NEXT: global_store_b8 v[0:1], v0, off
+; GFX12-SDAG-NEXT: s_endpgm
%gep = getelementptr i8, ptr addrspace(1) %p, i64 -9223372036854771712
%load = load volatile i8, ptr addrspace(1) %gep, align 1
store i8 %load, ptr addrspace(1) poison
@@ -5210,17 +3498,17 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_neg_high_split0(p
; GFX10-NEXT: global_store_byte v[0:1], v0, off
; GFX10-NEXT: s_endpgm
;
-; GFX11-GISEL-LABEL: global_inst_salu_offset_64bit_13bit_neg_high_split0:
-; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, 0
-; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: s_add_u32 s0, s0, 0x1fff
-; GFX11-GISEL-NEXT: s_addc_u32 s1, s1, 0x80000000
-; GFX11-GISEL-NEXT: global_load_u8 v0, v0, s[0:1] glc dlc
-; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX11-GISEL-NEXT: global_store_b8 v[0:1], v0, off
-; GFX11-GISEL-NEXT: s_endpgm
+; GFX11-LABEL: global_inst_salu_offset_64bit_13bit_neg_high_split0:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_add_u32 s0, s0, 0x1fff
+; GFX11-NEXT: s_addc_u32 s1, s1, 0x80000000
+; GFX11-NEXT: global_load_u8 v0, v0, s[0:1] glc dlc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: global_store_b8 v[0:1], v0, off
+; GFX11-NEXT: s_endpgm
;
; GFX12-GISEL-LABEL: global_inst_salu_offset_64bit_13bit_neg_high_split0:
; GFX12-GISEL: ; %bb.0:
@@ -5234,55 +3522,18 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_neg_high_split0(p
; GFX12-GISEL-NEXT: global_store_b8 v[0:1], v0, off
; GFX12-GISEL-NEXT: s_endpgm
;
-; GFX11-SDAG-TRUE16-LABEL: global_inst_salu_offset_64bit_13bit_neg_high_split0:
-; GFX11-SDAG-TRUE16: ; %bb.0:
-; GFX11-SDAG-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-SDAG-TRUE16-NEXT: v_mov_b32_e32 v0, 0
-; GFX11-SDAG-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: s_add_u32 s0, s0, 0x1fff
-; GFX11-SDAG-TRUE16-NEXT: s_addc_u32 s1, s1, 0x80000000
-; GFX11-SDAG-TRUE16-NEXT: global_load_d16_u8 v0, v0, s[0:1] glc dlc
-; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: global_store_b8 v[0:1], v0, off
-; GFX11-SDAG-TRUE16-NEXT: s_endpgm
-;
-; GFX11-SDAG-FAKE16-LABEL: global_inst_salu_offset_64bit_13bit_neg_high_split0:
-; GFX11-SDAG-FAKE16: ; %bb.0:
-; GFX11-SDAG-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-SDAG-FAKE16-NEXT: v_mov_b32_e32 v0, 0
-; GFX11-SDAG-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: s_add_u32 s0, s0, 0x1fff
-; GFX11-SDAG-FAKE16-NEXT: s_addc_u32 s1, s1, 0x80000000
-; GFX11-SDAG-FAKE16-NEXT: global_load_u8 v0, v0, s[0:1] glc dlc
-; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: global_store_b8 v[0:1], v0, off
-; GFX11-SDAG-FAKE16-NEXT: s_endpgm
-;
-; GFX12-SDAG-TRUE16-LABEL: global_inst_salu_offset_64bit_13bit_neg_high_split0:
-; GFX12-SDAG-TRUE16: ; %bb.0:
-; GFX12-SDAG-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX12-SDAG-TRUE16-NEXT: s_movk_i32 s2, 0x1fff
-; GFX12-SDAG-TRUE16-NEXT: v_mov_b32_e32 v0, 0
-; GFX12-SDAG-TRUE16-NEXT: s_brev_b32 s3, 1
-; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[2:3]
-; GFX12-SDAG-TRUE16-NEXT: global_load_d16_u8 v0, v0, s[0:1] scope:SCOPE_SYS
-; GFX12-SDAG-TRUE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: global_store_b8 v[0:1], v0, off
-; GFX12-SDAG-TRUE16-NEXT: s_endpgm
-;
-; GFX12-SDAG-FAKE16-LABEL: global_inst_salu_offset_64bit_13bit_neg_high_split0:
-; GFX12-SDAG-FAKE16: ; %bb.0:
-; GFX12-SDAG-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX12-SDAG-FAKE16-NEXT: s_movk_i32 s2, 0x1fff
-; GFX12-SDAG-FAKE16-NEXT: v_mov_b32_e32 v0, 0
-; GFX12-SDAG-FAKE16-NEXT: s_brev_b32 s3, 1
-; GFX12-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[2:3]
-; GFX12-SDAG-FAKE16-NEXT: global_load_u8 v0, v0, s[0:1] scope:SCOPE_SYS
-; GFX12-SDAG-FAKE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: global_store_b8 v[0:1], v0, off
-; GFX12-SDAG-FAKE16-NEXT: s_endpgm
+; GFX12-SDAG-LABEL: global_inst_salu_offset_64bit_13bit_neg_high_split0:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX12-SDAG-NEXT: s_movk_i32 s2, 0x1fff
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-SDAG-NEXT: s_brev_b32 s3, 1
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[2:3]
+; GFX12-SDAG-NEXT: global_load_u8 v0, v0, s[0:1] scope:SCOPE_SYS
+; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
+; GFX12-SDAG-NEXT: global_store_b8 v[0:1], v0, off
+; GFX12-SDAG-NEXT: s_endpgm
%gep = getelementptr i8, ptr addrspace(1) %p, i64 -9223372036854767617
%load = load volatile i8, ptr addrspace(1) %gep, align 1
store i8 %load, ptr addrspace(1) poison
@@ -5315,17 +3566,17 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_neg_high_split1(p
; GFX10-NEXT: global_store_byte v[0:1], v0, off
; GFX10-NEXT: s_endpgm
;
-; GFX11-GISEL-LABEL: global_inst_salu_offset_64bit_13bit_neg_high_split1:
-; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, 0
-; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: s_add_u32 s0, s0, 0x2000
-; GFX11-GISEL-NEXT: s_addc_u32 s1, s1, 0x80000000
-; GFX11-GISEL-NEXT: global_load_u8 v0, v0, s[0:1] glc dlc
-; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX11-GISEL-NEXT: global_store_b8 v[0:1], v0, off
-; GFX11-GISEL-NEXT: s_endpgm
+; GFX11-LABEL: global_inst_salu_offset_64bit_13bit_neg_high_split1:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_add_u32 s0, s0, 0x2000
+; GFX11-NEXT: s_addc_u32 s1, s1, 0x80000000
+; GFX11-NEXT: global_load_u8 v0, v0, s[0:1] glc dlc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: global_store_b8 v[0:1], v0, off
+; GFX11-NEXT: s_endpgm
;
; GFX12-GISEL-LABEL: global_inst_salu_offset_64bit_13bit_neg_high_split1:
; GFX12-GISEL: ; %bb.0:
@@ -5339,66 +3590,29 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_neg_high_split1(p
; GFX12-GISEL-NEXT: global_store_b8 v[0:1], v0, off
; GFX12-GISEL-NEXT: s_endpgm
;
-; GFX11-SDAG-TRUE16-LABEL: global_inst_salu_offset_64bit_13bit_neg_high_split1:
-; GFX11-SDAG-TRUE16: ; %bb.0:
-; GFX11-SDAG-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-SDAG-TRUE16-NEXT: v_mov_b32_e32 v0, 0
-; GFX11-SDAG-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: s_add_u32 s0, s0, 0x2000
-; GFX11-SDAG-TRUE16-NEXT: s_addc_u32 s1, s1, 0x80000000
-; GFX11-SDAG-TRUE16-NEXT: global_load_d16_u8 v0, v0, s[0:1] glc dlc
-; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: global_store_b8 v[0:1], v0, off
-; GFX11-SDAG-TRUE16-NEXT: s_endpgm
-;
-; GFX11-SDAG-FAKE16-LABEL: global_inst_salu_offset_64bit_13bit_neg_high_split1:
-; GFX11-SDAG-FAKE16: ; %bb.0:
-; GFX11-SDAG-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-SDAG-FAKE16-NEXT: v_mov_b32_e32 v0, 0
-; GFX11-SDAG-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: s_add_u32 s0, s0, 0x2000
-; GFX11-SDAG-FAKE16-NEXT: s_addc_u32 s1, s1, 0x80000000
-; GFX11-SDAG-FAKE16-NEXT: global_load_u8 v0, v0, s[0:1] glc dlc
-; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: global_store_b8 v[0:1], v0, off
-; GFX11-SDAG-FAKE16-NEXT: s_endpgm
-;
-; GFX12-SDAG-TRUE16-LABEL: global_inst_salu_offset_64bit_13bit_neg_high_split1:
-; GFX12-SDAG-TRUE16: ; %bb.0:
-; GFX12-SDAG-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX12-SDAG-TRUE16-NEXT: s_movk_i32 s2, 0x2000
-; GFX12-SDAG-TRUE16-NEXT: v_mov_b32_e32 v0, 0
-; GFX12-SDAG-TRUE16-NEXT: s_brev_b32 s3, 1
-; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[2:3]
-; GFX12-SDAG-TRUE16-NEXT: global_load_d16_u8 v0, v0, s[0:1] scope:SCOPE_SYS
-; GFX12-SDAG-TRUE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: global_store_b8 v[0:1], v0, off
-; GFX12-SDAG-TRUE16-NEXT: s_endpgm
-;
-; GFX12-SDAG-FAKE16-LABEL: global_inst_salu_offset_64bit_13bit_neg_high_split1:
-; GFX12-SDAG-FAKE16: ; %bb.0:
-; GFX12-SDAG-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX12-SDAG-FAKE16-NEXT: s_movk_i32 s2, 0x2000
-; GFX12-SDAG-FAKE16-NEXT: v_mov_b32_e32 v0, 0
-; GFX12-SDAG-FAKE16-NEXT: s_brev_b32 s3, 1
-; GFX12-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[2:3]
-; GFX12-SDAG-FAKE16-NEXT: global_load_u8 v0, v0, s[0:1] scope:SCOPE_SYS
-; GFX12-SDAG-FAKE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: global_store_b8 v[0:1], v0, off
-; GFX12-SDAG-FAKE16-NEXT: s_endpgm
+; GFX12-SDAG-LABEL: global_inst_salu_offset_64bit_13bit_neg_high_split1:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX12-SDAG-NEXT: s_movk_i32 s2, 0x2000
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-SDAG-NEXT: s_brev_b32 s3, 1
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[2:3]
+; GFX12-SDAG-NEXT: global_load_u8 v0, v0, s[0:1] scope:SCOPE_SYS
+; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
+; GFX12-SDAG-NEXT: global_store_b8 v[0:1], v0, off
+; GFX12-SDAG-NEXT: s_endpgm
%gep = getelementptr i8, ptr addrspace(1) %p, i64 -9223372036854767616
%load = load volatile i8, ptr addrspace(1) %gep, align 1
store i8 %load, ptr addrspace(1) poison
ret void
}
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; GFX11: {{.*}}
; GFX11-GISEL-FAKE16: {{.*}}
; GFX11-GISEL-TRUE16: {{.*}}
-; GFX11-SDAG: {{.*}}
-; GFX12: {{.*}}
+; GFX11-SDAG-FAKE16: {{.*}}
+; GFX11-SDAG-TRUE16: {{.*}}
; GFX12-GISEL-FAKE16: {{.*}}
; GFX12-GISEL-TRUE16: {{.*}}
-; GFX12-SDAG: {{.*}}
+; GFX12-SDAG-FAKE16: {{.*}}
+; GFX12-SDAG-TRUE16: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll
index b1e05158b6212..1419529644cfd 100644
--- a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll
@@ -2641,7 +2641,7 @@ define amdgpu_kernel void @negativeoffsetnullptr(ptr %buffer) {
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, s1, s0
; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
-; GFX11-TRUE16-NEXT: flat_load_d16_u8 v0, v[0:1]
+; GFX11-TRUE16-NEXT: flat_load_u8 v0, v[0:1]
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0.l
; GFX11-TRUE16-NEXT: .LBB8_1: ; %branch
diff --git a/llvm/test/CodeGen/AMDGPU/rotl.ll b/llvm/test/CodeGen/AMDGPU/rotl.ll
index 25020673bce22..638cd0d0c5181 100644
--- a/llvm/test/CodeGen/AMDGPU/rotl.ll
+++ b/llvm/test/CodeGen/AMDGPU/rotl.ll
@@ -374,15 +374,15 @@ define void @test_rotl_i16(ptr addrspace(1) nocapture readonly %sourceA, ptr add
; GFX11-TRUE16-LABEL: test_rotl_i16:
; GFX11-TRUE16: ; %bb.0: ; %entry
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: global_load_d16_b16 v2, v[2:3], off offset:48
-; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v[0:1], off offset:32
+; GFX11-TRUE16-NEXT: global_load_u16 v2, v[2:3], off offset:48
+; GFX11-TRUE16-NEXT: global_load_u16 v1, v[0:1], off offset:32
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
-; GFX11-TRUE16-NEXT: v_sub_nc_u16 v0.h, 0, v2.l
+; GFX11-TRUE16-NEXT: v_sub_nc_u16 v0.l, 0, v2.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, v2.l, v0.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, v2.l, v1.l
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshrrev_b16 v0.l, v0.h, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v1.l, v0.l
+; GFX11-TRUE16-NEXT: v_lshrrev_b16 v0.l, v0.l, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.h, v0.l
; GFX11-TRUE16-NEXT: global_store_b16 v[4:5], v0, off offset:8
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
diff --git a/llvm/test/CodeGen/AMDGPU/rotr.ll b/llvm/test/CodeGen/AMDGPU/rotr.ll
index 74ac181c120b5..042b9f7c85d45 100644
--- a/llvm/test/CodeGen/AMDGPU/rotr.ll
+++ b/llvm/test/CodeGen/AMDGPU/rotr.ll
@@ -331,15 +331,15 @@ define void @test_rotr_i16(ptr addrspace(1) nocapture readonly %sourceA, ptr add
; GFX11-TRUE16-LABEL: test_rotr_i16:
; GFX11-TRUE16: ; %bb.0: ; %entry
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: global_load_d16_b16 v2, v[2:3], off offset:48
-; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v[0:1], off offset:32
+; GFX11-TRUE16-NEXT: global_load_u16 v2, v[2:3], off offset:48
+; GFX11-TRUE16-NEXT: global_load_u16 v1, v[0:1], off offset:32
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
-; GFX11-TRUE16-NEXT: v_sub_nc_u16 v0.h, 0, v2.l
+; GFX11-TRUE16-NEXT: v_sub_nc_u16 v0.l, 0, v2.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_lshrrev_b16 v1.l, v2.l, v0.l
+; GFX11-TRUE16-NEXT: v_lshrrev_b16 v0.h, v2.l, v1.l
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.l, v0.h, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v1.l, v0.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.l, v0.l, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.h, v0.l
; GFX11-TRUE16-NEXT: global_store_b16 v[4:5], v0, off offset:8
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
diff --git a/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll b/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll
index 91c88ec5e718c..0aa8cee2df70e 100644
--- a/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll
+++ b/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll
@@ -9,8 +9,8 @@
; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck --check-prefixes=GFX10,GFX10-GISEL %s
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck --check-prefixes=GFX11,GFX11-SDAG,GFX11-SDAG-TRUE16 %s
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck --check-prefixes=GFX11,GFX11-SDAG,GFX11-SDAG-FAKE16 %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck --check-prefixes=GFX11,GFX11-GISEL,GFX11-GISEL-TRUE16 %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck --check-prefixes=GFX11,GFX11-GISEL,GFX11-GISEL-FAKE16 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16,+real-true-d16 < %s | FileCheck --check-prefixes=GFX11,GFX11-GISEL,GFX11-GISEL-TRUE16 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16,+real-true-d16 < %s | FileCheck --check-prefixes=GFX11,GFX11-GISEL,GFX11-GISEL-FAKE16 %s
; Test that add/sub with a constant is swapped to sub/add with negated
; constant to minimize code size.
@@ -1340,7 +1340,7 @@ define amdgpu_kernel void @v_test_i16_x_sub_64(ptr addrspace(1) %out, ptr addrsp
; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 1, v0
; GFX11-SDAG-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[2:3]
+; GFX11-SDAG-TRUE16-NEXT: global_load_u16 v0, v1, s[2:3]
; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-SDAG-TRUE16-NEXT: v_sub_nc_u16 v0.l, v0.l, 64
; GFX11-SDAG-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
@@ -1522,16 +1522,16 @@ define amdgpu_kernel void @v_test_i16_x_sub_64_zext_to_i32(ptr addrspace(1) %out
; GFX11-SDAG-TRUE16-LABEL: v_test_i16_x_sub_64_zext_to_i32:
; GFX11-SDAG-TRUE16: ; %bb.0:
; GFX11-SDAG-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-SDAG-TRUE16-NEXT: v_and_b32_e32 v1, 0x3ff, v0
+; GFX11-SDAG-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 1, v1
-; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 2, v1
+; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 1, v0
+; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-SDAG-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: global_load_d16_b16 v0, v0, s[2:3]
-; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0
+; GFX11-SDAG-TRUE16-NEXT: global_load_u16 v1, v1, s[2:3]
; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: v_sub_nc_u16 v0.l, v0.l, 64
-; GFX11-SDAG-TRUE16-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0
+; GFX11-SDAG-TRUE16-NEXT: v_sub_nc_u16 v1.l, v1.l, 64
+; GFX11-SDAG-TRUE16-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-SDAG-TRUE16-NEXT: s_endpgm
;
; GFX11-SDAG-FAKE16-LABEL: v_test_i16_x_sub_64_zext_to_i32:
@@ -1759,12 +1759,12 @@ define amdgpu_kernel void @v_test_i16_x_sub_64_multi_use(ptr addrspace(1) %out,
; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 1, v0
; GFX11-SDAG-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[2:3] glc dlc
+; GFX11-SDAG-TRUE16-NEXT: global_load_u16 v0, v1, s[2:3] glc dlc
; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: global_load_d16_hi_b16 v0, v1, s[2:3] glc dlc
+; GFX11-SDAG-TRUE16-NEXT: global_load_u16 v2, v1, s[2:3] glc dlc
; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-SDAG-TRUE16-NEXT: v_sub_nc_u16 v0.l, v0.l, 64
-; GFX11-SDAG-TRUE16-NEXT: v_sub_nc_u16 v0.h, v0.h, 64
+; GFX11-SDAG-TRUE16-NEXT: v_sub_nc_u16 v0.h, v2.l, 64
; GFX11-SDAG-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1] dlc
; GFX11-SDAG-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-SDAG-TRUE16-NEXT: global_store_d16_hi_b16 v1, v0, s[0:1] dlc
diff --git a/llvm/test/CodeGen/AMDGPU/smed3.ll b/llvm/test/CodeGen/AMDGPU/smed3.ll
index a9fb77904c641..496d8082d3a06 100644
--- a/llvm/test/CodeGen/AMDGPU/smed3.ll
+++ b/llvm/test/CodeGen/AMDGPU/smed3.ll
@@ -2,7 +2,7 @@
; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global,-real-true16 < %s | FileCheck -check-prefixes=GFX11-FAKE16 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global,+real-true16 < %s | FileCheck -check-prefixes=GFX11-TRUE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global,+real-true16,+real-true-d16 < %s | FileCheck -check-prefixes=GFX11-TRUE16 %s
declare i32 @llvm.amdgcn.workitem.id.x() #0
diff --git a/llvm/test/CodeGen/AMDGPU/spillv16.ll b/llvm/test/CodeGen/AMDGPU/spillv16.ll
index 3d21860e2af40..9708a359360b3 100644
--- a/llvm/test/CodeGen/AMDGPU/spillv16.ll
+++ b/llvm/test/CodeGen/AMDGPU/spillv16.ll
@@ -6,7 +6,7 @@ define void @spill_i16_alu() {
; GCN-TRUE16-LABEL: spill_i16_alu:
; GCN-TRUE16: ; %bb.0: ; %entry
; GCN-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-TRUE16-NEXT: scratch_load_d16_b16 v0, off, s32 glc dlc
+; GCN-TRUE16-NEXT: scratch_load_u16 v0, off, s32 glc dlc
; GCN-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GCN-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x7b, v0.l
; GCN-TRUE16-NEXT: scratch_store_b16 off, v0, s32 offset:2 ; 2-byte Folded Spill
@@ -52,13 +52,13 @@ define void @spill_i16_alu_two_vals() {
; GCN-TRUE16-LABEL: spill_i16_alu_two_vals:
; GCN-TRUE16: ; %bb.0: ; %entry
; GCN-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-TRUE16-NEXT: scratch_load_d16_b16 v0, off, s32 glc dlc
+; GCN-TRUE16-NEXT: scratch_load_u16 v0, off, s32 glc dlc
; GCN-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GCN-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x7b, v0.l
; GCN-TRUE16-NEXT: scratch_store_b16 off, v0, s32 offset:6 ; 2-byte Folded Spill
; GCN-TRUE16-NEXT: ;;#ASMSTART
; GCN-TRUE16-NEXT: ;;#ASMEND
-; GCN-TRUE16-NEXT: scratch_load_d16_b16 v0, off, s32 offset:4 glc dlc
+; GCN-TRUE16-NEXT: scratch_load_u16 v0, off, s32 offset:4 glc dlc
; GCN-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GCN-TRUE16-NEXT: scratch_load_d16_hi_b16 v0, off, s32 offset:6 ; 2-byte Folded Reload
; GCN-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x7b, v0.l
@@ -113,33 +113,19 @@ entry:
; Tests after this do not actually test 16 bit spills because there is no use of VGPR_16. They could demonstrate 16 bit spills if we update the instructions to use VGPR_16 instead of VGPR_32
define void @spill_i16() {
-; GCN-TRUE16-LABEL: spill_i16:
-; GCN-TRUE16: ; %bb.0: ; %entry
-; GCN-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-TRUE16-NEXT: scratch_load_d16_b16 v0, off, s32 glc dlc
-; GCN-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GCN-TRUE16-NEXT: scratch_store_b16 off, v0, s32 offset:2 ; 2-byte Folded Spill
-; GCN-TRUE16-NEXT: ;;#ASMSTART
-; GCN-TRUE16-NEXT: ;;#ASMEND
-; GCN-TRUE16-NEXT: scratch_load_d16_b16 v0, off, s32 offset:2 ; 2-byte Folded Reload
-; GCN-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GCN-TRUE16-NEXT: scratch_store_b16 off, v0, s32 dlc
-; GCN-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GCN-TRUE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GCN-FAKE16-LABEL: spill_i16:
-; GCN-FAKE16: ; %bb.0: ; %entry
-; GCN-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-FAKE16-NEXT: scratch_load_u16 v0, off, s32 glc dlc
-; GCN-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GCN-FAKE16-NEXT: scratch_store_b32 off, v0, s32 offset:4 ; 4-byte Folded Spill
-; GCN-FAKE16-NEXT: ;;#ASMSTART
-; GCN-FAKE16-NEXT: ;;#ASMEND
-; GCN-FAKE16-NEXT: scratch_load_b32 v0, off, s32 offset:4 ; 4-byte Folded Reload
-; GCN-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GCN-FAKE16-NEXT: scratch_store_b16 off, v0, s32 dlc
-; GCN-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GCN-FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GCN-LABEL: spill_i16:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: scratch_load_u16 v0, off, s32 glc dlc
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: scratch_store_b32 off, v0, s32 offset:4 ; 4-byte Folded Spill
+; GCN-NEXT: ;;#ASMSTART
+; GCN-NEXT: ;;#ASMEND
+; GCN-NEXT: scratch_load_b32 v0, off, s32 offset:4 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: scratch_store_b16 off, v0, s32 dlc
+; GCN-NEXT: s_waitcnt_vscnt null, 0x0
+; GCN-NEXT: s_setpc_b64 s[30:31]
entry:
%alloca = alloca i16, i32 1, align 4, addrspace(5)
@@ -156,33 +142,19 @@ entry:
}
define void @spill_half() {
-; GCN-TRUE16-LABEL: spill_half:
-; GCN-TRUE16: ; %bb.0: ; %entry
-; GCN-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-TRUE16-NEXT: scratch_load_d16_b16 v0, off, s32 glc dlc
-; GCN-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GCN-TRUE16-NEXT: scratch_store_b16 off, v0, s32 offset:2 ; 2-byte Folded Spill
-; GCN-TRUE16-NEXT: ;;#ASMSTART
-; GCN-TRUE16-NEXT: ;;#ASMEND
-; GCN-TRUE16-NEXT: scratch_load_d16_b16 v0, off, s32 offset:2 ; 2-byte Folded Reload
-; GCN-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GCN-TRUE16-NEXT: scratch_store_b16 off, v0, s32 dlc
-; GCN-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GCN-TRUE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GCN-FAKE16-LABEL: spill_half:
-; GCN-FAKE16: ; %bb.0: ; %entry
-; GCN-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-FAKE16-NEXT: scratch_load_u16 v0, off, s32 glc dlc
-; GCN-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GCN-FAKE16-NEXT: scratch_store_b32 off, v0, s32 offset:4 ; 4-byte Folded Spill
-; GCN-FAKE16-NEXT: ;;#ASMSTART
-; GCN-FAKE16-NEXT: ;;#ASMEND
-; GCN-FAKE16-NEXT: scratch_load_b32 v0, off, s32 offset:4 ; 4-byte Folded Reload
-; GCN-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GCN-FAKE16-NEXT: scratch_store_b16 off, v0, s32 dlc
-; GCN-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GCN-FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GCN-LABEL: spill_half:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: scratch_load_u16 v0, off, s32 glc dlc
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: scratch_store_b32 off, v0, s32 offset:4 ; 4-byte Folded Spill
+; GCN-NEXT: ;;#ASMSTART
+; GCN-NEXT: ;;#ASMEND
+; GCN-NEXT: scratch_load_b32 v0, off, s32 offset:4 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: scratch_store_b16 off, v0, s32 dlc
+; GCN-NEXT: s_waitcnt_vscnt null, 0x0
+; GCN-NEXT: s_setpc_b64 s[30:31]
entry:
%alloca = alloca half, i32 1, align 4, addrspace(5)
@@ -199,33 +171,19 @@ entry:
}
define void @spill_i16_from_v2i16() {
-; GCN-TRUE16-LABEL: spill_i16_from_v2i16:
-; GCN-TRUE16: ; %bb.0: ; %entry
-; GCN-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-TRUE16-NEXT: scratch_load_d16_b16 v0, off, s32 offset:2 glc dlc
-; GCN-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GCN-TRUE16-NEXT: scratch_store_b16 off, v0, s32 offset:8 ; 2-byte Folded Spill
-; GCN-TRUE16-NEXT: ;;#ASMSTART
-; GCN-TRUE16-NEXT: ;;#ASMEND
-; GCN-TRUE16-NEXT: scratch_load_d16_b16 v0, off, s32 offset:8 ; 2-byte Folded Reload
-; GCN-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GCN-TRUE16-NEXT: scratch_store_b16 off, v0, s32 offset:2 dlc
-; GCN-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GCN-TRUE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GCN-FAKE16-LABEL: spill_i16_from_v2i16:
-; GCN-FAKE16: ; %bb.0: ; %entry
-; GCN-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-FAKE16-NEXT: scratch_load_u16 v0, off, s32 offset:2 glc dlc
-; GCN-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GCN-FAKE16-NEXT: scratch_store_b32 off, v0, s32 offset:8 ; 4-byte Folded Spill
-; GCN-FAKE16-NEXT: ;;#ASMSTART
-; GCN-FAKE16-NEXT: ;;#ASMEND
-; GCN-FAKE16-NEXT: scratch_load_b32 v0, off, s32 offset:8 ; 4-byte Folded Reload
-; GCN-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GCN-FAKE16-NEXT: scratch_store_b16 off, v0, s32 offset:2 dlc
-; GCN-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GCN-FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GCN-LABEL: spill_i16_from_v2i16:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: scratch_load_u16 v0, off, s32 offset:2 glc dlc
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: scratch_store_b32 off, v0, s32 offset:8 ; 4-byte Folded Spill
+; GCN-NEXT: ;;#ASMSTART
+; GCN-NEXT: ;;#ASMEND
+; GCN-NEXT: scratch_load_b32 v0, off, s32 offset:8 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: scratch_store_b16 off, v0, s32 offset:2 dlc
+; GCN-NEXT: s_waitcnt_vscnt null, 0x0
+; GCN-NEXT: s_setpc_b64 s[30:31]
entry:
%alloca = alloca <2 x i16>, i32 2, align 1, addrspace(5)
@@ -245,19 +203,19 @@ define void @spill_2xi16_from_v2i16() {
; GCN-TRUE16-LABEL: spill_2xi16_from_v2i16:
; GCN-TRUE16: ; %bb.0: ; %entry
; GCN-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-TRUE16-NEXT: scratch_load_d16_b16 v0, off, s32 offset:2 glc dlc
+; GCN-TRUE16-NEXT: scratch_load_u16 v0, off, s32 offset:2 glc dlc
; GCN-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GCN-TRUE16-NEXT: scratch_store_b16 off, v0, s32 offset:8 ; 2-byte Folded Spill
-; GCN-TRUE16-NEXT: scratch_load_d16_b16 v0, off, s32 glc dlc
+; GCN-TRUE16-NEXT: scratch_store_b32 off, v0, s32 offset:12 ; 4-byte Folded Spill
+; GCN-TRUE16-NEXT: scratch_load_u16 v0, off, s32 glc dlc
; GCN-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GCN-TRUE16-NEXT: scratch_store_b16 off, v0, s32 offset:10 ; 2-byte Folded Spill
+; GCN-TRUE16-NEXT: scratch_store_b32 off, v0, s32 offset:8 ; 4-byte Folded Spill
; GCN-TRUE16-NEXT: ;;#ASMSTART
; GCN-TRUE16-NEXT: ;;#ASMEND
-; GCN-TRUE16-NEXT: scratch_load_d16_b16 v0, off, s32 offset:8 ; 2-byte Folded Reload
+; GCN-TRUE16-NEXT: scratch_load_b32 v0, off, s32 offset:12 ; 4-byte Folded Reload
; GCN-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GCN-TRUE16-NEXT: scratch_store_b16 off, v0, s32 offset:2 dlc
; GCN-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GCN-TRUE16-NEXT: scratch_load_d16_b16 v0, off, s32 offset:10 ; 2-byte Folded Reload
+; GCN-TRUE16-NEXT: scratch_load_b32 v0, off, s32 offset:8 ; 4-byte Folded Reload
; GCN-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GCN-TRUE16-NEXT: scratch_store_b16 off, v0, s32 dlc
; GCN-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
@@ -306,19 +264,17 @@ define void @spill_2xi16_from_v2i16_one_free_reg() {
; GCN-TRUE16-LABEL: spill_2xi16_from_v2i16_one_free_reg:
; GCN-TRUE16: ; %bb.0: ; %entry
; GCN-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-TRUE16-NEXT: scratch_load_d16_b16 v0, off, s32 offset:2 glc dlc
+; GCN-TRUE16-NEXT: scratch_load_u16 v7, off, s32 offset:2 glc dlc
; GCN-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GCN-TRUE16-NEXT: scratch_store_b16 off, v0, s32 offset:8 ; 2-byte Folded Spill
-; GCN-TRUE16-NEXT: scratch_load_d16_b16 v0, off, s32 glc dlc
+; GCN-TRUE16-NEXT: scratch_load_u16 v0, off, s32 glc dlc
; GCN-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GCN-TRUE16-NEXT: scratch_store_b16 off, v0, s32 offset:10 ; 2-byte Folded Spill
+; GCN-TRUE16-NEXT: scratch_store_b32 off, v0, s32 offset:8 ; 4-byte Folded Spill
; GCN-TRUE16-NEXT: ;;#ASMSTART
; GCN-TRUE16-NEXT: ;;#ASMEND
-; GCN-TRUE16-NEXT: scratch_load_d16_b16 v0, off, s32 offset:8 ; 2-byte Folded Reload
-; GCN-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GCN-TRUE16-NEXT: v_mov_b16_e32 v0.l, v7.l
; GCN-TRUE16-NEXT: scratch_store_b16 off, v0, s32 offset:2 dlc
; GCN-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GCN-TRUE16-NEXT: scratch_load_d16_b16 v0, off, s32 offset:10 ; 2-byte Folded Reload
+; GCN-TRUE16-NEXT: scratch_load_b32 v0, off, s32 offset:8 ; 4-byte Folded Reload
; GCN-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GCN-TRUE16-NEXT: scratch_store_b16 off, v0, s32 dlc
; GCN-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
diff --git a/llvm/test/CodeGen/AMDGPU/strict_fpext.ll b/llvm/test/CodeGen/AMDGPU/strict_fpext.ll
index 40aac82888de2..aed966e5728dd 100644
--- a/llvm/test/CodeGen/AMDGPU/strict_fpext.ll
+++ b/llvm/test/CodeGen/AMDGPU/strict_fpext.ll
@@ -506,7 +506,7 @@ define float @v_constrained_fpext_f16_to_f32_noabi(ptr addrspace(1) %ptr) #0 {
; GFX11-TRUE16-LABEL: v_constrained_fpext_f16_to_f32_noabi:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v[0:1], off
+; GFX11-TRUE16-NEXT: global_load_u16 v0, v[0:1], off
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v0.l
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/sub.ll b/llvm/test/CodeGen/AMDGPU/sub.ll
index 5c113d80a9c80..13974da7a6043 100644
--- a/llvm/test/CodeGen/AMDGPU/sub.ll
+++ b/llvm/test/CodeGen/AMDGPU/sub.ll
@@ -437,16 +437,16 @@ define amdgpu_kernel void @test_sub_i16(ptr addrspace(1) %out, ptr addrspace(1)
; GFX12-TRUE16: ; %bb.0:
; GFX12-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 1, v0
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, 0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[2:3] scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT: global_load_u16 v1, v0, s[2:3] scope:SCOPE_SYS
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-TRUE16-NEXT: global_load_d16_hi_b16 v0, v1, s[2:3] offset:2 scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT: global_load_u16 v0, v0, s[2:3] offset:2 scope:SCOPE_SYS
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-TRUE16-NEXT: v_mov_b32_e32 v1, 0
-; GFX12-TRUE16-NEXT: v_sub_nc_u16 v0.l, v0.l, v0.h
-; GFX12-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
+; GFX12-TRUE16-NEXT: v_sub_nc_u16 v0.l, v1.l, v0.l
+; GFX12-TRUE16-NEXT: global_store_b16 v2, v0, s[0:1]
; GFX12-TRUE16-NEXT: s_endpgm
;
; GFX12-FAKE16-LABEL: test_sub_i16:
diff --git a/llvm/test/CodeGen/AMDGPU/uaddo.ll b/llvm/test/CodeGen/AMDGPU/uaddo.ll
index e1574dcd45462..93ffa6613b363 100644
--- a/llvm/test/CodeGen/AMDGPU/uaddo.ll
+++ b/llvm/test/CodeGen/AMDGPU/uaddo.ll
@@ -739,7 +739,7 @@ define amdgpu_kernel void @v_uaddo_i16(ptr addrspace(1) %out, ptr addrspace(1) %
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: global_load_d16_b16 v1, v0, s[4:5]
+; GFX11-NEXT: global_load_u16 v1, v0, s[4:5]
; GFX11-NEXT: global_load_u16 v2, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_add_nc_u32_e32 v2, v1, v2
diff --git a/llvm/test/CodeGen/AMDGPU/umed3.ll b/llvm/test/CodeGen/AMDGPU/umed3.ll
index 9d8a45ada87aa..5a5abdd6bdbfd 100644
--- a/llvm/test/CodeGen/AMDGPU/umed3.ll
+++ b/llvm/test/CodeGen/AMDGPU/umed3.ll
@@ -2,7 +2,7 @@
; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global,-real-true16 < %s | FileCheck -check-prefix=GFX11-FAKE16 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global,+real-true16 < %s | FileCheck -check-prefix=GFX11-TRUE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global,+real-true16,+real-true-d16 < %s | FileCheck -check-prefix=GFX11-TRUE16 %s
declare i32 @llvm.amdgcn.workitem.id.x() #0
diff --git a/llvm/test/CodeGen/AMDGPU/usubo.ll b/llvm/test/CodeGen/AMDGPU/usubo.ll
index 0289dab4588a2..ba441a0029b51 100644
--- a/llvm/test/CodeGen/AMDGPU/usubo.ll
+++ b/llvm/test/CodeGen/AMDGPU/usubo.ll
@@ -738,7 +738,7 @@ define amdgpu_kernel void @v_usubo_i16(ptr addrspace(1) %out, ptr addrspace(1) %
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: global_load_d16_b16 v1, v0, s[4:5]
+; GFX11-NEXT: global_load_u16 v1, v0, s[4:5]
; GFX11-NEXT: global_load_u16 v2, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_sub_nc_u32_e32 v2, v1, v2
diff --git a/llvm/test/CodeGen/AMDGPU/v_cndmask.ll b/llvm/test/CodeGen/AMDGPU/v_cndmask.ll
index b314cf2e1d9cc..c55a18ff1db02 100644
--- a/llvm/test/CodeGen/AMDGPU/v_cndmask.ll
+++ b/llvm/test/CodeGen/AMDGPU/v_cndmask.ll
@@ -2232,11 +2232,10 @@ define amdgpu_kernel void @v_cndmask_abs_neg_f16(ptr addrspace(1) %out, i32 %c,
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x34
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v0, s[0:1]
+; GFX11-TRUE16-NEXT: global_load_u16 v1, v0, s[0:1]
; GFX11-TRUE16-NEXT: s_clause 0x1
; GFX11-TRUE16-NEXT: s_load_b32 s2, s[4:5], 0x2c
; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
@@ -2244,10 +2243,11 @@ define amdgpu_kernel void @v_cndmask_abs_neg_f16(ptr addrspace(1) %out, i32 %c,
; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s2, 0
; GFX11-TRUE16-NEXT: s_cselect_b64 s[2:3], -1, 0
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0x7fff, v0.l
-; GFX11-TRUE16-NEXT: v_xor_b16 v0.l, 0x8000, v0.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v0.h, s[2:3]
+; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0x7fff, v1.l
+; GFX11-TRUE16-NEXT: v_xor_b16 v0.h, 0x8000, v1.l
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.h, v0.l, s[2:3]
; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
; GFX11-TRUE16-NEXT: s_endpgm
;
@@ -2278,20 +2278,20 @@ define amdgpu_kernel void @v_cndmask_abs_neg_f16(ptr addrspace(1) %out, i32 %c,
; GFX12-TRUE16: ; %bb.0:
; GFX12-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x34
; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX12-TRUE16-NEXT: v_mov_b32_e32 v1, 0
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-TRUE16-NEXT: global_load_d16_b16 v0, v0, s[0:1]
+; GFX12-TRUE16-NEXT: global_load_u16 v1, v0, s[0:1]
; GFX12-TRUE16-NEXT: s_load_b96 s[0:2], s[4:5], 0x24
; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
; GFX12-TRUE16-NEXT: s_cmp_lg_u32 s2, 0
; GFX12-TRUE16-NEXT: s_cselect_b64 s[2:3], -1, 0
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-TRUE16-NEXT: v_and_b16 v0.h, 0x7fff, v0.l
-; GFX12-TRUE16-NEXT: v_xor_b16 v0.l, 0x8000, v0.l
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v0.h, s[2:3]
+; GFX12-TRUE16-NEXT: v_and_b16 v0.l, 0x7fff, v1.l
+; GFX12-TRUE16-NEXT: v_xor_b16 v0.h, 0x8000, v1.l
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v1, 0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.h, v0.l, s[2:3]
; GFX12-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
; GFX12-TRUE16-NEXT: s_endpgm
;
diff --git a/llvm/test/CodeGen/AMDGPU/v_pack.ll b/llvm/test/CodeGen/AMDGPU/v_pack.ll
index d8044139aceb3..f093f055c9612 100644
--- a/llvm/test/CodeGen/AMDGPU/v_pack.ll
+++ b/llvm/test/CodeGen/AMDGPU/v_pack.ll
@@ -4,7 +4,7 @@
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global,-real-true16 < %s | FileCheck -enable-var-scope -check-prefixes=GFX11-GCN-FAKE16 %s
; RUN: llc -global-isel -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global,-real-true16 < %s | FileCheck -enable-var-scope -check-prefixes=GFX11-GISEL-FAKE16 %s
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global,+real-true16 < %s | FileCheck -enable-var-scope -check-prefixes=GFX11-GCN-REAL16 %s
-; RUN: llc -global-isel -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global,+real-true16 < %s | FileCheck -enable-var-scope -check-prefixes=GFX11-GISEL-REAL16 %s
+; RUN: llc -global-isel -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global,+real-true16,+real-true-d16 < %s | FileCheck -enable-var-scope -check-prefixes=GFX11-GISEL-REAL16 %s
declare i32 @llvm.amdgcn.workitem.id.x() #1
@@ -88,14 +88,14 @@ define amdgpu_kernel void @v_pack_b32_v2f16(ptr addrspace(1) %in0, ptr addrspace
; GFX11-GCN-REAL16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-GCN-REAL16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-GCN-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-GCN-REAL16-NEXT: v_lshlrev_b32_e32 v1, 1, v0
+; GFX11-GCN-REAL16-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX11-GCN-REAL16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GCN-REAL16-NEXT: global_load_d16_b16 v0, v1, s[0:1] glc dlc
+; GFX11-GCN-REAL16-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc
; GFX11-GCN-REAL16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-GCN-REAL16-NEXT: global_load_d16_hi_b16 v0, v1, s[2:3] glc dlc
+; GFX11-GCN-REAL16-NEXT: global_load_u16 v2, v0, s[2:3] glc dlc
; GFX11-GCN-REAL16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-GCN-REAL16-NEXT: v_add_f16_e32 v0.l, 2.0, v0.l
-; GFX11-GCN-REAL16-NEXT: v_add_f16_e32 v0.h, 2.0, v0.h
+; GFX11-GCN-REAL16-NEXT: v_add_f16_e32 v0.l, 2.0, v1.l
+; GFX11-GCN-REAL16-NEXT: v_add_f16_e32 v0.h, 2.0, v2.l
; GFX11-GCN-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-GCN-REAL16-NEXT: v_pack_b32_f16 v0, v0.l, v0.h
; GFX11-GCN-REAL16-NEXT: ;;#ASMSTART
@@ -217,14 +217,14 @@ define amdgpu_kernel void @v_pack_b32_v2f16_sub(ptr addrspace(1) %in0, ptr addrs
; GFX11-GCN-REAL16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-GCN-REAL16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-GCN-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-GCN-REAL16-NEXT: v_lshlrev_b32_e32 v1, 1, v0
+; GFX11-GCN-REAL16-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX11-GCN-REAL16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GCN-REAL16-NEXT: global_load_d16_b16 v0, v1, s[0:1] glc dlc
+; GFX11-GCN-REAL16-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc
; GFX11-GCN-REAL16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-GCN-REAL16-NEXT: global_load_d16_hi_b16 v0, v1, s[2:3] glc dlc
+; GFX11-GCN-REAL16-NEXT: global_load_u16 v2, v0, s[2:3] glc dlc
; GFX11-GCN-REAL16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-GCN-REAL16-NEXT: v_subrev_f16_e32 v0.l, 2.0, v0.l
-; GFX11-GCN-REAL16-NEXT: v_add_f16_e32 v0.h, 2.0, v0.h
+; GFX11-GCN-REAL16-NEXT: v_subrev_f16_e32 v0.l, 2.0, v1.l
+; GFX11-GCN-REAL16-NEXT: v_add_f16_e32 v0.h, 2.0, v2.l
; GFX11-GCN-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-GCN-REAL16-NEXT: v_pack_b32_f16 v0, v0.l, v0.h
; GFX11-GCN-REAL16-NEXT: ;;#ASMSTART
@@ -460,14 +460,14 @@ define amdgpu_kernel void @v_pack_b32.fabs(ptr addrspace(1) %in0, ptr addrspace(
; GFX11-GCN-REAL16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-GCN-REAL16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-GCN-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-GCN-REAL16-NEXT: v_lshlrev_b32_e32 v1, 1, v0
+; GFX11-GCN-REAL16-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX11-GCN-REAL16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GCN-REAL16-NEXT: global_load_d16_b16 v0, v1, s[0:1] glc dlc
+; GFX11-GCN-REAL16-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc
; GFX11-GCN-REAL16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-GCN-REAL16-NEXT: global_load_d16_hi_b16 v0, v1, s[2:3] glc dlc
+; GFX11-GCN-REAL16-NEXT: global_load_u16 v2, v0, s[2:3] glc dlc
; GFX11-GCN-REAL16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-GCN-REAL16-NEXT: v_add_f16_e32 v0.l, 2.0, v0.l
-; GFX11-GCN-REAL16-NEXT: v_add_f16_e32 v0.h, 2.0, v0.h
+; GFX11-GCN-REAL16-NEXT: v_add_f16_e32 v0.l, 2.0, v1.l
+; GFX11-GCN-REAL16-NEXT: v_add_f16_e32 v0.h, 2.0, v2.l
; GFX11-GCN-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-GCN-REAL16-NEXT: v_pack_b32_f16 v0, |v0.l|, |v0.h|
; GFX11-GCN-REAL16-NEXT: ;;#ASMSTART
@@ -591,14 +591,14 @@ define amdgpu_kernel void @v_pack_b32.fneg(ptr addrspace(1) %in0, ptr addrspace(
; GFX11-GCN-REAL16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-GCN-REAL16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-GCN-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-GCN-REAL16-NEXT: v_lshlrev_b32_e32 v1, 1, v0
+; GFX11-GCN-REAL16-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX11-GCN-REAL16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GCN-REAL16-NEXT: global_load_d16_b16 v0, v1, s[0:1] glc dlc
+; GFX11-GCN-REAL16-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc
; GFX11-GCN-REAL16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-GCN-REAL16-NEXT: global_load_d16_hi_b16 v0, v1, s[2:3] glc dlc
+; GFX11-GCN-REAL16-NEXT: global_load_u16 v2, v0, s[2:3] glc dlc
; GFX11-GCN-REAL16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-GCN-REAL16-NEXT: v_add_f16_e32 v0.l, 2.0, v0.l
-; GFX11-GCN-REAL16-NEXT: v_add_f16_e32 v0.h, 2.0, v0.h
+; GFX11-GCN-REAL16-NEXT: v_add_f16_e32 v0.l, 2.0, v1.l
+; GFX11-GCN-REAL16-NEXT: v_add_f16_e32 v0.h, 2.0, v2.l
; GFX11-GCN-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-GCN-REAL16-NEXT: v_pack_b32_f16 v0, -v0.l, -v0.h
; GFX11-GCN-REAL16-NEXT: ;;#ASMSTART
diff --git a/llvm/test/CodeGen/AMDGPU/vector_rebroadcast.ll b/llvm/test/CodeGen/AMDGPU/vector_rebroadcast.ll
index 587f5d05d358b..7ddf7cac05a8a 100644
--- a/llvm/test/CodeGen/AMDGPU/vector_rebroadcast.ll
+++ b/llvm/test/CodeGen/AMDGPU/vector_rebroadcast.ll
@@ -26,7 +26,7 @@ define <2 x i8> @shuffle_v2i8_rebroadcast(ptr addrspace(1) %arg0) {
; GFX11-TRUE16-LABEL: shuffle_v2i8_rebroadcast:
; GFX11-TRUE16: ; %bb.0: ; %entry
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v[0:1], off
+; GFX11-TRUE16-NEXT: global_load_u16 v0, v[0:1], off
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_lshrrev_b16 v1.l, 8, v0.l
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
diff --git a/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll b/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll
index 11d724eda547e..babd6edc1cc44 100644
--- a/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll
+++ b/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll
@@ -3017,23 +3017,14 @@ define void @shuffle_v4i8_concat(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1,
; GFX10-NEXT: global_store_dword v[4:5], v0, off
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-TRUE16-LABEL: shuffle_v4i8_concat:
-; GFX11-TRUE16: ; %bb.0:
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v[0:1], off
-; GFX11-TRUE16-NEXT: global_load_d16_hi_b16 v0, v[2:3], off
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: global_store_b32 v[4:5], v0, off
-; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-FAKE16-LABEL: shuffle_v4i8_concat:
-; GFX11-FAKE16: ; %bb.0:
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-FAKE16-NEXT: global_load_u16 v0, v[0:1], off
-; GFX11-FAKE16-NEXT: global_load_d16_hi_b16 v0, v[2:3], off
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT: global_store_b32 v[4:5], v0, off
-; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-LABEL: shuffle_v4i8_concat:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: global_load_u16 v0, v[0:1], off
+; GFX11-NEXT: global_load_d16_hi_b16 v0, v[2:3], off
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: global_store_b32 v[4:5], v0, off
+; GFX11-NEXT: s_setpc_b64 s[30:31]
%val0 = load <2 x i8>, ptr addrspace(1) %arg0
%val1 = load <2 x i8>, ptr addrspace(1) %arg1
%shuffle = shufflevector <2 x i8> %val0, <2 x i8> %val1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
diff --git a/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll b/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll
index 04a5cac116d78..0e662850478be 100644
--- a/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll
+++ b/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll
@@ -398,7 +398,7 @@ define amdgpu_kernel void @no_widen_i16_constant_divergent_load(ptr addrspace(4)
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v0, s[0:1]
+; GFX11-TRUE16-NEXT: global_load_u16 v0, v0, s[0:1]
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x3e7, v0.l
; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, 4
More information about the llvm-commits
mailing list